* [PATCH v2 1/3] powerpc/mm: Rename find_linux_pte_or_hugepte
@ 2017-06-01 14:33 Aneesh Kumar K.V
  2017-06-01 14:33 ` [PATCH v2 2/3] powerpc/mm: Don't send IPI to all cpus on THP updates Aneesh Kumar K.V
  2017-06-01 14:33 ` [PATCH v2 3/3] powerpc/mm/cxl: Add the fault handling cpu to mm cpumask Aneesh Kumar K.V
  0 siblings, 2 replies; 4+ messages in thread
From: Aneesh Kumar K.V @ 2017-06-01 14:33 UTC (permalink / raw)
  To: benh, paulus, mpe, Frederic Barrat; +Cc: linuxppc-dev, Aneesh Kumar K.V

Add new helpers to make the usage simpler. It is always recommended to use
find_current_mm_pte() for walking the page tables. If that cannot be used, it
should be documented why the given use of __find_linux_pte() is safe against a
parallel THP split.

For now the KVM code uses __find_linux_pte(). This is because KVM ends up
calling __find_linux_pte() in real mode with MSR_EE=0 but PACA soft_enabled = 1.
We may want to fix that later and make sure we keep MSR_EE and PACA
soft_enabled in sync. When we do that we can switch KVM to use find_linux_pte().
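
As an illustration only (not part of this patch; example_ea_to_pfn() is a
made-up name), a typical caller of the new helper is expected to look roughly
like this, with the lockless walk bracketed by local_irq_save()/restore():

	/*
	 * Illustrative sketch. The walk must run with interrupts disabled
	 * so that a parallel THP split/collapse, which IPIs the cpus of
	 * this mm, waits for the walk to finish. Only the normal page
	 * size case is handled here.
	 */
	static unsigned long example_ea_to_pfn(struct mm_struct *mm,
					       unsigned long ea)
	{
		unsigned long flags, pfn = 0;
		unsigned int shift;
		pte_t *ptep;

		local_irq_save(flags);
		ptep = find_current_mm_pte(mm->pgd, ea, NULL, &shift);
		if (ptep && pte_present(*ptep) && !shift)
			pfn = pte_pfn(*ptep);
		local_irq_restore(flags);
		return pfn;
	}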

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable.h     | 10 +--------
 arch/powerpc/include/asm/pte-walk.h    | 38 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/eeh.c              |  4 ++--
 arch/powerpc/kernel/io-workarounds.c   |  5 +++--
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  5 +++--
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 28 ++++++++++++-------------
 arch/powerpc/kvm/book3s_64_vio_hv.c    | 12 ++++++++++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c    | 18 ++++++++--------
 arch/powerpc/kvm/e500_mmu_host.c       |  3 ++-
 arch/powerpc/mm/hash_utils_64.c        |  5 +++--
 arch/powerpc/mm/hugetlbpage.c          | 24 ++++++++++++---------
 arch/powerpc/mm/tlb_hash64.c           |  6 ++++--
 arch/powerpc/perf/callchain.c          |  3 ++-
 13 files changed, 106 insertions(+), 55 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pte-walk.h

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index dd01212935ac..9fa263ad7cb3 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -66,16 +66,8 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_large(pmd)		0
 #endif
-pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				   bool *is_thp, unsigned *shift);
-static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-					       bool *is_thp, unsigned *shift)
-{
-	VM_WARN(!arch_irqs_disabled(),
-		"%s called with irq enabled\n", __func__);
-	return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
-}
 
+/* can we use this in kvm */
 unsigned long vmalloc_to_phys(void *vmalloc_addr);
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
diff --git a/arch/powerpc/include/asm/pte-walk.h b/arch/powerpc/include/asm/pte-walk.h
new file mode 100644
index 000000000000..3a5a391a4c6d
--- /dev/null
+++ b/arch/powerpc/include/asm/pte-walk.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_POWERPC_PTE_WALK_H
+#define _ASM_POWERPC_PTE_WALK_H
+
+#ifndef __ASSEMBLY__
+#include <linux/sched.h>
+
+/* Don't use this directly */
+extern pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
+			       bool *is_thp, unsigned *hshift);
+
+static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea,
+				    bool *is_thp, unsigned *hshift)
+{
+	VM_WARN(!arch_irqs_disabled(),
+		"%s called with irq enabled\n", __func__);
+	return __find_linux_pte(pgdir, ea, is_thp, hshift);
+}
+
+static inline pte_t *find_init_mm_pte(unsigned long ea, unsigned *hshift)
+{
+	pgd_t *pgdir = init_mm.pgd;
+	return __find_linux_pte(pgdir, ea, NULL, hshift);
+}
+/*
+ * This is what we should always use. Any other lockless page table lookup needs
+ * careful audit against THP split.
+ */
+static inline pte_t *find_current_mm_pte(pgd_t *pgdir, unsigned long ea,
+					 bool *is_thp, unsigned *hshift)
+{
+	VM_WARN(!arch_irqs_disabled(),
+		"%s called with irq enabled\n", __func__);
+	VM_WARN(pgdir != current->mm->pgd,
+		"%s lock less page table lookup called on wrong mm\n", __func__);
+	return __find_linux_pte(pgdir, ea, is_thp, hshift);
+}
+#endif
+#endif
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 63992b2d8e15..5e6887c40528 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -44,6 +44,7 @@
 #include <asm/machdep.h>
 #include <asm/ppc-pci.h>
 #include <asm/rtas.h>
+#include <asm/pte-walk.h>
 
 
 /** Overview:
@@ -352,8 +353,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 	 * worried about _PAGE_SPLITTING/collapse. Also we will not hit
 	 * page table free, because of init_mm.
 	 */
-	ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token,
-					   NULL, &hugepage_shift);
+	ptep = find_init_mm_pte(token, &hugepage_shift);
 	if (!ptep)
 		return token;
 	WARN_ON(hugepage_shift);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index a582e0d42525..bbe85f5aea71 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -19,6 +19,8 @@
 #include <asm/pgtable.h>
 #include <asm/ppc-pci.h>
 #include <asm/io-workarounds.h>
+#include <asm/pte-walk.h>
+
 
 #define IOWA_MAX_BUS	8
 
@@ -75,8 +77,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 		 * We won't find huge pages here (iomem). Also can't hit
 		 * a page table free due to init_mm
 		 */
-		ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
-						   NULL, &hugepage_shift);
+		ptep = find_init_mm_pte(vaddr, &hugepage_shift);
 		if (ptep == NULL)
 			paddr = 0;
 		else {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 710e491206ed..fdf3c4846117 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,6 +37,7 @@
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>
+#include <asm/pte-walk.h>
 
 #include "trace_hv.h"
 
@@ -597,8 +598,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * hugepage split and collapse.
 			 */
 			local_irq_save(flags);
-			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-							 hva, NULL, NULL);
+			ptep = find_current_mm_pte(current->mm->pgd,
+						   hva, NULL, NULL);
 			if (ptep) {
 				pte = kvmppc_read_update_linux_pte(ptep, 1);
 				if (__pte_write(pte))
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index f6b3e67c5762..7d719c8aa0bb 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -17,6 +17,7 @@
 #include <asm/mmu.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
 
 /*
  * Supported radix tree geometry.
@@ -359,8 +360,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (writing)
 			pgflags |= _PAGE_DIRTY;
 		local_irq_save(flags);
-		ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva,
-						   NULL, NULL);
+		ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL);
 		if (ptep) {
 			pte = READ_ONCE(*ptep);
 			if (pte_present(pte) &&
@@ -374,8 +374,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				spin_unlock(&kvm->mmu_lock);
 				return RESUME_GUEST;
 			}
-			ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable,
-							gpa, NULL, &shift);
+			/*
+			 * We are walking the secondary page table here. We can do this
+			 * without disabling irq.
+			 */
+			ptep = __find_linux_pte(kvm->arch.pgtable,
+						gpa, NULL, &shift);
 			if (ptep && pte_present(*ptep)) {
 				kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
 							gpa, shift);
@@ -427,8 +431,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			pgflags |= _PAGE_WRITE;
 		} else {
 			local_irq_save(flags);
-			ptep = __find_linux_pte_or_hugepte(current->mm->pgd,
-							hva, NULL, NULL);
+			ptep = find_current_mm_pte(current->mm->pgd,
+						   hva, NULL, NULL);
 			if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
 				pgflags |= _PAGE_WRITE;
 			local_irq_restore(flags);
@@ -499,8 +503,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	unsigned int shift;
 	unsigned long old;
 
-	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
-					   NULL, &shift);
+	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
 	if (ptep && pte_present(*ptep)) {
 		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
 					      gpa, shift);
@@ -525,8 +528,7 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	unsigned int shift;
 	int ref = 0;
 
-	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
-					   NULL, &shift);
+	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
 	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
 		kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
 					gpa, shift);
@@ -545,8 +547,7 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	unsigned int shift;
 	int ref = 0;
 
-	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
-					   NULL, &shift);
+	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
 	if (ptep && pte_present(*ptep) && pte_young(*ptep))
 		ref = 1;
 	return ref;
@@ -562,8 +563,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
 	unsigned int shift;
 	int ret = 0;
 
-	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
-					   NULL, &shift);
+	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
 	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
 		ret = 1;
 		if (shift)
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 3adfd2f5301c..c32e9bfe75b1 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -39,6 +39,7 @@
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/pte-walk.h>
 
 #ifdef CONFIG_BUG
 
@@ -353,7 +354,16 @@ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
 	pte_t *ptep, pte;
 	unsigned shift = 0;
 
-	ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift);
+	/*
+	 * Called in real mode with MSR_EE = 0. We are safe here.
+	 * It is ok to do the lookup with arch.pgdir here, because
+	 * we are doing this on secondary cpus and the current task there
+	 * is not the hypervisor. Also this is safe against THP in the
+	 * host, because an IPI to the primary thread will wait for the
+	 * secondary to exit, which again ensures that the below page
+	 * table walk has finished.
+	 */
+	ptep = __find_linux_pte(vcpu->arch.pgdir, ua, NULL, &shift);
 	if (!ptep || !pte_present(*ptep))
 		return -ENXIO;
 	pte = *ptep;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index ce6f2121fffe..e19228556bba 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -21,6 +21,7 @@
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
+#include <asm/pte-walk.h>
 
 /* Translate address of a vmalloc'd thing to a linear map address */
 static void *real_vmalloc_addr(void *x)
@@ -30,9 +31,9 @@ static void *real_vmalloc_addr(void *x)
 	/*
 	 * assume we don't have huge pages in vmalloc space...
 	 * So don't worry about THP collapse/split. Called
-	 * Only in realmode, hence won't need irq_save/restore.
+	 * Only in realmode with MSR_EE = 0, hence won't need irq_save/restore.
 	 */
-	p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL, NULL);
+	p = find_init_mm_pte(addr, NULL);
 	if (!p || !pte_present(*p))
 		return NULL;
 	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
@@ -229,14 +230,13 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	 * If we had a page table table change after lookup, we would
 	 * retry via mmu_notifier_retry.
 	 */
-	if (realmode)
-		ptep = __find_linux_pte_or_hugepte(pgdir, hva, NULL,
-						   &hpage_shift);
-	else {
+	if (!realmode)
 		local_irq_save(irq_flags);
-		ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL,
-						 &hpage_shift);
-	}
+	/*
+	 * If called in real mode we have MSR_EE = 0. Otherwise
+	 * we disable irq above.
+	 */
+	ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift);
 	if (ptep) {
 		pte_t pte;
 		unsigned int host_pte_size;
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 77fd043b3ecc..c6c734424c70 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -30,6 +30,7 @@
 #include <linux/vmalloc.h>
 #include <linux/hugetlb.h>
 #include <asm/kvm_ppc.h>
+#include <asm/pte-walk.h>
 
 #include "e500.h"
 #include "timing.h"
@@ -476,7 +477,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	 * can't run hence pfn won't change.
 	 */
 	local_irq_save(flags);
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, NULL);
+	ptep = find_linux_pte(pgdir, hva, NULL, NULL);
 	if (ptep) {
 		pte_t pte = READ_ONCE(*ptep);
 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c4b04c4aac86..6e1a3ee3e846 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -61,6 +61,7 @@
 #include <asm/tm.h>
 #include <asm/trace.h>
 #include <asm/ps3.h>
+#include <asm/pte-walk.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -1295,7 +1296,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift);
+	ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		rc = 1;
@@ -1524,7 +1525,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * THP pages use update_mmu_cache_pmd. We don't do
 	 * hash preload there. Hence can ignore THP here
 	 */
-	ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift);
+	ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
 	if (!ptep)
 		goto out_exit;
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 2bcb7e5b2ab6..11178845226a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,6 +24,8 @@
 #include <asm/tlb.h>
 #include <asm/setup.h>
 #include <asm/hugetlb.h>
+#include <asm/pte-walk.h>
+
 
 #ifdef CONFIG_HUGETLB_PAGE
 
@@ -39,8 +41,11 @@ unsigned int HPAGE_SHIFT;
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	/* Only called for hugetlbfs pages, hence can ignore THP */
-	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
+	/*
+	 * Only called for hugetlbfs pages, hence can ignore THP and the
+	 * irq disabled walk.
+	 */
+	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -748,9 +753,8 @@ void flush_dcache_icache_hugepage(struct page *page)
  * This function need to be called with interrupts disabled. We use this variant
  * when we have MSR[EE] = 0 but the paca->soft_enabled = 1
  */
-
-pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				   bool *is_thp, unsigned *shift)
+pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
+			bool *is_thp, unsigned *hpage_shift)
 {
 	pgd_t pgd, *pgdp;
 	pud_t pud, *pudp;
@@ -759,8 +763,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	hugepd_t *hpdp = NULL;
 	unsigned pdshift = PGDIR_SHIFT;
 
-	if (shift)
-		*shift = 0;
+	if (hpage_shift)
+		*hpage_shift = 0;
 
 	if (is_thp)
 		*is_thp = false;
@@ -830,11 +834,11 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
 	pdshift = hugepd_shift(*hpdp);
 out:
-	if (shift)
-		*shift = pdshift;
+	if (hpage_shift)
+		*hpage_shift = pdshift;
 	return ret_pte;
 }
-EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
+EXPORT_SYMBOL_GPL(__find_linux_pte);
 
 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 4517aa43a8b1..b3e6116b4317 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -29,6 +29,8 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/bug.h>
+#include <asm/pte-walk.h>
+
 
 #include <trace/events/thp.h>
 
@@ -209,8 +211,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	local_irq_save(flags);
 	arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp,
-							&hugepage_shift);
+		pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
+						  &hugepage_shift);
 		unsigned long pte;
 
 		if (ptep == NULL)
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 0fc26714780a..0af051a1974e 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -22,6 +22,7 @@
 #ifdef CONFIG_PPC64
 #include "../kernel/ppc32.h"
 #endif
+#include <asm/pte-walk.h>
 
 
 /*
@@ -127,7 +128,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
 		return -EFAULT;
 
 	local_irq_save(flags);
-	ptep = find_linux_pte_or_hugepte(pgdir, addr, NULL, &shift);
+	ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
 	if (!ptep)
 		goto err_out;
 	if (!shift)
-- 
2.7.4


* [PATCH v2 2/3] powerpc/mm: Don't send IPI to all cpus on THP updates
  2017-06-01 14:33 [PATCH v2 1/3] powerpc/mm: Rename find_linux_pte_or_hugepte Aneesh Kumar K.V
@ 2017-06-01 14:33 ` Aneesh Kumar K.V
  2017-06-01 14:33 ` [PATCH v2 3/3] powerpc/mm/cxl: Add the fault handling cpu to mm cpumask Aneesh Kumar K.V
  1 sibling, 0 replies; 4+ messages in thread
From: Aneesh Kumar K.V @ 2017-06-01 14:33 UTC (permalink / raw)
  To: benh, paulus, mpe, Frederic Barrat; +Cc: linuxppc-dev, Aneesh Kumar K.V

Now that we have made sure that lockless walks of the Linux page tables are
mostly limited to the current task (current->mm->pgd), we can update the THP
update sequence to only send an IPI to the cpus on which this task has run.
This helps reduce the IPI load on systems with a large number of CPUs.

W.r.t. KVM, even though KVM walks the page table with vcpu->arch.pgdir, that is
done only on secondary cpus, and in that case the primary cpu is added to the
task's mm cpumask. Sending an IPI to the primary will force the secondary to do
a vm exit and hence this mm cpumask usage is safe here.

W.r.t. CAPI, we still end up walking the Linux page table with the CAPI
context's mm. For now the pte lookup serialization sends an IPI to all cpus if
CAPI is in use. We can further improve this by adding the CAPI interrupt
handling cpu to the task's mm cpumask. That will be done in a later patch.
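
For illustration, the resulting update sequence can be sketched as below (a
condensed sketch only, not the exact hunks; example_pmd_update() is a made-up
name standing in for callers such as pmdp_invalidate()):

	/*
	 * Sketch of the THP update sequence after this patch: the IPI is
	 * now limited to the cpus in mm_cpumask(mm) instead of all
	 * online cpus.
	 */
	static void example_pmd_update(struct vm_area_struct *vma, pmd_t *pmdp)
	{
		/* 1. Clear the pmd so new lockless walkers see it as empty */
		pmd_clear(pmdp);
		/*
		 * 2. Wait for walkers that may have loaded the old pmd
		 *    value: they run with interrupts disabled, so an IPI to
		 *    every cpu this mm has run on returns only after their
		 *    walk has finished.
		 */
		serialize_against_pte_lookup(vma->vm_mm);
		/* 3. Now it is safe to make the pmd point at something else */
	}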

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  1 +
 arch/powerpc/mm/pgtable-book3s64.c           | 32 +++++++++++++++++++++++++++-
 arch/powerpc/mm/pgtable-hash64.c             |  8 +++----
 arch/powerpc/mm/pgtable-radix.c              |  8 +++----
 4 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 85bc9875c3be..d8c3c18e220d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1145,6 +1145,7 @@ static inline bool arch_needs_pgtable_deposit(void)
 		return false;
 	return true;
 }
+extern void serialize_against_pte_lookup(struct mm_struct *mm);
 
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 5fcb3dd74c13..2679f57b90e2 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -9,6 +9,7 @@
 
 #include <linux/sched.h>
 #include <linux/mm_types.h>
+#include <misc/cxl-base.h>
 
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
@@ -64,6 +65,35 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	trace_hugepage_set_pmd(addr, pmd_val(pmd));
 	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
+
+static void do_nothing(void *unused)
+{
+
+}
+/*
+ * Serialize against find_current_mm_pte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+	smp_mb();
+	/*
+	 * Cxl fault handling requires us to do a lockless page table
+	 * walk while inserting hash page table entry with mm tracked
+	 * in cxl context. Hence we need to do a global flush.
+	 */
+	if (cxl_ctx_in_use())
+		smp_call_function(do_nothing, NULL, 1);
+	else
+		smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+}
+
 /*
  * We use this to invalidate a pmdp entry before switching from a
  * hugepte to regular pmd entry.
@@ -77,7 +107,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 	 * This ensures that generic code that rely on IRQ disabling
 	 * to prevent a parallel THP split work as expected.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(vma->vm_mm);
 }
 
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 8b85a14b08ea..f6313cc29ae4 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -159,7 +159,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
 	 * by sending an IPI to all the cpus and executing a dummy
 	 * function there.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(vma->vm_mm);
 	/*
 	 * Now invalidate the hpte entries in the range
 	 * covered by pmd. This make sure we take a
@@ -299,16 +299,16 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 	 */
 	memset(pgtable, 0, PTE_FRAG_SIZE);
 	/*
-	 * Serialize against find_linux_pte_or_hugepte which does lock-less
+	 * Serialize against the find_current_mm_pte variants, which do lock-less
 	 * lookup in page tables with local interrupts disabled. For huge pages
 	 * it casts pmd_t to pte_t. Since format of pte_t is different from
 	 * pmd_t we want to prevent transit from pmd pointing to page table
 	 * to pmd pointing to huge page (and back) while interrupts are disabled.
 	 * We clear pmd to possibly replace it with page table pointer in
 	 * different code paths. So make sure we wait for the parallel
-	 * find_linux_pte_or_hugepage to finish.
+	 * find_current_mm_pte to finish.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(mm);
 	return old_pmd;
 }
 
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index c28165d8970b..6e3d1518eef3 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -707,7 +707,7 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
 	pmd = *pmdp;
 	pmd_clear(pmdp);
 	/*FIXME!!  Verify whether we need this kick below */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(vma->vm_mm);
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return pmd;
 }
@@ -767,16 +767,16 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
 	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 	old_pmd = __pmd(old);
 	/*
-	 * Serialize against find_linux_pte_or_hugepte which does lock-less
+	 * Serialize against find_current_mm_pte which does lock-less
 	 * lookup in page tables with local interrupts disabled. For huge pages
 	 * it casts pmd_t to pte_t. Since format of pte_t is different from
 	 * pmd_t we want to prevent transit from pmd pointing to page table
 	 * to pmd pointing to huge page (and back) while interrupts are disabled.
 	 * We clear pmd to possibly replace it with page table pointer in
 	 * different code paths. So make sure we wait for the parallel
-	 * find_linux_pte_or_hugepage to finish.
+	 * find_current_mm_pte to finish.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(mm);
 	return old_pmd;
 }
 
-- 
2.7.4


* [PATCH v2 3/3] powerpc/mm/cxl: Add the fault handling cpu to mm cpumask
  2017-06-01 14:33 [PATCH v2 1/3] powerpc/mm: Rename find_linux_pte_or_hugepte Aneesh Kumar K.V
  2017-06-01 14:33 ` [PATCH v2 2/3] powerpc/mm: Don't send IPI to all cpus on THP updates Aneesh Kumar K.V
@ 2017-06-01 14:33 ` Aneesh Kumar K.V
  2017-06-09 16:14   ` Frederic Barrat
  1 sibling, 1 reply; 4+ messages in thread
From: Aneesh Kumar K.V @ 2017-06-01 14:33 UTC (permalink / raw)
  To: benh, paulus, mpe, Frederic Barrat; +Cc: linuxppc-dev, Aneesh Kumar K.V

We use the mm cpumask for serializing against lockless page table walks.
Anybody doing a lockless page table walk is expected to disable irqs, and only
cpus in the mm cpumask are expected to do the lockless walk. This ensures that
a THP split can send an IPI to only the cpus in the mm cpumask, to make sure
there are no parallel lockless page table walks.

Add the CAPI fault handling cpu to the mm cpumask so that we can do the lockless
page table walk while inserting hash page table entries.
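
The ordering that makes this safe can be sketched as below (illustrative only;
the actual change is the one-line cpumask_set_cpu() in cxl_handle_page_fault(),
and example_cxl_fault() is a made-up name):

	/*
	 * Sketch: the CAPI fault handling cpu joins mm_cpumask(mm) before
	 * any lockless walk of that mm, so serialize_against_pte_lookup()
	 * now IPIs this cpu as well.
	 */
	static void example_cxl_fault(struct mm_struct *mm)
	{
		cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));

		/*
		 * From this point on, the hash fault handling may do an
		 * irq-disabled lockless walk of mm's page tables; a THP
		 * split in this mm has to wait for this cpu too.
		 */
	}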

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-book3s64.c | 10 +---------
 drivers/misc/cxl/fault.c           |  6 ++++++
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 2679f57b90e2..6a50ab23f722 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -83,15 +83,7 @@ static void do_nothing(void *unused)
 void serialize_against_pte_lookup(struct mm_struct *mm)
 {
 	smp_mb();
-	/*
-	 * Cxl fault handling requires us to do a lockless page table
-	 * walk while inserting hash page table entry with mm tracked
-	 * in cxl context. Hence we need to do a global flush.
-	 */
-	if (cxl_ctx_in_use())
-		smp_call_function(do_nothing, NULL, 1);
-	else
-		smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+	smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
 }
 
 /*
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 5344448f514e..02efaaa7cbd1 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -140,6 +140,12 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
 	unsigned long access, flags, inv_flags = 0;
 
 	trace_cxl_pte_miss(ctx, dsisr, dar);
+	/*
+	 * Add the fault handling cpu to task mm cpumask so that we
+	 * can do a safe lockless page table walk when inserting the
+	 * hash page table entry.
+	 */
+	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
 
 	if ((result = copro_handle_mm_fault(mm, dar, dsisr, &flt))) {
 		pr_devel("copro_handle_mm_fault failed: %#x\n", result);
-- 
2.7.4


* Re: [PATCH v2 3/3] powerpc/mm/cxl: Add the fault handling cpu to mm cpumask
  2017-06-01 14:33 ` [PATCH v2 3/3] powerpc/mm/cxl: Add the fault handling cpu to mm cpumask Aneesh Kumar K.V
@ 2017-06-09 16:14   ` Frederic Barrat
  0 siblings, 0 replies; 4+ messages in thread
From: Frederic Barrat @ 2017-06-09 16:14 UTC (permalink / raw)
  To: Aneesh Kumar K.V, benh, paulus, mpe; +Cc: linuxppc-dev



On 01/06/2017 at 16:33, Aneesh Kumar K.V wrote:
> We use the mm cpumask for serializing against lockless page table walks.
> Anybody doing a lockless page table walk is expected to disable irqs, and only
> cpus in the mm cpumask are expected to do the lockless walk. This ensures that
> a THP split can send an IPI to only the cpus in the mm cpumask, to make sure
> there are no parallel lockless page table walks.
> 
> Add the CAPI fault handling cpu to the mm cpumask so that we can do the lockless
> page table walk while inserting hash page table entries.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---


Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>



>   arch/powerpc/mm/pgtable-book3s64.c | 10 +---------
>   drivers/misc/cxl/fault.c           |  6 ++++++
>   2 files changed, 7 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
> index 2679f57b90e2..6a50ab23f722 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -83,15 +83,7 @@ static void do_nothing(void *unused)
>   void serialize_against_pte_lookup(struct mm_struct *mm)
>   {
>   	smp_mb();
> -	/*
> -	 * Cxl fault handling requires us to do a lockless page table
> -	 * walk while inserting hash page table entry with mm tracked
> -	 * in cxl context. Hence we need to do a global flush.
> -	 */
> -	if (cxl_ctx_in_use())
> -		smp_call_function(do_nothing, NULL, 1);
> -	else
> -		smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
> +	smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
>   }
> 
>   /*
> diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
> index 5344448f514e..02efaaa7cbd1 100644
> --- a/drivers/misc/cxl/fault.c
> +++ b/drivers/misc/cxl/fault.c
> @@ -140,6 +140,12 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
>   	unsigned long access, flags, inv_flags = 0;
> 
>   	trace_cxl_pte_miss(ctx, dsisr, dar);
> +	/*
> +	 * Add the fault handling cpu to task mm cpumask so that we
> +	 * can do a safe lockless page table walk when inserting the
> +	 * hash page table entry.
> +	 */
> +	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
> 
>   	if ((result = copro_handle_mm_fault(mm, dar, dsisr, &flt))) {
>   		pr_devel("copro_handle_mm_fault failed: %#x\n", result);
> 

