* [PATCH V4 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
@ 2015-03-26 13:13 Aneesh Kumar K.V
  2015-03-26 13:13 ` [PATCH V4 2/4] KVM: PPC: Remove page table walk helpers Aneesh Kumar K.V
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Aneesh Kumar K.V @ 2015-03-26 13:13 UTC (permalink / raw)
  To: benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V

pte can get updated from other CPUs as part of multiple activities
like THP split, huge page collapse, unmap. We need to make sure we
don't reload the pte value again and again for different checks.
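
A minimal sketch of the access pattern this patch adopts (illustrative
only, not the patched code itself; use_pfn() is a made-up placeholder):

        pte_t pte;

        /*
         * Take a single snapshot of the pte. Without READ_ONCE the
         * compiler is free to reload *ptep for every check below, so
         * each check could see a different value if another CPU
         * updates the pte in between.
         */
        pte = READ_ONCE(*ptep);
        if (pte_present(pte) && pte_write(pte))
                use_pfn(pte_pfn(pte)); /* every check uses the same snapshot */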

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_book3s_64.h |  5 ++++-
 arch/powerpc/kvm/e500_mmu_host.c         | 20 ++++++++++++--------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index cc073a7ac2b7..f06820c67175 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -290,7 +290,10 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
 	pte_t old_pte, new_pte = __pte(0);
 
 	while (1) {
-		old_pte = *ptep;
+		/*
+		 * Make sure we don't reload from ptep
+		 */
+		old_pte = READ_ONCE(*ptep);
 		/*
 		 * wait until _PAGE_BUSY is clear then set it atomically
 		 */
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index cc536d4a75ef..5840d546aa03 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -469,14 +469,18 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 	pgdir = vcpu_e500->vcpu.arch.pgdir;
 	ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
-	if (pte_present(*ptep))
-		wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
-	else {
-		if (printk_ratelimit())
-			pr_err("%s: pte not present: gfn %lx, pfn %lx\n",
-				__func__, (long)gfn, pfn);
-		ret = -EINVAL;
-		goto out;
+	if (ptep) {
+		pte_t pte = READ_ONCE(*ptep);
+
+		if (pte_present(pte))
+			wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
+				MAS2_WIMGE_MASK;
+		else {
+			pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
+					   __func__, (long)gfn, pfn);
+			ret = -EINVAL;
+			goto out;
+		}
 	}
 	kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
 
-- 
2.1.0

* [PATCH V4 2/4] KVM: PPC: Remove page table walk helpers
  2015-03-26 13:13 [PATCH V4 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer Aneesh Kumar K.V
@ 2015-03-26 13:13 ` Aneesh Kumar K.V
  2015-03-26 13:13 ` [PATCH V4 3/4] powerpc/mm/thp: Make page table walk safe against thp split/collapse Aneesh Kumar K.V
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Aneesh Kumar K.V @ 2015-03-26 13:13 UTC (permalink / raw)
  To: benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V

This patch removes helpers which we had used only once in the code.
Limiting the page table walk variants helps ensure that we won't
end up with code walking the page table with wrong assumptions.
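
The callers now open-code the small amount of logic the removed helpers
carried. A rough sketch of the resulting pattern (it mirrors the hunks
below; the variable names are illustrative):

        unsigned int shift;
        unsigned long host_pte_size;
        pte_t *ptep;

        ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
        if (ptep) {
                /* derive the mapping size from the shift returned by the walk */
                host_pte_size = shift ? (1ul << shift) : PAGE_SIZE;
                /* ... size check and pte use go here ... */
        }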

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable.h  | 21 -------------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 ++++++++++++++++---------------------
 arch/powerpc/kvm/e500_mmu_host.c    |  2 +-
 3 files changed, 28 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9835ac4173b7..92fe01c355a9 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -249,27 +249,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #endif
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 				 unsigned *shift);
-
-static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
-				     unsigned long *pte_sizep)
-{
-	pte_t *ptep;
-	unsigned long ps = *pte_sizep;
-	unsigned int shift;
-
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
-	if (!ptep)
-		return NULL;
-	if (shift)
-		*pte_sizep = 1ul << shift;
-	else
-		*pte_sizep = PAGE_SIZE;
-
-	if (ps > *pte_sizep)
-		return NULL;
-
-	return ptep;
-}
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e4d3b0..73e083cb9f7e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -131,25 +131,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	unlock_rmap(rmap);
 }
 
-static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
-			      int writing, unsigned long *pte_sizep)
-{
-	pte_t *ptep;
-	unsigned long ps = *pte_sizep;
-	unsigned int hugepage_shift;
-
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
-	if (!ptep)
-		return __pte(0);
-	if (hugepage_shift)
-		*pte_sizep = 1ul << hugepage_shift;
-	else
-		*pte_sizep = PAGE_SIZE;
-	if (ps > *pte_sizep)
-		return __pte(0);
-	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
-}
-
 static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
 {
 	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
@@ -166,10 +147,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	struct revmap_entry *rev;
 	unsigned long g_ptel;
 	struct kvm_memory_slot *memslot;
-	unsigned long pte_size;
+	unsigned hpage_shift;
 	unsigned long is_io;
 	unsigned long *rmap;
-	pte_t pte;
+	pte_t *ptep;
 	unsigned int writing;
 	unsigned long mmu_seq;
 	unsigned long rcbits;
@@ -208,22 +189,33 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	/* Translate to host virtual address */
 	hva = __gfn_to_hva_memslot(memslot, gfn);
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+	if (ptep) {
+		pte_t pte;
+		unsigned int host_pte_size;
 
-	/* Look up the Linux PTE for the backing page */
-	pte_size = psize;
-	pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
-	if (pte_present(pte) && !pte_protnone(pte)) {
-		if (writing && !pte_write(pte))
-			/* make the actual HPTE be read-only */
-			ptel = hpte_make_readonly(ptel);
-		is_io = hpte_cache_bits(pte_val(pte));
-		pa = pte_pfn(pte) << PAGE_SHIFT;
-		pa |= hva & (pte_size - 1);
-		pa |= gpa & ~PAGE_MASK;
-	}
+		if (hpage_shift)
+			host_pte_size = 1ul << hpage_shift;
+		else
+			host_pte_size = PAGE_SIZE;
+		/*
+		 * We should always find the guest page size
+		 * to <= host page size, if host is using hugepage
+		 */
+		if (host_pte_size < psize)
+			return H_PARAMETER;
 
-	if (pte_size < psize)
-		return H_PARAMETER;
+		pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift);
+		if (pte_present(pte) && !pte_protnone(pte)) {
+			if (writing && !pte_write(pte))
+				/* make the actual HPTE be read-only */
+				ptel = hpte_make_readonly(ptel);
+			is_io = hpte_cache_bits(pte_val(pte));
+			pa = pte_pfn(pte) << PAGE_SHIFT;
+			pa |= hva & (host_pte_size - 1);
+			pa |= gpa & ~PAGE_MASK;
+		}
+	}
 
 	ptel &= ~(HPTE_R_PP0 - psize);
 	ptel |= pa;
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 5840d546aa03..a1f5b0d4b1d6 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -468,7 +468,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 
 	pgdir = vcpu_e500->vcpu.arch.pgdir;
-	ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
 	if (ptep) {
 		pte_t pte = READ_ONCE(*ptep);
 
-- 
2.1.0

* [PATCH V4 3/4] powerpc/mm/thp: Make page table walk safe against thp split/collapse
  2015-03-26 13:13 [PATCH V4 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer Aneesh Kumar K.V
  2015-03-26 13:13 ` [PATCH V4 2/4] KVM: PPC: Remove page table walk helpers Aneesh Kumar K.V
@ 2015-03-26 13:13 ` Aneesh Kumar K.V
  2015-03-26 13:13 ` [PATCH V4 4/4] powerpc/mm/thp: Return pte address if we find trans_splitting Aneesh Kumar K.V
  2015-03-28 10:33 ` [V4, 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer Michael Ellerman
  3 siblings, 0 replies; 7+ messages in thread
From: Aneesh Kumar K.V @ 2015-03-26 13:13 UTC (permalink / raw)
  To: benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V

We can disable a THP split or a hugepage collapse by disabling irqs.
We send an IPI to all the cpus in the early part of split/collapse,
so disabling the local irq ensures the split/collapse cannot make
progress. If the THP is getting split we return NULL from
find_linux_pte_or_hugepte(), which should be ok for all the current
callers. We need to be careful if we want to use the returned pte_t
pointer outside the irq-disabled region. W.r.t. a THP split, the pfn
remains the same, but a hugepage collapse will result in a pfn change.
There are a few steps we can take to avoid a hugepage collapse. One way
is to take a page reference inside the irq-disabled region. Another
option is to take mmap_sem so that a parallel collapse will not happen.
We can also disable collapse by taking the pmd_lock. Another method,
used by the kvm subsystem, is to check whether an mmu_notifier update
happened in between, using mmu_notifier_retry().
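
A sketch of the calling convention this patch establishes (illustrative
only; it mirrors the hunks below rather than reproducing any single one
of them):

        unsigned long flags;
        unsigned int shift;
        pte_t *ptep, pte = __pte(0);

        local_irq_save(flags);
        ptep = find_linux_pte_or_hugepte(mm->pgd, addr, &shift);
        if (ptep)
                /* snapshot the pte while the walk is known to be stable */
                pte = READ_ONCE(*ptep);
        local_irq_restore(flags);
        /* the pte value may be used from here on; the ptep pointer must not be */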

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Changes from V3:
* Split the pmd_trans_splitting related changes into a new patch
  Now we don't have any change related how we handle trans splitting pmd
  in this patch

 arch/powerpc/include/asm/pgtable.h   | 11 ++++++++++-
 arch/powerpc/kernel/eeh.c            |  6 ++++--
 arch/powerpc/kernel/io-workarounds.c | 10 +++++-----
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  5 +++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  | 30 +++++++++++++++++++++++-------
 arch/powerpc/kvm/e500_mmu_host.c     | 14 ++++++++++++--
 arch/powerpc/mm/hash_utils_64.c      |  2 +-
 arch/powerpc/mm/hugetlbpage.c        | 20 ++++++++++++++------
 arch/powerpc/perf/callchain.c        | 24 ++++++++++++++----------
 9 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 92fe01c355a9..11a38635dd65 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -247,8 +247,17 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #define pmd_large(pmd)		0
 #define has_transparent_hugepage() 0
 #endif
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 				 unsigned *shift);
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+					       unsigned *shift)
+{
+	if (!arch_irqs_disabled()) {
+		pr_info("%s called with irq enabled\n", __func__);
+		dump_stack();
+	}
+	return __find_linux_pte_or_hugepte(pgdir, ea, shift);
+}
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3b2252e7731b..8424b232e598 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -330,9 +330,11 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 	int hugepage_shift;
 
 	/*
-	 * We won't find hugepages here, iomem
+	 * We won't find hugepages here(this is iomem). Hence we are not
+	 * worried about _PAGE_SPLITTING/collapse. Also we will not hit
+	 * page table free, because of init_mm.
 	 */
-	ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+	ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
 	if (!ptep)
 		return token;
 	WARN_ON(hugepage_shift);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 24b968f8e4d8..63d9cc4d7366 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -71,15 +71,15 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 		vaddr = (unsigned long)PCI_FIX_ADDR(addr);
 		if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
 			return NULL;
-
-		ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+		/*
+		 * We won't find huge pages here (iomem). Also can't hit
+		 * a page table free due to init_mm
+		 */
+		ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
 						 &hugepage_shift);
 		if (ptep == NULL)
 			paddr = 0;
 		else {
-			/*
-			 * we don't have hugepages backing iomem
-			 */
 			WARN_ON(hugepage_shift);
 			paddr = pte_pfn(*ptep) << PAGE_SHIFT;
 		}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 534acb3c6c3d..26df3864d85a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -539,12 +539,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (!writing && hpte_is_writable(r)) {
 			unsigned int hugepage_shift;
 			pte_t *ptep, pte;
+			unsigned long flags;
 
 			/*
 			 * We need to protect against page table destruction
 			 * while looking up and updating the pte.
 			 */
-			rcu_read_lock_sched();
+			local_irq_save(flags);
 			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
 							 hva, &hugepage_shift);
 			if (ptep) {
@@ -553,7 +554,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				if (pte_write(pte))
 					write_ok = 1;
 			}
-			rcu_read_unlock_sched();
+			local_irq_restore(flags);
 		}
 	}
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 73e083cb9f7e..f559b25de173 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -26,11 +26,14 @@ static void *real_vmalloc_addr(void *x)
 {
 	unsigned long addr = (unsigned long) x;
 	pte_t *p;
-
-	p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+	/*
+	 * assume we don't have huge pages in vmalloc space...
+	 * So don't worry about THP collapse/split. Called
+	 * Only in realmode, hence won't need irq_save/restore.
+	 */
+	p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
 	if (!p || !pte_present(*p))
 		return NULL;
-	/* assume we don't have huge pages in vmalloc space... */
 	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
 	return __va(addr);
 }
@@ -153,7 +156,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	pte_t *ptep;
 	unsigned int writing;
 	unsigned long mmu_seq;
-	unsigned long rcbits;
+	unsigned long rcbits, irq_flags = 0;
 
 	psize = hpte_page_size(pteh, ptel);
 	if (!psize)
@@ -189,7 +192,16 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	/* Translate to host virtual address */
 	hva = __gfn_to_hva_memslot(memslot, gfn);
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+	/*
+	 * If we had a page table table change after lookup, we would
+	 * retry via mmu_notifier_retry.
+	 */
+	if (realmode)
+		ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+	else {
+		local_irq_save(irq_flags);
+		ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+	}
 	if (ptep) {
 		pte_t pte;
 		unsigned int host_pte_size;
@@ -202,9 +214,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 		 * We should always find the guest page size
 		 * to <= host page size, if host is using hugepage
 		 */
-		if (host_pte_size < psize)
+		if (host_pte_size < psize) {
+			if (!realmode)
+				local_irq_restore(flags);
 			return H_PARAMETER;
-
+		}
 		pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift);
 		if (pte_present(pte) && !pte_protnone(pte)) {
 			if (writing && !pte_write(pte))
@@ -216,6 +230,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 			pa |= gpa & ~PAGE_MASK;
 		}
 	}
+	if (!realmode)
+		local_irq_restore(irq_flags);
 
 	ptel &= ~(HPTE_R_PP0 - psize);
 	ptel |= pa;
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index a1f5b0d4b1d6..4d33e199edcc 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -338,6 +338,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	pte_t *ptep;
 	unsigned int wimg = 0;
 	pgd_t *pgdir;
+	unsigned long flags;
 
 	/* used to check for invalidations in progress */
 	mmu_seq = kvm->mmu_notifier_seq;
@@ -468,14 +469,23 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 
 	pgdir = vcpu_e500->vcpu.arch.pgdir;
+	/*
+	 * We are just looking at the wimg bits, so we don't
+	 * care much about the trans splitting bit.
+	 * We are holding kvm->mmu_lock so a notifier invalidate
+	 * can't run hence pfn won't change.
+	 */
+	local_irq_save(flags);
 	ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
 	if (ptep) {
 		pte_t pte = READ_ONCE(*ptep);
 
-		if (pte_present(pte))
+		if (pte_present(pte)) {
 			wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
 				MAS2_WIMGE_MASK;
-		else {
+			local_irq_restore(flags);
+		} else {
+			local_irq_restore(flags);
 			pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
 					   __func__, (long)gfn, pfn);
 			ret = -EINVAL;
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2c2022d16059..444f7a5b859b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1066,7 +1066,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+	ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		rc = 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fa9d5c238d22..d79cd3190b39 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -109,7 +109,7 @@ int pgd_huge(pgd_t pgd)
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	/* Only called for hugetlbfs pages, hence can ignore THP */
-	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -681,28 +681,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	} while (addr = next, addr != end);
 }
 
+/*
+ * We are holding mmap_sem, so a parallel huge page collapse cannot run.
+ * To prevent hugepage split, disable irq.
+ */
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
 	pte_t *ptep;
 	struct page *page;
 	unsigned shift;
-	unsigned long mask;
+	unsigned long mask, flags;
 	/*
 	 * Transparent hugepages are handled by generic code. We can skip them
 	 * here.
 	 */
+	local_irq_save(flags);
 	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
 	/* Verify it is a huge page else bail. */
-	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
+	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) {
+		local_irq_restore(flags);
 		return ERR_PTR(-EINVAL);
-
+	}
 	mask = (1UL << shift) - 1;
 	page = pte_page(*ptep);
 	if (page)
 		page += (address & mask) / PAGE_SIZE;
 
+	local_irq_restore(flags);
 	return page;
 }
 
@@ -951,7 +958,8 @@ void flush_dcache_icache_hugepage(struct page *page)
  * we can follow the address down to the the page and take a ref on it.
  */
 
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+				   unsigned *shift)
 {
 	pgd_t pgd, *pgdp;
 	pud_t pud, *pudp;
@@ -1030,7 +1038,7 @@ out:
 		*shift = pdshift;
 	return ret_pte;
 }
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
 
 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 2396dda282cd..a5162d789cfc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -111,41 +111,45 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  * interrupt context, so if the access faults, we read the page tables
  * to find which page (if any) is mapped and access it directly.
  */
-static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
 {
+	int ret = -EFAULT;
 	pgd_t *pgdir;
 	pte_t *ptep, pte;
 	unsigned shift;
 	unsigned long addr = (unsigned long) ptr;
 	unsigned long offset;
-	unsigned long pfn;
+	unsigned long pfn, flags;
 	void *kaddr;
 
 	pgdir = current->mm->pgd;
 	if (!pgdir)
 		return -EFAULT;
 
+	local_irq_save(flags);
 	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+	if (!ptep)
+		goto err_out;
 	if (!shift)
 		shift = PAGE_SHIFT;
 
 	/* align address to page boundary */
 	offset = addr & ((1UL << shift) - 1);
-	addr -= offset;
 
-	if (ptep == NULL)
-		return -EFAULT;
-	pte = *ptep;
+	pte = READ_ONCE(*ptep);
 	if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
-		return -EFAULT;
+		goto err_out;
 	pfn = pte_pfn(pte);
 	if (!page_is_ram(pfn))
-		return -EFAULT;
+		goto err_out;
 
 	/* no highmem to worry about here */
 	kaddr = pfn_to_kaddr(pfn);
-	memcpy(ret, kaddr + offset, nb);
-	return 0;
+	memcpy(buf, kaddr + offset, nb);
+	ret = 0;
+err_out:
+	local_irq_restore(flags);
+	return ret;
 }
 
 static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
-- 
2.1.0

* [PATCH V4 4/4] powerpc/mm/thp: Return pte address if we find trans_splitting.
  2015-03-26 13:13 [PATCH V4 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer Aneesh Kumar K.V
  2015-03-26 13:13 ` [PATCH V4 2/4] KVM: PPC: Remove page table walk helpers Aneesh Kumar K.V
  2015-03-26 13:13 ` [PATCH V4 3/4] powerpc/mm/thp: Make page table walk safe against thp split/collapse Aneesh Kumar K.V
@ 2015-03-26 13:13 ` Aneesh Kumar K.V
  2015-03-28 10:33 ` [V4, 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer Michael Ellerman
  3 siblings, 0 replies; 7+ messages in thread
From: Aneesh Kumar K.V @ 2015-03-26 13:13 UTC (permalink / raw)
  To: benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V

For a THP that is marked trans splitting, we now return the pte.
This requires the callers to handle the pmd_trans_splitting scenario,
if they care. All the current callers only look at the pfn or
write_ok, hence we don't need to update them.
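
Illustrative only: a hypothetical caller that does care about a splitting
hugepage would now perform its own check after the lookup, along the
lines of:

        ptep = find_linux_pte_or_hugepte(pgdir, ea, &shift);
        if (ptep && shift && pmd_trans_splitting(*(pmd_t *)ptep))
                return NULL; /* treat a splitting hugepage as not mapped */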

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Changes from V3:
* Added this new patch.

 arch/powerpc/include/asm/kvm_book3s_64.h | 12 ++----------
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  9 +++------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  2 +-
 arch/powerpc/mm/hugetlbpage.c            |  9 ++++-----
 4 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index f06820c67175..5233a35d80e2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -281,11 +281,9 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
 
 /*
  * If it's present and writable, atomically set dirty and referenced bits and
- * return the PTE, otherwise return 0. If we find a transparent hugepage
- * and if it is marked splitting we return 0;
+ * return the PTE, otherwise return 0.
  */
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
-						 unsigned int hugepage)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
 {
 	pte_t old_pte, new_pte = __pte(0);
 
@@ -301,12 +299,6 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
 			cpu_relax();
 			continue;
 		}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		/* If hugepage and is trans splitting return None */
-		if (unlikely(hugepage &&
-			     pmd_trans_splitting(pte_pmd(old_pte))))
-			return __pte(0);
-#endif
 		/* If pte is not present return None */
 		if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
 			return __pte(0);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 26df3864d85a..0fe9c92e78ed 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -537,20 +537,17 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 		/* if the guest wants write access, see if that is OK */
 		if (!writing && hpte_is_writable(r)) {
-			unsigned int hugepage_shift;
 			pte_t *ptep, pte;
 			unsigned long flags;
-
 			/*
 			 * We need to protect against page table destruction
-			 * while looking up and updating the pte.
+			 * hugepage split and collapse.
 			 */
 			local_irq_save(flags);
 			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-							 hva, &hugepage_shift);
+							 hva, NULL);
 			if (ptep) {
-				pte = kvmppc_read_update_linux_pte(ptep, 1,
-							   hugepage_shift);
+				pte = kvmppc_read_update_linux_pte(ptep, 1);
 				if (pte_write(pte))
 					write_ok = 1;
 			}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index f559b25de173..d839f08cb903 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -219,7 +219,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 				local_irq_restore(flags);
 			return H_PARAMETER;
 		}
-		pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift);
+		pte = kvmppc_read_update_linux_pte(ptep, writing);
 		if (pte_present(pte) && !pte_protnone(pte)) {
 			if (writing && !pte_write(pte))
 				/* make the actual HPTE be read-only */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index d79cd3190b39..b07270cd82a4 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1011,12 +1011,11 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 			 * A hugepage collapse is captured by pmd_none, because
 			 * it mark the pmd none and do a hpte invalidate.
 			 *
-			 * A hugepage split is captured by pmd_trans_splitting
-			 * because we mark the pmd trans splitting and do a
-			 * hpte invalidate
-			 *
+			 * We don't worry about pmd_trans_splitting here, The
+			 * caller if it needs to handle the splitting case
+			 * should check for that.
 			 */
-			if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+			if (pmd_none(pmd))
 				return NULL;
 
 			if (pmd_huge(pmd) || pmd_large(pmd)) {
-- 
2.1.0

* Re: [V4, 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
  2015-03-26 13:13 [PATCH V4 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer Aneesh Kumar K.V
                   ` (2 preceding siblings ...)
  2015-03-26 13:13 ` [PATCH V4 4/4] powerpc/mm/thp: Return pte address if we find trans_splitting Aneesh Kumar K.V
@ 2015-03-28 10:33 ` Michael Ellerman
  2015-03-29 16:46   ` Aneesh Kumar K.V
  3 siblings, 1 reply; 7+ messages in thread
From: Michael Ellerman @ 2015-03-28 10:33 UTC (permalink / raw)
  To: Aneesh Kumar K.V, benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V

On Thu, 2015-26-03 at 13:13:39 UTC, "Aneesh Kumar K.V" wrote:
> pte can get updated from other CPUs as part of multiple activities
> like THP split, huge page collapse, unmap. We need to make sure we
> don't reload the pte value again and again for different checks.
> 
> ---
>  arch/powerpc/include/asm/kvm_book3s_64.h |  5 ++++-
>  arch/powerpc/kvm/e500_mmu_host.c         | 20 ++++++++++++--------
>  2 files changed, 16 insertions(+), 9 deletions(-)

So this series is partly KVM but mostly powerpc.

I assume you can't split it into two separate series easily?

You haven't sent it to the KVM lists or to Alex AFAICS. You'll need to do that
for the KVM pieces at least. We can probably take it all via powerpc, but I'll
need an ACK from Alex at least.

cheers

* Re: [V4, 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
  2015-03-28 10:33 ` [V4, 1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer Michael Ellerman
@ 2015-03-29 16:46   ` Aneesh Kumar K.V
  2015-03-29 23:41     ` [V4,1/4] " Benjamin Herrenschmidt
  0 siblings, 1 reply; 7+ messages in thread
From: Aneesh Kumar K.V @ 2015-03-29 16:46 UTC (permalink / raw)
  To: Michael Ellerman, benh, paulus; +Cc: linuxppc-dev

Michael Ellerman <mpe@ellerman.id.au> writes:

> On Thu, 2015-26-03 at 13:13:39 UTC, "Aneesh Kumar K.V" wrote:
>> pte can get updated from other CPUs as part of multiple activities
>> like THP split, huge page collapse, unmap. We need to make sure we
>> don't reload the pte value again and again for different checks.
>> 
>> ---
>>  arch/powerpc/include/asm/kvm_book3s_64.h |  5 ++++-
>>  arch/powerpc/kvm/e500_mmu_host.c         | 20 ++++++++++++--------
>>  2 files changed, 16 insertions(+), 9 deletions(-)
>
> So this series is partly KVM but mostly powerpc.
>
> I assume you can't split it into two separate series easily?

Yes, will do. But before that I was looking for feedback from Ben or
Paul.

>
> You haven't sent it to the KVM lists or to Alex AFAICS. You'll need to do that
> for the KVM pieces at least. We can probably take it all via powerpc, but I'll
> need an ACK from Alex at least.
>

If I can get an ack from Ben, that makes it easy for the kvm list.

-aneesh

* Re: [V4,1/4] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
  2015-03-29 16:46   ` Aneesh Kumar K.V
@ 2015-03-29 23:41     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 7+ messages in thread
From: Benjamin Herrenschmidt @ 2015-03-29 23:41 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev

On Sun, 2015-03-29 at 22:16 +0530, Aneesh Kumar K.V wrote:
> Michael Ellerman <mpe@ellerman.id.au> writes:
> 
> > On Thu, 2015-26-03 at 13:13:39 UTC, "Aneesh Kumar K.V" wrote:
> >> pte can get updated from other CPUs as part of multiple activities
> >> like THP split, huge page collapse, unmap. We need to make sure we
> >> don't reload the pte value again and again for different checks.
> >> 
> >> ---
> >>  arch/powerpc/include/asm/kvm_book3s_64.h |  5 ++++-
> >>  arch/powerpc/kvm/e500_mmu_host.c         | 20 ++++++++++++--------
> >>  2 files changed, 16 insertions(+), 9 deletions(-)
> >
> > So this series is partly KVM but mostly powerpc.
> >
> > I assume you can't split it into two separate series easily?
> 
> Yes, will do. But before that I was looking for feedback from Ben or
> Paul.
> 
> >
> > You haven't sent it to the KVM lists or to Alex AFAICS. You'll need to do that
> > for the KVM pieces at least. We can probably take it all via powerpc, but I'll
> > need an ACK from Alex at least.
> >
> 
> If I can get an ack from Ben, that makes it easy for the kvm list.

Ack. Using ACCESS_ONCE in a lockless access of the PTE that does
multiple checks makes sense.

Cheers,
Ben.
