linuxppc-dev.lists.ozlabs.org archive mirror
* [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements
@ 2017-11-07  7:53 Nicholas Piggin
  2017-11-07  7:53 ` [PATCH v2 1/7] powerpc/64s/radix: tlbie improve preempt handling Nicholas Piggin
                   ` (6 more replies)
  0 siblings, 7 replies; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

Resending, sorry for the noise.

Since the v1/RFC, I pulled the first 2 fix patches into this series
and rediffed against the powerpc merge branch. Dropped the final 2
patches, which were not yet fully agreed upon and baked.

Thanks,
Nick

Nicholas Piggin (7):
  powerpc/64s/radix: tlbie improve preempt handling
  powerpc/64s/radix: Fix process table entry cache invalidation
  powerpc/64s/radix: optimize TLB range flush barriers
  powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
  powerpc/64s/radix: Optimize flush_tlb_range
  powerpc/64s/radix: Introduce local single page ceiling for TLB range
    flush
  powerpc/64s/radix: Improve TLB flushing for page table freeing

 arch/powerpc/include/asm/mmu_context.h |   4 +
 arch/powerpc/mm/mmu_context_book3s64.c |  25 ++-
 arch/powerpc/mm/tlb-radix.c            | 318 ++++++++++++++++++++++++---------
 3 files changed, 256 insertions(+), 91 deletions(-)

-- 
2.15.0

* [PATCH v2 1/7] powerpc/64s/radix: tlbie improve preempt handling
  2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
@ 2017-11-07  7:53 ` Nicholas Piggin
  2017-11-07  7:53 ` [PATCH v2 2/7] powerpc/64s/radix: Fix process table entry cache invalidation Nicholas Piggin
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

Preempt should be consistently disabled for mm_is_thread_local tests,
so bring the rest of these under preempt_disable().

Preempt does not need to be disabled for the mm->context.id tests,
which allows simplification and removal of gotos.
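
The resulting entry pattern is sketched below as a stand-alone user-space
model (all helpers are stubs, not the kernel primitives); the real changes
are in the diff that follows.

    /* Stand-alone model of the flush entry pattern above; every helper is
     * a stub standing in for the kernel primitive of the same purpose. */
    #include <stdbool.h>

    #define MMU_NO_CONTEXT  0UL

    struct mm_model {
            unsigned long context_id;
            bool thread_local;
    };

    static void preempt_disable(void) { }                    /* stub */
    static void preempt_enable(void) { }                     /* stub */
    static void tlbiel_pid(unsigned long pid) { (void)pid; } /* local flush, stub */
    static void tlbie_pid(unsigned long pid) { (void)pid; }  /* global flush, stub */

    static void flush_tlb_mm_model(struct mm_model *mm)
    {
            unsigned long pid = mm->context_id;

            if (pid == MMU_NO_CONTEXT)      /* id test: stable, no preempt needed */
                    return;

            preempt_disable();              /* keep the local/global decision stable */
            if (mm->thread_local)
                    tlbiel_pid(pid);
            else
                    tlbie_pid(pid);
            preempt_enable();
    }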

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/tlb-radix.c | 50 ++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index e2f15810b9c0..feeb96693aeb 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -186,16 +186,15 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_TLB);
 	else
 		_tlbiel_pid(pid, RIC_FLUSH_TLB);
-no_context:
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -204,16 +203,15 @@ void radix__flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
 	else
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
-no_context:
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_all_mm);
@@ -230,15 +228,14 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 	unsigned long pid;
 	unsigned long ap = mmu_get_ap(psize);
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto bail;
+		return;
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
 	else
 		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-bail:
 	preempt_enable();
 }
 
@@ -322,15 +319,17 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 {
 	unsigned long pid;
 	unsigned long addr;
-	int local = mm_is_thread_local(mm);
+	bool local;
 	unsigned long ap = mmu_get_ap(psize);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto err_out;
+		return;
+
+	preempt_disable();
+	local = mm_is_thread_local(mm);
 
 	if (end == TLB_FLUSH_ALL ||
 	    (end - start) > tlb_single_page_flush_ceiling * page_size) {
@@ -338,39 +337,38 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
-		goto err_out;
+	} else {
+		for (addr = start; addr < end; addr += page_size) {
+			if (local)
+				_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+			else
+				_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+		}
 	}
-	for (addr = start; addr < end; addr += page_size) {
 
-		if (local)
-			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else
-			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-	}
-err_out:
 	preempt_enable();
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-	int local = mm_is_thread_local(mm);
+	bool local;
 	unsigned long ap = mmu_get_ap(mmu_virtual_psize);
 	unsigned long pid, end;
 
-
 	pid = mm->context.id;
-	preempt_disable();
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
 	/* 4k page size, just blow the world */
 	if (PAGE_SIZE == 0x1000) {
 		radix__flush_all_mm(mm);
-		preempt_enable();
 		return;
 	}
 
+	preempt_disable();
+	local = mm_is_thread_local(mm);
+
 	/* Otherwise first do the PWC */
 	if (local)
 		_tlbiel_pid(pid, RIC_FLUSH_PWC);
@@ -385,7 +383,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		else
 			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 	}
-no_context:
+
 	preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.15.0

* [PATCH v2 2/7] powerpc/64s/radix: Fix process table entry cache invalidation
  2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
  2017-11-07  7:53 ` [PATCH v2 1/7] powerpc/64s/radix: tlbie improve preempt handling Nicholas Piggin
@ 2017-11-07  7:53 ` Nicholas Piggin
  2017-11-07  7:53 ` [PATCH v2 3/7] powerpc/64s/radix: optimize TLB range flush barriers Nicholas Piggin
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

According to the architecture, the process table entry cache must be
flushed with tlbie RIC=2.

Currently the process table entry is set to invalid right before the
PID is returned to the allocator, with no invalidation. This works on
existing implementations that are known to not cache the process table
entry for any except the current PIDR.

It is architecturally correct and cleaner to invalidate with RIC=2
after clearing the process table entry and before the PID is returned
to the allocator. This is done in arch_exit_mmap, which runs before
the final flush, and by ensuring the final flush (fullmm) is always a
RIC=2 variant.
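
A rough model of the intended ordering is sketched below; the array and
helpers are simplified stand-ins for process_tb, the RIC=2 flush and the
PID allocator, not the kernel code itself.

    /* Teardown ordering model: clear the entry, RIC=2 flush, then free the
     * PID.  All names are illustrative stubs. */
    #define NR_PIDS_MODEL   64
    static unsigned long process_tb_model[NR_PIDS_MODEL];     /* prtb0 words */

    static void tlbie_ric2(unsigned long pid) { (void)pid; }  /* RIC=2, stub */
    static void free_pid(unsigned long pid) { (void)pid; }    /* stub */

    static void exit_mm_model(unsigned long pid)
    {
            process_tb_model[pid] = 0;      /* arch_exit_mmap: invalid RTS field */
            tlbie_ric2(pid);                /* exit_mmap "fullmm" flush, RIC=2 */
            free_pid(pid);                  /* destroy_context: PID back to allocator */
    }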

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/mmu_context.h |  4 ++++
 arch/powerpc/mm/mmu_context_book3s64.c | 25 ++++++++++++++++++++-----
 arch/powerpc/mm/tlb-radix.c            |  6 +++++-
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b4cdf574cf61..6177d43f0ce8 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -165,9 +165,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
 {
 }
 
+#ifndef CONFIG_PPC_BOOK3S_64
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 }
+#else
+extern void arch_exit_mmap(struct mm_struct *mm);
+#endif
 
 static inline void arch_unmap(struct mm_struct *mm,
 			      struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 05e15386d4cb..6d724dab27c2 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -216,19 +216,34 @@ void destroy_context(struct mm_struct *mm)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
+	if (radix_enabled())
+		WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+	else
+		subpage_prot_free(mm);
+	destroy_pagetable_page(mm);
+	__destroy_context(mm->context.id);
+	mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
 	if (radix_enabled()) {
 		/*
 		 * Radix doesn't have a valid bit in the process table
 		 * entries. However we know that at least P9 implementation
 		 * will avoid caching an entry with an invalid RTS field,
 		 * and 0 is invalid. So this will do.
+		 *
+		 * This runs before the "fullmm" tlb flush in exit_mmap,
+		 * which does a RIC=2 tlbie to clear the process table
+		 * entry. See the "fullmm" comments in tlb-radix.c.
+		 *
+		 * No barrier required here after the store because
+		 * this process will do the invalidate, which starts with
+		 * ptesync.
 		 */
 		process_tb[mm->context.id].prtb0 = 0;
-	} else
-		subpage_prot_free(mm);
-	destroy_pagetable_page(mm);
-	__destroy_context(mm->context.id);
-	mm->context.id = MMU_NO_CONTEXT;
+	}
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index feeb96693aeb..6e77ed2d7c6c 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -297,10 +297,14 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	psize = radix_get_mmu_psize(page_size);
 	/*
 	 * if page size is not something we understand, do a full mm flush
+	 *
+	 * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+	 * that flushes the process table entry cache upon process teardown.
+	 * See the comment for radix in arch_exit_mmap().
 	 */
 	if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
 		radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
-	else if (tlb->need_flush_all) {
+	else if (tlb->fullmm || tlb->need_flush_all) {
 		tlb->need_flush_all = 0;
 		radix__flush_all_mm(mm);
 	} else
-- 
2.15.0

* [PATCH v2 3/7] powerpc/64s/radix: optimize TLB range flush barriers
  2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
  2017-11-07  7:53 ` [PATCH v2 1/7] powerpc/64s/radix: tlbie improve preempt handling Nicholas Piggin
  2017-11-07  7:53 ` [PATCH v2 2/7] powerpc/64s/radix: Fix process table entry cache invalidation Nicholas Piggin
@ 2017-11-07  7:53 ` Nicholas Piggin
  2017-11-14 11:12   ` [v2,3/7] " Michael Ellerman
  2017-11-07  7:53 ` [PATCH v2 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions Nicholas Piggin
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

Short range flushes issue a sequence of tlbie(l) instructions for
individual effective addresses. These do not all require individual
barrier sequences; only one covering all of the tlbie(l) instructions
is needed.

Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
made a similar optimization for tlbiel for PID flushing.

For tlbie, the ISA says:

    The tlbsync instruction provides an ordering function for the
    effects of all tlbie instructions executed by the thread executing
    the tlbsync instruction, with respect to the memory barrier
    created by a subsequent ptesync instruction executed by the same
    thread.
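
In other words, for a range flush the barriers can be hoisted out of the
per-page loop, roughly as in the stand-alone sketch below (global-flush
case only; the barrier and tlbie helpers are stubs, not real instructions):

    /* One opening ptesync and one closing eieio; tlbsync; ptesync cover
     * the whole loop instead of wrapping every individual tlbie. */
    static void ptesync(void) { }                           /* stub */
    static void eieio_tlbsync_ptesync(void) { }             /* stub */
    static void tlbie_va(unsigned long va) { (void)va; }    /* stub */

    static void flush_range_global_model(unsigned long start, unsigned long end,
                                         unsigned long page_size)
    {
            unsigned long addr;

            ptesync();                      /* order prior PTE updates, once */
            for (addr = start; addr < end; addr += page_size)
                    tlbie_va(addr);         /* no per-iteration barriers */
            eieio_tlbsync_ptesync();        /* one completion sequence for all */
    }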

Time to munmap 30 pages of memory (after mmap, touch):
         local   global
vanilla  10.9us  22.3us
patched   3.4us  14.4us

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/tlb-radix.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 6e77ed2d7c6c..49e71c68f5b1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 			      unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("ptesync": : :"memory");
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+			      unsigned long ap, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+	__tlbiel_va(va, pid, ap, ric);
+	asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
 			     unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+			     unsigned long ap, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+	__tlbie_va(va, pid, ap, ric);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+
 /*
  * Base TLB flushing operations:
  *
@@ -341,13 +354,19 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
+
 	} else {
+		asm volatile("ptesync": : :"memory");
 		for (addr = start; addr < end; addr += page_size) {
 			if (local)
-				_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+				__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
 			else
-				_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+				__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 		}
+		if (local)
+			asm volatile("ptesync": : :"memory");
+		else
+			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	}
 
 	preempt_enable();
@@ -380,6 +399,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		_tlbie_pid(pid, RIC_FLUSH_PWC);
 
 	/* Then iterate the pages */
+	asm volatile("ptesync": : :"memory");
 	end = addr + HPAGE_PMD_SIZE;
 	for (; addr < end; addr += PAGE_SIZE) {
 		if (local)
@@ -387,7 +407,10 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		else
 			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 	}
-
+	if (local)
+		asm volatile("ptesync": : :"memory");
+	else
+		asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.15.0

* [PATCH v2 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
  2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
                   ` (2 preceding siblings ...)
  2017-11-07  7:53 ` [PATCH v2 3/7] powerpc/64s/radix: optimize TLB range flush barriers Nicholas Piggin
@ 2017-11-07  7:53 ` Nicholas Piggin
  2017-11-14 11:12   ` [v2, " Michael Ellerman
  2017-11-07  7:53 ` [PATCH v2 5/7] powerpc/64s/radix: Optimize flush_tlb_range Nicholas Piggin
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

Move the barriers and range iteration down into the _tlbie* level,
which improves readability.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/tlb-radix.c | 71 ++++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 49e71c68f5b1..645a35b7bc9d 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -85,7 +85,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 }
 
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
-			      unsigned long ap, unsigned long ric)
+			       unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
@@ -101,13 +101,28 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 }
 
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
-			      unsigned long ap, unsigned long ric)
+			      unsigned long psize, unsigned long ric)
 {
+	unsigned long ap = mmu_get_ap(psize);
+
 	asm volatile("ptesync": : :"memory");
 	__tlbiel_va(va, pid, ap, ric);
 	asm volatile("ptesync": : :"memory");
 }
 
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	asm volatile("ptesync": : :"memory");
+	for (addr = start; addr < end; addr += page_size)
+		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+	asm volatile("ptesync": : :"memory");
+}
+
 static inline void __tlbie_va(unsigned long va, unsigned long pid,
 			     unsigned long ap, unsigned long ric)
 {
@@ -125,13 +140,27 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
 }
 
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
-			     unsigned long ap, unsigned long ric)
+			      unsigned long psize, unsigned long ric)
 {
+	unsigned long ap = mmu_get_ap(psize);
+
 	asm volatile("ptesync": : :"memory");
 	__tlbie_va(va, pid, ap, ric);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	asm volatile("ptesync": : :"memory");
+	for (addr = start; addr < end; addr += page_size)
+		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
 
 /*
  * Base TLB flushing operations:
@@ -174,12 +203,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd
 				       int psize)
 {
 	unsigned long pid;
-	unsigned long ap = mmu_get_ap(psize);
 
 	preempt_disable();
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 
@@ -239,16 +267,15 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 				 int psize)
 {
 	unsigned long pid;
-	unsigned long ap = mmu_get_ap(psize);
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		return;
 	preempt_disable();
 	if (!mm_is_thread_local(mm))
-		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	else
-		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 
@@ -335,9 +362,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, int psize)
 {
 	unsigned long pid;
-	unsigned long addr;
 	bool local;
-	unsigned long ap = mmu_get_ap(psize);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
 
@@ -356,17 +381,10 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
 
 	} else {
-		asm volatile("ptesync": : :"memory");
-		for (addr = start; addr < end; addr += page_size) {
-			if (local)
-				__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-			else
-				__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-		}
 		if (local)
-			asm volatile("ptesync": : :"memory");
+			_tlbiel_va_range(start, end, pid, page_size, psize);
 		else
-			asm volatile("eieio; tlbsync; ptesync": : :"memory");
+			_tlbie_va_range(start, end, pid, page_size, psize);
 	}
 
 	preempt_enable();
@@ -375,9 +393,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-	bool local;
-	unsigned long ap = mmu_get_ap(mmu_virtual_psize);
 	unsigned long pid, end;
+	bool local;
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
@@ -399,18 +416,12 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		_tlbie_pid(pid, RIC_FLUSH_PWC);
 
 	/* Then iterate the pages */
-	asm volatile("ptesync": : :"memory");
 	end = addr + HPAGE_PMD_SIZE;
-	for (; addr < end; addr += PAGE_SIZE) {
-		if (local)
-			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else
-			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-	}
+
 	if (local)
-		asm volatile("ptesync": : :"memory");
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
 	else
-		asm volatile("eieio; tlbsync; ptesync": : :"memory");
+		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
 	preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.15.0

* [PATCH v2 5/7] powerpc/64s/radix: Optimize flush_tlb_range
  2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
                   ` (3 preceding siblings ...)
  2017-11-07  7:53 ` [PATCH v2 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions Nicholas Piggin
@ 2017-11-07  7:53 ` Nicholas Piggin
  2017-11-14 11:12   ` [v2,5/7] " Michael Ellerman
  2017-11-07  7:53 ` [PATCH v2 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush Nicholas Piggin
  2017-11-07  7:53 ` [PATCH v2 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing Nicholas Piggin
  6 siblings, 1 reply; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

Currently for radix, flush_tlb_range flushes the entire PID, because
the Linux mm code does not tell us about page size here for THP vs
regular pages. This is quite sub-optimal for small mremap / mprotect
/ change_protection.

So implement va range flushes with two flush passes, one for each
page size (regular and THP). The second flush has an order of magnitude
fewer tlbie instructions than the first, so it is a relatively small
additional cost.

There is still room for improvement here with some changes to generic
APIs: in particular, if mostly THP pages are to be invalidated, the
small page flushes could be reduced.
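
The 2MB-aligned sub-range that gets the second pass is computed roughly as
in the stand-alone sketch below, assuming a 4K base page size and 2M THP;
the constants and helper name are illustrative only:

    #define PAGE_SIZE_4K    0x1000UL
    #define HPAGE_PMD_SHIFT 21
    #define HPAGE_PMD_SIZE  (1UL << HPAGE_PMD_SHIFT)

    /* Stands in for a tlbie(l) of a single page of the given size. */
    static void flush_one(unsigned long addr, unsigned long page_size)
    {
            (void)addr; (void)page_size;
    }

    static void flush_range_two_pass_model(unsigned long start, unsigned long end)
    {
            /* smallest 2M-aligned range fully contained in [start, end) */
            unsigned long hstart = (start + HPAGE_PMD_SIZE - 1) & ~(HPAGE_PMD_SIZE - 1);
            unsigned long hend = end & ~(HPAGE_PMD_SIZE - 1);
            unsigned long addr;

            /* pass 1: every base page in the range */
            for (addr = start; addr < end; addr += PAGE_SIZE_4K)
                    flush_one(addr, PAGE_SIZE_4K);

            /* pass 2: every 2M page, only if the range spans a whole one */
            if (hstart < hend)
                    for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE)
                            flush_one(addr, HPAGE_PMD_SIZE);
    }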

Time to mprotect 1 page of memory (after mmap, touch):
vanilla 2.9us   1.8us
patched 1.2us   1.6us

Time to mprotect 30 pages of memory (after mmap, touch):
vanilla 8.2us   7.2us
patched 6.9us   17.9us

Time to mprotect 34 pages of memory (after mmap, touch):
vanilla 9.1us   8.0us
patched 9.0us   8.0us

34 pages is the point at which the invalidation switches from va
to entire PID, which tlbie can do in a single instruction. This is
why in the case of 30 pages, the new code runs slower for this test.
This is a deliberate tradeoff already present in the unmap and THP
promotion code: the idea is that the benefit of avoiding a flush of
the entire TLB for this PID on all threads in the system outweighs
the extra cost here.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/tlb-radix.c | 139 ++++++++++++++++++++++++++++++++------------
 1 file changed, 101 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 645a35b7bc9d..277497be7aaf 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -100,6 +100,17 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	for (addr = start; addr < end; addr += page_size)
+		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
 			      unsigned long psize, unsigned long ric)
 {
@@ -114,12 +125,8 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
 				    unsigned long psize)
 {
-	unsigned long addr;
-	unsigned long ap = mmu_get_ap(psize);
-
 	asm volatile("ptesync": : :"memory");
-	for (addr = start; addr < end; addr += page_size)
-		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+	__tlbiel_va_range(start, end, pid, page_size, psize);
 	asm volatile("ptesync": : :"memory");
 }
 
@@ -139,6 +146,17 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	for (addr = start; addr < end; addr += page_size)
+		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
 			      unsigned long psize, unsigned long ric)
 {
@@ -153,12 +171,8 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
 				    unsigned long psize)
 {
-	unsigned long addr;
-	unsigned long ap = mmu_get_ap(psize);
-
 	asm volatile("ptesync": : :"memory");
-	for (addr = start; addr < end; addr += page_size)
-		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+	__tlbie_va_range(start, end, pid, page_size, psize);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -299,17 +313,78 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
+#define TLB_FLUSH_ALL -1UL
+
 /*
- * Currently, for range flushing, we just do a full mm flush. Because
- * we use this in code path where we don' track the page size.
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
  */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
 
 {
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long pid;
+	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+	unsigned long page_size = 1UL << page_shift;
+	unsigned long nr_pages = (end - start) >> page_shift;
+	bool local, full;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
 
-	radix__flush_tlb_mm(mm);
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	preempt_disable();
+	local = mm_is_thread_local(mm);
+	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+
+	if (full) {
+		if (local)
+			_tlbiel_pid(pid, RIC_FLUSH_TLB);
+		else
+			_tlbie_pid(pid, RIC_FLUSH_TLB);
+	} else {
+		bool hflush = false;
+		unsigned long hstart, hend;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
+		hend = end >> HPAGE_PMD_SHIFT;
+		if (hstart < hend) {
+			hstart <<= HPAGE_PMD_SHIFT;
+			hend <<= HPAGE_PMD_SHIFT;
+			hflush = true;
+		}
+#endif
+
+		asm volatile("ptesync": : :"memory");
+		if (local) {
+			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbiel_va_range(hstart, hend, pid,
+						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			asm volatile("ptesync": : :"memory");
+		} else {
+			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbie_va_range(hstart, hend, pid,
+						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			asm volatile("eieio; tlbsync; ptesync": : :"memory");
+		}
+	}
+	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
@@ -351,19 +426,14 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 		radix__flush_tlb_mm(mm);
 }
 
-#define TLB_FLUSH_ALL -1UL
-/*
- * Number of pages above which we will do a bcast tlbie. Just a
- * number at this point copied from x86
- */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-
 void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, int psize)
 {
 	unsigned long pid;
-	bool local;
-	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
+	unsigned int page_shift = mmu_psize_defs[psize].shift;
+	unsigned long page_size = 1UL << page_shift;
+	unsigned long nr_pages = (end - start) >> page_shift;
+	bool local, full;
 
 
 	pid = mm->context.id;
@@ -372,14 +442,13 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 
 	preempt_disable();
 	local = mm_is_thread_local(mm);
+	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
 
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling * page_size) {
+	if (full) {
 		if (local)
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
-
 	} else {
 		if (local)
 			_tlbiel_va_range(start, end, pid, page_size, psize);
@@ -394,7 +463,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	unsigned long pid, end;
-	bool local;
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
@@ -406,22 +474,17 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		return;
 	}
 
-	preempt_disable();
-	local = mm_is_thread_local(mm);
-
-	/* Otherwise first do the PWC */
-	if (local)
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
-	else
-		_tlbie_pid(pid, RIC_FLUSH_PWC);
-
-	/* Then iterate the pages */
 	end = addr + HPAGE_PMD_SIZE;
 
-	if (local)
+	/* Otherwise first do the PWC, then iterate the pages. */
+	preempt_disable();
+	if (mm_is_thread_local(mm)) {
+		_tlbiel_pid(pid, RIC_FLUSH_PWC);
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
-	else
+	} else {
+		_tlbie_pid(pid, RIC_FLUSH_PWC);
 		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+	}
 	preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.15.0

* [PATCH v2 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush
  2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
                   ` (4 preceding siblings ...)
  2017-11-07  7:53 ` [PATCH v2 5/7] powerpc/64s/radix: Optimize flush_tlb_range Nicholas Piggin
@ 2017-11-07  7:53 ` Nicholas Piggin
  2017-11-14 11:12   ` [v2, " Michael Ellerman
  2017-11-07  7:53 ` [PATCH v2 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing Nicholas Piggin
  6 siblings, 1 reply; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

The single page flush ceiling is the cut-off point at which we switch
from invalidating individual pages, to invalidating the entire process
address space in response to a range flush.

Introduce a local variant of this heuristic because local and global
tlbie have significantly different properties:
- Local tlbiel requires 128 instructions to invalidate a PID, global
  tlbie only 1 instruction.
- Global tlbie instructions are expensive broadcast operations.

The local ceiling has been made much higher, 2x the number of
instructions required to invalidate the entire PID (i.e., 256 pages).
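
The resulting decision amounts to the sketch below (stand-alone model;
POWER9_TLB_SETS_RADIX is 128 and the helper name is made up for the
example):

    #define TLB_FLUSH_ALL                           (-1UL)
    #define POWER9_TLB_SETS_RADIX                   128UL
    #define GLOBAL_SINGLE_PAGE_FLUSH_CEILING        33UL
    #define LOCAL_SINGLE_PAGE_FLUSH_CEILING         (POWER9_TLB_SETS_RADIX * 2) /* 256 */

    /* Return nonzero when the whole PID should be invalidated rather than
     * flushing nr_pages individual pages. */
    static int flush_full_pid(int local, unsigned long end, unsigned long nr_pages)
    {
            unsigned long ceiling = local ? LOCAL_SINGLE_PAGE_FLUSH_CEILING :
                                            GLOBAL_SINGLE_PAGE_FLUSH_CEILING;

            return end == TLB_FLUSH_ALL || nr_pages > ceiling;
    }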

     Time to mprotect N pages of memory (after mmap, touch), local invalidate:
     N           32     34      64     128     256     512
     vanilla  7.4us  9.0us  14.6us  26.4us  50.2us  98.3us
     patched  7.4us  7.8us  13.8us  26.4us  51.9us  98.3us

The behaviour of both is identical at N=32 and N=512. Between there,
the vanilla kernel does a PID invalidate and the patched kernel does
a va range invalidate.

At N=128, these require the same number of tlbiel instructions, so
the patched version can be seen to be cheaper when < 128, and more
expensive when > 128. However, this does not fully capture the cost
of the invalidated TLB entries.

The additional cost at 256 pages does not seem prohibitive. It may
be the case that increasing the limit further would continue to be
beneficial to avoid invalidating all of the process's TLB entries.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/tlb-radix.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 277497be7aaf..5842c98fbe48 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -325,6 +325,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
  * individual page flushes to full-pid flushes.
  */
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
 
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
@@ -347,8 +348,15 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		return;
 
 	preempt_disable();
-	local = mm_is_thread_local(mm);
-	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+	if (mm_is_thread_local(mm)) {
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
+	} else {
+		local = false;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_single_page_flush_ceiling);
+	}
 
 	if (full) {
 		if (local)
@@ -441,8 +449,15 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 		return;
 
 	preempt_disable();
-	local = mm_is_thread_local(mm);
-	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+	if (mm_is_thread_local(mm)) {
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
+	} else {
+		local = false;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_single_page_flush_ceiling);
+	}
 
 	if (full) {
 		if (local)
-- 
2.15.0

* [PATCH v2 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing
  2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
                   ` (5 preceding siblings ...)
  2017-11-07  7:53 ` [PATCH v2 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush Nicholas Piggin
@ 2017-11-07  7:53 ` Nicholas Piggin
  2017-11-14 11:12   ` [v2, " Michael Ellerman
  6 siblings, 1 reply; 13+ messages in thread
From: Nicholas Piggin @ 2017-11-07  7:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

Unmaps that free page tables always flush the entire PID, which is
sub-optimal. Provide TLB range flushing with an additional PWC flush
that can be used for va range invalidations that also need a PWC flush.
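
The shape of the combined flush is roughly the following stand-alone
sketch: the optional RIC=1 page walk cache invalidation shares the barrier
window of the per-page invalidations (all helpers are stubs, not real
instructions):

    static void ptesync(void) { }                            /* stub */
    static void eieio_tlbsync_ptesync(void) { }              /* stub */
    static void tlbie_pwc(unsigned long pid) { (void)pid; }  /* RIC=1, stub */
    static void tlbie_va(unsigned long va) { (void)va; }     /* RIC=0, stub */

    static void flush_va_range_model(unsigned long start, unsigned long end,
                                     unsigned long pid, unsigned long page_size,
                                     int also_pwc)
    {
            unsigned long addr;

            ptesync();
            if (also_pwc)
                    tlbie_pwc(pid);         /* page walk cache, no extra barriers */
            for (addr = start; addr < end; addr += page_size)
                    tlbie_va(addr);
            eieio_tlbsync_ptesync();        /* single completion sequence */
    }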

     Time to munmap N pages of memory including last level page table
     teardown (after mmap, touch), local invalidate:
     N           1       2      4      8     16     32     64
     vanilla  3.2us  3.3us  3.4us  3.6us  4.1us  5.2us  7.2us
     patched  1.4us  1.5us  1.7us  1.9us  2.6us  3.7us  6.2us

     Global invalidate:
     N           1       2      4      8     16      32     64
     vanilla  2.2us  2.3us  2.4us  2.6us  3.2us   4.1us  6.2us
     patched  2.1us  2.5us  3.4us  5.2us  8.7us  15.7us  6.2us

Local invalidates get much better across the board. Global ones have
the same issue where multiple tlbies for a va flush do get slower than
the single tlbie to invalidate the PID. None of this testing captures
the TLB benefits of avoiding killing everything.

Global gets worse, but it is brought into line with the global
invalidate for munmap()s that do not free page tables.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/tlb-radix.c | 90 ++++++++++++++++++++++++++++++---------------
 1 file changed, 61 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5842c98fbe48..078f7da11ce1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = PPC_BIT(53); /* IS = 1 */
+	rs = pid << PPC_BITLSHIFT(31);
+	prs = 1; /* process scoped */
+	r = 1;   /* raidx format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
 /*
  * We use 128 set in radix mode and 256 set in hpt mode.
  */
@@ -70,18 +84,9 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 
 static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 {
-	unsigned long rb,rs,prs,r;
-
-	rb = PPC_BIT(53); /* IS = 1 */
-	rs = pid << PPC_BITLSHIFT(31);
-	prs = 1; /* process scoped */
-	r = 1;   /* raidx format */
-
 	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	__tlbie_pid(pid, ric);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
-	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
@@ -123,9 +128,11 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
 
 static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
-				    unsigned long psize)
+				    unsigned long psize, bool also_pwc)
 {
 	asm volatile("ptesync": : :"memory");
+	if (also_pwc)
+		__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
 	__tlbiel_va_range(start, end, pid, page_size, psize);
 	asm volatile("ptesync": : :"memory");
 }
@@ -169,9 +176,11 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
 
 static inline void _tlbie_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
-				    unsigned long psize)
+				    unsigned long psize, bool also_pwc)
 {
 	asm volatile("ptesync": : :"memory");
+	if (also_pwc)
+		__tlbie_pid(pid, RIC_FLUSH_PWC);
 	__tlbie_va_range(start, end, pid, page_size, psize);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
@@ -411,13 +420,15 @@ static int radix_get_mmu_psize(int page_size)
 	return psize;
 }
 
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize);
+
 void radix__tlb_flush(struct mmu_gather *tlb)
 {
 	int psize = 0;
 	struct mm_struct *mm = tlb->mm;
 	int page_size = tlb->page_size;
 
-	psize = radix_get_mmu_psize(page_size);
 	/*
 	 * if page size is not something we understand, do a full mm flush
 	 *
@@ -425,17 +436,28 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	 * that flushes the process table entry cache upon process teardown.
 	 * See the comment for radix in arch_exit_mmap().
 	 */
-	if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
-		radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
-	else if (tlb->fullmm || tlb->need_flush_all) {
-		tlb->need_flush_all = 0;
+	if (tlb->fullmm) {
 		radix__flush_all_mm(mm);
-	} else
-		radix__flush_tlb_mm(mm);
+	} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+		if (!tlb->need_flush_all)
+			radix__flush_tlb_mm(mm);
+		else
+			radix__flush_all_mm(mm);
+	} else {
+		unsigned long start = tlb->start;
+		unsigned long end = tlb->end;
+
+		if (!tlb->need_flush_all)
+			radix__flush_tlb_range_psize(mm, start, end, psize);
+		else
+			radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+	}
+	tlb->need_flush_all = 0;
 }
 
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
-				  unsigned long end, int psize)
+static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				int psize, bool also_pwc)
 {
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[psize].shift;
@@ -461,19 +483,31 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 
 	if (full) {
 		if (local)
-			_tlbiel_pid(pid, RIC_FLUSH_TLB);
+			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		else
-			_tlbie_pid(pid, RIC_FLUSH_TLB);
+			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
 	} else {
 		if (local)
-			_tlbiel_va_range(start, end, pid, page_size, psize);
+			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
 		else
-			_tlbie_va_range(start, end, pid, page_size, psize);
+			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
 	}
 
 	preempt_enable();
 }
 
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize)
+{
+	return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize)
+{
+	__radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
@@ -494,11 +528,9 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 	/* Otherwise first do the PWC, then iterate the pages. */
 	preempt_disable();
 	if (mm_is_thread_local(mm)) {
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
-		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else {
-		_tlbie_pid(pid, RIC_FLUSH_PWC);
-		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	}
 	preempt_enable();
 }
-- 
2.15.0

* Re: [v2,3/7] powerpc/64s/radix: optimize TLB range flush barriers
  2017-11-07  7:53 ` [PATCH v2 3/7] powerpc/64s/radix: optimize TLB range flush barriers Nicholas Piggin
@ 2017-11-14 11:12   ` Michael Ellerman
  0 siblings, 0 replies; 13+ messages in thread
From: Michael Ellerman @ 2017-11-14 11:12 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K . V, Nicholas Piggin

On Tue, 2017-11-07 at 07:53:05 UTC, Nicholas Piggin wrote:
> Short range flushes issue a sequence of tlbie(l) instructions for
> individual effective addresses. These do not all require individual
> barrier sequences; only one covering all of the tlbie(l) instructions
> is needed.
> 
> Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
> made a similar optimization for tlbiel for PID flushing.
> 
> For tlbie, the ISA says:
> 
>     The tlbsync instruction provides an ordering function for the
>     effects of all tlbie instructions executed by the thread executing
>     the tlbsync instruction, with respect to the memory barrier
>     created by a subsequent ptesync instruction executed by the same
>     thread.
> 
> Time to munmap 30 pages of memory (after mmap, touch):
>          local   global
> vanilla  10.9us  22.3us
> patched   3.4us  14.4us
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/14001c60939a754717893672209160

cheers

* Re: [v2, 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
  2017-11-07  7:53 ` [PATCH v2 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions Nicholas Piggin
@ 2017-11-14 11:12   ` Michael Ellerman
  0 siblings, 0 replies; 13+ messages in thread
From: Michael Ellerman @ 2017-11-14 11:12 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K . V, Nicholas Piggin

On Tue, 2017-11-07 at 07:53:06 UTC, Nicholas Piggin wrote:
> Move the barriers and range iteration down into the _tlbie* level,
> which improves readability.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/d665767e39fa4a9e725f92d77ba206

cheers

* Re: [v2,5/7] powerpc/64s/radix: Optimize flush_tlb_range
  2017-11-07  7:53 ` [PATCH v2 5/7] powerpc/64s/radix: Optimize flush_tlb_range Nicholas Piggin
@ 2017-11-14 11:12   ` Michael Ellerman
  0 siblings, 0 replies; 13+ messages in thread
From: Michael Ellerman @ 2017-11-14 11:12 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K . V, Nicholas Piggin

On Tue, 2017-11-07 at 07:53:07 UTC, Nicholas Piggin wrote:
> Currently for radix, flush_tlb_range flushes the entire PID, because
> the Linux mm code does not tell us about page size here for THP vs
> regular pages. This is quite sub-optimal for small mremap / mprotect
> / change_protection.
> 
> So implement va range flushes with two flush passes, one for each
> page size (regular and THP). The second flush has an order of magnitude
> fewer tlbie instructions than the first, so it is a relatively small
> additional cost.
> 
> There is still room for improvement here with some changes to generic
> APIs: in particular, if mostly THP pages are to be invalidated, the
> small page flushes could be reduced.
> 
> Time to mprotect 1 page of memory (after mmap, touch):
> vanilla 2.9us   1.8us
> patched 1.2us   1.6us
> 
> Time to mprotect 30 pages of memory (after mmap, touch):
> vanilla 8.2us   7.2us
> patched 6.9us   17.9us
> 
> Time to mprotect 34 pages of memory (after mmap, touch):
> vanilla 9.1us   8.0us
> patched 9.0us   8.0us
> 
> 34 pages is the point at which the invalidation switches from va
> to entire PID, which tlbie can do in a single instruction. This is
> why in the case of 30 pages, the new code runs slower for this test.
> This is a deliberate tradeoff already present in the unmap and THP
> promotion code: the idea is that the benefit of avoiding a flush of
> the entire TLB for this PID on all threads in the system outweighs
> the extra cost here.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/cbf09c837720f72f5e63ab7a2d331e

cheers

* Re: [v2, 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush
  2017-11-07  7:53 ` [PATCH v2 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush Nicholas Piggin
@ 2017-11-14 11:12   ` Michael Ellerman
  0 siblings, 0 replies; 13+ messages in thread
From: Michael Ellerman @ 2017-11-14 11:12 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K . V, Nicholas Piggin

On Tue, 2017-11-07 at 07:53:08 UTC, Nicholas Piggin wrote:
> The single page flush ceiling is the cut-off point at which we switch
> from invalidating individual pages, to invalidating the entire process
> address space in response to a range flush.
> 
> Introduce a local variant of this heuristic because local and global
> tlbie have significantly different properties:
> - Local tlbiel requires 128 instructions to invalidate a PID, global
>   tlbie only 1 instruction.
> - Global tlbie instructions are expensive broadcast operations.
> 
> The local ceiling has been made much higher, 2x the number of
> instructions required to invalidate the entire PID (i.e., 256 pages).
> 
>      Time to mprotect N pages of memory (after mmap, touch), local invalidate:
>      N           32     34      64     128     256     512
>      vanilla  7.4us  9.0us  14.6us  26.4us  50.2us  98.3us
>      patched  7.4us  7.8us  13.8us  26.4us  51.9us  98.3us
> 
> The behaviour of both is identical at N=32 and N=512. Between there,
> the vanilla kernel does a PID invalidate and the patched kernel does
> a va range invalidate.
> 
> At N=128, these require the same number of tlbiel instructions, so
> the patched version can be seen to be cheaper when < 128, and more
> expensive when > 128. However, this does not fully capture the cost
> of the invalidated TLB entries.
> 
> The additional cost at 256 pages does not seem prohibitive. It may
> be the case that increasing the limit further would continue to be
> beneficial to avoid invalidating all of the process's TLB entries.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/f6f27951fdf84a6edca3ea14077268

cheers

* Re: [v2, 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing
  2017-11-07  7:53 ` [PATCH v2 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing Nicholas Piggin
@ 2017-11-14 11:12   ` Michael Ellerman
  0 siblings, 0 replies; 13+ messages in thread
From: Michael Ellerman @ 2017-11-14 11:12 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K . V, Nicholas Piggin

On Tue, 2017-11-07 at 07:53:09 UTC, Nicholas Piggin wrote:
> Unmaps that free page tables always flush the entire PID, which is
> sub-optimal. Provide TLB range flushing with an additional PWC flush
> that can be used for va range invalidations that also need a PWC flush.
> 
>      Time to munmap N pages of memory including last level page table
>      teardown (after mmap, touch), local invalidate:
>      N           1       2      4      8     16     32     64
>      vanilla  3.2us  3.3us  3.4us  3.6us  4.1us  5.2us  7.2us
>      patched  1.4us  1.5us  1.7us  1.9us  2.6us  3.7us  6.2us
> 
>      Global invalidate:
>      N           1       2      4      8     16      32     64
>      vanilla  2.2us  2.3us  2.4us  2.6us  3.2us   4.1us  6.2us
>      patched  2.1us  2.5us  3.4us  5.2us  8.7us  15.7us  6.2us
> 
> Local invalidates get much better across the board. Global ones have
> the same issue where multiple tlbies for a va flush do get slower than
> the single tlbie to invalidate the PID. None of this testing captures
> the TLB benefits of avoiding killing everything.
> 
> Global gets worse, but it is brought into line with the global
> invalidate for munmap()s that do not free page tables.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/0b2f5a8a792755c88bd786f89712a9

cheers

Thread overview: 13+ messages
2017-11-07  7:53 [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements Nicholas Piggin
2017-11-07  7:53 ` [PATCH v2 1/7] powerpc/64s/radix: tlbie improve preempt handling Nicholas Piggin
2017-11-07  7:53 ` [PATCH v2 2/7] powerpc/64s/radix: Fix process table entry cache invalidation Nicholas Piggin
2017-11-07  7:53 ` [PATCH v2 3/7] powerpc/64s/radix: optimize TLB range flush barriers Nicholas Piggin
2017-11-14 11:12   ` [v2,3/7] " Michael Ellerman
2017-11-07  7:53 ` [PATCH v2 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions Nicholas Piggin
2017-11-14 11:12   ` [v2, " Michael Ellerman
2017-11-07  7:53 ` [PATCH v2 5/7] powerpc/64s/radix: Optimize flush_tlb_range Nicholas Piggin
2017-11-14 11:12   ` [v2,5/7] " Michael Ellerman
2017-11-07  7:53 ` [PATCH v2 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush Nicholas Piggin
2017-11-14 11:12   ` [v2, " Michael Ellerman
2017-11-07  7:53 ` [PATCH v2 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing Nicholas Piggin
2017-11-14 11:12   ` [v2, " Michael Ellerman
