* [PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
Resending, sorry for the noise.
Since the v1/RFC, I pulled the first two fix patches into this series
and rediffed it against the powerpc merge branch. I dropped the final
two patches, which were not yet fully agreed upon and baked.
Thanks,
Nick
Nicholas Piggin (7):
powerpc/64s/radix: tlbie improve preempt handling
powerpc/64s/radix: Fix process table entry cache invalidation
powerpc/64s/radix: optimize TLB range flush barriers
powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
powerpc/64s/radix: Optimize flush_tlb_range
powerpc/64s/radix: Introduce local single page ceiling for TLB range
flush
powerpc/64s/radix: Improve TLB flushing for page table freeing
arch/powerpc/include/asm/mmu_context.h | 4 +
arch/powerpc/mm/mmu_context_book3s64.c | 25 ++-
arch/powerpc/mm/tlb-radix.c | 318 ++++++++++++++++++++++++---------
3 files changed, 256 insertions(+), 91 deletions(-)
--
2.15.0
* [PATCH v2 1/7] powerpc/64s/radix: tlbie improve preempt handling
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
Preempt should be consistently disabled for mm_is_thread_local tests,
so bring the rest of these under preempt_disable().
Preempt does not need to be disabled for the mm->context.id tests,
which allows simplification and removal of gotos.
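In outline, each flush routine ends up with the following shape (a
condensed sketch of the pattern the diff below applies; surrounding
declarations and function context are omitted):

	unsigned long pid = mm->context.id;

	if (unlikely(pid == MMU_NO_CONTEXT))
		return;			/* no preempt_disable() needed for this test */

	preempt_disable();		/* mm_is_thread_local() needs a stable CPU */
	if (mm_is_thread_local(mm))
		_tlbiel_pid(pid, RIC_FLUSH_TLB);
	else
		_tlbie_pid(pid, RIC_FLUSH_TLB);
	preempt_enable();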
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/mm/tlb-radix.c | 50 ++++++++++++++++++++++-----------------------
1 file changed, 24 insertions(+), 26 deletions(-)
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index e2f15810b9c0..feeb96693aeb 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -186,16 +186,15 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
{
unsigned long pid;
- preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto no_context;
+ return;
+ preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_TLB);
else
_tlbiel_pid(pid, RIC_FLUSH_TLB);
-no_context:
preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -204,16 +203,15 @@ void radix__flush_all_mm(struct mm_struct *mm)
{
unsigned long pid;
- preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto no_context;
+ return;
+ preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbiel_pid(pid, RIC_FLUSH_ALL);
-no_context:
preempt_enable();
}
EXPORT_SYMBOL(radix__flush_all_mm);
@@ -230,15 +228,14 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
unsigned long pid;
unsigned long ap = mmu_get_ap(psize);
- preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto bail;
+ return;
+ preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
else
_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-bail:
preempt_enable();
}
@@ -322,15 +319,17 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
{
unsigned long pid;
unsigned long addr;
- int local = mm_is_thread_local(mm);
+ bool local;
unsigned long ap = mmu_get_ap(psize);
unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
- preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto err_out;
+ return;
+
+ preempt_disable();
+ local = mm_is_thread_local(mm);
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling * page_size) {
@@ -338,39 +337,38 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
- goto err_out;
+ } else {
+ for (addr = start; addr < end; addr += page_size) {
+ if (local)
+ _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+ else
+ _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+ }
}
- for (addr = start; addr < end; addr += page_size) {
- if (local)
- _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
- else
- _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
- }
-err_out:
preempt_enable();
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
{
- int local = mm_is_thread_local(mm);
+ bool local;
unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;
-
pid = mm->context.id;
- preempt_disable();
if (unlikely(pid == MMU_NO_CONTEXT))
- goto no_context;
+ return;
/* 4k page size, just blow the world */
if (PAGE_SIZE == 0x1000) {
radix__flush_all_mm(mm);
- preempt_enable();
return;
}
+ preempt_disable();
+ local = mm_is_thread_local(mm);
+
/* Otherwise first do the PWC */
if (local)
_tlbiel_pid(pid, RIC_FLUSH_PWC);
@@ -385,7 +383,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
-no_context:
+
preempt_enable();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--
2.15.0
* [PATCH v2 2/7] powerpc/64s/radix: Fix process table entry cache invalidation
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
According to the architecture, the process table entry cache must be
flushed with tlbie RIC=2.
Currently the process table entry is set to invalid right before the
PID is returned to the allocator, with no invalidation. This works on
existing implementations that are known to not cache the process table
entry for any except the current PIDR.
It is architecturally correct and cleaner to invalidate with RIC=2
after clearing the process table entry and before the PID is returned
to the allocator. This can be done in arch_exit_mmap, which runs
before the final flush, together with ensuring that the final flush
(fullmm) is always a RIC=2 variant.
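The resulting teardown ordering is roughly the following (a sketch of
the radix path, condensed from the diff below):

	/* exit_mmap() -> arch_exit_mmap(): clear the process table entry.
	 * No barrier is needed after the store, because this process does
	 * the invalidate itself, which begins with a ptesync.
	 */
	process_tb[mm->context.id].prtb0 = 0;

	/* The final "fullmm" flush in exit_mmap() now always takes the
	 * RIC=2 path, clearing the TLB, PWC and process table entry cache:
	 */
	radix__flush_all_mm(mm);

	/* destroy_context(): the PID goes back to the allocator only now. */
	__destroy_context(mm->context.id);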
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/mmu_context.h | 4 ++++
arch/powerpc/mm/mmu_context_book3s64.c | 25 ++++++++++++++++++++-----
arch/powerpc/mm/tlb-radix.c | 6 +++++-
3 files changed, 29 insertions(+), 6 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b4cdf574cf61..6177d43f0ce8 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -165,9 +165,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
{
}
+#ifndef CONFIG_PPC_BOOK3S_64
static inline void arch_exit_mmap(struct mm_struct *mm)
{
}
+#else
+extern void arch_exit_mmap(struct mm_struct *mm);
+#endif
static inline void arch_unmap(struct mm_struct *mm,
struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 05e15386d4cb..6d724dab27c2 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -216,19 +216,34 @@ void destroy_context(struct mm_struct *mm)
#ifdef CONFIG_SPAPR_TCE_IOMMU
WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
#endif
+ if (radix_enabled())
+ WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+ else
+ subpage_prot_free(mm);
+ destroy_pagetable_page(mm);
+ __destroy_context(mm->context.id);
+ mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
if (radix_enabled()) {
/*
* Radix doesn't have a valid bit in the process table
* entries. However we know that at least P9 implementation
* will avoid caching an entry with an invalid RTS field,
* and 0 is invalid. So this will do.
+ *
+ * This runs before the "fullmm" tlb flush in exit_mmap,
+ * which does a RIC=2 tlbie to clear the process table
+ * entry. See the "fullmm" comments in tlb-radix.c.
+ *
+ * No barrier required here after the store because
+ * this process will do the invalidate, which starts with
+ * ptesync.
*/
process_tb[mm->context.id].prtb0 = 0;
- } else
- subpage_prot_free(mm);
- destroy_pagetable_page(mm);
- __destroy_context(mm->context.id);
- mm->context.id = MMU_NO_CONTEXT;
+ }
}
#ifdef CONFIG_PPC_RADIX_MMU
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index feeb96693aeb..6e77ed2d7c6c 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -297,10 +297,14 @@ void radix__tlb_flush(struct mmu_gather *tlb)
psize = radix_get_mmu_psize(page_size);
/*
* if page size is not something we understand, do a full mm flush
+ *
+ * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+ * that flushes the process table entry cache upon process teardown.
+ * See the comment for radix in arch_exit_mmap().
*/
if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
- else if (tlb->need_flush_all) {
+ else if (tlb->fullmm || tlb->need_flush_all) {
tlb->need_flush_all = 0;
radix__flush_all_mm(mm);
} else
--
2.15.0
* [PATCH v2 3/7] powerpc/64s/radix: optimize TLB range flush barriers
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
Short range flushes issue a sequence of tlbie(l) instructions for
individual effective addresses. These do not all require individual
barrier sequences, only one covering all tlbie(l) instructions.
Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
made a similar optimization for tlbiel for PID flushing.
For tlbie, the ISA says:
The tlbsync instruction provides an ordering function for the
effects of all tlbie instructions executed by the thread executing
the tlbsync instruction, with respect to the memory barrier
created by a subsequent ptesync instruction executed by the same
thread.
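So the barriers can be hoisted out of the per-address loop. For the
global case, the pattern becomes (condensed from the diff below; ap
and pid come from the enclosing function):

	asm volatile("ptesync": : :"memory");
	for (addr = start; addr < end; addr += page_size)
		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);	/* no per-op barriers */
	asm volatile("eieio; tlbsync; ptesync": : :"memory");	/* one sequence for all */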
Time to munmap 30 pages of memory (after mmap, touch):
           local    global
vanilla   10.9us    22.3us
patched    3.4us    14.4us
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/mm/tlb-radix.c | 41 ++++++++++++++++++++++++++++++++---------
1 file changed, 32 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 6e77ed2d7c6c..49e71c68f5b1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
prs = 1; /* process scoped */
r = 1; /* raidx format */
- asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- asm volatile("ptesync": : :"memory");
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+ __tlbiel_va(va, pid, ap, ric);
+ asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
prs = 1; /* process scoped */
r = 1; /* raidx format */
- asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, ric);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+
/*
* Base TLB flushing operations:
*
@@ -341,13 +354,19 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
+
} else {
+ asm volatile("ptesync": : :"memory");
for (addr = start; addr < end; addr += page_size) {
if (local)
- _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+ __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
else
- _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+ __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
+ if (local)
+ asm volatile("ptesync": : :"memory");
+ else
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
preempt_enable();
@@ -380,6 +399,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
_tlbie_pid(pid, RIC_FLUSH_PWC);
/* Then iterate the pages */
+ asm volatile("ptesync": : :"memory");
end = addr + HPAGE_PMD_SIZE;
for (; addr < end; addr += PAGE_SIZE) {
if (local)
@@ -387,7 +407,10 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
-
+ if (local)
+ asm volatile("ptesync": : :"memory");
+ else
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
preempt_enable();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--
2.15.0
* [PATCH v2 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
Move the barriers and range iteration down into the _tlbie* level,
which improves readability.
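For example, the global range loop and its barriers collapse into a
single helper (as introduced by the diff below):

	static inline void _tlbie_va_range(unsigned long start, unsigned long end,
					unsigned long pid, unsigned long page_size,
					unsigned long psize)
	{
		unsigned long addr;
		unsigned long ap = mmu_get_ap(psize);

		asm volatile("ptesync": : :"memory");
		for (addr = start; addr < end; addr += page_size)
			__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
		asm volatile("eieio; tlbsync; ptesync": : :"memory");
	}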
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/mm/tlb-radix.c | 71 ++++++++++++++++++++++++++-------------------
1 file changed, 41 insertions(+), 30 deletions(-)
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 49e71c68f5b1..645a35b7bc9d 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -85,7 +85,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
}
static inline void __tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+ unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -101,13 +101,28 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
}
static inline void _tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
{
+ unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbiel_va(va, pid, ap, ric);
asm volatile("ptesync": : :"memory");
}
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ for (addr = start; addr < end; addr += page_size)
+ __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+ asm volatile("ptesync": : :"memory");
+}
+
static inline void __tlbie_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
@@ -125,13 +140,27 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
}
static inline void _tlbie_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
{
+ unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, ap, ric);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
/*
* Base TLB flushing operations:
@@ -174,12 +203,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd
int psize)
{
unsigned long pid;
- unsigned long ap = mmu_get_ap(psize);
preempt_disable();
pid = mm->context.id;
if (pid != MMU_NO_CONTEXT)
- _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
@@ -239,16 +267,15 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
unsigned long pid;
- unsigned long ap = mmu_get_ap(psize);
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
if (!mm_is_thread_local(mm))
- _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+ _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
else
- _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
@@ -335,9 +362,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long end, int psize)
{
unsigned long pid;
- unsigned long addr;
bool local;
- unsigned long ap = mmu_get_ap(psize);
unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
@@ -356,17 +381,10 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
_tlbie_pid(pid, RIC_FLUSH_TLB);
} else {
- asm volatile("ptesync": : :"memory");
- for (addr = start; addr < end; addr += page_size) {
- if (local)
- __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
- else
- __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
- }
if (local)
- asm volatile("ptesync": : :"memory");
+ _tlbiel_va_range(start, end, pid, page_size, psize);
else
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ _tlbie_va_range(start, end, pid, page_size, psize);
}
preempt_enable();
@@ -375,9 +393,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
{
- bool local;
- unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;
+ bool local;
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
@@ -399,18 +416,12 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
_tlbie_pid(pid, RIC_FLUSH_PWC);
/* Then iterate the pages */
- asm volatile("ptesync": : :"memory");
end = addr + HPAGE_PMD_SIZE;
- for (; addr < end; addr += PAGE_SIZE) {
- if (local)
- _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
- else
- _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
- }
+
if (local)
- asm volatile("ptesync": : :"memory");
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
else
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
preempt_enable();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--
2.15.0
* [PATCH v2 5/7] powerpc/64s/radix: Optimize flush_tlb_range
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
Currently for radix, flush_tlb_range flushes the entire PID, because
the Linux mm code does not tell us about page size here for THP vs
regular pages. This is quite sub-optimal for small mremap / mprotect
/ change_protection.
So implement va range flushes with two flush passes, one for each
page size (regular and THP). The second flush has an order of
magnitude fewer tlbie instructions than the first, so it is a
relatively small additional cost.
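The two passes look like this for the local case (condensed from the
diff below, assuming CONFIG_TRANSPARENT_HUGEPAGE; hstart/hend bound
the 2M-aligned THP subrange, computed just above this point):

	asm volatile("ptesync": : :"memory");
	/* Pass 1: flush every small (mmu_virtual_psize) page in the range. */
	__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
	/* Pass 2: flush the 2M-aligned subrange for possible THP entries. */
	if (hflush)
		__tlbiel_va_range(hstart, hend, pid, HPAGE_PMD_SIZE, MMU_PAGE_2M);
	asm volatile("ptesync": : :"memory");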
There is still room for improvement here with some changes to generic
APIs: in particular, if mostly THP pages are being invalidated, the
small page flushes could be reduced.
Time to mprotect 1 page of memory (after mmap, touch):
           local    global
vanilla    2.9us     1.8us
patched    1.2us     1.6us

Time to mprotect 30 pages of memory (after mmap, touch):
           local    global
vanilla    8.2us     7.2us
patched    6.9us    17.9us

Time to mprotect 34 pages of memory (after mmap, touch):
           local    global
vanilla    9.1us     8.0us
patched    9.0us     8.0us
34 pages is the point at which the invalidation switches from va
flushes to flushing the entire PID, which tlbie can do in a single
instruction. This is why, in the case of 30 pages, the new code runs
slower for this test. It is a deliberate tradeoff already present in
the unmap and THP promotion code: the idea is that the benefit of
avoiding a flush of the entire TLB for this PID, on all threads in
the system, outweighs the extra cost of the per-page invalidations.
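The switch point corresponds directly to the ceiling check in the
diff below (tlb_single_page_flush_ceiling defaults to 33):

	full = (end == TLB_FLUSH_ALL ||
		nr_pages > tlb_single_page_flush_ceiling);
	/* 30 pages: per-page tlbie loop; 34 pages: one PID-wide tlbie. */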
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/mm/tlb-radix.c | 139 ++++++++++++++++++++++++++++++++------------
1 file changed, 101 insertions(+), 38 deletions(-)
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 645a35b7bc9d..277497be7aaf 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -100,6 +100,17 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
static inline void _tlbiel_va(unsigned long va, unsigned long pid,
unsigned long psize, unsigned long ric)
{
@@ -114,12 +125,8 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize)
{
- unsigned long addr;
- unsigned long ap = mmu_get_ap(psize);
-
asm volatile("ptesync": : :"memory");
- for (addr = start; addr < end; addr += page_size)
- __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+ __tlbiel_va_range(start, end, pid, page_size, psize);
asm volatile("ptesync": : :"memory");
}
@@ -139,6 +146,17 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
static inline void _tlbie_va(unsigned long va, unsigned long pid,
unsigned long psize, unsigned long ric)
{
@@ -153,12 +171,8 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize)
{
- unsigned long addr;
- unsigned long ap = mmu_get_ap(psize);
-
asm volatile("ptesync": : :"memory");
- for (addr = start; addr < end; addr += page_size)
- __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+ __tlbie_va_range(start, end, pid, page_size, psize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
@@ -299,17 +313,78 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+#define TLB_FLUSH_ALL -1UL
+
/*
- * Currently, for range flushing, we just do a full mm flush. Because
- * we use this in code path where we don' track the page size.
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
*/
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
+ unsigned long pid;
+ unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool local, full;
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
- radix__flush_tlb_mm(mm);
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ local = mm_is_thread_local(mm);
+ full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+
+ if (full) {
+ if (local)
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
+ } else {
+ bool hflush = false;
+ unsigned long hstart, hend;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
+ hend = end >> HPAGE_PMD_SHIFT;
+ if (hstart < hend) {
+ hstart <<= HPAGE_PMD_SHIFT;
+ hend <<= HPAGE_PMD_SHIFT;
+ hflush = true;
+ }
+#endif
+
+ asm volatile("ptesync": : :"memory");
+ if (local) {
+ __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbiel_va_range(hstart, hend, pid,
+ HPAGE_PMD_SIZE, MMU_PAGE_2M);
+ asm volatile("ptesync": : :"memory");
+ } else {
+ __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbie_va_range(hstart, hend, pid,
+ HPAGE_PMD_SIZE, MMU_PAGE_2M);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ }
+ }
+ preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_range);
@@ -351,19 +426,14 @@ void radix__tlb_flush(struct mmu_gather *tlb)
radix__flush_tlb_mm(mm);
}
-#define TLB_FLUSH_ALL -1UL
-/*
- * Number of pages above which we will do a bcast tlbie. Just a
- * number at this point copied from x86
- */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-
void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long end, int psize)
{
unsigned long pid;
- bool local;
- unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
+ unsigned int page_shift = mmu_psize_defs[psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool local, full;
pid = mm->context.id;
@@ -372,14 +442,13 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
preempt_disable();
local = mm_is_thread_local(mm);
+ full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling * page_size) {
+ if (full) {
if (local)
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
-
} else {
if (local)
_tlbiel_va_range(start, end, pid, page_size, psize);
@@ -394,7 +463,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
{
unsigned long pid, end;
- bool local;
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
@@ -406,22 +474,17 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
return;
}
- preempt_disable();
- local = mm_is_thread_local(mm);
-
- /* Otherwise first do the PWC */
- if (local)
- _tlbiel_pid(pid, RIC_FLUSH_PWC);
- else
- _tlbie_pid(pid, RIC_FLUSH_PWC);
-
- /* Then iterate the pages */
end = addr + HPAGE_PMD_SIZE;
- if (local)
+ /* Otherwise first do the PWC, then iterate the pages. */
+ preempt_disable();
+ if (mm_is_thread_local(mm)) {
+ _tlbiel_pid(pid, RIC_FLUSH_PWC);
_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
- else
+ } else {
+ _tlbie_pid(pid, RIC_FLUSH_PWC);
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+ }
preempt_enable();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--
2.15.0
* [PATCH v2 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
The single page flush ceiling is the cut-off point at which we switch
from invalidating individual pages, to invalidating the entire process
address space in response to a range flush.
Introduce a local variant of this heuristic because local and global
tlbie have significantly different properties:
- Local tlbiel requires 128 instructions to invalidate a PID, global
tlbie only 1 instruction.
- Global tlbie instructions are expensive broadcast operations.
The local ceiling has been made much higher, 2x the number of
instructions required to invalidate the entire PID (i.e., 256 pages).
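Condensed from the diff below, the ceiling selection becomes:

	static unsigned long tlb_local_single_page_flush_ceiling __read_mostly
		= POWER9_TLB_SETS_RADIX * 2;	/* 128 sets * 2 = 256 pages */

	if (mm_is_thread_local(mm)) {
		local = true;
		full = (end == TLB_FLUSH_ALL ||
			nr_pages > tlb_local_single_page_flush_ceiling);
	} else {
		local = false;
		full = (end == TLB_FLUSH_ALL ||
			nr_pages > tlb_single_page_flush_ceiling);	/* 33 */
	}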
Time to mprotect N pages of memory (after mmap, touch), local invalidate:
      N      32     34     64    128    256    512
vanilla   7.4us  9.0us 14.6us 26.4us 50.2us 98.3us
patched   7.4us  7.8us 13.8us 26.4us 51.9us 98.3us
The behaviour of both is identical at N=32 and N=512. In between,
the vanilla kernel does a PID invalidate and the patched kernel does
a va range invalidate.

At N=128, these require the same number of tlbiel instructions, so
the patched version can be seen to be cheaper below 128 pages, and
more expensive above. However, this does not capture the cost of the
TLB entries that a full-PID invalidation needlessly discards.
The additional cost at 256 pages does not seem prohibitive. It may
be the case that increasing the limit further would continue to be
beneficial to avoid invalidating all of the process's TLB entries.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/mm/tlb-radix.c | 23 +++++++++++++++++++----
1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 277497be7aaf..5842c98fbe48 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -325,6 +325,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
* individual page flushes to full-pid flushes.
*/
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
@@ -347,8 +348,15 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
return;
preempt_disable();
- local = mm_is_thread_local(mm);
- full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+ if (mm_is_thread_local(mm)) {
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
+ } else {
+ local = false;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_single_page_flush_ceiling);
+ }
if (full) {
if (local)
@@ -441,8 +449,15 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
return;
preempt_disable();
- local = mm_is_thread_local(mm);
- full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+ if (mm_is_thread_local(mm)) {
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
+ } else {
+ local = false;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_single_page_flush_ceiling);
+ }
if (full) {
if (local)
--
2.15.0
* [PATCH v2 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing
From: Nicholas Piggin @ 2017-11-07 7:53 UTC
To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K.V
Unmaps that free page tables always flush the entire PID, which is
sub-optimal. Provide TLB range flushing with an additional PWC flush
that can be used for va range invalidations that also require a PWC
flush.
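With the new also_pwc flag, the PWC flush shares the barrier sequence
of the va range flush rather than paying for its own (condensed from
the diff below, global case shown):

	asm volatile("ptesync": : :"memory");
	if (also_pwc)
		__tlbie_pid(pid, RIC_FLUSH_PWC);	/* no extra barrier cost */
	__tlbie_va_range(start, end, pid, page_size, psize);
	asm volatile("eieio; tlbsync; ptesync": : :"memory");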
Time to munmap N pages of memory including last level page table
teardown (after mmap, touch), local invalidate:
      N       1      2      4      8     16     32     64
vanilla   3.2us  3.3us  3.4us  3.6us  4.1us  5.2us  7.2us
patched   1.4us  1.5us  1.7us  1.9us  2.6us  3.7us  6.2us
Global invalidate:
      N       1      2      4      8     16     32     64
vanilla   2.2us  2.3us  2.4us  2.6us  3.2us  4.1us  6.2us
patched   2.1us  2.5us  3.4us  5.2us  8.7us 15.7us  6.2us
Local invalidates get much better across the board. Global ones have
the same issue, where multiple tlbie instructions for a va flush
become slower than the single tlbie needed to invalidate the entire
PID. None of this testing captures the TLB benefit of avoiding a
flush of everything.

Global gets worse, but it is brought into line with the global
invalidate for munmap()s that do not free page tables.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/mm/tlb-radix.c | 90 ++++++++++++++++++++++++++++++---------------
1 file changed, 61 insertions(+), 29 deletions(-)
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5842c98fbe48..078f7da11ce1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* raidx format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
@@ -70,18 +84,9 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
{
- unsigned long rb,rs,prs,r;
-
- rb = PPC_BIT(53); /* IS = 1 */
- rs = pid << PPC_BITLSHIFT(31);
- prs = 1; /* process scoped */
- r = 1; /* raidx format */
-
asm volatile("ptesync": : :"memory");
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ __tlbie_pid(pid, ric);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static inline void __tlbiel_va(unsigned long va, unsigned long pid,
@@ -123,9 +128,11 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
- unsigned long psize)
+ unsigned long psize, bool also_pwc)
{
asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
__tlbiel_va_range(start, end, pid, page_size, psize);
asm volatile("ptesync": : :"memory");
}
@@ -169,9 +176,11 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
static inline void _tlbie_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
- unsigned long psize)
+ unsigned long psize, bool also_pwc)
{
asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
__tlbie_va_range(start, end, pid, page_size, psize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
@@ -411,13 +420,15 @@ static int radix_get_mmu_psize(int page_size)
return psize;
}
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize);
+
void radix__tlb_flush(struct mmu_gather *tlb)
{
int psize = 0;
struct mm_struct *mm = tlb->mm;
int page_size = tlb->page_size;
- psize = radix_get_mmu_psize(page_size);
/*
* if page size is not something we understand, do a full mm flush
*
@@ -425,17 +436,28 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* that flushes the process table entry cache upon process teardown.
* See the comment for radix in arch_exit_mmap().
*/
- if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
- radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
- else if (tlb->fullmm || tlb->need_flush_all) {
- tlb->need_flush_all = 0;
+ if (tlb->fullmm) {
radix__flush_all_mm(mm);
- } else
- radix__flush_tlb_mm(mm);
+ } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+ if (!tlb->need_flush_all)
+ radix__flush_tlb_mm(mm);
+ else
+ radix__flush_all_mm(mm);
+ } else {
+ unsigned long start = tlb->start;
+ unsigned long end = tlb->end;
+
+ if (!tlb->need_flush_all)
+ radix__flush_tlb_range_psize(mm, start, end, psize);
+ else
+ radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+ }
+ tlb->need_flush_all = 0;
}
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize)
+static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ int psize, bool also_pwc)
{
unsigned long pid;
unsigned int page_shift = mmu_psize_defs[psize].shift;
@@ -461,19 +483,31 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
if (full) {
if (local)
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
else
- _tlbie_pid(pid, RIC_FLUSH_TLB);
+ _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
} else {
if (local)
- _tlbiel_va_range(start, end, pid, page_size, psize);
+ _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
else
- _tlbie_va_range(start, end, pid, page_size, psize);
+ _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
}
preempt_enable();
}
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ __radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
{
@@ -494,11 +528,9 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
/* Otherwise first do the PWC, then iterate the pages. */
preempt_disable();
if (mm_is_thread_local(mm)) {
- _tlbiel_pid(pid, RIC_FLUSH_PWC);
- _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
} else {
- _tlbie_pid(pid, RIC_FLUSH_PWC);
- _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+ _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
}
preempt_enable();
}
--
2.15.0
* Re: [v2,3/7] powerpc/64s/radix: optimize TLB range flush barriers
From: Michael Ellerman @ 2017-11-14 11:12 UTC
To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K.V, Nicholas Piggin
On Tue, 2017-11-07 at 07:53:05 UTC, Nicholas Piggin wrote:
> Short range flushes issue a sequence of tlbie(l) instructions for
> individual effective addresses. These do not all require individual
> barrier sequences, only one covering all tlbie(l) instructions.
>
> Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
> made a similar optimization for tlbiel for PID flushing.
>
> For tlbie, the ISA says:
>
> The tlbsync instruction provides an ordering function for the
> effects of all tlbie instructions executed by the thread executing
> the tlbsync instruction, with respect to the memory barrier
> created by a subsequent ptesync instruction executed by the same
> thread.
>
> Time to munmap 30 pages of memory (after mmap, touch):
>            local    global
> vanilla   10.9us    22.3us
> patched    3.4us    14.4us
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Applied to powerpc next, thanks.
https://git.kernel.org/powerpc/c/14001c60939a754717893672209160
cheers
* Re: [v2, 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
From: Michael Ellerman @ 2017-11-14 11:12 UTC
To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K.V, Nicholas Piggin
On Tue, 2017-11-07 at 07:53:06 UTC, Nicholas Piggin wrote:
> Move the barriers and range iteration down into the _tlbie* level,
> which improves readability.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Applied to powerpc next, thanks.
https://git.kernel.org/powerpc/c/d665767e39fa4a9e725f92d77ba206
cheers
* Re: [v2,5/7] powerpc/64s/radix: Optimize flush_tlb_range
From: Michael Ellerman @ 2017-11-14 11:12 UTC
To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K.V, Nicholas Piggin
On Tue, 2017-11-07 at 07:53:07 UTC, Nicholas Piggin wrote:
> Currently for radix, flush_tlb_range flushes the entire PID, because
> the Linux mm code does not tell us about page size here for THP vs
> regular pages. This is quite sub-optimal for small mremap / mprotect
> / change_protection.
>
> So implement va range flushes with two flush passes, one for each
> page size (regular and THP). The second flush has an order of
> magnitude fewer tlbie instructions than the first, so it is a
> relatively small additional cost.
>
> There is still room for improvement here with some changes to generic
> APIs: in particular, if mostly THP pages are being invalidated, the
> small page flushes could be reduced.
>
> Time to mprotect 1 page of memory (after mmap, touch):
>            local    global
> vanilla    2.9us     1.8us
> patched    1.2us     1.6us
>
> Time to mprotect 30 pages of memory (after mmap, touch):
>            local    global
> vanilla    8.2us     7.2us
> patched    6.9us    17.9us
>
> Time to mprotect 34 pages of memory (after mmap, touch):
>            local    global
> vanilla    9.1us     8.0us
> patched    9.0us     8.0us
>
> 34 pages is the point at which the invalidation switches from va
> flushes to flushing the entire PID, which tlbie can do in a single
> instruction. This is why, in the case of 30 pages, the new code runs
> slower for this test. It is a deliberate tradeoff already present in
> the unmap and THP promotion code: the idea is that the benefit of
> avoiding a flush of the entire TLB for this PID, on all threads in
> the system, outweighs the extra cost of the per-page invalidations.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Applied to powerpc next, thanks.
https://git.kernel.org/powerpc/c/cbf09c837720f72f5e63ab7a2d331e
cheers
* Re: [v2, 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush
From: Michael Ellerman @ 2017-11-14 11:12 UTC
To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K.V, Nicholas Piggin
On Tue, 2017-11-07 at 07:53:08 UTC, Nicholas Piggin wrote:
> The single page flush ceiling is the cut-off point at which we switch
> from invalidating individual pages, to invalidating the entire process
> address space in response to a range flush.
>
> Introduce a local variant of this heuristic because local and global
> tlbie have significantly different properties:
> - Local tlbiel requires 128 instructions to invalidate a PID, global
> tlbie only 1 instruction.
> - Global tlbie instructions are expensive broadcast operations.
>
> The local ceiling has been made much higher, 2x the number of
> instructions required to invalidate the entire PID (i.e., 256 pages).
>
> Time to mprotect N pages of memory (after mmap, touch), local invalidate:
>       N      32     34     64    128    256    512
> vanilla   7.4us  9.0us 14.6us 26.4us 50.2us 98.3us
> patched   7.4us  7.8us 13.8us 26.4us 51.9us 98.3us
>
> The behaviour of both is identical at N=32 and N=512. In between,
> the vanilla kernel does a PID invalidate and the patched kernel does
> a va range invalidate.
>
> At N=128, these require the same number of tlbiel instructions, so
> the patched version can be seen to be cheaper below 128 pages, and
> more expensive above. However, this does not capture the cost of the
> TLB entries that a full-PID invalidation needlessly discards.
>
> The additional cost at 256 pages does not seem prohibitive. It may
> be the case that increasing the limit further would continue to be
> beneficial to avoid invalidating all of the process's TLB entries.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Applied to powerpc next, thanks.
https://git.kernel.org/powerpc/c/f6f27951fdf84a6edca3ea14077268
cheers
* Re: [v2, 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing
From: Michael Ellerman @ 2017-11-14 11:12 UTC
To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K.V, Nicholas Piggin
On Tue, 2017-11-07 at 07:53:09 UTC, Nicholas Piggin wrote:
> Unmaps that free page tables always flush the entire PID, which is
> sub-optimal. Provide TLB range flushing with an additional PWC flush
> that can be used for va range invalidations that also require a PWC
> flush.
>
> Time to munmap N pages of memory including last level page table
> teardown (after mmap, touch), local invalidate:
>       N       1      2      4      8     16     32     64
> vanilla   3.2us  3.3us  3.4us  3.6us  4.1us  5.2us  7.2us
> patched   1.4us  1.5us  1.7us  1.9us  2.6us  3.7us  6.2us
>
> Global invalidate:
>       N       1      2      4      8     16     32     64
> vanilla   2.2us  2.3us  2.4us  2.6us  3.2us  4.1us  6.2us
> patched   2.1us  2.5us  3.4us  5.2us  8.7us 15.7us  6.2us
>
> Local invalidates get much better across the board. Global ones have
> the same issue, where multiple tlbie instructions for a va flush
> become slower than the single tlbie needed to invalidate the entire
> PID. None of this testing captures the TLB benefit of avoiding a
> flush of everything.
>
> Global gets worse, but it is brought into line with the global
> invalidate for munmap()s that do not free page tables.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Applied to powerpc next, thanks.
https://git.kernel.org/powerpc/c/0b2f5a8a792755c88bd786f89712a9
cheers