* [PATCH v3 1/2] powerpc/mm: Export flush_all_mm()
From: Frederic Barrat @ 2017-09-03 18:15 UTC
  To: mpe, linuxppc-dev, benh, andrew.donnellan, clombard, vaibhav; +Cc: alistair

With the optimizations introduced by commit a46cc7a90fd8
("powerpc/mm/radix: Improve TLB/PWC flushes"), flush_tlb_mm() no
longer flushes the page walk cache with radix. This patch introduces
flush_all_mm(), which flushes everything, tlb and pwc, for a given mm.
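
For reference, the radix flush primitives take a RIC (radix invalidation
control) argument selecting what gets invalidated. A sketch of the
mapping, with pid standing for the context id as in the helpers patched
below:

	_tlbie_pid(pid, RIC_FLUSH_TLB);	/* TLB entries only */
	_tlbie_pid(pid, RIC_FLUSH_PWC);	/* page walk cache only */
	_tlbie_pid(pid, RIC_FLUSH_ALL);	/* both, what flush_all_mm() needs */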

Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
---
Changelog:
v3: add comment to explain limitations on hash
v2: this patch is new

 arch/powerpc/include/asm/book3s/64/tlbflush-hash.h  | 20 ++++++++++++++++++++
 arch/powerpc/include/asm/book3s/64/tlbflush-radix.h |  3 +++
 arch/powerpc/include/asm/book3s/64/tlbflush.h       | 15 +++++++++++++++
 arch/powerpc/mm/tlb-radix.c                         |  6 ++++--
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 2f6373144e2c..2ac45cf85042 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -65,6 +65,26 @@ static inline void hash__flush_tlb_mm(struct mm_struct *mm)
 {
 }
 
+static inline void hash__local_flush_all_mm(struct mm_struct *mm)
+{
+	/*
+	 * There's no Page Walk Cache for hash, so what is needed is
+	 * the same as flush_tlb_mm(), which doesn't really make sense
+	 * with hash. So the only thing we could do is flush the
+	 * entire LPID! Punt for now, as it's not being used.
+	 */
+}
+
+static inline void hash__flush_all_mm(struct mm_struct *mm)
+{
+	/*
+	 * There's no Page Walk Cache for hash, so what is needed is
+	 * the same as flush_tlb_mm(), which doesn't really make sense
+	 * with hash. So the only thing we could do is flush the
+	 * entire LPID! Punt for now, as it's not being used.
+	 */
+}
+
 static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma,
 					  unsigned long vmaddr)
 {
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 9b433a624bf3..af06c6fe8a9f 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -21,17 +21,20 @@ extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long sta
 extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
+extern void radix__local_flush_all_mm(struct mm_struct *mm);
 extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 					      int psize);
 extern void radix__tlb_flush(struct mmu_gather *tlb);
 #ifdef CONFIG_SMP
 extern void radix__flush_tlb_mm(struct mm_struct *mm);
+extern void radix__flush_all_mm(struct mm_struct *mm);
 extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 					int psize);
 #else
 #define radix__flush_tlb_mm(mm)		radix__local_flush_tlb_mm(mm)
+#define radix__flush_all_mm(mm)		radix__local_flush_all_mm(mm)
 #define radix__flush_tlb_page(vma,addr)	radix__local_flush_tlb_page(vma,addr)
 #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p)
 #endif
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 72b925f97bab..70760d018bcd 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -57,6 +57,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma,
 	return hash__local_flush_tlb_page(vma, vmaddr);
 }
 
+static inline void local_flush_all_mm(struct mm_struct *mm)
+{
+	if (radix_enabled())
+		return radix__local_flush_all_mm(mm);
+	return hash__local_flush_all_mm(mm);
+}
+
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
 	if (radix_enabled())
@@ -79,9 +86,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 		return radix__flush_tlb_page(vma, vmaddr);
 	return hash__flush_tlb_page(vma, vmaddr);
 }
+
+static inline void flush_all_mm(struct mm_struct *mm)
+{
+	if (radix_enabled())
+		return radix__flush_all_mm(mm);
+	return hash__flush_all_mm(mm);
+}
 #else
 #define flush_tlb_mm(mm)		local_flush_tlb_mm(mm)
 #define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
+#define flush_all_mm(mm)		local_flush_all_mm(mm)
 #endif /* CONFIG_SMP */
 /*
  * flush the page walk cache for the address
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b3e849c4886e..5a1f46eff3a2 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -144,7 +144,7 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm)
 EXPORT_SYMBOL(radix__local_flush_tlb_mm);
 
 #ifndef CONFIG_SMP
-static void radix__local_flush_all_mm(struct mm_struct *mm)
+void radix__local_flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
@@ -154,6 +154,7 @@ static void radix__local_flush_all_mm(struct mm_struct *mm)
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 	preempt_enable();
 }
+EXPORT_SYMBOL(radix__local_flush_all_mm);
 #endif /* CONFIG_SMP */
 
 void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
@@ -200,7 +201,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
 
-static void radix__flush_all_mm(struct mm_struct *mm)
+void radix__flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
@@ -216,6 +217,7 @@ static void radix__flush_all_mm(struct mm_struct *mm)
 no_context:
 	preempt_enable();
 }
+EXPORT_SYMBOL(radix__flush_all_mm);
 
 void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 {
-- 
2.11.0

* [PATCH v3 2/2] cxl: Enable global TLBIs for cxl contexts
From: Frederic Barrat @ 2017-09-03 18:15 UTC
  To: mpe, linuxppc-dev, benh, andrew.donnellan, clombard, vaibhav; +Cc: alistair

The PSL and nMMU need to see all TLB invalidations for the memory
contexts used on the adapter. For the hash memory model, it is done by
making all TLBIs global as soon as the cxl driver is in use. For
radix, we need something similar, but we can refine and only convert
to global the invalidations for contexts actually used by the device.

The new mm_context_add_copro() API increments the 'active_cpus' count
for the contexts attached to the cxl adapter. As soon as there's more
than 1 active cpu, the TLBIs for the context become global. Active cpu
count must be decremented when detaching to restore locality if
possible and to avoid overflowing the counter.

The hash memory model support is somewhat limited, as we can't
decrement the active cpus count when mm_context_remove_copro() is
called, because we can't flush the TLB for a mm on hash. So TLBIs
remain global on hash.
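
To illustrate, a simplified sketch of how the count can gate the
local/global decision in the radix flush path (not the exact kernel
helper):

	static bool mm_is_thread_local(struct mm_struct *mm)
	{
		/* a copro attached to the context counts as another user */
		if (atomic_read(&mm->context.active_cpus) > 1)
			return false;
		return cpumask_equal(mm_cpumask(mm),
				     cpumask_of(smp_processor_id()));
	}

The radix flush functions can then use tlbiel (local) when the mm is
confined to a single cpu, and a global tlbie broadcast otherwise.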

Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Fixes: f24be42aab37 ("cxl: Add psl9 specific code")
---
Changelog:
v3: don't decrement active cpus count with hash, as we don't know how to flush
v2: Replace flush_tlb_mm() by the new flush_all_mm() to flush the TLBs
and PWCs (thanks to Ben)

 arch/powerpc/include/asm/mmu_context.h | 46 ++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/mmu_context.c          |  9 -------
 drivers/misc/cxl/api.c                 | 22 +++++++++++++---
 drivers/misc/cxl/context.c             |  3 +++
 drivers/misc/cxl/file.c                | 19 ++++++++++++--
 5 files changed, 85 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 309592589e30..a0d7145d6cd2 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -77,6 +77,52 @@ extern void switch_cop(struct mm_struct *next);
 extern int use_cop(unsigned long acop, struct mm_struct *mm);
 extern void drop_cop(unsigned long acop, struct mm_struct *mm);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline void inc_mm_active_cpus(struct mm_struct *mm)
+{
+	atomic_inc(&mm->context.active_cpus);
+}
+
+static inline void dec_mm_active_cpus(struct mm_struct *mm)
+{
+	atomic_dec(&mm->context.active_cpus);
+}
+
+static inline void mm_context_add_copro(struct mm_struct *mm)
+{
+	/*
+	 * On hash, should only be called once over the lifetime of
+	 * the context, as we can't decrement the active cpus count
+	 * and flush properly for the time being.
+	 */
+	inc_mm_active_cpus(mm);
+}
+
+static inline void mm_context_remove_copro(struct mm_struct *mm)
+{
+	/*
+	 * Need to broadcast a global flush of the full mm before
+	 * decrementing active_cpus count, as the next TLBI may be
+	 * local and the nMMU and/or PSL need to be cleaned up.
+	 * Should be rare enough so that it's acceptable.
+	 *
+	 * Skip on hash, as we don't know how to do the proper flush
+	 * for the time being. Invalidations will remain global if
+	 * used on hash.
+	 */
+	if (radix_enabled()) {
+		flush_all_mm(mm);
+		dec_mm_active_cpus(mm);
+	}
+}
+#else
+static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
+static inline void dec_mm_active_cpus(struct mm_struct *mm) { }
+static inline void mm_context_add_copro(struct mm_struct *mm) { }
+static inline void mm_context_remove_copro(struct mm_struct *mm) { }
+#endif
+
+
 extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			       struct task_struct *tsk);
 
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 0f613bc63c50..d60a62bf4fc7 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -34,15 +34,6 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
 				   struct mm_struct *mm) { }
 #endif
 
-#ifdef CONFIG_PPC_BOOK3S_64
-static inline void inc_mm_active_cpus(struct mm_struct *mm)
-{
-	atomic_inc(&mm->context.active_cpus);
-}
-#else
-static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
-#endif
-
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index a0c44d16bf30..1137a2cc1d3e 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
 
 #include "cxl.h"
 
@@ -331,9 +332,12 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
 		/* ensure this mm_struct can't be freed */
 		cxl_context_mm_count_get(ctx);
 
-		/* decrement the use count */
-		if (ctx->mm)
+		if (ctx->mm) {
+			/* decrement the use count from above */
 			mmput(ctx->mm);
+			/* make TLBIs for this context global */
+			mm_context_add_copro(ctx->mm);
+		}
 	}
 
 	/*
@@ -342,13 +346,25 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
 	 */
 	cxl_ctx_get();
 
+	/*
+	 * Barrier is needed to make sure all TLBIs are global before
+	 * we attach and the context starts being used by the adapter.
+	 *
+	 * Needed after mm_context_add_copro() for radix and
+	 * cxl_ctx_get() for hash/p8
+	 */
+	smp_mb();
+
 	if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
 		put_pid(ctx->pid);
 		ctx->pid = NULL;
 		cxl_adapter_context_put(ctx->afu->adapter);
 		cxl_ctx_put();
-		if (task)
+		if (task) {
 			cxl_context_mm_count_put(ctx);
+			if (ctx->mm)
+				mm_context_remove_copro(ctx->mm);
+		}
 		goto out;
 	}
 
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 8c32040b9c09..12a41b2753f0 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
 #include <asm/cputable.h>
 #include <asm/current.h>
 #include <asm/copro.h>
@@ -267,6 +268,8 @@ int __detach_context(struct cxl_context *ctx)
 
 	/* Decrease the mm count on the context */
 	cxl_context_mm_count_put(ctx);
+	if (ctx->mm)
+		mm_context_remove_copro(ctx->mm);
 	ctx->mm = NULL;
 
 	return 0;
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 4bfad9f6dc9f..84b801b5d0e5 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
 #include <asm/cputable.h>
 #include <asm/current.h>
 #include <asm/copro.h>
@@ -220,9 +221,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 	/* ensure this mm_struct can't be freed */
 	cxl_context_mm_count_get(ctx);
 
-	/* decrement the use count */
-	if (ctx->mm)
+	if (ctx->mm) {
+		/* decrement the use count from above */
 		mmput(ctx->mm);
+		/* make TLBIs for this context global */
+		mm_context_add_copro(ctx->mm);
+	}
 
 	/*
 	 * Increment driver use count. Enables global TLBIs for hash
@@ -230,6 +234,15 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 	 */
 	cxl_ctx_get();
 
+	/*
+	 * Barrier is needed to make sure all TLBIs are global before
+	 * we attach and the context starts being used by the adapter.
+	 *
+	 * Needed after mm_context_add_copro() for radix and
+	 * cxl_ctx_get() for hash/p8
+	 */
+	smp_mb();
+
 	trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
 
 	if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor,
@@ -240,6 +253,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 		ctx->pid = NULL;
 		cxl_ctx_put();
 		cxl_context_mm_count_put(ctx);
+		if (ctx->mm)
+			mm_context_remove_copro(ctx->mm);
 		goto out;
 	}
 
-- 
2.11.0

* Re: [PATCH v3 2/2] cxl: Enable global TLBIs for cxl contexts
From: Nicholas Piggin @ 2017-09-08  6:56 UTC
  To: Frederic Barrat
  Cc: mpe, linuxppc-dev, benh, andrew.donnellan, clombard, vaibhav, alistair

On Sun,  3 Sep 2017 20:15:13 +0200
Frederic Barrat <fbarrat@linux.vnet.ibm.com> wrote:

> The PSL and nMMU need to see all TLB invalidations for the memory
> contexts used on the adapter. For the hash memory model, it is done by
> making all TLBIs global as soon as the cxl driver is in use. For
> radix, we need something similar, but we can refine and only convert
> to global the invalidations for contexts actually used by the device.
> 
> The new mm_context_add_copro() API increments the 'active_cpus' count
> for the contexts attached to the cxl adapter. As soon as there's more
> than 1 active cpu, the TLBIs for the context become global. Active cpu
> count must be decremented when detaching to restore locality if
> possible and to avoid overflowing the counter.
> 
> The hash memory model support is somewhat limited, as we can't
> decrement the active cpus count when mm_context_remove_copro() is
> called, because we can't flush the TLB for a mm on hash. So TLBIs
> remain global on hash.

Sorry I didn't look at this earlier and just wading in here a bit, but
what do you think of using mmu notifiers for invalidating nMMU and
coprocessor caches, rather than put the details into the host MMU
management? npu-dma.c already looks to have almost everything covered
with its notifiers (in that it wouldn't have to rely on tlbie coming
from host MMU code).
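
For concreteness, a minimal sketch of the shape such a notifier could
take with the current mmu_notifier API (the copro_* names are
hypothetical, for illustration only):

	/* push each invalidation to the nMMU/coprocessor */
	static void copro_invalidate_range(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
	{
		/* device-specific invalidation of [start, end) goes here */
	}

	static const struct mmu_notifier_ops copro_mn_ops = {
		.invalidate_range = copro_invalidate_range,
	};

with mn->ops = &copro_mn_ops and mmu_notifier_register(mn, mm) done at
attach time.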

This change is not too bad today, but if we get to more complicated
MMU/nMMU TLB management like directed invalidation of particular units,
then putting more knowledge into the host code will end up being
complex I think.

I also want to do optimizations on the core code that assumes we
only have to take care of other CPUs, e.g.,

https://patchwork.ozlabs.org/patch/811068/

Or, another example, directed IPI invalidations from the mm_cpumask
bitmap.
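
That is, something along these lines with the generic on_each_cpu_mask()
helper (do_tlbiel_mm being a hypothetical callback):

	static void do_tlbiel_mm(void *info)
	{
		struct mm_struct *mm = info;

		/* local flush on each cpu that actually ran this mm */
		_tlbiel_pid(mm->context.id, RIC_FLUSH_TLB);
	}

	on_each_cpu_mask(mm_cpumask(mm), do_tlbiel_mm, mm, 1);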

I realize you want to get something merged! For the merge window and
backports this seems fine. I think it would be nice soon afterwards to
get nMMU knowledge out of the core code... Though I also realize with
our tlbie instruction that does everything then it may be tricky to
make a really optimal notifier.

Thanks,
Nick

> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
> Fixes: f24be42aab37 ("cxl: Add psl9 specific code")
> ---
> Changelog:
> v3: don't decrement active cpus count with hash, as we don't know how to flush
> v2: Replace flush_tlb_mm() by the new flush_all_mm() to flush the TLBs
> and PWCs (thanks to Ben)

* Re: [PATCH v3 2/2] cxl: Enable global TLBIs for cxl contexts
From: Frederic Barrat @ 2017-09-08  7:34 UTC
  To: Nicholas Piggin
  Cc: mpe, linuxppc-dev, benh, andrew.donnellan, clombard, vaibhav, alistair



On 08/09/2017 at 08:56, Nicholas Piggin wrote:
> On Sun,  3 Sep 2017 20:15:13 +0200
> Frederic Barrat <fbarrat@linux.vnet.ibm.com> wrote:
> 
>> The PSL and nMMU need to see all TLB invalidations for the memory
>> contexts used on the adapter. For the hash memory model, it is done by
>> making all TLBIs global as soon as the cxl driver is in use. For
>> radix, we need something similar, but we can refine and only convert
>> to global the invalidations for contexts actually used by the device.
>>
>> The new mm_context_add_copro() API increments the 'active_cpus' count
>> for the contexts attached to the cxl adapter. As soon as there's more
>> than 1 active cpu, the TLBIs for the context become global. Active cpu
>> count must be decremented when detaching to restore locality if
>> possible and to avoid overflowing the counter.
>>
>> The hash memory model support is somewhat limited, as we can't
>> decrement the active cpus count when mm_context_remove_copro() is
>> called, because we can't flush the TLB for a mm on hash. So TLBIs
>> remain global on hash.
> 
> Sorry I didn't look at this earlier and just wading in here a bit, but
> what do you think of using mmu notifiers for invalidating nMMU and
> coprocessor caches, rather than put the details into the host MMU
> management? npu-dma.c already looks to have almost everything covered
> with its notifiers (in that it wouldn't have to rely on tlbie coming
> from host MMU code).

Does npu-dma.c really do mmio nMMU invalidations? My understanding was 
that those atsd_launch operations are really targeted at the device 
behind the NPU, i.e. the nvidia card.
At some point, it was not possible to do mmio invalidations on the nMMU, 
at least on dd1. I'm checking the status on dd2 with the nMMU team.

Alistair: is your code really doing an nMMU invalidation? Considering 
you're also trying to reuse mm_context_add_copro() from this patch, 
I think I know the answer.

There are also other components relying on broadcasted invalidations 
from hardware: the PSL (for capi FPGA) and the XSL on the Mellanox CX5 
card, when in capi mode. They rely on hardware TLBIs, snooped and 
forwarded to them by the CAPP.
For the PSL, we do have a mmio interface to do targeted invalidations, 
but it was removed from the capi architecture (and left as a debug 
feature for our PSL implementation), because the nMMU would be out of 
sync with the PSL (due to the lack of interface to sync the nMMU, as 
mentioned above).
For the XSL on the Mellanox CX5, it's even more complicated. AFAIK, they 
do have a way to trigger invalidations through software, though the 
interface is private and Mellanox would have to be involved. They've 
also stated the performance is much worse through software invalidation.

Another consideration is performance. Which is best? Short of having 
real numbers, it's probably hard to know for sure.

So the road of getting rid of hardware invalidations for external 
components, if at all possible or even desirable, may be long.

   Fred



> This change is not too bad today, but if we get to more complicated
> MMU/nMMU TLB management like directed invalidation of particular units,
> then putting more knowledge into the host code will end up being
> complex I think.
> 
> I also want to do optimizations on the core code that assumes we
> only have to take care of other CPUs, e.g.,
> 
> https://patchwork.ozlabs.org/patch/811068/
> 
> Or, another example, directed IPI invalidations from the mm_cpumask
> bitmap.
> 
> I realize you want to get something merged! For the merge window and
> backports this seems fine. I think it would be nice soon afterwards to
> get nMMU knowledge out of the core code... Though I also realize with
> our tlbie instruction that does everything then it may be tricky to
> make a really optimal notifier.
> 
> Thanks,
> Nick
> 
>>
>> Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
>> Fixes: f24be42aab37 ("cxl: Add psl9 specific code")
>> ---
>> Changelog:
>> v3: don't decrement active cpus count with hash, as we don't know how to flush
>> v2: Replace flush_tlb_mm() by the new flush_all_mm() to flush the TLBs
>> and PWCs (thanks to Ben)
> 

* Re: [PATCH v3 2/2] cxl: Enable global TLBIs for cxl contexts
From: Nicholas Piggin @ 2017-09-08 10:54 UTC
  To: Frederic Barrat
  Cc: mpe, linuxppc-dev, benh, andrew.donnellan, clombard, vaibhav, alistair

On Fri, 8 Sep 2017 09:34:39 +0200
Frederic Barrat <fbarrat@linux.vnet.ibm.com> wrote:

> On 08/09/2017 at 08:56, Nicholas Piggin wrote:
> > On Sun,  3 Sep 2017 20:15:13 +0200
> > Frederic Barrat <fbarrat@linux.vnet.ibm.com> wrote:
> >   
> >> The PSL and nMMU need to see all TLB invalidations for the memory
> >> contexts used on the adapter. For the hash memory model, it is done by
> >> making all TLBIs global as soon as the cxl driver is in use. For
> >> radix, we need something similar, but we can refine and only convert
> >> to global the invalidations for contexts actually used by the device.
> >>
> >> The new mm_context_add_copro() API increments the 'active_cpus' count
> >> for the contexts attached to the cxl adapter. As soon as there's more
> >> than 1 active cpu, the TLBIs for the context become global. Active cpu
> >> count must be decremented when detaching to restore locality if
> >> possible and to avoid overflowing the counter.
> >>
> >> The hash memory model support is somewhat limited, as we can't
> >> decrement the active cpus count when mm_context_remove_copro() is
> >> called, because we can't flush the TLB for a mm on hash. So TLBIs
> >> remain global on hash.  
> > 
> > Sorry I didn't look at this earlier and just wading in here a bit, but
> > what do you think of using mmu notifiers for invalidating nMMU and
> > coprocessor caches, rather than put the details into the host MMU
> > management? npu-dma.c already looks to have almost everything covered
> > with its notifiers (in that it wouldn't have to rely on tlbie coming
> > from host MMU code).  
> 
> Does npu-dma.c really do mmio nMMU invalidations?

No, but it does do a flush_tlb_mm there to do a tlbie (probably
buggy in some cases, as it does tlbiel without this patch of yours).
But the point is when you control the flushing you don't have to
mess with making the core flush code give you tlbies.

Just add a flush_nmmu_mm or something that does what you need.

If you can make a more targeted nMMU invalidate, then that's
even better.
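
A sketch of that suggestion, reusing the hypothetical flush_nmmu_mm name
from above (always broadcasting is the simplest possible behaviour):

	void flush_nmmu_mm(struct mm_struct *mm)
	{
		unsigned long pid = mm->context.id;

		/* always tlbie so the nMMU snoops the invalidation */
		if (pid != MMU_NO_CONTEXT)
			_tlbie_pid(pid, RIC_FLUSH_ALL);
	}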

One downside at first I thought is that the core code might already
do a broadcast tlbie, then the mmu notifier does not easily know
about that so it will do a second one which will be suboptimal.

Possibly we could add some flag or state so the nmmu flush can
avoid the second one.

But now that I look again, the NPU code has this comment:

        /*
         * Unfortunately the nest mmu does not support flushing specific
         * addresses so we have to flush the whole mm.
         */

Which seems to indicate that you can't rely on core code to give
you full flushes because for range flushing it is possible that the
core code will do it with address flushes. Or am I missing something?

So it seems you really do need to always issue a full PID tlbie from
a notifier.

> My understanding was 
> that those atsd_launch operations are really targeted at the device 
> behind the NPU, i.e. the nvidia card.
> At some point, it was not possible to do mmio invalidations on the nMMU. 
> At least on dd1. I'm checking with the nMMU team the status on dd2.
> 
> Alistair: is your code really doing a nMMU invalidation? Considering 
> you're trying to also reuse the mm_context_add_copro() from this patch, 
> I think I know the answer.
> 
> There are also other components relying on broadcasted invalidations 
> from hardware: the PSL (for capi FPGA) and the XSL on the Mellanox CX5 
> card, when in capi mode. They rely on hardware TLBIs, snooped and 
> forwarded to them by the CAPP.
> For the PSL, we do have a mmio interface to do targeted invalidations, 
> but it was removed from the capi architecture (and left as a debug 
> feature for our PSL implementation), because the nMMU would be out of 
> sync with the PSL (due to the lack of interface to sync the nMMU, as 
> mentioned above).
> For the XSL on the Mellanox CX5, it's even more complicated. AFAIK, they 
> do have a way to trigger invalidations through software, though the 
> interface is private and Mellanox would have to be involved. They've 
> also stated the performance is much worse through software invalidation.

Okay, point is I think the nMMU and agent drivers will be in a better
position to handle all that. I don't see that flushing from your notifier
means that you can't issue a tlbie to do it.

> 
> Another consideration is performance. Which is best? Short of having 
> real numbers, it's probably hard to know for sure.

Let's come to that if we agree on a way to go. I *think* we can make it
at least no worse than we have today, using tlbie and possibly some small
changes to generic code callers.

Thanks,
Nick

* Re: [PATCH v3 2/2] cxl: Enable global TLBIs for cxl contexts
From: Nicholas Piggin @ 2017-09-08 11:18 UTC
  To: Frederic Barrat
  Cc: mpe, linuxppc-dev, benh, andrew.donnellan, clombard, vaibhav, alistair

On Fri, 8 Sep 2017 20:54:02 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> On Fri, 8 Sep 2017 09:34:39 +0200
> Frederic Barrat <fbarrat@linux.vnet.ibm.com> wrote:
> 
> > On 08/09/2017 at 08:56, Nicholas Piggin wrote:
> > > On Sun,  3 Sep 2017 20:15:13 +0200
> > > Frederic Barrat <fbarrat@linux.vnet.ibm.com> wrote:
> > >     
> > >> The PSL and nMMU need to see all TLB invalidations for the memory
> > >> contexts used on the adapter. For the hash memory model, it is done by
> > >> making all TLBIs global as soon as the cxl driver is in use. For
> > >> radix, we need something similar, but we can refine and only convert
> > >> to global the invalidations for contexts actually used by the device.
> > >>
> > >> The new mm_context_add_copro() API increments the 'active_cpus' count
> > >> for the contexts attached to the cxl adapter. As soon as there's more
> > >> than 1 active cpu, the TLBIs for the context become global. Active cpu
> > >> count must be decremented when detaching to restore locality if
> > >> possible and to avoid overflowing the counter.
> > >>
> > >> The hash memory model support is somewhat limited, as we can't
> > >> decrement the active cpus count when mm_context_remove_copro() is
> > >> called, because we can't flush the TLB for a mm on hash. So TLBIs
> > >> remain global on hash.    
> > > 
> > > Sorry I didn't look at this earlier and just wading in here a bit, but
> > > what do you think of using mmu notifiers for invalidating nMMU and
> > > coprocessor caches, rather than put the details into the host MMU
> > > management? npu-dma.c already looks to have almost everything covered
> > > with its notifiers (in that it wouldn't have to rely on tlbie coming
> > > from host MMU code).    
> > 
> > Does npu-dma.c really do mmio nMMU invalidations?  
> 
> No, but it does do a flush_tlb_mm there to do a tlbie (probably
> buggy in some cases and does tlbiel without this patch of yours).
> But the point is when you control the flushing you don't have to
> mess with making the core flush code give you tlbies.
> 
> Just add a flush_nmmu_mm or something that does what you need.
> 
> If you can make a more targeted nMMU invalidate, then that's
> even better.
> 
> One downside at first I thought is that the core code might already
> do a broadcast tlbie, then the mmu notifier does not easily know
> about that so it will do a second one which will be suboptimal.
> 
> Possibly we could add some flag or state so the nmmu flush can
> avoid the second one.
> 
> But now that I look again, the NPU code has this comment:
> 
>         /*
>          * Unfortunately the nest mmu does not support flushing specific
>          * addresses so we have to flush the whole mm.
>          */
> 
> Which seems to indicate that you can't rely on core code to give
> you full flushes because for range flushing it is possible that the
> core code will do it with address flushes. Or am I missing something?
> 
> So it seems you really do need to always issue a full PID tlbie from
> a notifier.

Oh I see, actually it's fixed in newer firmware and there's a patch
out for it.

Okay, so the nMMU can take address tlbies, so it's not a
correctness issue (except for old firmware that still has the bug).

* Re: [PATCH v3 2/2] cxl: Enable global TLBIs for cxl contexts
From: Alistair Popple @ 2017-09-13  3:53 UTC
  To: linuxppc-dev
  Cc: Nicholas Piggin, Frederic Barrat, clombard, vaibhav, andrew.donnellan

On Fri, 8 Sep 2017 04:56:24 PM Nicholas Piggin wrote:
> On Sun,  3 Sep 2017 20:15:13 +0200
> Frederic Barrat <fbarrat@linux.vnet.ibm.com> wrote:
> 
> > The PSL and nMMU need to see all TLB invalidations for the memory
> > contexts used on the adapter. For the hash memory model, it is done by
> > making all TLBIs global as soon as the cxl driver is in use. For
> > radix, we need something similar, but we can refine and only convert
> > to global the invalidations for contexts actually used by the device.
> > 
> > The new mm_context_add_copro() API increments the 'active_cpus' count
> > for the contexts attached to the cxl adapter. As soon as there's more
> > than 1 active cpu, the TLBIs for the context become global. Active cpu
> > count must be decremented when detaching to restore locality if
> > possible and to avoid overflowing the counter.
> > 
> > The hash memory model support is somewhat limited, as we can't
> > decrement the active cpus count when mm_context_remove_copro() is
> > called, because we can't flush the TLB for a mm on hash. So TLBIs
> > remain global on hash.
> 
> Sorry I didn't look at this earlier and just wading in here a bit, but
> what do you think of using mmu notifiers for invalidating nMMU and
> coprocessor caches, rather than put the details into the host MMU
> management? npu-dma.c already looks to have almost everything covered
> with its notifiers (in that it wouldn't have to rely on tlbie coming
> from host MMU code).

Sorry, just finding time to catch up on this. From subsequent emails it looks
like you may have figured this out. The TLB flush in npu-dma.c is a workaround
for a HW issue rather than there to explicitly manage the NMMU caches. The
intent for NPU was always to have the NMMU snoop normal core tlbies rather than
do it via notifiers. A subsequent patch series
(https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=1681) removes
this flush now that the HW issue has been worked around via a FW fix.

I agree this is something we could look into optimising in the medium term, but
for the moment it would be good if we could get this series merged.

- Alistair

* Re: [PATCH v3 2/2] cxl: Enable global TLBIs for cxl contexts
From: Alistair Popple @ 2017-09-13  3:58 UTC
  To: linuxppc-dev
  Cc: Frederic Barrat, mpe, benh, andrew.donnellan, clombard, vaibhav

I have tested the non-cxl specific parts
(mm_context_add_copro/mm_context_remove_copro) with this series -
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=1681 - and it
works well for npu.

Tested-by: Alistair Popple <alistair@popple.id.au>

On Sun, 3 Sep 2017 08:15:13 PM Frederic Barrat wrote:
> The PSL and nMMU need to see all TLB invalidations for the memory
> contexts used on the adapter. For the hash memory model, it is done by
> making all TLBIs global as soon as the cxl driver is in use. For
> radix, we need something similar, but we can refine and only convert
> to global the invalidations for contexts actually used by the device.
> 
> The new mm_context_add_copro() API increments the 'active_cpus' count
> for the contexts attached to the cxl adapter. As soon as there's more
> than 1 active cpu, the TLBIs for the context become global. Active cpu
> count must be decremented when detaching to restore locality if
> possible and to avoid overflowing the counter.
> 
> The hash memory model support is somewhat limited, as we can't
> decrement the active cpus count when mm_context_remove_copro() is
> called, because we can't flush the TLB for a mm on hash. So TLBIs
> remain global on hash.
> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
> Fixes: f24be42aab37 ("cxl: Add psl9 specific code")
> ---
> Changelog:
> v3: don't decrement active cpus count with hash, as we don't know how to flush
> v2: Replace flush_tlb_mm() by the new flush_all_mm() to flush the TLBs
> and PWCs (thanks to Ben)
> 
>  arch/powerpc/include/asm/mmu_context.h | 46 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/mm/mmu_context.c          |  9 -------
>  drivers/misc/cxl/api.c                 | 22 +++++++++++++---
>  drivers/misc/cxl/context.c             |  3 +++
>  drivers/misc/cxl/file.c                | 19 ++++++++++++--
>  5 files changed, 85 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index 309592589e30..a0d7145d6cd2 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -77,6 +77,52 @@ extern void switch_cop(struct mm_struct *next);
>  extern int use_cop(unsigned long acop, struct mm_struct *mm);
>  extern void drop_cop(unsigned long acop, struct mm_struct *mm);
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
> +static inline void inc_mm_active_cpus(struct mm_struct *mm)
> +{
> +	atomic_inc(&mm->context.active_cpus);
> +}
> +
> +static inline void dec_mm_active_cpus(struct mm_struct *mm)
> +{
> +	atomic_dec(&mm->context.active_cpus);
> +}
> +
> +static inline void mm_context_add_copro(struct mm_struct *mm)
> +{
> +	/*
> +	 * On hash, should only be called once over the lifetime of
> +	 * the context, as we can't decrement the active cpus count
> +	 * and flush properly for the time being.
> +	 */
> +	inc_mm_active_cpus(mm);
> +}
> +
> +static inline void mm_context_remove_copro(struct mm_struct *mm)
> +{
> +	/*
> +	 * Need to broadcast a global flush of the full mm before
> +	 * decrementing active_cpus count, as the next TLBI may be
> +	 * local and the nMMU and/or PSL need to be cleaned up.
> +	 * Should be rare enough so that it's acceptable.
> +	 *
> +	 * Skip on hash, as we don't know how to do the proper flush
> +	 * for the time being. Invalidations will remain global if
> +	 * used on hash.
> +	 */
> +	if (radix_enabled()) {
> +		flush_all_mm(mm);
> +		dec_mm_active_cpus(mm);
> +	}
> +}
> +#else
> +static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
> +static inline void dec_mm_active_cpus(struct mm_struct *mm) { }
> +static inline void mm_context_add_copro(struct mm_struct *mm) { }
> +static inline void mm_context_remove_copro(struct mm_struct *mm) { }
> +#endif
> +
> +
>  extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>  			       struct task_struct *tsk);
>  
> diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
> index 0f613bc63c50..d60a62bf4fc7 100644
> --- a/arch/powerpc/mm/mmu_context.c
> +++ b/arch/powerpc/mm/mmu_context.c
> @@ -34,15 +34,6 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
>  				   struct mm_struct *mm) { }
>  #endif
>  
> -#ifdef CONFIG_PPC_BOOK3S_64
> -static inline void inc_mm_active_cpus(struct mm_struct *mm)
> -{
> -	atomic_inc(&mm->context.active_cpus);
> -}
> -#else
> -static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
> -#endif
> -
>  void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>  			struct task_struct *tsk)
>  {
> diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
> index a0c44d16bf30..1137a2cc1d3e 100644
> --- a/drivers/misc/cxl/api.c
> +++ b/drivers/misc/cxl/api.c
> @@ -15,6 +15,7 @@
>  #include <linux/module.h>
>  #include <linux/mount.h>
>  #include <linux/sched/mm.h>
> +#include <linux/mmu_context.h>
>  
>  #include "cxl.h"
>  
> @@ -331,9 +332,12 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
>  		/* ensure this mm_struct can't be freed */
>  		cxl_context_mm_count_get(ctx);
>  
> -		/* decrement the use count */
> -		if (ctx->mm)
> +		if (ctx->mm) {
> +			/* decrement the use count from above */
>  			mmput(ctx->mm);
> +			/* make TLBIs for this context global */
> +			mm_context_add_copro(ctx->mm);
> +		}
>  	}
>  
>  	/*
> @@ -342,13 +346,25 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
>  	 */
>  	cxl_ctx_get();
>  
> +	/*
> +	 * Barrier is needed to make sure all TLBIs are global before
> +	 * we attach and the context starts being used by the adapter.
> +	 *
> +	 * Needed after mm_context_add_copro() for radix and
> +	 * cxl_ctx_get() for hash/p8
> +	 */
> +	smp_mb();
> +
>  	if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
>  		put_pid(ctx->pid);
>  		ctx->pid = NULL;
>  		cxl_adapter_context_put(ctx->afu->adapter);
>  		cxl_ctx_put();
> -		if (task)
> +		if (task) {
>  			cxl_context_mm_count_put(ctx);
> +			if (ctx->mm)
> +				mm_context_remove_copro(ctx->mm);
> +		}
>  		goto out;
>  	}
>  
> diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
> index 8c32040b9c09..12a41b2753f0 100644
> --- a/drivers/misc/cxl/context.c
> +++ b/drivers/misc/cxl/context.c
> @@ -18,6 +18,7 @@
>  #include <linux/slab.h>
>  #include <linux/idr.h>
>  #include <linux/sched/mm.h>
> +#include <linux/mmu_context.h>
>  #include <asm/cputable.h>
>  #include <asm/current.h>
>  #include <asm/copro.h>
> @@ -267,6 +268,8 @@ int __detach_context(struct cxl_context *ctx)
>  
>  	/* Decrease the mm count on the context */
>  	cxl_context_mm_count_put(ctx);
> +	if (ctx->mm)
> +		mm_context_remove_copro(ctx->mm);
>  	ctx->mm = NULL;
>  
>  	return 0;
> diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
> index 4bfad9f6dc9f..84b801b5d0e5 100644
> --- a/drivers/misc/cxl/file.c
> +++ b/drivers/misc/cxl/file.c
> @@ -19,6 +19,7 @@
>  #include <linux/mm.h>
>  #include <linux/slab.h>
>  #include <linux/sched/mm.h>
> +#include <linux/mmu_context.h>
>  #include <asm/cputable.h>
>  #include <asm/current.h>
>  #include <asm/copro.h>
> @@ -220,9 +221,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
>  	/* ensure this mm_struct can't be freed */
>  	cxl_context_mm_count_get(ctx);
>  
> -	/* decrement the use count */
> -	if (ctx->mm)
> +	if (ctx->mm) {
> +		/* decrement the use count from above */
>  		mmput(ctx->mm);
> +		/* make TLBIs for this context global */
> +		mm_context_add_copro(ctx->mm);
> +	}
>  
>  	/*
>  	 * Increment driver use count. Enables global TLBIs for hash
> @@ -230,6 +234,15 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
>  	 */
>  	cxl_ctx_get();
>  
> +	/*
> +	 * Barrier is needed to make sure all TLBIs are global before
> +	 * we attach and the context starts being used by the adapter.
> +	 *
> +	 * Needed after mm_context_add_copro() for radix and
> +	 * cxl_ctx_get() for hash/p8
> +	 */
> +	smp_mb();
> +
>  	trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
>  
>  	if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor,
> @@ -240,6 +253,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
>  		ctx->pid = NULL;
>  		cxl_ctx_put();
>  		cxl_context_mm_count_put(ctx);
> +		if (ctx->mm)
> +			mm_context_remove_copro(ctx->mm);
>  		goto out;
>  	}
>  
> 

* Re: [PATCH v3 1/2] powerpc/mm: Export flush_all_mm()
From: Alistair Popple @ 2017-09-13  4:04 UTC
  To: linuxppc-dev
  Cc: Frederic Barrat, mpe, benh, andrew.donnellan, clombard, vaibhav

> +static inline void hash__local_flush_all_mm(struct mm_struct *mm)
> +{
> +	/*
> +	 * There's no Page Walk Cache for hash, so what is needed is
> +	 * the same as flush_tlb_mm(), which doesn't really make sense
> +	 * with hash. So the only thing we could do is flush the
> +	 * entire LPID! Punt for now, as it's not being used.
> +	 */

Do you think it is worth putting a WARN_ON_ONCE here if we're asserting this
isn't used on hash?
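
That is, something like this in both stubs (illustration only):

	static inline void hash__flush_all_mm(struct mm_struct *mm)
	{
		/* no way to flush a single mm on hash, catch any user */
		WARN_ON_ONCE(1);
	}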

Otherwise looks good and is also needed for NPU.

Reviewed-By: Alistair Popple <alistair@popple.id.au>

> +}
> +
> +static inline void hash__flush_all_mm(struct mm_struct *mm)
> +{
> +	/*
> +	 * There's no Page Walk Cache for hash, so what is needed is
> +	 * the same as flush_tlb_mm(), which doesn't really make sense
> +	 * with hash. So the only thing we could do is flush the
> +	 * entire LPID! Punt for now, as it's not being used.
> +	 */
> +}
> +
>  static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma,
>  					  unsigned long vmaddr)
>  {
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index 9b433a624bf3..af06c6fe8a9f 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -21,17 +21,20 @@ extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long sta
>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
>  
>  extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
> +extern void radix__local_flush_all_mm(struct mm_struct *mm);
>  extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
>  extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
>  					      int psize);
>  extern void radix__tlb_flush(struct mmu_gather *tlb);
>  #ifdef CONFIG_SMP
>  extern void radix__flush_tlb_mm(struct mm_struct *mm);
> +extern void radix__flush_all_mm(struct mm_struct *mm);
>  extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
>  extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
>  					int psize);
>  #else
>  #define radix__flush_tlb_mm(mm)		radix__local_flush_tlb_mm(mm)
> +#define radix__flush_all_mm(mm)		radix__local_flush_all_mm(mm)
>  #define radix__flush_tlb_page(vma,addr)	radix__local_flush_tlb_page(vma,addr)
>  #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p)
>  #endif
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 72b925f97bab..70760d018bcd 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -57,6 +57,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma,
>  	return hash__local_flush_tlb_page(vma, vmaddr);
>  }
>  
> +static inline void local_flush_all_mm(struct mm_struct *mm)
> +{
> +	if (radix_enabled())
> +		return radix__local_flush_all_mm(mm);
> +	return hash__local_flush_all_mm(mm);
> +}
> +
>  static inline void tlb_flush(struct mmu_gather *tlb)
>  {
>  	if (radix_enabled())
> @@ -79,9 +86,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
>  		return radix__flush_tlb_page(vma, vmaddr);
>  	return hash__flush_tlb_page(vma, vmaddr);
>  }
> +
> +static inline void flush_all_mm(struct mm_struct *mm)
> +{
> +	if (radix_enabled())
> +		return radix__flush_all_mm(mm);
> +	return hash__flush_all_mm(mm);
> +}
>  #else
>  #define flush_tlb_mm(mm)		local_flush_tlb_mm(mm)
>  #define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
> +#define flush_all_mm(mm)		local_flush_all_mm(mm)
>  #endif /* CONFIG_SMP */
>  /*
>   * flush the page walk cache for the address
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index b3e849c4886e..5a1f46eff3a2 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -144,7 +144,7 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm)
>  EXPORT_SYMBOL(radix__local_flush_tlb_mm);
>  
>  #ifndef CONFIG_SMP
> -static void radix__local_flush_all_mm(struct mm_struct *mm)
> +void radix__local_flush_all_mm(struct mm_struct *mm)
>  {
>  	unsigned long pid;
>  
> @@ -154,6 +154,7 @@ static void radix__local_flush_all_mm(struct mm_struct *mm)
>  		_tlbiel_pid(pid, RIC_FLUSH_ALL);
>  	preempt_enable();
>  }
> +EXPORT_SYMBOL(radix__local_flush_all_mm);
>  #endif /* CONFIG_SMP */
>  
>  void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
> @@ -200,7 +201,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
>  }
>  EXPORT_SYMBOL(radix__flush_tlb_mm);
>  
> -static void radix__flush_all_mm(struct mm_struct *mm)
> +void radix__flush_all_mm(struct mm_struct *mm)
>  {
>  	unsigned long pid;
>  
> @@ -216,6 +217,7 @@ static void radix__flush_all_mm(struct mm_struct *mm)
>  no_context:
>  	preempt_enable();
>  }
> +EXPORT_SYMBOL(radix__flush_all_mm);
>  
>  void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
>  {
> 

* Re: [PATCH v3 1/2] powerpc/mm: Export flush_all_mm()
From: Frederic Barrat @ 2017-09-13  8:34 UTC
  To: Alistair Popple, linuxppc-dev
  Cc: mpe, benh, andrew.donnellan, clombard, vaibhav



On 13/09/2017 at 06:04, Alistair Popple wrote:
>> +static inline void hash__local_flush_all_mm(struct mm_struct *mm)
>> +{
>> +	/*
>> +	 * There's no Page Walk Cache for hash, so what is needed is
>> +	 * the same as flush_tlb_mm(), which doesn't really make sense
>> +	 * with hash. So the only thing we could do is flush the
>> +	 * entire LPID! Punt for now, as it's not being used.
>> +	 */
> 
> Do you think it is worth putting a WARN_ON_ONCE here if we're asserting this
> isn't used on hash?

I toyed with the idea. The reason I didn't add one is that 
hash__local_flush_tlb_mm() and hash__flush_tlb_mm() don't have one 
either, even though they're equally unsupported. And I had faith that 
a developer thinking about using it would see the comment.
I was actually pretty close to having hash__local_flush_all_mm() call 
hash__local_flush_tlb_mm() directly, since the "all" stands for "pwc and 
tlb" and pwc doesn't exist for hash. But that doesn't do us any good for 
the time being.

Michael: any preference?

Side note: I'm under the impression that flush_tlb_mm() may be called on 
hash, even though it does nothing: see kernel/fork.c's dup_mmap(), and 
potentially other callers (found through cscope).

   Fred


> Otherwise looks good and is also needed for NPU.
> 
> Reviewed-By: Alistair Popple <alistair@popple.id.au>
> 
>> +}
>> +
>> +static inline void hash__flush_all_mm(struct mm_struct *mm)
>> +{
>> +	/*
>> +	 * There's no Page Walk Cache for hash, so what is needed is
>> +	 * the same as flush_tlb_mm(), which doesn't really make sense
>> +	 * with hash. So the only thing we could do is flush the
>> +	 * entire LPID! Punt for now, as it's not being used.
>> +	 */
>> +}
>> +
>>   static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma,
>>   					  unsigned long vmaddr)
>>   {
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> index 9b433a624bf3..af06c6fe8a9f 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> @@ -21,17 +21,20 @@ extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long sta
>>   extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
>>   
>>   extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
>> +extern void radix__local_flush_all_mm(struct mm_struct *mm);
>>   extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
>>   extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
>>   					      int psize);
>>   extern void radix__tlb_flush(struct mmu_gather *tlb);
>>   #ifdef CONFIG_SMP
>>   extern void radix__flush_tlb_mm(struct mm_struct *mm);
>> +extern void radix__flush_all_mm(struct mm_struct *mm);
>>   extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
>>   extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
>>   					int psize);
>>   #else
>>   #define radix__flush_tlb_mm(mm)		radix__local_flush_tlb_mm(mm)
>> +#define radix__flush_all_mm(mm)		radix__local_flush_all_mm(mm)
>>   #define radix__flush_tlb_page(vma,addr)	radix__local_flush_tlb_page(vma,addr)
>>   #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p)
>>   #endif
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> index 72b925f97bab..70760d018bcd 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> @@ -57,6 +57,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma,
>>   	return hash__local_flush_tlb_page(vma, vmaddr);
>>   }
>>   
>> +static inline void local_flush_all_mm(struct mm_struct *mm)
>> +{
>> +	if (radix_enabled())
>> +		return radix__local_flush_all_mm(mm);
>> +	return hash__local_flush_all_mm(mm);
>> +}
>> +
>>   static inline void tlb_flush(struct mmu_gather *tlb)
>>   {
>>   	if (radix_enabled())
>> @@ -79,9 +86,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
>>   		return radix__flush_tlb_page(vma, vmaddr);
>>   	return hash__flush_tlb_page(vma, vmaddr);
>>   }
>> +
>> +static inline void flush_all_mm(struct mm_struct *mm)
>> +{
>> +	if (radix_enabled())
>> +		return radix__flush_all_mm(mm);
>> +	return hash__flush_all_mm(mm);
>> +}
>>   #else
>>   #define flush_tlb_mm(mm)		local_flush_tlb_mm(mm)
>>   #define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
>> +#define flush_all_mm(mm)		local_flush_all_mm(mm)
>>   #endif /* CONFIG_SMP */
>>   /*
>>    * flush the page walk cache for the address
>> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
>> index b3e849c4886e..5a1f46eff3a2 100644
>> --- a/arch/powerpc/mm/tlb-radix.c
>> +++ b/arch/powerpc/mm/tlb-radix.c
>> @@ -144,7 +144,7 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm)
>>   EXPORT_SYMBOL(radix__local_flush_tlb_mm);
>>   
>>   #ifndef CONFIG_SMP
>> -static void radix__local_flush_all_mm(struct mm_struct *mm)
>> +void radix__local_flush_all_mm(struct mm_struct *mm)
>>   {
>>   	unsigned long pid;
>>   
>> @@ -154,6 +154,7 @@ static void radix__local_flush_all_mm(struct mm_struct *mm)
>>   		_tlbiel_pid(pid, RIC_FLUSH_ALL);
>>   	preempt_enable();
>>   }
>> +EXPORT_SYMBOL(radix__local_flush_all_mm);
>>   #endif /* CONFIG_SMP */
>>   
>>   void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
>> @@ -200,7 +201,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
>>   }
>>   EXPORT_SYMBOL(radix__flush_tlb_mm);
>>   
>> -static void radix__flush_all_mm(struct mm_struct *mm)
>> +void radix__flush_all_mm(struct mm_struct *mm)
>>   {
>>   	unsigned long pid;
>>   
>> @@ -216,6 +217,7 @@ static void radix__flush_all_mm(struct mm_struct *mm)
>>   no_context:
>>   	preempt_enable();
>>   }
>> +EXPORT_SYMBOL(radix__flush_all_mm);
>>   
>>   void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
>>   {
>>
> 

* Re: [v3,1/2] powerpc/mm: Export flush_all_mm()
From: Michael Ellerman @ 2017-10-05  4:21 UTC
  To: Frederic Barrat, linuxppc-dev, benh, andrew.donnellan, clombard, vaibhav
  Cc: alistair

On Sun, 2017-09-03 at 18:15:12 UTC, Frederic Barrat wrote:
> With the optimizations introduced by commit a46cc7a90fd8
> ("powerpc/mm/radix: Improve TLB/PWC flushes"), flush_tlb_mm() no
> longer flushes the page walk cache with radix. This patch introduces
> flush_all_mm(), which flushes everything, tlb and pwc, for a given mm.
> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
> Reviewed-By: Alistair Popple <alistair@popple.id.au>

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/6110236b9bbd177debc045c5fc2922

cheers
