linux-mm.kvack.org archive mirror
* [PATCH 0/8] shoot lazy tlbs
@ 2020-11-28 16:01 Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb Nicholas Piggin
                   ` (7 more replies)
  0 siblings, 8 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

This is now rebased on top of Arnd's asm-generic tree, which has
removed most of the fluff from this patch series.

The x86 refactoring is still in the way a bit. I hope to get some
movement on that rather than rebase the main patches off it, because
I think it's a good cleanup. It could go in via a generic
mm/scheduler series if we get arch acks, because it's really just
refactoring wrappers.

The main result is reduced contention on the lazy tlb mm refcount,
which helps very big systems.

Thanks,
Nick

Nicholas Piggin (8):
  lazy tlb: introduce exit_lazy_tlb
  x86: use exit_lazy_tlb rather than
    membarrier_mm_sync_core_before_usermode
  x86: remove ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
  lazy tlb: introduce lazy mm refcount helper functions
  lazy tlb: allow lazy tlb mm switching to be configurable
  lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  powerpc: use lazy mm refcount helper functions
  powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN

 .../membarrier-sync-core/arch-support.txt     |  6 +-
 arch/Kconfig                                  | 24 +++++
 arch/arm/mach-rpc/ecard.c                     |  3 +-
 arch/powerpc/Kconfig                          |  1 +
 arch/powerpc/kernel/smp.c                     |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c          |  5 +-
 arch/x86/Kconfig                              |  1 -
 arch/x86/include/asm/mmu_context.h            | 27 ++++++
 arch/x86/kernel/alternative.c                 |  2 +-
 arch/x86/kernel/cpu/mce/core.c                |  2 +-
 drivers/misc/sgi-gru/grufault.c               |  2 +-
 drivers/misc/sgi-gru/gruhandles.c             |  2 +-
 drivers/misc/sgi-gru/grukservices.c           |  2 +-
 fs/exec.c                                     |  6 +-
 include/asm-generic/mmu_context.h             | 21 ++++
 include/linux/sched/mm.h                      | 34 ++++---
 include/linux/sync_core.h                     | 21 ----
 init/Kconfig                                  |  3 -
 kernel/cpu.c                                  |  6 +-
 kernel/exit.c                                 |  2 +-
 kernel/fork.c                                 | 53 ++++++++++
 kernel/kthread.c                              | 12 ++-
 kernel/sched/core.c                           | 97 +++++++++++++------
 kernel/sched/sched.h                          |  4 +-
 24 files changed, 247 insertions(+), 91 deletions(-)
 delete mode 100644 include/linux/sync_core.h

-- 
2.23.0




* [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  2020-11-29  0:38   ` Andy Lutomirski
  2020-11-28 16:01 ` [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode Nicholas Piggin
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

This is called at points where a lazy mm is switched away or made not
lazy (by its owner switching back).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm/mach-rpc/ecard.c            |  1 +
 arch/powerpc/mm/book3s64/radix_tlb.c |  1 +
 fs/exec.c                            |  6 ++++--
 include/asm-generic/mmu_context.h    | 21 +++++++++++++++++++++
 kernel/kthread.c                     |  1 +
 kernel/sched/core.c                  |  2 ++
 6 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..43eb1bfba466 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,6 +253,7 @@ static int ecard_init_mm(void)
 	current->mm = mm;
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
+	exit_lazy_tlb(active_mm, current);
 	mmdrop(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b487b489d4b6..ac3fec03926a 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -661,6 +661,7 @@ static void do_exit_flush_lazy_tlb(void *arg)
 		mmgrab(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
+		exit_lazy_tlb(mm, current);
 		mmdrop(mm);
 	}
 
diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..4b4dea1bb7ba 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1017,6 +1017,8 @@ static int exec_mmap(struct mm_struct *mm)
 	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
 		local_irq_enable();
 	activate_mm(active_mm, mm);
+	if (!old_mm)
+		exit_lazy_tlb(active_mm, tsk);
 	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
 		local_irq_enable();
 	tsk->mm->vmacache_seqnum = 0;
@@ -1028,9 +1030,9 @@ static int exec_mmap(struct mm_struct *mm)
 		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
-		return 0;
+	} else {
+		mmdrop(active_mm);
 	}
-	mmdrop(active_mm);
 	return 0;
 }
 
diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
index 91727065bacb..4626d0020e65 100644
--- a/include/asm-generic/mmu_context.h
+++ b/include/asm-generic/mmu_context.h
@@ -24,6 +24,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 }
 #endif
 
+/*
+ * exit_lazy_tlb - Called after switching away from a lazy TLB mode mm.
+ *
+ * mm:  the lazy mm context that was switched
+ * tsk: the task that was switched to (with a non-lazy mm)
+ *
+ * mm may equal tsk->mm.
+ * mm and tsk->mm will not be NULL.
+ *
+ * Note this is not symmetrical to enter_lazy_tlb; it is not
+ * called when tasks switch into the lazy mm, but after the
+ * lazy mm becomes non-lazy (either switched to a different mm or
+ * the owner of the mm returns).
+ */
+#ifndef exit_lazy_tlb
+static inline void exit_lazy_tlb(struct mm_struct *mm,
+			struct task_struct *tsk)
+{
+}
+#endif
+
 /**
  * init_new_context - Initialize context of a new mm_struct.
  * @tsk: task struct for the mm
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 933a625621b8..e380302aac13 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1250,6 +1250,7 @@ void kthread_use_mm(struct mm_struct *mm)
 	}
 	tsk->mm = mm;
 	switch_mm_irqs_off(active_mm, mm, tsk);
+	exit_lazy_tlb(active_mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
 #ifdef finish_arch_post_lock_switch
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7e453492cff..dcc46039ade5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3765,6 +3765,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
+			exit_lazy_tlb(prev->active_mm, next);
+
 			/* will mmdrop() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
-- 
2.23.0




* [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  2020-11-28 17:55   ` Andy Lutomirski
  2020-11-30 14:57   ` Mathieu Desnoyers
  2020-11-28 16:01 ` [PATCH 3/8] x86: remove ARCH_HAS_SYNC_CORE_BEFORE_USERMODE Nicholas Piggin
                   ` (5 subsequent siblings)
  7 siblings, 2 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

And get rid of the generic sync_core_before_usermode facility. This is
functionally a no-op in the core scheduler code, but it also catches

This helper is the wrong way around I think. The idea that membarrier
state requires a core sync before returning to user is the easy one
that does not need hiding behind membarrier calls. The gap in core
synchronization due to x86's sysret/sysexit and lazy tlb mode is the
tricky detail that is better put in x86 lazy tlb code.

Consider if an arch did not synchronize the core in switch_mm either:
then membarrier_mm_sync_core_before_usermode would be in the wrong
place, but arch specific mmu context functions would still be the
right place. There is also an exit_lazy_tlb case that is not covered
by this call, which could be a bug (a kthread uses the membarrier
process's mm, then context switches back to the process without an mm
switch or a lazy mm switch).

This makes lazy tlb code a bit more modular.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 .../membarrier-sync-core/arch-support.txt     |  6 ++++-
 arch/x86/include/asm/mmu_context.h            | 27 +++++++++++++++++++
 include/linux/sched/mm.h                      | 14 ----------
 kernel/cpu.c                                  |  4 ++-
 kernel/sched/core.c                           | 16 +++++------
 5 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 47e6903f47a5..0763a63a7097 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -5,6 +5,10 @@
 #
 # Architecture requirements
 #
+# If your architecture returns to user-space through non-core-serializing
+# instructions, you need to issue a core-serializing instruction in
+# switch_mm and exit_lazy_tlb (if lazy tlb switching is implemented).
+#
 # * arm/arm64/powerpc
 #
 # Rely on implicit context synchronization as a result of exception return
@@ -24,7 +28,7 @@
 # instead on write_cr3() performed by switch_mm() to provide core serialization
 # after changing the current mm, and deal with the special case of kthread ->
 # uthread (temporarily keeping current mm into active_mm) by issuing a
-# sync_core_before_usermode() in that specific case.
+# serializing instruction in exit_lazy_tlb() in that specific case.
 #
     -----------------------
     |         arch |status|
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 36afcbea6a9f..8094893254f1 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -6,12 +6,14 @@
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
 #include <linux/pkeys.h>
+#include <linux/sched/mm.h>
 
 #include <trace/events/tlb.h>
 
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/debugreg.h>
+#include <asm/sync_core.h>
 
 extern atomic64_t last_mm_ctx_id;
 
@@ -94,6 +96,31 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 #define enter_lazy_tlb enter_lazy_tlb
 extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
+#ifdef CONFIG_MEMBARRIER
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode, if a SYNC_CORE was requested. x86 implements return to
+ * user-space through sysexit, sysretl, and sysretq, which are not core
+ * serializing.
+ *
+ * See the membarrier comment in finish_task_switch as to why this is done
+ * in exit_lazy_tlb.
+ */
+#define exit_lazy_tlb exit_lazy_tlb
+static inline void exit_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	/* Switching mm is serializing with write_cr3 */
+	if (tsk->mm != mm)
+		return;
+
+	if (likely(!(atomic_read(&mm->membarrier_state) &
+		     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
+		return;
+
+	sync_core_before_usermode();
+}
+#endif
+
 /*
  * Init a new mm.  Used on mm copies, like at fork()
  * and on mm's that are brand-new, like at execve().
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..2c6bcdf76d99 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -7,7 +7,6 @@
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
-#include <linux/sync_core.h>
 
 /*
  * Routines for handling mm_structs
@@ -335,16 +334,6 @@ enum {
 #include <asm/membarrier.h>
 #endif
 
-static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
-{
-	if (current->mm != mm)
-		return;
-	if (likely(!(atomic_read(&mm->membarrier_state) &
-		     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
-		return;
-	sync_core_before_usermode();
-}
-
 extern void membarrier_exec_mmap(struct mm_struct *mm);
 
 #else
@@ -358,9 +347,6 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
 static inline void membarrier_exec_mmap(struct mm_struct *mm)
 {
 }
-static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
-{
-}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ff2578ecf17..134688d79589 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -572,7 +572,9 @@ static int finish_cpu(unsigned int cpu)
 
 	/*
 	 * idle_task_exit() will have switched to &init_mm, now
-	 * clean up any remaining active_mm state.
+	 * clean up any remaining active_mm state. exit_lazy_tlb
+	 * is not done here; if an arch did any accounting in these
+	 * functions, it would have to be added.
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dcc46039ade5..e4e8cebd82e2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3620,22 +3620,19 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	kcov_finish_switch(current);
 
 	fire_sched_in_preempt_notifiers(current);
+
 	/*
 	 * When switching through a kernel thread, the loop in
 	 * membarrier_{private,global}_expedited() may have observed that
 	 * kernel thread and not issued an IPI. It is therefore possible to
 	 * schedule between user->kernel->user threads without passing though
-	 * switch_mm(). Membarrier requires a barrier after storing to
-	 * rq->curr, before returning to userspace, so provide them here:
-	 *
-	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-	 *   provided by mmdrop(),
-	 * - a sync_core for SYNC_CORE.
+	 * switch_mm(). Membarrier requires a full barrier after storing to
+	 * rq->curr, before returning to userspace, for
+	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by mmdrop().
 	 */
-	if (mm) {
-		membarrier_mm_sync_core_before_usermode(mm);
+	if (mm)
 		mmdrop(mm);
-	}
+
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -6689,6 +6686,7 @@ void idle_task_exit(void)
 	BUG_ON(current != this_rq()->idle);
 
 	if (mm != &init_mm) {
+		/* enter_lazy_tlb is not done because we're about to go down */
 		switch_mm(mm, &init_mm, current);
 		finish_arch_post_lock_switch();
 	}
-- 
2.23.0




* [PATCH 3/8] x86: remove ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 4/8] lazy tlb: introduce lazy mm refcount helper functions Nicholas Piggin
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

Switch remaining x86-specific users to asm/sync_core.h, remove the
linux/sync_core.h header and ARCH_ option.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/x86/Kconfig                    |  1 -
 arch/x86/kernel/alternative.c       |  2 +-
 arch/x86/kernel/cpu/mce/core.c      |  2 +-
 drivers/misc/sgi-gru/grufault.c     |  2 +-
 drivers/misc/sgi-gru/gruhandles.c   |  2 +-
 drivers/misc/sgi-gru/grukservices.c |  2 +-
 include/linux/sync_core.h           | 21 ---------------------
 init/Kconfig                        |  3 ---
 8 files changed, 5 insertions(+), 30 deletions(-)
 delete mode 100644 include/linux/sync_core.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f6946b81f74a..160d3ad90507 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -80,7 +80,6 @@ config X86
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
-	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_DEBUG_WX
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2400ad62f330..9a7ab08f4157 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -17,7 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/mmu_context.h>
 #include <linux/bsearch.h>
-#include <linux/sync_core.h>
+#include <asm/sync_core.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4102b866e7c0..282ea9942829 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -41,12 +41,12 @@
 #include <linux/irq_work.h>
 #include <linux/export.h>
 #include <linux/set_memory.h>
-#include <linux/sync_core.h>
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 
 #include <asm/intel-family.h>
 #include <asm/processor.h>
+#include <asm/sync_core.h>
 #include <asm/traps.h>
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index 723825524ea0..48fd5b101de1 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -20,8 +20,8 @@
 #include <linux/io.h>
 #include <linux/uaccess.h>
 #include <linux/security.h>
-#include <linux/sync_core.h>
 #include <linux/prefetch.h>
+#include <asm/sync_core.h>
 #include "gru.h"
 #include "grutables.h"
 #include "grulib.h"
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
index 1d75d5e540bc..c8cba1c1b00f 100644
--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -16,7 +16,7 @@
 #define GRU_OPERATION_TIMEOUT	(((cycles_t) local_cpu_data->itc_freq)*10)
 #define CLKS2NSEC(c)		((c) *1000000000 / local_cpu_data->itc_freq)
 #else
-#include <linux/sync_core.h>
+#include <asm/sync_core.h>
 #include <asm/tsc.h>
 #define GRU_OPERATION_TIMEOUT	((cycles_t) tsc_khz*10*1000)
 #define CLKS2NSEC(c)		((c) * 1000000 / tsc_khz)
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index 0ea923fe6371..860aea9deb45 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -16,11 +16,11 @@
 #include <linux/miscdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
-#include <linux/sync_core.h>
 #include <linux/uaccess.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <asm/io_apic.h>
+#include <asm/sync_core.h>
 #include "gru.h"
 #include "grulib.h"
 #include "grutables.h"
diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
deleted file mode 100644
index 013da4b8b327..000000000000
--- a/include/linux/sync_core.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SYNC_CORE_H
-#define _LINUX_SYNC_CORE_H
-
-#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
-#include <asm/sync_core.h>
-#else
-/*
- * This is a dummy sync_core_before_usermode() implementation that can be used
- * on all architectures which return to user-space through core serializing
- * instructions.
- * If your architecture returns to user-space through non-core-serializing
- * instructions, you need to write your own functions.
- */
-static inline void sync_core_before_usermode(void)
-{
-}
-#endif
-
-#endif /* _LINUX_SYNC_CORE_H */
-
diff --git a/init/Kconfig b/init/Kconfig
index 02d13ae27abb..82f9b5c937cb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2334,9 +2334,6 @@ source "kernel/Kconfig.locks"
 config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	bool
 
-config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
-	bool
-
 # It may be useful for an architecture to override the definitions of the
 # SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h>
 # and the COMPAT_ variants in <linux/compat.h>, in particular to use a
-- 
2.23.0




* [PATCH 4/8] lazy tlb: introduce lazy mm refcount helper functions
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
                   ` (2 preceding siblings ...)
  2020-11-28 16:01 ` [PATCH 3/8] x86: remove ARCH_HAS_SYNC_CORE_BEFORE_USERMODE Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable Nicholas Piggin
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes the lazy tlb mm references more explicit, and allows the
refcounting to be removed if it is not used.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  2 +-
 include/linux/sched/mm.h             | 11 +++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 11 +++++++----
 kernel/sched/core.c                  | 15 ++++++++-------
 8 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 43eb1bfba466..a75938702c58 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -254,7 +254,7 @@ static int ecard_init_mm(void)
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
 	exit_lazy_tlb(active_mm, current);
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index ac3fec03926a..e66606ef2a3d 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -658,11 +658,11 @@ static void do_exit_flush_lazy_tlb(void *arg)
 	if (current->active_mm == mm) {
 		WARN_ON_ONCE(current->mm != NULL);
 		/* Is a kernel thread and is using mm as the lazy tlb */
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
 		exit_lazy_tlb(mm, current);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 
 	atomic_dec(&mm->context.active_cpus);
diff --git a/fs/exec.c b/fs/exec.c
index 4b4dea1bb7ba..0a1461bb62e2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1031,7 +1031,7 @@ static int exec_mmap(struct mm_struct *mm)
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
 	} else {
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 	}
 	return 0;
 }
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2c6bcdf76d99..7157c0f6fef8 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -48,6 +48,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 134688d79589..ff9fcbc4e76b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -578,7 +578,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..3711a74fcf4a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
 	}
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e380302aac13..f1241e19327e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1240,14 +1240,14 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	mmgrab(mm);
+
 	task_lock(tsk);
 	/* Hold off tlb flush IPIs while switching mm's */
 	local_irq_disable();
 	active_mm = tsk->active_mm;
-	if (active_mm != mm) {
-		mmgrab(mm);
+	if (active_mm != mm)
 		tsk->active_mm = mm;
-	}
 	tsk->mm = mm;
 	switch_mm_irqs_off(active_mm, mm, tsk);
 	exit_lazy_tlb(active_mm, tsk);
@@ -1258,7 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm)
 #endif
 
 	if (active_mm != mm)
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 
 	to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
@@ -1281,10 +1281,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	sync_mm_rss(mm);
 	local_irq_disable();
 	tsk->mm = NULL;
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e4e8cebd82e2..e372b613d514 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3628,10 +3628,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * schedule between user->kernel->user threads without passing though
 	 * switch_mm(). Membarrier requires a full barrier after storing to
 	 * rq->curr, before returning to userspace, for
-	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by mmdrop().
+	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by
+	 * mmdrop_lazy_tlb().
 	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
@@ -3736,9 +3737,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -3746,7 +3747,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -3764,7 +3765,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		if (!prev->mm) {                        // from kernel
 			exit_lazy_tlb(prev->active_mm, next);
 
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
@@ -7206,7 +7207,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
-- 
2.23.0




* [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
                   ` (3 preceding siblings ...)
  2020-11-28 16:01 ` [PATCH 4/8] lazy tlb: introduce lazy mm refcount helper functions Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  2020-11-29  0:36   ` Andy Lutomirski
  2020-11-28 16:01 ` [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

NOMMU systems could easily go without this and save a bit of code
and the refcount atomics, because their mm switch is a no-op. I
haven't flipped them over because I haven't audited all arch code to
convert over to using the _lazy_tlb refcounting.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig             | 11 +++++++
 include/linux/sched/mm.h | 13 ++++++--
 kernel/sched/core.c      | 68 +++++++++++++++++++++++++++++-----------
 kernel/sched/sched.h     |  4 ++-
 4 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 56b6ccc0e32d..596bf589d74b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,17 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 	  shootdowns should enable this.
 
+# Should make this depend on MMU, because there is little use for lazy mm switching
+# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
+config MMU_LAZY_TLB
+	def_bool y
+	help
+	  Enable "lazy TLB" mmu context switching for kernel threads.
+
+config MMU_LAZY_TLB_REFCOUNT
+	def_bool y
+	depends on MMU_LAZY_TLB
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 7157c0f6fef8..bd0f27402d4b 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -51,12 +51,21 @@ static inline void mmdrop(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+		mmdrop(mm);
+	} else {
+		/*
+		 * mmdrop_lazy_tlb must provide a full memory barrier, see the
+		 * membarrier comment finish_task_switch.
+		 */
+		smp_mb();
+	}
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e372b613d514..3b79c6cc3a37 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -3630,6 +3633,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, for
 	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by
 	 * mmdrop_lazy_tlb().
+	 *
+	 * This same issue applies to other places that mmdrop_lazy_tlb().
 	 */
 	if (mm)
 		mmdrop_lazy_tlb(mm);
@@ -3719,22 +3724,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
 {
-	prepare_task_switch(rq, prev, next);
-
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
 	/*
 	 * kernel -> kernel   lazy + transfer active
 	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -3765,11 +3758,50 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		if (!prev->mm) {                        // from kernel
 			exit_lazy_tlb(prev->active_mm, next);
 
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
 			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
+#else
+			/* See membarrier comment in finish_task_switch(). */
+			smp_mb();
+#endif
 		}
 	}
+}
+
+static __always_inline void
+context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	if (!next->mm)
+		next->active_mm = &init_mm;
+	membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+	switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+	if (!prev->mm)
+		prev->active_mm = NULL;
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next, struct rq_flags *rf)
+{
+	prepare_task_switch(rq, prev, next);
+
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		context_switch_mm(rq, prev, next);
+	else
+		context_switch_mm_nolazy(rq, prev, next);
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..3b72aec5a2f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -950,7 +950,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.23.0




* [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
                   ` (4 preceding siblings ...)
  2020-11-28 16:01 ` [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  2020-11-29  3:54   ` Andy Lutomirski
                     ` (2 more replies)
  2020-11-28 16:01 ` [PATCH 7/8] powerpc: use lazy mm refcount helper functions Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 8/8] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN Nicholas Piggin
  7 siblings, 3 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).

Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.

Shootdown IPIs are of some concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig  | 13 +++++++++++++
 kernel/fork.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 596bf589d74b..540e43aeefa4 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -440,6 +440,19 @@ config MMU_LAZY_TLB
 config MMU_LAZY_TLB_REFCOUNT
 	def_bool y
 	depends on MMU_LAZY_TLB
+	depends on !MMU_LAZY_TLB_SHOOTDOWN
+
+config MMU_LAZY_TLB_SHOOTDOWN
+	bool
+	depends on MMU_LAZY_TLB
+	help
+	  Instead of refcounting the "lazy tlb" mm struct, which can cause
+	  contention with multi-threaded apps on large multiprocessor systems,
+	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+	  switch to init_mm if they were using the to-be-freed mm as the lazy
+	  tlb. To implement this, architectures must use _lazy_tlb variants of
+	  mm refcounting, and mm_cpumask must include at least all possible
+	  CPUs in which mm might be lazy.
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..e47312c2b48b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,6 +669,54 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+		exit_lazy_tlb(mm, current);
+	}
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		/*
+		 * IPI overheads have not been found to be expensive, but they could
+		 * be reduced in a number of possible ways, for example (in
+		 * roughly increasing order of complexity):
+		 * - A batch of mms requiring IPIs could be gathered and freed
+		 *   at once.
+		 * - CPUs could store their active mm somewhere that can be
+		 *   remotely checked without a lock, to filter out
+		 *   false-positives in the cpumask.
+		 * - After mm_users or mm_count reaches zero, switching away
+		 *   from the mm could clear mm_cpumask to reduce some IPIs
+		 *   (some batching or delaying would help).
+		 * - A delayed freeing and RCU-like quiescing sequence based on
+		 *   mm switching to avoid IPIs completely.
+		 */
+		on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+		if (IS_ENABLED(CONFIG_DEBUG_VM))
+			on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+	} else {
+		/*
+		 * In this case, lazy tlb mms are refcounted and would not reach
+		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
+		 */
+	}
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -678,7 +726,12 @@ void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
+
+	/* Ensure no CPUs are using this as their lazy tlb mm */
+	shoot_lazy_tlbs(mm);
+
 	WARN_ON_ONCE(mm == current->active_mm);
+
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_subscriptions_destroy(mm);
-- 
2.23.0




* [PATCH 7/8] powerpc: use lazy mm refcount helper functions
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
                   ` (5 preceding siblings ...)
  2020-11-28 16:01 ` [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  2020-11-28 16:01 ` [PATCH 8/8] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN Nicholas Piggin
  7 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

Use _lazy_tlb functions for lazy mm refcounting in powerpc, to prepare
to move to MMU_LAZY_TLB_SHOOTDOWN.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857cbd960..93c0eaa6f4bf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1395,7 +1395,7 @@ void start_secondary(void *unused)
 {
 	unsigned int cpu = raw_smp_processor_id();
 
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
-- 
2.23.0




* [PATCH 8/8] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
  2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
                   ` (6 preceding siblings ...)
  2020-11-28 16:01 ` [PATCH 7/8] powerpc: use lazy mm refcount helper functions Nicholas Piggin
@ 2020-11-28 16:01 ` Nicholas Piggin
  7 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, x86, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

On a 16-socket 192-core POWER8 system, with a context switching
benchmark that runs as many software threads as CPUs (so each switch
goes in and out of idle), upstream can achieve a rate of about 1
million context switches per second. After this patch it goes up to
118 million.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e9f13fe08492..d4793c0229d2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -231,6 +231,7 @@ config PPC
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_PAGE_SIZE
+	select MMU_LAZY_TLB_SHOOTDOWN		if PPC_BOOK3S_64
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
-- 
2.23.0




* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-11-28 16:01 ` [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode Nicholas Piggin
@ 2020-11-28 17:55   ` Andy Lutomirski
  2020-12-02  2:49     ` Nicholas Piggin
  2020-11-30 14:57   ` Mathieu Desnoyers
  1 sibling, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-11-28 17:55 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> And get rid of the generic sync_core_before_usermode facility. This is
> functionally a no-op in the core scheduler code, but it also catches
>
> This helper is the wrong way around I think. The idea that membarrier
> state requires a core sync before returning to user is the easy one
> that does not need hiding behind membarrier calls. The gap in core
> synchronization due to x86's sysret/sysexit and lazy tlb mode is the
> tricky detail that is better put in x86 lazy tlb code.
>
> Consider if an arch did not synchronize the core in switch_mm either:
> then membarrier_mm_sync_core_before_usermode would be in the wrong
> place, but arch specific mmu context functions would still be the
> right place. There is also an exit_lazy_tlb case that is not covered
> by this call, which could be a bug (a kthread uses the membarrier
> process's mm, then context switches back to the process without an mm
> switch or a lazy mm switch).
>
> This makes lazy tlb code a bit more modular.

I have a couple of membarrier fixes that I want to send out today or
tomorrow, and they might eliminate the need for this patch.  Let me
think about this a little bit.  I'll cc you.  The existing code is way
too subtle and the comments are far too confusing for me to be quickly
confident about any of my conclusions :)



* Re: [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable
  2020-11-28 16:01 ` [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable Nicholas Piggin
@ 2020-11-29  0:36   ` Andy Lutomirski
  2020-12-02  2:49     ` Nicholas Piggin
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-11-29  0:36 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> NOMMU systems could easily go without this and save a bit of code
> and the refcount atomics, because their mm switch is a no-op. I
> haven't flipped them over because I haven't audited all arch code to
> convert over to using the _lazy_tlb refcounting.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/Kconfig             | 11 +++++++
>  include/linux/sched/mm.h | 13 ++++++--
>  kernel/sched/core.c      | 68 +++++++++++++++++++++++++++++-----------
>  kernel/sched/sched.h     |  4 ++-
>  4 files changed, 75 insertions(+), 21 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 56b6ccc0e32d..596bf589d74b 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -430,6 +430,17 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
>           irqs disabled over activate_mm. Architectures that do IPI based TLB
>           shootdowns should enable this.
>
> +# Should make this depend on MMU, because there is little use for lazy mm switching
> +# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
> +config MMU_LAZY_TLB
> +       def_bool y
> +       help
> +         Enable "lazy TLB" mmu context switching for kernel threads.
> +
> +config MMU_LAZY_TLB_REFCOUNT
> +       def_bool y
> +       depends on MMU_LAZY_TLB
> +

This could use some documentation as to what "no" means.
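
Something like this might do it (the help text below is just a sketch
of mine, not wording from the patch):

config MMU_LAZY_TLB_REFCOUNT
	def_bool y
	depends on MMU_LAZY_TLB
	help
	  If unset, the architecture does not keep a refcount on lazy tlb
	  mms and must guarantee by some other means (e.g. the shootdown
	  IPIs of MMU_LAZY_TLB_SHOOTDOWN) that no CPU still has an mm as
	  its lazy tlb mm by the time __mmdrop() runs.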

>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>         bool
>
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 7157c0f6fef8..bd0f27402d4b 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -51,12 +51,21 @@ static inline void mmdrop(struct mm_struct *mm)
>  /* Helpers for lazy TLB mm refcounting */
>  static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
>  {
> -       mmgrab(mm);
> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
> +               mmgrab(mm);
>  }
>
>  static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
>  {
> -       mmdrop(mm);
> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
> +               mmdrop(mm);
> +       } else {
> +               /*
> +                * mmdrop_lazy_tlb must provide a full memory barrier, see the
> +                * membarrier comment finish_task_switch.

"membarrier comment in finish_task_switch()", perhaps?

> +                */
> +               smp_mb();
> +       }
>  }
>
>  /**
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e372b613d514..3b79c6cc3a37 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>         __releases(rq->lock)
>  {
>         struct rq *rq = this_rq();
> -       struct mm_struct *mm = rq->prev_mm;
> +       struct mm_struct *mm = NULL;
>         long prev_state;
>
>         /*
> @@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>                       current->comm, current->pid, preempt_count()))
>                 preempt_count_set(FORK_PREEMPT_COUNT);
>
> -       rq->prev_mm = NULL;
> +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
> +       mm = rq->prev_lazy_mm;
> +       rq->prev_lazy_mm = NULL;
> +#endif
>
>         /*
>          * A task struct has one reference for the use as "current".
> @@ -3630,6 +3633,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>          * rq->curr, before returning to userspace, for
>          * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by
>          * mmdrop_lazy_tlb().
> +        *
> +        * This same issue applies to other places that mmdrop_lazy_tlb().
>          */
>         if (mm)
>                 mmdrop_lazy_tlb(mm);
> @@ -3719,22 +3724,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
>         calculate_sigpending();
>  }
>
> -/*
> - * context_switch - switch to the new MM and the new thread's register state.
> - */
> -static __always_inline struct rq *
> -context_switch(struct rq *rq, struct task_struct *prev,
> -              struct task_struct *next, struct rq_flags *rf)
> +static __always_inline void
> +context_switch_mm(struct rq *rq, struct task_struct *prev,
> +              struct task_struct *next)
>  {
> -       prepare_task_switch(rq, prev, next);
> -
> -       /*
> -        * For paravirt, this is coupled with an exit in switch_to to
> -        * combine the page table reload and the switch backend into
> -        * one hypercall.
> -        */
> -       arch_start_context_switch(prev);
> -
>         /*
>          * kernel -> kernel   lazy + transfer active
>          *   user -> kernel   lazy + mmgrab_lazy_tlb() active
> @@ -3765,11 +3758,50 @@ context_switch(struct rq *rq, struct task_struct *prev,
>                 if (!prev->mm) {                        // from kernel
>                         exit_lazy_tlb(prev->active_mm, next);
>
> +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
>                         /* will mmdrop_lazy_tlb() in finish_task_switch(). */
> -                       rq->prev_mm = prev->active_mm;
> +                       rq->prev_lazy_mm = prev->active_mm;
>                         prev->active_mm = NULL;
> +#else
> +                       /* See membarrier comment in finish_task_switch(). */
> +                       smp_mb();
> +#endif
>                 }
>         }
> +}
> +

Comment here describing what this does, please.


> +static __always_inline void
> +context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
> +              struct task_struct *next)
> +{
> +       if (!next->mm)
> +               next->active_mm = &init_mm;
> +       membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
> +       switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
> +       if (!prev->mm)
> +               prev->active_mm = NULL;
> +}
> +
> +/*
> + * context_switch - switch to the new MM and the new thread's register state.
> + */
> +static __always_inline struct rq *
> +context_switch(struct rq *rq, struct task_struct *prev,
> +              struct task_struct *next, struct rq_flags *rf)
> +{
> +       prepare_task_switch(rq, prev, next);
> +
> +       /*
> +        * For paravirt, this is coupled with an exit in switch_to to
> +        * combine the page table reload and the switch backend into
> +        * one hypercall.
> +        */
> +       arch_start_context_switch(prev);
> +
> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
> +               context_switch_mm(rq, prev, next);
> +       else
> +               context_switch_mm_nolazy(rq, prev, next);
>
>         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index df80bfcea92e..3b72aec5a2f2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -950,7 +950,9 @@ struct rq {
>         struct task_struct      *idle;
>         struct task_struct      *stop;
>         unsigned long           next_balance;
> -       struct mm_struct        *prev_mm;
> +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
> +       struct mm_struct        *prev_lazy_mm;
> +#endif
>
>         unsigned int            clock_update_flags;
>         u64                     clock;
> --
> 2.23.0
>



* Re: [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb
  2020-11-28 16:01 ` [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb Nicholas Piggin
@ 2020-11-29  0:38   ` Andy Lutomirski
  2020-12-02  2:49     ` Nicholas Piggin
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-11-29  0:38 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Sat, Nov 28, 2020 at 8:01 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> This is called at points where a lazy mm is switched away or made not
> lazy (by its owner switching back).
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/arm/mach-rpc/ecard.c            |  1 +
>  arch/powerpc/mm/book3s64/radix_tlb.c |  1 +
>  fs/exec.c                            |  6 ++++--
>  include/asm-generic/mmu_context.h    | 21 +++++++++++++++++++++
>  kernel/kthread.c                     |  1 +
>  kernel/sched/core.c                  |  2 ++
>  6 files changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
> index 827b50f1c73e..43eb1bfba466 100644
> --- a/arch/arm/mach-rpc/ecard.c
> +++ b/arch/arm/mach-rpc/ecard.c
> @@ -253,6 +253,7 @@ static int ecard_init_mm(void)
>         current->mm = mm;
>         current->active_mm = mm;
>         activate_mm(active_mm, mm);
> +       exit_lazy_tlb(active_mm, current);
>         mmdrop(active_mm);
>         ecard_init_pgtables(mm);
>         return 0;
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index b487b489d4b6..ac3fec03926a 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -661,6 +661,7 @@ static void do_exit_flush_lazy_tlb(void *arg)
>                 mmgrab(&init_mm);
>                 current->active_mm = &init_mm;
>                 switch_mm_irqs_off(mm, &init_mm, current);
> +               exit_lazy_tlb(mm, current);
>                 mmdrop(mm);
>         }
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 547a2390baf5..4b4dea1bb7ba 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1017,6 +1017,8 @@ static int exec_mmap(struct mm_struct *mm)
>         if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>                 local_irq_enable();
>         activate_mm(active_mm, mm);
> +       if (!old_mm)
> +               exit_lazy_tlb(active_mm, tsk);
>         if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>                 local_irq_enable();
>         tsk->mm->vmacache_seqnum = 0;
> @@ -1028,9 +1030,9 @@ static int exec_mmap(struct mm_struct *mm)
>                 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
>                 mm_update_next_owner(old_mm);
>                 mmput(old_mm);
> -               return 0;
> +       } else {
> +               mmdrop(active_mm);
>         }
> -       mmdrop(active_mm);

This looks like an unrelated change.

>         return 0;
>  }
>
> diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
> index 91727065bacb..4626d0020e65 100644
> --- a/include/asm-generic/mmu_context.h
> +++ b/include/asm-generic/mmu_context.h
> @@ -24,6 +24,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
>  }
>  #endif
>
> +/*
> + * exit_lazy_tlb - Called after switching away from a lazy TLB mode mm.
> + *
> + * mm:  the lazy mm context that was switched
> + * tsk: the task that was switched to (with a non-lazy mm)
> + *
> + * mm may equal tsk->mm.
> + * mm and tsk->mm will not be NULL.
> + *
> + * Note this is not symmetrical to enter_lazy_tlb; it is not
> + * called when tasks switch into the lazy mm, but after the
> + * lazy mm becomes non-lazy (either switched to a different mm or
> + * the owner of the mm returns).
> + */
> +#ifndef exit_lazy_tlb
> +static inline void exit_lazy_tlb(struct mm_struct *mm,

Maybe name this parameter prev_lazy_mm?


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-28 16:01 ` [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
@ 2020-11-29  3:54   ` Andy Lutomirski
  2020-11-29 20:16     ` Andy Lutomirski
                       ` (3 more replies)
  2020-12-02 11:17   ` Peter Zijlstra
  2020-12-02 14:19   ` Peter Zijlstra
  2 siblings, 4 replies; 46+ messages in thread
From: Andy Lutomirski @ 2020-11-29  3:54 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> On big systems, the mm refcount can become highly contended when doing
> a lot of context switching with threaded applications (particularly
> switching between the idle thread and an application thread).
>
> Abandoning lazy tlb slows switching down quite a bit in the important
> user->idle->user cases, so instead implement a non-refcounted scheme
> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> any remaining lazy ones.
>
> Shootdown IPIs are some concern, but they have not been observed to be
> a big problem with this scheme (the powerpc implementation generated
> 314 additional interrupts on a 144 CPU system during a kernel compile).
> There are a number of strategies that could be employed to reduce IPIs
> if they turn out to be a problem for some workload.

I'm still wondering whether we can do even better.

The IPIs you're doing aren't really necessary -- we don't
fundamentally need to free the pagetables immediately when all
non-lazy users are done with them (and current kernels don't) -- what
we need to do is to synchronize all the bookkeeping.  So, with
adequate locking (famous last words), a couple of alternative schemes
ought to be possible.

a) Instead of sending an IPI, increment mm_count on behalf of the
remote CPU and do something to make sure that the remote CPU knows we
did this on its behalf.  Then free the mm when mm_count hits zero.

b) Treat mm_cpumask as part of the refcount.  Add one to mm_count when
an mm is created.  Once mm_users hits zero, whoever clears the last
bit in mm_cpumask is responsible for decrementing a single reference
from mm_count, and whoever sets it to zero frees the mm.

Version (b) seems fairly straightforward to implement -- add RCU
protection and an atomic_t special_ref_cleared (initially 0) to struct
mm_struct itself.  After anyone clears a bit in mm_cpumask (which is
already a barrier), they read mm_users.  If it's zero, then they scan
mm_cpumask and see if it's empty.  If it is, they atomically swap
special_ref_cleared to 1.  If it was zero before the swap, they do
mmdrop().  I can imagine some tweaks that could make this a bit
faster, at least in the limit of a huge number of CPUs.
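
A rough sketch of (b), just to make it concrete (untested, and
special_ref_cleared is the invented field from above):

        /*
         * Sketch only: mm->special_ref_cleared is hypothetical, and the
         * mm_users==0 path in __mmput() would need the same check.
         */
        static void mm_unlazy_this_cpu(struct mm_struct *mm)
        {
                cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
                smp_mb__after_atomic(); /* order the clear against the reads below */

                if (atomic_read(&mm->mm_users))
                        return;

                if (!cpumask_empty(mm_cpumask(mm)))
                        return;

                /* Last lazy user is gone: drop the reference added at creation. */
                if (atomic_xchg(&mm->special_ref_cleared, 1) == 0)
                        mmdrop(mm);
        }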

Version (a) seems a bit harder to reason about.  Maybe it could be
done like this.  Add a percpu variable mm_with_extra_count.  This
variable can be NULL, but it can also be an mm that has an extra
reference on behalf of the cpu in question.

__mmput scans mm_cpumask and, for each cpu in the mask, mmgrabs the mm
and cmpxchgs that cpu's mm_with_extra_count from NULL to mm.  If it
succeeds, then we win.  If it fails, further thought is required, and
maybe we have to send an IPI, although maybe some other cleverness is
possible.  Any time a CPU switches mms, it atomically swaps
mm_with_extra_count to NULL and mmdrops whatever the mm was.  (Maybe
it needs to check the mm isn't equal to the new mm, although it would
be quite bizarre for this to happen.)  Other than these mmgrab and
mmdrop calls, the mm switching code doesn't mmgrab or mmdrop at all.
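
Something like this, perhaps (again only a sketch; mm_with_extra_count
and the two helpers are invented names):

        static DEFINE_PER_CPU(struct mm_struct *, mm_with_extra_count);

        /* __mmput() side: park a reference on behalf of a possibly-lazy CPU. */
        static bool park_lazy_reference(struct mm_struct *mm, int cpu)
        {
                mmgrab(mm);
                if (cmpxchg(per_cpu_ptr(&mm_with_extra_count, cpu), NULL, mm) == NULL)
                        return true;    /* that CPU now owns the extra reference */

                mmdrop(mm);
                return false;           /* slot was busy: IPI or retry instead */
        }

        /* Context switch side: drop whatever reference was parked for this CPU. */
        static void drop_parked_reference(void)
        {
                struct mm_struct *mm = xchg(this_cpu_ptr(&mm_with_extra_count), NULL);

                if (mm)
                        mmdrop(mm);
        }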


Version (a) seems like it could have excellent performance.


*However*, I think we should consider whether we want to do something
even bigger first.  Even with any of these changes, we still need to
maintain mm_cpumask(), and that itself can be a scalability problem.
I wonder if we can solve this problem too.  Perhaps the switch_mm()
paths could only ever set mm_cpumask bits, and anyone who would send
an IPI because a bit is set in mm_cpumask would first check some
percpu variable (cpu_rq(cpu)->something?  an entirely new variable) to
see if the bit in mm_cpumask is spurious.  Or perhaps mm_cpumask could
be split up across multiple cachelines, one per node.

We should keep the recent lessons from Apple in mind, though: x86 is a
dinosaur.  The future of atomics is going to look a lot more like
ARM's LSE than x86's rather anemic set.  This means that mm_cpumask
operations won't need to be full barriers forever, and we might not
want to take the implied full barriers in set_bit() and clear_bit()
for granted.

--Andy


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-29  3:54   ` Andy Lutomirski
@ 2020-11-29 20:16     ` Andy Lutomirski
  2020-11-30  9:25       ` Peter Zijlstra
  2020-11-30 18:31       ` Andy Lutomirski
  2020-11-30  9:26     ` Peter Zijlstra
                       ` (2 subsequent siblings)
  3 siblings, 2 replies; 46+ messages in thread
From: Andy Lutomirski @ 2020-11-29 20:16 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, Linux-MM,
	Anton Blanchard

On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > On big systems, the mm refcount can become highly contended when doing
> > a lot of context switching with threaded applications (particularly
> > switching between the idle thread and an application thread).
> >
> > Abandoning lazy tlb slows switching down quite a bit in the important
> > user->idle->user cases, so instead implement a non-refcounted scheme
> > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> > any remaining lazy ones.
> >
> > Shootdown IPIs are some concern, but they have not been observed to be
> > a big problem with this scheme (the powerpc implementation generated
> > 314 additional interrupts on a 144 CPU system during a kernel compile).
> > There are a number of strategies that could be employed to reduce IPIs
> > if they turn out to be a problem for some workload.
>
> I'm still wondering whether we can do even better.
>

Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
the TLB.  On x86, this will shoot down all lazies as long as even a
single pagetable was freed.  (Or at least it will if we don't have a
serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
sets tlb->freed_tables, which will trigger the IPI.)  So, on
architectures like x86, the shootdown approach should be free.  The
only way it ought to have any excess IPIs is if we have CPUs in
mm_cpumask() that don't need IPI to free pagetables, which could
happen on paravirt.
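
In other words the remote-flush decision is shaped like this (not the
literal x86 code; the two helpers here are made-up names):

        if (info->freed_tables) {
                /* Page tables were freed: IPI everything in the mask, lazy
                 * or not, so nobody keeps walking the freed tables. */
                send_flush_ipi_to(cpumask);             /* made-up helper */
        } else {
                /* Leaf-only unmap: lazy CPUs can be skipped; they flush (or
                 * drop the mm) when they next become non-lazy. */
                send_flush_ipi_to_nonlazy(cpumask);     /* made-up helper */
        }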

Can you try to figure out why you saw any increase in IPIs?  It would
be nice if we can make the new code unconditional.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-29 20:16     ` Andy Lutomirski
@ 2020-11-30  9:25       ` Peter Zijlstra
  2020-11-30 18:31       ` Andy Lutomirski
  1 sibling, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2020-11-30  9:25 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Sun, Nov 29, 2020 at 12:16:26PM -0800, Andy Lutomirski wrote:
> On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> > >
> > > On big systems, the mm refcount can become highly contended when doing
> > > a lot of context switching with threaded applications (particularly
> > > switching between the idle thread and an application thread).
> > >
> > > Abandoning lazy tlb slows switching down quite a bit in the important
> > > user->idle->user cases, so instead implement a non-refcounted scheme
> > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> > > any remaining lazy ones.
> > >
> > > Shootdown IPIs are some concern, but they have not been observed to be
> > > a big problem with this scheme (the powerpc implementation generated
> > > 314 additional interrupts on a 144 CPU system during a kernel compile).
> > > There are a number of strategies that could be employed to reduce IPIs
> > > if they turn out to be a problem for some workload.
> >
> > I'm still wondering whether we can do even better.
> >
> 
> Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
> the TLB.  On x86, this will shoot down all lazies as long as even a
> single pagetable was freed.  (Or at least it will if we don't have a
> serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
> sets tlb->freed_tables, which will trigger the IPI.)  So, on
> architectures like x86, the shootdown approach should be free.  The
> only way it ought to have any excess IPIs is if we have CPUs in
> mm_cpumask() that don't need IPI to free pagetables, which could
> happen on paravirt.
> 
> Can you try to figure out why you saw any increase in IPIs?  It would
> be nice if we can make the new code unconditional.

Power doesn't do IPI based TLBI.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-29  3:54   ` Andy Lutomirski
  2020-11-29 20:16     ` Andy Lutomirski
@ 2020-11-30  9:26     ` Peter Zijlstra
  2020-11-30  9:30     ` Peter Zijlstra
  2020-12-02  3:09     ` Nicholas Piggin
  3 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2020-11-30  9:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Sat, Nov 28, 2020 at 07:54:57PM -0800, Andy Lutomirski wrote:
> Version (b) seems fairly straightforward to implement -- add RCU
> protection and an atomic_t special_ref_cleared (initially 0) to struct
> mm_struct itself.  After anyone clears a bit in mm_cpumask (which is
> already a barrier),

No it isn't. clear_bit() implies no barrier whatsoever. That's x86
you're thinking about.

> they read mm_users.  If it's zero, then they scan
> mm_cpumask and see if it's empty.  If it is, they atomically swap
> special_ref_cleared to 1.  If it was zero before the swap, they do
> mmdrop().  I can imagine some tweaks that could make this a bit
> faster, at least in the limit of a huge number of CPUs.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-29  3:54   ` Andy Lutomirski
  2020-11-29 20:16     ` Andy Lutomirski
  2020-11-30  9:26     ` Peter Zijlstra
@ 2020-11-30  9:30     ` Peter Zijlstra
  2020-11-30  9:34       ` Peter Zijlstra
  2020-12-02  3:09     ` Nicholas Piggin
  3 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2020-11-30  9:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Sat, Nov 28, 2020 at 07:54:57PM -0800, Andy Lutomirski wrote:
> This means that mm_cpumask operations won't need to be full barriers
> forever, and we might not want to take the implied full barriers in
> set_bit() and clear_bit() for granted.

There is no implied full barrier for those ops.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-30  9:30     ` Peter Zijlstra
@ 2020-11-30  9:34       ` Peter Zijlstra
  0 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2020-11-30  9:34 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Mon, Nov 30, 2020 at 10:30:00AM +0100, Peter Zijlstra wrote:
> On Sat, Nov 28, 2020 at 07:54:57PM -0800, Andy Lutomirski wrote:
> > This means that mm_cpumask operations won't need to be full barriers
> > forever, and we might not want to take the implied full barriers in
> > set_bit() and clear_bit() for granted.
> 
> There is no implied full barrier for those ops.

Neither ARM nor Power implies any ordering on those ops. But Power has
some of the worst atomic performance in the world despite that.

IIRC they don't do their LL/SC in L1.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-11-28 16:01 ` [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode Nicholas Piggin
  2020-11-28 17:55   ` Andy Lutomirski
@ 2020-11-30 14:57   ` Mathieu Desnoyers
  1 sibling, 0 replies; 46+ messages in thread
From: Mathieu Desnoyers @ 2020-11-30 14:57 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-kernel, x86, Arnd Bergmann, Peter Zijlstra, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard

----- On Nov 28, 2020, at 11:01 AM, Nicholas Piggin npiggin@gmail.com wrote:

> And get rid of the generic sync_core_before_usermode facility. This is
> functionally a no-op in the core scheduler code, but it also catches

This sentence is incomplete.

> 
> This helper is the wrong way around I think. The idea that membarrier
> state requires a core sync before returning to user is the easy one
> that does not need hiding behind membarrier calls. The gap in core
> synchronization due to x86's sysret/sysexit and lazy tlb mode, is the
> tricky detail that is better put in x86 lazy tlb code.

Ideally yes this complexity should sit within the x86 architecture code
if only that architecture requires it.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-29 20:16     ` Andy Lutomirski
  2020-11-30  9:25       ` Peter Zijlstra
@ 2020-11-30 18:31       ` Andy Lutomirski
  2020-12-01 21:27         ` Will Deacon
                           ` (2 more replies)
  1 sibling, 3 replies; 46+ messages in thread
From: Andy Lutomirski @ 2020-11-30 18:31 UTC (permalink / raw)
  To: Andy Lutomirski, Will Deacon, Catalin Marinas, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Dave Hansen
  Cc: Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, Linux-MM,
	Anton Blanchard

other arch folk: there's some background here:

https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com

On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> > >
> > > On big systems, the mm refcount can become highly contended when doing
> > > a lot of context switching with threaded applications (particularly
> > > switching between the idle thread and an application thread).
> > >
> > > Abandoning lazy tlb slows switching down quite a bit in the important
> > > user->idle->user cases, so instead implement a non-refcounted scheme
> > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> > > any remaining lazy ones.
> > >
> > > Shootdown IPIs are some concern, but they have not been observed to be
> > > a big problem with this scheme (the powerpc implementation generated
> > > 314 additional interrupts on a 144 CPU system during a kernel compile).
> > > There are a number of strategies that could be employed to reduce IPIs
> > > if they turn out to be a problem for some workload.
> >
> > I'm still wondering whether we can do even better.
> >
>
> Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
> the TLB.  On x86, this will shoot down all lazies as long as even a
> single pagetable was freed.  (Or at least it will if we don't have a
> serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
> sets tlb->freed_tables, which will trigger the IPI.)  So, on
> architectures like x86, the shootdown approach should be free.  The
> only way it ought to have any excess IPIs is if we have CPUs in
> mm_cpumask() that don't need IPI to free pagetables, which could
> happen on paravirt.

Indeed, on x86, we do this:

[   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
[   11.559905]  tlb_finish_mmu+0x10e/0x1a0
[   11.561068]  exit_mmap+0xc8/0x1a0
[   11.561932]  mmput+0x29/0xd0
[   11.562688]  do_exit+0x316/0xa90
[   11.563588]  do_group_exit+0x34/0xb0
[   11.564476]  __x64_sys_exit_group+0xf/0x10
[   11.565512]  do_syscall_64+0x34/0x50

and we have info->freed_tables set.

What are the architectures that have large systems like?

x86: we already zap lazies, so it should cost basically nothing to do
a little loop at the end of __mmput() to make sure that no lazies are
left.  If we care about paravirt performance, we could implement one
of the optimizations I mentioned above to fix up the refcounts instead
of sending an IPI to any remaining lazies.
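
("a little loop" being something like the below, sketched in terms of
Nick's do_shoot_lazy_tlb() from patch 6; untested:)

        /* Tail of __mmput(), after exit_mmap() has done its flush and, on
         * bare-metal x86, already kicked remote lazies off this mm: */
        if (!cpumask_empty(mm_cpumask(mm)))
                on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb,
                                 (void *)mm, 1);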

arm64: AFAICT arm64's flush uses magic arm64 hardware support for
remote flushes, so any lazy mm references will still exist after
exit_mmap().  (arm64 uses lazy TLB, right?)  So this is kind of like
the x86 paravirt case.  Are there large enough arm64 systems that any
of this matters?

s390x: The code has too many acronyms for me to understand it fully,
but I think it's more or less the same situation as arm64.  How big do
s390x systems come?

power: Ridiculously complicated, seems to vary by system and kernel config.

So, Nick, your unconditional IPI scheme is apparently a big
improvement for power, and it should be an improvement and have low
cost for x86.  On arm64 and s390x it will add more IPIs on process
exit but reduce contention on context switching depending on how lazy
TLB works.  I suppose we could try it for all architectures without
any further optimizations.  Or we could try one of the perhaps
excessively clever improvements I linked above.  arm64, s390x people,
what do you think?


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-30 18:31       ` Andy Lutomirski
@ 2020-12-01 21:27         ` Will Deacon
  2020-12-01 21:50           ` Andy Lutomirski
  2020-12-02  3:47         ` Nicholas Piggin
  2020-12-03 17:03         ` Alexander Gordeev
  2 siblings, 1 reply; 46+ messages in thread
From: Will Deacon @ 2020-12-01 21:27 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Dave Hansen, Nicholas Piggin, LKML,
	X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Mon, Nov 30, 2020 at 10:31:51AM -0800, Andy Lutomirski wrote:
> other arch folk: there's some background here:
> 
> https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com
> 
> On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
> > >
> > > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> > > >
> > > > On big systems, the mm refcount can become highly contended when doing
> > > > a lot of context switching with threaded applications (particularly
> > > > switching between the idle thread and an application thread).
> > > >
> > > > Abandoning lazy tlb slows switching down quite a bit in the important
> > > > user->idle->user cases, so instead implement a non-refcounted scheme
> > > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> > > > any remaining lazy ones.
> > > >
> > > > Shootdown IPIs are some concern, but they have not been observed to be
> > > > a big problem with this scheme (the powerpc implementation generated
> > > > 314 additional interrupts on a 144 CPU system during a kernel compile).
> > > > There are a number of strategies that could be employed to reduce IPIs
> > > > if they turn out to be a problem for some workload.
> > >
> > > I'm still wondering whether we can do even better.
> > >
> >
> > Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
> > the TLB.  On x86, this will shoot down all lazies as long as even a
> > single pagetable was freed.  (Or at least it will if we don't have a
> > serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
> > sets tlb->freed_tables, which will trigger the IPI.)  So, on
> > architectures like x86, the shootdown approach should be free.  The
> > only way it ought to have any excess IPIs is if we have CPUs in
> > mm_cpumask() that don't need IPI to free pagetables, which could
> > happen on paravirt.
> 
> Indeed, on x86, we do this:
> 
> [   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
> [   11.559905]  tlb_finish_mmu+0x10e/0x1a0
> [   11.561068]  exit_mmap+0xc8/0x1a0
> [   11.561932]  mmput+0x29/0xd0
> [   11.562688]  do_exit+0x316/0xa90
> [   11.563588]  do_group_exit+0x34/0xb0
> [   11.564476]  __x64_sys_exit_group+0xf/0x10
> [   11.565512]  do_syscall_64+0x34/0x50
> 
> and we have info->freed_tables set.
> 
> What are the architectures that have large systems like?
> 
> x86: we already zap lazies, so it should cost basically nothing to do
> a little loop at the end of __mmput() to make sure that no lazies are
> left.  If we care about paravirt performance, we could implement one
> of the optimizations I mentioned above to fix up the refcounts instead
> of sending an IPI to any remaining lazies.
> 
> arm64: AFAICT arm64's flush uses magic arm64 hardware support for
> remote flushes, so any lazy mm references will still exist after
> exit_mmap().  (arm64 uses lazy TLB, right?)  So this is kind of like
> the x86 paravirt case.  Are there large enough arm64 systems that any
> of this matters?

Yes, there are large arm64 systems where performance of TLB invalidation
matters, but they're either niche (supercomputers) or not readily available
(NUMA boxes).

But anyway, we blow away the TLB for everybody in tlb_finish_mmu() after
freeing the page-tables. We have an optimisation to avoid flushing if
we're just unmapping leaf entries when the mm is going away, but we don't
have a choice once we get to actually reclaiming the page-tables.

One thing I probably should mention, though, is that we don't maintain
mm_cpumask() because we're not able to benefit from it and the atomic
update is a waste of time.

Will


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-01 21:27         ` Will Deacon
@ 2020-12-01 21:50           ` Andy Lutomirski
  2020-12-01 23:04             ` Will Deacon
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-01 21:50 UTC (permalink / raw)
  To: Will Deacon
  Cc: Andy Lutomirski, Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Dave Hansen, Nicholas Piggin, LKML,
	X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Tue, Dec 1, 2020 at 1:28 PM Will Deacon <will@kernel.org> wrote:
>
> On Mon, Nov 30, 2020 at 10:31:51AM -0800, Andy Lutomirski wrote:
> > other arch folk: there's some background here:
> >
> > https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com
> >
> > On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
> > >
> > > On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > >
> > > > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> > > > >
> > > > > On big systems, the mm refcount can become highly contended when doing
> > > > > a lot of context switching with threaded applications (particularly
> > > > > switching between the idle thread and an application thread).
> > > > >
> > > > > Abandoning lazy tlb slows switching down quite a bit in the important
> > > > > user->idle->user cases, so instead implement a non-refcounted scheme
> > > > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> > > > > any remaining lazy ones.
> > > > >
> > > > > Shootdown IPIs are some concern, but they have not been observed to be
> > > > > a big problem with this scheme (the powerpc implementation generated
> > > > > 314 additional interrupts on a 144 CPU system during a kernel compile).
> > > > > There are a number of strategies that could be employed to reduce IPIs
> > > > > if they turn out to be a problem for some workload.
> > > >
> > > > I'm still wondering whether we can do even better.
> > > >
> > >
> > > Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
> > > the TLB.  On x86, this will shoot down all lazies as long as even a
> > > single pagetable was freed.  (Or at least it will if we don't have a
> > > serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
> > > sets tlb->freed_tables, which will trigger the IPI.)  So, on
> > > architectures like x86, the shootdown approach should be free.  The
> > > only way it ought to have any excess IPIs is if we have CPUs in
> > > mm_cpumask() that don't need IPI to free pagetables, which could
> > > happen on paravirt.
> >
> > Indeed, on x86, we do this:
> >
> > [   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
> > [   11.559905]  tlb_finish_mmu+0x10e/0x1a0
> > [   11.561068]  exit_mmap+0xc8/0x1a0
> > [   11.561932]  mmput+0x29/0xd0
> > [   11.562688]  do_exit+0x316/0xa90
> > [   11.563588]  do_group_exit+0x34/0xb0
> > [   11.564476]  __x64_sys_exit_group+0xf/0x10
> > [   11.565512]  do_syscall_64+0x34/0x50
> >
> > and we have info->freed_tables set.
> >
> > What are the architectures that have large systems like?
> >
> > x86: we already zap lazies, so it should cost basically nothing to do
> > a little loop at the end of __mmput() to make sure that no lazies are
> > left.  If we care about paravirt performance, we could implement one
> > of the optimizations I mentioned above to fix up the refcounts instead
> > of sending an IPI to any remaining lazies.
> >
> > arm64: AFAICT arm64's flush uses magic arm64 hardware support for
> > remote flushes, so any lazy mm references will still exist after
> > exit_mmap().  (arm64 uses lazy TLB, right?)  So this is kind of like
> > the x86 paravirt case.  Are there large enough arm64 systems that any
> > of this matters?
>
> Yes, there are large arm64 systems where performance of TLB invalidation
> matters, but they're either niche (supercomputers) or not readily available
> (NUMA boxes).
>
> But anyway, we blow away the TLB for everybody in tlb_finish_mmu() after
> freeing the page-tables. We have an optimisation to avoid flushing if
> we're just unmapping leaf entries when the mm is going away, but we don't
> have a choice once we get to actually reclaiming the page-tables.
>
> One thing I probably should mention, though, is that we don't maintain
> mm_cpumask() because we're not able to benefit from it and the atomic
> update is a waste of time.

Do you do anything special for lazy TLB or do you just use the generic
code?  (i.e. where do your user pagetables point when you go from a
user task to idle or to a kernel thread?)

Do you end up with all cpus set in mm_cpumask or can you have the mm
loaded on a CPU that isn't in mm_cpumask?

--Andy

>
> Will


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-01 21:50           ` Andy Lutomirski
@ 2020-12-01 23:04             ` Will Deacon
  0 siblings, 0 replies; 46+ messages in thread
From: Will Deacon @ 2020-12-01 23:04 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Dave Hansen, Nicholas Piggin, LKML,
	X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Tue, Dec 01, 2020 at 01:50:38PM -0800, Andy Lutomirski wrote:
> On Tue, Dec 1, 2020 at 1:28 PM Will Deacon <will@kernel.org> wrote:
> >
> > On Mon, Nov 30, 2020 at 10:31:51AM -0800, Andy Lutomirski wrote:
> > > other arch folk: there's some background here:
> > >
> > > https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com
> > >
> > > On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > >
> > > > On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > > >
> > > > > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> > > > > >
> > > > > > On big systems, the mm refcount can become highly contended when doing
> > > > > > a lot of context switching with threaded applications (particularly
> > > > > > switching between the idle thread and an application thread).
> > > > > >
> > > > > > Abandoning lazy tlb slows switching down quite a bit in the important
> > > > > > user->idle->user cases, so instead implement a non-refcounted scheme
> > > > > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> > > > > > any remaining lazy ones.
> > > > > >
> > > > > > Shootdown IPIs are some concern, but they have not been observed to be
> > > > > > a big problem with this scheme (the powerpc implementation generated
> > > > > > 314 additional interrupts on a 144 CPU system during a kernel compile).
> > > > > > There are a number of strategies that could be employed to reduce IPIs
> > > > > > if they turn out to be a problem for some workload.
> > > > >
> > > > > I'm still wondering whether we can do even better.
> > > > >
> > > >
> > > > Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
> > > > the TLB.  On x86, this will shoot down all lazies as long as even a
> > > > single pagetable was freed.  (Or at least it will if we don't have a
> > > > serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
> > > > sets tlb->freed_tables, which will trigger the IPI.)  So, on
> > > > architectures like x86, the shootdown approach should be free.  The
> > > > only way it ought to have any excess IPIs is if we have CPUs in
> > > > mm_cpumask() that don't need IPI to free pagetables, which could
> > > > happen on paravirt.
> > >
> > > Indeed, on x86, we do this:
> > >
> > > [   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
> > > [   11.559905]  tlb_finish_mmu+0x10e/0x1a0
> > > [   11.561068]  exit_mmap+0xc8/0x1a0
> > > [   11.561932]  mmput+0x29/0xd0
> > > [   11.562688]  do_exit+0x316/0xa90
> > > [   11.563588]  do_group_exit+0x34/0xb0
> > > [   11.564476]  __x64_sys_exit_group+0xf/0x10
> > > [   11.565512]  do_syscall_64+0x34/0x50
> > >
> > > and we have info->freed_tables set.
> > >
> > > What are the architectures that have large systems like?
> > >
> > > x86: we already zap lazies, so it should cost basically nothing to do
> > > a little loop at the end of __mmput() to make sure that no lazies are
> > > left.  If we care about paravirt performance, we could implement one
> > > of the optimizations I mentioned above to fix up the refcounts instead
> > > of sending an IPI to any remaining lazies.
> > >
> > > arm64: AFAICT arm64's flush uses magic arm64 hardware support for
> > > remote flushes, so any lazy mm references will still exist after
> > > exit_mmap().  (arm64 uses lazy TLB, right?)  So this is kind of like
> > > the x86 paravirt case.  Are there large enough arm64 systems that any
> > > of this matters?
> >
> > Yes, there are large arm64 systems where performance of TLB invalidation
> > matters, but they're either niche (supercomputers) or not readily available
> > (NUMA boxes).
> >
> > But anyway, we blow away the TLB for everybody in tlb_finish_mmu() after
> > freeing the page-tables. We have an optimisation to avoid flushing if
> > we're just unmapping leaf entries when the mm is going away, but we don't
> > have a choice once we get to actually reclaiming the page-tables.
> >
> > One thing I probably should mention, though, is that we don't maintain
> > mm_cpumask() because we're not able to benefit from it and the atomic
> > update is a waste of time.
> 
> Do you do anything special for lazy TLB or do you just use the generic
> code?  (i.e. where do your user pagetables point when you go from a
> user task to idle or to a kernel thread?)

We don't do anything special (there's something funny with the PAN emulation
but you can ignore that); the page-table just points wherever it did before
for userspace. Switching explicitly to the init_mm, however, causes us to
unmap userspace entirely.

Since we have ASIDs, switch_mm() generally doesn't have to care about the
TLBs at all.

> Do you end up with all cpus set in mm_cpumask or can you have the mm
> loaded on a CPU that isn't in mm_cpumask?

I think the mask is always zero (we never set anything in there).

Will


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable
  2020-11-29  0:36   ` Andy Lutomirski
@ 2020-12-02  2:49     ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-02  2:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of November 29, 2020 10:36 am:
> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> NOMMU systems could easily go without this and save a bit of code
>> and the refcount atomics, because their mm switch is a no-op. I
>> haven't flipped them over because I haven't audited all arch code to
>> convert over to using the _lazy_tlb refcounting.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/Kconfig             | 11 +++++++
>>  include/linux/sched/mm.h | 13 ++++++--
>>  kernel/sched/core.c      | 68 +++++++++++++++++++++++++++++-----------
>>  kernel/sched/sched.h     |  4 ++-
>>  4 files changed, 75 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 56b6ccc0e32d..596bf589d74b 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -430,6 +430,17 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
>>           irqs disabled over activate_mm. Architectures that do IPI based TLB
>>           shootdowns should enable this.
>>
>> +# Should make this depend on MMU, because there is little use for lazy mm switching
>> +# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
>> +config MMU_LAZY_TLB
>> +       def_bool y
>> +       help
>> +         Enable "lazy TLB" mmu context switching for kernel threads.
>> +
>> +config MMU_LAZY_TLB_REFCOUNT
>> +       def_bool y
>> +       depends on MMU_LAZY_TLB
>> +
> 
> This could use some documentation as to what "no" means.

Sure I can add a bit more.

> 
>>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>>         bool
>>
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index 7157c0f6fef8..bd0f27402d4b 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -51,12 +51,21 @@ static inline void mmdrop(struct mm_struct *mm)
>>  /* Helpers for lazy TLB mm refcounting */
>>  static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
>>  {
>> -       mmgrab(mm);
>> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
>> +               mmgrab(mm);
>>  }
>>
>>  static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
>>  {
>> -       mmdrop(mm);
>> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
>> +               mmdrop(mm);
>> +       } else {
>> +               /*
>> +                * mmdrop_lazy_tlb must provide a full memory barrier, see the
>> +                * membarrier comment finish_task_switch.
> 
> "membarrier comment in finish_task_switch()", perhaps?

Sure.

Thanks,
Nick



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-11-28 17:55   ` Andy Lutomirski
@ 2020-12-02  2:49     ` Nicholas Piggin
  2020-12-03  5:09       ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-02  2:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of November 29, 2020 3:55 am:
> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> And get rid of the generic sync_core_before_usermode facility. This is
>> functionally a no-op in the core scheduler code, but it also catches
>>
>> This helper is the wrong way around I think. The idea that membarrier
>> state requires a core sync before returning to user is the easy one
>> that does not need hiding behind membarrier calls. The gap in core
>> synchronization due to x86's sysret/sysexit and lazy tlb mode, is the
>> tricky detail that is better put in x86 lazy tlb code.
>>
>> Consider if an arch did not synchronize core in switch_mm either, then
>> membarrier_mm_sync_core_before_usermode would be in the wrong place
>> but arch specific mmu context functions would still be the right place.
>> There is also an exit_lazy_tlb case that is not covered by this call, which
>> could be a bug (a kthread uses the membarrier process's mm, then context
>> switches back to the process without switching mm or doing a lazy mm switch).
>>
>> This makes lazy tlb code a bit more modular.
> 
> I have a couple of membarrier fixes that I want to send out today or
> tomorrow, and they might eliminate the need for this patch.  Let me
> think about this a little bit.  I'll cc you.  The existing code is way
> to subtle and the comments are far too confusing for me to be quickly
> confident about any of my conclusions :)
> 

Thanks for the head's up. I'll have to have a better look through them 
but I don't know that it eliminates the need for this entirely although
it might close some gaps and make this not a bug fix. The problem here 
is x86 code wanted something to be called when a lazy mm is unlazied,
but it missed some spots and also the core scheduler doesn't need to 
know about those x86 details if it has this generic call that annotates
the lazy handling better.

I'll go through the wording again and look at your patches a bit better
but I think they are somewhat orthogonal.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb
  2020-11-29  0:38   ` Andy Lutomirski
@ 2020-12-02  2:49     ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-02  2:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of November 29, 2020 10:38 am:
> On Sat, Nov 28, 2020 at 8:01 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> This is called at points where a lazy mm is switched away or made not
>> lazy (by its owner switching back).
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/arm/mach-rpc/ecard.c            |  1 +
>>  arch/powerpc/mm/book3s64/radix_tlb.c |  1 +
>>  fs/exec.c                            |  6 ++++--
>>  include/asm-generic/mmu_context.h    | 21 +++++++++++++++++++++
>>  kernel/kthread.c                     |  1 +
>>  kernel/sched/core.c                  |  2 ++
>>  6 files changed, 30 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
>> index 827b50f1c73e..43eb1bfba466 100644
>> --- a/arch/arm/mach-rpc/ecard.c
>> +++ b/arch/arm/mach-rpc/ecard.c
>> @@ -253,6 +253,7 @@ static int ecard_init_mm(void)
>>         current->mm = mm;
>>         current->active_mm = mm;
>>         activate_mm(active_mm, mm);
>> +       exit_lazy_tlb(active_mm, current);
>>         mmdrop(active_mm);
>>         ecard_init_pgtables(mm);
>>         return 0;
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index b487b489d4b6..ac3fec03926a 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -661,6 +661,7 @@ static void do_exit_flush_lazy_tlb(void *arg)
>>                 mmgrab(&init_mm);
>>                 current->active_mm = &init_mm;
>>                 switch_mm_irqs_off(mm, &init_mm, current);
>> +               exit_lazy_tlb(mm, current);
>>                 mmdrop(mm);
>>         }
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index 547a2390baf5..4b4dea1bb7ba 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1017,6 +1017,8 @@ static int exec_mmap(struct mm_struct *mm)
>>         if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>>                 local_irq_enable();
>>         activate_mm(active_mm, mm);
>> +       if (!old_mm)
>> +               exit_lazy_tlb(active_mm, tsk);
>>         if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>>                 local_irq_enable();
>>         tsk->mm->vmacache_seqnum = 0;
>> @@ -1028,9 +1030,9 @@ static int exec_mmap(struct mm_struct *mm)
>>                 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
>>                 mm_update_next_owner(old_mm);
>>                 mmput(old_mm);
>> -               return 0;
>> +       } else {
>> +               mmdrop(active_mm);
>>         }
>> -       mmdrop(active_mm);
> 
> This looks like an unrelated change.

I thought the old style was pointless and made me look twice to make 
sure we weren't mmdrop()ing the lazy.

> 
>>         return 0;
>>  }
>>
>> diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
>> index 91727065bacb..4626d0020e65 100644
>> --- a/include/asm-generic/mmu_context.h
>> +++ b/include/asm-generic/mmu_context.h
>> @@ -24,6 +24,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
>>  }
>>  #endif
>>
>> +/*
>> + * exit_lazy_tlb - Called after switching away from a lazy TLB mode mm.
>> + *
>> + * mm:  the lazy mm context that was switched
>> + * tsk: the task that was switched to (with a non-lazy mm)
>> + *
>> + * mm may equal tsk->mm.
>> + * mm and tsk->mm will not be NULL.
>> + *
>> + * Note this is not symmetrical to enter_lazy_tlb, this is not
>> + * called when tasks switch into the lazy mm, it's called after the
>> + * lazy mm becomes non-lazy (either switched to a different mm or the
>> + * owner of the mm returns).
>> + */
>> +#ifndef exit_lazy_tlb
>> +static inline void exit_lazy_tlb(struct mm_struct *mm,
> 
> Maybe name this parameter prev_lazy_mm?
> 

mm is better because it's the mm that we're "exiting lazy" from, the 
function name gives the context.

prev might suggest it was the previous but it's the current one, or
that we're switching to another mm but we may not be at all.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-29  3:54   ` Andy Lutomirski
                       ` (2 preceding siblings ...)
  2020-11-30  9:30     ` Peter Zijlstra
@ 2020-12-02  3:09     ` Nicholas Piggin
  3 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-02  3:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of November 29, 2020 1:54 pm:
> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> On big systems, the mm refcount can become highly contended when doing
>> a lot of context switching with threaded applications (particularly
>> switching between the idle thread and an application thread).
>>
>> Abandoning lazy tlb slows switching down quite a bit in the important
>> user->idle->user cases, so instead implement a non-refcounted scheme
>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>> any remaining lazy ones.
>>
>> Shootdown IPIs are some concern, but they have not been observed to be
>> a big problem with this scheme (the powerpc implementation generated
>> 314 additional interrupts on a 144 CPU system during a kernel compile).
>> There are a number of strategies that could be employed to reduce IPIs
>> if they turn out to be a problem for some workload.
> 
> I'm still wondering whether we can do even better.

We probably can, for some values of better / more complex. This came up 
last time I posted, there was a big concern about IPIs etc, but it just 
wasn't an issue at all even when I tried to coax them to happen a bit.

The thing is they are fairly self-limiting; it's not actually all that
frequent that you have an mm get taken for a lazy *and* move between 
CPUs. Perhaps more often with threaded apps, but in that case you're 
eating various IPI costs anyway (e.g., when moving the task to another
CPU, on TLB shootdowns, etc).

So from last time I did measure and I did document some possible 
improvements that could be made in comments, but I decided to keep it 
simple before adding complexity to it.

> 
> The IPIs you're doing aren't really necessary -- we don't
> fundamentally need to free the pagetables immediately when all
> non-lazy users are done with them (and current kernels don't) -- what
> we need to do is to synchronize all the bookkeeping.  So, with
> adequate locking (famous last words), a couple of alternative schemes
> ought to be possible.

It's not freeing the page tables, those are freed by this point already 
I think (at least on powerpc they are). It's releasing the lazy mm.
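
The IPI handler doesn't touch page tables at all, it just gets the CPU
off the lazy mm, roughly like this (paraphrasing the patch, not quoting
it exactly):

        static void do_shoot_lazy_tlb(void *arg)
        {
                struct mm_struct *mm = arg;

                if (current->active_mm == mm) {
                        WARN_ON_ONCE(current->mm);      /* only lazy users here */
                        current->active_mm = &init_mm;
                        switch_mm(mm, &init_mm, current);
                }
        }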

> 
> a) Instead of sending an IPI, increment mm_count on behalf of the
> remote CPU and do something to make sure that the remote CPU knows we
> did this on its behalf.  Then free the mm when mm_count hits zero.
> 
> b) Treat mm_cpumask as part of the refcount.  Add one to mm_count when
> an mm is created.  Once mm_users hits zero, whoever clears the last
> bit in mm_cpumask is responsible for decrementing a single reference
> from mm_count, and whoever sets it to zero frees the mm.

Right, these were some possible avenues to explore; the thing is they
add complexity and more synchronisation costs, and in the fast (context
switch) path too. The IPI actually avoids all fast path work, atomic
or not.

> Version (b) seems fairly straightforward to implement -- add RCU
> protection and an atomic_t special_ref_cleared (initially 0) to struct
> mm_struct itself.  After anyone clears a bit in mm_cpumask (which is
> already a barrier), they read mm_users.  If it's zero, then they scan
> mm_cpumask and see if it's empty.  If it is, they atomically swap
> special_ref_cleared to 1.  If it was zero before the swap, they do
> mmdrop().  I can imagine some tweaks that could make this a bit
> faster, at least in the limit of a huge number of CPUs.
> 
> Version (a) seems a bit harder to reason about.  Maybe it could be
> done like this.  Add a percpu variable mm_with_extra_count.  This
> variable can be NULL, but it can also be an mm that has an extra
> reference on behalf of the cpu in question.
> 
> __mmput scans mm_cpumask and, for each cpu in the mask, mmgrabs the mm
> and cmpxchgs that cpu's mm_with_extra_count from NULL to mm.  If it
> succeeds, then we win.  If it fails, further thought is required, and
> maybe we have to send an IPI, although maybe some other cleverness is
> possible.  Any time a CPU switches mms, it atomically swaps
> mm_with_extra_count to NULL and mmdrops whatever the mm was.  (Maybe
> it needs to check the mm isn't equal to the new mm, although it would
> be quite bizarre for this to happen.)  Other than these mmgrab and
> mmdrop calls, the mm switching code doesn't mmgrab or mmdrop at all.
> 
> 
> Version (a) seems like it could have excellent performance.

That said, if x86 wanted to explore something like this, the code to do
it is a bit modular (I don't think a proliferation of lazy refcounting
config options is a good idea of course, but 2 versions, one for the
powerpc style set-and-forget mm_cpumask and one for x86 set-and-clear,
would be okay).

> *However*, I think we should consider whether we want to do something
> even bigger first.  Even with any of these changes, we still need to
> maintain mm_cpumask(), and that itself can be a scalability problem.
> I wonder if we can solve this problem too.  Perhaps the switch_mm()
> paths could only ever set mm_cpumask bits,

Powerpc does this.

> and anyone who would send
> an IPI because a bit is set in mm_cpumask would first check some
> percpu variable (cpu_rq(cpu)->something? 

This is a suggested possible optimization to the IPI scheme (you would
check if it's active).

There's pros and cons to it. You get more IPIs and cross TLB shootdowns
and jitter cleaning up behind you rather than cleaning up as you go.

> an entirely new variable) to
> see if the bit in mm_cpumask is spurious.  Or perhaps mm_cpumask could
> be split up across multiple cachelines, one per node.

IIRC Peter or someone mentioned this was something that was looked at 
for x86.

> We should keep the recent lessons from Apple in mind, though: x86 is a
> dinosaur.

Wow. What is the recent lesson from Apple?? I'm completely out of the 
loop here.

> The future of atomics is going to look a lot more like
> ARM's LSE than x86's rather anemic set.  This means that mm_cpumask
> operations won't need to be full barriers forever, and we might not
> want to take the implied full barriers in set_bit() and clear_bit()
> for granted.

Sure, set_bit / clear_bit aren't full barriers in terms of Linux 
semantics, so generic code doesn't assume that. What x86 does is
add the smp_mb__after_blah or before_blah to avoid an added barrier
because of its heavier-than-required set_bit.

I'm not quite sure what you were getting at though. The atomic itself is 
really quite a small cost of the exit() operation (or even a context 
switch operation) _most_ of the time (x86 CPUs seem to have very fast
atomics so might be even smaller cost for you). It's just when you 
happen to bounce a cache line, which hurts no matter what you do.

Thanks,
Nick




^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-30 18:31       ` Andy Lutomirski
  2020-12-01 21:27         ` Will Deacon
@ 2020-12-02  3:47         ` Nicholas Piggin
  2020-12-03  5:05           ` Andy Lutomirski
  2020-12-03 17:03         ` Alexander Gordeev
  2 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-02  3:47 UTC (permalink / raw)
  To: Christian Borntraeger, Catalin Marinas, Dave Hansen,
	Vasily Gorbik, Heiko Carstens, Andy Lutomirski, Will Deacon
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of December 1, 2020 4:31 am:
> other arch folk: there's some background here:
> 
> https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com
> 
> On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
>>
>> On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
>> >
>> > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>> > >
>> > > On big systems, the mm refcount can become highly contended when doing
>> > > a lot of context switching with threaded applications (particularly
>> > > switching between the idle thread and an application thread).
>> > >
>> > > Abandoning lazy tlb slows switching down quite a bit in the important
>> > > user->idle->user cases, so instead implement a non-refcounted scheme
>> > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>> > > any remaining lazy ones.
>> > >
>> > > Shootdown IPIs are some concern, but they have not been observed to be
>> > > a big problem with this scheme (the powerpc implementation generated
>> > > 314 additional interrupts on a 144 CPU system during a kernel compile).
>> > > There are a number of strategies that could be employed to reduce IPIs
>> > > if they turn out to be a problem for some workload.
>> >
>> > I'm still wondering whether we can do even better.
>> >
>>
>> Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
>> the TLB.  On x86, this will shoot down all lazies as long as even a
>> single pagetable was freed.  (Or at least it will if we don't have a
>> serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
>> sets tlb->freed_tables, which will trigger the IPI.)  So, on
>> architectures like x86, the shootdown approach should be free.  The
>> only way it ought to have any excess IPIs is if we have CPUs in
>> mm_cpumask() that don't need IPI to free pagetables, which could
>> happen on paravirt.
> 
> Indeed, on x86, we do this:
> 
> [   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
> [   11.559905]  tlb_finish_mmu+0x10e/0x1a0
> [   11.561068]  exit_mmap+0xc8/0x1a0
> [   11.561932]  mmput+0x29/0xd0
> [   11.562688]  do_exit+0x316/0xa90
> [   11.563588]  do_group_exit+0x34/0xb0
> [   11.564476]  __x64_sys_exit_group+0xf/0x10
> [   11.565512]  do_syscall_64+0x34/0x50
> 
> and we have info->freed_tables set.
> 
> What are the architectures that have large systems like?
> 
> x86: we already zap lazies, so it should cost basically nothing to do

This is not zapping lazies, this is freeing the user page tables.

"lazy mm" is where a switch to a kernel thread takes on the
previous mm for its kernel mapping rather than switching to init_mm.

> a little loop at the end of __mmput() to make sure that no lazies are
> left.  If we care about paravirt performance, we could implement one
> of the optimizations I mentioned above to fix up the refcounts instead
> of sending an IPI to any remaining lazies.

It might be possible with x86's scheme to scan mm_cpumask (carefully
synchronized, or something) when the last user reference gets dropped,
and free the lazies at that point, but I don't know what that would
buy you because you're still having to maintain the mm_cpumask on
switches. powerpc's characteristics are just different here, so it
makes sense there, whereas I don't know if it would on x86.

> 
> arm64: AFAICT arm64's flush uses magic arm64 hardware support for
> remote flushes, so any lazy mm references will still exist after
> exit_mmap().  (arm64 uses lazy TLB, right?)  So this is kind of like
> the x86 paravirt case.  Are there large enough arm64 systems that any
> of this matters?
> 
> s390x: The code has too many acronyms for me to understand it fully,
> but I think it's more or less the same situation as arm64.  How big do
> s390x systems come?
> 
> power: Ridiculously complicated, seems to vary by system and kernel config.
> 
> So, Nick, your unconditional IPI scheme is apparently a big
> improvement for power, and it should be an improvement and have low
> cost for x86.

As said, the tradeoffs are different, I'm not so sure. It was a big 
improvement on a very big system with the powerpc mm_cpumask switching
model on a microbenchmark designed to stress this, which is about all
I can say for it.

> On arm64 and s390x it will add more IPIs on process
> exit but reduce contention on context switching depending on how lazy
> TLB works.  I suppose we could try it for all architectures without
> any further optimizations.

It will remain opt-in but certainly try it out and see. There are some
requirements as documented in the config option text.

> Or we could try one of the perhaps
> excessively clever improvements I linked above.  arm64, s390x people,
> what do you think?
> 

I'm not against improvements to the scheme. e.g., from the patch

+               /*
+                * IPI overheads have not found to be expensive, but they could
+                * be reduced in a number of possible ways, for example (in
+                * roughly increasing order of complexity):
+                * - A batch of mms requiring IPIs could be gathered and freed
+                *   at once.
+                * - CPUs could store their active mm somewhere that can be
+                *   remotely checked without a lock, to filter out
+                *   false-positives in the cpumask.
+                * - After mm_users or mm_count reaches zero, switching away
+                *   from the mm could clear mm_cpumask to reduce some IPIs
+                *   (some batching or delaying would help).
+                * - A delayed freeing and RCU-like quiescing sequence based on
+                *   mm switching to avoid IPIs completely.
+                */

But would like to have numbers before being too clever.
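
Just to make the first of those options concrete, a batched version
could look very roughly like the sketch below. This is not from the
series: the lazy_free llist node in mm_struct, do_shoot_lazy_tlb_batch()
and the work function are all made up for illustration.

	static LLIST_HEAD(lazy_free_list);

	static void lazy_free_work_fn(struct work_struct *work)
	{
		struct llist_node *batch = llist_del_all(&lazy_free_list);
		struct mm_struct *mm, *tmp;
		cpumask_var_t mask;

		if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
			return; /* real code would fall back to per-mm IPIs */

		/* one IPI pass covering every mm gathered in the batch */
		llist_for_each_entry(mm, batch, lazy_free)
			cpumask_or(mask, mask, mm_cpumask(mm));
		on_each_cpu_mask(mask, do_shoot_lazy_tlb_batch, batch, 1);

		llist_for_each_entry_safe(mm, tmp, batch, lazy_free) {
			/* now safe to actually free the mm, as __mmdrop() would */
		}
		free_cpumask_var(mask);
	}
	static DECLARE_WORK(lazy_free_work, lazy_free_work_fn);

	static void shoot_lazy_tlbs_batched(struct mm_struct *mm)
	{
		/* first entry onto an empty list kicks the worker */
		if (llist_add(&mm->lazy_free, &lazy_free_list))
			schedule_work(&lazy_free_work);
	}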

Thanks,
Nick


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-28 16:01 ` [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
  2020-11-29  3:54   ` Andy Lutomirski
@ 2020-12-02 11:17   ` Peter Zijlstra
  2020-12-02 12:45     ` Peter Zijlstra
  2020-12-02 14:19   ` Peter Zijlstra
  2 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2020-12-02 11:17 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-kernel, x86, Mathieu Desnoyers, Arnd Bergmann, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard

On Sun, Nov 29, 2020 at 02:01:39AM +1000, Nicholas Piggin wrote:
> +static void shoot_lazy_tlbs(struct mm_struct *mm)
> +{
> +	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
> +		/*
> +		 * IPI overheads have not been found to be expensive, but they could
> +		 * be reduced in a number of possible ways, for example (in
> +		 * roughly increasing order of complexity):
> +		 * - A batch of mms requiring IPIs could be gathered and freed
> +		 *   at once.
> +		 * - CPUs could store their active mm somewhere that can be
> +		 *   remotely checked without a lock, to filter out
> +		 *   false-positives in the cpumask.
> +		 * - After mm_users or mm_count reaches zero, switching away
> +		 *   from the mm could clear mm_cpumask to reduce some IPIs
> +		 *   (some batching or delaying would help).
> +		 * - A delayed freeing and RCU-like quiescing sequence based on
> +		 *   mm switching to avoid IPIs completely.
> +		 */
> +		on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
> +		if (IS_ENABLED(CONFIG_DEBUG_VM))
> +			on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);

So the obvious 'improvement' here would be something like:

	for_each_online_cpu(cpu) {
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->active_mm != mm)
			continue;
		__cpumask_set_cpu(cpu, tmpmask);
	}
	on_each_cpu_mask(tmpmask, ...);

The remote CPU will never switch _to_ @mm, on account of it being quite
dead, but it is quite prone to false negatives.

Consider that __schedule() sets rq->curr *before* context_switch(), this
means we'll see next->active_mm, even though prev->active_mm might still
be our @mm.
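
I.e. the interleaving of concern is roughly (sketch):

	/*
	 *   CPU0 (__schedule)              CPU1 (scanning rq->curr)
	 *
	 *   rq->curr = next
	 *                                   p = rcu_dereference(rq->curr)
	 *                                   p->active_mm != mm
	 *                                     -> skip CPU0
	 *   context_switch()
	 *     prev->active_mm == mm is
	 *     still in (lazy) use here
	 */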

Now, because we'll be removing the atomic ops from context_switch()'s
active_mm swizzling, I think we can change this to something like the
below. The hope being that the cost of the new barrier can be offset by
the loss of the atomics.

Hmm ?

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41404afb7f4c..2597c5c0ccb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4509,7 +4509,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	if (!next->mm) {                                // to kernel
 		enter_lazy_tlb(prev->active_mm, next);
 
-		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
 			mmgrab(prev->active_mm);
 		else
@@ -4524,6 +4523,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		 * case 'prev->active_mm == next->mm' through
 		 * finish_task_switch()'s mmdrop().
 		 */
+		next->active_mm = next->mm;
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
@@ -5713,11 +5713,9 @@ static void __sched notrace __schedule(bool preempt)
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
-		/*
-		 * RCU users of rcu_dereference(rq->curr) may not see
-		 * changes to task_struct made by pick_next_task().
-		 */
-		RCU_INIT_POINTER(rq->curr, next);
+
+		next->active_mm = prev->active_mm;
+		rcu_assign_pointer(rq->curr, next);
 		/*
 		 * The membarrier system call requires each architecture
 		 * to have a full memory barrier after updating


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-02 11:17   ` Peter Zijlstra
@ 2020-12-02 12:45     ` Peter Zijlstra
  0 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2020-12-02 12:45 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-kernel, x86, Mathieu Desnoyers, Arnd Bergmann, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard

On Wed, Dec 02, 2020 at 12:17:31PM +0100, Peter Zijlstra wrote:

> So the obvious 'improvement' here would be something like:
> 
> 	for_each_online_cpu(cpu) {
> 		p = rcu_dereference(cpu_rq(cpu)->curr);
> 		if (p->active_mm != mm)
> 			continue;
> 		__cpumask_set_cpu(cpu, tmpmask);
> 	}
> 	on_each_cpu_mask(tmpmask, ...);
> 
> The remote CPU will never switch _to_ @mm, on account of it being quite
> dead, but it is quite prone to false negatives.
> 
> Consider that __schedule() sets rq->curr *before* context_switch(), this
> means we'll see next->active_mm, even though prev->active_mm might still
> be our @mm.
> 
> Now, because we'll be removing the atomic ops from context_switch()'s
> active_mm swizzling, I think we can change this to something like the
> below. The hope being that the cost of the new barrier can be offset by
> the loss of the atomics.
> 
> Hmm ?
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 41404afb7f4c..2597c5c0ccb0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4509,7 +4509,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
>  	if (!next->mm) {                                // to kernel
>  		enter_lazy_tlb(prev->active_mm, next);
>  
> -		next->active_mm = prev->active_mm;
>  		if (prev->mm)                           // from user
>  			mmgrab(prev->active_mm);
>  		else
> @@ -4524,6 +4523,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
>  		 * case 'prev->active_mm == next->mm' through
>  		 * finish_task_switch()'s mmdrop().
>  		 */
> +		next->active_mm = next->mm;
>  		switch_mm_irqs_off(prev->active_mm, next->mm, next);

I think that next->active_mm store should be after switch_mm(),
otherwise we still race.
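
I.e. something like (sketch):

	switch_mm_irqs_off(prev->active_mm, next->mm, next);
	/*
	 * Only publish the new active_mm once the switch is done, so a
	 * remote rq->curr scan can't miss a still-loaded lazy mm.
	 */
	next->active_mm = next->mm;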

>  
>  		if (!prev->mm) {                        // from kernel
> @@ -5713,11 +5713,9 @@ static void __sched notrace __schedule(bool preempt)
>  
>  	if (likely(prev != next)) {
>  		rq->nr_switches++;
> -		/*
> -		 * RCU users of rcu_dereference(rq->curr) may not see
> -		 * changes to task_struct made by pick_next_task().
> -		 */
> -		RCU_INIT_POINTER(rq->curr, next);
> +
> +		next->active_mm = prev->active_mm;
> +		rcu_assign_pointer(rq->curr, next);
>  		/*
>  		 * The membarrier system call requires each architecture
>  		 * to have a full memory barrier after updating


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-28 16:01 ` [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
  2020-11-29  3:54   ` Andy Lutomirski
  2020-12-02 11:17   ` Peter Zijlstra
@ 2020-12-02 14:19   ` Peter Zijlstra
  2020-12-02 14:38     ` Andy Lutomirski
  2 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2020-12-02 14:19 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-kernel, x86, Mathieu Desnoyers, Arnd Bergmann, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard

On Sun, Nov 29, 2020 at 02:01:39AM +1000, Nicholas Piggin wrote:
> +		 * - A delayed freeing and RCU-like quiescing sequence based on
> +		 *   mm switching to avoid IPIs completely.

That one's interesting too. so basically you want to count switch_mm()
invocations on each CPU. Then, periodically snapshot the counter on each
CPU, and when they've all changed, increment a global counter.

Then, you snapshot the global counter and wait for it to increment
(twice I think, the first increment might already be in progress).

The only question here is what should drive this machinery.. the tick
probably.

This shouldn't be too hard to do I think.

Something a little like so perhaps?


diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41404afb7f4c..27b64a60a468 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4525,6 +4525,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		 * finish_task_switch()'s mmdrop().
 		 */
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
+		rq->nr_mm_switches++;
 
 		if (!prev->mm) {                        // from kernel
 			/* will mmdrop() in finish_task_switch(). */
@@ -4739,6 +4740,80 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+static DEFINE_PER_CPU(unsigned long[2], mm_switches);
+
+static struct {
+	unsigned long __percpu *switches[2];
+	unsigned long generation;
+	atomic_t complete;
+	struct wait_queue_head wait;
+} mm_foo = {
+	.switches = &mm_switches,
+	.generation = 0,
+	.complete = ATOMIC_INIT(-1), // XXX bootstrap, hotplug
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(mm_foo.wait),
+};
+
+static bool mm_gen_tick(int cpu, struct rq *rq)
+{
+	unsigned long prev, curr, switches = rq->nr_mm_switches;
+	int idx = READ_ONCE(mm_foo.generation) & 1;
+
+	/* DATA-DEP on mm_foo.generation */
+
+	prev = __this_cpu_read(mm_foo.switches[idx^1]);
+	curr = __this_cpu_read(mm_foo.switches[idx]);
+
+	/* we haven't switched since the last generation */
+	if (prev == switches)
+		return false;
+
+	__this_cpu_write(mm_foo.switches[idx], switches);
+
+	/*
+	 * If @curr is less than @prev, this is the first update of
+	 * this generation; per the above, switches has also increased since,
+	 * so mark our CPU complete.
+	 */
+	if ((long)(curr - prev) < 0 && atomic_dec_and_test(&mm_foo.complete)) {
+		/*
+		 * All CPUs are complete, IOW they all switched at least once
+		 * since the last generation. Reset the completion counter and
+		 * increment the generation.
+		 */
+		atomic_set(&mm_foo.complete, num_online_cpus());
+		/*
+		 * Matches the address dependency above:
+		 *
+		 *   idx = gen & 1	complete = nr_cpus
+		 *   <DATA-DEP>		<WMB>
+		 *   curr = sw[idx]	generation++;
+		 *   prev = sw[idx^1]
+		 *   if (curr < prev)
+		 *     complete--
+		 *
+		 * If we don't observe the new generation, we'll not decrement. If we
+		 * do see the new generation, we must also see the new completion count.
+		 */
+		smp_wmb();
+		mm_foo.generation++;
+		return true;
+	}
+
+	return false;
+}
+
+static void mm_gen_wake(void)
+{
+	wake_up_all(&mm_foo.wait);
+}
+
+static void mm_gen_wait(void)
+{
+	unsigned long gen = READ_ONCE(mm_foo.generation);
+	wait_event(mm_foo.wait, READ_ONCE(mm_foo.generation) - gen > 1);
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4750,6 +4825,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	bool wake_mm_gen;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4763,8 +4839,13 @@ void scheduler_tick(void)
 	calc_global_load_tick(rq);
 	psi_task_tick(rq);
 
+	wake_mm_gen = mm_gen_tick(cpu, rq);
+
 	rq_unlock(rq, &rf);
 
+	if (wake_mm_gen)
+		mm_gen_wake();
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf9d8da7d35e..62fb685db8d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -927,6 +927,7 @@ struct rq {
 	unsigned int		ttwu_pending;
 #endif
 	u64			nr_switches;
+	u64			nr_mm_switches;
 
 #ifdef CONFIG_UCLAMP_TASK
 	/* Utilization clamp values based on CPU's RUNNABLE tasks */


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-02 14:19   ` Peter Zijlstra
@ 2020-12-02 14:38     ` Andy Lutomirski
  2020-12-02 16:29       ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-02 14:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Nicholas Piggin, linux-kernel, x86, Mathieu Desnoyers,
	Arnd Bergmann, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard


> On Dec 2, 2020, at 6:20 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Sun, Nov 29, 2020 at 02:01:39AM +1000, Nicholas Piggin wrote:
>> +         * - A delayed freeing and RCU-like quiescing sequence based on
>> +         *   mm switching to avoid IPIs completely.
> 
> That one's interesting too. so basically you want to count switch_mm()
> invocations on each CPU. Then, periodically snapshot the counter on each
> CPU, and when they've all changed, increment a global counter.
> 
> Then, you snapshot the global counter and wait for it to increment
> (twice I think, the first increment might already be in progress).
> 
> The only question here is what should drive this machinery.. the tick
> probably.
> 
> This shouldn't be too hard to do I think.
> 
> Something a little like so perhaps?

I don’t think this will work.  A CPU can go idle with lazy mm and nohz forever.  This could lead to unbounded memory use on a lightly loaded system.



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-02 14:38     ` Andy Lutomirski
@ 2020-12-02 16:29       ` Peter Zijlstra
  0 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2020-12-02 16:29 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nicholas Piggin, linux-kernel, x86, Mathieu Desnoyers,
	Arnd Bergmann, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

On Wed, Dec 02, 2020 at 06:38:12AM -0800, Andy Lutomirski wrote:
> 
> > On Dec 2, 2020, at 6:20 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Sun, Nov 29, 2020 at 02:01:39AM +1000, Nicholas Piggin wrote:
> >> +         * - A delayed freeing and RCU-like quiescing sequence based on
> >> +         *   mm switching to avoid IPIs completely.
> > 
> > That one's interesting too. so basically you want to count switch_mm()
> > invocations on each CPU. Then, periodically snapshot the counter on each
> > CPU, and when they've all changed, increment a global counter.
> > 
> > Then, you snapshot the global counter and wait for it to increment
> > (twice I think, the first increment might already be in progress).
> > 
> > The only question here is what should drive this machinery.. the tick
> > probably.
> > 
> > This shouldn't be too hard to do I think.
> > 
> > Something a little like so perhaps?
> 
> I don’t think this will work.  A CPU can go idle with lazy mm and nohz
> forever.  This could lead to unbounded memory use on a lightly loaded
> system.

Hurm.. quite so indeed. Fixing that seems to end up requiring that
other proposal, such that we can tell which CPU has what active_mm
stuck.

Also, more complicated... :/


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-02  3:47         ` Nicholas Piggin
@ 2020-12-03  5:05           ` Andy Lutomirski
  0 siblings, 0 replies; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-03  5:05 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Christian Borntraeger, Catalin Marinas, Dave Hansen,
	Vasily Gorbik, Heiko Carstens, Andy Lutomirski, Will Deacon,
	Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

> On Dec 1, 2020, at 7:47 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>
> Excerpts from Andy Lutomirski's message of December 1, 2020 4:31 am:
>> other arch folk: there's some background here:
>>
>> https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com
>>
>>> On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
>>>
>>> On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
>>>>
>>>> On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>>
>>>>> On big systems, the mm refcount can become highly contended when doing
>>>>> a lot of context switching with threaded applications (particularly
>>>>> switching between the idle thread and an application thread).
>>>>>
>>>>> Abandoning lazy tlb slows switching down quite a bit in the important
>>>>> user->idle->user cases, so instead implement a non-refcounted scheme
>>>>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>>>>> any remaining lazy ones.
>>>>>
>>>>> Shootdown IPIs are some concern, but they have not been observed to be
>>>>> a big problem with this scheme (the powerpc implementation generated
>>>>> 314 additional interrupts on a 144 CPU system during a kernel compile).
>>>>> There are a number of strategies that could be employed to reduce IPIs
>>>>> if they turn out to be a problem for some workload.
>>>>
>>>> I'm still wondering whether we can do even better.
>>>>
>>>
>>> Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
>>> the TLB.  On x86, this will shoot down all lazies as long as even a
>>> single pagetable was freed.  (Or at least it will if we don't have a
>>> serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
>>> sets tlb->freed_tables, which will trigger the IPI.)  So, on
>>> architectures like x86, the shootdown approach should be free.  The
>>> only way it ought to have any excess IPIs is if we have CPUs in
>>> mm_cpumask() that don't need IPI to free pagetables, which could
>>> happen on paravirt.
>>
>> Indeed, on x86, we do this:
>>
>> [   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
>> [   11.559905]  tlb_finish_mmu+0x10e/0x1a0
>> [   11.561068]  exit_mmap+0xc8/0x1a0
>> [   11.561932]  mmput+0x29/0xd0
>> [   11.562688]  do_exit+0x316/0xa90
>> [   11.563588]  do_group_exit+0x34/0xb0
>> [   11.564476]  __x64_sys_exit_group+0xf/0x10
>> [   11.565512]  do_syscall_64+0x34/0x50
>>
>> and we have info->freed_tables set.
>>
>> What are the architectures that have large systems like?
>>
>> x86: we already zap lazies, so it should cost basically nothing to do
>
> This is not zapping lazies, this is freeing the user page tables.
>
> "lazy mm" is where a switch to a kernel thread takes on the
> previous mm for its kernel mapping rather than switch to init_mm.

The intent of the code is to flush the TLB after freeing user page
tables, but, on bare metal, lazies get zapped as a side effect.

Anyway, I'm going to send out a mockup of an alternative approach shortly.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-02  2:49     ` Nicholas Piggin
@ 2020-12-03  5:09       ` Andy Lutomirski
  2020-12-05  8:00         ` Nicholas Piggin
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-03  5:09 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andy Lutomirski, Anton Blanchard, Arnd Bergmann, linux-arch,
	LKML, Linux-MM, linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra,
	X86 ML

On Tue, Dec 1, 2020 at 6:50 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> Excerpts from Andy Lutomirski's message of November 29, 2020 3:55 am:
> > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> >>
> >> And get rid of the generic sync_core_before_usermode facility. This is
> >> functionally a no-op in the core scheduler code, but it also catches
> >>
> >> This helper is the wrong way around I think. The idea that membarrier
> >> state requires a core sync before returning to user is the easy one
> >> that does not need hiding behind membarrier calls. The gap in core
> >> synchronization due to x86's sysret/sysexit and lazy tlb mode, is the
> >> tricky detail that is better put in x86 lazy tlb code.
> >>
> >> Consider if an arch did not synchronize core in switch_mm either, then
> >> membarrier_mm_sync_core_before_usermode would be in the wrong place
> >> but arch specific mmu context functions would still be the right place.
> >> There is also an exit_lazy_tlb case that is not covered by this call, which
> >> could be a bug (a kthread uses the membarrier process's mm, then context
> >> switches back to the process without switching mm or a lazy mm switch).
> >>
> >> This makes lazy tlb code a bit more modular.
> >
> > I have a couple of membarrier fixes that I want to send out today or
> > tomorrow, and they might eliminate the need for this patch.  Let me
> > think about this a little bit.  I'll cc you.  The existing code is way
> > to subtle and the comments are far too confusing for me to be quickly
> > confident about any of my conclusions :)
> >
>
> Thanks for the head's up. I'll have to have a better look through them
> but I don't know that it eliminates the need for this entirely although
> it might close some gaps and make this not a bug fix. The problem here
> is x86 code wanted something to be called when a lazy mm is unlazied,
> but it missed some spots and also the core scheduler doesn't need to
> know about those x86 details if it has this generic call that annotates
> the lazy handling better.

I'll send v3 tomorrow.  They add more sync_core_before_usermode() callers.

Having looked at your patches a bunch and the membarrier code a bunch,
I don't think I like the approach of pushing this logic out into new
core functions called by arch code.  Right now, even with my
membarrier patches applied, understanding how (for example) the x86
switch_mm_irqs_off() plus the scheduler code provides the barriers
that membarrier needs is quite complicated, and it's not clear to me
that the code is even correct.  At the very least I'm pretty sure that
the x86 comments are misleading.  With your patches, someone trying to
audit the code would have to follow core code calling into arch code
and back out into core code to figure out what's going on.  I think
the result is worse.

I wrote this incomplete patch which takes the opposite approach (sorry
for whitespace damage):

commit 928b5c60e93f475934892d6e0b357ebf0a2bf87d
Author: Andy Lutomirski <luto@kernel.org>
Date:   Wed Dec 2 17:24:02 2020 -0800

    [WIP] x86/mm: Handle unlazying membarrier core sync in the arch code

    The core scheduler isn't a great place for
    membarrier_mm_sync_core_before_usermode() -- the core scheduler
    doesn't actually know whether we are lazy.  With the old code, if a
    CPU is running a membarrier-registered task, goes idle, gets unlazied
    via a TLB shootdown IPI, and switches back to the
    membarrier-registered task, it will do an unnecessary core sync.

    Conveniently, x86 is the only architecture that does anything in this
    hook, so we can just move the code.

    XXX: actually delete the old code.

    Signed-off-by: Andy Lutomirski <luto@kernel.org>

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3338a1feccf9..e27300fc865b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -496,6 +496,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
          * from one thread in a process to another thread in the same
          * process. No TLB flush required.
          */
+
+        // XXX: why is this okay wrt membarrier?
         if (!was_lazy)
             return;

@@ -508,12 +510,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
         smp_mb();
         next_tlb_gen = atomic64_read(&next->context.tlb_gen);
         if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
-                next_tlb_gen)
+            next_tlb_gen) {
+            /*
+             * We're reactivating an mm, and membarrier might
+             * need to serialize.  Tell membarrier.
+             */
+
+            // XXX: I can't understand the logic in
+            // membarrier_mm_sync_core_before_usermode().  What's
+            // the mm check for?
+            membarrier_mm_sync_core_before_usermode(next);
             return;
+        }

         /*
          * TLB contents went out of date while we were in lazy
          * mode. Fall through to the TLB switching code below.
+         * No need for an explicit membarrier invocation -- the CR3
+         * write will serialize.
          */
         new_asid = prev_asid;
         need_flush = true;


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-11-30 18:31       ` Andy Lutomirski
  2020-12-01 21:27         ` Will Deacon
  2020-12-02  3:47         ` Nicholas Piggin
@ 2020-12-03 17:03         ` Alexander Gordeev
  2020-12-03 17:14           ` Andy Lutomirski
  2 siblings, 1 reply; 46+ messages in thread
From: Alexander Gordeev @ 2020-12-03 17:03 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Will Deacon, Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Dave Hansen, Nicholas Piggin, LKML,
	X86 ML, Mathieu Desnoyers, Arnd Bergmann, Peter Zijlstra,
	linux-arch, linuxppc-dev, Linux-MM, Anton Blanchard

On Mon, Nov 30, 2020 at 10:31:51AM -0800, Andy Lutomirski wrote:
> other arch folk: there's some background here:
> 
> https://lkml.kernel.org/r/CALCETrVXUbe8LfNn-Qs+DzrOQaiw+sFUg1J047yByV31SaTOZw@mail.gmail.com
> 
> On Sun, Nov 29, 2020 at 12:16 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Sat, Nov 28, 2020 at 7:54 PM Andy Lutomirski <luto@kernel.org> wrote:
> > >
> > > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
> > > >
> > > > On big systems, the mm refcount can become highly contended when doing
> > > > a lot of context switching with threaded applications (particularly
> > > > switching between the idle thread and an application thread).
> > > >
> > > > Abandoning lazy tlb slows switching down quite a bit in the important
> > > > user->idle->user cases, so instead implement a non-refcounted scheme
> > > > that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> > > > any remaining lazy ones.
> > > >
> > > > Shootdown IPIs are some concern, but they have not been observed to be
> > > > a big problem with this scheme (the powerpc implementation generated
> > > > 314 additional interrupts on a 144 CPU system during a kernel compile).
> > > > There are a number of strategies that could be employed to reduce IPIs
> > > > if they turn out to be a problem for some workload.
> > >
> > > I'm still wondering whether we can do even better.
> > >
> >
> > Hold on a sec.. __mmput() unmaps VMAs, frees pagetables, and flushes
> > the TLB.  On x86, this will shoot down all lazies as long as even a
> > single pagetable was freed.  (Or at least it will if we don't have a
> > serious bug, but the code seems okay.  We'll hit pmd_free_tlb, which
> > sets tlb->freed_tables, which will trigger the IPI.)  So, on
> > architectures like x86, the shootdown approach should be free.  The
> > only way it ought to have any excess IPIs is if we have CPUs in
> > mm_cpumask() that don't need IPI to free pagetables, which could
> > happen on paravirt.
> 
> Indeed, on x86, we do this:
> 
> [   11.558844]  flush_tlb_mm_range.cold+0x18/0x1d
> [   11.559905]  tlb_finish_mmu+0x10e/0x1a0
> [   11.561068]  exit_mmap+0xc8/0x1a0
> [   11.561932]  mmput+0x29/0xd0
> [   11.562688]  do_exit+0x316/0xa90
> [   11.563588]  do_group_exit+0x34/0xb0
> [   11.564476]  __x64_sys_exit_group+0xf/0x10
> [   11.565512]  do_syscall_64+0x34/0x50
> 
> and we have info->freed_tables set.
> 
> What are the architectures that have large systems like?
> 
> x86: we already zap lazies, so it should cost basically nothing to do
> a little loop at the end of __mmput() to make sure that no lazies are
> left.  If we care about paravirt performance, we could implement one
> of the optimizations I mentioned above to fix up the refcounts instead
> of sending an IPI to any remaining lazies.
> 
> arm64: AFAICT arm64's flush uses magic arm64 hardware support for
> remote flushes, so any lazy mm references will still exist after
> exit_mmap().  (arm64 uses lazy TLB, right?)  So this is kind of like
> the x86 paravirt case.  Are there large enough arm64 systems that any
> of this matters?
> 
> s390x: The code has too many acronyms for me to understand it fully,
> but I think it's more or less the same situation as arm64.  How big do
> s390x systems come?
> 
> power: Ridiculously complicated, seems to vary by system and kernel config.
> 
> So, Nick, your unconditional IPI scheme is apparently a big
> improvement for power, and it should be an improvement and have low
> cost for x86.  On arm64 and s390x it will add more IPIs on process
> exit but reduce contention on context switching depending on how lazy

s390 does not invalidate TLBs per-CPU explicitly - we have special
instructions for that. Those in turn initiate signalling to other
CPUs, completely transparently to the OS.

Apart from mm_count, I am struggling to realize how the suggested
scheme could change the contention on s390 in connection with
TLB. Could you clarify a bit here, please?

> TLB works.  I suppose we could try it for all architectures without
> any further optimizations.  Or we could try one of the perhaps
> excessively clever improvements I linked above.  arm64, s390x people,
> what do you think?

I do not immediately see anything in the series that would harm
performance on s390.

We however use mm_cpumask to distinguish between local and global TLB
flushes. With this series it looks like mm_cpumask is *required* to
be consistent with lazy users. And that is something quite difficult
for us to adhere to (at least in the foreseeable future).

But actually keeping track of lazy users in a cpumask is something
the generic code would rather do AFAICT.

Thanks!


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-03 17:03         ` Alexander Gordeev
@ 2020-12-03 17:14           ` Andy Lutomirski
  2020-12-03 18:33             ` Alexander Gordeev
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-03 17:14 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: Andy Lutomirski, Will Deacon, Catalin Marinas, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Dave Hansen,
	Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, Linux-MM,
	Anton Blanchard



> On Dec 3, 2020, at 9:09 AM, Alexander Gordeev <agordeev@linux.ibm.com> wrote:
> 
> On Mon, Nov 30, 2020 at 10:31:51AM -0800, Andy Lutomirski wrote:
>> other arch folk: there's some background here:

> 
>> 
>> power: Ridiculously complicated, seems to vary by system and kernel config.
>> 
>> So, Nick, your unconditional IPI scheme is apparently a big
>> improvement for power, and it should be an improvement and have low
>> cost for x86.  On arm64 and s390x it will add more IPIs on process
>> exit but reduce contention on context switching depending on how lazy
> 
> s390 does not invalidate TLBs per-CPU explicitly - we have special
> instructions for that. Those in turn initiate signalling to other
> CPUs, completely transparent to OS.

Just to make sure I understand: this means that you broadcast flushes to all CPUs, not just a subset?

> 
> Apart from mm_count, I am struggling to realize how the suggested
> > scheme could change the contention on s390 in connection with
> TLB. Could you clarify a bit here, please?

I’m just talking about mm_count. Maintaining mm_count is quite expensive on some workloads.

> 
>> TLB works.  I suppose we could try it for all architectures without
>> any further optimizations.  Or we could try one of the perhaps
>> excessively clever improvements I linked above.  arm64, s390x people,
>> what do you think?
> 
> I do not immediately see anything in the series that would harm
> performance on s390.
> 
> We however use mm_cpumask to distinguish between local and global TLB
> flushes. With this series it looks like mm_cpumask is *required* to
> be consistent with lazy users. And that is something quite difficult
> for us to adhere to (at least in the foreseeable future).

You don’t actually need to maintain mm_cpumask — we could scan all CPUs instead.

> 
> But actually keeping track of lazy users in a cpumask is something
> the generic code would rather do AFAICT.

The problem is that arches don’t agree on what the contents of mm_cpumask should be.  Tracking a mask of exactly what the arch wants in generic code is a nontrivial operation.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2020-12-03 17:14           ` Andy Lutomirski
@ 2020-12-03 18:33             ` Alexander Gordeev
  0 siblings, 0 replies; 46+ messages in thread
From: Alexander Gordeev @ 2020-12-03 18:33 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Will Deacon, Catalin Marinas, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Dave Hansen,
	Nicholas Piggin, LKML, X86 ML, Mathieu Desnoyers, Arnd Bergmann,
	Peter Zijlstra, linux-arch, linuxppc-dev, Linux-MM,
	Anton Blanchard

On Thu, Dec 03, 2020 at 09:14:22AM -0800, Andy Lutomirski wrote:
> 
> 
> > On Dec 3, 2020, at 9:09 AM, Alexander Gordeev <agordeev@linux.ibm.com> wrote:
> > 
> > On Mon, Nov 30, 2020 at 10:31:51AM -0800, Andy Lutomirski wrote:
> >> other arch folk: there's some background here:
> 
> > 
> >> 
> >> power: Ridiculously complicated, seems to vary by system and kernel config.
> >> 
> >> So, Nick, your unconditional IPI scheme is apparently a big
> >> improvement for power, and it should be an improvement and have low
> >> cost for x86.  On arm64 and s390x it will add more IPIs on process
> >> exit but reduce contention on context switching depending on how lazy
> > 
> > s390 does not invalidate TLBs per-CPU explicitly - we have special
> > instructions for that. Those in turn initiate signalling to other
> > CPUs, completely transparent to OS.
> 
> Just to make sure I understand: this means that you broadcast flushes to all CPUs, not just a subset?

Correct.
If mm has one CPU attached we flush TLB only for that CPU.
If mm has more than one CPUs attached we flush all CPUs' TLBs.

In fact, the details are a bit more complicated, since the hardware
is able to flush subsets of TLB entries depending on the provided
parameters (e.g. the page tables used to create those entries).
But we cannot select a CPU subset.

> > Apart from mm_count, I am struggling to realize how the suggested
> > scheme could change the contention on s390 in connection with
> > TLB. Could you clarify a bit here, please?
> 
> I’m just talking about mm_count. Maintaining mm_count is quite expensive on some workloads.
> 
> > 
> >> TLB works.  I suppose we could try it for all architectures without
> >> any further optimizations.  Or we could try one of the perhaps
> >> excessively clever improvements I linked above.  arm64, s390x people,
> >> what do you think?
> > 
> > I do not immediately see anything in the series that would harm
> > performance on s390.
> > 
> > We however use mm_cpumask to distinguish between local and global TLB
> > flushes. With this series it looks like mm_cpumask is *required* to
> > be consistent with lazy users. And that is something quite difficult
> > for us to adhere to (at least in the foreseeable future).
> 
> You don’t actually need to maintain mm_cpumask — we could scan all CPUs instead.
> 
> > 
> > But actually keeping track of lazy users in a cpumask is something
> > the generic code would rather do AFAICT.
> 
> The problem is that arches don’t agree on what the contents of mm_cpumask should be.  Tracking a mask of exactly what the arch wants in generic code is a nontrivial operation.

It could be yet another cpumask or the CPU scan you mentioned.
Just wanted to make sure there is no new requirement for an arch
to maintain mm_cpumask ;)

Thanks, Andy!


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-03  5:09       ` Andy Lutomirski
@ 2020-12-05  8:00         ` Nicholas Piggin
  2020-12-05 16:11           ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-05  8:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of December 3, 2020 3:09 pm:
> On Tue, Dec 1, 2020 at 6:50 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> Excerpts from Andy Lutomirski's message of November 29, 2020 3:55 am:
>> > On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>> >>
>> >> And get rid of the generic sync_core_before_usermode facility. This is
>> >> functionally a no-op in the core scheduler code, but it also catches
>> >>
>> >> This helper is the wrong way around I think. The idea that membarrier
>> >> state requires a core sync before returning to user is the easy one
>> >> that does not need hiding behind membarrier calls. The gap in core
>> >> synchronization due to x86's sysret/sysexit and lazy tlb mode, is the
>> >> tricky detail that is better put in x86 lazy tlb code.
>> >>
>> >> Consider if an arch did not synchronize core in switch_mm either, then
>> >> membarrier_mm_sync_core_before_usermode would be in the wrong place
>> >> but arch specific mmu context functions would still be the right place.
>> >> There is also an exit_lazy_tlb case that is not covered by this call, which
>> >> could be a bug (a kthread uses the membarrier process's mm, then context
>> >> switches back to the process without switching mm or a lazy mm switch).
>> >>
>> >> This makes lazy tlb code a bit more modular.
>> >
>> > I have a couple of membarrier fixes that I want to send out today or
>> > tomorrow, and they might eliminate the need for this patch.  Let me
>> > think about this a little bit.  I'll cc you.  The existing code is way
>> > to subtle and the comments are far too confusing for me to be quickly
>> > confident about any of my conclusions :)
>> >
>>
>> Thanks for the head's up. I'll have to have a better look through them
>> but I don't know that it eliminates the need for this entirely although
>> it might close some gaps and make this not a bug fix. The problem here
>> is x86 code wanted something to be called when a lazy mm is unlazied,
>> but it missed some spots and also the core scheduler doesn't need to
>> know about those x86 details if it has this generic call that annotates
>> the lazy handling better.
> 
> I'll send v3 tomorrow.  They add more sync_core_before_usermode() callers.
> 
> Having looked at your patches a bunch and the membarrier code a bunch,
> I don't think I like the approach of pushing this logic out into new
> core functions called by arch code.  Right now, even with my
> membarrier patches applied, understanding how (for example) the x86
> switch_mm_irqs_off() plus the scheduler code provides the barriers
> that membarrier needs is quite complicated, and it's not clear to me
> that the code is even correct.  At the very least I'm pretty sure that
> the x86 comments are misleading.
>
> With your patches, someone trying to
> audit the code would have to follow core code calling into arch code
> and back out into core code to figure out what's going on.  I think
> the result is worse.

Sorry I missed this; rather than reply to the later version, I'll
reply here where you have the bigger comment.

I disagree. Until now, nobody following the code noticed that the mm
gets un-lazied in other cases, because that was not very clear from the
code (it is only described indirectly, with non-standard terminology, in
the arch support document).

In other words, membarrier needs a special sync to deal with the case 
when a kthread takes the mm. exit_lazy_tlb gives membarrier code the
exact hook it wants from the core scheduler code.
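
On the x86 side that could be roughly the following (sketch only, not
the actual hunk from the series):

	/* arch/x86/include/asm/mmu_context.h, sketch */
	static inline void exit_lazy_tlb(struct mm_struct *mm,
					 struct task_struct *tsk)
	{
		/*
		 * sysret/sysexit do not serialise the core, so if this mm
		 * registered for SYNC_CORE membarrier, serialise when the
		 * lazy mm is dropped.
		 */
		if (atomic_read(&mm->membarrier_state) &
		    MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)
			sync_core_before_usermode();
	}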

> 
> I wrote this incomplete patch which takes the opposite approach (sorry
> for whitespace damage):

That said, if you want to move the code entirely in the x86 arch from
exit_lazy_tlb to switch_mm_irqs_off, it's trivial and touches no core
code after my series :) and I would have no problem with doing that.

I suspect it might actually be more readable to go the other way and
pull most of the real_prev == next membarrier code into exit_lazy_tlb
instead, but I could be wrong; I don't know exactly how the x86 lazy
state correlates with core lazy tlb state.

Thanks,
Nick

> 
> commit 928b5c60e93f475934892d6e0b357ebf0a2bf87d
> Author: Andy Lutomirski <luto@kernel.org>
> Date:   Wed Dec 2 17:24:02 2020 -0800
> 
>     [WIP] x86/mm: Handle unlazying membarrier core sync in the arch code
> 
>     The core scheduler isn't a great place for
>     membarrier_mm_sync_core_before_usermode() -- the core scheduler
>     doesn't actually know whether we are lazy.  With the old code, if a
>     CPU is running a membarrier-registered task, goes idle, gets unlazied
>     via a TLB shootdown IPI, and switches back to the
>     membarrier-registered task, it will do an unnecessary core sync.
> 
>     Conveniently, x86 is the only architecture that does anything in this
>     hook, so we can just move the code.
> 
>     XXX: actually delete the old code.
> 
>     Signed-off-by: Andy Lutomirski <luto@kernel.org>
> 
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 3338a1feccf9..e27300fc865b 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -496,6 +496,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>           * from one thread in a process to another thread in the same
>           * process. No TLB flush required.
>           */
> +
> +        // XXX: why is this okay wrt membarrier?
>          if (!was_lazy)
>              return;
> 
> @@ -508,12 +510,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>          smp_mb();
>          next_tlb_gen = atomic64_read(&next->context.tlb_gen);
>          if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
> -                next_tlb_gen)
> +            next_tlb_gen) {
> +            /*
> +             * We're reactivating an mm, and membarrier might
> +             * need to serialize.  Tell membarrier.
> +             */
> +
> +            // XXX: I can't understand the logic in
> +            // membarrier_mm_sync_core_before_usermode().  What's
> +            // the mm check for?
> +            membarrier_mm_sync_core_before_usermode(next);
>              return;
> +        }
> 
>          /*
>           * TLB contents went out of date while we were in lazy
>           * mode. Fall through to the TLB switching code below.
> +         * No need for an explicit membarrier invocation -- the CR3
> +         * write will serialize.
>           */
>          new_asid = prev_asid;
>          need_flush = true;
> 


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-05  8:00         ` Nicholas Piggin
@ 2020-12-05 16:11           ` Andy Lutomirski
  2020-12-05 23:14             ` Nicholas Piggin
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-05 16:11 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andy Lutomirski, Anton Blanchard, Arnd Bergmann, linux-arch,
	LKML, Linux-MM, linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra,
	X86 ML


> On Dec 5, 2020, at 12:00 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> 
> I disagree. Until now nobody following it noticed that the mm gets
> un-lazied in other cases, because that was not too clear from the
> code (only indirectly using non-standard terminology in the arch
> support document).

> In other words, membarrier needs a special sync to deal with the case 
> when a kthread takes the mm.

I don’t think this is actually true. Somehow the x86 oddities about CR3 writes leaked too much into the membarrier core code and comments. (I doubt this is x86 specific.  The actual x86 specific part seems to be that we can return to user mode without syncing the instruction stream.)

As far as I can tell, membarrier doesn’t care at all about laziness. Membarrier cares about rq->curr->mm.  The fact that a cpu can switch its actual loaded mm without scheduling at all (on x86 at least) is entirely beside the point except insofar as it has an effect on whether a subsequent switch_mm() call serializes.  If we notify membarrier about x86’s asynchronous CR3 writes, then membarrier needs to understand what to do with them, which results in an unmaintainable mess in membarrier *and* in the x86 code.

I’m currently trying to document how membarrier actually works, and hopefully this will result in untangling membarrier from mmdrop() and such.

A silly part of this is that x86 already has a high quality implementation of most of membarrier(): flush_tlb_mm().  If you flush an mm’s TLB, we carefully propagate the flush to all threads, with attention to memory ordering.  We can’t use this directly as an arch-specific implementation of membarrier because it has the annoying side effect of flushing the TLB and because upcoming hardware might be able to flush without guaranteeing a core sync.  (Upcoming means Zen 3, but the Zen 3 implementation is sadly not usable by Linux.)


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-05 16:11           ` Andy Lutomirski
@ 2020-12-05 23:14             ` Nicholas Piggin
  2020-12-06  0:36               ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-05 23:14 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Andy Lutomirski, Mathieu Desnoyers, Peter Zijlstra,
	X86 ML

Excerpts from Andy Lutomirski's message of December 6, 2020 2:11 am:
> 
>> On Dec 5, 2020, at 12:00 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> 
>> I disagree. Until now nobody following it noticed that the mm gets
>> un-lazied in other cases, because that was not too clear from the
>> code (only indirectly using non-standard terminology in the arch
>> support document).
> 
>> In other words, membarrier needs a special sync to deal with the case 
>> when a kthread takes the mm.
> 
> I don’t think this is actually true. Somehow the x86 oddities about 
> CR3 writes leaked too much into the membarrier core code and comments. 
> (I doubt this is x86 specific.  The actual x86 specific part seems to 
> be that we can return to user mode without syncing the instruction 
> stream.)
> 
> As far as I can tell, membarrier doesn’t care at all about laziness. 
> Membarrier cares about rq->curr->mm.  The fact that a cpu can switch 
> its actual loaded mm without scheduling at all (on x86 at least) is 
> entirely beside the point except insofar as it has an effect on 
> whether a subsequent switch_mm() call serializes.

Core membarrier itself doesn't care about laziness, which is why the
membarrier flush should go in exit_lazy_tlb() or other x86 specific
code (at least until more architectures do the same thing and we move
it into generic code). I just meant that this non-serialising return, as
documented in the membarrier arch enablement doc, is what creates the
lazy tlb requirement.

If an mm was lazy tlb for a kernel thread and then it becomes unlazy,
and if switch_mm is serialising but return to user is not, then you
need a serialising instruction somewhere before return to user. unlazy
is the logical place to add that, because the lazy tlb mm (i.e., 
switching to a kernel thread and back without switching mm) is what 
opens the hole.

> If we notify 
> membarrier about x86’s asynchronous CR3 writes, then membarrier needs 
> to understand what to do with them, which results in an unmaintainable 
> mess in membarrier *and* in the x86 code.

How do you mean? exit_lazy_tlb is the opposite: the core scheduler
notifying arch code when an mm becomes not-lazy, and it has nothing to
do with membarrier at all. It's a convenient hook to do your un-lazying.
I guess you could also do it by checking things in switch_mm and keeping
state in arch code, but I don't think that's necessarily the best place
to put it.

So membarrier code is unchanged (it only cares that the serialise is
done at un-lazy time), core code is simpler (it has no knowledge of this
membarrier quirk, and it already knows about lazy tlb, so the calls
actually improve the documentation), and the x86 code, I would argue,
becomes nicer (or at worst no real difference), because you can move
some exit-lazy-tlb handling to that specific call rather than having to
decipher it from switch_mm.

> 
> I’m currently trying to document how membarrier actually works, and 
> hopefully this will result in untangling membarrier from mmdrop() and 
> such.

That would be nice.

> 
> A silly part of this is that x86 already has a high quality 
> implementation of most of membarrier(): flush_tlb_mm().  If you flush 
> an mm’s TLB, we carefully propagate the flush to all threads, with 
> attention to memory ordering.  We can’t use this directly as an 
> arch-specific implementation of membarrier because it has the annoying 
> side effect of flushing the TLB and because upcoming hardware might be
> able to flush without guaranteeing a core sync.  (Upcoming means Zen 
> 3, but the Zen 3 implementation is sadly not usable by Linux.)
> 

A hardware broadcast TLB flush, you mean? What makes it unusable by 
Linux out of curiosity?


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-05 23:14             ` Nicholas Piggin
@ 2020-12-06  0:36               ` Andy Lutomirski
  2020-12-06  3:59                 ` Nicholas Piggin
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-06  0:36 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Andy Lutomirski, Mathieu Desnoyers, Peter Zijlstra,
	X86 ML

On Sat, Dec 5, 2020 at 3:15 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> Excerpts from Andy Lutomirski's message of December 6, 2020 2:11 am:
> >

> If an mm was lazy tlb for a kernel thread and then it becomes unlazy,
> and if switch_mm is serialising but return to user is not, then you
> need a serialising instruction somewhere before return to user. unlazy
> is the logical place to add that, because the lazy tlb mm (i.e.,
> switching to a kernel thread and back without switching mm) is what
> opens the hole.

The issue here is that unlazying on x86 sometimes serializes and
sometimes doesn't.  It's straightforward to add logic to the x86 code
to serialize specifically in the case where membarrier core sync is
registered and unlazying would otherwise not serialize, but trying to
define sensible semantics for this in a call to core code seems
complicated.  (Specifically, the x86 code only sometimes sends IPIs to
lazy CPUs for TLB flushes.  If an IPI is skipped, then unlazying will
flush the TLB, and that operation is serializing.)

The whole lazy thing is IMO a red herring for membarrier().  The
membarrier() logic requires that switching *logical* mms
(rq->curr->mm) serializes before user mode if the new mm is registered
for core sync.  AFAICT the only architecture on which this isn't
automatic is x86, and somehow the logic turned into "actually changing
rq->curr->mm serializes, but unlazying only sometimes serializes, so
we need to do an extra serialization step on unlazying operations"
instead of "tell x86 to make sure it always serializes when it
switches logical mms".  The latter is easy to specify and easy to
implement.

>
> How do you mean? exit_lazy_tlb is the opposite, core scheduler notifying
> arch code about when an mm becomes not-lazy, and nothing to do with
> membarrier at all even. It's a convenient hook to do your un-lazying.
> I guess you can do it also checking things in switch_mm and keeping state
> in arch code, I don't think that's necessarily the best place to put it.

I'm confused.  I just re-read your patches, and it looks like you have
arch code calling exit_lazy_tlb().  On x86, if we do a TLB shootdown
IPI to a lazy CPU, the IPI handler will unlazy that CPU (by switching
to init_mm for real), and we have no way to notify the core scheduler
about this, so we don't.  The result is that the core scheduler state
and the x86 state gets out of sync.  If the core scheduler
subsequently switches us back to the mm that it thinks we were still
using lazily them, from the x86 code's perspective, we're not
unlazying -- we're just doing a regular switch from init_mm to some
other mm.  This is why x86's switch_mm_irqs_off() totally ignores its
'prev' argument.

I'm honestly a bit surprised that other architectures don't do the
same thing.  I suppose that some architectures don't use shootdown
IPIs at all, in which case there doesn't seem to be any good reason to
aggressively unlazy.

(Oddly, despite the fact that, since Ivy Bridge, x86 has a "just flush
the TLB" instruction, that instruction is considerably slower than the
old "switch mm and flush" operation.  So the operation "switch to
init_mm" is only ever slower than "flush and stay lazy" if we get
lucky and unlazy to the same mm before we get a second TLB shootdown
*and* if unlazying to the same mm would not have needed to flush.  I
spend quite a bit of time tuning this stuff and being quite surprised
at the bizarre performance properties of Intel's TLB management
instructions.)

>
> So membarrier code is unchanged (it cares that the serialise is done at
> un-lazy time), core code is simpler (no knowledge of this membarrier
> quirk and it already knows about lazy-tlb so the calls actually improve
> the documentation), and x86 code I would argue becomes nicer (or no real
> difference at worst) because you can move some exit lazy tlb handling to
> that specific call rather than decipher it from switch_mm.

As above, I can't move the exit-lazy handling because the scheduler
doesn't know when I'm unlazying.

>
> >
> > I’m currently trying to document how membarrier actually works, and
> > hopefully this will result in untangling membarrier from mmdrop() and
> > such.
>
> That would be nice.

It's still a work in progress.  I haven't actually convinced myself
that the non-IPI case in membarrier() is correct, nor have I convinced
myself that it's incorrect.

Anyway, I think that my patch is a bit incorrect and I either need a
barrier somewhere (which may already exist) or a store-release to
lazy_mm to make sure that all accesses to the lazy mm are done before
lazy_mm is freed.  On x86, even aside from the fact that all stores
are releases, this isn't needed -- stopping using an mm is itself a
full barrier.  Will this be a performance issue on power?
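
The store-release version would be something like (sketch; cpu_lazy_mm
here stands in for whatever per-cpu variable the mockup ends up using):

	/*
	 * Hypothetical: order all prior accesses through the lazy mm before
	 * publishing that this CPU no longer holds it, so the freeing side
	 * can rely on having seen them.
	 */
	smp_store_release(this_cpu_ptr(&cpu_lazy_mm), NULL);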

>
> >
> > A silly part of this is that x86 already has a high quality
> > implementation of most of membarrier(): flush_tlb_mm().  If you flush
> > an mm’s TLB, we carefully propagate the flush to all threads, with
> > attention to memory ordering.  We can’t use this directly as an
> > arch-specific implementation of membarrier because it has the annoying
> > side affect of flushing the TLB and because upcoming hardware might be
> > able to flush without guaranteeing a core sync.  (Upcoming means Zen
> > 3, but the Zen 3 implementation is sadly not usable by Linux.)
> >
>
> A hardware broadcast TLB flush, you mean? What makes it unusable by
> Linux out of curiosity?

The new instruction is INVLPGB.  Unfortunately, x86's ASID field is
very narrow, and there's no way we can give each mm the same ASID
across all CPUs, which means we can't accurately target the flush at
the correct set of TLB entries.  I've asked engineers at both Intel
and AMD to widen the ASID field, but that will end up being
complicated -- x86 has run out of bits in its absurdly overloaded CR3
encoding, and widening the ASID to any reasonable size would require
adding a new way to switch mms.  There are lots of reasons that x86
should do that anyway [0], but it would be a big project and I'm not
sure that either company is interested in big projects like that.

[0] On x86, you can't switch between (64-bit execution, 48-bit virtual
address space) and (64-bit execution, 57-bit address space) without
exiting 64-bit mode in the middle.  This is because the way that the
addressing mode is split among multiple registers prevents a single
instruction from switching between the two states.  This is absolutely
delightful for anyone trying to boot an OS on a system with a very,
very large amount of memory.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-06  0:36               ` Andy Lutomirski
@ 2020-12-06  3:59                 ` Nicholas Piggin
  2020-12-11  0:11                   ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-06  3:59 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of December 6, 2020 10:36 am:
> On Sat, Dec 5, 2020 at 3:15 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> Excerpts from Andy Lutomirski's message of December 6, 2020 2:11 am:
>> >
> 
>> If an mm was lazy tlb for a kernel thread and then it becomes unlazy,
>> and if switch_mm is serialising but return to user is not, then you
>> need a serialising instruction somewhere before return to user. unlazy
>> is the logical place to add that, because the lazy tlb mm (i.e.,
>> switching to a kernel thread and back without switching mm) is what
>> opens the hole.
> 
> The issue here is that unlazying on x86 sometimes serializes and
> sometimes doesn't.

That's additional state that x86 keeps around though, which is
fine. It can optimise that case if it knows it's already
serialised.

> It's straightforward to add logic to the x86 code
> to serialize specifically in the case where membarrier core sync is
> registered and unlazying would otherwise not serialize, but trying to
> define sensible semantics for this in a call to core code seems
> complicated.

It's not, though; it's a call from core code (to arch code).
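
Something like this on the x86 side is the shape I have in mind (a
sketch only, not the exact patch; the real membarrier check would also
cover the global-expedited sync-core state):

static inline void exit_lazy_tlb(struct mm_struct *mm,
				 struct task_struct *tsk)
{
	/* If this mm registered for membarrier core sync and the
	 * un-lazy path didn't already serialise (e.g. via a CR3
	 * write), serialise here before returning to userspace. */
	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)
		sync_core_before_usermode();
}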

> (Specifically, the x86 code only sometimes sends IPIs to
> lazy CPUs for TLB flushes.  If an IPI is skipped, then unlazying will
> flush the TLB, and that operation is serializing.
> 
> The whole lazy thing is IMO a red herring for membarrier().  The
> membarrier() logic requires that switching *logical* mms
> (rq->curr->mm) serializes before user mode if the new mm is registered
> for core sync.

It's not a red herring; the reason the IPI gets skipped is that
we go to a kernel thread -- that's all core code and core lazy tlb
handling.

That x86 might do some additional serialising ops during un-lazy in
some cases doesn't make that a red herring; it just means that you
can take advantage of it and avoid doing an extra serialising op.

> AFAICT the only architecture on which this isn't
> automatic is x86, and somehow the logic turned into "actually changing
> rq->curr->mm serializes, but unlazying only sometimes serializes, so
> we need to do an extra serialization step on unlazying operations"
> instead of "tell x86 to make sure it always serializes when it
> switches logical mms".  The latter is easy to specify and easy to
> implement.
> 
>>
>> How do you mean? exit_lazy_tlb is the opposite, core scheduler notifying
>> arch code about when an mm becomes not-lazy, and nothing to do with
>> membarrier at all even. It's a convenient hook to do your un-lazying.
>> I guess you can do it also checking things in switch_mm and keeping state
>> in arch code, I don't think that's necessarily the best place to put it.
> 
> I'm confused.  I just re-read your patches, and it looks like you have
> arch code calling exit_lazy_tlb().

More for code-comment / consistency than anything else. They are
entirely arch hooks.

> On x86, if we do a TLB shootdown
> IPI to a lazy CPU, the IPI handler will unlazy that CPU (by switching
> to init_mm for real), and we have no way to notify the core scheduler
> about this, so we don't.  The result is that the core scheduler state
> and the x86 state gets out of sync.  If the core scheduler
> subsequently switches us back to the mm that it thinks we were still
> using lazily, then, from the x86 code's perspective, we're not
> unlazying -- we're just doing a regular switch from init_mm to some
> other mm.  This is why x86's switch_mm_irqs_off() totally ignores its
> 'prev' argument.

You actually do have such a way now that we've (hopefully) closed the
races, and I think you should use it, which might make things simpler
for you. See patch 6, do_shoot_lazy_tlb().
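
The IPI handler there is basically just (paraphrasing the patch, not a
verbatim copy):

static void do_shoot_lazy_tlb(void *arg)
{
	struct mm_struct *mm = arg;

	/* If this CPU is still using the dying mm lazily, switch it
	 * over to init_mm so the mm can be freed without refcounting. */
	if (current->active_mm == mm) {
		WARN_ON_ONCE(current->mm);
		current->active_mm = &init_mm;
		switch_mm(mm, &init_mm, current);
	}
}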

> I'm honestly a bit surprised that other architectures don't do the
> same thing.  I suppose that some architectures don't use shootdown
> IPIs at all, in which case there doesn't seem to be any good reason to
> aggressively unlazy.

powerpc/radix has done so (in some cases) for a few years now. It just
doesn't fully exploit that so the final TLB shootdown always cleans
them all up and avoids the subsequent shoot-lazies IPI, but it could
be more aggressive there.

The powerpc virtualised hash architecture is the traditional one and 
isn't conducive to this (translation management is done via hcalls, and
the hypervisor maintains the TLB) so I suspect that's why it wasn't
done earlier there. That will continue to rely on shoot-lazies.

> (Oddly, despite the fact that, since Ivy Bridge, x86 has a "just flush
> the TLB" instruction, that instruction is considerably slower than the
> old "switch mm and flush" operation.  So the operation "switch to
> init_mm" is only ever any slower than "flush and stay lazy" if we get
> lucky and unlazy to the same mm before we get a second TLB shootdown
> *and* if unlazying to the same mm would not have needed to flush.  I
> spent quite a bit of time tuning this stuff and being quite surprised
> at the bizarre performance properties of Intel's TLB management
> instructions.)

Well, you also cause an extra mm switch in the case where you return
to the same mm, which probably isn't uncommon (app<->idle).

>>
>> So membarrier code is unchanged (it cares that the serialise is done at
>> un-lazy time), core code is simpler (no knowledge of this membarrier
>> quirk and it already knows about lazy-tlb so the calls actually improve
>> the documentation), and x86 code I would argue becomes nicer (or no real
>> difference at worst) because you can move some exit lazy tlb handling to
>> that specific call rather than decipher it from switch_mm.
> 
> As above, I can't move the exit-lazy handling because the scheduler
> doesn't know when I'm unlazying.

As above, you can actually tell it. But even if you don't do that, in
the current scheme it's still telling you a superset of what you need,
so you'd just move your extra checks there.

> 
>>
>> >
>> > I’m currently trying to document how membarrier actually works, and
>> > hopefully this will result in untangling membarrier from mmdrop() and
>> > such.
>>
>> That would be nice.
> 
> It's still a work in progress.  I haven't actually convinced myself
> that the non-IPI case in membarrier() is correct, nor have I convinced
> myself that it's incorrect.
> 
> Anyway, I think that my patch is a bit incorrect and I either need a
> barrier somewhere (which may already exist) or a store-release to
> lazy_mm to make sure that all accesses to the lazy mm are done before
> lazy_mm is freed.  On x86, even aside from the fact that all stores
> are releases, this isn't needed -- stopping using an mm is itself a
> full barrier.  Will this be a performance issue on power?

store-release is lwsync on power. Not as bad as a full barrier, but
probably not wonderful. The fast path would be worse than shoot-lazies,
of course, but may not be prohibitive.
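
(For reference, smp_store_release() on powerpc expands to roughly the
following -- a sketch, not the exact barrier.h definition:)

	__asm__ __volatile__("lwsync" : : : "memory");	/* order prior accesses */
	WRITE_ONCE(*p, v);				/* then publish the store */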

I'm still going to pursue shoot-lazies for the merge window. As you
see it's about a dozen lines and an if (IS_ENABLED(... in core code.
Your change is common code, but a significant complexity (which
affects all archs), so it needs a lot more review and testing at this
point.
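
(The core-code change I mean is a helper of roughly this shape --
names approximate, not the exact diff:)

static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
	/* With MMU_LAZY_TLB_SHOOTDOWN the lazy reference count is not
	 * used; the final shoot-lazies IPI keeps the mm safe instead. */
	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
		mmdrop(mm);
}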

If x86 is already shooting lazies in its final TLB flush, I don't
know why you're putting so much effort in, though; surely it's more
complexity and (even slightly) more cost there too.

> 
>>
>> >
>> > A silly part of this is that x86 already has a high quality
>> > implementation of most of membarrier(): flush_tlb_mm().  If you flush
>> > an mm’s TLB, we carefully propagate the flush to all threads, with
>> > attention to memory ordering.  We can’t use this directly as an
>> > arch-specific implementation of membarrier because it has the annoying
>> > side effect of flushing the TLB and because upcoming hardware might be
>> > able to flush without guaranteeing a core sync.  (Upcoming means Zen
>> > 3, but the Zen 3 implementation is sadly not usable by Linux.)
>> >
>>
>> A hardware broadcast TLB flush, you mean? What makes it unusable by
>> Linux out of curiosity?
> 
> The new instruction is INVLPGB.  Unfortunately, x86's ASID field is
> very narrow, and there's no way we can give each mm the same ASID
> across all CPUs, which means we can't accurately target the flush at
> the correct set of TLB entries.  I've asked engineers at both Intel
> and AMD to widen the ASID field, but that will end up being
> complicated -- x86 has run out of bits in its absurdly overloaded CR3
> encoding, and widening the ASID to any reasonable size would require
> adding a new way to switch mms.  There are lots of reasons that x86
> should do that anyway [0], but it would be a big project and I'm not
> sure that either company is interested in big projects like that.

Interesting, thanks. powerpc has a PID register for guest ASIDs that
implements about 20 bits.

The IPI is very flexible, though: it allows more complex/fine-grained
flushes and also software state to be updated, so we've started using
it a bit. I haven't seen much software where the performance of IPIs
is prohibitive these days. Maybe improvements to threaded malloc,
JVMs, databases, etc. reduce the number of flushes.


> [0] On x86, you can't switch between (64-bit execution, 48-bit virtual
> address space) and (64-bit execution, 57-bit address space) without
> exiting 64-bit mode in the middle.  This is because the way that the
> addressing mode is split among multiple registers prevents a single
> instruction from switching between the two states.  This is absolutely
> delightful for anyone trying to boot an OS on a system with a very,
> very large amount of memory.
> 

powerpc has some issues like that with context switching guest / host
state: there are several MMU registers involved that can't be switched
with a single instruction. It doesn't require such a big hammer, just
a careful sequence to switch things.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-06  3:59                 ` Nicholas Piggin
@ 2020-12-11  0:11                   ` Andy Lutomirski
  2020-12-14  4:07                     ` Nicholas Piggin
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2020-12-11  0:11 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andy Lutomirski, Anton Blanchard, Arnd Bergmann, linux-arch,
	LKML, Linux-MM, linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra,
	X86 ML

> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>

> I'm still going to pursue shoot-lazies for the merge window. As you
> see it's about a dozen lines and an if (IS_ENABLED(... in core code.
> Your change is common code, but a significant complexity (which
> affects all archs), so it needs a lot more review and testing at this
> point.

I don't think it's ready for this merge window.  I read the early
patches again, and I think they make the membarrier code worse, not
better.  I'm not fundamentally opposed to the shoot-lazies concept,
but it needs more thought and it needs a cleaner foundation.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-11  0:11                   ` Andy Lutomirski
@ 2020-12-14  4:07                     ` Nicholas Piggin
  2020-12-14  5:53                       ` Nicholas Piggin
  0 siblings, 1 reply; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-14  4:07 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>
> 
>> I'm still going to pursue shoot-lazies for the merge window. As you
>> see it's about a dozen lines and an if (IS_ENABLED(... in core code.
>> Your change is common code, but a significant complexity (which
>> affects all archs), so it needs a lot more review and testing at this
>> point.
> 
> I don't think it's ready for this merge window.

Yes, I meant the next one (aka this one from a development perspective :)).

> I read the early
> patches again, and I think they make the membarrier code worse, not
> better.

Mathieu and I disagree, so we are at an impasse. I addressed your
comment about not being able to do the additional core sync avoidance
from the exit tlb call (you can indeed do so in your arch code), about
exit_lazy_tlb being a call into the scheduler (it's not), and about
the arch code not being able to reconcile the lazy tlb mm with the
core scheduler code (you can).

I fundamentally think the core sync is an issue with what the membarrier
/ arch specifics are doing with lazy tlb mm switching, and not something
the core scheduler needs to know about at all. I don't see the big
problem with essentially moving it from an explicit call into
exit_lazy_tlb (which, from the scheduler's POV, better describes what
it is doing rather than how).

> I'm not fundamentally opposed to the shoot-lazies concept,
> but it needs more thought and it needs a cleaner foundation.

Well, shoot-lazies doesn't really rely on that membarrier change at
all; it just came along as a nice-looking cleanup, so that part can
be dropped from the series. It's not really foundational.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
  2020-12-14  4:07                     ` Nicholas Piggin
@ 2020-12-14  5:53                       ` Nicholas Piggin
  0 siblings, 0 replies; 46+ messages in thread
From: Nicholas Piggin @ 2020-12-14  5:53 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Anton Blanchard, Arnd Bergmann, linux-arch, LKML, Linux-MM,
	linuxppc-dev, Mathieu Desnoyers, Peter Zijlstra, X86 ML

Excerpts from Nicholas Piggin's message of December 14, 2020 2:07 pm:
> Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>
>> 
>>> I'm still going to pursue shoot-lazies for the merge window. As you
>>> see it's about a dozen lines and an if (IS_ENABLED(... in core code.
>>> Your change is common code, but a significant complexity (which
>>> affects all archs), so it needs a lot more review and testing at this
>>> point.
>> 
>> I don't think it's ready for this merge window.
> 
> Yes, I meant the next one (aka this one from a development perspective :)).
> 
>> I read the early
>> patches again, and I think they make the membarrier code worse, not
>> better.
> 
> Mathieu and I disagree, so we are at an impasse.

Well, actually not really: I went and cut the exit_lazy_tlb stuff out
of the patch series; those are better untangled anyway. I think an
earlier version had something in exit_lazy_tlb for the mm refcounting
change, but it's not required now anyway.

I'll split them out and just work on the shoot-lazies series for now;
I might revisit exit_lazy_tlb after the dust settles from that and the
current membarrier changes. I'll test and repost shortly.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2020-12-14  5:53 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-28 16:01 [PATCH 0/8] shoot lazy tlbs Nicholas Piggin
2020-11-28 16:01 ` [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb Nicholas Piggin
2020-11-29  0:38   ` Andy Lutomirski
2020-12-02  2:49     ` Nicholas Piggin
2020-11-28 16:01 ` [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode Nicholas Piggin
2020-11-28 17:55   ` Andy Lutomirski
2020-12-02  2:49     ` Nicholas Piggin
2020-12-03  5:09       ` Andy Lutomirski
2020-12-05  8:00         ` Nicholas Piggin
2020-12-05 16:11           ` Andy Lutomirski
2020-12-05 23:14             ` Nicholas Piggin
2020-12-06  0:36               ` Andy Lutomirski
2020-12-06  3:59                 ` Nicholas Piggin
2020-12-11  0:11                   ` Andy Lutomirski
2020-12-14  4:07                     ` Nicholas Piggin
2020-12-14  5:53                       ` Nicholas Piggin
2020-11-30 14:57   ` Mathieu Desnoyers
2020-11-28 16:01 ` [PATCH 3/8] x86: remove ARCH_HAS_SYNC_CORE_BEFORE_USERMODE Nicholas Piggin
2020-11-28 16:01 ` [PATCH 4/8] lazy tlb: introduce lazy mm refcount helper functions Nicholas Piggin
2020-11-28 16:01 ` [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable Nicholas Piggin
2020-11-29  0:36   ` Andy Lutomirski
2020-12-02  2:49     ` Nicholas Piggin
2020-11-28 16:01 ` [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
2020-11-29  3:54   ` Andy Lutomirski
2020-11-29 20:16     ` Andy Lutomirski
2020-11-30  9:25       ` Peter Zijlstra
2020-11-30 18:31       ` Andy Lutomirski
2020-12-01 21:27         ` Will Deacon
2020-12-01 21:50           ` Andy Lutomirski
2020-12-01 23:04             ` Will Deacon
2020-12-02  3:47         ` Nicholas Piggin
2020-12-03  5:05           ` Andy Lutomirski
2020-12-03 17:03         ` Alexander Gordeev
2020-12-03 17:14           ` Andy Lutomirski
2020-12-03 18:33             ` Alexander Gordeev
2020-11-30  9:26     ` Peter Zijlstra
2020-11-30  9:30     ` Peter Zijlstra
2020-11-30  9:34       ` Peter Zijlstra
2020-12-02  3:09     ` Nicholas Piggin
2020-12-02 11:17   ` Peter Zijlstra
2020-12-02 12:45     ` Peter Zijlstra
2020-12-02 14:19   ` Peter Zijlstra
2020-12-02 14:38     ` Andy Lutomirski
2020-12-02 16:29       ` Peter Zijlstra
2020-11-28 16:01 ` [PATCH 7/8] powerpc: use lazy mm refcount helper functions Nicholas Piggin
2020-11-28 16:01 ` [PATCH 8/8] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN Nicholas Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).