linux-mm.kvack.org archive mirror
* [PATCH v3 0/4] shoot lazy tlbs
@ 2021-06-01  6:22 Nicholas Piggin
  2021-06-01  6:23 ` [PATCH v3 1/4] lazy tlb: introduce lazy mm refcount helper functions Nicholas Piggin
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-01  6:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nicholas Piggin, Randy Dunlap, linux-kernel, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard, Andy Lutomirski

There haven't been objections to the series since the last posting;
this is just a rebase that tidies up a few comments, with some minor
patch rearranging.

Thanks,
Nick

Nicholas Piggin (4):
  lazy tlb: introduce lazy mm refcount helper functions
  lazy tlb: allow lazy tlb mm switching to be configurable
  lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN

 arch/Kconfig                         | 38 ++++++++++++
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/Kconfig                 |  1 +
 arch/powerpc/kernel/smp.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 +-
 fs/exec.c                            |  4 +-
 include/linux/sched/mm.h             | 20 +++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/fork.c                        | 52 ++++++++++++++++
 kernel/kthread.c                     | 11 ++--
 kernel/sched/core.c                  | 88 ++++++++++++++++++++--------
 kernel/sched/sched.h                 |  4 +-
 13 files changed, 192 insertions(+), 38 deletions(-)

-- 
2.23.0




* [PATCH v3 1/4] lazy tlb: introduce lazy mm refcount helper functions
  2021-06-01  6:22 [PATCH v3 0/4] shoot lazy tlbs Nicholas Piggin
@ 2021-06-01  6:23 ` Nicholas Piggin
  2021-06-01  6:23 ` [PATCH v3 2/4] lazy tlb: allow lazy tlb mm switching to be configurable Nicholas Piggin
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-01  6:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nicholas Piggin, Randy Dunlap, linux-kernel, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard, Andy Lutomirski

Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes lazy mm references more obvious, and allows explicit
refcounting to be removed if it is not used.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/kernel/smp.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  4 ++--
 include/linux/sched/mm.h             | 11 +++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 11 +++++++----
 kernel/sched/core.c                  | 15 ++++++++-------
 9 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..1b4a41aad793 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
 	current->mm = mm;
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
 }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2e05c783440a..fb0bdfc67366 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1541,7 +1541,7 @@ void start_secondary(void *unused)
 {
 	unsigned int cpu = raw_smp_processor_id();
 
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 409e61210789..2962082787c0 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -663,10 +663,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
 	if (current->active_mm == mm) {
 		WARN_ON_ONCE(current->mm != NULL);
 		/* Is a kernel thread and is using mm as the lazy tlb */
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 
 	/*
diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..ca0f8b1af23a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1029,9 +1029,9 @@ static int exec_mmap(struct mm_struct *mm)
 		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
-		return 0;
+	} else {
+		mmdrop_lazy_tlb(active_mm);
 	}
-	mmdrop(active_mm);
 	return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index e24b1fe348e3..bfd1baca5266 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e538518556f4..e87a89824e6c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -602,7 +602,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index fd1c04193e18..8e87ec5f6be2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -476,7 +476,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
 	}
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fe3f2a40d61e..b70e28431a01 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1314,14 +1314,14 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	mmgrab(mm);
+
 	task_lock(tsk);
 	/* Hold off tlb flush IPIs while switching mm's */
 	local_irq_disable();
 	active_mm = tsk->active_mm;
-	if (active_mm != mm) {
-		mmgrab(mm);
+	if (active_mm != mm)
 		tsk->active_mm = mm;
-	}
 	tsk->mm = mm;
 	membarrier_update_current_mm(mm);
 	switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1341,7 +1341,7 @@ void kthread_use_mm(struct mm_struct *mm)
 	 * mmdrop(), or explicitly with smp_mb().
 	 */
 	if (active_mm != mm)
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 	else
 		smp_mb();
 
@@ -1375,10 +1375,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	local_irq_disable();
 	tsk->mm = NULL;
 	membarrier_update_current_mm(NULL);
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5226cc26a095..e359c76ea2e2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4229,13 +4229,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, so provide them here:
 	 *
 	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-	 *   provided by mmdrop(),
+	 *   provided by mmdrop_lazy_tlb(),
 	 * - a sync_core for SYNC_CORE.
 	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
+
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -4299,9 +4300,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -4309,7 +4310,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -4325,7 +4326,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
@@ -8239,7 +8240,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
-- 
2.23.0
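
At this stage the helpers are pure annotation: mmgrab_lazy_tlb() and
mmdrop_lazy_tlb() forward to mmgrab()/mmdrop() unchanged, so behavior is
identical and only the call sites become distinguishable. The standalone
sketch below models that indirection; it is illustrative user-space C under
assumed simplifications (a toy mm_struct holding just a refcount, a main()
that plays both the real user and the lazy kernel thread), not kernel code.

/*
 * Toy model of the _lazy_tlb wrapper indirection. Illustrative only:
 * the struct, printf and main() are assumptions for demonstration.
 * Build with: cc -std=c11 sketch.c
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct mm_struct {
	atomic_int mm_count;		/* models mm->mm_count */
};

static void mmgrab(struct mm_struct *mm)
{
	atomic_fetch_add(&mm->mm_count, 1);
}

static void mmdrop(struct mm_struct *mm)
{
	/* free on the 1 -> 0 transition, like __mmdrop() */
	if (atomic_fetch_sub(&mm->mm_count, 1) == 1) {
		printf("last reference dropped, freeing mm\n");
		free(mm);
	}
}

/* The wrappers added by this patch: same behavior, distinct call sites. */
static void mmgrab_lazy_tlb(struct mm_struct *mm)
{
	mmgrab(mm);
}

static void mmdrop_lazy_tlb(struct mm_struct *mm)
{
	mmdrop(mm);
}

int main(void)
{
	struct mm_struct *mm = malloc(sizeof(*mm));

	atomic_init(&mm->mm_count, 1);	/* reference held by the real user */
	mmgrab_lazy_tlb(mm);		/* a kernel thread keeps mm as lazy tlb */
	mmdrop(mm);			/* the real user exits */
	mmdrop_lazy_tlb(mm);		/* lazy switch-away drops the last ref */
	return 0;
}

With every lazy tlb reference funnelled through the wrappers, the following
patches can change what the wrappers do without re-auditing the remaining
plain mmgrab()/mmdrop() call sites.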




* [PATCH v3 2/4] lazy tlb: allow lazy tlb mm switching to be configurable
  2021-06-01  6:22 [PATCH v3 0/4] shoot lazy tlbs Nicholas Piggin
  2021-06-01  6:23 ` [PATCH v3 1/4] lazy tlb: introduce lazy mm refcount helper functions Nicholas Piggin
@ 2021-06-01  6:23 ` Nicholas Piggin
  2021-06-01  6:23 ` [PATCH v3 3/4] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-01  6:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nicholas Piggin, Randy Dunlap, linux-kernel, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard, Andy Lutomirski

Add CONFIG_MMU_LAZY_TLB, which can be configured out to disable the
lazy tlb mechanism entirely; the kernel then switches to init_mm when
switching to a kernel thread.

NOMMU systems could easily go without this and save a bit of code and
the refcount atomics, because their mm switch is a no-op. They have not
been switched over by default because the arch code needs to be audited
and tested for lazy tlb mm refcounting and converted to _lazy_tlb
refcounting if necessary.

CONFIG_MMU_LAZY_TLB_REFCOUNT is also added; it must always be enabled
if CONFIG_MMU_LAZY_TLB is enabled, until the next patch, which provides
an alternate scheme.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig             | 26 ++++++++++++++
 include/linux/sched/mm.h | 13 +++++--
 kernel/sched/core.c      | 75 ++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h     |  4 ++-
 4 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index c45b770d3579..276e1c1c0219 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -418,6 +418,32 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 	  shootdowns should enable this.
 
+# Enable "lazy TLB", which means a user->kernel thread context switch does not
+# switch the mm to init_mm and the kernel thread takes a reference to the user
+# mm to provide its kernel mapping. This is how Linux has traditionally worked
+# (see Documentation/vm/active_mm.rst), for performance. Switching to and from
+# the idle thread is a performance-critical case.
+#
+# If mm context switches are inexpensive or free (in the case of NOMMU) then
+# this could be disabled.
+#
+# It would make sense to have this depend on MMU, but the NOMMU
+# architectures need to be audited and tested for lazy mm refcounting first.
+config MMU_LAZY_TLB
+	def_bool y
+	depends on !NO_MMU_LAZY_TLB
+
+# This allows archs to disable MMU_LAZY_TLB. mmgrab/mmdrop calls in arch/
+# code have to be audited and switched to the _lazy_tlb variants as necessary.
+config NO_MMU_LAZY_TLB
+	def_bool n
+
+# Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
+# For now, this must be enabled if MMU_LAZY_TLB is enabled.
+config MMU_LAZY_TLB_REFCOUNT
+	def_bool y
+	depends on MMU_LAZY_TLB
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index bfd1baca5266..29e4638ad124 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -52,12 +52,21 @@ static inline void mmdrop(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+		mmdrop(mm);
+	} else {
+		/*
+		 * mmdrop_lazy_tlb() must provide a full memory barrier; see the
+		 * membarrier comment in finish_task_switch(), which relies on this.
+		 */
+		smp_mb();
+	}
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e359c76ea2e2..299c3eb12b2b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4171,7 +4171,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -4190,7 +4190,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -4282,22 +4285,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
 {
-	prepare_task_switch(rq, prev, next);
-
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
 	/*
 	 * kernel -> kernel   lazy + transfer active
 	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -4326,11 +4317,57 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+			/* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
+#else
+			/*
+			 * Without MMU_LAZY_TLB_REFCOUNT there is no lazy
+			 * tracking (because no rq->prev_lazy_mm) in
+			 * finish_task_switch, so no mmdrop_lazy_tlb(),
+			 * so no memory barrier for membarrier (see the
+			 * membarrier comment in finish_task_switch()).
+			 * Do it here.
+			 */
+			smp_mb();
+#endif
 		}
 	}
+}
+
+static __always_inline void
+context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	if (!next->mm)
+		next->active_mm = &init_mm;
+	membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+	switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+	if (!prev->mm)
+		prev->active_mm = NULL;
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next, struct rq_flags *rf)
+{
+	prepare_task_switch(rq, prev, next);
+
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		context_switch_mm(rq, prev, next);
+	else
+		context_switch_mm_nolazy(rq, prev, next);
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a189bec13729..0729cf19a987 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -961,7 +961,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.23.0
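
The entire behavioral fork the new options introduce sits inside
mmdrop_lazy_tlb(). Continuing the toy model from patch 1 (again illustrative
user-space C, not kernel code: LAZY_TLB_REFCOUNT is an assumed stand-in for
CONFIG_MMU_LAZY_TLB_REFCOUNT, and atomic_thread_fence() stands in for
smp_mb()):

/*
 * Toy model of the configurable drop path. Build with
 * -DLAZY_TLB_REFCOUNT=0 to model the non-refcounting variant.
 */
#include <stdatomic.h>

#ifndef LAZY_TLB_REFCOUNT
#define LAZY_TLB_REFCOUNT 1
#endif

struct mm_struct {
	atomic_int mm_count;
};

static void mmdrop(struct mm_struct *mm)
{
	atomic_fetch_sub(&mm->mm_count, 1);	/* freeing elided in this sketch */
}

static void mmdrop_lazy_tlb(struct mm_struct *mm)
{
	if (LAZY_TLB_REFCOUNT) {
		mmdrop(mm);
	} else {
		/*
		 * No reference to put, but callers still rely on the full
		 * memory barrier that mmdrop() would have implied (see the
		 * membarrier comment in finish_task_switch()).
		 */
		atomic_thread_fence(memory_order_seq_cst);
	}
}

int main(void)
{
	struct mm_struct mm;

	atomic_init(&mm.mm_count, 2);
	mmdrop_lazy_tlb(&mm);	/* drops a reference, or is only a fence */
	return 0;
}

The scheduler side mirrors this: with refcounting configured off there is no
rq->prev_lazy_mm to drop in finish_task_switch(), so context_switch_mm()
issues the smp_mb() itself.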




* [PATCH v3 3/4] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  2021-06-01  6:22 [PATCH v3 0/4] shoot lazy tlbs Nicholas Piggin
  2021-06-01  6:23 ` [PATCH v3 1/4] lazy tlb: introduce lazy mm refcount helper functions Nicholas Piggin
  2021-06-01  6:23 ` [PATCH v3 2/4] lazy tlb: allow lazy tlb mm switching to be configurable Nicholas Piggin
@ 2021-06-01  6:23 ` Nicholas Piggin
  2021-06-01  6:23 ` [PATCH v3 4/4] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN Nicholas Piggin
  2021-06-04 16:54 ` [PATCH v3 0/4] shoot lazy tlbs Andy Lutomirski
  4 siblings, 0 replies; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-01  6:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nicholas Piggin, Randy Dunlap, linux-kernel, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard, Andy Lutomirski

On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).

Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.

Shootdown IPIs are a concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig  | 14 +++++++++++++-
 kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 276e1c1c0219..91e1882e3284 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -439,11 +439,23 @@ config NO_MMU_LAZY_TLB
 	def_bool n
 
 # Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
-# For now, this must be enabled if MMU_LAZY_TLB is enabled.
 config MMU_LAZY_TLB_REFCOUNT
 	def_bool y
 	depends on MMU_LAZY_TLB
 
+# Instead of refcounting the lazy mm struct for kernel thread references
+# (which can cause contention with multi-threaded apps on large multiprocessor
+# systems), this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+# switch to init_mm if they were using the to-be-freed mm as the lazy tlb. To
+# implement this, architectures must use _lazy_tlb variants of mm refcounting
+# when releasing kernel thread mm references, and mm_cpumask must include at
+# least all possible CPUs in which the mm might be lazy, at the time of the
+# final mmdrop. mmgrab/mmdrop calls in arch/ code must be switched to the
+# _lazy_tlb variants as necessary.
+config MMU_LAZY_TLB_SHOOTDOWN
+	bool
+	depends on MMU_LAZY_TLB
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/kernel/fork.c b/kernel/fork.c
index dc06afd725cb..d485c24426a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -674,6 +674,53 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+	}
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		/*
+		 * IPI overheads have not been found to be expensive, but they could
+		 * be reduced in a number of possible ways, for example (in
+		 * roughly increasing order of complexity):
+		 * - A batch of mms requiring IPIs could be gathered and freed
+		 *   at once.
+		 * - CPUs could store their active mm somewhere that can be
+		 *   remotely checked without a lock, to filter out
+		 *   false-positives in the cpumask.
+		 * - After mm_users or mm_count reaches zero, switching away
+		 *   from the mm could clear mm_cpumask to reduce some IPIs
+		 *   (some batching or delaying would help).
+		 * - A delayed freeing and RCU-like quiescing sequence based on
+		 *   mm switching to avoid IPIs completely.
+		 */
+		on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+		if (IS_ENABLED(CONFIG_DEBUG_VM))
+			on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+	} else {
+		/*
+		 * In this case, lazy tlb mms are refcounted and would not reach
+		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
+		 */
+	}
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -683,7 +730,12 @@ void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
+
+	/* Ensure no CPUs are using this as their lazy tlb mm */
+	shoot_lazy_tlbs(mm);
+
 	WARN_ON_ONCE(mm == current->active_mm);
+
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_subscriptions_destroy(mm);
-- 
2.23.0
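
A single-threaded model of the shootdown pass may help. It is illustrative
only: the real shoot_lazy_tlbs() delivers do_shoot_lazy_tlb() by IPI to
every CPU in mm_cpumask, whereas here the "CPUs" are plain array slots
visited in a loop, and the structs are toy assumptions.

/*
 * Toy model: evict any CPU still using mm as its lazy tlb, then free.
 * Build with: cc -std=c11 shoot.c
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct mm_struct { int id; };

static struct mm_struct init_mm = { .id = 0 };
static struct mm_struct *cpu_active_mm[NR_CPUS];	/* per-CPU lazy mm */
static bool mm_cpumask[NR_CPUS];	/* CPUs that may hold mm lazily */

static void do_shoot_lazy_tlb(int cpu, struct mm_struct *mm)
{
	if (cpu_active_mm[cpu] == mm)
		cpu_active_mm[cpu] = &init_mm;	/* switch_mm(mm, &init_mm) */
}

static void shoot_lazy_tlbs(struct mm_struct *mm)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (mm_cpumask[cpu])	/* false positives are harmless */
			do_shoot_lazy_tlb(cpu, mm);
}

int main(void)
{
	struct mm_struct *mm = malloc(sizeof(*mm));

	mm->id = 1;
	cpu_active_mm[2] = mm;		/* CPU 2 idles with mm as lazy tlb */
	mm_cpumask[2] = true;

	shoot_lazy_tlbs(mm);		/* __mmdrop(): evict remaining lazies */
	printf("cpu2 active_mm id = %d\n", cpu_active_mm[2]->id);	/* 0 */
	free(mm);			/* no lazy users remain; safe to free */
	return 0;
}

The mm_cpumask filter is what keeps this cheap: only CPUs that may hold the
mm lazily are visited, and a false positive costs one wasted IPI in the real
implementation.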




* [PATCH v3 4/4] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
  2021-06-01  6:22 [PATCH v3 0/4] shoot lazy tlbs Nicholas Piggin
                   ` (2 preceding siblings ...)
  2021-06-01  6:23 ` [PATCH v3 3/4] lazy tlb: shoot lazies, a non-refcounting lazy tlb option Nicholas Piggin
@ 2021-06-01  6:23 ` Nicholas Piggin
  2021-06-04 16:54 ` [PATCH v3 0/4] shoot lazy tlbs Andy Lutomirski
  4 siblings, 0 replies; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-01  6:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nicholas Piggin, Randy Dunlap, linux-kernel, linux-arch,
	linuxppc-dev, linux-mm, Anton Blanchard, Andy Lutomirski

On a 16-socket 192-core POWER8 system, upstream can achieve a rate of
about 1 million context switches per second in a context switching
benchmark with as many software threads as CPUs (so each switch goes in
and out of idle). After this patch it goes up to 118 million.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 088dd2afcfe4..8a092eedc692 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -252,6 +252,7 @@ config PPC
 	select IRQ_FORCED_THREADING
 	select MMU_GATHER_PAGE_SIZE
 	select MMU_GATHER_RCU_TABLE_FREE
+	select MMU_LAZY_TLB_SHOOTDOWN		if PPC_BOOK3S_64
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE		if PPC64 || NOT_COHERENT_CACHE
 	select NEED_SG_DMA_LENGTH
-- 
2.23.0




* Re: [PATCH v3 0/4] shoot lazy tlbs
  2021-06-01  6:22 [PATCH v3 0/4] shoot lazy tlbs Nicholas Piggin
                   ` (3 preceding siblings ...)
  2021-06-01  6:23 ` [PATCH v3 4/4] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN Nicholas Piggin
@ 2021-06-04 16:54 ` Andy Lutomirski
  2021-06-04 17:05   ` Andy Lutomirski
  4 siblings, 1 reply; 10+ messages in thread
From: Andy Lutomirski @ 2021-06-04 16:54 UTC (permalink / raw)
  To: Nicholas Piggin, Andrew Morton
  Cc: Randy Dunlap, linux-kernel, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

On 5/31/21 11:22 PM, Nicholas Piggin wrote:
> There haven't been objections to the series since last posting, this
> is just a rebase and tidies up a few comments minor patch rearranging.
> 

I continue to object to having too many modes.  I like my more generic
improvements better.  Let me try to find some time to email again.



* Re: [PATCH v3 0/4] shoot lazy tlbs
  2021-06-04 16:54 ` [PATCH v3 0/4] shoot lazy tlbs Andy Lutomirski
@ 2021-06-04 17:05   ` Andy Lutomirski
  2021-06-05  0:17     ` Nicholas Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Andy Lutomirski @ 2021-06-04 17:05 UTC (permalink / raw)
  To: Nicholas Piggin, Andrew Morton
  Cc: Randy Dunlap, linux-kernel, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard

On 6/4/21 9:54 AM, Andy Lutomirski wrote:
> On 5/31/21 11:22 PM, Nicholas Piggin wrote:
>> There haven't been objections to the series since last posting, this
>> is just a rebase and tidies up a few comments minor patch rearranging.
>>
> 
> I continue to object to having too many modes.  I like my more generic
> improvements better.  Let me try to find some time to email again.
> 

Specifically, this:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/mm

I, or someone, needs to dust off my membarrier series before any of
these kinds of changes get made.  The barrier situation in the scheduler
is too confusing otherwise.



* Re: [PATCH v3 0/4] shoot lazy tlbs
  2021-06-04 17:05   ` Andy Lutomirski
@ 2021-06-05  0:17     ` Nicholas Piggin
  2021-06-05  0:26       ` Nicholas Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-05  0:17 UTC (permalink / raw)
  To: Andrew Morton, Andy Lutomirski
  Cc: Anton Blanchard, linux-arch, linux-kernel, linux-mm,
	linuxppc-dev, Randy Dunlap

Excerpts from Andy Lutomirski's message of June 5, 2021 3:05 am:
> On 6/4/21 9:54 AM, Andy Lutomirski wrote:
>> On 5/31/21 11:22 PM, Nicholas Piggin wrote:
>>> There haven't been objections to the series since last posting, this
>>> is just a rebase and tidies up a few comments minor patch rearranging.
>>>
>> 
>> I continue to object to having too many modes.  I like my more generic
>> improvements better.  Let me try to find some time to email again.
>> 
> 
> Specifically, this:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/mm

That's worse than what powerpc does with the shoot lazies code so 
we wouldn't use it anyway.

The fact is mm-cpumask and lazy mm are very architecture specific, so I
don't really see that another "mode" is such a problem; it's for the
most part "this is what powerpc does" -> "this is what powerpc does".
The only mode in the context switch is just "take a ref on the lazy mm"
or "don't take a ref". Surely that's not too onerous to add!?

Actually the bigger part of it is the no-lazy mmu mode, which is not
yet used. I thought it was a neat little demonstrator of how the code
works with/without lazy, but I will get rid of that for submission.


> I, or someone, needs to dust off my membarrier series before any of
> these kinds of changes get made.  The barrier situation in the scheduler
> is too confusing otherwise.
> 

I disagree; I've disentangled the changes from the membarrier stuff
now, so they can be done concurrently.

Thanks,
Nick



* Re: [PATCH v3 0/4] shoot lazy tlbs
  2021-06-05  0:17     ` Nicholas Piggin
@ 2021-06-05  0:26       ` Nicholas Piggin
  2021-06-05  2:52         ` Nicholas Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-05  0:26 UTC (permalink / raw)
  To: Andrew Morton, Andy Lutomirski
  Cc: Anton Blanchard, linux-arch, linux-kernel, linux-mm,
	linuxppc-dev, Randy Dunlap

Excerpts from Nicholas Piggin's message of June 5, 2021 10:17 am:
> Excerpts from Andy Lutomirski's message of June 5, 2021 3:05 am:
>> On 6/4/21 9:54 AM, Andy Lutomirski wrote:
>>> On 5/31/21 11:22 PM, Nicholas Piggin wrote:
>>>> There haven't been objections to the series since last posting, this
>>>> is just a rebase and tidies up a few comments minor patch rearranging.
>>>>
>>> 
>>> I continue to object to having too many modes.  I like my more generic
>>> improvements better.  Let me try to find some time to email again.
>>> 
>> 
>> Specifically, this:
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/mm
> 
> That's worse than what powerpc does with the shoot lazies code so 
> we wouldn't use it anyway.
> 
> The fact is mm-cpumask and lazy mm is very architecture specific, so I 
> don't really see that another "mode" is such a problem, it's for the 
> most part "this is what powerpc does" -> "this is what powerpc does".
> The only mode in the context switch is just "take a ref on the lazy mm"
> or "don't take a ref". Surely that's not too onerous to add!?
> 
> Actually the bigger part of it is actually the no-lazy mmu mode which
> is not yet used, I thought it was a neat little demonstrator of how code
> works with/without lazy but I will get rid of that for submission.

I admit that does add a bit more churn than necessary; maybe that was
the main objection.

Here is the entire kernel/sched/core.c change after that is removed.
Pretty simple now. I'll resubmit.

Thanks,
Nick


diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e359c76ea2e2..1be0b97e12ec 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4171,7 +4171,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -4190,7 +4190,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -4326,9 +4329,21 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+			/* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
+#else
+			/*
+			 * Without MMU_LAZY_TLB_REFCOUNT there is no lazy
+			 * tracking (because no rq->prev_lazy_mm) in
+			 * finish_task_switch, so no mmdrop_lazy_tlb(),
+			 * so no memory barrier for membarrier (see the
+			 * membarrier comment in finish_task_switch()).
+			 * Do it here.
+			 */
+			smp_mb();
+#endif
 		}
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a189bec13729..0729cf19a987 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -961,7 +961,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;




* Re: [PATCH v3 0/4] shoot lazy tlbs
  2021-06-05  0:26       ` Nicholas Piggin
@ 2021-06-05  2:52         ` Nicholas Piggin
  0 siblings, 0 replies; 10+ messages in thread
From: Nicholas Piggin @ 2021-06-05  2:52 UTC (permalink / raw)
  To: Andrew Morton, Andy Lutomirski
  Cc: Anton Blanchard, linux-arch, linux-kernel, linux-mm,
	linuxppc-dev, Randy Dunlap

Excerpts from Nicholas Piggin's message of June 5, 2021 10:26 am:
> Excerpts from Nicholas Piggin's message of June 5, 2021 10:17 am:
>> Excerpts from Andy Lutomirski's message of June 5, 2021 3:05 am:
>>> On 6/4/21 9:54 AM, Andy Lutomirski wrote:
>>>> On 5/31/21 11:22 PM, Nicholas Piggin wrote:
>>>>> There haven't been objections to the series since last posting, this
>>>>> is just a rebase and tidies up a few comments minor patch rearranging.
>>>>>
>>>> 
>>>> I continue to object to having too many modes.  I like my more generic
>>>> improvements better.  Let me try to find some time to email again.
>>>> 
>>> 
>>> Specifically, this:
>>> 
>>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/mm
>> 
>> That's worse than what powerpc does with the shoot lazies code so 
>> we wouldn't use it anyway.
>> 
>> The fact is mm-cpumask and lazy mm is very architecture specific, so I 
>> don't really see that another "mode" is such a problem, it's for the 
>> most part "this is what powerpc does" -> "this is what powerpc does".
>> The only mode in the context switch is just "take a ref on the lazy mm"
>> or "don't take a ref". Surely that's not too onerous to add!?
>> 
>> Actually the bigger part of it is actually the no-lazy mmu mode which
>> is not yet used, I thought it was a neat little demonstrator of how code
>> works with/without lazy but I will get rid of that for submission.
> 
> I admit that does add a bit more churn than necessary maybe that was
> the main objection.
> 
> Here is the entire kernel/sched/core.c change after that is removed.
> Pretty simple now. I'll resubmit.

If the prospect of a big complex new mode is what concerns you, I'll
put it a different way -- all this allows is for the arch to say that
it does not use lazy tlb mms beyond their refcounted lifetime, so there
is no need to refcount the lazy tlb reference.

That's all it is. One implementation of that is shoot lazies, and that
could be done entirely in arch/powerpc via destroy_context (I just put 
it in mm/ in case it is useful to others, but that's no real 
difference).

So you see it's really just about management of lazies; the refcounting
is just a bit on the side. And lazy management is highly arch specific,
with x86 being one of the really different, complex ones, including
unique interactions with membarrier ordering, so that can't be a fair
objection.

Thanks,
Nick



