* [PATCH v2 0/5] shoot lazy tlbs
From: Nicholas Piggin @ 2020-12-14  6:53 UTC
  To: linux-kernel
  Cc: Nicholas Piggin, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard, Andy Lutomirski

This is another rebase, on top of mainline now (don't need the
asm-generic tree), and without any x86 or membarrier changes.
This makes the series far smaller and more manageable and
without the controversial bits.

Thanks,
Nick

Nicholas Piggin (5):
  lazy tlb: introduce lazy mm refcount helper functions
  lazy tlb: allow lazy tlb mm switching to be configurable
  lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  powerpc: use lazy mm refcount helper functions
  powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN

 arch/Kconfig                         | 30 ++++++++++
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/Kconfig                 |  1 +
 arch/powerpc/kernel/smp.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 +-
 fs/exec.c                            |  4 +-
 include/linux/sched/mm.h             | 20 +++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/fork.c                        | 52 ++++++++++++++++
 kernel/kthread.c                     | 11 ++--
 kernel/sched/core.c                  | 88 ++++++++++++++++++++--------
 kernel/sched/sched.h                 |  4 +-
 13 files changed, 184 insertions(+), 38 deletions(-)

-- 
2.23.0


* [PATCH v2 1/5] lazy tlb: introduce lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14  6:53 UTC
  To: linux-kernel
  Cc: Nicholas Piggin, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard, Andy Lutomirski

Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes the lazy mm references explicit at each call site, and
allows the refcounting to be disabled for configurations that do not
need it.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  4 ++--
 include/linux/sched/mm.h             | 11 +++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 11 +++++++----
 kernel/sched/core.c                  | 15 ++++++++-------
 8 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..1b4a41aad793 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
 	current->mm = mm;
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b487b489d4b6..74708aef333e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -658,10 +658,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
 	if (current->active_mm == mm) {
 		WARN_ON_ONCE(current->mm != NULL);
 		/* Is a kernel thread and is using mm as the lazy tlb */
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 
 	atomic_dec(&mm->context.active_cpus);
diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..56fc23dcbe4d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1028,9 +1028,9 @@ static int exec_mmap(struct mm_struct *mm)
 		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
-		return 0;
+	} else {
+		mmdrop_lazy_tlb(active_mm);
 	}
-	mmdrop(active_mm);
 	return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..94a117160083 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2b8d7a5db383..a54cdfa08d71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -576,7 +576,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..3711a74fcf4a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
 	}
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 933a625621b8..da189e0d26ed 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1240,14 +1240,14 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	mmgrab(mm);
+
 	task_lock(tsk);
 	/* Hold off tlb flush IPIs while switching mm's */
 	local_irq_disable();
 	active_mm = tsk->active_mm;
-	if (active_mm != mm) {
-		mmgrab(mm);
+	if (active_mm != mm)
 		tsk->active_mm = mm;
-	}
 	tsk->mm = mm;
 	switch_mm_irqs_off(active_mm, mm, tsk);
 	local_irq_enable();
@@ -1257,7 +1257,7 @@ void kthread_use_mm(struct mm_struct *mm)
 #endif
 
 	if (active_mm != mm)
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 
 	to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
@@ -1280,10 +1280,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	sync_mm_rss(mm);
 	local_irq_disable();
 	tsk->mm = NULL;
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7e453492cff..c2f8ea43d29b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3629,13 +3629,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, so provide them here:
 	 *
 	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-	 *   provided by mmdrop(),
+	 *   provided by mmdrop_lazy_tlb(),
 	 * - a sync_core for SYNC_CORE.
 	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
+
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -3739,9 +3740,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -3749,7 +3750,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -3765,7 +3766,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
@@ -7206,7 +7207,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
-- 
2.23.0


* [PATCH v2 2/5] lazy tlb: allow lazy tlb mm switching to be configurable
From: Nicholas Piggin @ 2020-12-14  6:53 UTC
  To: linux-kernel
  Cc: Nicholas Piggin, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard, Andy Lutomirski

Add CONFIG_MMU_LAZY_TLB, which can be configured out to disable the
lazy tlb mechanism entirely; in that case the kernel switches to
init_mm whenever it switches to a kernel thread.

NOMMU systems could easily go without this and save a bit of code
and the refcount atomics, because their mm switch is a no-op. They
have not been switched over by default because the arch code needs
to be audited and tested for lazy tlb mm refcounting and converted
to _lazy_tlb refcounting if necessary.

CONFIG_MMU_LAZY_TLB_REFCOUNT is also added; it must remain enabled
whenever CONFIG_MMU_LAZY_TLB is enabled, until the next patch provides
an alternative scheme.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig             | 17 +++++++++
 include/linux/sched/mm.h | 13 +++++--
 kernel/sched/core.c      | 75 ++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h     |  4 ++-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index ba4e966484ab..84faaba66364 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,23 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 	  shootdowns should enable this.
 
+# Should make this depend on MMU, because there is little use for lazy mm switching
+# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
+config MMU_LAZY_TLB
+	def_bool y
+	help
+	  Enable "lazy TLB" mmu context switching for kernel threads.
+	  If this is disabled then switching to a kernel thread always
+	  switches to init_mm. If mm switches are inexpensive or free
+	  (in the case of NOMMU) then this could be disabled.
+
+config MMU_LAZY_TLB_REFCOUNT
+	def_bool y
+	depends on MMU_LAZY_TLB
+	help
+	  This must be enabled if MMU_LAZY_TLB is enabled until the next
+	  patch.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 94a117160083..5edf8e942c84 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -52,12 +52,21 @@ static inline void mmdrop(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+		mmdrop(mm);
+	} else {
+		/*
+		 * mmdrop_lazy_tlb must provide a full memory barrier, see the
+		 * membarrier comment in finish_task_switch() which relies on this.
+		 */
+		smp_mb();
+	}
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c2f8ea43d29b..9c1dc9406e4b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -3722,22 +3725,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
 {
-	prepare_task_switch(rq, prev, next);
-
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
 	/*
 	 * kernel -> kernel   lazy + transfer active
 	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -3766,11 +3757,57 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+			/* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
+#else
+			/*
+			 * Without MMU_LAZY_TLB_REFCOUNT there is no lazy
+			 * tracking (because no rq->prev_lazy_mm) in
+			 * finish_task_switch, so no mmdrop_lazy_tlb(),
+			 * so no memory barrier for membarrier (see the
+			 * membarrier comment in finish_task_switch()).
+			 * Do it here.
+			 */
+			smp_mb();
+#endif
 		}
 	}
+}
+
+static __always_inline void
+context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	if (!next->mm)
+		next->active_mm = &init_mm;
+	membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+	switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+	if (!prev->mm)
+		prev->active_mm = NULL;
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next, struct rq_flags *rf)
+{
+	prepare_task_switch(rq, prev, next);
+
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		context_switch_mm(rq, prev, next);
+	else
+		context_switch_mm_nolazy(rq, prev, next);
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..3b72aec5a2f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -950,7 +950,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.23.0


* [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-12-14  6:53 UTC
  To: linux-kernel
  Cc: Nicholas Piggin, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard, Andy Lutomirski

On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).
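
To make the contention concrete: with the refcounting scheme, every
switch between a user task and a kernel thread takes or drops a
reference on the user mm, so all CPUs doing user->idle->user switches
bounce the same mm_count cache line. A simplified sketch of that
context_switch() hot path (cf. the diff in patch 1; not new code):

    if (!next->mm) {                            // to kernel
        next->active_mm = prev->active_mm;
        if (prev->mm)                           // from user
            mmgrab_lazy_tlb(prev->active_mm);   // atomic inc of mm->mm_count
        else
            prev->active_mm = NULL;
    } else {                                    // to user
        if (!prev->mm) {                        // from kernel
            /* mmdrop_lazy_tlb() in finish_task_switch(): atomic dec */
            rq->prev_mm = prev->active_mm;
            prev->active_mm = NULL;
        }
    }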

Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.

Shootdown IPIs are some concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig  | 17 +++++++++++++++--
 kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 84faaba66364..e69c974369cc 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -443,9 +443,22 @@ config MMU_LAZY_TLB
 config MMU_LAZY_TLB_REFCOUNT
 	def_bool y
 	depends on MMU_LAZY_TLB
+	depends on !MMU_LAZY_TLB_SHOOTDOWN
 	help
-	  This must be enabled if MMU_LAZY_TLB is enabled until the next
-	  patch.
+	  This refcounts the mm that is used as the lazy TLB mm when switching
+	  switching to a kernel thread.
+
+config MMU_LAZY_TLB_SHOOTDOWN
+	bool
+	depends on MMU_LAZY_TLB
+	help
+	  Instead of refcounting the "lazy tlb" mm struct, which can cause
+	  contention with multi-threaded apps on large multiprocessor systems,
+	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+	  switch to init_mm if they were using the to-be-freed mm as the lazy
+	  tlb. To implement this, architectures must use _lazy_tlb variants of
+	  mm refcounting, and mm_cpumask must include at least all possible
+	  CPUs in which mm might be lazy.
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..74b972d2d8a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,6 +669,53 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+	}
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		/*
+		 * IPI overheads have not been found to be expensive, but they could
+		 * be reduced in a number of possible ways, for example (in
+		 * roughly increasing order of complexity):
+		 * - A batch of mms requiring IPIs could be gathered and freed
+		 *   at once.
+		 * - CPUs could store their active mm somewhere that can be
+		 *   remotely checked without a lock, to filter out
+		 *   false-positives in the cpumask.
+		 * - After mm_users or mm_count reaches zero, switching away
+		 *   from the mm could clear mm_cpumask to reduce some IPIs
+		 *   (some batching or delaying would help).
+		 * - A delayed freeing and RCU-like quiescing sequence based on
+		 *   mm switching to avoid IPIs completely.
+		 */
+		on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+		if (IS_ENABLED(CONFIG_DEBUG_VM))
+			on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+	} else {
+		/*
+		 * In this case, lazy tlb mms are refcounted and would not reach
+		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
+		 */
+	}
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -678,7 +725,12 @@ void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
+
+	/* Ensure no CPUs are using this as their lazy tlb mm */
+	shoot_lazy_tlbs(mm);
+
 	WARN_ON_ONCE(mm == current->active_mm);
+
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_subscriptions_destroy(mm);
-- 
2.23.0


* [PATCH v2 4/5] powerpc: use lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14  6:53 UTC
  To: linux-kernel
  Cc: Nicholas Piggin, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard, Andy Lutomirski

Use _lazy_tlb functions for lazy mm refcounting in powerpc, to prepare
to move to MMU_LAZY_TLB_SHOOTDOWN.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857cbd960..93c0eaa6f4bf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1395,7 +1395,7 @@ void start_secondary(void *unused)
 {
 	unsigned int cpu = raw_smp_processor_id();
 
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
-- 
2.23.0


* [PATCH v2 5/5] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
From: Nicholas Piggin @ 2020-12-14  6:53 UTC
  To: linux-kernel
  Cc: Nicholas Piggin, linux-arch, linuxppc-dev, linux-mm,
	Anton Blanchard, Andy Lutomirski

On a 16-socket 192-core POWER8 system, running a context switching
benchmark with as many software threads as CPUs (so each switch goes
in and out of idle), upstream can achieve a rate of about 1 million
context switches per second. After this patch it goes up to 118
million.
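
One common way to build such a benchmark is pairs of tasks that
ping-pong over pipes, one pair per CPU, so each wakeup passes through
the idle thread. A minimal sketch of that kind of test (hypothetical;
not necessarily the exact program used):

    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
            int ping[2], pong[2];
            char c = 0;
            unsigned long i;

            if (pipe(ping) || pipe(pong))
                    exit(1);

            if (fork() == 0) {
                    /* child: echo every byte straight back */
                    while (read(ping[0], &c, 1) == 1)
                            write(pong[1], &c, 1);
                    exit(0);
            }

            /* parent: each iteration forces two context switches */
            for (i = 0; i < 10000000; i++) {
                    write(ping[1], &c, 1);
                    read(pong[0], &c, 1);
            }
            return 0;
    }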

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5181872f9452..356138bdb5bb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -232,6 +232,7 @@ config PPC
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_PAGE_SIZE
+	select MMU_LAZY_TLB_SHOOTDOWN		if PPC_BOOK3S_64
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
-- 
2.23.0


* Re: [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Randy Dunlap @ 2020-12-14  7:04 UTC
  To: Nicholas Piggin, linux-kernel
  Cc: linux-arch, linuxppc-dev, linux-mm, Anton Blanchard, Andy Lutomirski

On 12/13/20 10:53 PM, Nicholas Piggin wrote:
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 84faaba66364..e69c974369cc 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -443,9 +443,22 @@ config MMU_LAZY_TLB
>  config MMU_LAZY_TLB_REFCOUNT
>  	def_bool y
>  	depends on MMU_LAZY_TLB
> +	depends on !MMU_LAZY_TLB_SHOOTDOWN
>  	help
> -	  This must be enabled if MMU_LAZY_TLB is enabled until the next
> -	  patch.
> +	  This refcounts the mm that is used as the lazy TLB mm when switching
> +	  switching to a kernel thread.

duplicate "switching".

> +
> +config MMU_LAZY_TLB_SHOOTDOWN
> +	bool
> +	depends on MMU_LAZY_TLB
> +	help
> +	  Instead of refcounting the "lazy tlb" mm struct, which can cause
> +	  contention with multi-threaded apps on large multiprocessor systems,
> +	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
> +	  switch to init_mm if they were using the to-be-freed mm as the lazy
> +	  tlb. To implement this, architectures must use _lazy_tlb variants of
> +	  mm refcounting, and mm_cpumask must include at least all possible
> +	  CPUs in which mm might be lazy.
>  
>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>  	bool


-- 


* Re: [PATCH v2 0/5] shoot lazy tlbs
From: Nicholas Piggin @ 2021-02-04  8:21 UTC
  To: linux-kernel
  Cc: Anton Blanchard, linux-arch, linux-mm, linuxppc-dev, Andy Lutomirski

I'll ask Andrew to put this in -mm if no objections.

The series no longer touches other architectures in non-trivial ways,
and core code is functionally almost unchanged when the option is not
selected, so it's actually pretty simple aside from the powerpc change.

Thanks,
Nick

Excerpts from Nicholas Piggin's message of December 14, 2020 4:53 pm:
> This is another rebase, on top of mainline now (don't need the
> asm-generic tree), and without any x86 or membarrier changes.
> This makes the series far smaller and more manageable and
> without the controversial bits.
> 
> Thanks,
> Nick
> 
> Nicholas Piggin (5):
>   lazy tlb: introduce lazy mm refcount helper functions
>   lazy tlb: allow lazy tlb mm switching to be configurable
>   lazy tlb: shoot lazies, a non-refcounting lazy tlb option
>   powerpc: use lazy mm refcount helper functions
>   powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
> 
>  arch/Kconfig                         | 30 ++++++++++
>  arch/arm/mach-rpc/ecard.c            |  2 +-
>  arch/powerpc/Kconfig                 |  1 +
>  arch/powerpc/kernel/smp.c            |  2 +-
>  arch/powerpc/mm/book3s64/radix_tlb.c |  4 +-
>  fs/exec.c                            |  4 +-
>  include/linux/sched/mm.h             | 20 +++++++
>  kernel/cpu.c                         |  2 +-
>  kernel/exit.c                        |  2 +-
>  kernel/fork.c                        | 52 ++++++++++++++++
>  kernel/kthread.c                     | 11 ++--
>  kernel/sched/core.c                  | 88 ++++++++++++++++++++--------
>  kernel/sched/sched.h                 |  4 +-
>  13 files changed, 184 insertions(+), 38 deletions(-)
> 
> -- 
> 2.23.0
> 
> 
