* [RFC][PATCH] avoid refcounting the lazy tlb mm struct
@ 2020-07-06 7:23 Nicholas Piggin
2020-07-10 0:45 ` Anton Blanchard
0 siblings, 1 reply; 2+ messages in thread
From: Nicholas Piggin @ 2020-07-06 7:23 UTC (permalink / raw)
To: linux-mm; +Cc: linux-arch, linuxppc-dev, Anton Blanchard
On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).
Not doing lazy tlb at all slows switching down quite a bit, so I wonder
if we can avoid the refcount for the lazy tlb, but have __mmdrop() IPI
all CPUs that might be using this mm lazily.
This patch has only had light testing so far, but seems to work okay.
Thanks,
Nick
--
diff --git a/arch/Kconfig b/arch/Kconfig
index 8cc35dc556c7..69ea7172db3d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -411,6 +411,16 @@ config MMU_GATHER_NO_GATHER
bool
depends on MMU_GATHER_TABLE_FREE
+config MMU_LAZY_TLB_SHOOTDOWN
+ bool
+ help
+ Instead of refcounting the "lazy tlb" mm struct, which can cause
+ contention with multi-threaded apps on large multiprocessor systems,
+ this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+ switch to init_mm if they were using the to-be-freed mm as the lazy
+ tlb. Architectures which do not track all possible lazy tlb CPUs in
+ mm_cpumask can not use this (without modification).
+
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 920c4e3ca4ef..24ac85c868db 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -225,6 +225,7 @@ config PPC
select HAVE_PERF_USER_STACK_DUMP
select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_PAGE_SIZE
+ select MMU_LAZY_TLB_SHOOTDOWN
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b5cc9b23cf02..52730629b3eb 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -652,10 +652,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
* Must be a kernel thread because sender is single-threaded.
*/
BUG_ON(current->mm);
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
switch_mm(mm, &init_mm, current);
current->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
_tlbiel_pid(pid, RIC_FLUSH_ALL);
}
diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..6c96c8feba1f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1119,7 +1119,7 @@ static int exec_mmap(struct mm_struct *mm)
mmput(old_mm);
return 0;
}
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
return 0;
}
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 480a4d1b7dd8..ef28059086a1 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -51,6 +51,25 @@ static inline void mmdrop(struct mm_struct *mm)
void mmdrop(struct mm_struct *mm);
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
+ mmdrop(mm);
+}
+
+static inline void mmdrop_lazy_tlb_smp_mb(struct mm_struct *mm)
+{
+ mmdrop_lazy_tlb(mm);
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
+ smp_mb();
+}
+
/*
* This has to be called after a get_task_mm()/mmget_not_zero()
* followed by taking the mmap_lock for writing before modifying the
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..e3f1039cee9f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -685,6 +685,34 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ BUG_ON(current->mm);
+ switch_mm(mm, &init_mm, current);
+ current->active_mm = &init_mm;
+ }
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ BUG_ON(current->active_mm == mm);
+}
+
+void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ smp_call_function_many(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ do_shoot_lazy_tlb(mm);
+ }
+ smp_call_function(do_check_lazy_tlb, (void *)mm, 1);
+ do_check_lazy_tlb(mm);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -692,6 +720,7 @@ static void check_mm(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
+ shoot_lazy_tlbs(mm);
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == current->active_mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca5db40392d4..4d615e0be9e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3308,7 +3308,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb_smp_mb(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
@@ -3413,9 +3413,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
*/
if (!next->mm) { // to kernel
@@ -3423,7 +3423,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -3439,7 +3439,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [RFC][PATCH] avoid refcounting the lazy tlb mm struct
2020-07-06 7:23 [RFC][PATCH] avoid refcounting the lazy tlb mm struct Nicholas Piggin
@ 2020-07-10 0:45 ` Anton Blanchard
0 siblings, 0 replies; 2+ messages in thread
From: Anton Blanchard @ 2020-07-10 0:45 UTC (permalink / raw)
To: Nicholas Piggin; +Cc: linux-mm, linux-arch, linuxppc-dev
Hi Nick,
> On big systems, the mm refcount can become highly contended when doing
> a lot of context switching with threaded applications (particularly
> switching between the idle thread and an application thread).
>
> Not doing lazy tlb at all slows switching down quite a bit, so I
> wonder if we can avoid the refcount for the lazy tlb, but have
> __mmdrop() IPI all CPUs that might be using this mm lazily.
>
> This patch has only had light testing so far, but seems to work okay.
I tested this patch on a large POWER8 system with 1536 hardware threads.
I can create a worst-case situation for mm refcounting by using
the threaded context switch test in will-it-scale set to half the
number of available CPUs (768).
With that workload the patch improves the context switch rate by 118x!
Tested-by: Anton Blanchard <anton@ozlabs.org>
Thanks,
Anton
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 8cc35dc556c7..69ea7172db3d 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -411,6 +411,16 @@ config MMU_GATHER_NO_GATHER
> bool
> depends on MMU_GATHER_TABLE_FREE
>
> +config MMU_LAZY_TLB_SHOOTDOWN
> + bool
> + help
> + Instead of refcounting the "lazy tlb" mm struct, which can
> cause
> + contention with multi-threaded apps on large
> multiprocessor systems,
> + this option causes __mmdrop to IPI all CPUs in the
> mm_cpumask and
> + switch to init_mm if they were using the to-be-freed mm as
> the lazy
> + tlb. Architectures which do not track all possible lazy
> tlb CPUs in
> + mm_cpumask can not use this (without modification).
> +
> config ARCH_HAVE_NMI_SAFE_CMPXCHG
> bool
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 920c4e3ca4ef..24ac85c868db 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -225,6 +225,7 @@ config PPC
> select HAVE_PERF_USER_STACK_DUMP
> select MMU_GATHER_RCU_TABLE_FREE
> select MMU_GATHER_PAGE_SIZE
> + select MMU_LAZY_TLB_SHOOTDOWN
> select HAVE_REGS_AND_STACK_ACCESS_API
> select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
> select HAVE_SYSCALL_TRACEPOINTS
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index b5cc9b23cf02..52730629b3eb 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -652,10 +652,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
> * Must be a kernel thread because sender is single-threaded.
> */
> BUG_ON(current->mm);
> - mmgrab(&init_mm);
> + mmgrab_lazy_tlb(&init_mm);
> switch_mm(mm, &init_mm, current);
> current->active_mm = &init_mm;
> - mmdrop(mm);
> + mmdrop_lazy_tlb(mm);
> }
> _tlbiel_pid(pid, RIC_FLUSH_ALL);
> }
> diff --git a/fs/exec.c b/fs/exec.c
> index e6e8a9a70327..6c96c8feba1f 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1119,7 +1119,7 @@ static int exec_mmap(struct mm_struct *mm)
> mmput(old_mm);
> return 0;
> }
> - mmdrop(active_mm);
> + mmdrop_lazy_tlb(active_mm);
> return 0;
> }
>
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 480a4d1b7dd8..ef28059086a1 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -51,6 +51,25 @@ static inline void mmdrop(struct mm_struct *mm)
>
> void mmdrop(struct mm_struct *mm);
>
> +static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
> +{
> + if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> + mmgrab(mm);
> +}
> +
> +static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
> +{
> + if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> + mmdrop(mm);
> +}
> +
> +static inline void mmdrop_lazy_tlb_smp_mb(struct mm_struct *mm)
> +{
> + mmdrop_lazy_tlb(mm);
> + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> + smp_mb();
> +}
> +
> /*
> * This has to be called after a get_task_mm()/mmget_not_zero()
> * followed by taking the mmap_lock for writing before modifying the
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 142b23645d82..e3f1039cee9f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -685,6 +685,34 @@ static void check_mm(struct mm_struct *mm)
> #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
> #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
> +static void do_shoot_lazy_tlb(void *arg)
> +{
> + struct mm_struct *mm = arg;
> +
> + if (current->active_mm == mm) {
> + BUG_ON(current->mm);
> + switch_mm(mm, &init_mm, current);
> + current->active_mm = &init_mm;
> + }
> +}
> +
> +static void do_check_lazy_tlb(void *arg)
> +{
> + struct mm_struct *mm = arg;
> +
> + BUG_ON(current->active_mm == mm);
> +}
> +
> +void shoot_lazy_tlbs(struct mm_struct *mm)
> +{
> + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
> + smp_call_function_many(mm_cpumask(mm),
> do_shoot_lazy_tlb, (void *)mm, 1);
> + do_shoot_lazy_tlb(mm);
> + }
> + smp_call_function(do_check_lazy_tlb, (void *)mm, 1);
> + do_check_lazy_tlb(mm);
> +}
> +
> /*
> * Called when the last reference to the mm
> * is dropped: either by a lazy thread or by
> @@ -692,6 +720,7 @@ static void check_mm(struct mm_struct *mm)
> */
> void __mmdrop(struct mm_struct *mm)
> {
> + shoot_lazy_tlbs(mm);
> BUG_ON(mm == &init_mm);
> WARN_ON_ONCE(mm == current->mm);
> WARN_ON_ONCE(mm == current->active_mm);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ca5db40392d4..4d615e0be9e0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3308,7 +3308,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
> */
> if (mm) {
> membarrier_mm_sync_core_before_usermode(mm);
> - mmdrop(mm);
> + mmdrop_lazy_tlb_smp_mb(mm);
> }
> if (unlikely(prev_state == TASK_DEAD)) {
> if (prev->sched_class->task_dead)
> @@ -3413,9 +3413,9 @@ context_switch(struct rq *rq, struct
> task_struct *prev,
> /*
> * kernel -> kernel lazy + transfer active
> - * user -> kernel lazy + mmgrab() active
> + * user -> kernel lazy + mmgrab_lazy_tlb() active
> *
> - * kernel -> user switch + mmdrop() active
> + * kernel -> user switch + mmdrop_lazy_tlb() active
> * user -> user switch
> */
> if (!next->mm) { // to kernel
> @@ -3423,7 +3423,7 @@ context_switch(struct rq *rq, struct
> task_struct *prev,
> next->active_mm = prev->active_mm;
> if (prev->mm) // from user
> - mmgrab(prev->active_mm);
> + mmgrab_lazy_tlb(prev->active_mm);
> else
> prev->active_mm = NULL;
> } else { // to user
> @@ -3439,7 +3439,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
> switch_mm_irqs_off(prev->active_mm, next->mm, next);
> if (!prev->mm) { // from kernel
> - /* will mmdrop() in finish_task_switch(). */
> + /* will mmdrop_lazy_tlb() in finish_task_switch(). */
> rq->prev_mm = prev->active_mm;
> prev->active_mm = NULL;
> }
>
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2020-07-10 0:45 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-06 7:23 [RFC][PATCH] avoid refcounting the lazy tlb mm struct Nicholas Piggin
2020-07-10 0:45 ` Anton Blanchard
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).