On 02/23/2020 02:25 PM, Andrea Arcangeli wrote:
> Hello,
>
> This introduces an nr_active_mm counter that allows the tlbi
> broadcast to be optimized away for multi-threaded processes as well;
> it no longer relies on mm_users <= 1.
>
> This also optimizes away all TLB flushes (including local ones) when
> the process is not running on any CPU (including during exit_mmap
> with lazy tlb state).
>
> This optimization is generally only observable when there are
> parallel TLB flushes from different processes on multiple CPUs. One
> possible use case is userland malloc libs freeing small objects with
> MADV_DONTNEED and causing frequent tiny TLB flushes, as demonstrated
> by the tcmalloc testsuite.
>
> All memory-intensive apps dealing with a multitude of frequently
> freed small objects tend to opt out of glibc and opt in to jemalloc
> or tcmalloc, so this should help the SMP/NUMA scalability of
> long-lived apps with small objects running in different containers,
> if they issue frequent MADV_DONTNEED TLB flushes while the other
> threads of the process are not running.
>
> It was suggested that I implement the mm_cpumask the standard way in
> order to optimize multithreaded apps too and to avoid restricting the
> optimization to mm_users <= 1. So initially I had two bitmasks
> allocated as shown at the bottom of this cover letter, by setting
> ARCH_NR_MM_CPUMASK to 2 with the below patch applied... however I
> figured a single atomic counter per mm achieves the exact same
> runtime behavior as the extra bitmap, so I just dropped the extra
> bitmap and replaced it with nr_active_mm as an optimization.
>
> If the atomic ops in the switch_mm fast path are a concern (they are
> still faster than cpumask_set_cpu/cpumask_clear_cpu with fewer than
> 256-512 CPUs), it's worth mentioning it would be possible to remove
> all atomic ops from the switch_mm fast path by restricting this
> optimization to single-threaded processes, i.e. by checking
> mm_users <= 1 and < 1 instead of nr_active_mm <= 1 and < 1, similarly
> to what the earlier version of this patchset was doing.
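
To check my reading of the nr_active_mm scheme before acking (the
counter itself lands in patch 3/3, which is not quoted here), here is
a minimal userspace model of the bookkeeping and the flush decision as
I understand them from the paragraphs above; every name in it is mine,
not the patch's:

	/*
	 * Toy model of the nr_active_mm scheme.  The real code is
	 * arm64-specific and also has to order these updates against
	 * the page-table switch, which this model ignores.
	 */
	#include <stdatomic.h>
	#include <stdbool.h>

	struct model_mm {
		atomic_int nr_active_mm;  /* CPUs currently running this mm */
	};

	/* switch_mm fast path: one atomic inc for the incoming mm... */
	static void model_switch_in(struct model_mm *mm)
	{
		atomic_fetch_add(&mm->nr_active_mm, 1);
	}

	/* ...and one atomic dec for the outgoing mm. */
	static void model_switch_out(struct model_mm *mm)
	{
		atomic_fetch_sub(&mm->nr_active_mm, 1);
	}

	enum model_flush { FLUSH_NONE, FLUSH_LOCAL, FLUSH_BROADCAST };

	/*
	 * Flush decision on the CPU issuing the invalidate:
	 *  - counter < 1: the mm runs nowhere, skip the flush entirely
	 *    (stale entries are dealt with at the next switch_mm,
	 *    e.g. via a fresh ASID);
	 *  - counter == 1 and this CPU is the one running the mm:
	 *    a local tlbi is enough, no broadcast;
	 *  - otherwise: tlbi broadcast as today.
	 */
	static enum model_flush model_flush_mode(struct model_mm *mm,
						 bool running_on_this_cpu)
	{
		int active = atomic_load(&mm->nr_active_mm);

		if (active < 1)
			return FLUSH_NONE;
		if (active == 1 && running_on_this_cpu)
			return FLUSH_LOCAL;
		return FLUSH_BROADCAST;
	}

If that matches the intent, the mm_users <= 1 fallback mentioned above
is literally the same decision with mm_users substituted for the
counter, trading multithreaded coverage for an atomic-free fast path.
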
> Thanks,
> Andrea
>
> Andrea Arcangeli (3):
>   mm: use_mm: fix for arches checking mm_users to optimize TLB flushes
>   arm64: select CPUMASK_OFFSTACK if NUMA
>   arm64: tlb: skip tlbi broadcast
>
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/efi.h         |  2 +-
>  arch/arm64/include/asm/mmu.h         |  4 +-
>  arch/arm64/include/asm/mmu_context.h | 33 ++++++++--
>  arch/arm64/include/asm/tlbflush.h    | 95 +++++++++++++++++++++++++++-
>  arch/arm64/mm/context.c              | 54 ++++++++++++++++
>  mm/mmu_context.c                     |  2 +
>  7 files changed, 180 insertions(+), 11 deletions(-)
>
> Early attempt with the standard mm_cpumask follows:
>
> From: Andrea Arcangeli
> Subject: mm: allow per-arch mm_cpumasks based on ARCH_NR_MM_CPUMASK
>
> Allow archs to allocate multiple mm_cpumasks in the mm_struct
> per-arch by defining ARCH_NR_MM_CPUMASK > 1 (to be defined before
> "linux/mm_types.h" is included).
>
> Those extra per-mm cpumasks can be referenced with
> __mm_cpumask(N, mm), where N == 0 points to the mm_cpumask() known
> by the common code and N > 0 points to the per-arch private ones.
> ---
>  drivers/firmware/efi/efi.c |  3 ++-
>  include/linux/mm_types.h   | 17 +++++++++++++++--
>  kernel/fork.c              |  3 ++-
>  mm/init-mm.c               |  2 +-
>  4 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
> index 5da0232ae33f..608c9bf181e5 100644
> --- a/drivers/firmware/efi/efi.c
> +++ b/drivers/firmware/efi/efi.c
> @@ -86,7 +86,8 @@ struct mm_struct efi_mm = {
>  	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
>  	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
>  	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
> -	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS) *
> +				    ARCH_NR_MM_CPUMASK] = 0},
>  };
>
>  struct workqueue_struct *efi_rts_wq;
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index f29bba20bba1..b53d5622b3b2 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -531,6 +531,9 @@ struct mm_struct {
>  	RH_KABI_RESERVE(7)
>  	RH_KABI_RESERVE(8)
>
> +#ifndef ARCH_NR_MM_CPUMASK
> +#define ARCH_NR_MM_CPUMASK 1
> +#endif
>  	/*
>  	 * The mm_cpumask needs to be at the end of mm_struct, because it
>  	 * is dynamically sized based on nr_cpu_ids.
> @@ -544,15 +547,25 @@ extern struct mm_struct init_mm;
>  static inline void mm_init_cpumask(struct mm_struct *mm)
>  {
>  	unsigned long cpu_bitmap = (unsigned long)mm;
> +	int i;
>
>  	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
> -	cpumask_clear((struct cpumask *)cpu_bitmap);
> +	for (i = 0; i < ARCH_NR_MM_CPUMASK; i++) {
> +		cpumask_clear((struct cpumask *)cpu_bitmap);
> +		cpu_bitmap += cpumask_size();
> +	}
>  }
>
>  /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
> +static inline cpumask_t *__mm_cpumask(int index, struct mm_struct *mm)
> +{
> +	return (struct cpumask *)((unsigned long)&mm->cpu_bitmap +
> +				  cpumask_size() * index);
> +}
> +
>  static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
>  {
> -	return (struct cpumask *)&mm->cpu_bitmap;
> +	return __mm_cpumask(0, mm);
>  }
>
>  struct mmu_gather;
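
One usage note on __mm_cpumask(), mostly to confirm I read the layout
right: index 0 stays the generic mm_cpumask() and the extra bitmaps
sit back to back behind it, so an arch opts in by defining
ARCH_NR_MM_CPUMASK somewhere visible before <linux/mm_types.h> and
wrapping the extra index in its own accessor. A sketch of what the
arm64 consumer might have looked like; the two helper names and the
file placement are my guesses, not the patch's:

	/*
	 * Hypothetically in arch/arm64/include/asm/mmu.h (or wherever
	 * is pulled in before <linux/mm_types.h>): two bitmaps, the
	 * generic one plus a "TLB possibly stale" mask used by the
	 * bitmap version further below.
	 */
	#define ARCH_NR_MM_CPUMASK 2

	/* Hypothetical arch-private accessor for index 1. */
	static inline cpumask_t *mm_stale_cpumask(struct mm_struct *mm)
	{
		return __mm_cpumask(1, mm);
	}

	/*
	 * E.g. mark every CPU as potentially holding stale TLB entries
	 * for this mm once a broadcast flush has been elided.
	 */
	static inline void mm_mark_tlb_stale(struct mm_struct *mm)
	{
		cpumask_setall(mm_stale_cpumask(mm));
	}

With the final nr_active_mm approach the second mask disappears
entirely, which is presumably why this prerequisite was dropped.
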
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1dad2f91fac3..a6cbbc1b6008 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2418,7 +2418,8 @@ void __init proc_caches_init(void)
>  	 * dynamically sized based on the maximum CPU number this system
>  	 * can have, taking hotplug into account (nr_cpu_ids).
>  	 */
> -	mm_size = sizeof(struct mm_struct) + cpumask_size();
> +	mm_size = sizeof(struct mm_struct) + cpumask_size() * \
> +		ARCH_NR_MM_CPUMASK;
>
>  	mm_cachep = kmem_cache_create_usercopy("mm_struct",
>  			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index a787a319211e..d975f8ce270e 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -35,6 +35,6 @@ struct mm_struct init_mm = {
>  	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.user_ns	= &init_user_ns,
> -	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS) * ARCH_NR_MM_CPUMASK] = 0},
>  	INIT_MM_CONTEXT(init_mm)
>  };
>
> [bitmap version depending on the above follows]
>
> @@ -248,6 +260,42 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  	cpu_switch_mm(mm->pgd, mm);
>  }
>
> +enum tlb_flush_types tlb_flush_check(struct mm_struct *mm, unsigned int cpu)
> +{
> +	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
> +		bool is_local = cpumask_test_cpu(cpu, mm_cpumask(mm));
> +		cpumask_t *stale_cpumask = __mm_cpumask(1, mm);
> +		int next_zero = cpumask_next_zero(-1, stale_cpumask);
> +		bool local_is_clear = false;
> +		if (next_zero < nr_cpu_ids &&
> +		    (is_local && next_zero == cpu)) {
> +			next_zero = cpumask_next_zero(next_zero, stale_cpumask);
> +			local_is_clear = true;
> +		}
> +		if (next_zero < nr_cpu_ids) {
> +			cpumask_setall(stale_cpumask);
> +			local_is_clear = false;
> +		}
> +
> +		/*
> +		 * Enforce CPU ordering between the
> +		 * cpumask_setall() and cpumask_any_but().
> +		 */
> +		smp_mb();
> +
> +		if (likely(cpumask_any_but(mm_cpumask(mm),
> +					   cpu) >= nr_cpu_ids)) {
> +			if (is_local) {
> +				if (!local_is_clear)
> +					cpumask_clear_cpu(cpu, stale_cpumask);
> +				return TLB_FLUSH_LOCAL;
> +			}
> +			return TLB_FLUSH_NO;
> +		}
> +	}
> +	return TLB_FLUSH_BROADCAST;
> +}
> +
>  /* Errata workaround post TTBRx_EL1 update. */
>  asmlinkage void post_ttbr_update_workaround(void)
>  {

For the 3-part series:

Acked-by: Larry Woodman
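
P.S. Since the stale-bitmap handshake in tlb_flush_check() took me a
couple of reads, here is how I understand it, for the archives: a
clear bit in __mm_cpumask(1, mm) means that CPU's TLB is still clean
for this mm. Whenever the function is about to skip the broadcast, it
first cpumask_setall()s the stale mask so every other CPU is marked
dirty (skipping the setall only if everything but possibly our own bit
is already set), and it re-clears our own bit when returning
TLB_FLUSH_LOCAL, since we flush locally anyway. The smp_mb() orders
the setall against the second cpumask_any_but(), so a CPU concurrently
switching to this mm either shows up in mm_cpumask() and forces
TLB_FLUSH_BROADCAST, or observes the stale mask already set; a
spurious setall before a failed recheck only costs an extra flush
later, which looks safe. The piece that has to exist on the other
side, not visible in the quoted hunk, is the switch-in check; a sketch
with names of my own:

	/*
	 * Hypothetical consumer on the context-switch side (not part
	 * of the quoted hunk): a CPU that starts running the mm must
	 * notice that a flush was elided while it was away.
	 */
	static inline void model_switch_in_check(struct mm_struct *mm,
						 unsigned int cpu)
	{
		cpumask_t *stale_cpumask = __mm_cpumask(1, mm);

		if (cpumask_test_cpu(cpu, stale_cpumask)) {
			/*
			 * Flush this CPU's TLB for the mm (or move to
			 * a fresh ASID) before clearing our stale bit
			 * and resuming userland.
			 */
			cpumask_clear_cpu(cpu, stale_cpumask);
		}
	}
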