On 02/23/2020 02:25 PM, Andrea Arcangeli wrote:
> Hello,
>
> This introduces an nr_active_mm counter that allows the tlbi
> broadcast to be optimized away for multi-threaded processes as well;
> it no longer relies on mm_users <= 1.
>
> This also optimizes away all TLB flushes (including local ones) when
> the process is not running on any CPU (including during exit_mmap
> with lazy tlb state).
>
> This optimization is generally only observable when there are
> parallel TLB flushes from different processes on multiple CPUs. One
> possible use case is userland malloc libs freeing small objects with
> MADV_DONTNEED and causing frequent tiny TLB flushes, as demonstrated
> by the tcmalloc testsuite.
>
> All memory-intensive apps dealing with a multitude of frequently
> freed small objects tend to opt out of glibc and opt in to jemalloc
> or tcmalloc, so this should help the SMP/NUMA scalability of
> long-lived apps with small objects running in different containers,
> if they issue frequent MADV_DONTNEED TLB flushes while the other
> threads of the process are not running.
>
> It was suggested that I implement the mm_cpumask the standard way in
> order to optimize multithreaded apps too and to avoid restricting the
> optimization to mm_users <= 1. So initially I had two bitmasks
> allocated as shown at the bottom of this cover letter, by setting
> ARCH_NR_MM_CPUMASK to 2 with the below patch applied... however I
> figured a single atomic counter per mm achieves the exact same
> runtime behavior as the extra bitmap, so I just dropped the extra
> bitmap and replaced it with nr_active_mm as an optimization.
>
> If the atomic ops in the switch_mm fast path are a concern (they are
> still faster than cpumask_set_cpu/cpumask_clear_cpu with fewer than
> 256-512 CPUs), it's worth mentioning it would be possible to remove
> all atomic ops from the switch_mm fast path by restricting this
> optimization to single-threaded processes, i.e. by checking
> mm_users <= 1 and < 1 instead of nr_active_mm <= 1 and < 1, similarly
> to what the earlier version of this patchset was doing.
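
To check my reading of the nr_active_mm scheme before acking (the
counter itself lands in patch 3/3, which is not quoted here), here is
a minimal userspace model of the bookkeeping and the flush decision as
I understand them from the paragraphs above; every name in it is mine,
not the patch's:

	/*
	 * Toy model of the nr_active_mm scheme.  The real code is
	 * arm64-specific and also has to order these updates against
	 * the page-table switch, which this model ignores.
	 */
	#include <stdatomic.h>
	#include <stdbool.h>

	struct model_mm {
		atomic_int nr_active_mm;  /* CPUs currently running this mm */
	};

	/* switch_mm fast path: one atomic inc for the incoming mm... */
	static void model_switch_in(struct model_mm *mm)
	{
		atomic_fetch_add(&mm->nr_active_mm, 1);
	}

	/* ...and one atomic dec for the outgoing mm. */
	static void model_switch_out(struct model_mm *mm)
	{
		atomic_fetch_sub(&mm->nr_active_mm, 1);
	}

	enum model_flush { FLUSH_NONE, FLUSH_LOCAL, FLUSH_BROADCAST };

	/*
	 * Flush decision on the CPU issuing the invalidate:
	 *  - counter < 1: the mm runs nowhere, skip the flush entirely
	 *    (stale entries are dealt with at the next switch_mm,
	 *    e.g. via a fresh ASID);
	 *  - counter == 1 and this CPU is the one running the mm:
	 *    a local tlbi is enough, no broadcast;
	 *  - otherwise: tlbi broadcast as today.
	 */
	static enum model_flush model_flush_mode(struct model_mm *mm,
						 bool running_on_this_cpu)
	{
		int active = atomic_load(&mm->nr_active_mm);

		if (active < 1)
			return FLUSH_NONE;
		if (active == 1 && running_on_this_cpu)
			return FLUSH_LOCAL;
		return FLUSH_BROADCAST;
	}

If that matches the intent, the mm_users <= 1 fallback mentioned above
is literally the same decision with mm_users substituted for the
counter, trading multithreaded coverage for an atomic-free fast path.
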
> Thanks,
> Andrea
>
> Andrea Arcangeli (3):
>   mm: use_mm: fix for arches checking mm_users to optimize TLB flushes
>   arm64: select CPUMASK_OFFSTACK if NUMA
>   arm64: tlb: skip tlbi broadcast
>
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/efi.h         |  2 +-
>  arch/arm64/include/asm/mmu.h         |  4 +-
>  arch/arm64/include/asm/mmu_context.h | 33 ++++++++--
>  arch/arm64/include/asm/tlbflush.h    | 95 +++++++++++++++++++++++++++-
>  arch/arm64/mm/context.c              | 54 ++++++++++++++++
>  mm/mmu_context.c                     |  2 +
>  7 files changed, 180 insertions(+), 11 deletions(-)
>
> Early attempt with the standard mm_cpumask follows:
>
> From: Andrea Arcangeli
> Subject: mm: allow per-arch mm_cpumasks based on ARCH_NR_MM_CPUMASK
>
> Allow archs to allocate multiple mm_cpumasks in the mm_struct
> per-arch by defining ARCH_NR_MM_CPUMASK > 1 (to be defined before
> "linux/mm_types.h" is included).
>
> Those extra per-mm cpumasks can be referenced with
> __mm_cpumask(N, mm), where N == 0 points to the mm_cpumask() known
> by the common code and N > 0 points to the per-arch private ones.
> ---
>  drivers/firmware/efi/efi.c |  3 ++-
>  include/linux/mm_types.h   | 17 +++++++++++++++--
>  kernel/fork.c              |  3 ++-
>  mm/init-mm.c               |  2 +-
>  4 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
> index 5da0232ae33f..608c9bf181e5 100644
> --- a/drivers/firmware/efi/efi.c
> +++ b/drivers/firmware/efi/efi.c
> @@ -86,7 +86,8 @@ struct mm_struct efi_mm = {
>  	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
>  	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
>  	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
> -	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS) *
> +				    ARCH_NR_MM_CPUMASK] = 0},
>  };
>
>  struct workqueue_struct *efi_rts_wq;
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index f29bba20bba1..b53d5622b3b2 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -531,6 +531,9 @@ struct mm_struct {
>  	RH_KABI_RESERVE(7)
>  	RH_KABI_RESERVE(8)
>
> +#ifndef ARCH_NR_MM_CPUMASK
> +#define ARCH_NR_MM_CPUMASK 1
> +#endif
>  	/*
>  	 * The mm_cpumask needs to be at the end of mm_struct, because it
>  	 * is dynamically sized based on nr_cpu_ids.
> @@ -544,15 +547,25 @@ extern struct mm_struct init_mm;
>  static inline void mm_init_cpumask(struct mm_struct *mm)
>  {
>  	unsigned long cpu_bitmap = (unsigned long)mm;
> +	int i;
>
>  	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
> -	cpumask_clear((struct cpumask *)cpu_bitmap);
> +	for (i = 0; i < ARCH_NR_MM_CPUMASK; i++) {
> +		cpumask_clear((struct cpumask *)cpu_bitmap);
> +		cpu_bitmap += cpumask_size();
> +	}
>  }
>
>  /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
> +static inline cpumask_t *__mm_cpumask(int index, struct mm_struct *mm)
> +{
> +	return (struct cpumask *)((unsigned long)&mm->cpu_bitmap +
> +				  cpumask_size() * index);
> +}
> +
>  static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
>  {
> -	return (struct cpumask *)&mm->cpu_bitmap;
> +	return __mm_cpumask(0, mm);
>  }
>
>  struct mmu_gather;
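
One usage note on __mm_cpumask(), mostly to confirm I read the layout
right: index 0 stays the generic mm_cpumask() and the extra bitmaps
sit back to back behind it, so an arch opts in by defining
ARCH_NR_MM_CPUMASK somewhere visible before <linux/mm_types.h> and
wrapping the extra index in its own accessor. A sketch of what the
arm64 consumer might have looked like; the two helper names and the
file placement are my guesses, not the patch's:

	/*
	 * Hypothetically in arch/arm64/include/asm/mmu.h (or wherever
	 * is pulled in before <linux/mm_types.h>): two bitmaps, the
	 * generic one plus a "TLB possibly stale" mask used by the
	 * bitmap version further below.
	 */
	#define ARCH_NR_MM_CPUMASK 2

	/* Hypothetical arch-private accessor for index 1. */
	static inline cpumask_t *mm_stale_cpumask(struct mm_struct *mm)
	{
		return __mm_cpumask(1, mm);
	}

	/*
	 * E.g. mark every CPU as potentially holding stale TLB entries
	 * for this mm once a broadcast flush has been elided.
	 */
	static inline void mm_mark_tlb_stale(struct mm_struct *mm)
	{
		cpumask_setall(mm_stale_cpumask(mm));
	}

With the final nr_active_mm approach the second mask disappears
entirely, which is presumably why this prerequisite was dropped.
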
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1dad2f91fac3..a6cbbc1b6008 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2418,7 +2418,8 @@ void __init proc_caches_init(void)
>  	 * dynamically sized based on the maximum CPU number this system
>  	 * can have, taking hotplug into account (nr_cpu_ids).
>  	 */
> -	mm_size = sizeof(struct mm_struct) + cpumask_size();
> +	mm_size = sizeof(struct mm_struct) + cpumask_size() * \
> +		ARCH_NR_MM_CPUMASK;
>
>  	mm_cachep = kmem_cache_create_usercopy("mm_struct",
>  			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index a787a319211e..d975f8ce270e 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -35,6 +35,6 @@ struct mm_struct init_mm = {
>  	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.user_ns	= &init_user_ns,
> -	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
> +	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS) * ARCH_NR_MM_CPUMASK] = 0},
>  	INIT_MM_CONTEXT(init_mm)
>  };
>
> [bitmap version depending on the above follows]
>
> @@ -248,6 +260,42 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  	cpu_switch_mm(mm->pgd, mm);
>  }
>
> +enum tlb_flush_types tlb_flush_check(struct mm_struct *mm, unsigned int cpu)
> +{
> +	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
> +		bool is_local = cpumask_test_cpu(cpu, mm_cpumask(mm));
> +		cpumask_t *stale_cpumask = __mm_cpumask(1, mm);
> +		int next_zero = cpumask_next_zero(-1, stale_cpumask);
> +		bool local_is_clear = false;
> +		if (next_zero < nr_cpu_ids &&
> +		    (is_local && next_zero == cpu)) {
> +			next_zero = cpumask_next_zero(next_zero, stale_cpumask);
> +			local_is_clear = true;
> +		}
> +		if (next_zero < nr_cpu_ids) {
> +			cpumask_setall(stale_cpumask);
> +			local_is_clear = false;
> +		}
> +
> +		/*
> +		 * Enforce CPU ordering between the
> +		 * cpumask_setall() and cpumask_any_but().
> +		 */
> +		smp_mb();
> +
> +		if (likely(cpumask_any_but(mm_cpumask(mm),
> +					   cpu) >= nr_cpu_ids)) {
> +			if (is_local) {
> +				if (!local_is_clear)
> +					cpumask_clear_cpu(cpu, stale_cpumask);
> +				return TLB_FLUSH_LOCAL;
> +			}
> +			return TLB_FLUSH_NO;
> +		}
> +	}
> +	return TLB_FLUSH_BROADCAST;
> +}
> +
>  /* Errata workaround post TTBRx_EL1 update. */
>  asmlinkage void post_ttbr_update_workaround(void)
>  {

For the 3-part series:

Acked-by: Larry Woodman
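
P.S. Since the stale-bitmap handshake in tlb_flush_check() took me a
couple of reads, here is how I understand it, for the archives: a
clear bit in __mm_cpumask(1, mm) means that CPU's TLB is still clean
for this mm. Whenever the function is about to skip the broadcast, it
first cpumask_setall()s the stale mask so every other CPU is marked
dirty (skipping the setall only if everything but possibly our own bit
is already set), and it re-clears our own bit when returning
TLB_FLUSH_LOCAL, since we flush locally anyway. The smp_mb() orders
the setall against the second cpumask_any_but(), so a CPU concurrently
switching to this mm either shows up in mm_cpumask() and forces
TLB_FLUSH_BROADCAST, or observes the stale mask already set; a
spurious setall before a failed recheck only costs an extra flush
later, which looks safe. The piece that has to exist on the other
side, not visible in the quoted hunk, is the switch-in check; a sketch
with names of my own:

	/*
	 * Hypothetical consumer on the context-switch side (not part
	 * of the quoted hunk): a CPU that starts running the mm must
	 * notice that a flush was elided while it was away.
	 */
	static inline void model_switch_in_check(struct mm_struct *mm,
						 unsigned int cpu)
	{
		cpumask_t *stale_cpumask = __mm_cpumask(1, mm);

		if (cpumask_test_cpu(cpu, stale_cpumask)) {
			/*
			 * Flush this CPU's TLB for the mm (or move to
			 * a fresh ASID) before clearing our stale bit
			 * and resuming userland.
			 */
			cpumask_clear_cpu(cpu, stale_cpumask);
		}
	}
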