Re: [RFC PATCH RESEND 28/28] kernel/fork: throttle call_rcu() calls in vm_area_free

From: Laurent Dufour <ldufour@linux.ibm.com>
To: Suren Baghdasaryan <surenb@google.com>, akpm@linux-foundation.org
Cc: michel@lespinasse.org, joelaf@google.com, songliubraving@fb.com,
	mhocko@suse.com, david@redhat.com, peterz@infradead.org,
	bigeasy@linutronix.de, peterx@redhat.com, dhowells@redhat.com,
	linux-mm@kvack.org, jglisse@google.com, dave@stgolabs.net,
	minchan@google.com, x86@kernel.org, hughd@google.com,
	willy@infradead.org, laurent.dufour@fr.ibm.com, mgorman@suse.de,
	rientjes@google.com, axelrasmussen@google.com,
	kernel-team@android.com, paulmck@kernel.org,
	liam.howlett@oracle.com, luto@kernel.org, vbabka@suse.cz,
	linux-arm-kernel@lists.infradead.org, kent.overstreet@linux.dev,
	linux-kernel@vger.kernel.org, hannes@cmpxchg.org,
	linuxppc-dev@lists.ozlabs.org
Subject: Re: [RFC PATCH RESEND 28/28] kernel/fork: throttle call_rcu() calls in vm_area_free
Date: Fri, 9 Sep 2022 17:19:09 +0200	[thread overview]
Message-ID: <34dd5656-dc3c-6a20-5390-04d05c619fdc@linux.ibm.com> (raw)
In-Reply-To: <20220901173516.702122-29-surenb@google.com>

Le 01/09/2022 à 19:35, Suren Baghdasaryan a écrit :
> call_rcu() can take a long time when callback offloading is enabled.
> Its use in the vm_area_free can cause regressions in the exit path when
> multiple VMAs are being freed. To minimize that impact, place VMAs into
> a list and free them in groups using one call_rcu() call per group.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/mm.h       |  1 +
>  include/linux/mm_types.h | 11 ++++++-
>  kernel/fork.c            | 68 +++++++++++++++++++++++++++++++++++-----
>  mm/init-mm.c             |  3 ++
>  mm/mmap.c                |  1 +
>  5 files changed, 75 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a3cbaa7b9119..81dff694ac14 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -249,6 +249,7 @@ void setup_initial_init_mm(void *start_code, void *end_code,
>  struct vm_area_struct *vm_area_alloc(struct mm_struct *);
>  struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
>  void vm_area_free(struct vm_area_struct *);
> +void drain_free_vmas(struct mm_struct *mm);
>  
>  #ifndef CONFIG_MMU
>  extern struct rb_root nommu_region_tree;
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 36562e702baf..6f3effc493b1 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -412,7 +412,11 @@ struct vm_area_struct {
>  			struct vm_area_struct *vm_next, *vm_prev;
>  		};
>  #ifdef CONFIG_PER_VMA_LOCK
> -		struct rcu_head vm_rcu;	/* Used for deferred freeing. */
> +		struct {
> +			struct list_head vm_free_list;
> +			/* Used for deferred freeing. */
> +			struct rcu_head vm_rcu;
> +		};
>  #endif
>  	};
>  
> @@ -573,6 +577,11 @@ struct mm_struct {
>  					  */
>  #ifdef CONFIG_PER_VMA_LOCK
>  		int mm_lock_seq;
> +		struct {
> +			struct list_head head;
> +			spinlock_t lock;
> +			int size;
> +		} vma_free_list;
>  #endif
>  
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b443ba3a247a..7c88710aed72 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -483,26 +483,75 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  }
>  
>  #ifdef CONFIG_PER_VMA_LOCK
> -static void __vm_area_free(struct rcu_head *head)
> +static inline void __vm_area_free(struct vm_area_struct *vma)
>  {
> -	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
> -						  vm_rcu);
>  	/* The vma should either have no lock holders or be write-locked. */
>  	vma_assert_no_reader(vma);
>  	kmem_cache_free(vm_area_cachep, vma);
>  }
> -#endif
> +
> +static void vma_free_rcu_callback(struct rcu_head *head)
> +{
> +	struct vm_area_struct *first_vma;
> +	struct vm_area_struct *vma, *vma2;
> +
> +	first_vma = container_of(head, struct vm_area_struct, vm_rcu);
> +	list_for_each_entry_safe(vma, vma2, &first_vma->vm_free_list, vm_free_list)

Is that safe to walk the list against concurrent calls to
list_splice_init(), or list_add()?

> +		__vm_area_free(vma);
> +	__vm_area_free(first_vma);
> +}
> +
> +void drain_free_vmas(struct mm_struct *mm)
> +{
> +	struct vm_area_struct *first_vma;
> +	LIST_HEAD(to_destroy);
> +
> +	spin_lock(&mm->vma_free_list.lock);
> +	list_splice_init(&mm->vma_free_list.head, &to_destroy);
> +	mm->vma_free_list.size = 0;
> +	spin_unlock(&mm->vma_free_list.lock);
> +
> +	if (list_empty(&to_destroy))
> +		return;
> +
> +	first_vma = list_first_entry(&to_destroy, struct vm_area_struct, vm_free_list);
> +	/* Remove the head which is allocated on the stack */
> +	list_del(&to_destroy);
> +
> +	call_rcu(&first_vma->vm_rcu, vma_free_rcu_callback);
> +}
> +
> +#define VM_AREA_FREE_LIST_MAX	32
> +
> +void vm_area_free(struct vm_area_struct *vma)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	bool drain;
> +
> +	free_anon_vma_name(vma);
> +
> +	spin_lock(&mm->vma_free_list.lock);
> +	list_add(&vma->vm_free_list, &mm->vma_free_list.head);
> +	mm->vma_free_list.size++;
> +	drain = mm->vma_free_list.size > VM_AREA_FREE_LIST_MAX;
> +	spin_unlock(&mm->vma_free_list.lock);
> +
> +	if (drain)
> +		drain_free_vmas(mm);
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +void drain_free_vmas(struct mm_struct *mm) {}
>  
>  void vm_area_free(struct vm_area_struct *vma)
>  {
>  	free_anon_vma_name(vma);
> -#ifdef CONFIG_PER_VMA_LOCK
> -	call_rcu(&vma->vm_rcu, __vm_area_free);
> -#else
>  	kmem_cache_free(vm_area_cachep, vma);
> -#endif
>  }
>  
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
>  static void account_kernel_stack(struct task_struct *tsk, int account)
>  {
>  	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
> @@ -1137,6 +1186,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	INIT_LIST_HEAD(&mm->mmlist);
>  #ifdef CONFIG_PER_VMA_LOCK
>  	WRITE_ONCE(mm->mm_lock_seq, 0);
> +	INIT_LIST_HEAD(&mm->vma_free_list.head);
> +	spin_lock_init(&mm->vma_free_list.lock);
> +	mm->vma_free_list.size = 0;
>  #endif
>  	mm_pgtables_bytes_init(mm);
>  	mm->map_count = 0;
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 8399f90d631c..7b6d2460545f 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -39,6 +39,9 @@ struct mm_struct init_mm = {
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  #ifdef CONFIG_PER_VMA_LOCK
>  	.mm_lock_seq	= 0,
> +	.vma_free_list.head = LIST_HEAD_INIT(init_mm.vma_free_list.head),
> +	.vma_free_list.lock =  __SPIN_LOCK_UNLOCKED(init_mm.vma_free_list.lock),
> +	.vma_free_list.size = 0,
>  #endif
>  	.user_ns	= &init_user_ns,
>  	.cpu_bitmap	= CPU_BITS_NONE,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 1edfcd384f5e..d61b7ef84ba6 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3149,6 +3149,7 @@ void exit_mmap(struct mm_struct *mm)
>  	}
>  	mm->mmap = NULL;
>  	mmap_write_unlock(mm);
> +	drain_free_vmas(mm);
>  	vm_unacct_memory(nr_accounted);
>  }
>