From: Uladzislau Rezki <urezki@gmail.com>
To: "Isaac J. Manjarres" <isaacm@codeaurora.org>
Cc: "Isaac J. Manjarres" <isaacm@codeaurora.org>,
	linux-mm.kvack.org@wittgenstein, linux-kernel@vger.kernel.org,
	akpm@linux-foundation.org, mingo@kernel.org,
	peterz@infradead.org, ebiederm@xmission.com, esyr@redhat.com,
	tglx@linutronix.de, christian@kellner.me, areber@redhat.com,
	shakeelb@google.com, cyphar@cyphar.com, psodagud@codeaurora.org,
	pratikp@codeaurora.org
Subject: Re: [RFC PATCH] fork: Free per-cpu cached vmalloc'ed thread stacks with vfree_atomic()
Date: Mon, 7 Sep 2020 16:21:31 +0200	[thread overview]
Message-ID: <20200907142131.GA16844@pc636> (raw)
In-Reply-To: <20200907084538.qz5g4mikznwxr67g@wittgenstein>

On Mon, Sep 07, 2020 at 10:45:38AM +0200, Christian Brauner wrote:
> On Sat, Sep 05, 2020 at 12:12:29AM +0000, Isaac J. Manjarres wrote:
> > The per-cpu cached vmalloc'ed stacks are currently freed in the
> > CPU hotplug teardown path by the free_vm_stack_cache() callback,
> > which invokes vfree(), which may result in purging the list of
> > lazily freed vmap areas.
> > 
> > Purging all of the lazily freed vmap areas can take a long time
> > when the list of vmap areas is large. This is problematic, as
> > free_vm_stack_cache() is invoked prior to the offline CPU's timers
> > being migrated; it can therefore delay timer migration in the CPU
> > hotplug teardown path, so timer callbacks end up being invoked long
> > after their timers have expired.
> > 
> > For example, on a system that has only one online CPU (CPU 1) that is
> > running a heavy workload, and another CPU that is being offlined,
> > the online CPU will invoke free_vm_stack_cache() to free the cached
> > vmalloc'ed stacks for the CPU being offlined. When there are 2702
> > vmap areas totaling 13498 pages, free_vm_stack_cache() takes
> > over 2 seconds to execute:
> > 
> > [001]   399.335808: cpuhp_enter: cpu: 0005 target:   0 step:  67 (free_vm_stack_cache)
> > 
> > /* The first vmap area to be freed */
> > [001]   399.337157: __purge_vmap_area_lazy: [0:2702] 0xffffffc033da8000 - 0xffffffc033dad000 (5 : 13498)
> > 
> > /* After two seconds */
> > [001]   401.528010: __purge_vmap_area_lazy: [1563:2702] 0xffffffc02fe10000 - 0xffffffc02fe15000 (5 : 5765)
> > 
> > Instead of freeing the per-cpu cached vmalloc'ed stacks synchronously
> > with respect to the CPU hotplug teardown state machine, free them
> > asynchronously to help move along the CPU hotplug teardown state machine
> > quickly.
> > 
> > Signed-off-by: Isaac J. Manjarres <isaacm@codeaurora.org>
> > ---
> 
> This needs reviews and acks from the good folks over at mm/.
> 
> >  kernel/fork.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 4d32190..68346a0 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -202,7 +202,7 @@ static int free_vm_stack_cache(unsigned int cpu)
> >  		if (!vm_stack)
> >  			continue;
> >  
> > -		vfree(vm_stack->addr);
> > +		vfree_atomic(vm_stack->addr);
> >  		cached_vm_stacks[i] = NULL;
> >  	}
> >  
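
For context on why the vfree_atomic() conversion helps at all: vfree() may
end up running the lazy purge directly, whereas vfree_atomic() only queues
the address on a per-cpu llist and defers the actual unmap to a workqueue.
A simplified sketch of that deferral (based on the current mm/vmalloc.c;
error checks, kmemleak hooks and the NMI check are omitted, and details may
differ between kernel versions, so treat it as illustrative only):

<snip>
struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

void vfree_atomic(const void *addr)
{
	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);

	if (!addr)
		return;

	/* Queue the address; only the first add needs to kick the worker. */
	if (llist_add((struct llist_node *)addr, &p->list))
		schedule_work(&p->wq);
}

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	/* The expensive unmap/purge work now runs in a kworker instead. */
	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}
<snip>

So the hotplug callback then only pays for an llist_add() plus
schedule_work(); the purge cost itself is not removed, just moved off the
hotplug path, which is why shrinking the purge work itself still matters.
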
Could you please also test the patch below?

<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index be4724b916b3..cef7a9768f01 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -419,6 +419,10 @@ static LLIST_HEAD(vmap_purge_list);
 static struct rb_root vmap_area_root = RB_ROOT;
 static bool vmap_initialized __read_mostly;

+static struct rb_root purge_vmap_area_root = RB_ROOT;
+static LIST_HEAD(purge_vmap_area_list);
+static DEFINE_SPINLOCK(purge_vmap_area_lock);
+
 /*
  * This kmem_cache is used for vmap_area objects. Instead of
  * allocating from slab we reuse an object from this cache to
@@ -743,7 +747,8 @@ insert_vmap_area_augment(struct vmap_area *va,
  */
 static __always_inline struct vmap_area *
 merge_or_add_vmap_area(struct vmap_area *va,
-       struct rb_root *root, struct list_head *head)
+       struct rb_root *root, struct list_head *head,
+       bool update)
 {
        struct vmap_area *sibling;
        struct list_head *next;
@@ -825,7 +830,9 @@ merge_or_add_vmap_area(struct vmap_area *va,
        /*
         * Last step is to check and update the tree.
         */
-       augment_tree_propagate_from(va);
+       if (update)
+               augment_tree_propagate_from(va);
+
        return va;
 }

@@ -1140,7 +1147,7 @@ static void free_vmap_area(struct vmap_area *va)
         * Insert/Merge it back to the free tree/list.
         */
        spin_lock(&free_vmap_area_lock);
-       merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+       merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list, true);
        spin_unlock(&free_vmap_area_lock);
 }
 
@@ -1328,32 +1335,33 @@ void set_iounmap_nonlazy(void)
 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 {
        unsigned long resched_threshold;
-       struct llist_node *valist;
-       struct vmap_area *va;
-       struct vmap_area *n_va;
+       struct list_head local_pure_list;
+       struct vmap_area *va, *n_va;
 
        lockdep_assert_held(&vmap_purge_lock);
 
-       valist = llist_del_all(&vmap_purge_list);
-       if (unlikely(valist == NULL))
+       spin_lock(&purge_vmap_area_lock);
+       purge_vmap_area_root = RB_ROOT;
+       list_replace_init(&purge_vmap_area_list, &local_pure_list);
+       spin_unlock(&purge_vmap_area_lock);
+
+       if (unlikely(list_empty(&local_pure_list)))
                return false;

-       /*
-        * TODO: to calculate a flush range without looping.
-        * The list can be up to lazy_max_pages() elements.
-        */
-       llist_for_each_entry(va, valist, purge_list) {
-               if (va->va_start < start)
-                       start = va->va_start;
-               if (va->va_end > end)
-                       end = va->va_end;
-       }
+
+       start = min(start,
+                       list_first_entry(&local_pure_list,
+                               struct vmap_area, list)->va_start);
+
+       end = max(end,
+                       list_last_entry(&local_pure_list,
+                               struct vmap_area, list)->va_end);
 
        flush_tlb_kernel_range(start, end);
        resched_threshold = lazy_max_pages() << 1;
 
        spin_lock(&free_vmap_area_lock);
-       llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+       list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
                unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
                unsigned long orig_start = va->va_start;
                unsigned long orig_end = va->va_end;
@@ -1364,7 +1372,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
                 * anything.
                 */
                va = merge_or_add_vmap_area(va, &free_vmap_area_root,
-                                           &free_vmap_area_list);
+                               &free_vmap_area_list, true);
 
                if (!va)
                        continue;
@@ -1421,9 +1429,19 @@ static void free_vmap_area_noflush(struct vmap_area *va)
        nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
                                PAGE_SHIFT, &vmap_lazy_nr);
 
-       /* After this point, we may free va at any time */
-       llist_add(&va->purge_list, &vmap_purge_list);
+       /*
+        * The node has been removed from the busy data
+        * structures, therefore merge or place it into
+        * the purge tree/list.
+        */
+       spin_lock(&purge_vmap_area_lock);
+       merge_or_add_vmap_area(va,
+               &purge_vmap_area_root, &purge_vmap_area_list, false);
+       spin_unlock(&purge_vmap_area_lock);
 
+       /*
+        * After this point, we may free va at any time.
+        */
        if (unlikely(nr_lazy > lazy_max_pages()))
                try_purge_vmap_area_lazy();
 }
@@ -3352,7 +3370,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-                                           &free_vmap_area_list);
+                               &free_vmap_area_list, true);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
@@ -3402,7 +3420,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
-                                           &free_vmap_area_list);
+                               &free_vmap_area_list, true);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
<snip>

Thank you!

--
Vlad Rezki


Thread overview: 4+ messages
     [not found] <010101745b9b7825-f4d1db4b-83fb-450e-9391-82aad77a1bd6-000000@us-west-2.amazonses.com>
2020-09-07  8:45 ` [RFC PATCH] fork: Free per-cpu cached vmalloc'ed thread stacks with vfree_atomic() Christian Brauner
2020-09-07 14:21   ` Uladzislau Rezki [this message]
     [not found] <010101745b9b7830-0392d16f-4ee3-4b0e-afc1-51ebb71d0cb3-000000@us-west-2.amazonses.com>
2020-09-14  3:37 ` Andrew Morton
2020-09-05  0:11 Isaac J. Manjarres
