* [PATCH 1/3] mm/vmalloc: Move draining areas out of caller context
From: Uladzislau Rezki (Sony) @ 2022-01-19 14:35 UTC
  To: Andrew Morton
  Cc: linux-mm, LKML, Christoph Hellwig, Matthew Wilcox,
	Nicholas Piggin, Uladzislau Rezki, Oleksiy Avramchenko

A caller initiates the drain process from its context once the
drain threshold is reached or passed. There are at least two
drawbacks to doing so:

a) The caller can be a high-priority or RT task. In that case it
   can get stuck performing the actual drain of all lazily freed
   areas. This is not optimal, because such tasks are usually
   latency sensitive and control should be returned to them as
   soon as possible. See 96e2db456135 ("mm/vmalloc: rework the
   drain logic")

b) It is not safe to call vfree() while holding a spinlock,
   because the drain path takes the vmap_purge_lock mutex (see
   the sketch below). There was a report about this from
   Zeal Robot <zealci@zte.com.cn> here:
   https://lore.kernel.org/all/20211222081026.484058-1-chi.minghao@zte.com.cn

Moving the drain to a separate work context addresses both
issues.
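
For illustration, a minimal kernel-style sketch of the unsafe
pattern behind drawback (b); the lock and function names here are
hypothetical, not taken from the report:

#include <linux/spinlock.h>
#include <linux/vmalloc.h>

static DEFINE_SPINLOCK(demo_lock);	/* hypothetical caller-side lock */

static void unsafe_free(void *ptr)
{
	spin_lock(&demo_lock);
	/*
	 * Before this change the drain could run synchronously here,
	 * taking the vmap_purge_lock mutex. Sleeping on a mutex while
	 * holding a spinlock is invalid.
	 */
	vfree(ptr);
	spin_unlock(&demo_lock);
}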

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bdc7222f87d4..ed0f9eaa61a9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -793,6 +793,9 @@ RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
 static void purge_vmap_area_lazy(void);
 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
 static unsigned long lazy_max_pages(void);
+static void drain_vmap_area(struct work_struct *work);
+static DECLARE_WORK(drain_vmap_area_work, drain_vmap_area);
+static atomic_t drain_vmap_area_work_in_progress;
 
 static atomic_long_t nr_vmalloc_pages;
 
@@ -1719,18 +1722,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 	return true;
 }
 
-/*
- * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
- * is already purging.
- */
-static void try_purge_vmap_area_lazy(void)
-{
-	if (mutex_trylock(&vmap_purge_lock)) {
-		__purge_vmap_area_lazy(ULONG_MAX, 0);
-		mutex_unlock(&vmap_purge_lock);
-	}
-}
-
 /*
  * Kick off a purge of the outstanding lazy areas.
  */
@@ -1742,6 +1733,23 @@ static void purge_vmap_area_lazy(void)
 	mutex_unlock(&vmap_purge_lock);
 }
 
+static void drain_vmap_area(struct work_struct *work)
+{
+	unsigned long nr_lazy;
+
+	do {
+		mutex_lock(&vmap_purge_lock);
+		__purge_vmap_area_lazy(ULONG_MAX, 0);
+		mutex_unlock(&vmap_purge_lock);
+
+		/* Recheck if further work is required. */
+		nr_lazy = atomic_long_read(&vmap_lazy_nr);
+	} while (nr_lazy > lazy_max_pages());
+
+	/* We are done at this point. */
+	atomic_set(&drain_vmap_area_work_in_progress, 0);
+}
+
 /*
  * Free a vmap area, caller ensuring that the area has been unmapped
  * and flush_cache_vunmap had been called for the correct range
@@ -1768,7 +1776,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 
 	/* After this point, we may free va at any time */
 	if (unlikely(nr_lazy > lazy_max_pages()))
-		try_purge_vmap_area_lazy();
+		if (!atomic_xchg(&drain_vmap_area_work_in_progress, 1))
+			schedule_work(&drain_vmap_area_work);
 }
 
 /*
-- 
2.30.2



* [PATCH 2/3] mm/vmalloc: Add adjust_search_size parameter
From: Uladzislau Rezki (Sony) @ 2022-01-19 14:35 UTC
  To: Andrew Morton
  Cc: linux-mm, LKML, Christoph Hellwig, Matthew Wilcox,
	Nicholas Piggin, Uladzislau Rezki, Oleksiy Avramchenko,
	Uladzislau Rezki

From: Uladzislau Rezki <uladzislau.rezki@sony.com>

Extend the find_vmap_lowest_match() function with one more
parameter, an "adjust_search_size" boolean, which makes it
possible to control the accuracy of the search when a specific
alignment is required.

With this patch the search size is adjusted by default, in order
to serve a request as fast as possible.

There is one exception though: short ranges, where the requested
size exactly corresponds to the passed [vstart:vend] restriction
together with a specific alignment request. In such a scenario
the adjustment would prevent the allocation from succeeding (see
the worked example below).
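
Why "size + align - 1"? A free block whose start address is
misaligned by up to (align - 1) bytes must still contain an
aligned chunk of the requested size. A small self-contained
userspace check of that bound (an illustration only, not kernel
code):

#include <assert.h>
#include <stdio.h>

static unsigned long align_up(unsigned long addr, unsigned long align)
{
	return (addr + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long align = 64, size = 128;
	unsigned long length = size + align - 1;	/* adjusted search size */
	unsigned long start;

	/*
	 * Whatever the misalignment of a free block's start address,
	 * "length" bytes always fit an aligned chunk of "size" bytes;
	 * a block of only "size" bytes may not.
	 */
	for (start = 0; start < align; start++)
		assert(align_up(start, align) + size <= start + length);

	printf("size=%lu align=%lu -> search length %lu\n",
	       size, align, length);
	return 0;
}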

Signed-off-by: Uladzislau Rezki <uladzislau.rezki@sony.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ed0f9eaa61a9..52ee67107046 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1192,22 +1192,28 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
 /*
  * Find the first free block(lowest start address) in the tree,
  * that will accomplish the request corresponding to passing
- * parameters.
+ * parameters. Please note, with an alignment bigger than PAGE_SIZE,
+ * a search length is adjusted to account for worst case alignment
+ * overhead.
  */
 static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size,
-	unsigned long align, unsigned long vstart)
+find_vmap_lowest_match(unsigned long size, unsigned long align,
+	unsigned long vstart, bool adjust_search_size)
 {
 	struct vmap_area *va;
 	struct rb_node *node;
+	unsigned long length;
 
 	/* Start from the root. */
 	node = free_vmap_area_root.rb_node;
 
+	/* Adjust the search size for alignment overhead. */
+	length = adjust_search_size ? size + align - 1 : size;
+
 	while (node) {
 		va = rb_entry(node, struct vmap_area, rb_node);
 
-		if (get_subtree_max_size(node->rb_left) >= size &&
+		if (get_subtree_max_size(node->rb_left) >= length &&
 				vstart < va->va_start) {
 			node = node->rb_left;
 		} else {
@@ -1217,9 +1223,9 @@ find_vmap_lowest_match(unsigned long size,
 			/*
 			 * Does not make sense to go deeper towards the right
 			 * sub-tree if it does not have a free block that is
-			 * equal or bigger to the requested search size.
+			 * equal or bigger to the requested search length.
 			 */
-			if (get_subtree_max_size(node->rb_right) >= size) {
+			if (get_subtree_max_size(node->rb_right) >= length) {
 				node = node->rb_right;
 				continue;
 			}
@@ -1235,7 +1241,7 @@ find_vmap_lowest_match(unsigned long size,
 				if (is_within_this_va(va, size, align, vstart))
 					return va;
 
-				if (get_subtree_max_size(node->rb_right) >= size &&
+				if (get_subtree_max_size(node->rb_right) >= length &&
 						vstart <= va->va_start) {
 					/*
 					 * Shift the vstart forward. Please note, we update it with
@@ -1283,7 +1289,7 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
 	get_random_bytes(&rnd, sizeof(rnd));
 	vstart = VMALLOC_START + rnd;
 
-	va_1 = find_vmap_lowest_match(size, align, vstart);
+	va_1 = find_vmap_lowest_match(size, align, vstart, false);
 	va_2 = find_vmap_lowest_linear_match(size, align, vstart);
 
 	if (va_1 != va_2)
@@ -1434,12 +1440,25 @@ static __always_inline unsigned long
 __alloc_vmap_area(unsigned long size, unsigned long align,
 	unsigned long vstart, unsigned long vend)
 {
+	bool adjust_search_size = true;
 	unsigned long nva_start_addr;
 	struct vmap_area *va;
 	enum fit_type type;
 	int ret;
 
-	va = find_vmap_lowest_match(size, align, vstart);
+	/*
+	 * Do not adjust when:
+	 *   a) align <= PAGE_SIZE, because it does not make any sense.
+	 *      All blocks(their start addresses) are at least PAGE_SIZE
+	 *      aligned anyway;
+	 *   b) a short range where a requested size corresponds to exactly
+	 *      specified [vstart:vend] interval and an alignment > PAGE_SIZE.
+	 *      With adjusted search length an allocation would not succeed.
+	 */
+	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
+		adjust_search_size = false;
+
+	va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
 	if (unlikely(!va))
 		return vend;
 
-- 
2.30.2



* [PATCH 3/3] mm/vmalloc: Eliminate an extra orig_gfp_mask
From: Uladzislau Rezki (Sony) @ 2022-01-19 14:35 UTC
  To: Andrew Morton
  Cc: linux-mm, LKML, Christoph Hellwig, Matthew Wilcox,
	Nicholas Piggin, Uladzislau Rezki, Oleksiy Avramchenko,
	Vasily Averin

That extra variable was introduced solely to preserve the
originally passed gfp_mask, because gfp_mask itself is updated
with __GFP_NOWARN on entry, which broke the error handling
messages.

Instead, keep the original gfp_mask unmodified and pass it, with
an extra __GFP_NOWARN flag ORed in, as a parameter to the
vm_area_alloc_pages() function. This makes the code less
confusing.
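
For context, warn_alloc() is silenced by __GFP_NOWARN, which is
why clobbering the caller's mask on entry suppressed the error
messages. A simplified, hypothetical reduction of that check:

#include <linux/printk.h>
#include <linux/gfp.h>

/* Sketch of the suppression logic inside warn_alloc(). */
static void warn_alloc_sketch(gfp_t gfp_mask, const char *msg)
{
	if (gfp_mask & __GFP_NOWARN)	/* clobbered mask => always silent */
		return;
	pr_warn("%s\n", msg);
}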

Cc: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 52ee67107046..04edd32ba6bc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2953,7 +2953,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 int node)
 {
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	const gfp_t orig_gfp_mask = gfp_mask;
 	bool nofail = gfp_mask & __GFP_NOFAIL;
 	unsigned long addr = (unsigned long)area->addr;
 	unsigned long size = get_vm_area_size(area);
@@ -2967,7 +2966,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	max_small_pages = ALIGN(size, 1UL << page_shift) >> PAGE_SHIFT;
 
 	array_size = (unsigned long)max_small_pages * sizeof(struct page *);
-	gfp_mask |= __GFP_NOWARN;
 	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
 		gfp_mask |= __GFP_HIGHMEM;
 
@@ -2980,7 +2978,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	}
 
 	if (!area->pages) {
-		warn_alloc(orig_gfp_mask, NULL,
+		warn_alloc(gfp_mask, NULL,
 			"vmalloc error: size %lu, failed to allocated page array size %lu",
 			nr_small_pages * PAGE_SIZE, array_size);
 		free_vm_area(area);
@@ -2990,8 +2988,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
 	page_order = vm_area_page_order(area);
 
-	area->nr_pages = vm_area_alloc_pages(gfp_mask, node,
-		page_order, nr_small_pages, area->pages);
+	area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
+		node, page_order, nr_small_pages, area->pages);
 
 	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 	if (gfp_mask & __GFP_ACCOUNT) {
@@ -3007,7 +3005,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	 * allocation request, free them via __vfree() if any.
 	 */
 	if (area->nr_pages != nr_small_pages) {
-		warn_alloc(orig_gfp_mask, NULL,
+		warn_alloc(gfp_mask, NULL,
 			"vmalloc error: size %lu, page order %u, failed to allocate pages",
 			area->nr_pages * PAGE_SIZE, page_order);
 		goto fail;
@@ -3035,7 +3033,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		memalloc_noio_restore(flags);
 
 	if (ret < 0) {
-		warn_alloc(orig_gfp_mask, NULL,
+		warn_alloc(gfp_mask, NULL,
 			"vmalloc error: size %lu, failed to map pages",
 			area->nr_pages * PAGE_SIZE);
 		goto fail;
-- 
2.30.2



* Re: [PATCH 1/3] mm/vmalloc: Move draining areas out of caller context
From: Christoph Hellwig @ 2022-01-20  9:12 UTC
  To: Uladzislau Rezki (Sony)
  Cc: Andrew Morton, linux-mm, LKML, Christoph Hellwig, Matthew Wilcox,
	Nicholas Piggin, Oleksiy Avramchenko

On Wed, Jan 19, 2022 at 03:35:38PM +0100, Uladzislau Rezki (Sony) wrote:
> +static void drain_vmap_area(struct work_struct *work)

Nit, but I prefer to have a _work postfix for workers just to keep
it easy to read.

>  	/* After this point, we may free va at any time */
>  	if (unlikely(nr_lazy > lazy_max_pages()))
> -		try_purge_vmap_area_lazy();
> +		if (!atomic_xchg(&drain_vmap_area_work_in_progress, 1))
> +			schedule_work(&drain_vmap_area_work);

Work items are defined to be single threaded, so I don't think we need
the drain_vmap_area_work_in_progress hack.



* Re: [PATCH 1/3] mm/vmalloc: Move draining areas out of caller context
From: Uladzislau Rezki @ 2022-01-20 10:42 UTC
  To: Christoph Hellwig
  Cc: Uladzislau Rezki (Sony),
	Andrew Morton, linux-mm, LKML, Matthew Wilcox, Nicholas Piggin,
	Oleksiy Avramchenko

> On Wed, Jan 19, 2022 at 03:35:38PM +0100, Uladzislau Rezki (Sony) wrote:
> > +static void drain_vmap_area(struct work_struct *work)
> 
> Nit, but I prefer to have a _work postix for workers just to keep
> it easy to ready.
> 
Will fix it!

> >  	/* After this point, we may free va at any time */
> >  	if (unlikely(nr_lazy > lazy_max_pages()))
> > -		try_purge_vmap_area_lazy();
> > +		if (!atomic_xchg(&drain_vmap_area_work_in_progress, 1))
> > +			schedule_work(&drain_vmap_area_work);
> 
> Work items are defined to be single threaded, so I don't think we need
> the drain_vmap_area_work_in_progress hack.
> 
The motivation for that hack was to prevent the drain work from
being queued several times at once, i.e. schedule_work() checks
only the pending bit.

If the work is already running, another caller of vfree() will
queue it one more time, since the pending bit is not set while
the work is in the TASK_RUNNING state.

Or am I missing something?
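
For reference, a minimal sketch of that window (simplified from
the patch; the kick_drain() helper name is hypothetical):

#include <linux/workqueue.h>
#include <linux/atomic.h>
#include <linux/printk.h>

static atomic_t drain_in_progress;	/* the guard under discussion */

static void drain_fn(struct work_struct *work)
{
	/*
	 * By the time this runs, the PENDING bit is already clear, so
	 * without the guard a concurrent schedule_work() would queue
	 * the item again while the drain is still in flight.
	 */
	pr_info("draining\n");		/* stands in for the actual purge */

	/* Drop the flag only once the drain is fully done. */
	atomic_set(&drain_in_progress, 0);
}
static DECLARE_WORK(drain_work, drain_fn);

static void kick_drain(void)
{
	/* atomic_xchg() returns the old value: only the 0->1 winner queues. */
	if (!atomic_xchg(&drain_in_progress, 1))
		schedule_work(&drain_work);
}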

--
Vlad Rezki


