* [PATCH 1/2] mm/util: Add kvmalloc_node_caller
@ 2021-03-22 19:38 Matthew Wilcox (Oracle)
  2021-03-22 19:38 ` [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages Matthew Wilcox (Oracle)
  0 siblings, 1 reply; 11+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-03-22 19:38 UTC (permalink / raw)
  To: Andrew Morton, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Vlastimil Babka, linux-mm, linux-kernel,
	Uladzislau Rezki, Nicholas Piggin
  Cc: Matthew Wilcox (Oracle)

Allow the caller of kvmalloc to specify who counts as the allocator
of the memory instead of assuming it's the immediate caller.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
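Not part of the patch, just an illustration of the intended use (the
wrapper name below is made up): passing _RET_IP_ means the wrapper's own
caller is recorded as the allocator, which is the same pattern the new
kvmalloc_node() uses internally.

	static void *my_kvzalloc(size_t size)
	{
		return kvmalloc_node_caller(size, GFP_KERNEL | __GFP_ZERO,
					    NUMA_NO_NODE, _RET_IP_);
	}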
 include/linux/mm.h   |  4 +++-
 include/linux/slab.h |  2 ++
 mm/util.c            | 52 ++++++++++++++++++++++++--------------------
 3 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb1e191da319..b65a7105d9a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -786,7 +786,9 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
+void *kvmalloc_node_caller(size_t size, gfp_t flags, int node,
+		unsigned long caller);
+void *kvmalloc_node(size_t size, gfp_t flags, int node);
 static inline void *kvmalloc(size_t size, gfp_t flags)
 {
 	return kvmalloc_node(size, flags, NUMA_NO_NODE);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0c97d788762c..6611b8ee55ee 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -663,6 +663,8 @@ extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
 
 #else /* CONFIG_NUMA */
 
+#define __kmalloc_node_track_caller(size, flags, node, caller) \
+	__kmalloc_track_caller(size, flags, caller)
 #define kmalloc_node_track_caller(size, flags, node) \
 	kmalloc_track_caller(size, flags)
 
diff --git a/mm/util.c b/mm/util.c
index 0b6dd9d81da7..7120bb8ff9ca 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -539,26 +539,8 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_mmap);
 
-/**
- * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
- * fall back to vmalloc.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *kvmalloc_node(size_t size, gfp_t flags, int node)
+void *kvmalloc_node_caller(size_t size, gfp_t flags, int node,
+		unsigned long caller)
 {
 	gfp_t kmalloc_flags = flags;
 	void *ret;
@@ -584,7 +566,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
 			kmalloc_flags |= __GFP_NORETRY;
 	}
 
-	ret = kmalloc_node(size, kmalloc_flags, node);
+	ret = __kmalloc_node_track_caller(size, kmalloc_flags, node, caller);
 
 	/*
 	 * It doesn't really make sense to fallback to vmalloc for sub page
@@ -593,8 +575,32 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
 	if (ret || size <= PAGE_SIZE)
 		return ret;
 
-	return __vmalloc_node(size, 1, flags, node,
-			__builtin_return_address(0));
+	return __vmalloc_node(size, 1, flags, node, (void *)caller);
+}
+
+/**
+ * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: GFP flags for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
+ * fall back to vmalloc.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure.
+ * %ZERO_SIZE_PTR if @size is zero.
+ */
+void *kvmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return kvmalloc_node_caller(size, flags, node, _RET_IP_);
 }
 EXPORT_SYMBOL(kvmalloc_node);
 
-- 
2.30.2



* [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-22 19:38 [PATCH 1/2] mm/util: Add kvmalloc_node_caller Matthew Wilcox (Oracle)
@ 2021-03-22 19:38 ` Matthew Wilcox (Oracle)
  2021-03-22 22:36   ` Uladzislau Rezki
  0 siblings, 1 reply; 11+ messages in thread
From: Matthew Wilcox (Oracle) @ 2021-03-22 19:38 UTC (permalink / raw)
  To: Andrew Morton, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Vlastimil Babka, linux-mm, linux-kernel,
	Uladzislau Rezki, Nicholas Piggin
  Cc: Matthew Wilcox (Oracle)

If we're trying to allocate 4MB of memory, the table will be 8KiB in size
(1024 pointers * 8 bytes per pointer), which can usually be satisfied
by a kmalloc (which is significantly faster).  Instead of changing this
open-coded implementation, just use kvmalloc().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
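Not part of the patch, just the changelog arithmetic spelled out (assumes
4KiB pages and 8-byte pointers; the helper name is made up):

	/* bytes needed for the page-pointer table of a vmalloc of @size */
	static size_t vmalloc_table_size(size_t size)
	{
		/* 4MB -> 1024 pages -> 1024 * 8 bytes = 8KiB */
		return (size >> PAGE_SHIFT) * sizeof(struct page *);
	}

so the table stays small enough for kmalloc until the vmalloc request
itself gets very large.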
 mm/vmalloc.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 96444d64129a..32b640a84250 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2802,13 +2802,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		gfp_mask |= __GFP_HIGHMEM;
 
 	/* Please note that the recursion is strictly bounded. */
-	if (array_size > PAGE_SIZE) {
-		pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+	pages = kvmalloc_node_caller(array_size, nested_gfp, node,
 					area->caller);
-	} else {
-		pages = kmalloc_node(array_size, nested_gfp, node);
-	}
-
 	if (!pages) {
 		free_vm_area(area);
 		return NULL;
-- 
2.30.2



* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-22 19:38 ` [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages Matthew Wilcox (Oracle)
@ 2021-03-22 22:36   ` Uladzislau Rezki
  2021-03-22 23:03     ` Matthew Wilcox
  0 siblings, 1 reply; 11+ messages in thread
From: Uladzislau Rezki @ 2021-03-22 22:36 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Andrew Morton, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Vlastimil Babka, linux-mm, linux-kernel,
	Uladzislau Rezki, Nicholas Piggin

On Mon, Mar 22, 2021 at 07:38:20PM +0000, Matthew Wilcox (Oracle) wrote:
> If we're trying to allocate 4MB of memory, the table will be 8KiB in size
> (1024 pointers * 8 bytes per pointer), which can usually be satisfied
> by a kmalloc (which is significantly faster).  Instead of changing this
> open-coded implementation, just use kvmalloc().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/vmalloc.c | 7 +------
>  1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 96444d64129a..32b640a84250 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2802,13 +2802,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  		gfp_mask |= __GFP_HIGHMEM;
>  
>  	/* Please note that the recursion is strictly bounded. */
> -	if (array_size > PAGE_SIZE) {
> -		pages = __vmalloc_node(array_size, 1, nested_gfp, node,
> +	pages = kvmalloc_node_caller(array_size, nested_gfp, node,
>  					area->caller);
> -	} else {
> -		pages = kmalloc_node(array_size, nested_gfp, node);
> -	}
> -
>  	if (!pages) {
>  		free_vm_area(area);
>  		return NULL;
> -- 
> 2.30.2
Makes sense to me. Though I expected a bigger difference:

# patch
single CPU, 4MB allocation, loops: 1000000 avg: 85293854 usec

# default
single CPU, 4MB allocation, loops: 1000000 avg: 89275857 usec

One question. Should we care much about fragmentation? I mean
with the patch, allocations > 2MB will make requests to SLAB bigger
than PAGE_SIZE.

Thanks!

--
Vlad Rezki


* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-22 22:36   ` Uladzislau Rezki
@ 2021-03-22 23:03     ` Matthew Wilcox
  2021-03-23 12:04       ` Uladzislau Rezki
  0 siblings, 1 reply; 11+ messages in thread
From: Matthew Wilcox @ 2021-03-22 23:03 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Andrew Morton, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Vlastimil Babka, linux-mm, linux-kernel,
	Nicholas Piggin

On Mon, Mar 22, 2021 at 11:36:19PM +0100, Uladzislau Rezki wrote:
> On Mon, Mar 22, 2021 at 07:38:20PM +0000, Matthew Wilcox (Oracle) wrote:
> > If we're trying to allocate 4MB of memory, the table will be 8KiB in size
> > (1024 pointers * 8 bytes per pointer), which can usually be satisfied
> > by a kmalloc (which is significantly faster).  Instead of changing this
> > open-coded implementation, just use kvmalloc().
> > 
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > ---
> >  mm/vmalloc.c | 7 +------
> >  1 file changed, 1 insertion(+), 6 deletions(-)
> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 96444d64129a..32b640a84250 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2802,13 +2802,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
> >  		gfp_mask |= __GFP_HIGHMEM;
> >  
> >  	/* Please note that the recursion is strictly bounded. */
> > -	if (array_size > PAGE_SIZE) {
> > -		pages = __vmalloc_node(array_size, 1, nested_gfp, node,
> > +	pages = kvmalloc_node_caller(array_size, nested_gfp, node,
> >  					area->caller);
> > -	} else {
> > -		pages = kmalloc_node(array_size, nested_gfp, node);
> > -	}
> > -
> >  	if (!pages) {
> >  		free_vm_area(area);
> >  		return NULL;
> > -- 
> > 2.30.2
> Makes sense to me. Though I expected a bigger difference:
> 
> # patch
> single CPU, 4MB allocation, loops: 1000000 avg: 85293854 usec
> 
> # default
> single CPU, 4MB allocation, loops: 1000000 avg: 89275857 usec

Well, 4.5% isn't something to leave on the table ... but yeah, I was
expecting more in the 10-20% range.  It may be more significant if
there's contention on the spinlocks (like if this crazy ksmbd is calling
vmalloc(4MB) on multiple nodes simultaneously).

I suspect the vast majority of the time is spent calling alloc_pages_node()
1024 times.  Have you looked at Mel's patch to do ... well, exactly what
vmalloc() wants?

https://lore.kernel.org/linux-mm/20210322091845.16437-1-mgorman@techsingularity.net/

> One question. Should we care much about fragmentation? I mean
> with the patch, allocations > 2MB will make requests to SLAB bigger
> than PAGE_SIZE.

We're pretty good about allocating memory in larger chunks these days.
Looking at my laptop's slabinfo,
kmalloc-8k           219    232   8192    4    8 : tunables    0    0    0 : slabdata     58     58      0

That's using 8 pages per slab, so that's order-3 allocations.  There's a
few more of those:

$ sudo grep '8 :' /proc/slabinfo |wc
     42     672    4508

so I have confidence that kvmalloc() will manage to use kmalloc up to 16MB
vmalloc allocations, and after that it'll tend to fall back to vmalloc.



* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-22 23:03     ` Matthew Wilcox
@ 2021-03-23 12:04       ` Uladzislau Rezki
  2021-03-23 12:39         ` Matthew Wilcox
  0 siblings, 1 reply; 11+ messages in thread
From: Uladzislau Rezki @ 2021-03-23 12:04 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Uladzislau Rezki, Andrew Morton, Christoph Lameter, Pekka Enberg,
	David Rientjes, Joonsoo Kim, Vlastimil Babka, linux-mm,
	linux-kernel, Nicholas Piggin

On Mon, Mar 22, 2021 at 11:03:11PM +0000, Matthew Wilcox wrote:
> On Mon, Mar 22, 2021 at 11:36:19PM +0100, Uladzislau Rezki wrote:
> > On Mon, Mar 22, 2021 at 07:38:20PM +0000, Matthew Wilcox (Oracle) wrote:
> > > If we're trying to allocate 4MB of memory, the table will be 8KiB in size
> > > (1024 pointers * 8 bytes per pointer), which can usually be satisfied
> > > by a kmalloc (which is significantly faster).  Instead of changing this
> > > open-coded implementation, just use kvmalloc().
> > > 
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > ---
> > >  mm/vmalloc.c | 7 +------
> > >  1 file changed, 1 insertion(+), 6 deletions(-)
> > > 
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index 96444d64129a..32b640a84250 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -2802,13 +2802,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
> > >  		gfp_mask |= __GFP_HIGHMEM;
> > >  
> > >  	/* Please note that the recursion is strictly bounded. */
> > > -	if (array_size > PAGE_SIZE) {
> > > -		pages = __vmalloc_node(array_size, 1, nested_gfp, node,
> > > +	pages = kvmalloc_node_caller(array_size, nested_gfp, node,
> > >  					area->caller);
> > > -	} else {
> > > -		pages = kmalloc_node(array_size, nested_gfp, node);
> > > -	}
> > > -
> > >  	if (!pages) {
> > >  		free_vm_area(area);
> > >  		return NULL;
> > > -- 
> > > 2.30.2
> > Makes sense to me. Though I expected a bigger difference:
> > 
> > # patch
> > single CPU, 4MB allocation, loops: 1000000 avg: 85293854 usec
> > 
> > # default
> > single CPU, 4MB allocation, loops: 1000000 avg: 89275857 usec
> 
> Well, 4.5% isn't something to leave on the table ... but yeah, I was
> expecting more in the 10-20% range.  It may be more significant if
> there's contention on the spinlocks (like if this crazy ksmbd is calling
> vmalloc(4MB) on multiple nodes simultaneously).
> 
Yep, it could be that simultaneous allocations will show even bigger
improvements because of lock contention. Anyway, there is an advantage
in switching to SLAB - 5% is also a win :)

>
> I suspect the vast majority of the time is spent calling alloc_pages_node()
> 1024 times.  Have you looked at Mel's patch to do ... well, exactly what
> vmalloc() wants?
> 
<snip>
-   97.37%     0.00%  vmalloc_test/0   [kernel.vmlinux]  [k] ret_from_fork
     ret_from_fork
     kthread
   - 0xffffffffc047373b
      - 52.67% 0xffffffffc047349f
           __vmalloc_node
         - __vmalloc_node_range
            - 45.25% __alloc_pages_nodemask
               - 37.59% get_page_from_freelist
                    4.34% __list_del_entry_valid
                    3.67% __list_add_valid
                    1.52% prep_new_page
                    1.20% check_preemption_disabled
              3.75% map_kernel_range_noflush
            - 0.64% kvmalloc_node_caller
                 __kmalloc_track_caller
                 memset_orig
      - 44.61% 0xffffffffc047348d
         - __vunmap
            - 35.56% free_unref_page
               - 22.48% free_pcppages_bulk
                  - 4.21% __mod_zone_page_state
                       2.78% check_preemption_disabled
                       0.80% __this_cpu_preempt_check
                    2.24% __list_del_entry_valid
                    1.84% __list_add_valid
               - 6.55% free_unref_page_commit
                    2.47% check_preemption_disabled
                    1.36% __list_add_valid
                 3.10% free_unref_page_prepare.part.88
                 0.72% free_pcp_prepare
            - 6.26% remove_vm_area
                 6.18% unmap_kernel_range_noflush
              2.31% __free_pages
<snip>

__alloc_pages_nodemask() consumes a lot of cycles because it is called
once per page and, like you mentioned, for a 4MB request it is invoked
1024 times!

>
> https://lore.kernel.org/linux-mm/20210322091845.16437-1-mgorman@techsingularity.net/
>
I saw it. It would be good to switch to the bulk interface for vmalloc
once it is settled and mainlined. Apart from that, I also find it useful
for the kvfree_rcu() code in the context of page-cache refilling :)

> 
> > One question. Should we care much about fragmentation? I mean
> > with the patch, allocations > 2MB will make requests to SLAB bigger
> > than PAGE_SIZE.
> 
> We're pretty good about allocating memory in larger chunks these days.
> Looking at my laptop's slabinfo,
> kmalloc-8k           219    232   8192    4    8 : tunables    0    0    0 : slabdata     58     58      0
> 
> That's using 8 pages per slab, so that's order-3 allocations.  There's a
> few more of those:
> 
> $ sudo grep '8 :' /proc/slabinfo |wc
>      42     672    4508
> 
> so I have confidence that kvmalloc() will manage to use kmalloc up to 16MB
> vmalloc allocations, and after that it'll tend to fall back to vmalloc.
>

Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>

Thanks!

--
Vlad Rezki


* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-23 12:04       ` Uladzislau Rezki
@ 2021-03-23 12:39         ` Matthew Wilcox
  2021-03-23 13:39           ` Uladzislau Rezki
  2021-03-23 20:39           ` Uladzislau Rezki
  0 siblings, 2 replies; 11+ messages in thread
From: Matthew Wilcox @ 2021-03-23 12:39 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Andrew Morton, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Vlastimil Babka, linux-mm, linux-kernel,
	Nicholas Piggin

On Tue, Mar 23, 2021 at 01:04:36PM +0100, Uladzislau Rezki wrote:
> On Mon, Mar 22, 2021 at 11:03:11PM +0000, Matthew Wilcox wrote:
> > I suspect the vast majority of the time is spent calling alloc_pages_node()
> > 1024 times.  Have you looked at Mel's patch to do ... well, exactly what
> > vmalloc() wants?
> > 
> <snip>
>          - __vmalloc_node_range
>             - 45.25% __alloc_pages_nodemask
>                - 37.59% get_page_from_freelist
[...]
>       - 44.61% 0xffffffffc047348d
>          - __vunmap
>             - 35.56% free_unref_page

Hmm!  I hadn't been thinking about the free side of things.
Does this make a difference?

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4f5f8c907897..61d5b769fea0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2277,16 +2277,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	vm_remove_mappings(area, deallocate_pages);
 
 	if (deallocate_pages) {
-		int i;
-
-		for (i = 0; i < area->nr_pages; i++) {
-			struct page *page = area->pages[i];
-
-			BUG_ON(!page);
-			__free_pages(page, 0);
-		}
+		release_pages(area->pages, area->nr_pages);
 		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
-
 		kvfree(area->pages);
 	}

release_pages does a bunch of checks that are unnecessary ... we could
probably just do:

		LIST_HEAD(pages_to_free);

		for (i = 0; i < area->nr_pages; i++) {
			struct page *page = area->pages[i];
			if (put_page_testzero(page))
				list_add(&page->lru, &pages_to_free);
		}
		free_unref_page_list(&pages_to_free);

but let's see if the provided interface gets us the performance we want.
 
> Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> 
> Thanks!

Thank you!


* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-23 12:39         ` Matthew Wilcox
@ 2021-03-23 13:39           ` Uladzislau Rezki
  2021-03-23 14:07             ` Matthew Wilcox
  2021-03-23 20:39           ` Uladzislau Rezki
  1 sibling, 1 reply; 11+ messages in thread
From: Uladzislau Rezki @ 2021-03-23 13:39 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Uladzislau Rezki, Andrew Morton, Christoph Lameter, Pekka Enberg,
	David Rientjes, Joonsoo Kim, Vlastimil Babka, linux-mm,
	linux-kernel, Nicholas Piggin

On Tue, Mar 23, 2021 at 12:39:13PM +0000, Matthew Wilcox wrote:
> On Tue, Mar 23, 2021 at 01:04:36PM +0100, Uladzislau Rezki wrote:
> > On Mon, Mar 22, 2021 at 11:03:11PM +0000, Matthew Wilcox wrote:
> > > I suspect the vast majority of the time is spent calling alloc_pages_node()
> > > 1024 times.  Have you looked at Mel's patch to do ... well, exactly what
> > > vmalloc() wants?
> > > 
> > <snip>
> >          - __vmalloc_node_range
> >             - 45.25% __alloc_pages_nodemask
> >                - 37.59% get_page_from_freelist
> [...]
> >       - 44.61% 0xffffffffc047348d
> >          - __vunmap
> >             - 35.56% free_unref_page
> 
> Hmm!  I hadn't been thinking about the free side of things.
> Does this make a difference?
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 4f5f8c907897..61d5b769fea0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2277,16 +2277,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  	vm_remove_mappings(area, deallocate_pages);
>  
>  	if (deallocate_pages) {
> -		int i;
> -
> -		for (i = 0; i < area->nr_pages; i++) {
> -			struct page *page = area->pages[i];
> -
> -			BUG_ON(!page);
> -			__free_pages(page, 0);
> -		}
> +		release_pages(area->pages, area->nr_pages);
>  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
> -
>  		kvfree(area->pages);
>  	}
> 
Will check it today!

> release_pages does a bunch of checks that are unnecessary ... we could
> probably just do:
> 
> 		LIST_HEAD(pages_to_free);
> 
> 		for (i = 0; i < area->nr_pages; i++) {
> 			struct page *page = area->pages[i];
> 			if (put_page_testzero(page))
> 				list_add(&page->lru, &pages_to_free);
> 		}
> 		free_unref_page_list(&pages_to_free);
> 
> but let's see if the provided interface gets us the performance we want.
>  
> > Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > 
> > Thanks!
> 
> Thank you!
You are welcome. A small nit:

  CC      mm/vmalloc.o
mm/vmalloc.c: In function ‘__vmalloc_area_node’:
mm/vmalloc.c:2492:14: warning: passing argument 4 of ‘kvmalloc_node_caller’ makes integer from pointer without a cast [-Wint-conversion]
          area->caller);
          ~~~~^~~~~~~~
In file included from mm/vmalloc.c:12:
./include/linux/mm.h:782:7: note: expected ‘long unsigned int’ but argument is of type ‘const void *’
 void *kvmalloc_node_caller(size_t size, gfp_t flags, int node,

<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8a202ba263f6..ee6fa44983bc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2489,7 +2489,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
        /* Please note that the recursion is strictly bounded. */
        pages = kvmalloc_node_caller(array_size, nested_gfp, node,
-                                                                area->caller);
+                               (unsigned long) area->caller);
        if (!pages) {
                free_vm_area(area);
                return NULL;
<snip>

As for the bulk-array interface. I have checked the:

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git mm-bulk-rebase-v6r2

applied the patch that is in question + below one:

<snip>
@@ -2503,25 +2498,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
        area->pages = pages;
        area->nr_pages = nr_pages;
 
-       for (i = 0; i < area->nr_pages; i++) {
-               struct page *page;
-
-               if (node == NUMA_NO_NODE)
-                       page = alloc_page(gfp_mask);
-               else
-                       page = alloc_pages_node(node, gfp_mask, 0);
-
-               if (unlikely(!page)) {
-                       /* Successfully allocated i pages, free them in __vfree() */
-                       area->nr_pages = i;
-                       atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
-                       goto fail;
-               }
-               area->pages[i] = page;
-               if (gfpflags_allow_blocking(gfp_mask))
-                       cond_resched();
+       ret = alloc_pages_bulk_array(gfp_mask, area->nr_pages, area->pages);
+       if (ret == nr_pages)
+               atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+       else {
+               area->nr_pages = ret;
+               goto fail;
        }
-       atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
<snip>

single CPU, 4MB allocation, 1000000 avg: 70639437 usec
single CPU, 4MB allocation, 1000000 avg: 89218654 usec

and now we get ~21% delta. That is very good :)

--
Vlad Rezki


* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-23 13:39           ` Uladzislau Rezki
@ 2021-03-23 14:07             ` Matthew Wilcox
  2021-03-23 20:49               ` Uladzislau Rezki
  0 siblings, 1 reply; 11+ messages in thread
From: Matthew Wilcox @ 2021-03-23 14:07 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Andrew Morton, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Vlastimil Babka, linux-mm, linux-kernel,
	Nicholas Piggin

On Tue, Mar 23, 2021 at 02:39:48PM +0100, Uladzislau Rezki wrote:
> On Tue, Mar 23, 2021 at 12:39:13PM +0000, Matthew Wilcox wrote:
> > On Tue, Mar 23, 2021 at 01:04:36PM +0100, Uladzislau Rezki wrote:
> > > On Mon, Mar 22, 2021 at 11:03:11PM +0000, Matthew Wilcox wrote:
> > > > I suspect the vast majority of the time is spent calling alloc_pages_node()
> > > > 1024 times.  Have you looked at Mel's patch to do ... well, exactly what
> > > > vmalloc() wants?
> > > > 
> > > <snip>
> > >          - __vmalloc_node_range
> > >             - 45.25% __alloc_pages_nodemask
> > >                - 37.59% get_page_from_freelist
> > [...]
> > >       - 44.61% 0xffffffffc047348d
> > >          - __vunmap
> > >             - 35.56% free_unref_page
> > 
> > Hmm!  I hadn't been thinking about the free side of things.
> > Does this make a difference?
> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 4f5f8c907897..61d5b769fea0 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2277,16 +2277,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
> >  	vm_remove_mappings(area, deallocate_pages);
> >  
> >  	if (deallocate_pages) {
> > -		int i;
> > -
> > -		for (i = 0; i < area->nr_pages; i++) {
> > -			struct page *page = area->pages[i];
> > -
> > -			BUG_ON(!page);
> > -			__free_pages(page, 0);
> > -		}
> > +		release_pages(area->pages, area->nr_pages);
> >  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
> > -
> >  		kvfree(area->pages);
> >  	}
> > 
> Will check it today!
> 
> > release_pages does a bunch of checks that are unnecessary ... we could
> > probably just do:
> > 
> > 		LIST_HEAD(pages_to_free);
> > 
> > 		for (i = 0; i < area->nr_pages; i++) {
> > 			struct page *page = area->pages[i];
> > 			if (put_page_testzero(page))
> > 				list_add(&page->lru, &pages_to_free);
> > 		}
> > 		free_unref_page_list(&pages_to_free);
> > 
> > but let's see if the provided interface gets us the performance we want.
> >  
> > > Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > 
> > > Thanks!
> > 
> > Thank you!
> You are welcome. A small nit:
> 
>   CC      mm/vmalloc.o
> mm/vmalloc.c: In function ‘__vmalloc_area_node’:
> mm/vmalloc.c:2492:14: warning: passing argument 4 of ‘kvmalloc_node_caller’ makes integer from pointer without a cast [-Wint-conversion]
>           area->caller);
>           ~~~~^~~~~~~~
> In file included from mm/vmalloc.c:12:
> ./include/linux/mm.h:782:7: note: expected ‘long unsigned int’ but argument is of type ‘const void *’
>  void *kvmalloc_node_caller(size_t size, gfp_t flags, int node,

Oh, thank you!  I confused myself by changing the type halfway through.
vmalloc() uses void * to match __builtin_return_address while most
of the rest of the kernel uses unsigned long to match _RET_IP_.
I'll submit another patch to convert vmalloc to use _RET_IP_.
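
For reference, the two conventions that collided, as a rough sketch (not
taken from this series):

	/* slab style: the caller is an unsigned long */
	p = __kmalloc_track_caller(size, gfp_mask, _RET_IP_);

	/* vmalloc style: the caller is a pointer */
	p = __vmalloc_node(size, 1, gfp_mask, node,
			__builtin_return_address(0));

Converting vmalloc to _RET_IP_ removes casts like the (void *)caller in
kvmalloc_node_caller().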

> As for the bulk-array interface. I have checked the:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git mm-bulk-rebase-v6r2
> 
> applied the patch that is in question + below one:
> 
> <snip>
> @@ -2503,25 +2498,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>         area->pages = pages;
>         area->nr_pages = nr_pages;
>  
> -       for (i = 0; i < area->nr_pages; i++) {
> -               struct page *page;
> -
> -               if (node == NUMA_NO_NODE)
> -                       page = alloc_page(gfp_mask);
> -               else
> -                       page = alloc_pages_node(node, gfp_mask, 0);
> -
> -               if (unlikely(!page)) {
> -                       /* Successfully allocated i pages, free them in __vfree() */
> -                       area->nr_pages = i;
> -                       atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
> -                       goto fail;
> -               }
> -               area->pages[i] = page;
> -               if (gfpflags_allow_blocking(gfp_mask))
> -                       cond_resched();
> +       ret = alloc_pages_bulk_array(gfp_mask, area->nr_pages, area->pages);
> +       if (ret == nr_pages)
> +               atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
> +       else {
> +               area->nr_pages = ret;
> +               goto fail;
>         }
> -       atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
> <snip>
> 
> single CPU, 4MB allocation, 1000000 avg: 70639437 usec
> single CPU, 4MB allocation, 1000000 avg: 89218654 usec
> 
> and now we get ~21% delta. That is very good :)

Amazing!  That's great news for Mel's patch as well as the kvmalloc
change.

(there's an entirely separate issue that they really shouldn't be
allocating 4MB of memory, but we can at least make what we have faster).


* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-23 12:39         ` Matthew Wilcox
  2021-03-23 13:39           ` Uladzislau Rezki
@ 2021-03-23 20:39           ` Uladzislau Rezki
  2021-03-24 18:41             ` Uladzislau Rezki
  1 sibling, 1 reply; 11+ messages in thread
From: Uladzislau Rezki @ 2021-03-23 20:39 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Uladzislau Rezki, Andrew Morton, Christoph Lameter, Pekka Enberg,
	David Rientjes, Joonsoo Kim, Vlastimil Babka, linux-mm,
	linux-kernel, Nicholas Piggin

> On Tue, Mar 23, 2021 at 01:04:36PM +0100, Uladzislau Rezki wrote:
> > On Mon, Mar 22, 2021 at 11:03:11PM +0000, Matthew Wilcox wrote:
> > > I suspect the vast majority of the time is spent calling alloc_pages_node()
> > > 1024 times.  Have you looked at Mel's patch to do ... well, exactly what
> > > vmalloc() wants?
> > > 
> > <snip>
> >          - __vmalloc_node_range
> >             - 45.25% __alloc_pages_nodemask
> >                - 37.59% get_page_from_freelist
> [...]
> >       - 44.61% 0xffffffffc047348d
> >          - __vunmap
> >             - 35.56% free_unref_page
> 
> Hmm!  I hadn't been thinking about the free side of things.
> Does this make a difference?
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 4f5f8c907897..61d5b769fea0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2277,16 +2277,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  	vm_remove_mappings(area, deallocate_pages);
>  
>  	if (deallocate_pages) {
> -		int i;
> -
> -		for (i = 0; i < area->nr_pages; i++) {
> -			struct page *page = area->pages[i];
> -
> -			BUG_ON(!page);
> -			__free_pages(page, 0);
> -		}
> +		release_pages(area->pages, area->nr_pages);
>  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
> -
>  		kvfree(area->pages);
>  	}
>
Same test. 4MB allocation on a single CPU:

default: loops: 1000000 avg: 93601889 usec
patch:   loops: 1000000 avg: 98217904 usec

<snip default>
            - __vunmap
               - 41.17% free_unref_page
                  - 28.42% free_pcppages_bulk
                     - 6.38% __mod_zone_page_state
                          4.79% check_preemption_disabled
                       2.63% __list_del_entry_valid
                       2.63% __list_add_valid
                  - 7.50% free_unref_page_commit
                       2.15% check_preemption_disabled
                       2.01% __list_add_valid
                    2.31% free_unref_page_prepare.part.86
                    0.70% free_pcp_prepare
<snip default>

<snip patch>
        - __vunmap
               - 45.36% release_pages
                  - 37.70% free_unref_page_list
                     - 24.70% free_pcppages_bulk
                        - 5.42% __mod_zone_page_state
                             4.23% check_preemption_disabled
                          2.31% __list_add_valid
                          2.07% __list_del_entry_valid
                     - 7.58% free_unref_page_commit
                          2.47% check_preemption_disabled
                          1.75% __list_add_valid
                       3.43% free_unref_page_prepare.part.86
                  - 2.39% mem_cgroup_uncharge_list
                       uncharge_page
<snip patch>

It is obvious that the default version is slightly better. It requires
fewer things to be done compared with the release_pages() variant.

> 
> release_pages does a bunch of checks that are unnecessary ... we could
> probably just do:
> 
> 		LIST_HEAD(pages_to_free);
> 
> 		for (i = 0; i < area->nr_pages; i++) {
> 			struct page *page = area->pages[i];
> 			if (put_page_testzero(page))
> 				list_add(&page->lru, &pages_to_free);
> 		}
> 		free_unref_page_list(&pages_to_free);
> 
> but let's see if the provided interface gets us the performance we want.
>  
I will test it tomorrow. At first glance it looks like a lighter version :)

--
Vlad Rezki


* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-23 14:07             ` Matthew Wilcox
@ 2021-03-23 20:49               ` Uladzislau Rezki
  0 siblings, 0 replies; 11+ messages in thread
From: Uladzislau Rezki @ 2021-03-23 20:49 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Uladzislau Rezki, Andrew Morton, Christoph Lameter, Pekka Enberg,
	David Rientjes, Joonsoo Kim, Vlastimil Babka, linux-mm,
	linux-kernel, Nicholas Piggin

On Tue, Mar 23, 2021 at 02:07:22PM +0000, Matthew Wilcox wrote:
> On Tue, Mar 23, 2021 at 02:39:48PM +0100, Uladzislau Rezki wrote:
> > On Tue, Mar 23, 2021 at 12:39:13PM +0000, Matthew Wilcox wrote:
> > > On Tue, Mar 23, 2021 at 01:04:36PM +0100, Uladzislau Rezki wrote:
> > > > On Mon, Mar 22, 2021 at 11:03:11PM +0000, Matthew Wilcox wrote:
> > > > > I suspect the vast majority of the time is spent calling alloc_pages_node()
> > > > > 1024 times.  Have you looked at Mel's patch to do ... well, exactly what
> > > > > vmalloc() wants?
> > > > > 
> > > > <snip>
> > > >          - __vmalloc_node_range
> > > >             - 45.25% __alloc_pages_nodemask
> > > >                - 37.59% get_page_from_freelist
> > > [...]
> > > >       - 44.61% 0xffffffffc047348d
> > > >          - __vunmap
> > > >             - 35.56% free_unref_page
> > > 
> > > Hmm!  I hadn't been thinking about the free side of things.
> > > Does this make a difference?
> > > 
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index 4f5f8c907897..61d5b769fea0 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -2277,16 +2277,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
> > >  	vm_remove_mappings(area, deallocate_pages);
> > >  
> > >  	if (deallocate_pages) {
> > > -		int i;
> > > -
> > > -		for (i = 0; i < area->nr_pages; i++) {
> > > -			struct page *page = area->pages[i];
> > > -
> > > -			BUG_ON(!page);
> > > -			__free_pages(page, 0);
> > > -		}
> > > +		release_pages(area->pages, area->nr_pages);
> > >  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
> > > -
> > >  		kvfree(area->pages);
> > >  	}
> > > 
> > Will check it today!
> > 
> > > release_pages does a bunch of checks that are unnecessary ... we could
> > > probably just do:
> > > 
> > > 		LIST_HEAD(pages_to_free);
> > > 
> > > 		for (i = 0; i < area->nr_pages; i++) {
> > > 			struct page *page = area->pages[i];
> > > 			if (put_page_testzero(page))
> > > 				list_add(&page->lru, &pages_to_free);
> > > 		}
> > > 		free_unref_page_list(&pages_to_free);
> > > 
> > > but let's see if the provided interface gets us the performance we want.
> > >  
> > > > Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > 
> > > > Thanks!
> > > 
> > > Thank you!
> > You are welcome. A small nit:
> > 
> >   CC      mm/vmalloc.o
> > mm/vmalloc.c: In function ‘__vmalloc_area_node’:
> > mm/vmalloc.c:2492:14: warning: passing argument 4 of ‘kvmalloc_node_caller’ makes integer from pointer without a cast [-Wint-conversion]
> >           area->caller);
> >           ~~~~^~~~~~~~
> > In file included from mm/vmalloc.c:12:
> > ./include/linux/mm.h:782:7: note: expected ‘long unsigned int’ but argument is of type ‘const void *’
> >  void *kvmalloc_node_caller(size_t size, gfp_t flags, int node,
> 
> Oh, thank you!  I confused myself by changing the type halfway through.
> vmalloc() uses void * to match __builtin_return_address while most
> of the rest of the kernel uses unsigned long to match _RET_IP_.
> I'll submit another patch to convert vmalloc to use _RET_IP_.
> 
Thanks!

> > As for the bulk-array interface. I have checked the:
> > 
> > git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git mm-bulk-rebase-v6r2
> > 
> > applied the patch that is in question + below one:
> > 
> > <snip>
> > @@ -2503,25 +2498,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
> >         area->pages = pages;
> >         area->nr_pages = nr_pages;
> >  
> > -       for (i = 0; i < area->nr_pages; i++) {
> > -               struct page *page;
> > -
> > -               if (node == NUMA_NO_NODE)
> > -                       page = alloc_page(gfp_mask);
> > -               else
> > -                       page = alloc_pages_node(node, gfp_mask, 0);
> > -
> > -               if (unlikely(!page)) {
> > -                       /* Successfully allocated i pages, free them in __vfree() */
> > -                       area->nr_pages = i;
> > -                       atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
> > -                       goto fail;
> > -               }
> > -               area->pages[i] = page;
> > -               if (gfpflags_allow_blocking(gfp_mask))
> > -                       cond_resched();
> > +       ret = alloc_pages_bulk_array(gfp_mask, area->nr_pages, area->pages);
> > +       if (ret == nr_pages)
> > +               atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
> > +       else {
> > +               area->nr_pages = ret;
> > +               goto fail;
> >         }
> > -       atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
> > <snip>
> > 
> > single CPU, 4MB allocation, 1000000 avg: 70639437 usec
> > single CPU, 4MB allocation, 1000000 avg: 89218654 usec
> > 
> > and now we get ~21% delta. That is very good :)
> 
> Amazing!  That's great news for Mel's patch as well as the kvmalloc
> change.
> 
Cool! I am glad if it gives some points to the bulk-array interface :)

--
Vlad Rezki


* Re: [PATCH 2/2] mm/vmalloc: Use kvmalloc to allocate the table of pages
  2021-03-23 20:39           ` Uladzislau Rezki
@ 2021-03-24 18:41             ` Uladzislau Rezki
  0 siblings, 0 replies; 11+ messages in thread
From: Uladzislau Rezki @ 2021-03-24 18:41 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Vlastimil Babka, linux-mm, linux-kernel,
	Nicholas Piggin, urezki

On Tue, Mar 23, 2021 at 09:39:24PM +0100, Uladzislau Rezki wrote:
> > On Tue, Mar 23, 2021 at 01:04:36PM +0100, Uladzislau Rezki wrote:
> > > On Mon, Mar 22, 2021 at 11:03:11PM +0000, Matthew Wilcox wrote:
> > > > I suspect the vast majority of the time is spent calling alloc_pages_node()
> > > > 1024 times.  Have you looked at Mel's patch to do ... well, exactly what
> > > > vmalloc() wants?
> > > > 
> > > <snip>
> > >          - __vmalloc_node_range
> > >             - 45.25% __alloc_pages_nodemask
> > >                - 37.59% get_page_from_freelist
> > [...]
> > >       - 44.61% 0xffffffffc047348d
> > >          - __vunmap
> > >             - 35.56% free_unref_page
> > 
> > Hmm!  I hadn't been thinking about the free side of things.
> > Does this make a difference?
> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 4f5f8c907897..61d5b769fea0 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2277,16 +2277,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
> >  	vm_remove_mappings(area, deallocate_pages);
> >  
> >  	if (deallocate_pages) {
> > -		int i;
> > -
> > -		for (i = 0; i < area->nr_pages; i++) {
> > -			struct page *page = area->pages[i];
> > -
> > -			BUG_ON(!page);
> > -			__free_pages(page, 0);
> > -		}
> > +		release_pages(area->pages, area->nr_pages);
> >  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
> > -
> >  		kvfree(area->pages);
> >  	}
> >
> Same test. 4MB allocation on a single CPU:
> 
> default: loops: 1000000 avg: 93601889 usec
> patch:   loops: 1000000 avg: 98217904 usec
> 
> <snip default>
>             - __vunmap
>                - 41.17% free_unref_page
>                   - 28.42% free_pcppages_bulk
>                      - 6.38% __mod_zone_page_state
>                           4.79% check_preemption_disabled
>                        2.63% __list_del_entry_valid
>                        2.63% __list_add_valid
>                   - 7.50% free_unref_page_commit
>                        2.15% check_preemption_disabled
>                        2.01% __list_add_valid
>                     2.31% free_unref_page_prepare.part.86
>                     0.70% free_pcp_prepare
> <snip default>
> 
> <snip patch>
>         - __vunmap
>                - 45.36% release_pages
>                   - 37.70% free_unref_page_list
>                      - 24.70% free_pcppages_bulk
>                         - 5.42% __mod_zone_page_state
>                              4.23% check_preemption_disabled
>                           2.31% __list_add_valid
>                           2.07% __list_del_entry_valid
>                      - 7.58% free_unref_page_commit
>                           2.47% check_preemption_disabled
>                           1.75% __list_add_valid
>                        3.43% free_unref_page_prepare.part.86
>                   - 2.39% mem_cgroup_uncharge_list
>                        uncharge_page
> <snip patch>
> 
> It is obvious that the default version is slightly better. It requires
> fewer things to be done compared with the release_pages() variant.
> 
> > 
> > release_pages does a bunch of checks that are unnecessary ... we could
> > probably just do:
> > 
> > 		LIST_HEAD(pages_to_free);
> > 
> > 		for (i = 0; i < area->nr_pages; i++) {
> > 			struct page *page = area->pages[i];
> > 			if (put_page_testzero(page))
> > 				list_add(&page->lru, &pages_to_free);
> > 		}
> > 		free_unref_page_list(&pages_to_free);
> > 
> > but let's see if the provided interface gets us the performance we want.
> >  
> I will test it tomorrow. At first glance it looks like a lighter version :)
> 
Here we go:

<snip>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4f5f8c907897..349024768ba6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2254,6 +2254,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
 static void __vunmap(const void *addr, int deallocate_pages)
 {
        struct vm_struct *area;
+       LIST_HEAD(pages_to_free);
 
        if (!addr)
                return;
@@ -2282,11 +2283,12 @@ static void __vunmap(const void *addr, int deallocate_pages)
                for (i = 0; i < area->nr_pages; i++) {
                        struct page *page = area->pages[i];
 
-                       BUG_ON(!page);
-                       __free_pages(page, 0);
+                       if (put_page_testzero(page))
+                               list_add(&page->lru, &pages_to_free);
                }
-               atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
 
+               free_unref_page_list(&pages_to_free);
+               atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
                kvfree(area->pages);
        }
<snip>

# patch
4MB allocation, single cpu, loops: 1000000 avg: 89065758 usec
4MB allocation, single cpu, loops: 1000000 avg: 90258523 usec
4MB allocation, single cpu, loops: 1000000 avg: 89363057 usec
4MB allocation, single cpu, loops: 1000000 avg: 89271685 usec
4MB allocation, single cpu, loops: 1000000 avg: 89247375 usec

# default
4MB allocation, single cpu, loops: 1000000 avg: 89258814 usec
4MB allocation, single cpu, loops: 1000000 avg: 89364194 usec
4MB allocation, single cpu, loops: 1000000 avg: 89226816 usec
4MB allocation, single cpu, loops: 1000000 avg: 89247360 usec
4MB allocation, single cpu, loops: 1000000 avg: 89330116 usec

I do not see any difference.

See below some profiling regarding cache misses:

<snip>
         - __vunmap
            - 32.15% free_unref_page_list                                                                                                                                               
               - 23.54% free_pcppages_bulk
                  - 6.33% __mod_zone_page_state
                       4.65% check_preemption_disabled
<snip>

free_unref_page_list():
       │        free_unref_page_list():
       │ffffffff8125152a:   mov    0x8(%rbp),%rax
 31.81 │ffffffff8125152e:   lea    0x8(%rbp),%r12                                                                                                                                        
       │ffffffff81251532:   mov    %rbp,%r14
 14.40 │ffffffff81251535:   lea    -0x8(%rax),%rbp  


(gdb) l *0xffffffff8125152e
0xffffffff8125152e is in free_unref_page_list (mm/page_alloc.c:3271).
3266            struct page *page, *next;
3267            unsigned long flags, pfn;
3268            int batch_count = 0;
3269
3270            /* Prepare pages for freeing */
3271            list_for_each_entry_safe(page, next, list, lru) {
3272                    pfn = page_to_pfn(page);
3273                    if (!free_unref_page_prepare(page, pfn))
3274                            list_del(&page->lru);
3275                    set_page_private(page, pfn);
(gdb)

free_pcppages_bulk():
       │        PageBuddy():
  0.59 │ffffffff8124f523:   mov        0x30(%rax),%edi
 13.59 │ffffffff8124f526:   and        $0xf0000080,%edi    

(gdb) l *0xffffffff8124f526
0xffffffff8124f526 is in free_pcppages_bulk (./include/linux/page-flags.h:742).
737
738     /*
739      * PageBuddy() indicates that the page is free and in the buddy system
740      * (see mm/page_alloc.c).
741      */
742     PAGE_TYPE_OPS(Buddy, buddy)
743
744     /*
745      * PageOffline() indicates that the page is logically offline although the
746      * containing section is online. (e.g. inflated in a balloon driver or
(gdb)

Looks like it would be good to have a free_pages_bulk_array() :)
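
Something along these lines perhaps, mirroring alloc_pages_bulk_array()
and the put_page_testzero()/free_unref_page_list() loop from earlier in
the thread (just a sketch, not an existing interface):

<snip>
void free_pages_bulk_array(unsigned long nr_pages, struct page **pages)
{
	LIST_HEAD(pages_to_free);
	unsigned long i;

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i];

		if (put_page_testzero(page))
			list_add(&page->lru, &pages_to_free);
	}
	free_unref_page_list(&pages_to_free);
}
<snip>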

--
Vlad Rezki

