* [PATCH 0/2] dmapool performance enhancements
@ 2022-04-28 20:27 kbusch
  2022-04-28 20:27 ` [PATCH 1/2] mm/dmapool: replace linked list with xarray kbusch
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: kbusch @ 2022-04-28 20:27 UTC (permalink / raw)
  To: linux-kernel, linux-mm; +Cc: willy, kernel-team, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Allocating and freeing blocks from the dmapool currently iterates a list
of all allocated pages. This series replaces that per-allocation and
per-free list traversal with a constant-time lookup.

Compared to the current kernel, perf record from running io_uring
benchmarks on nvme shows the dma_pool_alloc() cost cut roughly in half,
from 0.81% to 0.41%.

Keith Busch (2):
  mm/dmapool: replace linked list with xarray
  mm/dmapool: link blocks across pages

 mm/dmapool.c | 107 +++++++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 51 deletions(-)

-- 
2.30.2



* [PATCH 1/2] mm/dmapool: replace linked list with xarray
  2022-04-28 20:27 [PATCH 0/2] dmapool performance enhancements kbusch
@ 2022-04-28 20:27 ` kbusch
  2022-04-28 21:59   ` Matthew Wilcox
  2022-04-28 20:27 ` [PATCH 2/2] mm/dmapool: link blocks across pages kbusch
  2022-05-27 19:35 ` [PATCH 0/2] dmapool performance enhancements Tony Battersby
  2 siblings, 1 reply; 7+ messages in thread
From: kbusch @ 2022-04-28 20:27 UTC (permalink / raw)
  To: linux-kernel, linux-mm; +Cc: willy, kernel-team, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Store the cached dma pool pages in an xarray instead of a linked list.
This allows constant lookup time to free the page, lockless iteration
while displaying the attributes, and frees up space in 'struct dma_page'.
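
For reference, freeing a block then boils down to a single xa_load()
keyed by the page's base virtual address (a rough sketch, not part of
the diff below; the helper name is illustrative and the masking assumes
pool->allocation is a power of two):

	/* sketch: find the dma_page backing a given block */
	static struct dma_page *find_block_page(struct dma_pool *pool, void *vaddr)
	{
		unsigned long key = (unsigned long)vaddr & ~(pool->allocation - 1);

		return xa_load(&pool->pages, key);
	}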

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 mm/dmapool.c | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index a7eb5d0eb2da..ac93f58d4654 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -35,13 +35,14 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/xarray.h>
 
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
 #define DMAPOOL_DEBUG 1
 #endif
 
 struct dma_pool {		/* the pool */
-	struct list_head page_list;
+	struct xarray pages;
 	spinlock_t lock;
 	size_t size;
 	struct device *dev;
@@ -52,7 +53,6 @@ struct dma_pool {		/* the pool */
 };
 
 struct dma_page {		/* cacheable header for 'allocation' bytes */
-	struct list_head page_list;
 	void *vaddr;
 	dma_addr_t dma;
 	unsigned int in_use;
@@ -81,13 +81,12 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, cha
 	list_for_each_entry(pool, &dev->dma_pools, pools) {
 		unsigned pages = 0;
 		unsigned blocks = 0;
+		unsigned long i;
 
-		spin_lock_irq(&pool->lock);
-		list_for_each_entry(page, &pool->page_list, page_list) {
+		xa_for_each(&pool->pages, i, page) {
 			pages++;
 			blocks += page->in_use;
 		}
-		spin_unlock_irq(&pool->lock);
 
 		/* per-pool info, no real statistics yet */
 		temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n",
@@ -160,7 +159,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 
 	retval->dev = dev;
 
-	INIT_LIST_HEAD(&retval->page_list);
+	xa_init(&retval->pages);
 	spin_lock_init(&retval->lock);
 	retval->size = size;
 	retval->boundary = boundary;
@@ -252,7 +251,6 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
 	memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
 #endif
 	dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
-	list_del(&page->page_list);
 	kfree(page);
 }
 
@@ -266,8 +264,9 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
  */
 void dma_pool_destroy(struct dma_pool *pool)
 {
-	struct dma_page *page, *tmp;
+	struct dma_page *page;
 	bool empty = false;
+	unsigned long i;
 
 	if (unlikely(!pool))
 		return;
@@ -282,7 +281,8 @@ void dma_pool_destroy(struct dma_pool *pool)
 		device_remove_file(pool->dev, &dev_attr_pools);
 	mutex_unlock(&pools_reg_lock);
 
-	list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
+	xa_for_each(&pool->pages, i, page) {
+		xa_erase(&pool->pages, i);
 		if (is_page_busy(page)) {
 			if (pool->dev)
 				dev_err(pool->dev, "%s %s, %p busy\n", __func__,
@@ -291,12 +291,12 @@ void dma_pool_destroy(struct dma_pool *pool)
 				pr_err("%s %s, %p busy\n", __func__,
 				       pool->name, page->vaddr);
 			/* leak the still-in-use consistent memory */
-			list_del(&page->page_list);
 			kfree(page);
 		} else
 			pool_free_page(pool, page);
 	}
 
+	xa_destroy(&pool->pages);
 	kfree(pool);
 }
 EXPORT_SYMBOL(dma_pool_destroy);
@@ -316,13 +316,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 {
 	unsigned long flags;
 	struct dma_page *page;
+	unsigned long i;
 	size_t offset;
 	void *retval;
 
 	might_alloc(mem_flags);
 
 	spin_lock_irqsave(&pool->lock, flags);
-	list_for_each_entry(page, &pool->page_list, page_list) {
+	xa_for_each(&pool->pages, i, page) {
 		if (page->offset < pool->allocation)
 			goto ready;
 	}
@@ -334,9 +335,9 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	if (!page)
 		return NULL;
 
+	xa_store(&pool->pages, page->vaddr, page, mem_flags);
 	spin_lock_irqsave(&pool->lock, flags);
 
-	list_add(&page->page_list, &pool->page_list);
  ready:
 	page->in_use++;
 	offset = page->offset;
@@ -379,17 +380,9 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 }
 EXPORT_SYMBOL(dma_pool_alloc);
 
-static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+static struct dma_page *pool_find_page(struct dma_pool *pool, unsigned long vaddr)
 {
-	struct dma_page *page;
-
-	list_for_each_entry(page, &pool->page_list, page_list) {
-		if (dma < page->dma)
-			continue;
-		if ((dma - page->dma) < pool->allocation)
-			return page;
-	}
-	return NULL;
+	return xa_load(&pool->pages, vaddr & ~(pool->allocation - 1));
 }
 
 /**
@@ -408,7 +401,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	unsigned int offset;
 
 	spin_lock_irqsave(&pool->lock, flags);
-	page = pool_find_page(pool, dma);
+	page = pool_find_page(pool, vaddr);
 	if (!page) {
 		spin_unlock_irqrestore(&pool->lock, flags);
 		if (pool->dev)
-- 
2.30.2



* [PATCH 2/2] mm/dmapool: link blocks across pages
  2022-04-28 20:27 [PATCH 0/2] dmapool performance enhancements kbusch
  2022-04-28 20:27 ` [PATCH 1/2] mm/dmapool: replace linked list with xarray kbusch
@ 2022-04-28 20:27 ` kbusch
  2022-05-27 19:35 ` [PATCH 0/2] dmapool performance enhancements Tony Battersby
  2 siblings, 0 replies; 7+ messages in thread
From: kbusch @ 2022-04-28 20:27 UTC (permalink / raw)
  To: linux-kernel, linux-mm; +Cc: willy, kernel-team, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Once a page is allocated from a dma_pool, it is never released until the
pool is destroyed. That means it is safe to link free structures across
pages rather than only within them, and removes the need to scan the
'pages' xarray to find a page with free space.

This also allows all the blocks within a page to be allocated rather
than the last block being reserved, so it's more memory efficient.

A minor consequence of this is that the minimum block size is raised
from sizeof(int) to sizeof(void *), but I didn't find any existing
users requesting sizes that small anyway.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 mm/dmapool.c | 80 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 34 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index ac93f58d4654..34ec8d0e62ea 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -44,6 +44,7 @@
 struct dma_pool {		/* the pool */
 	struct xarray pages;
 	spinlock_t lock;
+	void *next_block;
 	size_t size;
 	struct device *dev;
 	size_t allocation;
@@ -56,7 +57,6 @@ struct dma_page {		/* cacheable header for 'allocation' bytes */
 	void *vaddr;
 	dma_addr_t dma;
 	unsigned int in_use;
-	unsigned int offset;
 };
 
 static DEFINE_MUTEX(pools_lock);
@@ -140,8 +140,8 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 
 	if (size == 0)
 		return NULL;
-	else if (size < 4)
-		size = 4;
+	else if (size < sizeof(void *))
+		size = sizeof(void *);
 
 	size = ALIGN(size, align);
 	allocation = max_t(size_t, size, PAGE_SIZE);
@@ -164,6 +164,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 	retval->size = size;
 	retval->boundary = boundary;
 	retval->allocation = allocation;
+	retval->next_block = NULL;
 
 	INIT_LIST_HEAD(&retval->pools);
 
@@ -201,18 +202,25 @@ EXPORT_SYMBOL(dma_pool_create);
 
 static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
 {
-	unsigned int offset = 0;
 	unsigned int next_boundary = pool->boundary;
+	unsigned int offset = pool->size;
+	void **v = page->vaddr;
+	void *next;
 
-	do {
-		unsigned int next = offset + pool->size;
-		if (unlikely((next + pool->size) >= next_boundary)) {
-			next = next_boundary;
+	while (offset < pool->allocation) {
+		if (offset + pool->size >= next_boundary) {
+			offset = next_boundary;
 			next_boundary += pool->boundary;
+			continue;
 		}
-		*(int *)(page->vaddr + offset) = next;
-		offset = next;
-	} while (offset < pool->allocation);
+
+		next = page->vaddr + offset;
+		*v = next;
+		v = next;
+
+		offset += pool->size;
+	}
+	*v = NULL;
 }
 
 static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
@@ -230,7 +238,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
 #endif
 		pool_initialise_page(pool, page);
 		page->in_use = 0;
-		page->offset = 0;
 	} else {
 		kfree(page);
 		page = NULL;
@@ -301,6 +308,11 @@ void dma_pool_destroy(struct dma_pool *pool)
 }
 EXPORT_SYMBOL(dma_pool_destroy);
 
+static struct dma_page *pool_find_page(struct dma_pool *pool, unsigned long vaddr)
+{
+	return xa_load(&pool->pages, vaddr & ~(pool->allocation - 1));
+}
+
 /**
  * dma_pool_alloc - get a block of consistent memory
  * @pool: dma pool that will produce the block
@@ -316,16 +328,16 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 {
 	unsigned long flags;
 	struct dma_page *page;
-	unsigned long i;
 	size_t offset;
 	void *retval;
 
 	might_alloc(mem_flags);
 
 	spin_lock_irqsave(&pool->lock, flags);
-	xa_for_each(&pool->pages, i, page) {
-		if (page->offset < pool->allocation)
-			goto ready;
+	retval = pool->next_block;
+	if (retval) {
+		page = pool_find_page(pool, (unsigned long)retval);
+		goto ready;
 	}
 
 	/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
@@ -335,21 +347,26 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	if (!page)
 		return NULL;
 
-	xa_store(&pool->pages, page->vaddr, page, mem_flags);
+	xa_store(&pool->pages, (unsigned long)page->vaddr, page,
+		 mem_flags);
+
 	spin_lock_irqsave(&pool->lock, flags);
+	*(void **)(page->vaddr + pool->allocation - pool->size) =
+						pool->next_block;
+	pool->next_block = page->vaddr;
+	retval = pool->next_block;
 
  ready:
 	page->in_use++;
-	offset = page->offset;
-	page->offset = *(int *)(page->vaddr + offset);
-	retval = offset + page->vaddr;
+	pool->next_block = *(void **)retval;
+	offset = retval - page->vaddr;
 	*handle = offset + page->dma;
 #ifdef	DMAPOOL_DEBUG
 	{
 		int i;
 		u8 *data = retval;
-		/* page->offset is stored in first 4 bytes */
-		for (i = sizeof(page->offset); i < pool->size; i++) {
+		/* next block link is stored in first pointer bytes */
+		for (i = sizeof(void *); i < pool->size; i++) {
 			if (data[i] == POOL_POISON_FREED)
 				continue;
 			if (pool->dev)
@@ -380,11 +397,6 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 }
 EXPORT_SYMBOL(dma_pool_alloc);
 
-static struct dma_page *pool_find_page(struct dma_pool *pool, unsigned long vaddr)
-{
-	return xa_load(&pool->pages, vaddr & ~(pool->allocation - 1));
-}
-
 /**
  * dma_pool_free - put block back into dma pool
  * @pool: the dma pool holding the block
@@ -401,7 +413,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	unsigned int offset;
 
 	spin_lock_irqsave(&pool->lock, flags);
-	page = pool_find_page(pool, vaddr);
+	page = pool_find_page(pool, (unsigned long)vaddr);
 	if (!page) {
 		spin_unlock_irqrestore(&pool->lock, flags);
 		if (pool->dev)
@@ -428,10 +440,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 		return;
 	}
 	{
-		unsigned int chain = page->offset;
-		while (chain < pool->allocation) {
-			if (chain != offset) {
-				chain = *(int *)(page->vaddr + chain);
+		void *v = pool->next_block;
+		while (v) {
+			if (v != vaddr) {
+				v = *(void **)v;
 				continue;
 			}
 			spin_unlock_irqrestore(&pool->lock, flags);
@@ -448,8 +460,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 #endif
 
 	page->in_use--;
-	*(int *)vaddr = page->offset;
-	page->offset = offset;
+	*(void **)vaddr = pool->next_block;
+	pool->next_block = vaddr;
 	/*
 	 * Resist a temptation to do
 	 *    if (!is_page_busy(page)) pool_free_page(pool, page);
-- 
2.30.2



* Re: [PATCH 1/2] mm/dmapool: replace linked list with xarray
  2022-04-28 20:27 ` [PATCH 1/2] mm/dmapool: replace linked list with xarray kbusch
@ 2022-04-28 21:59   ` Matthew Wilcox
  2022-04-29  1:41     ` Keith Busch
  0 siblings, 1 reply; 7+ messages in thread
From: Matthew Wilcox @ 2022-04-28 21:59 UTC (permalink / raw)
  To: kbusch; +Cc: linux-kernel, linux-mm, kernel-team

On Thu, Apr 28, 2022 at 02:27:13PM -0600, kbusch@kernel.org wrote:
> Store the cached dma pool pages in an xarray instead of a linked list.
> This allows constant lookup time to free the page, lockless iteration
> while displaying the attributes, and frees up space in 'struct dma_page'.

Hey Keith, this looks great, especially since there's more performance
you can squeeze out of it.

>  struct dma_pool {		/* the pool */
> -	struct list_head page_list;
> +	struct xarray pages;
>  	spinlock_t lock;

A further optimisation you could make is to use the xa_lock to protect
the rest of the data structure.  But that would be a subsequent patch.
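
Untested sketch of the shape, reusing the variables dma_pool_alloc()
already has; the xarray's own spinlock stands in for pool->lock:

	xa_lock_irqsave(&pool->pages, flags);
	xa_for_each(&pool->pages, i, page) {
		if (page->offset < pool->allocation)
			break;		/* found a page with a free block */
	}
	xa_unlock_irqrestore(&pool->pages, flags);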

> @@ -282,7 +281,8 @@ void dma_pool_destroy(struct dma_pool *pool)
>  		device_remove_file(pool->dev, &dev_attr_pools);
>  	mutex_unlock(&pools_reg_lock);
>  
> -	list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
> +	xa_for_each(&pool->pages, i, page) {
> +		xa_erase(&pool->pages, i);
>  		if (is_page_busy(page)) {
>  			if (pool->dev)
>  				dev_err(pool->dev, "%s %s, %p busy\n", __func__,
> @@ -291,12 +291,12 @@ void dma_pool_destroy(struct dma_pool *pool)
>  				pr_err("%s %s, %p busy\n", __func__,
>  				       pool->name, page->vaddr);
>  			/* leak the still-in-use consistent memory */
> -			list_del(&page->page_list);
>  			kfree(page);
>  		} else
>  			pool_free_page(pool, page);
>  	}
>  
> +	xa_destroy(&pool->pages);

If you're erasing the entries as you go, you don't need to call
xa_destroy().  Contrariwise, if you call xa_destroy(), you don't need to
call xa_erase().  I'd probably just call xa_destroy() at the end as it's
less work.
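
i.e. the teardown could end up looking something like this (sketch,
with the error reporting elided):

	xa_for_each(&pool->pages, i, page) {
		if (is_page_busy(page)) {
			/* warn as before, then leak the in-use memory */
			kfree(page);
		} else
			pool_free_page(pool, page);
	}
	xa_destroy(&pool->pages);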

> @@ -316,13 +316,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
>  {
>  	unsigned long flags;
>  	struct dma_page *page;
> +	unsigned long i;
>  	size_t offset;
>  	void *retval;
>  
>  	might_alloc(mem_flags);
>  
>  	spin_lock_irqsave(&pool->lock, flags);
> -	list_for_each_entry(page, &pool->page_list, page_list) {
> +	xa_for_each(&pool->pages, i, page) {
>  		if (page->offset < pool->allocation)
>  			goto ready;
>  	}

A further optimisation you could do is use xarray search marks to
search for only pages which have free entries.
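
Sketch, with a made-up mark name:

	#define POOL_PAGE_FREE	XA_MARK_0

	/* in dma_pool_alloc(), instead of walking every page: */
	page = xa_find(&pool->pages, &i, ULONG_MAX, POOL_PAGE_FREE);

	/* clear the mark when a page's last free block is handed out;
	 * set it again when dma_pool_free() returns a block to it */
	xa_clear_mark(&pool->pages, i, POOL_PAGE_FREE);
	xa_set_mark(&pool->pages, i, POOL_PAGE_FREE);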

> +	xa_store(&pool->pages, page->vaddr, page, mem_flags);

Oof.  The XArray isn't good at such sparse allocations.  You can improve
it (by a significant amount) by shifting the vaddr by PAGE_SHIFT bits.
Should save you two nodes of tree height and thus two cache lines per
lookup.
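
i.e. something like (sketch):

	xa_store(&pool->pages, (unsigned long)page->vaddr >> PAGE_SHIFT,
		 page, mem_flags);

	page = xa_load(&pool->pages,
		       ((unsigned long)vaddr & ~(pool->allocation - 1))
							>> PAGE_SHIFT);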


* Re: [PATCH 1/2] mm/dmapool: replace linked list with xarray
  2022-04-28 21:59   ` Matthew Wilcox
@ 2022-04-29  1:41     ` Keith Busch
  0 siblings, 0 replies; 7+ messages in thread
From: Keith Busch @ 2022-04-29  1:41 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-kernel, linux-mm, kernel-team

On Thu, Apr 28, 2022 at 10:59:49PM +0100, Matthew Wilcox wrote:
> On Thu, Apr 28, 2022 at 02:27:13PM -0600, kbusch@kernel.org wrote:
> > @@ -316,13 +316,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
> >  {
> >  	unsigned long flags;
> >  	struct dma_page *page;
> > +	unsigned long i;
> >  	size_t offset;
> >  	void *retval;
> >  
> >  	might_alloc(mem_flags);
> >  
> >  	spin_lock_irqsave(&pool->lock, flags);
> > -	list_for_each_entry(page, &pool->page_list, page_list) {
> > +	xa_for_each(&pool->pages, i, page) {
> >  		if (page->offset < pool->allocation)
> >  			goto ready;
> >  	}
> 
> A further optimisation you could do is use xarray search marks to
> search for only pages which have free entries.

That's an interesting idea. I didn't consider setting marks since patch 2
replaces this search with essentially a stack pop. If a marked entry can be
returned in a similar time, though, I could drop patch 2. I can't tell from the
xarray code whether that operation is in the same ballpark, so I'll just rerun
the benchmark. :)


* Re: [PATCH 0/2] dmapool performance enhancements
  2022-04-28 20:27 [PATCH 0/2] dmapool performance enhancements kbusch
  2022-04-28 20:27 ` [PATCH 1/2] mm/dmapool: replace linked list with xarray kbusch
  2022-04-28 20:27 ` [PATCH 2/2] mm/dmapool: link blocks across pages kbusch
@ 2022-05-27 19:35 ` Tony Battersby
  2022-05-27 21:01   ` Keith Busch
  2 siblings, 1 reply; 7+ messages in thread
From: Tony Battersby @ 2022-05-27 19:35 UTC (permalink / raw)
  To: kbusch; +Cc: kernel-team, linux-kernel, linux-mm, willy

I posted a similar patch series back in 2018:

https://lore.kernel.org/linux-mm/73ec1f52-d758-05df-fb6a-41d269e910d0@cybernetics.com/
https://lore.kernel.org/linux-mm/15ff502d-d840-1003-6c45-bc17f0d81262@cybernetics.com/
https://lore.kernel.org/linux-mm/1288e597-a67a-25b3-b7c6-db883ca67a25@cybernetics.com/


I initially used a red-black tree keyed by the DMA address, but then for
v2 of the patchset I put the dma pool info directly into struct page and
used virt_to_page() to get at it.  But it turned out that was a bad idea
because not all architectures have struct page backing
dma_alloc_coherent():

https://lore.kernel.org/linux-kernel/20181206013054.GI6707@atomide.com/

I intended to go back and resubmit the red-black tree version, but I was
too busy at the time and forgot about it.  A few days ago I finally
decided to update the patches and submit them upstream.  I found your
recent dmapool xarray patches by searching the mailing list archive to
see if anyone else was working on something similar.
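
The lookup in the red-black tree version is a standard tree walk keyed
by the DMA address, roughly like this (a sketch; the field names are
illustrative rather than exactly what my patches use):

	static struct dma_page *pool_find_page(struct dma_pool *pool,
					       dma_addr_t dma)
	{
		struct rb_node *n = pool->page_tree.rb_node;

		while (n) {
			struct dma_page *page =
				rb_entry(n, struct dma_page, node);

			if (dma < page->dma)
				n = n->rb_left;
			else if (dma - page->dma >= pool->allocation)
				n = n->rb_right;
			else
				return page;
		}
		return NULL;
	}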

Using the following as a benchmark:

modprobe mpt3sas
drivers/scsi/mpt3sas/mpt3sas_base.c
_base_allocate_chain_dma_pool
loop dma_pool_alloc(ioc->chain_dma_pool)

rmmod mpt3sas
drivers/scsi/mpt3sas/mpt3sas_base.c
_base_release_memory_pools()
loop dma_pool_free(ioc->chain_dma_pool)

Here are the benchmark results showing the speedup from the patchsets:

        modprobe  rmmod
orig          1x     1x
xarray      5.2x   186x
rbtree      9.3x   269x

It looks like my red-black tree version is faster than the v1 of the
xarray patch on this benchmark at least, although the mpt3sas usage of
dmapool is hardly typical.  I will try to get some testing done on my
patchset and post it next week.

Tony Battersby
Cybernetics



* Re: [PATCH 0/2] dmapool performance enhancements
  2022-05-27 19:35 ` [PATCH 0/2] dmapool performance enhancements Tony Battersby
@ 2022-05-27 21:01   ` Keith Busch
  0 siblings, 0 replies; 7+ messages in thread
From: Keith Busch @ 2022-05-27 21:01 UTC (permalink / raw)
  To: Tony Battersby; +Cc: kernel-team, linux-kernel, linux-mm, willy

On Fri, May 27, 2022 at 03:35:47PM -0400, Tony Battersby wrote:
> I posted a similar patch series back in 2018:
> 
> https://lore.kernel.org/linux-mm/73ec1f52-d758-05df-fb6a-41d269e910d0@cybernetics.com/
> https://lore.kernel.org/linux-mm/15ff502d-d840-1003-6c45-bc17f0d81262@cybernetics.com/
> https://lore.kernel.org/linux-mm/1288e597-a67a-25b3-b7c6-db883ca67a25@cybernetics.com/
> 
> 
> I initially used a red-black tree keyed by the DMA address, but then for
> v2 of the patchset I put the dma pool info directly into struct page and
> used virt_to_page() to get at it.  But it turned out that was a bad idea
> because not all architectures have struct page backing
> dma_alloc_coherent():
> 
> https://lore.kernel.org/linux-kernel/20181206013054.GI6707@atomide.com/
> 
> I intended to go back and resubmit the red-black tree version, but I was
> too busy at the time and forgot about it.  A few days ago I finally
> decided to update the patches and submit them upstream.  I found your
> recent dmapool xarray patches by searching the mailing list archive to
> see if anyone else was working on something similar.
> 
> Using the following as a benchmark:
> 
> modprobe mpt3sas
> drivers/scsi/mpt3sas/mpt3sas_base.c
> _base_allocate_chain_dma_pool
> loop dma_pool_alloc(ioc->chain_dma_pool)
> 
> rmmod mpt3sas
> drivers/scsi/mpt3sas/mpt3sas_base.c
> _base_release_memory_pools()
> loop dma_pool_free(ioc->chain_dma_pool)
> 
> Here are the benchmark results showing the speedup from the patchsets:
> 
>         modprobe  rmmod
> orig          1x     1x
> xarray      5.2x   186x
> rbtree      9.3x   269x
> 
> It looks like my red-black tree version is faster than the v1 of the
> xarray patch on this benchmark at least, although the mpt3sas usage of
> dmapool is hardly typical.  I will try to get some testing done on my
> patchset and post it next week.

Thanks for the info.

Just comparing with xarray, I actually found that the list was still faster
until you get >100 pages in the pool, after which xarray becomes the clear
winner.

But it turns out that many real use cases never allocate that many pages, so
I'm taking this in a different direction: replacing the lookup structures with
an intrusive stack. That is safe to do since pages are never freed for the
lifetime of the pool, and it's by far faster than anything else. The downside
is that I'd need to increase the size of the smallest allowable pool block,
but I think that's okay.
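
The core of it is just an embedded free list: each free block stores a
pointer to the next free block, which may live in another page, so both
paths reduce to a push or a pop (simplified from patch 2 of this series):

	/* alloc: pop the next free block */
	retval = pool->next_block;
	if (retval)
		pool->next_block = *(void **)retval;

	/* free: push the block back */
	*(void **)vaddr = pool->next_block;
	pool->next_block = vaddr;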

Anyway I was planning to post this new idea soon. My reasons for wanting a
faster dma pool are still in the works, though, so I'm just trying to sort out
those patches before returning to this one.

