linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
@ 2023-01-24 12:43 Felix Fietkau
  2023-01-24 14:11 ` Ilias Apalodimas
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-24 12:43 UTC (permalink / raw)
  To: netdev, Jesper Dangaard Brouer, Ilias Apalodimas,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Lorenzo Bianconi, linux-kernel

While testing fragmented page_pool allocation in the mt76 driver, I was able
to reliably trigger page refcount underflow issues, which did not occur with
full-page page_pool allocation.
It appears to me, that handling refcounting in two separate counters
(page->pp_frag_count and page refcount) is racy when page refcount gets
incremented by code dealing with skb fragments directly, and
page_pool_return_skb_page is called multiple times for the same fragment.

Dropping page->pp_frag_count and relying entirely on the page refcount makes
these underflow issues and crashes go away.

Cc: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 include/linux/mm_types.h | 17 +++++------------
 include/net/page_pool.h  | 19 ++++---------------
 net/core/page_pool.c     | 12 ++++--------
 3 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9757067c3053..96ec3b19a86d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -125,18 +125,11 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			union {
-				/**
-				 * dma_addr_upper: might require a 64-bit
-				 * value on 32-bit architectures.
-				 */
-				unsigned long dma_addr_upper;
-				/**
-				 * For frag page support, not supported in
-				 * 32-bit architectures with 64-bit DMA.
-				 */
-				atomic_long_t pp_frag_count;
-			};
+			/**
+			 * dma_addr_upper: might require a 64-bit
+			 * value on 32-bit architectures.
+			 */
+			unsigned long dma_addr_upper;
 		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 813c93499f20..28e1fdbdcd53 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -279,14 +279,14 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
 
 static inline void page_pool_fragment_page(struct page *page, long nr)
 {
-	atomic_long_set(&page->pp_frag_count, nr);
+	page_ref_add(page, nr);
 }
 
 static inline long page_pool_defrag_page(struct page *page, long nr)
 {
 	long ret;
 
-	/* If nr == pp_frag_count then we have cleared all remaining
+	/* If nr == page_ref_count then we have cleared all remaining
 	 * references to the page. No need to actually overwrite it, instead
 	 * we can leave this to be overwritten by the calling function.
 	 *
@@ -295,22 +295,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
 	 * especially when dealing with a page that may be partitioned
 	 * into only 2 or 3 pieces.
 	 */
-	if (atomic_long_read(&page->pp_frag_count) == nr)
+	if (page_ref_count(page) == nr)
 		return 0;
 
-	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+	ret = page_ref_sub_return(page, nr);
 	WARN_ON(ret < 0);
 	return ret;
 }
 
-static inline bool page_pool_is_last_frag(struct page_pool *pool,
-					  struct page *page)
-{
-	/* If fragments aren't enabled or count is 0 we were the last user */
-	return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-	       (page_pool_defrag_page(page, 1) == 0);
-}
-
 static inline void page_pool_put_page(struct page_pool *pool,
 				      struct page *page,
 				      unsigned int dma_sync_size,
@@ -320,9 +312,6 @@ static inline void page_pool_put_page(struct page_pool *pool,
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	if (!page_pool_is_last_frag(pool, page))
-		return;
-
 	page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
 #endif
 }
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 9b203d8660e4..0defcadae225 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -25,7 +25,7 @@
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
 
-#define BIAS_MAX	LONG_MAX
+#define BIAS_MAX(pool)	(PAGE_SIZE << ((pool)->p.order))
 
 #ifdef CONFIG_PAGE_POOL_STATS
 /* alloc_stat_inc is intended to be used in softirq context */
@@ -619,10 +619,6 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	for (i = 0; i < count; i++) {
 		struct page *page = virt_to_head_page(data[i]);
 
-		/* It is not the last user for the page frag case */
-		if (!page_pool_is_last_frag(pool, page))
-			continue;
-
 		page = __page_pool_put_page(pool, page, -1, false);
 		/* Approved for bulk recycling in ptr_ring cache */
 		if (page)
@@ -659,7 +655,7 @@ EXPORT_SYMBOL(page_pool_put_page_bulk);
 static struct page *page_pool_drain_frag(struct page_pool *pool,
 					 struct page *page)
 {
-	long drain_count = BIAS_MAX - pool->frag_users;
+	long drain_count = BIAS_MAX(pool) - pool->frag_users;
 
 	/* Some user is still using the page frag */
 	if (likely(page_pool_defrag_page(page, drain_count)))
@@ -678,7 +674,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
 
 static void page_pool_free_frag(struct page_pool *pool)
 {
-	long drain_count = BIAS_MAX - pool->frag_users;
+	long drain_count = BIAS_MAX(pool) - pool->frag_users;
 	struct page *page = pool->frag_page;
 
 	pool->frag_page = NULL;
@@ -724,7 +720,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
 		pool->frag_users = 1;
 		*offset = 0;
 		pool->frag_offset = size;
-		page_pool_fragment_page(page, BIAS_MAX);
+		page_pool_fragment_page(page, BIAS_MAX(pool));
 		return page;
 	}
 
-- 
2.39.0


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 12:43 [PATCH] net: page_pool: fix refcounting issues with fragmented allocation Felix Fietkau
@ 2023-01-24 14:11 ` Ilias Apalodimas
  2023-01-24 15:57   ` Alexander H Duyck
  2023-01-24 17:22   ` Felix Fietkau
  0 siblings, 2 replies; 43+ messages in thread
From: Ilias Apalodimas @ 2023-01-24 14:11 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Alexander Duyck, Yunsheng Lin

Hi Felix,

++cc Alexander and Yunsheng.

Thanks for the report

On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>
> While testing fragmented page_pool allocation in the mt76 driver, I was able
> to reliably trigger page refcount underflow issues, which did not occur with
> full-page page_pool allocation.
> It appears to me, that handling refcounting in two separate counters
> (page->pp_frag_count and page refcount) is racy when page refcount gets
> incremented by code dealing with skb fragments directly, and
> page_pool_return_skb_page is called multiple times for the same fragment.
>
> Dropping page->pp_frag_count and relying entirely on the page refcount makes
> these underflow issues and crashes go away.
>

This has been discussed here [1].  TL;DR changing this to page
refcount might blow up in other colorful ways.  Can we look closer and
figure out why the underflow happens?

[1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/

Thanks
/Ilias

> Cc: Lorenzo Bianconi <lorenzo@kernel.org>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
>  include/linux/mm_types.h | 17 +++++------------
>  include/net/page_pool.h  | 19 ++++---------------
>  net/core/page_pool.c     | 12 ++++--------
>  3 files changed, 13 insertions(+), 35 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 9757067c3053..96ec3b19a86d 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -125,18 +125,11 @@ struct page {
>                         struct page_pool *pp;
>                         unsigned long _pp_mapping_pad;
>                         unsigned long dma_addr;
> -                       union {
> -                               /**
> -                                * dma_addr_upper: might require a 64-bit
> -                                * value on 32-bit architectures.
> -                                */
> -                               unsigned long dma_addr_upper;
> -                               /**
> -                                * For frag page support, not supported in
> -                                * 32-bit architectures with 64-bit DMA.
> -                                */
> -                               atomic_long_t pp_frag_count;
> -                       };
> +                       /**
> +                        * dma_addr_upper: might require a 64-bit
> +                        * value on 32-bit architectures.
> +                        */
> +                       unsigned long dma_addr_upper;
>                 };
>                 struct {        /* Tail pages of compound page */
>                         unsigned long compound_head;    /* Bit zero is set */
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 813c93499f20..28e1fdbdcd53 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -279,14 +279,14 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
>
>  static inline void page_pool_fragment_page(struct page *page, long nr)
>  {
> -       atomic_long_set(&page->pp_frag_count, nr);
> +       page_ref_add(page, nr);
>  }
>
>  static inline long page_pool_defrag_page(struct page *page, long nr)
>  {
>         long ret;
>
> -       /* If nr == pp_frag_count then we have cleared all remaining
> +       /* If nr == page_ref_count then we have cleared all remaining
>          * references to the page. No need to actually overwrite it, instead
>          * we can leave this to be overwritten by the calling function.
>          *
> @@ -295,22 +295,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
>          * especially when dealing with a page that may be partitioned
>          * into only 2 or 3 pieces.
>          */
> -       if (atomic_long_read(&page->pp_frag_count) == nr)
> +       if (page_ref_count(page) == nr)
>                 return 0;
>
> -       ret = atomic_long_sub_return(nr, &page->pp_frag_count);
> +       ret = page_ref_sub_return(page, nr);
>         WARN_ON(ret < 0);
>         return ret;
>  }
>
> -static inline bool page_pool_is_last_frag(struct page_pool *pool,
> -                                         struct page *page)
> -{
> -       /* If fragments aren't enabled or count is 0 we were the last user */
> -       return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> -              (page_pool_defrag_page(page, 1) == 0);
> -}
> -
>  static inline void page_pool_put_page(struct page_pool *pool,
>                                       struct page *page,
>                                       unsigned int dma_sync_size,
> @@ -320,9 +312,6 @@ static inline void page_pool_put_page(struct page_pool *pool,
>          * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
>          */
>  #ifdef CONFIG_PAGE_POOL
> -       if (!page_pool_is_last_frag(pool, page))
> -               return;
> -
>         page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
>  #endif
>  }
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 9b203d8660e4..0defcadae225 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -25,7 +25,7 @@
>  #define DEFER_TIME (msecs_to_jiffies(1000))
>  #define DEFER_WARN_INTERVAL (60 * HZ)
>
> -#define BIAS_MAX       LONG_MAX
> +#define BIAS_MAX(pool) (PAGE_SIZE << ((pool)->p.order))
>
>  #ifdef CONFIG_PAGE_POOL_STATS
>  /* alloc_stat_inc is intended to be used in softirq context */
> @@ -619,10 +619,6 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
>         for (i = 0; i < count; i++) {
>                 struct page *page = virt_to_head_page(data[i]);
>
> -               /* It is not the last user for the page frag case */
> -               if (!page_pool_is_last_frag(pool, page))
> -                       continue;
> -
>                 page = __page_pool_put_page(pool, page, -1, false);
>                 /* Approved for bulk recycling in ptr_ring cache */
>                 if (page)
> @@ -659,7 +655,7 @@ EXPORT_SYMBOL(page_pool_put_page_bulk);
>  static struct page *page_pool_drain_frag(struct page_pool *pool,
>                                          struct page *page)
>  {
> -       long drain_count = BIAS_MAX - pool->frag_users;
> +       long drain_count = BIAS_MAX(pool) - pool->frag_users;
>
>         /* Some user is still using the page frag */
>         if (likely(page_pool_defrag_page(page, drain_count)))
> @@ -678,7 +674,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
>
>  static void page_pool_free_frag(struct page_pool *pool)
>  {
> -       long drain_count = BIAS_MAX - pool->frag_users;
> +       long drain_count = BIAS_MAX(pool) - pool->frag_users;
>         struct page *page = pool->frag_page;
>
>         pool->frag_page = NULL;
> @@ -724,7 +720,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>                 pool->frag_users = 1;
>                 *offset = 0;
>                 pool->frag_offset = size;
> -               page_pool_fragment_page(page, BIAS_MAX);
> +               page_pool_fragment_page(page, BIAS_MAX(pool));
>                 return page;
>         }
>
> --
> 2.39.0
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 14:11 ` Ilias Apalodimas
@ 2023-01-24 15:57   ` Alexander H Duyck
  2023-01-24 16:59     ` Felix Fietkau
  2023-01-26 10:31     ` Ilias Apalodimas
  2023-01-24 17:22   ` Felix Fietkau
  1 sibling, 2 replies; 43+ messages in thread
From: Alexander H Duyck @ 2023-01-24 15:57 UTC (permalink / raw)
  To: Ilias Apalodimas, Felix Fietkau
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> Hi Felix,
> 
> ++cc Alexander and Yunsheng.
> 
> Thanks for the report
> 
> On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > 
> > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > to reliably trigger page refcount underflow issues, which did not occur with
> > full-page page_pool allocation.
> > It appears to me, that handling refcounting in two separate counters
> > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > incremented by code dealing with skb fragments directly, and
> > page_pool_return_skb_page is called multiple times for the same fragment.
> > 
> > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > these underflow issues and crashes go away.
> > 
> 
> This has been discussed here [1].  TL;DR changing this to page
> refcount might blow up in other colorful ways.  Can we look closer and
> figure out why the underflow happens?
> 
> [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> 
> Thanks
> /Ilias
> 
> 

The logic should be safe in terms of the page pool itself as it should
be holding one reference to the page while the pp_frag_count is non-
zero. That one reference is what keeps the two halfs in sync as the
page shouldn't be able to be freed until we exhaust the pp_frag_count.

To have an underflow there are two possible scenarios. One is that
either put_page or free_page is being called somewhere that the
page_pool freeing functions should be used. The other possibility is
that a pp_frag_count reference was taken somewhere a page reference
should have.

Do we have a backtrace for the spots that are showing this underrun? If
nothing else we may want to look at tracking down the spots that are
freeing the page pool pages via put_page or free_page to determine what
paths these pages are taking.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 15:57   ` Alexander H Duyck
@ 2023-01-24 16:59     ` Felix Fietkau
  2023-01-26 10:31     ` Ilias Apalodimas
  1 sibling, 0 replies; 43+ messages in thread
From: Felix Fietkau @ 2023-01-24 16:59 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 24.01.23 16:57, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
>> Hi Felix,
>> 
>> ++cc Alexander and Yunsheng.
>> 
>> Thanks for the report
>> 
>> On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > 
>> > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > to reliably trigger page refcount underflow issues, which did not occur with
>> > full-page page_pool allocation.
>> > It appears to me, that handling refcounting in two separate counters
>> > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > incremented by code dealing with skb fragments directly, and
>> > page_pool_return_skb_page is called multiple times for the same fragment.
>> > 
>> > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > these underflow issues and crashes go away.
>> > 
>> 
>> This has been discussed here [1].  TL;DR changing this to page
>> refcount might blow up in other colorful ways.  Can we look closer and
>> figure out why the underflow happens?
>> 
>> [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
>> 
>> Thanks
>> /Ilias
>> 
>> 
> 
> The logic should be safe in terms of the page pool itself as it should
> be holding one reference to the page while the pp_frag_count is non-
> zero. That one reference is what keeps the two halfs in sync as the
> page shouldn't be able to be freed until we exhaust the pp_frag_count.
> 
> To have an underflow there are two possible scenarios. One is that
> either put_page or free_page is being called somewhere that the
> page_pool freeing functions should be used. The other possibility is
> that a pp_frag_count reference was taken somewhere a page reference
> should have.
> 
> Do we have a backtrace for the spots that are showing this underrun? If
> nothing else we may want to look at tracking down the spots that are
> freeing the page pool pages via put_page or free_page to determine what
> paths these pages are taking.
Here's an example of the kind of traces that I was seeing with v6.1:
https://nbd.name/p/61a6617e
On v5.15 I also occasionally got traces like this:
https://nbd.name/p/0b9e4f0d

 From what I can tell, it also triggered the warning that shows up when 
page->pp_frag_count underflows. Unfortunately these traces don't 
directly point to the place where things go wrong.
I do wonder if the pp_frag_count is maybe racy when we have a mix of 
get_page + page_pool_put_page calls.

In case you're wondering what I was doing to trigger the crash: I simply 
create 4 wireless client mode interfaces on the same card and pushed TCP 
traffic from an AP to all 4 simultaneously. I can trigger it pretty much 
immediately after TCP traffic ramps up.

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 14:11 ` Ilias Apalodimas
  2023-01-24 15:57   ` Alexander H Duyck
@ 2023-01-24 17:22   ` Felix Fietkau
  2023-01-24 21:10     ` Alexander H Duyck
  2023-01-26 10:32     ` [PATCH] net: page_pool: fix refcounting issues with fragmented allocation Ilias Apalodimas
  1 sibling, 2 replies; 43+ messages in thread
From: Felix Fietkau @ 2023-01-24 17:22 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Alexander Duyck, Yunsheng Lin

On 24.01.23 15:11, Ilias Apalodimas wrote:
> Hi Felix,
> 
> ++cc Alexander and Yunsheng.
> 
> Thanks for the report
> 
> On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>
>> While testing fragmented page_pool allocation in the mt76 driver, I was able
>> to reliably trigger page refcount underflow issues, which did not occur with
>> full-page page_pool allocation.
>> It appears to me, that handling refcounting in two separate counters
>> (page->pp_frag_count and page refcount) is racy when page refcount gets
>> incremented by code dealing with skb fragments directly, and
>> page_pool_return_skb_page is called multiple times for the same fragment.
>>
>> Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> these underflow issues and crashes go away.
>>
> 
> This has been discussed here [1].  TL;DR changing this to page
> refcount might blow up in other colorful ways.  Can we look closer and
> figure out why the underflow happens?
I don't see how the approch taken in my patch would blow up. From what I 
can tell, it should be fairly close to how refcount is handled in 
page_frag_alloc. The main improvement it adds is to prevent it from 
blowing up if pool-allocated fragments get shared across multiple skbs 
with corresponding get_page and page_pool_return_skb_page calls.

- Felix


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 17:22   ` Felix Fietkau
@ 2023-01-24 21:10     ` Alexander H Duyck
  2023-01-24 21:30       ` Felix Fietkau
  2023-01-26 10:32     ` [PATCH] net: page_pool: fix refcounting issues with fragmented allocation Ilias Apalodimas
  1 sibling, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2023-01-24 21:10 UTC (permalink / raw)
  To: Felix Fietkau, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> On 24.01.23 15:11, Ilias Apalodimas wrote:
> > Hi Felix,
> > 
> > ++cc Alexander and Yunsheng.
> > 
> > Thanks for the report
> > 
> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > 
> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > to reliably trigger page refcount underflow issues, which did not occur with
> > > full-page page_pool allocation.
> > > It appears to me, that handling refcounting in two separate counters
> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > incremented by code dealing with skb fragments directly, and
> > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > 
> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > these underflow issues and crashes go away.
> > > 
> > 
> > This has been discussed here [1].  TL;DR changing this to page
> > refcount might blow up in other colorful ways.  Can we look closer and
> > figure out why the underflow happens?
> I don't see how the approch taken in my patch would blow up. From what I 
> can tell, it should be fairly close to how refcount is handled in 
> page_frag_alloc. The main improvement it adds is to prevent it from 
> blowing up if pool-allocated fragments get shared across multiple skbs 
> with corresponding get_page and page_pool_return_skb_page calls.
> 
> - Felix
> 

Do you have the patch available to review as an RFC? From what I am
seeing it looks like you are underrunning on the pp_frag_count itself.
I would suspect the issue to be something like starting with a bad
count in terms of the total number of references, or deducing the wrong
amount when you finally free the page assuming you are tracking your
frag count using a non-atomic value in the driver.

Thanks,

- Alex

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 21:10     ` Alexander H Duyck
@ 2023-01-24 21:30       ` Felix Fietkau
  2023-01-25 17:11         ` Alexander H Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-24 21:30 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 24.01.23 22:10, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > Hi Felix,
>> > 
>> > ++cc Alexander and Yunsheng.
>> > 
>> > Thanks for the report
>> > 
>> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > 
>> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > full-page page_pool allocation.
>> > > It appears to me, that handling refcounting in two separate counters
>> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > incremented by code dealing with skb fragments directly, and
>> > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > 
>> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > these underflow issues and crashes go away.
>> > > 
>> > 
>> > This has been discussed here [1].  TL;DR changing this to page
>> > refcount might blow up in other colorful ways.  Can we look closer and
>> > figure out why the underflow happens?
>> I don't see how the approch taken in my patch would blow up. From what I 
>> can tell, it should be fairly close to how refcount is handled in 
>> page_frag_alloc. The main improvement it adds is to prevent it from 
>> blowing up if pool-allocated fragments get shared across multiple skbs 
>> with corresponding get_page and page_pool_return_skb_page calls.
>> 
>> - Felix
>> 
> 
> Do you have the patch available to review as an RFC? From what I am
> seeing it looks like you are underrunning on the pp_frag_count itself.
> I would suspect the issue to be something like starting with a bad
> count in terms of the total number of references, or deducing the wrong
> amount when you finally free the page assuming you are tracking your
> frag count using a non-atomic value in the driver.
The driver patches for page pool are here:
https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/

They are also applied in my mt76 tree at:
https://github.com/nbd168/wireless

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 21:30       ` Felix Fietkau
@ 2023-01-25 17:11         ` Alexander H Duyck
  2023-01-25 17:32           ` Felix Fietkau
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2023-01-25 17:11 UTC (permalink / raw)
  To: Felix Fietkau, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> On 24.01.23 22:10, Alexander H Duyck wrote:
> > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > Hi Felix,
> > > > 
> > > > ++cc Alexander and Yunsheng.
> > > > 
> > > > Thanks for the report
> > > > 
> > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > 
> > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > full-page page_pool allocation.
> > > > > It appears to me, that handling refcounting in two separate counters
> > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > incremented by code dealing with skb fragments directly, and
> > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > 
> > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > these underflow issues and crashes go away.
> > > > > 
> > > > 
> > > > This has been discussed here [1].  TL;DR changing this to page
> > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > figure out why the underflow happens?
> > > I don't see how the approch taken in my patch would blow up. From what I 
> > > can tell, it should be fairly close to how refcount is handled in 
> > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > with corresponding get_page and page_pool_return_skb_page calls.
> > > 
> > > - Felix
> > > 
> > 
> > Do you have the patch available to review as an RFC? From what I am
> > seeing it looks like you are underrunning on the pp_frag_count itself.
> > I would suspect the issue to be something like starting with a bad
> > count in terms of the total number of references, or deducing the wrong
> > amount when you finally free the page assuming you are tracking your
> > frag count using a non-atomic value in the driver.
> The driver patches for page pool are here:
> https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> 
> They are also applied in my mt76 tree at:
> https://github.com/nbd168/wireless
> 
> - Felix

So one thing I am thinking is that we may be seeing an issue where we
are somehow getting a mix of frag and non-frag based page pool pages.
That is the only case I can think of where we might be underflowing
negative. If you could add some additional debug info on the underflow
WARN_ON case in page_pool_defrag_page that might be useful.
Specifically I would be curious what the actual return value is. I'm
assuming we are only hitting negative 1, but I would want to verify we
aren't seeing something else.

Also just to confirm this is building on 64b kernel correct? Just want
to make sure we don't have this running on a 32b setup where the frag
count and the upper 32b of the DMA address are overlapped.

As far as the patch set I only really see a few minor issues which I am
going to post a few snippets below.


> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> b/drivers/net/wireless/mediatek/mt76/dma.c
> index 611769e445fa..7fd9aa9c3d9e 100644

...

> @@ -593,25 +593,28 @@  mt76_dma_rx_fill(struct mt76_dev *dev, struct
> mt76_queue *q)
>  
>  	while (q->queued < q->ndesc - 1) {
>  		struct mt76_queue_buf qbuf;
> -		void *buf = NULL;
> +		dma_addr_t addr;
> +		int offset;
> +		void *buf;
>  
> -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> GFP_ATOMIC);
> +		buf = mt76_get_page_pool_buf(q, &offset, q-
> >buf_size);
>  		if (!buf)
>  			break;
>  
> -		addr = dma_map_single(dev->dma_dev, buf, len,
> DMA_FROM_DEVICE);
> +		addr = dma_map_single(dev->dma_dev, buf + offset,
> len,
> +				      DMA_FROM_DEVICE);

Offset was already added to buf in mt76_get_page_pool_buf so the DMA
mapping offset doesn't look right to me.

>  		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
> {
> -			skb_free_frag(buf);
> +			mt76_put_page_pool_buf(buf, allow_direct);
>  			break;
>  		}
>  

I'm not a fan of the defensive programming in mt76_put_page_pool_buf.
If you are in an area that is using page pool you should be using the
page pool version of the freeing operations instead of adding
additional overhead that can mess things up by having it have to also
check for if the page is a page pool page or not.

> -		qbuf.addr = addr + offset;
> -		qbuf.len = len - offset;
> +		qbuf.addr = addr + q->buf_offset;
> +		qbuf.len = len - q->buf_offset;
>  		qbuf.skip_unmap = false;
>  		if (mt76_dma_add_rx_buf(dev, q, &qbuf, buf) < 0) {
>  			dma_unmap_single(dev->dma_dev, addr, len,
>  					 DMA_FROM_DEVICE);
> -			skb_free_frag(buf);
> +			mt76_put_page_pool_buf(buf, allow_direct);
>  			break;
>  		}
>  		frames++;

...

> @@ -848,6 +847,8 @@  mt76_dma_rx_process(struct mt76_dev *dev, struct
> mt76_queue *q, int budget)
>  			goto free_frag;
>  
>  		skb_reserve(skb, q->buf_offset);
> +		if (mt76_is_page_from_pp(data))
> +			skb_mark_for_recycle(skb);
>  
>  		*(u32 *)skb->cb = info;
>  

More defensive programming here. Is there a path that allows for a
mixed setup?

The only spot where I can see there being anything like that is in 
/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c. But it doesn't make
any sense to me as to why it was included in the patch. It might be
easier to sort out the issue if we were to get rid of some of the
defensive programming.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 17:11         ` Alexander H Duyck
@ 2023-01-25 17:32           ` Felix Fietkau
  2023-01-25 18:26             ` Alexander H Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-25 17:32 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 25.01.23 18:11, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> On 24.01.23 22:10, Alexander H Duyck wrote:
>> > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > Hi Felix,
>> > > > 
>> > > > ++cc Alexander and Yunsheng.
>> > > > 
>> > > > Thanks for the report
>> > > > 
>> > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > 
>> > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > full-page page_pool allocation.
>> > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > incremented by code dealing with skb fragments directly, and
>> > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > 
>> > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > these underflow issues and crashes go away.
>> > > > > 
>> > > > 
>> > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > figure out why the underflow happens?
>> > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > can tell, it should be fairly close to how refcount is handled in 
>> > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > 
>> > > - Felix
>> > > 
>> > 
>> > Do you have the patch available to review as an RFC? From what I am
>> > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > I would suspect the issue to be something like starting with a bad
>> > count in terms of the total number of references, or deducing the wrong
>> > amount when you finally free the page assuming you are tracking your
>> > frag count using a non-atomic value in the driver.
>> The driver patches for page pool are here:
>> https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> 
>> They are also applied in my mt76 tree at:
>> https://github.com/nbd168/wireless
>> 
>> - Felix
> 
> So one thing I am thinking is that we may be seeing an issue where we
> are somehow getting a mix of frag and non-frag based page pool pages.
> That is the only case I can think of where we might be underflowing
> negative. If you could add some additional debug info on the underflow
> WARN_ON case in page_pool_defrag_page that might be useful.
> Specifically I would be curious what the actual return value is. I'm
> assuming we are only hitting negative 1, but I would want to verify we
> aren't seeing something else.
I'll try to run some more tests soon. However, I think I found the piece 
of code that is incompatible with using pp_frag_count.
When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
packet), and it is not split by the hardware, a cfg80211 function 
extracts the individual MSDUs into separate skbs. In that case, a 
fragment can be shared across multiple skbs, and get_page is used to 
increase the refcount.
You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
its helper functions).
This code also has a bug where it doesn't set pp_recycle on the newly 
allocated skb if the previous one has it, but that's a separate matter 
and fixing it doesn't make the crash go away.
Is there any way I can make that part of the code work with the current 
page pool frag implementation?

> Also just to confirm this is building on 64b kernel correct? Just want
> to make sure we don't have this running on a 32b setup where the frag
> count and the upper 32b of the DMA address are overlapped.
Yes, I'm using a 64b kernel.

> As far as the patch set I only really see a few minor issues which I am
> going to post a few snippets below.
> 
> 
>> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
>> b/drivers/net/wireless/mediatek/mt76/dma.c
>> index 611769e445fa..7fd9aa9c3d9e 100644
> 
> ...
> 
>> @@ -593,25 +593,28 @@  mt76_dma_rx_fill(struct mt76_dev *dev, struct
>> mt76_queue *q)
>>  
>>  	while (q->queued < q->ndesc - 1) {
>>  		struct mt76_queue_buf qbuf;
>> -		void *buf = NULL;
>> +		dma_addr_t addr;
>> +		int offset;
>> +		void *buf;
>>  
>> -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
>> GFP_ATOMIC);
>> +		buf = mt76_get_page_pool_buf(q, &offset, q-
>> >buf_size);
>>  		if (!buf)
>>  			break;
>>  
>> -		addr = dma_map_single(dev->dma_dev, buf, len,
>> DMA_FROM_DEVICE);
>> +		addr = dma_map_single(dev->dma_dev, buf + offset,
>> len,
>> +				      DMA_FROM_DEVICE);
> 
> Offset was already added to buf in mt76_get_page_pool_buf so the DMA
> mapping offset doesn't look right to me.
Right. This is resolved by the follow-up patch which keeps pages DMA 
mapped. I plan on squashing both patches into one and adding some fixes 
on top when the underlying page pool issue is resolved.

>>  		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
>> {
>> -			skb_free_frag(buf);
>> +			mt76_put_page_pool_buf(buf, allow_direct);
>>  			break;
>>  		}
>>  
> 
> I'm not a fan of the defensive programming in mt76_put_page_pool_buf.
> If you are in an area that is using page pool you should be using the
> page pool version of the freeing operations instead of adding
> additional overhead that can mess things up by having it have to also
> check for if the page is a page pool page or not.
See below.

>> -		qbuf.addr = addr + offset;
>> -		qbuf.len = len - offset;
>> +		qbuf.addr = addr + q->buf_offset;
>> +		qbuf.len = len - q->buf_offset;
>>  		qbuf.skip_unmap = false;
>>  		if (mt76_dma_add_rx_buf(dev, q, &qbuf, buf) < 0) {
>>  			dma_unmap_single(dev->dma_dev, addr, len,
>>  					 DMA_FROM_DEVICE);
>> -			skb_free_frag(buf);
>> +			mt76_put_page_pool_buf(buf, allow_direct);
>>  			break;
>>  		}
>>  		frames++;
> 
> ...
> 
>> @@ -848,6 +847,8 @@  mt76_dma_rx_process(struct mt76_dev *dev, struct
>> mt76_queue *q, int budget)
>>  			goto free_frag;
>>  
>>  		skb_reserve(skb, q->buf_offset);
>> +		if (mt76_is_page_from_pp(data))
>> +			skb_mark_for_recycle(skb);
>>  
>>  		*(u32 *)skb->cb = info;
>>  
> 
> More defensive programming here. Is there a path that allows for a
> mixed setup?
> 
> The only spot where I can see there being anything like that is in
> /drivers/net/wireless/mediatek/mt76/mt7915/mmio.c. But it doesn't make
> any sense to me as to why it was included in the patch. It might be
> easier to sort out the issue if we were to get rid of some of the
> defensive programming.
This is not defensive programming. In its current state, there is a 
scenario where we can have a mix of pp and non-pp pages (if hardware 
offload support is enabled).
However in my tests, offload support was disabled and all pages are PP 
ones.
I also have some unpublished pending changes to always allocate from the 
pool (even for the initial buffers allocated for offloading).
This did not make a difference in my tests though.

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 17:32           ` Felix Fietkau
@ 2023-01-25 18:26             ` Alexander H Duyck
  2023-01-25 18:42               ` Felix Fietkau
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2023-01-25 18:26 UTC (permalink / raw)
  To: Felix Fietkau, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> On 25.01.23 18:11, Alexander H Duyck wrote:
> > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> > > On 24.01.23 22:10, Alexander H Duyck wrote:
> > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > > > Hi Felix,
> > > > > > 
> > > > > > ++cc Alexander and Yunsheng.
> > > > > > 
> > > > > > Thanks for the report
> > > > > > 
> > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > > > 
> > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > > > full-page page_pool allocation.
> > > > > > > It appears to me, that handling refcounting in two separate counters
> > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > > > incremented by code dealing with skb fragments directly, and
> > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > > > 
> > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > > > these underflow issues and crashes go away.
> > > > > > > 
> > > > > > 
> > > > > > This has been discussed here [1].  TL;DR changing this to page
> > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > > > figure out why the underflow happens?
> > > > > I don't see how the approch taken in my patch would blow up. From what I 
> > > > > can tell, it should be fairly close to how refcount is handled in 
> > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > > > with corresponding get_page and page_pool_return_skb_page calls.
> > > > > 
> > > > > - Felix
> > > > > 
> > > > 
> > > > Do you have the patch available to review as an RFC? From what I am
> > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> > > > I would suspect the issue to be something like starting with a bad
> > > > count in terms of the total number of references, or deducing the wrong
> > > > amount when you finally free the page assuming you are tracking your
> > > > frag count using a non-atomic value in the driver.
> > > The driver patches for page pool are here:
> > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> > > 
> > > They are also applied in my mt76 tree at:
> > > https://github.com/nbd168/wireless
> > > 
> > > - Felix
> > 
> > So one thing I am thinking is that we may be seeing an issue where we
> > are somehow getting a mix of frag and non-frag based page pool pages.
> > That is the only case I can think of where we might be underflowing
> > negative. If you could add some additional debug info on the underflow
> > WARN_ON case in page_pool_defrag_page that might be useful.
> > Specifically I would be curious what the actual return value is. I'm
> > assuming we are only hitting negative 1, but I would want to verify we
> > aren't seeing something else.
> I'll try to run some more tests soon. However, I think I found the piece 
> of code that is incompatible with using pp_frag_count.
> When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
> packet), and it is not split by the hardware, a cfg80211 function 
> extracts the individual MSDUs into separate skbs. In that case, a 
> fragment can be shared across multiple skbs, and get_page is used to 
> increase the refcount.
> You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
> its helper functions).

I'm not sure if it is problematic or not. Basically it is trading off
by copying over the frags, calling get_page on each frag, and then
using dev_kfree_skb to disassemble and release the pp_frag references.
There should be other paths in the kernel that are doing something
similar.

> This code also has a bug where it doesn't set pp_recycle on the newly 
> allocated skb if the previous one has it, but that's a separate matter 
> and fixing it doesn't make the crash go away.

Adding the recycle would cause this bug. So one thing we might be
seeing is something like that triggering this error. Specifically if
the page is taken via get_page when assembling the new skb then we
cannot set the recycle flag in the new skb otherwise it will result in
the reference undercount we are seeing. What we are doing is shifting
the references away from the pp_frag_count to the page reference count
in this case. If we set the pp_recycle flag then it would cause us to
decrement pp_frag_count instead of the page reference count resulting
in the underrun. 

> Is there any way I can make that part of the code work with the current 
> page pool frag implementation?

The current code should work. Basically as long as the references are
taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
into this issue because the pp_frag_count will be dropped when the
original skb is freed and the page reference count will be decremented
when the new one is freed.

For page pool page fragments the main thing to keep in mind is that if
pp_recycle is set it will update the pp_frag_count and if it is not
then it will just decrement the page reference count.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 18:26             ` Alexander H Duyck
@ 2023-01-25 18:42               ` Felix Fietkau
  2023-01-25 19:02                 ` Alexander H Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-25 18:42 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 25.01.23 19:26, Alexander H Duyck wrote:
> On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> On 25.01.23 18:11, Alexander H Duyck wrote:
>> > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > > > Hi Felix,
>> > > > > > 
>> > > > > > ++cc Alexander and Yunsheng.
>> > > > > > 
>> > > > > > Thanks for the report
>> > > > > > 
>> > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > > > 
>> > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > > > full-page page_pool allocation.
>> > > > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > > > incremented by code dealing with skb fragments directly, and
>> > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > > > 
>> > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > > > these underflow issues and crashes go away.
>> > > > > > > 
>> > > > > > 
>> > > > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > > > figure out why the underflow happens?
>> > > > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > > > can tell, it should be fairly close to how refcount is handled in 
>> > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > > > 
>> > > > > - Felix
>> > > > > 
>> > > > 
>> > > > Do you have the patch available to review as an RFC? From what I am
>> > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > > > I would suspect the issue to be something like starting with a bad
>> > > > count in terms of the total number of references, or deducing the wrong
>> > > > amount when you finally free the page assuming you are tracking your
>> > > > frag count using a non-atomic value in the driver.
>> > > The driver patches for page pool are here:
>> > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> > > 
>> > > They are also applied in my mt76 tree at:
>> > > https://github.com/nbd168/wireless
>> > > 
>> > > - Felix
>> > 
>> > So one thing I am thinking is that we may be seeing an issue where we
>> > are somehow getting a mix of frag and non-frag based page pool pages.
>> > That is the only case I can think of where we might be underflowing
>> > negative. If you could add some additional debug info on the underflow
>> > WARN_ON case in page_pool_defrag_page that might be useful.
>> > Specifically I would be curious what the actual return value is. I'm
>> > assuming we are only hitting negative 1, but I would want to verify we
>> > aren't seeing something else.
>> I'll try to run some more tests soon. However, I think I found the piece 
>> of code that is incompatible with using pp_frag_count.
>> When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>> packet), and it is not split by the hardware, a cfg80211 function 
>> extracts the individual MSDUs into separate skbs. In that case, a 
>> fragment can be shared across multiple skbs, and get_page is used to 
>> increase the refcount.
>> You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>> its helper functions).
> 
> I'm not sure if it is problematic or not. Basically it is trading off
> by copying over the frags, calling get_page on each frag, and then
> using dev_kfree_skb to disassemble and release the pp_frag references.
> There should be other paths in the kernel that are doing something
> similar.
> 
>> This code also has a bug where it doesn't set pp_recycle on the newly 
>> allocated skb if the previous one has it, but that's a separate matter 
>> and fixing it doesn't make the crash go away.
> 
> Adding the recycle would cause this bug. So one thing we might be
> seeing is something like that triggering this error. Specifically if
> the page is taken via get_page when assembling the new skb then we
> cannot set the recycle flag in the new skb otherwise it will result in
> the reference undercount we are seeing. What we are doing is shifting
> the references away from the pp_frag_count to the page reference count
> in this case. If we set the pp_recycle flag then it would cause us to
> decrement pp_frag_count instead of the page reference count resulting
> in the underrun.
Couldn't leaving out the pp_recycle flag potentially lead to a case 
where the last user of the page drops it via page_frag_free instead of 
page_pool_return_skb_page? Is that valid?

>> Is there any way I can make that part of the code work with the current 
>> page pool frag implementation?
> 
> The current code should work. Basically as long as the references are
> taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> into this issue because the pp_frag_count will be dropped when the
> original skb is freed and the page reference count will be decremented
> when the new one is freed.
> 
> For page pool page fragments the main thing to keep in mind is that if
> pp_recycle is set it will update the pp_frag_count and if it is not
> then it will just decrement the page reference count.
What takes care of DMA unmap and other cleanup if the last reference to 
the page is dropped via page_frag_free?

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 18:42               ` Felix Fietkau
@ 2023-01-25 19:02                 ` Alexander H Duyck
  2023-01-25 19:10                   ` Felix Fietkau
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2023-01-25 19:02 UTC (permalink / raw)
  To: Felix Fietkau, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
> On 25.01.23 19:26, Alexander H Duyck wrote:
> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> > > On 25.01.23 18:11, Alexander H Duyck wrote:
> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > > > > > Hi Felix,
> > > > > > > > 
> > > > > > > > ++cc Alexander and Yunsheng.
> > > > > > > > 
> > > > > > > > Thanks for the report
> > > > > > > > 
> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > > > > > 
> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > > > > > full-page page_pool allocation.
> > > > > > > > > It appears to me, that handling refcounting in two separate counters
> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > > > > > incremented by code dealing with skb fragments directly, and
> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > > > > > 
> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > > > > > these underflow issues and crashes go away.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > > > > > figure out why the underflow happens?
> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
> > > > > > > can tell, it should be fairly close to how refcount is handled in 
> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
> > > > > > > 
> > > > > > > - Felix
> > > > > > > 
> > > > > > 
> > > > > > Do you have the patch available to review as an RFC? From what I am
> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> > > > > > I would suspect the issue to be something like starting with a bad
> > > > > > count in terms of the total number of references, or deducing the wrong
> > > > > > amount when you finally free the page assuming you are tracking your
> > > > > > frag count using a non-atomic value in the driver.
> > > > > The driver patches for page pool are here:
> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> > > > > 
> > > > > They are also applied in my mt76 tree at:
> > > > > https://github.com/nbd168/wireless
> > > > > 
> > > > > - Felix
> > > > 
> > > > So one thing I am thinking is that we may be seeing an issue where we
> > > > are somehow getting a mix of frag and non-frag based page pool pages.
> > > > That is the only case I can think of where we might be underflowing
> > > > negative. If you could add some additional debug info on the underflow
> > > > WARN_ON case in page_pool_defrag_page that might be useful.
> > > > Specifically I would be curious what the actual return value is. I'm
> > > > assuming we are only hitting negative 1, but I would want to verify we
> > > > aren't seeing something else.
> > > I'll try to run some more tests soon. However, I think I found the piece 
> > > of code that is incompatible with using pp_frag_count.
> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
> > > packet), and it is not split by the hardware, a cfg80211 function 
> > > extracts the individual MSDUs into separate skbs. In that case, a 
> > > fragment can be shared across multiple skbs, and get_page is used to 
> > > increase the refcount.
> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
> > > its helper functions).
> > 
> > I'm not sure if it is problematic or not. Basically it is trading off
> > by copying over the frags, calling get_page on each frag, and then
> > using dev_kfree_skb to disassemble and release the pp_frag references.
> > There should be other paths in the kernel that are doing something
> > similar.
> > 
> > > This code also has a bug where it doesn't set pp_recycle on the newly 
> > > allocated skb if the previous one has it, but that's a separate matter 
> > > and fixing it doesn't make the crash go away.
> > 
> > Adding the recycle would cause this bug. So one thing we might be
> > seeing is something like that triggering this error. Specifically if
> > the page is taken via get_page when assembling the new skb then we
> > cannot set the recycle flag in the new skb otherwise it will result in
> > the reference undercount we are seeing. What we are doing is shifting
> > the references away from the pp_frag_count to the page reference count
> > in this case. If we set the pp_recycle flag then it would cause us to
> > decrement pp_frag_count instead of the page reference count resulting
> > in the underrun.
> Couldn't leaving out the pp_recycle flag potentially lead to a case 
> where the last user of the page drops it via page_frag_free instead of 
> page_pool_return_skb_page? Is that valid?

No. What will happen is that when the pp_frag_count is exhausted the
page will be unmapped and evicted from the page pool. When the page is
then finally freed it will end up going back to the page allocator
instead of page pool.

Basically the idea is that until pp_frag_count reaches 0 there will be
at least 1 page reference held.

> > > Is there any way I can make that part of the code work with the current 
> > > page pool frag implementation?
> > 
> > The current code should work. Basically as long as the references are
> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> > into this issue because the pp_frag_count will be dropped when the
> > original skb is freed and the page reference count will be decremented
> > when the new one is freed.
> > 
> > For page pool page fragments the main thing to keep in mind is that if
> > pp_recycle is set it will update the pp_frag_count and if it is not
> > then it will just decrement the page reference count.
> What takes care of DMA unmap and other cleanup if the last reference to 
> the page is dropped via page_frag_free?
> 
> - Felix

When the page is freed on the skb w/ pp_recycle set it will unmap the
page and evict it from the page pool. Basically in these cases the page
goes from the page pool back to the page allocator.

The general idea with this is that if we are using fragments that there
will be enough of them floating around that if one or two frags have a
temporeary detour through a non-recycling path that hopefully by the
time the last fragment is freed the other instances holding the
additional page reference will have let them go. If not then the page
will go back to the page allocator and it will have to be replaced in
the page pool.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 19:02                 ` Alexander H Duyck
@ 2023-01-25 19:10                   ` Felix Fietkau
  2023-01-25 19:40                     ` Felix Fietkau
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-25 19:10 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 25.01.23 20:02, Alexander H Duyck wrote:
> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>> On 25.01.23 19:26, Alexander H Duyck wrote:
>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> > > On 25.01.23 18:11, Alexander H Duyck wrote:
>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > > > > > Hi Felix,
>> > > > > > > > 
>> > > > > > > > ++cc Alexander and Yunsheng.
>> > > > > > > > 
>> > > > > > > > Thanks for the report
>> > > > > > > > 
>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > > > > > 
>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > > > > > full-page page_pool allocation.
>> > > > > > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > > > > > incremented by code dealing with skb fragments directly, and
>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > > > > > 
>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > > > > > these underflow issues and crashes go away.
>> > > > > > > > > 
>> > > > > > > > 
>> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > > > > > figure out why the underflow happens?
>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > > > > > can tell, it should be fairly close to how refcount is handled in 
>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > > > > > 
>> > > > > > > - Felix
>> > > > > > > 
>> > > > > > 
>> > > > > > Do you have the patch available to review as an RFC? From what I am
>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > > > > > I would suspect the issue to be something like starting with a bad
>> > > > > > count in terms of the total number of references, or deducing the wrong
>> > > > > > amount when you finally free the page assuming you are tracking your
>> > > > > > frag count using a non-atomic value in the driver.
>> > > > > The driver patches for page pool are here:
>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> > > > > 
>> > > > > They are also applied in my mt76 tree at:
>> > > > > https://github.com/nbd168/wireless
>> > > > > 
>> > > > > - Felix
>> > > > 
>> > > > So one thing I am thinking is that we may be seeing an issue where we
>> > > > are somehow getting a mix of frag and non-frag based page pool pages.
>> > > > That is the only case I can think of where we might be underflowing
>> > > > negative. If you could add some additional debug info on the underflow
>> > > > WARN_ON case in page_pool_defrag_page that might be useful.
>> > > > Specifically I would be curious what the actual return value is. I'm
>> > > > assuming we are only hitting negative 1, but I would want to verify we
>> > > > aren't seeing something else.
>> > > I'll try to run some more tests soon. However, I think I found the piece 
>> > > of code that is incompatible with using pp_frag_count.
>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>> > > packet), and it is not split by the hardware, a cfg80211 function 
>> > > extracts the individual MSDUs into separate skbs. In that case, a 
>> > > fragment can be shared across multiple skbs, and get_page is used to 
>> > > increase the refcount.
>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>> > > its helper functions).
>> > 
>> > I'm not sure if it is problematic or not. Basically it is trading off
>> > by copying over the frags, calling get_page on each frag, and then
>> > using dev_kfree_skb to disassemble and release the pp_frag references.
>> > There should be other paths in the kernel that are doing something
>> > similar.
>> > 
>> > > This code also has a bug where it doesn't set pp_recycle on the newly 
>> > > allocated skb if the previous one has it, but that's a separate matter 
>> > > and fixing it doesn't make the crash go away.
>> > 
>> > Adding the recycle would cause this bug. So one thing we might be
>> > seeing is something like that triggering this error. Specifically if
>> > the page is taken via get_page when assembling the new skb then we
>> > cannot set the recycle flag in the new skb otherwise it will result in
>> > the reference undercount we are seeing. What we are doing is shifting
>> > the references away from the pp_frag_count to the page reference count
>> > in this case. If we set the pp_recycle flag then it would cause us to
>> > decrement pp_frag_count instead of the page reference count resulting
>> > in the underrun.
>> Couldn't leaving out the pp_recycle flag potentially lead to a case 
>> where the last user of the page drops it via page_frag_free instead of 
>> page_pool_return_skb_page? Is that valid?
> 
> No. What will happen is that when the pp_frag_count is exhausted the
> page will be unmapped and evicted from the page pool. When the page is
> then finally freed it will end up going back to the page allocator
> instead of page pool.
> 
> Basically the idea is that until pp_frag_count reaches 0 there will be
> at least 1 page reference held.
> 
>> > > Is there any way I can make that part of the code work with the current 
>> > > page pool frag implementation?
>> > 
>> > The current code should work. Basically as long as the references are
>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>> > into this issue because the pp_frag_count will be dropped when the
>> > original skb is freed and the page reference count will be decremented
>> > when the new one is freed.
>> > 
>> > For page pool page fragments the main thing to keep in mind is that if
>> > pp_recycle is set it will update the pp_frag_count and if it is not
>> > then it will just decrement the page reference count.
>> What takes care of DMA unmap and other cleanup if the last reference to 
>> the page is dropped via page_frag_free?
>> 
>> - Felix
> 
> When the page is freed on the skb w/ pp_recycle set it will unmap the
> page and evict it from the page pool. Basically in these cases the page
> goes from the page pool back to the page allocator.
> 
> The general idea with this is that if we are using fragments that there
> will be enough of them floating around that if one or two frags have a
> temporeary detour through a non-recycling path that hopefully by the
> time the last fragment is freed the other instances holding the
> additional page reference will have let them go. If not then the page
> will go back to the page allocator and it will have to be replaced in
> the page pool.
Thanks for the explanation, it makes sense to me now. Unfortunately it 
also means that I have no idea what could cause this issue. I will 
finish my mt76 patch rework which gets rid of the pp vs non-pp 
allocation mix and re-run my tests to provide updated traces.

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 19:10                   ` Felix Fietkau
@ 2023-01-25 19:40                     ` Felix Fietkau
  2023-01-25 20:02                       ` Felix Fietkau
  2023-01-25 22:14                       ` Alexander H Duyck
  0 siblings, 2 replies; 43+ messages in thread
From: Felix Fietkau @ 2023-01-25 19:40 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 25.01.23 20:10, Felix Fietkau wrote:
> On 25.01.23 20:02, Alexander H Duyck wrote:
>> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>>> On 25.01.23 19:26, Alexander H Duyck wrote:
>>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>>> > > On 25.01.23 18:11, Alexander H Duyck wrote:
>>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>>> > > > > > > > Hi Felix,
>>> > > > > > > > 
>>> > > > > > > > ++cc Alexander and Yunsheng.
>>> > > > > > > > 
>>> > > > > > > > Thanks for the report
>>> > > > > > > > 
>>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>> > > > > > > > > 
>>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>>> > > > > > > > > full-page page_pool allocation.
>>> > > > > > > > > It appears to me, that handling refcounting in two separate counters
>>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>>> > > > > > > > > incremented by code dealing with skb fragments directly, and
>>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>>> > > > > > > > > 
>>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>>> > > > > > > > > these underflow issues and crashes go away.
>>> > > > > > > > > 
>>> > > > > > > > 
>>> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>>> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>>> > > > > > > > figure out why the underflow happens?
>>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>>> > > > > > > can tell, it should be fairly close to how refcount is handled in 
>>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>>> > > > > > > 
>>> > > > > > > - Felix
>>> > > > > > > 
>>> > > > > > 
>>> > > > > > Do you have the patch available to review as an RFC? From what I am
>>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>>> > > > > > I would suspect the issue to be something like starting with a bad
>>> > > > > > count in terms of the total number of references, or deducing the wrong
>>> > > > > > amount when you finally free the page assuming you are tracking your
>>> > > > > > frag count using a non-atomic value in the driver.
>>> > > > > The driver patches for page pool are here:
>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>>> > > > > 
>>> > > > > They are also applied in my mt76 tree at:
>>> > > > > https://github.com/nbd168/wireless
>>> > > > > 
>>> > > > > - Felix
>>> > > > 
>>> > > > So one thing I am thinking is that we may be seeing an issue where we
>>> > > > are somehow getting a mix of frag and non-frag based page pool pages.
>>> > > > That is the only case I can think of where we might be underflowing
>>> > > > negative. If you could add some additional debug info on the underflow
>>> > > > WARN_ON case in page_pool_defrag_page that might be useful.
>>> > > > Specifically I would be curious what the actual return value is. I'm
>>> > > > assuming we are only hitting negative 1, but I would want to verify we
>>> > > > aren't seeing something else.
>>> > > I'll try to run some more tests soon. However, I think I found the piece 
>>> > > of code that is incompatible with using pp_frag_count.
>>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>>> > > packet), and it is not split by the hardware, a cfg80211 function 
>>> > > extracts the individual MSDUs into separate skbs. In that case, a 
>>> > > fragment can be shared across multiple skbs, and get_page is used to 
>>> > > increase the refcount.
>>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>>> > > its helper functions).
>>> > 
>>> > I'm not sure if it is problematic or not. Basically it is trading off
>>> > by copying over the frags, calling get_page on each frag, and then
>>> > using dev_kfree_skb to disassemble and release the pp_frag references.
>>> > There should be other paths in the kernel that are doing something
>>> > similar.
>>> > 
>>> > > This code also has a bug where it doesn't set pp_recycle on the newly 
>>> > > allocated skb if the previous one has it, but that's a separate matter 
>>> > > and fixing it doesn't make the crash go away.
>>> > 
>>> > Adding the recycle would cause this bug. So one thing we might be
>>> > seeing is something like that triggering this error. Specifically if
>>> > the page is taken via get_page when assembling the new skb then we
>>> > cannot set the recycle flag in the new skb otherwise it will result in
>>> > the reference undercount we are seeing. What we are doing is shifting
>>> > the references away from the pp_frag_count to the page reference count
>>> > in this case. If we set the pp_recycle flag then it would cause us to
>>> > decrement pp_frag_count instead of the page reference count resulting
>>> > in the underrun.
>>> Couldn't leaving out the pp_recycle flag potentially lead to a case 
>>> where the last user of the page drops it via page_frag_free instead of 
>>> page_pool_return_skb_page? Is that valid?
>> 
>> No. What will happen is that when the pp_frag_count is exhausted the
>> page will be unmapped and evicted from the page pool. When the page is
>> then finally freed it will end up going back to the page allocator
>> instead of page pool.
>> 
>> Basically the idea is that until pp_frag_count reaches 0 there will be
>> at least 1 page reference held.
>> 
>>> > > Is there any way I can make that part of the code work with the current 
>>> > > page pool frag implementation?
>>> > 
>>> > The current code should work. Basically as long as the references are
>>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>>> > into this issue because the pp_frag_count will be dropped when the
>>> > original skb is freed and the page reference count will be decremented
>>> > when the new one is freed.
>>> > 
>>> > For page pool page fragments the main thing to keep in mind is that if
>>> > pp_recycle is set it will update the pp_frag_count and if it is not
>>> > then it will just decrement the page reference count.
>>> What takes care of DMA unmap and other cleanup if the last reference to 
>>> the page is dropped via page_frag_free?
>>> 
>>> - Felix
>> 
>> When the page is freed on the skb w/ pp_recycle set it will unmap the
>> page and evict it from the page pool. Basically in these cases the page
>> goes from the page pool back to the page allocator.
>> 
>> The general idea with this is that if we are using fragments that there
>> will be enough of them floating around that if one or two frags have a
>> temporeary detour through a non-recycling path that hopefully by the
>> time the last fragment is freed the other instances holding the
>> additional page reference will have let them go. If not then the page
>> will go back to the page allocator and it will have to be replaced in
>> the page pool.
> Thanks for the explanation, it makes sense to me now. Unfortunately it
> also means that I have no idea what could cause this issue. I will
> finish my mt76 patch rework which gets rid of the pp vs non-pp
> allocation mix and re-run my tests to provide updated traces.
Here's the updated mt76 page pool support commit:
https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808

And here is the trace that I'm getting with 6.1:
https://nbd.name/p/a16957f2

If you have any debug patch you'd like me to test, please let me know.

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 19:40                     ` Felix Fietkau
@ 2023-01-25 20:02                       ` Felix Fietkau
  2023-01-25 22:14                       ` Alexander H Duyck
  1 sibling, 0 replies; 43+ messages in thread
From: Felix Fietkau @ 2023-01-25 20:02 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 25.01.23 20:40, Felix Fietkau wrote:
> On 25.01.23 20:10, Felix Fietkau wrote:
>> On 25.01.23 20:02, Alexander H Duyck wrote:
>>> On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>>>> On 25.01.23 19:26, Alexander H Duyck wrote:
>>>> > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>>>> > > On 25.01.23 18:11, Alexander H Duyck wrote:
>>>> > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>>>> > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>>>> > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>>>> > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>>>> > > > > > > > Hi Felix,
>>>> > > > > > > > 
>>>> > > > > > > > ++cc Alexander and Yunsheng.
>>>> > > > > > > > 
>>>> > > > > > > > Thanks for the report
>>>> > > > > > > > 
>>>> > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>>> > > > > > > > > 
>>>> > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>>>> > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>>>> > > > > > > > > full-page page_pool allocation.
>>>> > > > > > > > > It appears to me, that handling refcounting in two separate counters
>>>> > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>>>> > > > > > > > > incremented by code dealing with skb fragments directly, and
>>>> > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>>>> > > > > > > > > 
>>>> > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>>>> > > > > > > > > these underflow issues and crashes go away.
>>>> > > > > > > > > 
>>>> > > > > > > > 
>>>> > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>>>> > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>>>> > > > > > > > figure out why the underflow happens?
>>>> > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>>>> > > > > > > can tell, it should be fairly close to how refcount is handled in 
>>>> > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>>>> > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>>>> > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>>>> > > > > > > 
>>>> > > > > > > - Felix
>>>> > > > > > > 
>>>> > > > > > 
>>>> > > > > > Do you have the patch available to review as an RFC? From what I am
>>>> > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>>>> > > > > > I would suspect the issue to be something like starting with a bad
>>>> > > > > > count in terms of the total number of references, or deducing the wrong
>>>> > > > > > amount when you finally free the page assuming you are tracking your
>>>> > > > > > frag count using a non-atomic value in the driver.
>>>> > > > > The driver patches for page pool are here:
>>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>>>> > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>>>> > > > > 
>>>> > > > > They are also applied in my mt76 tree at:
>>>> > > > > https://github.com/nbd168/wireless
>>>> > > > > 
>>>> > > > > - Felix
>>>> > > > 
>>>> > > > So one thing I am thinking is that we may be seeing an issue where we
>>>> > > > are somehow getting a mix of frag and non-frag based page pool pages.
>>>> > > > That is the only case I can think of where we might be underflowing
>>>> > > > negative. If you could add some additional debug info on the underflow
>>>> > > > WARN_ON case in page_pool_defrag_page that might be useful.
>>>> > > > Specifically I would be curious what the actual return value is. I'm
>>>> > > > assuming we are only hitting negative 1, but I would want to verify we
>>>> > > > aren't seeing something else.
>>>> > > I'll try to run some more tests soon. However, I think I found the piece 
>>>> > > of code that is incompatible with using pp_frag_count.
>>>> > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>>>> > > packet), and it is not split by the hardware, a cfg80211 function 
>>>> > > extracts the individual MSDUs into separate skbs. In that case, a 
>>>> > > fragment can be shared across multiple skbs, and get_page is used to 
>>>> > > increase the refcount.
>>>> > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>>>> > > its helper functions).
>>>> > 
>>>> > I'm not sure if it is problematic or not. Basically it is trading off
>>>> > by copying over the frags, calling get_page on each frag, and then
>>>> > using dev_kfree_skb to disassemble and release the pp_frag references.
>>>> > There should be other paths in the kernel that are doing something
>>>> > similar.
>>>> > 
>>>> > > This code also has a bug where it doesn't set pp_recycle on the newly 
>>>> > > allocated skb if the previous one has it, but that's a separate matter 
>>>> > > and fixing it doesn't make the crash go away.
>>>> > 
>>>> > Adding the recycle would cause this bug. So one thing we might be
>>>> > seeing is something like that triggering this error. Specifically if
>>>> > the page is taken via get_page when assembling the new skb then we
>>>> > cannot set the recycle flag in the new skb otherwise it will result in
>>>> > the reference undercount we are seeing. What we are doing is shifting
>>>> > the references away from the pp_frag_count to the page reference count
>>>> > in this case. If we set the pp_recycle flag then it would cause us to
>>>> > decrement pp_frag_count instead of the page reference count resulting
>>>> > in the underrun.
>>>> Couldn't leaving out the pp_recycle flag potentially lead to a case 
>>>> where the last user of the page drops it via page_frag_free instead of 
>>>> page_pool_return_skb_page? Is that valid?
>>> 
>>> No. What will happen is that when the pp_frag_count is exhausted the
>>> page will be unmapped and evicted from the page pool. When the page is
>>> then finally freed it will end up going back to the page allocator
>>> instead of page pool.
>>> 
>>> Basically the idea is that until pp_frag_count reaches 0 there will be
>>> at least 1 page reference held.
>>> 
>>>> > > Is there any way I can make that part of the code work with the current 
>>>> > > page pool frag implementation?
>>>> > 
>>>> > The current code should work. Basically as long as the references are
>>>> > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>>>> > into this issue because the pp_frag_count will be dropped when the
>>>> > original skb is freed and the page reference count will be decremented
>>>> > when the new one is freed.
>>>> > 
>>>> > For page pool page fragments the main thing to keep in mind is that if
>>>> > pp_recycle is set it will update the pp_frag_count and if it is not
>>>> > then it will just decrement the page reference count.
>>>> What takes care of DMA unmap and other cleanup if the last reference to 
>>>> the page is dropped via page_frag_free?
>>>> 
>>>> - Felix
>>> 
>>> When the page is freed on the skb w/ pp_recycle set it will unmap the
>>> page and evict it from the page pool. Basically in these cases the page
>>> goes from the page pool back to the page allocator.
>>> 
>>> The general idea with this is that if we are using fragments that there
>>> will be enough of them floating around that if one or two frags have a
>>> temporeary detour through a non-recycling path that hopefully by the
>>> time the last fragment is freed the other instances holding the
>>> additional page reference will have let them go. If not then the page
>>> will go back to the page allocator and it will have to be replaced in
>>> the page pool.
>> Thanks for the explanation, it makes sense to me now. Unfortunately it
>> also means that I have no idea what could cause this issue. I will
>> finish my mt76 patch rework which gets rid of the pp vs non-pp
>> allocation mix and re-run my tests to provide updated traces.
> Here's the updated mt76 page pool support commit:
> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
> 
> And here is the trace that I'm getting with 6.1:
> https://nbd.name/p/a16957f2
> 
> If you have any debug patch you'd like me to test, please let me know.
To answer your earlier question: When pp_frag_count goes below zero, 
it's at -1 as suspected.

Here are some more completely different traces that I got during other 
test runs. I hope they provide some kind of clue:
https://nbd.name/p/8e46b6eb

- Felix


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 19:40                     ` Felix Fietkau
  2023-01-25 20:02                       ` Felix Fietkau
@ 2023-01-25 22:14                       ` Alexander H Duyck
  2023-01-26  6:12                         ` Felix Fietkau
  1 sibling, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2023-01-25 22:14 UTC (permalink / raw)
  To: Felix Fietkau, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
> On 25.01.23 20:10, Felix Fietkau wrote:
> > On 25.01.23 20:02, Alexander H Duyck wrote:
> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> > > > > > > > > > > Hi Felix,
> > > > > > > > > > > 
> > > > > > > > > > > ++cc Alexander and Yunsheng.
> > > > > > > > > > > 
> > > > > > > > > > > Thanks for the report
> > > > > > > > > > > 
> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > > > > > > > > > 
> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > > > > > > > > full-page page_pool allocation.
> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > > > > > > > > > 
> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > > > > > > > > these underflow issues and crashes go away.
> > > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > > > > > > > > figure out why the underflow happens?
> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in 
> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
> > > > > > > > > > 
> > > > > > > > > > - Felix
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> > > > > > > > > I would suspect the issue to be something like starting with a bad
> > > > > > > > > count in terms of the total number of references, or deducing the wrong
> > > > > > > > > amount when you finally free the page assuming you are tracking your
> > > > > > > > > frag count using a non-atomic value in the driver.
> > > > > > > > The driver patches for page pool are here:
> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> > > > > > > > 
> > > > > > > > They are also applied in my mt76 tree at:
> > > > > > > > https://github.com/nbd168/wireless
> > > > > > > > 
> > > > > > > > - Felix
> > > > > > > 
> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
> > > > > > > That is the only case I can think of where we might be underflowing
> > > > > > > negative. If you could add some additional debug info on the underflow
> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
> > > > > > > Specifically I would be curious what the actual return value is. I'm
> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
> > > > > > > aren't seeing something else.
> > > > > > I'll try to run some more tests soon. However, I think I found the piece 
> > > > > > of code that is incompatible with using pp_frag_count.
> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
> > > > > > packet), and it is not split by the hardware, a cfg80211 function 
> > > > > > extracts the individual MSDUs into separate skbs. In that case, a 
> > > > > > fragment can be shared across multiple skbs, and get_page is used to 
> > > > > > increase the refcount.
> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
> > > > > > its helper functions).
> > > > > 
> > > > > I'm not sure if it is problematic or not. Basically it is trading off
> > > > > by copying over the frags, calling get_page on each frag, and then
> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
> > > > > There should be other paths in the kernel that are doing something
> > > > > similar.
> > > > > 
> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly 
> > > > > > allocated skb if the previous one has it, but that's a separate matter 
> > > > > > and fixing it doesn't make the crash go away.
> > > > > 
> > > > > Adding the recycle would cause this bug. So one thing we might be
> > > > > seeing is something like that triggering this error. Specifically if
> > > > > the page is taken via get_page when assembling the new skb then we
> > > > > cannot set the recycle flag in the new skb otherwise it will result in
> > > > > the reference undercount we are seeing. What we are doing is shifting
> > > > > the references away from the pp_frag_count to the page reference count
> > > > > in this case. If we set the pp_recycle flag then it would cause us to
> > > > > decrement pp_frag_count instead of the page reference count resulting
> > > > > in the underrun.
> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case 
> > > > where the last user of the page drops it via page_frag_free instead of 
> > > > page_pool_return_skb_page? Is that valid?
> > > 
> > > No. What will happen is that when the pp_frag_count is exhausted the
> > > page will be unmapped and evicted from the page pool. When the page is
> > > then finally freed it will end up going back to the page allocator
> > > instead of page pool.
> > > 
> > > Basically the idea is that until pp_frag_count reaches 0 there will be
> > > at least 1 page reference held.
> > > 
> > > > > > Is there any way I can make that part of the code work with the current 
> > > > > > page pool frag implementation?
> > > > > 
> > > > > The current code should work. Basically as long as the references are
> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> > > > > into this issue because the pp_frag_count will be dropped when the
> > > > > original skb is freed and the page reference count will be decremented
> > > > > when the new one is freed.
> > > > > 
> > > > > For page pool page fragments the main thing to keep in mind is that if
> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
> > > > > then it will just decrement the page reference count.
> > > > What takes care of DMA unmap and other cleanup if the last reference to 
> > > > the page is dropped via page_frag_free?
> > > > 
> > > > - Felix
> > > 
> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
> > > page and evict it from the page pool. Basically in these cases the page
> > > goes from the page pool back to the page allocator.
> > > 
> > > The general idea with this is that if we are using fragments that there
> > > will be enough of them floating around that if one or two frags have a
> > > temporeary detour through a non-recycling path that hopefully by the
> > > time the last fragment is freed the other instances holding the
> > > additional page reference will have let them go. If not then the page
> > > will go back to the page allocator and it will have to be replaced in
> > > the page pool.
> > Thanks for the explanation, it makes sense to me now. Unfortunately it
> > also means that I have no idea what could cause this issue. I will
> > finish my mt76 patch rework which gets rid of the pp vs non-pp
> > allocation mix and re-run my tests to provide updated traces.
> Here's the updated mt76 page pool support commit:
> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808

Yeah, so I don't see anything wrong with the patch in terms of page
pool.

> And here is the trace that I'm getting with 6.1:
> https://nbd.name/p/a16957f2
> 
> If you have any debug patch you'd like me to test, please let me know.
> 
> - Felix

So looking at the traces I am assuming what we are seeing is the
deferred freeing from the TCP Rx path since I don't see a driver
anywhere between net_rx_action and napi_consume skb. So it seems like
the packets are likely making it all the way up the network stack.

Is this the first wireless driver to add support for page pool? I'm
thinking we must be seeing something in the wireless path that is
causing an issue such as the function you called out earlier but I
can't see anything obvious.

One thing we need to be on the lookout for is cloned skbs. When an skb
is cloned the pp_recycle gets copied over. In that case the reference
is moved over to the skb dataref count. What comes to mind is something
like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
fragment recycling").






^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-25 22:14                       ` Alexander H Duyck
@ 2023-01-26  6:12                         ` Felix Fietkau
  2023-01-26  9:14                           ` Felix Fietkau
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-26  6:12 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 25.01.23 23:14, Alexander H Duyck wrote:
> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
>> On 25.01.23 20:10, Felix Fietkau wrote:
>> > On 25.01.23 20:02, Alexander H Duyck wrote:
>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> > > > > > > > > > > Hi Felix,
>> > > > > > > > > > > 
>> > > > > > > > > > > ++cc Alexander and Yunsheng.
>> > > > > > > > > > > 
>> > > > > > > > > > > Thanks for the report
>> > > > > > > > > > > 
>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> > > > > > > > > > > > 
>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> > > > > > > > > > > > full-page page_pool allocation.
>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> > > > > > > > > > > > these underflow issues and crashes go away.
>> > > > > > > > > > > > 
>> > > > > > > > > > > 
>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> > > > > > > > > > > figure out why the underflow happens?
>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in 
>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> > > > > > > > > > 
>> > > > > > > > > > - Felix
>> > > > > > > > > > 
>> > > > > > > > > 
>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> > > > > > > > > I would suspect the issue to be something like starting with a bad
>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
>> > > > > > > > > amount when you finally free the page assuming you are tracking your
>> > > > > > > > > frag count using a non-atomic value in the driver.
>> > > > > > > > The driver patches for page pool are here:
>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> > > > > > > > 
>> > > > > > > > They are also applied in my mt76 tree at:
>> > > > > > > > https://github.com/nbd168/wireless
>> > > > > > > > 
>> > > > > > > > - Felix
>> > > > > > > 
>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
>> > > > > > > That is the only case I can think of where we might be underflowing
>> > > > > > > negative. If you could add some additional debug info on the underflow
>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
>> > > > > > > Specifically I would be curious what the actual return value is. I'm
>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
>> > > > > > > aren't seeing something else.
>> > > > > > I'll try to run some more tests soon. However, I think I found the piece 
>> > > > > > of code that is incompatible with using pp_frag_count.
>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>> > > > > > packet), and it is not split by the hardware, a cfg80211 function 
>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a 
>> > > > > > fragment can be shared across multiple skbs, and get_page is used to 
>> > > > > > increase the refcount.
>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>> > > > > > its helper functions).
>> > > > > 
>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
>> > > > > by copying over the frags, calling get_page on each frag, and then
>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
>> > > > > There should be other paths in the kernel that are doing something
>> > > > > similar.
>> > > > > 
>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly 
>> > > > > > allocated skb if the previous one has it, but that's a separate matter 
>> > > > > > and fixing it doesn't make the crash go away.
>> > > > > 
>> > > > > Adding the recycle would cause this bug. So one thing we might be
>> > > > > seeing is something like that triggering this error. Specifically if
>> > > > > the page is taken via get_page when assembling the new skb then we
>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
>> > > > > the reference undercount we are seeing. What we are doing is shifting
>> > > > > the references away from the pp_frag_count to the page reference count
>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
>> > > > > decrement pp_frag_count instead of the page reference count resulting
>> > > > > in the underrun.
>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case 
>> > > > where the last user of the page drops it via page_frag_free instead of 
>> > > > page_pool_return_skb_page? Is that valid?
>> > > 
>> > > No. What will happen is that when the pp_frag_count is exhausted the
>> > > page will be unmapped and evicted from the page pool. When the page is
>> > > then finally freed it will end up going back to the page allocator
>> > > instead of page pool.
>> > > 
>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
>> > > at least 1 page reference held.
>> > > 
>> > > > > > Is there any way I can make that part of the code work with the current 
>> > > > > > page pool frag implementation?
>> > > > > 
>> > > > > The current code should work. Basically as long as the references are
>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>> > > > > into this issue because the pp_frag_count will be dropped when the
>> > > > > original skb is freed and the page reference count will be decremented
>> > > > > when the new one is freed.
>> > > > > 
>> > > > > For page pool page fragments the main thing to keep in mind is that if
>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
>> > > > > then it will just decrement the page reference count.
>> > > > What takes care of DMA unmap and other cleanup if the last reference to 
>> > > > the page is dropped via page_frag_free?
>> > > > 
>> > > > - Felix
>> > > 
>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
>> > > page and evict it from the page pool. Basically in these cases the page
>> > > goes from the page pool back to the page allocator.
>> > > 
>> > > The general idea with this is that if we are using fragments that there
>> > > will be enough of them floating around that if one or two frags have a
>> > > temporeary detour through a non-recycling path that hopefully by the
>> > > time the last fragment is freed the other instances holding the
>> > > additional page reference will have let them go. If not then the page
>> > > will go back to the page allocator and it will have to be replaced in
>> > > the page pool.
>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
>> > also means that I have no idea what could cause this issue. I will
>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
>> > allocation mix and re-run my tests to provide updated traces.
>> Here's the updated mt76 page pool support commit:
>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
> 
> Yeah, so I don't see anything wrong with the patch in terms of page
> pool.
> 
>> And here is the trace that I'm getting with 6.1:
>> https://nbd.name/p/a16957f2
>> 
>> If you have any debug patch you'd like me to test, please let me know.
>> 
>> - Felix
> 
> So looking at the traces I am assuming what we are seeing is the
> deferred freeing from the TCP Rx path since I don't see a driver
> anywhere between net_rx_action and napi_consume skb. So it seems like
> the packets are likely making it all the way up the network stack.
> 
> Is this the first wireless driver to add support for page pool? I'm
> thinking we must be seeing something in the wireless path that is
> causing an issue such as the function you called out earlier but I
> can't see anything obvious.
Yes, it's the first driver with page pool support.

> One thing we need to be on the lookout for is cloned skbs. When an skb
> is cloned the pp_recycle gets copied over. In that case the reference
> is moved over to the skb dataref count. What comes to mind is something
> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
> fragment recycling").
I suspect that the crash might be related to a bad interaction between 
the page reuse in A-MSDU rx + skb coalescing on TCP rx.
If I change the A-MSDU code to copy data instead of reusing fragments, 
it doesn't crash anymore.
I believe the issue must be specific to that codepath, since most 
received and processed packets are either not A-MSDU or A-MSDU decap has 
already been performed by the hardware.
If I change my test to use 3 client mode interfaces instead of 4, the 
hardware is able to offload all A-MSDU rx processing and I don't see any 
crashes anymore.

Could you please take another look at ieee80211_amsdu_to_8023s to see if 
there's anything in there that could cause these issues?

- Felix


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26  6:12                         ` Felix Fietkau
@ 2023-01-26  9:14                           ` Felix Fietkau
  2023-01-26 16:08                             ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-26  9:14 UTC (permalink / raw)
  To: Alexander H Duyck, Ilias Apalodimas
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Yunsheng Lin

On 26.01.23 07:12, Felix Fietkau wrote:
> On 25.01.23 23:14, Alexander H Duyck wrote:
>> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
>>> On 25.01.23 20:10, Felix Fietkau wrote:
>>> > On 25.01.23 20:02, Alexander H Duyck wrote:
>>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
>>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
>>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>>> > > > > > > > > > > Hi Felix,
>>> > > > > > > > > > > 
>>> > > > > > > > > > > ++cc Alexander and Yunsheng.
>>> > > > > > > > > > > 
>>> > > > > > > > > > > Thanks for the report
>>> > > > > > > > > > > 
>>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>>> > > > > > > > > > > > 
>>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>>> > > > > > > > > > > > full-page page_pool allocation.
>>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
>>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
>>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>>> > > > > > > > > > > > 
>>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>>> > > > > > > > > > > > these underflow issues and crashes go away.
>>> > > > > > > > > > > > 
>>> > > > > > > > > > > 
>>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>>> > > > > > > > > > > figure out why the underflow happens?
>>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I 
>>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in 
>>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from 
>>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs 
>>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>>> > > > > > > > > > 
>>> > > > > > > > > > - Felix
>>> > > > > > > > > > 
>>> > > > > > > > > 
>>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
>>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>>> > > > > > > > > I would suspect the issue to be something like starting with a bad
>>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
>>> > > > > > > > > amount when you finally free the page assuming you are tracking your
>>> > > > > > > > > frag count using a non-atomic value in the driver.
>>> > > > > > > > The driver patches for page pool are here:
>>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>>> > > > > > > > 
>>> > > > > > > > They are also applied in my mt76 tree at:
>>> > > > > > > > https://github.com/nbd168/wireless
>>> > > > > > > > 
>>> > > > > > > > - Felix
>>> > > > > > > 
>>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
>>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
>>> > > > > > > That is the only case I can think of where we might be underflowing
>>> > > > > > > negative. If you could add some additional debug info on the underflow
>>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
>>> > > > > > > Specifically I would be curious what the actual return value is. I'm
>>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
>>> > > > > > > aren't seeing something else.
>>> > > > > > I'll try to run some more tests soon. However, I think I found the piece 
>>> > > > > > of code that is incompatible with using pp_frag_count.
>>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11 
>>> > > > > > packet), and it is not split by the hardware, a cfg80211 function 
>>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a 
>>> > > > > > fragment can be shared across multiple skbs, and get_page is used to 
>>> > > > > > increase the refcount.
>>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and 
>>> > > > > > its helper functions).
>>> > > > > 
>>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
>>> > > > > by copying over the frags, calling get_page on each frag, and then
>>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
>>> > > > > There should be other paths in the kernel that are doing something
>>> > > > > similar.
>>> > > > > 
>>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly 
>>> > > > > > allocated skb if the previous one has it, but that's a separate matter 
>>> > > > > > and fixing it doesn't make the crash go away.
>>> > > > > 
>>> > > > > Adding the recycle would cause this bug. So one thing we might be
>>> > > > > seeing is something like that triggering this error. Specifically if
>>> > > > > the page is taken via get_page when assembling the new skb then we
>>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
>>> > > > > the reference undercount we are seeing. What we are doing is shifting
>>> > > > > the references away from the pp_frag_count to the page reference count
>>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
>>> > > > > decrement pp_frag_count instead of the page reference count resulting
>>> > > > > in the underrun.
>>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case 
>>> > > > where the last user of the page drops it via page_frag_free instead of 
>>> > > > page_pool_return_skb_page? Is that valid?
>>> > > 
>>> > > No. What will happen is that when the pp_frag_count is exhausted the
>>> > > page will be unmapped and evicted from the page pool. When the page is
>>> > > then finally freed it will end up going back to the page allocator
>>> > > instead of page pool.
>>> > > 
>>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
>>> > > at least 1 page reference held.
>>> > > 
>>> > > > > > Is there any way I can make that part of the code work with the current 
>>> > > > > > page pool frag implementation?
>>> > > > > 
>>> > > > > The current code should work. Basically as long as the references are
>>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>>> > > > > into this issue because the pp_frag_count will be dropped when the
>>> > > > > original skb is freed and the page reference count will be decremented
>>> > > > > when the new one is freed.
>>> > > > > 
>>> > > > > For page pool page fragments the main thing to keep in mind is that if
>>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
>>> > > > > then it will just decrement the page reference count.
>>> > > > What takes care of DMA unmap and other cleanup if the last reference to 
>>> > > > the page is dropped via page_frag_free?
>>> > > > 
>>> > > > - Felix
>>> > > 
>>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
>>> > > page and evict it from the page pool. Basically in these cases the page
>>> > > goes from the page pool back to the page allocator.
>>> > > 
>>> > > The general idea with this is that if we are using fragments that there
>>> > > will be enough of them floating around that if one or two frags have a
>>> > > temporeary detour through a non-recycling path that hopefully by the
>>> > > time the last fragment is freed the other instances holding the
>>> > > additional page reference will have let them go. If not then the page
>>> > > will go back to the page allocator and it will have to be replaced in
>>> > > the page pool.
>>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
>>> > also means that I have no idea what could cause this issue. I will
>>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
>>> > allocation mix and re-run my tests to provide updated traces.
>>> Here's the updated mt76 page pool support commit:
>>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
>> 
>> Yeah, so I don't see anything wrong with the patch in terms of page
>> pool.
>> 
>>> And here is the trace that I'm getting with 6.1:
>>> https://nbd.name/p/a16957f2
>>> 
>>> If you have any debug patch you'd like me to test, please let me know.
>>> 
>>> - Felix
>> 
>> So looking at the traces I am assuming what we are seeing is the
>> deferred freeing from the TCP Rx path since I don't see a driver
>> anywhere between net_rx_action and napi_consume skb. So it seems like
>> the packets are likely making it all the way up the network stack.
>> 
>> Is this the first wireless driver to add support for page pool? I'm
>> thinking we must be seeing something in the wireless path that is
>> causing an issue such as the function you called out earlier but I
>> can't see anything obvious.
> Yes, it's the first driver with page pool support.
> 
>> One thing we need to be on the lookout for is cloned skbs. When an skb
>> is cloned the pp_recycle gets copied over. In that case the reference
>> is moved over to the skb dataref count. What comes to mind is something
>> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
>> fragment recycling").
> I suspect that the crash might be related to a bad interaction between
> the page reuse in A-MSDU rx + skb coalescing on TCP rx.
> If I change the A-MSDU code to copy data instead of reusing fragments,
> it doesn't crash anymore.
> I believe the issue must be specific to that codepath, since most
> received and processed packets are either not A-MSDU or A-MSDU decap has
> already been performed by the hardware.
> If I change my test to use 3 client mode interfaces instead of 4, the
> hardware is able to offload all A-MSDU rx processing and I don't see any
> crashes anymore.
> 
> Could you please take another look at ieee80211_amsdu_to_8023s to see if
> there's anything in there that could cause these issues?
Here are clues from a few more tests that I ran:
- preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does 
not prevent the crashes, so the issue is indeed related to taking page 
references and putting the pages in skb fragments.
- if I return false in skb_try_coalesce, it still crashes:
https://nbd.name/p/18cac078

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 15:57   ` Alexander H Duyck
  2023-01-24 16:59     ` Felix Fietkau
@ 2023-01-26 10:31     ` Ilias Apalodimas
  2023-01-26 15:41       ` Alexander Duyck
  1 sibling, 1 reply; 43+ messages in thread
From: Ilias Apalodimas @ 2023-01-26 10:31 UTC (permalink / raw)
  To: Alexander H Duyck
  Cc: Felix Fietkau, netdev, Jesper Dangaard Brouer, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi,
	linux-kernel, Yunsheng Lin

Hi Alexander,

Sorry for being late to the party,  was overloaded...

On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote:
> On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> > Hi Felix,
> >
> > ++cc Alexander and Yunsheng.
> >
> > Thanks for the report
> >
> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > >
> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > to reliably trigger page refcount underflow issues, which did not occur with
> > > full-page page_pool allocation.
> > > It appears to me, that handling refcounting in two separate counters
> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > incremented by code dealing with skb fragments directly, and
> > > page_pool_return_skb_page is called multiple times for the same fragment.
> > >
> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > these underflow issues and crashes go away.
> > >
> >
> > This has been discussed here [1].  TL;DR changing this to page
> > refcount might blow up in other colorful ways.  Can we look closer and
> > figure out why the underflow happens?
> >
> > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> >
> > Thanks
> > /Ilias
> >
> >
>
> The logic should be safe in terms of the page pool itself as it should
> be holding one reference to the page while the pp_frag_count is non-
> zero. That one reference is what keeps the two halfs in sync as the
> page shouldn't be able to be freed until we exhaust the pp_frag_count.

Do you remember why we decided to go with the fragment counter instead of
page references?

>
> To have an underflow there are two possible scenarios. One is that
> either put_page or free_page is being called somewhere that the
> page_pool freeing functions should be used.

Wouldn't that affect the non fragmented path as well? IOW the driver that
works with a full page would crash as well.

> The other possibility is
> that a pp_frag_count reference was taken somewhere a page reference
> should have.
>
> Do we have a backtrace for the spots that are showing this underrun? If
> nothing else we may want to look at tracking down the spots that are
> freeing the page pool pages via put_page or free_page to determine what
> paths these pages are taking.

Thanks
/Ilias

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-24 17:22   ` Felix Fietkau
  2023-01-24 21:10     ` Alexander H Duyck
@ 2023-01-26 10:32     ` Ilias Apalodimas
  1 sibling, 0 replies; 43+ messages in thread
From: Ilias Apalodimas @ 2023-01-26 10:32 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: netdev, Jesper Dangaard Brouer, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi, linux-kernel,
	Alexander Duyck, Yunsheng Lin

On Tue, Jan 24, 2023 at 06:22:54PM +0100, Felix Fietkau wrote:
> On 24.01.23 15:11, Ilias Apalodimas wrote:
> > Hi Felix,
> >
> > ++cc Alexander and Yunsheng.
> >
> > Thanks for the report
> >
> > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > >
> > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > to reliably trigger page refcount underflow issues, which did not occur with
> > > full-page page_pool allocation.
> > > It appears to me, that handling refcounting in two separate counters
> > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > incremented by code dealing with skb fragments directly, and
> > > page_pool_return_skb_page is called multiple times for the same fragment.
> > >
> > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > these underflow issues and crashes go away.
> > >
> >
> > This has been discussed here [1].  TL;DR changing this to page
> > refcount might blow up in other colorful ways.  Can we look closer and
> > figure out why the underflow happens?
> I don't see how the approch taken in my patch would blow up. From what I can
> tell, it should be fairly close to how refcount is handled in
> page_frag_alloc. The main improvement it adds is to prevent it from blowing
> up if pool-allocated fragments get shared across multiple skbs with
> corresponding get_page and page_pool_return_skb_page calls.
>
> - Felix
>

Yes sorry for the noise, that patch I referred to was doing a completely
different thing, elevating the page refcnt to BIAS_MAX from the start

Thanks
/Ilias

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26 10:31     ` Ilias Apalodimas
@ 2023-01-26 15:41       ` Alexander Duyck
  2023-01-26 16:05         ` Ilias Apalodimas
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2023-01-26 15:41 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: Felix Fietkau, netdev, Jesper Dangaard Brouer, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi,
	linux-kernel, Yunsheng Lin

On Thu, Jan 26, 2023 at 2:32 AM Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
>
> Hi Alexander,
>
> Sorry for being late to the party,  was overloaded...
>
> On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote:
> > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> > > Hi Felix,
> > >
> > > ++cc Alexander and Yunsheng.
> > >
> > > Thanks for the report
> > >
> > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > >
> > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > full-page page_pool allocation.
> > > > It appears to me, that handling refcounting in two separate counters
> > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > incremented by code dealing with skb fragments directly, and
> > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > >
> > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > these underflow issues and crashes go away.
> > > >
> > >
> > > This has been discussed here [1].  TL;DR changing this to page
> > > refcount might blow up in other colorful ways.  Can we look closer and
> > > figure out why the underflow happens?
> > >
> > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> > >
> > > Thanks
> > > /Ilias
> > >
> > >
> >
> > The logic should be safe in terms of the page pool itself as it should
> > be holding one reference to the page while the pp_frag_count is non-
> > zero. That one reference is what keeps the two halfs in sync as the
> > page shouldn't be able to be freed until we exhaust the pp_frag_count.
>
> Do you remember why we decided to go with the fragment counter instead of
> page references?

The issue has to do with when to destroy the mappings. Basically with
the fragment counter we destroy the mappings and remove the page from
the pool when the count hits 0. The reference count is really used for
the page allocator to do its tracking. If we end up trying to merge
the two the problem becomes one of lifetimes as we wouldn't know when
to destroy the DMA mappings as they would have to live the full life
of the page.

> >
> > To have an underflow there are two possible scenarios. One is that
> > either put_page or free_page is being called somewhere that the
> > page_pool freeing functions should be used.
>
> Wouldn't that affect the non fragmented path as well? IOW the driver that
> works with a full page would crash as well.

The problem is the non-fragmented path doesn't get as noisy. Also
there aren't currently any wireless drivers making use of the page
pool, or at least that is my understanding. I'm suspecting something
like the issue we saw in 1effe8ca4e34c ("skbuff: fix coalescing for
page_pool fragment recycling"). We likely have some corner case where
we should be taking a page reference and clearing a pp_recycle flag.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26 15:41       ` Alexander Duyck
@ 2023-01-26 16:05         ` Ilias Apalodimas
  0 siblings, 0 replies; 43+ messages in thread
From: Ilias Apalodimas @ 2023-01-26 16:05 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Felix Fietkau, netdev, Jesper Dangaard Brouer, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi,
	linux-kernel, Yunsheng Lin

On Thu, Jan 26, 2023 at 07:41:15AM -0800, Alexander Duyck wrote:
> On Thu, Jan 26, 2023 at 2:32 AM Ilias Apalodimas
> <ilias.apalodimas@linaro.org> wrote:
> >
> > Hi Alexander,
> >
> > Sorry for being late to the party,  was overloaded...
> >
> > On Tue, Jan 24, 2023 at 07:57:35AM -0800, Alexander H Duyck wrote:
> > > On Tue, 2023-01-24 at 16:11 +0200, Ilias Apalodimas wrote:
> > > > Hi Felix,
> > > >
> > > > ++cc Alexander and Yunsheng.
> > > >
> > > > Thanks for the report
> > > >
> > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> > > > >
> > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> > > > > to reliably trigger page refcount underflow issues, which did not occur with
> > > > > full-page page_pool allocation.
> > > > > It appears to me, that handling refcounting in two separate counters
> > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> > > > > incremented by code dealing with skb fragments directly, and
> > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> > > > >
> > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> > > > > these underflow issues and crashes go away.
> > > > >
> > > >
> > > > This has been discussed here [1].  TL;DR changing this to page
> > > > refcount might blow up in other colorful ways.  Can we look closer and
> > > > figure out why the underflow happens?
> > > >
> > > > [1] https://lore.kernel.org/netdev/1625903002-31619-4-git-send-email-linyunsheng@huawei.com/
> > > >
> > > > Thanks
> > > > /Ilias
> > > >
> > > >
> > >
> > > The logic should be safe in terms of the page pool itself as it should
> > > be holding one reference to the page while the pp_frag_count is non-
> > > zero. That one reference is what keeps the two halfs in sync as the
> > > page shouldn't be able to be freed until we exhaust the pp_frag_count.
> >
> > Do you remember why we decided to go with the fragment counter instead of
> > page references?
>
> The issue has to do with when to destroy the mappings. Basically with
> the fragment counter we destroy the mappings and remove the page from
> the pool when the count hits 0. The reference count is really used for
> the page allocator to do its tracking. If we end up trying to merge
> the two the problem becomes one of lifetimes as we wouldn't know when
> to destroy the DMA mappings as they would have to live the full life
> of the page.

Ah yes thanks! We need that on a comment somewhere,  I keep forgetting...
Basically the pp_frag_count is our number of outstanding dma mappings.

>
> > >
> > > To have an underflow there are two possible scenarios. One is that
> > > either put_page or free_page is being called somewhere that the
> > > page_pool freeing functions should be used.
> >
> > Wouldn't that affect the non fragmented path as well? IOW the driver that
> > works with a full page would crash as well.
>
> The problem is the non-fragmented path doesn't get as noisy. Also
> there aren't currently any wireless drivers making use of the page
> pool, or at least that is my understanding. I'm suspecting something
> like the issue we saw in 1effe8ca4e34c ("skbuff: fix coalescing for
> page_pool fragment recycling"). We likely have some corner case where
> we should be taking a page reference and clearing a pp_recycle flag.

Yea, same thinking here. I'll have another closer look tomorrow, but
looking at the wireless internals what happens is
1. They alloc a fragment
2. They create a new SKB, without the recycle bit and refer to the existing
fragments.  Since the recyle bit is off *that* skb will never try to
decrease the frag counter.  Instead it bumps the page refcnt which should be
properly decremented one that SKB is freed. I guess somehow an SKB ends up with
the recycle bit set, when it shouldn't.

Regards
/Ilias

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26  9:14                           ` Felix Fietkau
@ 2023-01-26 16:08                             ` Alexander Duyck
  2023-01-26 16:40                               ` Alexander Duyck
  2023-01-26 17:44                               ` Felix Fietkau
  0 siblings, 2 replies; 43+ messages in thread
From: Alexander Duyck @ 2023-01-26 16:08 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: Ilias Apalodimas, netdev, Jesper Dangaard Brouer,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Lorenzo Bianconi, linux-kernel, Yunsheng Lin

On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote:
>
> On 26.01.23 07:12, Felix Fietkau wrote:
> > On 25.01.23 23:14, Alexander H Duyck wrote:
> >> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
> >>> On 25.01.23 20:10, Felix Fietkau wrote:
> >>> > On 25.01.23 20:02, Alexander H Duyck wrote:
> >>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
> >>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
> >>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
> >>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
> >>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
> >>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
> >>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
> >>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
> >>> > > > > > > > > > > Hi Felix,
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > ++cc Alexander and Yunsheng.
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > Thanks for the report
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
> >>> > > > > > > > > > > >
> >>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
> >>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
> >>> > > > > > > > > > > > full-page page_pool allocation.
> >>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
> >>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
> >>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
> >>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
> >>> > > > > > > > > > > >
> >>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
> >>> > > > > > > > > > > > these underflow issues and crashes go away.
> >>> > > > > > > > > > > >
> >>> > > > > > > > > > >
> >>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
> >>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
> >>> > > > > > > > > > > figure out why the underflow happens?
> >>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I
> >>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in
> >>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from
> >>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs
> >>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
> >>> > > > > > > > > >
> >>> > > > > > > > > > - Felix
> >>> > > > > > > > > >
> >>> > > > > > > > >
> >>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
> >>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
> >>> > > > > > > > > I would suspect the issue to be something like starting with a bad
> >>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
> >>> > > > > > > > > amount when you finally free the page assuming you are tracking your
> >>> > > > > > > > > frag count using a non-atomic value in the driver.
> >>> > > > > > > > The driver patches for page pool are here:
> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
> >>> > > > > > > >
> >>> > > > > > > > They are also applied in my mt76 tree at:
> >>> > > > > > > > https://github.com/nbd168/wireless
> >>> > > > > > > >
> >>> > > > > > > > - Felix
> >>> > > > > > >
> >>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
> >>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
> >>> > > > > > > That is the only case I can think of where we might be underflowing
> >>> > > > > > > negative. If you could add some additional debug info on the underflow
> >>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
> >>> > > > > > > Specifically I would be curious what the actual return value is. I'm
> >>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
> >>> > > > > > > aren't seeing something else.
> >>> > > > > > I'll try to run some more tests soon. However, I think I found the piece
> >>> > > > > > of code that is incompatible with using pp_frag_count.
> >>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11
> >>> > > > > > packet), and it is not split by the hardware, a cfg80211 function
> >>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a
> >>> > > > > > fragment can be shared across multiple skbs, and get_page is used to
> >>> > > > > > increase the refcount.
> >>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and
> >>> > > > > > its helper functions).
> >>> > > > >
> >>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
> >>> > > > > by copying over the frags, calling get_page on each frag, and then
> >>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
> >>> > > > > There should be other paths in the kernel that are doing something
> >>> > > > > similar.
> >>> > > > >
> >>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly
> >>> > > > > > allocated skb if the previous one has it, but that's a separate matter
> >>> > > > > > and fixing it doesn't make the crash go away.
> >>> > > > >
> >>> > > > > Adding the recycle would cause this bug. So one thing we might be
> >>> > > > > seeing is something like that triggering this error. Specifically if
> >>> > > > > the page is taken via get_page when assembling the new skb then we
> >>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
> >>> > > > > the reference undercount we are seeing. What we are doing is shifting
> >>> > > > > the references away from the pp_frag_count to the page reference count
> >>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
> >>> > > > > decrement pp_frag_count instead of the page reference count resulting
> >>> > > > > in the underrun.
> >>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case
> >>> > > > where the last user of the page drops it via page_frag_free instead of
> >>> > > > page_pool_return_skb_page? Is that valid?
> >>> > >
> >>> > > No. What will happen is that when the pp_frag_count is exhausted the
> >>> > > page will be unmapped and evicted from the page pool. When the page is
> >>> > > then finally freed it will end up going back to the page allocator
> >>> > > instead of page pool.
> >>> > >
> >>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
> >>> > > at least 1 page reference held.
> >>> > >
> >>> > > > > > Is there any way I can make that part of the code work with the current
> >>> > > > > > page pool frag implementation?
> >>> > > > >
> >>> > > > > The current code should work. Basically as long as the references are
> >>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
> >>> > > > > into this issue because the pp_frag_count will be dropped when the
> >>> > > > > original skb is freed and the page reference count will be decremented
> >>> > > > > when the new one is freed.
> >>> > > > >
> >>> > > > > For page pool page fragments the main thing to keep in mind is that if
> >>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
> >>> > > > > then it will just decrement the page reference count.
> >>> > > > What takes care of DMA unmap and other cleanup if the last reference to
> >>> > > > the page is dropped via page_frag_free?
> >>> > > >
> >>> > > > - Felix
> >>> > >
> >>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
> >>> > > page and evict it from the page pool. Basically in these cases the page
> >>> > > goes from the page pool back to the page allocator.
> >>> > >
> >>> > > The general idea with this is that if we are using fragments that there
> >>> > > will be enough of them floating around that if one or two frags have a
> >>> > > temporeary detour through a non-recycling path that hopefully by the
> >>> > > time the last fragment is freed the other instances holding the
> >>> > > additional page reference will have let them go. If not then the page
> >>> > > will go back to the page allocator and it will have to be replaced in
> >>> > > the page pool.
> >>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
> >>> > also means that I have no idea what could cause this issue. I will
> >>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
> >>> > allocation mix and re-run my tests to provide updated traces.
> >>> Here's the updated mt76 page pool support commit:
> >>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
> >>
> >> Yeah, so I don't see anything wrong with the patch in terms of page
> >> pool.
> >>
> >>> And here is the trace that I'm getting with 6.1:
> >>> https://nbd.name/p/a16957f2
> >>>
> >>> If you have any debug patch you'd like me to test, please let me know.
> >>>
> >>> - Felix
> >>
> >> So looking at the traces I am assuming what we are seeing is the
> >> deferred freeing from the TCP Rx path since I don't see a driver
> >> anywhere between net_rx_action and napi_consume skb. So it seems like
> >> the packets are likely making it all the way up the network stack.
> >>
> >> Is this the first wireless driver to add support for page pool? I'm
> >> thinking we must be seeing something in the wireless path that is
> >> causing an issue such as the function you called out earlier but I
> >> can't see anything obvious.
> > Yes, it's the first driver with page pool support.
> >
> >> One thing we need to be on the lookout for is cloned skbs. When an skb
> >> is cloned the pp_recycle gets copied over. In that case the reference
> >> is moved over to the skb dataref count. What comes to mind is something
> >> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
> >> fragment recycling").
> > I suspect that the crash might be related to a bad interaction between
> > the page reuse in A-MSDU rx + skb coalescing on TCP rx.
> > If I change the A-MSDU code to copy data instead of reusing fragments,
> > it doesn't crash anymore.

Which piece did you change? My main interest would be trying to narrow
down the section of this function that is causing this. Did you modify
__ieee80211_amsdu_copy or some other function within the setup?

> > I believe the issue must be specific to that codepath, since most
> > received and processed packets are either not A-MSDU or A-MSDU decap has
> > already been performed by the hardware.
> > If I change my test to use 3 client mode interfaces instead of 4, the
> > hardware is able to offload all A-MSDU rx processing and I don't see any
> > crashes anymore.
> >
> > Could you please take another look at ieee80211_amsdu_to_8023s to see if
> > there's anything in there that could cause these issues?

The thing is I am not sure it is the only cause for this. I am
suspecting we are seeing this triggering an issue when combined with
something else.

If we could add some tracing to dump the skb and list buffers that
might be helpful. We would want to verify the pp_recycle value, clone
flag, and for the frags we would want to frag count and page reference
counts. The expectation would be that the original skb should have the
pp_recycle flag set and the frag counts consistent through the
process, and the list skbs should all have taken page references w/ no
pp_recycle on the skbs in the list.

> Here are clues from a few more tests that I ran:
> - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does
> not prevent the crashes, so the issue is indeed related to taking page
> references and putting the pages in skb fragments.

You said in the first email it stops it and in the second "does not".
I am assuming that is some sort of typo since you seem to be implying
it does resolve it for you. Is that correct?

> - if I return false in skb_try_coalesce, it still crashes:
> https://nbd.name/p/18cac078

Yeah, I wasn't suspecting skb_try_coalesce since we have exercised the
code. My thought was something like the function you mentioned above
plus cloning or something else.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26 16:08                             ` Alexander Duyck
@ 2023-01-26 16:40                               ` Alexander Duyck
  2023-01-26 17:44                               ` Felix Fietkau
  1 sibling, 0 replies; 43+ messages in thread
From: Alexander Duyck @ 2023-01-26 16:40 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: Ilias Apalodimas, netdev, Jesper Dangaard Brouer,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Lorenzo Bianconi, linux-kernel, Yunsheng Lin

?

On Thu, Jan 26, 2023 at 8:08 AM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote:
> >

...

> > - if I return false in skb_try_coalesce, it still crashes:
> > https://nbd.name/p/18cac078
>
> Yeah, I wasn't suspecting skb_try_coalesce since we have exercised the
> code. My thought was something like the function you mentioned above
> plus cloning or something else.

One question I would have. Is GRO happening after the call to
ieee80211_amsdu_to_8023s? If so we might want to try switching that
off to see if it might be aggregating page pool frames and non-page
pool frames together.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26 16:08                             ` Alexander Duyck
  2023-01-26 16:40                               ` Alexander Duyck
@ 2023-01-26 17:44                               ` Felix Fietkau
  2023-01-26 18:38                                 ` Alexander H Duyck
  1 sibling, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-26 17:44 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Ilias Apalodimas, netdev, Jesper Dangaard Brouer,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Lorenzo Bianconi, linux-kernel, Yunsheng Lin

On 26.01.23 17:08, Alexander Duyck wrote:
> On Thu, Jan 26, 2023 at 1:15 AM Felix Fietkau <nbd@nbd.name> wrote:
>>
>> On 26.01.23 07:12, Felix Fietkau wrote:
>> > On 25.01.23 23:14, Alexander H Duyck wrote:
>> >> On Wed, 2023-01-25 at 20:40 +0100, Felix Fietkau wrote:
>> >>> On 25.01.23 20:10, Felix Fietkau wrote:
>> >>> > On 25.01.23 20:02, Alexander H Duyck wrote:
>> >>> > > On Wed, 2023-01-25 at 19:42 +0100, Felix Fietkau wrote:
>> >>> > > > On 25.01.23 19:26, Alexander H Duyck wrote:
>> >>> > > > > On Wed, 2023-01-25 at 18:32 +0100, Felix Fietkau wrote:
>> >>> > > > > > On 25.01.23 18:11, Alexander H Duyck wrote:
>> >>> > > > > > > On Tue, 2023-01-24 at 22:30 +0100, Felix Fietkau wrote:
>> >>> > > > > > > > On 24.01.23 22:10, Alexander H Duyck wrote:
>> >>> > > > > > > > > On Tue, 2023-01-24 at 18:22 +0100, Felix Fietkau wrote:
>> >>> > > > > > > > > > On 24.01.23 15:11, Ilias Apalodimas wrote:
>> >>> > > > > > > > > > > Hi Felix,
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > ++cc Alexander and Yunsheng.
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > Thanks for the report
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > On Tue, 24 Jan 2023 at 14:43, Felix Fietkau <nbd@nbd.name> wrote:
>> >>> > > > > > > > > > > >
>> >>> > > > > > > > > > > > While testing fragmented page_pool allocation in the mt76 driver, I was able
>> >>> > > > > > > > > > > > to reliably trigger page refcount underflow issues, which did not occur with
>> >>> > > > > > > > > > > > full-page page_pool allocation.
>> >>> > > > > > > > > > > > It appears to me, that handling refcounting in two separate counters
>> >>> > > > > > > > > > > > (page->pp_frag_count and page refcount) is racy when page refcount gets
>> >>> > > > > > > > > > > > incremented by code dealing with skb fragments directly, and
>> >>> > > > > > > > > > > > page_pool_return_skb_page is called multiple times for the same fragment.
>> >>> > > > > > > > > > > >
>> >>> > > > > > > > > > > > Dropping page->pp_frag_count and relying entirely on the page refcount makes
>> >>> > > > > > > > > > > > these underflow issues and crashes go away.
>> >>> > > > > > > > > > > >
>> >>> > > > > > > > > > >
>> >>> > > > > > > > > > > This has been discussed here [1].  TL;DR changing this to page
>> >>> > > > > > > > > > > refcount might blow up in other colorful ways.  Can we look closer and
>> >>> > > > > > > > > > > figure out why the underflow happens?
>> >>> > > > > > > > > > I don't see how the approch taken in my patch would blow up. From what I
>> >>> > > > > > > > > > can tell, it should be fairly close to how refcount is handled in
>> >>> > > > > > > > > > page_frag_alloc. The main improvement it adds is to prevent it from
>> >>> > > > > > > > > > blowing up if pool-allocated fragments get shared across multiple skbs
>> >>> > > > > > > > > > with corresponding get_page and page_pool_return_skb_page calls.
>> >>> > > > > > > > > >
>> >>> > > > > > > > > > - Felix
>> >>> > > > > > > > > >
>> >>> > > > > > > > >
>> >>> > > > > > > > > Do you have the patch available to review as an RFC? From what I am
>> >>> > > > > > > > > seeing it looks like you are underrunning on the pp_frag_count itself.
>> >>> > > > > > > > > I would suspect the issue to be something like starting with a bad
>> >>> > > > > > > > > count in terms of the total number of references, or deducing the wrong
>> >>> > > > > > > > > amount when you finally free the page assuming you are tracking your
>> >>> > > > > > > > > frag count using a non-atomic value in the driver.
>> >>> > > > > > > > The driver patches for page pool are here:
>> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/64abb23f4867c075c19d704beaae5a0a2f8e8821.1673963374.git.lorenzo@kernel.org/
>> >>> > > > > > > > https://patchwork.kernel.org/project/linux-wireless/patch/68081e02cbe2afa2d35c8aa93194f0adddbd0f05.1673963374.git.lorenzo@kernel.org/
>> >>> > > > > > > >
>> >>> > > > > > > > They are also applied in my mt76 tree at:
>> >>> > > > > > > > https://github.com/nbd168/wireless
>> >>> > > > > > > >
>> >>> > > > > > > > - Felix
>> >>> > > > > > >
>> >>> > > > > > > So one thing I am thinking is that we may be seeing an issue where we
>> >>> > > > > > > are somehow getting a mix of frag and non-frag based page pool pages.
>> >>> > > > > > > That is the only case I can think of where we might be underflowing
>> >>> > > > > > > negative. If you could add some additional debug info on the underflow
>> >>> > > > > > > WARN_ON case in page_pool_defrag_page that might be useful.
>> >>> > > > > > > Specifically I would be curious what the actual return value is. I'm
>> >>> > > > > > > assuming we are only hitting negative 1, but I would want to verify we
>> >>> > > > > > > aren't seeing something else.
>> >>> > > > > > I'll try to run some more tests soon. However, I think I found the piece
>> >>> > > > > > of code that is incompatible with using pp_frag_count.
>> >>> > > > > > When receiving an A-MSDU packet (multiple MSDUs within a single 802.11
>> >>> > > > > > packet), and it is not split by the hardware, a cfg80211 function
>> >>> > > > > > extracts the individual MSDUs into separate skbs. In that case, a
>> >>> > > > > > fragment can be shared across multiple skbs, and get_page is used to
>> >>> > > > > > increase the refcount.
>> >>> > > > > > You can find this in net/wireless/util.c: ieee80211_amsdu_to_8023s (and
>> >>> > > > > > its helper functions).
>> >>> > > > >
>> >>> > > > > I'm not sure if it is problematic or not. Basically it is trading off
>> >>> > > > > by copying over the frags, calling get_page on each frag, and then
>> >>> > > > > using dev_kfree_skb to disassemble and release the pp_frag references.
>> >>> > > > > There should be other paths in the kernel that are doing something
>> >>> > > > > similar.
>> >>> > > > >
>> >>> > > > > > This code also has a bug where it doesn't set pp_recycle on the newly
>> >>> > > > > > allocated skb if the previous one has it, but that's a separate matter
>> >>> > > > > > and fixing it doesn't make the crash go away.
>> >>> > > > >
>> >>> > > > > Adding the recycle would cause this bug. So one thing we might be
>> >>> > > > > seeing is something like that triggering this error. Specifically if
>> >>> > > > > the page is taken via get_page when assembling the new skb then we
>> >>> > > > > cannot set the recycle flag in the new skb otherwise it will result in
>> >>> > > > > the reference undercount we are seeing. What we are doing is shifting
>> >>> > > > > the references away from the pp_frag_count to the page reference count
>> >>> > > > > in this case. If we set the pp_recycle flag then it would cause us to
>> >>> > > > > decrement pp_frag_count instead of the page reference count resulting
>> >>> > > > > in the underrun.
>> >>> > > > Couldn't leaving out the pp_recycle flag potentially lead to a case
>> >>> > > > where the last user of the page drops it via page_frag_free instead of
>> >>> > > > page_pool_return_skb_page? Is that valid?
>> >>> > >
>> >>> > > No. What will happen is that when the pp_frag_count is exhausted the
>> >>> > > page will be unmapped and evicted from the page pool. When the page is
>> >>> > > then finally freed it will end up going back to the page allocator
>> >>> > > instead of page pool.
>> >>> > >
>> >>> > > Basically the idea is that until pp_frag_count reaches 0 there will be
>> >>> > > at least 1 page reference held.
>> >>> > >
>> >>> > > > > > Is there any way I can make that part of the code work with the current
>> >>> > > > > > page pool frag implementation?
>> >>> > > > >
>> >>> > > > > The current code should work. Basically as long as the references are
>> >>> > > > > taken w/ get_page and skb->pp_recycle is not set then we shouldn't run
>> >>> > > > > into this issue because the pp_frag_count will be dropped when the
>> >>> > > > > original skb is freed and the page reference count will be decremented
>> >>> > > > > when the new one is freed.
>> >>> > > > >
>> >>> > > > > For page pool page fragments the main thing to keep in mind is that if
>> >>> > > > > pp_recycle is set it will update the pp_frag_count and if it is not
>> >>> > > > > then it will just decrement the page reference count.
>> >>> > > > What takes care of DMA unmap and other cleanup if the last reference to
>> >>> > > > the page is dropped via page_frag_free?
>> >>> > > >
>> >>> > > > - Felix
>> >>> > >
>> >>> > > When the page is freed on the skb w/ pp_recycle set it will unmap the
>> >>> > > page and evict it from the page pool. Basically in these cases the page
>> >>> > > goes from the page pool back to the page allocator.
>> >>> > >
>> >>> > > The general idea with this is that if we are using fragments that there
>> >>> > > will be enough of them floating around that if one or two frags have a
>> >>> > > temporeary detour through a non-recycling path that hopefully by the
>> >>> > > time the last fragment is freed the other instances holding the
>> >>> > > additional page reference will have let them go. If not then the page
>> >>> > > will go back to the page allocator and it will have to be replaced in
>> >>> > > the page pool.
>> >>> > Thanks for the explanation, it makes sense to me now. Unfortunately it
>> >>> > also means that I have no idea what could cause this issue. I will
>> >>> > finish my mt76 patch rework which gets rid of the pp vs non-pp
>> >>> > allocation mix and re-run my tests to provide updated traces.
>> >>> Here's the updated mt76 page pool support commit:
>> >>> https://github.com/nbd168/wireless/commit/923cdab6d4c92a0acb3536b3b0cc4af9fee7c808
>> >>
>> >> Yeah, so I don't see anything wrong with the patch in terms of page
>> >> pool.
>> >>
>> >>> And here is the trace that I'm getting with 6.1:
>> >>> https://nbd.name/p/a16957f2
>> >>>
>> >>> If you have any debug patch you'd like me to test, please let me know.
>> >>>
>> >>> - Felix
>> >>
>> >> So looking at the traces I am assuming what we are seeing is the
>> >> deferred freeing from the TCP Rx path since I don't see a driver
>> >> anywhere between net_rx_action and napi_consume skb. So it seems like
>> >> the packets are likely making it all the way up the network stack.
>> >>
>> >> Is this the first wireless driver to add support for page pool? I'm
>> >> thinking we must be seeing something in the wireless path that is
>> >> causing an issue such as the function you called out earlier but I
>> >> can't see anything obvious.
>> > Yes, it's the first driver with page pool support.
>> >
>> >> One thing we need to be on the lookout for is cloned skbs. When an skb
>> >> is cloned the pp_recycle gets copied over. In that case the reference
>> >> is moved over to the skb dataref count. What comes to mind is something
>> >> like commit 1effe8ca4e34c ("skbuff: fix coalescing for page_pool
>> >> fragment recycling").
>> > I suspect that the crash might be related to a bad interaction between
>> > the page reuse in A-MSDU rx + skb coalescing on TCP rx.
>> > If I change the A-MSDU code to copy data instead of reusing fragments,
>> > it doesn't crash anymore.
> 
> Which piece did you change? My main interest would be trying to narrow
> down the section of this function that is causing this. Did you modify
> __ieee80211_amsdu_copy or some other function within the setup?
I replaced this line:
   bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
with:
   bool reuse_frag = false;

>> > I believe the issue must be specific to that codepath, since most
>> > received and processed packets are either not A-MSDU or A-MSDU decap has
>> > already been performed by the hardware.
>> > If I change my test to use 3 client mode interfaces instead of 4, the
>> > hardware is able to offload all A-MSDU rx processing and I don't see any
>> > crashes anymore.
>> >
>> > Could you please take another look at ieee80211_amsdu_to_8023s to see if
>> > there's anything in there that could cause these issues?
> 
> The thing is I am not sure it is the only cause for this. I am
> suspecting we are seeing this triggering an issue when combined with
> something else.
> 
> If we could add some tracing to dump the skb and list buffers that
> might be helpful. We would want to verify the pp_recycle value, clone
> flag, and for the frags we would want to frag count and page reference
> counts. The expectation would be that the original skb should have the
> pp_recycle flag set and the frag counts consistent through the
> process, and the list skbs should all have taken page references w/ no
> pp_recycle on the skbs in the list.
> 
>> Here are clues from a few more tests that I ran:
>> - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does
>> not prevent the crashes, so the issue is indeed related to taking page
>> references and putting the pages in skb fragments.
> 
> You said in the first email it stops it and in the second "does not".
> I am assuming that is some sort of typo since you seem to be implying
> it does resolve it for you. Is that correct?
For everything except for the last subframe, the function pulls 
fragments out of the original skb and puts them into a new skb allocated 
with dev_alloc_skb. Additionally, the last skb is reused for the final 
subframe. What I meant was: In order to figure out if the skb-reuse is 
problematic, I prevented it from happening, ensuring that all subframes 
are allocated and get the fragments from the skb.
All I meant to say is that since that led to the same crash, the 
skb-reuse is not the problem.

Regarding the question from your other mail: without GRO there is no 
crash and it looks stable.

- Felix


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26 17:44                               ` Felix Fietkau
@ 2023-01-26 18:38                                 ` Alexander H Duyck
  2023-01-26 18:43                                   ` Felix Fietkau
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander H Duyck @ 2023-01-26 18:38 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: Ilias Apalodimas, netdev, Jesper Dangaard Brouer,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Lorenzo Bianconi, linux-kernel, Yunsheng Lin

> 
> > Which piece did you change? My main interest would be trying to narrow
> > down the section of this function that is causing this. Did you modify
> > __ieee80211_amsdu_copy or some other function within the setup?
> I replaced this line:
>    bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
> with:
>    bool reuse_frag = false;

I see. So essentially everything is copied into the new buffers.

> > > > I believe the issue must be specific to that codepath, since most
> > > > received and processed packets are either not A-MSDU or A-MSDU decap has
> > > > already been performed by the hardware.
> > > > If I change my test to use 3 client mode interfaces instead of 4, the
> > > > hardware is able to offload all A-MSDU rx processing and I don't see any
> > > > crashes anymore.
> > > > 
> > > > Could you please take another look at ieee80211_amsdu_to_8023s to see if
> > > > there's anything in there that could cause these issues?
> > 
> > The thing is I am not sure it is the only cause for this. I am
> > suspecting we are seeing this triggering an issue when combined with
> > something else.
> > 
> > If we could add some tracing to dump the skb and list buffers that
> > might be helpful. We would want to verify the pp_recycle value, clone
> > flag, and for the frags we would want to frag count and page reference
> > counts. The expectation would be that the original skb should have the
> > pp_recycle flag set and the frag counts consistent through the
> > process, and the list skbs should all have taken page references w/ no
> > pp_recycle on the skbs in the list.
> > 
> > > Here are clues from a few more tests that I ran:
> > > - preventing the reuse of the last skb in ieee80211_amsdu_to_8023s does
> > > not prevent the crashes, so the issue is indeed related to taking page
> > > references and putting the pages in skb fragments.
> > 
> > You said in the first email it stops it and in the second "does not".
> > I am assuming that is some sort of typo since you seem to be implying
> > it does resolve it for you. Is that correct?
> For everything except for the last subframe, the function pulls 
> fragments out of the original skb and puts them into a new skb allocated 
> with dev_alloc_skb. Additionally, the last skb is reused for the final 
> subframe. What I meant was: In order to figure out if the skb-reuse is 
> problematic, I prevented it from happening, ensuring that all subframes 
> are allocated and get the fragments from the skb.
> All I meant to say is that since that led to the same crash, the 
> skb-reuse is not the problem.
> 
> Regarding the question from your other mail: without GRO there is no 
> crash and it looks stable.
> 
> - Felix
> 

Okay, I think that tells me exactly what is going on. Can you give the
change below a try and see if it solves the problem for you.

I think what is happening is that after you are reassigning the frags
they are getting merged into GRO frames where the head may have
pp_recycle set. As a result I think the pages are getting recycled when
they should be just freed via put_page.

I'm suspecting this wasn't an issue up until now as I don't believe
there are any that are running in a mixed mode where they have both
pp_recycle and non-pp_recycle skbs coming from the same device.

diff --git a/net/core/gro.c b/net/core/gro.c
index 506f83d715f8..4bac7ea6e025 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct
sk_buff *skb)
 	struct sk_buff *lp;
 	int segs;
 
+	/* Do not splice page pool based packets w/ non-page pool
+	 * packets. This can result in reference count issues as page
+	 * pool pages will not decrement the reference count and will
+	 * instead be immediately returned to the pool or have frag
+	 * count decremented.
+	 */
+	if (p->pp_recycle != skb->pp_recycle)
+		return -ETOOMANYREFS;
+
 	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
 	gro_max_size = READ_ONCE(p->dev->gro_max_size);
 

^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH] net: page_pool: fix refcounting issues with fragmented allocation
  2023-01-26 18:38                                 ` Alexander H Duyck
@ 2023-01-26 18:43                                   ` Felix Fietkau
  2023-01-26 19:06                                     ` [net PATCH] skb: Do mix page pool and page referenced frags in GRO Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Felix Fietkau @ 2023-01-26 18:43 UTC (permalink / raw)
  To: Alexander H Duyck
  Cc: Ilias Apalodimas, netdev, Jesper Dangaard Brouer,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Lorenzo Bianconi, linux-kernel, Yunsheng Lin

On 26.01.23 19:38, Alexander H Duyck wrote:
> Okay, I think that tells me exactly what is going on. Can you give the
> change below a try and see if it solves the problem for you.
> 
> I think what is happening is that after you are reassigning the frags
> they are getting merged into GRO frames where the head may have
> pp_recycle set. As a result I think the pages are getting recycled when
> they should be just freed via put_page.
> 
> I'm suspecting this wasn't an issue up until now as I don't believe
> there are any that are running in a mixed mode where they have both
> pp_recycle and non-pp_recycle skbs coming from the same device.
> 
> diff --git a/net/core/gro.c b/net/core/gro.c
> index 506f83d715f8..4bac7ea6e025 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct
> sk_buff *skb)
>   	struct sk_buff *lp;
>   	int segs;
>   
> +	/* Do not splice page pool based packets w/ non-page pool
> +	 * packets. This can result in reference count issues as page
> +	 * pool pages will not decrement the reference count and will
> +	 * instead be immediately returned to the pool or have frag
> +	 * count decremented.
> +	 */
> +	if (p->pp_recycle != skb->pp_recycle)
> +		return -ETOOMANYREFS;
> +
>   	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
>   	gro_max_size = READ_ONCE(p->dev->gro_max_size);
>   
That works, thanks!

- Felix

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 18:43                                   ` Felix Fietkau
@ 2023-01-26 19:06                                     ` Alexander Duyck
  2023-01-26 19:14                                       ` Toke Høiland-Jørgensen
                                                         ` (3 more replies)
  0 siblings, 4 replies; 43+ messages in thread
From: Alexander Duyck @ 2023-01-26 19:06 UTC (permalink / raw)
  To: nbd
  Cc: alexander.duyck, davem, edumazet, hawk, ilias.apalodimas, kuba,
	linux-kernel, linyunsheng, lorenzo, netdev, pabeni

From: Alexander Duyck <alexanderduyck@fb.com>

GSO should not merge page pool recycled frames with standard reference
counted frames. Traditionally this didn't occur, at least not often.
However as we start looking at adding support for wireless adapters there
becomes the potential to mix the two due to A-MSDU repartitioning frames in
the receive path. There are possibly other places where this may have
occurred however I suspect they must be few and far between as we have not
seen this issue until now.

Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
Reported-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
---
 net/core/gro.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/net/core/gro.c b/net/core/gro.c
index 506f83d715f8..4bac7ea6e025 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	struct sk_buff *lp;
 	int segs;
 
+	/* Do not splice page pool based packets w/ non-page pool
+	 * packets. This can result in reference count issues as page
+	 * pool pages will not decrement the reference count and will
+	 * instead be immediately returned to the pool or have frag
+	 * count decremented.
+	 */
+	if (p->pp_recycle != skb->pp_recycle)
+		return -ETOOMANYREFS;
+
 	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
 	gro_max_size = READ_ONCE(p->dev->gro_max_size);
 



^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 19:06                                     ` [net PATCH] skb: Do mix page pool and page referenced frags in GRO Alexander Duyck
@ 2023-01-26 19:14                                       ` Toke Høiland-Jørgensen
  2023-01-26 19:48                                         ` Alexander Duyck
  2023-01-26 23:13                                       ` Jakub Kicinski
                                                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 43+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-01-26 19:14 UTC (permalink / raw)
  To: Alexander Duyck, nbd
  Cc: alexander.duyck, davem, edumazet, hawk, ilias.apalodimas, kuba,
	linux-kernel, linyunsheng, lorenzo, netdev, pabeni

Alexander Duyck <alexander.duyck@gmail.com> writes:

> From: Alexander Duyck <alexanderduyck@fb.com>
>
> GSO should not merge page pool recycled frames with standard reference
> counted frames. Traditionally this didn't occur, at least not often.
> However as we start looking at adding support for wireless adapters there
> becomes the potential to mix the two due to A-MSDU repartitioning frames in
> the receive path. There are possibly other places where this may have
> occurred however I suspect they must be few and far between as we have not
> seen this issue until now.
>
> Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
> Reported-by: Felix Fietkau <nbd@nbd.name>
> Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>

I know I'm pattern matching a bit crudely here, but we recently had
another report where doing a get_page() on skb->head didn't seem to be
enough; any chance they might be related?

See: https://lore.kernel.org/r/Y9BfknDG0LXmruDu@JNXK7M3

-Toke


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 19:14                                       ` Toke Høiland-Jørgensen
@ 2023-01-26 19:48                                         ` Alexander Duyck
  2023-01-26 21:35                                           ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 43+ messages in thread
From: Alexander Duyck @ 2023-01-26 19:48 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: nbd, davem, edumazet, hawk, ilias.apalodimas, kuba, linux-kernel,
	linyunsheng, lorenzo, netdev, pabeni

On Thu, Jan 26, 2023 at 11:14 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Alexander Duyck <alexander.duyck@gmail.com> writes:
>
> > From: Alexander Duyck <alexanderduyck@fb.com>
> >
> > GSO should not merge page pool recycled frames with standard reference
> > counted frames. Traditionally this didn't occur, at least not often.
> > However as we start looking at adding support for wireless adapters there
> > becomes the potential to mix the two due to A-MSDU repartitioning frames in
> > the receive path. There are possibly other places where this may have
> > occurred however I suspect they must be few and far between as we have not
> > seen this issue until now.
> >
> > Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
> > Reported-by: Felix Fietkau <nbd@nbd.name>
> > Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
>
> I know I'm pattern matching a bit crudely here, but we recently had
> another report where doing a get_page() on skb->head didn't seem to be
> enough; any chance they might be related?
>
> See: https://lore.kernel.org/r/Y9BfknDG0LXmruDu@JNXK7M3

Looking at it I wouldn't think so. Doing get_page() on these frames is
fine. In the case you reference it looks like get_page() is being
called on a slab allocated skb head. So somehow a slab allocated head
is leaking through.

What is causing the issue here is that after get_page() is being
called and the fragments are moved into a non-pp_recycle skb they are
then picked out and merged back into a pp_recycle skb. As a result
what is happening is that we are seeing a reference count leak from
pp_frag_count and into refcount.

This is the quick-n-dirty fix. I am debating if we want to expand this
so that we could support the case where the donor frame is pp_recycle
but the recipient is a standard reference counted frame. Fixing that
would essentially consist of having to add logic to take the reference
on all donor frags, making certain that nr_frags on the donor skb
isn't altered, and then lastly making sure that all cases use the
NAPI_GRO_FREE path to drop the page pool counts.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 19:48                                         ` Alexander Duyck
@ 2023-01-26 21:35                                           ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 43+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-01-26 21:35 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: nbd, davem, edumazet, hawk, ilias.apalodimas, kuba, linux-kernel,
	linyunsheng, lorenzo, netdev, pabeni

Alexander Duyck <alexander.duyck@gmail.com> writes:

> On Thu, Jan 26, 2023 at 11:14 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Alexander Duyck <alexander.duyck@gmail.com> writes:
>>
>> > From: Alexander Duyck <alexanderduyck@fb.com>
>> >
>> > GSO should not merge page pool recycled frames with standard reference
>> > counted frames. Traditionally this didn't occur, at least not often.
>> > However as we start looking at adding support for wireless adapters there
>> > becomes the potential to mix the two due to A-MSDU repartitioning frames in
>> > the receive path. There are possibly other places where this may have
>> > occurred however I suspect they must be few and far between as we have not
>> > seen this issue until now.
>> >
>> > Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
>> > Reported-by: Felix Fietkau <nbd@nbd.name>
>> > Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
>>
>> I know I'm pattern matching a bit crudely here, but we recently had
>> another report where doing a get_page() on skb->head didn't seem to be
>> enough; any chance they might be related?
>>
>> See: https://lore.kernel.org/r/Y9BfknDG0LXmruDu@JNXK7M3
>
> Looking at it I wouldn't think so. Doing get_page() on these frames is
> fine. In the case you reference it looks like get_page() is being
> called on a slab allocated skb head. So somehow a slab allocated head
> is leaking through.

Alright, thanks for taking a look! :)

-Toke


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 19:06                                     ` [net PATCH] skb: Do mix page pool and page referenced frags in GRO Alexander Duyck
  2023-01-26 19:14                                       ` Toke Høiland-Jørgensen
@ 2023-01-26 23:13                                       ` Jakub Kicinski
  2023-01-27  7:15                                         ` Ilias Apalodimas
                                                           ` (2 more replies)
  2023-01-28  2:37                                       ` Yunsheng Lin
  2023-01-28  7:50                                       ` patchwork-bot+netdevbpf
  3 siblings, 3 replies; 43+ messages in thread
From: Jakub Kicinski @ 2023-01-26 23:13 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: nbd, davem, edumazet, hawk, ilias.apalodimas, linux-kernel,
	linyunsheng, lorenzo, netdev, pabeni

On Thu, 26 Jan 2023 11:06:59 -0800 Alexander Duyck wrote:
> From: Alexander Duyck <alexanderduyck@fb.com>
> 
> GSO should not merge page pool recycled frames with standard reference
> counted frames. Traditionally this didn't occur, at least not often.
> However as we start looking at adding support for wireless adapters there
> becomes the potential to mix the two due to A-MSDU repartitioning frames in
> the receive path. There are possibly other places where this may have
> occurred however I suspect they must be few and far between as we have not
> seen this issue until now.
> 
> Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
> Reported-by: Felix Fietkau <nbd@nbd.name>
> Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>

Exciting investigation!
Felix, out of curiosity - the impact of loosing GRO on performance is
not significant enough to care?  We could possibly try to switch to
using the frag list if we can't merge into frags safely.

> diff --git a/net/core/gro.c b/net/core/gro.c
> index 506f83d715f8..4bac7ea6e025 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
>  	struct sk_buff *lp;
>  	int segs;
>  
> +	/* Do not splice page pool based packets w/ non-page pool
> +	 * packets. This can result in reference count issues as page
> +	 * pool pages will not decrement the reference count and will
> +	 * instead be immediately returned to the pool or have frag
> +	 * count decremented.
> +	 */
> +	if (p->pp_recycle != skb->pp_recycle)
> +		return -ETOOMANYREFS;
> 
>  	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
>  	gro_max_size = READ_ONCE(p->dev->gro_max_size);
>  
> 
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 23:13                                       ` Jakub Kicinski
@ 2023-01-27  7:15                                         ` Ilias Apalodimas
  2023-01-27  7:21                                         ` Felix Fietkau
  2023-01-30 16:49                                         ` Jesper Dangaard Brouer
  2 siblings, 0 replies; 43+ messages in thread
From: Ilias Apalodimas @ 2023-01-27  7:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Alexander Duyck, nbd, davem, edumazet, hawk, linux-kernel,
	linyunsheng, lorenzo, netdev, pabeni

Thanks Alexander!

On Fri, 27 Jan 2023 at 01:13, Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Thu, 26 Jan 2023 11:06:59 -0800 Alexander Duyck wrote:
> > From: Alexander Duyck <alexanderduyck@fb.com>
> >
> > GSO should not merge page pool recycled frames with standard reference
> > counted frames. Traditionally this didn't occur, at least not often.
> > However as we start looking at adding support for wireless adapters there
> > becomes the potential to mix the two due to A-MSDU repartitioning frames in
> > the receive path. There are possibly other places where this may have
> > occurred however I suspect they must be few and far between as we have not
> > seen this issue until now.
> >
> > Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
> > Reported-by: Felix Fietkau <nbd@nbd.name>
> > Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
>
> Exciting investigation!
> Felix, out of curiosity - the impact of loosing GRO on performance is
> not significant enough to care?  We could possibly try to switch to
> using the frag list if we can't merge into frags safely.
>
> > diff --git a/net/core/gro.c b/net/core/gro.c
> > index 506f83d715f8..4bac7ea6e025 100644
> > --- a/net/core/gro.c
> > +++ b/net/core/gro.c
> > @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> >       struct sk_buff *lp;
> >       int segs;
> >
> > +     /* Do not splice page pool based packets w/ non-page pool
> > +      * packets. This can result in reference count issues as page
> > +      * pool pages will not decrement the reference count and will
> > +      * instead be immediately returned to the pool or have frag
> > +      * count decremented.
> > +      */
> > +     if (p->pp_recycle != skb->pp_recycle)
> > +             return -ETOOMANYREFS;
> >
> >       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> >       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> >
> >
> >
>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 23:13                                       ` Jakub Kicinski
  2023-01-27  7:15                                         ` Ilias Apalodimas
@ 2023-01-27  7:21                                         ` Felix Fietkau
  2023-01-30 16:49                                         ` Jesper Dangaard Brouer
  2 siblings, 0 replies; 43+ messages in thread
From: Felix Fietkau @ 2023-01-27  7:21 UTC (permalink / raw)
  To: Jakub Kicinski, Alexander Duyck
  Cc: davem, edumazet, hawk, ilias.apalodimas, linux-kernel,
	linyunsheng, lorenzo, netdev, pabeni

On 27.01.23 00:13, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 11:06:59 -0800 Alexander Duyck wrote:
>> From: Alexander Duyck <alexanderduyck@fb.com>
>> 
>> GSO should not merge page pool recycled frames with standard reference
>> counted frames. Traditionally this didn't occur, at least not often.
>> However as we start looking at adding support for wireless adapters there
>> becomes the potential to mix the two due to A-MSDU repartitioning frames in
>> the receive path. There are possibly other places where this may have
>> occurred however I suspect they must be few and far between as we have not
>> seen this issue until now.
>> 
>> Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
>> Reported-by: Felix Fietkau <nbd@nbd.name>
>> Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
> 
> Exciting investigation!
> Felix, out of curiosity - the impact of loosing GRO on performance is
> not significant enough to care?  We could possibly try to switch to
> using the frag list if we can't merge into frags safely.
Since this only affects combining page_pool and non-page_pool packets, 
the performance loss should be neglegible.

- Felix


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 19:06                                     ` [net PATCH] skb: Do mix page pool and page referenced frags in GRO Alexander Duyck
  2023-01-26 19:14                                       ` Toke Høiland-Jørgensen
  2023-01-26 23:13                                       ` Jakub Kicinski
@ 2023-01-28  2:37                                       ` Yunsheng Lin
  2023-01-28  5:26                                         ` Jakub Kicinski
  2023-01-28  7:50                                       ` patchwork-bot+netdevbpf
  3 siblings, 1 reply; 43+ messages in thread
From: Yunsheng Lin @ 2023-01-28  2:37 UTC (permalink / raw)
  To: Alexander Duyck, nbd
  Cc: davem, edumazet, hawk, ilias.apalodimas, kuba, linux-kernel,
	lorenzo, netdev, pabeni

On 2023/1/27 3:06, Alexander Duyck wrote:
> From: Alexander Duyck <alexanderduyck@fb.com>
> 
> GSO should not merge page pool recycled frames with standard reference
> counted frames. Traditionally this didn't occur, at least not often.
> However as we start looking at adding support for wireless adapters there
> becomes the potential to mix the two due to A-MSDU repartitioning frames in
> the receive path. There are possibly other places where this may have
> occurred however I suspect they must be few and far between as we have not
> seen this issue until now.
> 
> Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
> Reported-by: Felix Fietkau <nbd@nbd.name>
> Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
> ---
>  net/core/gro.c |    9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/net/core/gro.c b/net/core/gro.c
> index 506f83d715f8..4bac7ea6e025 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
>  	struct sk_buff *lp;
>  	int segs;
>  
> +	/* Do not splice page pool based packets w/ non-page pool
> +	 * packets. This can result in reference count issues as page
> +	 * pool pages will not decrement the reference count and will
> +	 * instead be immediately returned to the pool or have frag
> +	 * count decremented.
> +	 */
> +	if (p->pp_recycle != skb->pp_recycle)
> +		return -ETOOMANYREFS;

If we are not allowing gro for the above case, setting NAPI_GRO_CB(p)->flush
to 1 in gro_list_prepare() seems to be making more sense so that the above
case has the same handling as skb_has_frag_list() handling?
https://elixir.bootlin.com/linux/v6.2-rc4/source/net/core/gro.c#L503

As it seems to avoid some unnecessary operation according to comment
in tcp4_gro_receive():
https://elixir.bootlin.com/linux/v6.2-rc4/source/net/ipv4/tcp_offload.c#L322


Also if A-MSDU is normal case for wireless adapters and we want the
performance back for the above case, maybe the driver can set
skb->pp_recycle and update the page->pp_frag_count instead of
page refcount if A-MSDU or A-MSDU decap performed by the driver
can track if the page is from page pool. In that case we may turn
the above checking to a WARN_ON() to catch any other corner-case.


> +
>  	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
>  	gro_max_size = READ_ONCE(p->dev->gro_max_size);
>  
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-28  2:37                                       ` Yunsheng Lin
@ 2023-01-28  5:26                                         ` Jakub Kicinski
  2023-01-28  7:08                                           ` Eric Dumazet
  2023-01-28  7:15                                           ` Eric Dumazet
  0 siblings, 2 replies; 43+ messages in thread
From: Jakub Kicinski @ 2023-01-28  5:26 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: Alexander Duyck, nbd, davem, edumazet, hawk, ilias.apalodimas,
	linux-kernel, lorenzo, netdev, pabeni

On Sat, 28 Jan 2023 10:37:47 +0800 Yunsheng Lin wrote:
> If we are not allowing gro for the above case, setting NAPI_GRO_CB(p)->flush
> to 1 in gro_list_prepare() seems to be making more sense so that the above
> case has the same handling as skb_has_frag_list() handling?
> https://elixir.bootlin.com/linux/v6.2-rc4/source/net/core/gro.c#L503
> 
> As it seems to avoid some unnecessary operation according to comment
> in tcp4_gro_receive():
> https://elixir.bootlin.com/linux/v6.2-rc4/source/net/ipv4/tcp_offload.c#L322

The frag_list case can be determined with just the input skb.
For pp_recycle we need to compare input skb's pp_recycle with
the pp_recycle of the skb already held by GRO.

I'll hold off with applying a bit longer tho, in case Eric
wants to chime in with an ack or opinion.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-28  5:26                                         ` Jakub Kicinski
@ 2023-01-28  7:08                                           ` Eric Dumazet
  2023-01-30  8:50                                             ` Paolo Abeni
  2023-01-28  7:15                                           ` Eric Dumazet
  1 sibling, 1 reply; 43+ messages in thread
From: Eric Dumazet @ 2023-01-28  7:08 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Yunsheng Lin, Alexander Duyck, nbd, davem, hawk,
	ilias.apalodimas, linux-kernel, lorenzo, netdev, pabeni

On Sat, Jan 28, 2023 at 6:26 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Sat, 28 Jan 2023 10:37:47 +0800 Yunsheng Lin wrote:
> > If we are not allowing gro for the above case, setting NAPI_GRO_CB(p)->flush
> > to 1 in gro_list_prepare() seems to be making more sense so that the above
> > case has the same handling as skb_has_frag_list() handling?
> > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/core/gro.c#L503
> >
> > As it seems to avoid some unnecessary operation according to comment
> > in tcp4_gro_receive():
> > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/ipv4/tcp_offload.c#L322
>
> The frag_list case can be determined with just the input skb.
> For pp_recycle we need to compare input skb's pp_recycle with
> the pp_recycle of the skb already held by GRO.
>
> I'll hold off with applying a bit longer tho, in case Eric
> wants to chime in with an ack or opinion.

We can say that we are adding in the fast path an expensive check
about an unlikely condition.

GRO is by far the most expensive component in our stack.

I would at least make the extra checks conditional to CONFIG_PAGE_POOL=y
Ideally all accesses to skb->pp_recycle should be done via a helper [1]

I hope that we will switch later to a per-page marker, instead of a per-skb one.

( a la https://www.spinics.net/lists/netdev/msg874099.html )

[1] Problem is that CONFIG_PAGE_POOL=y is now forced because of
net/bpf/test_run.c

So... testing skb->pp_recycle seems needed for the time being

Reviewed-by: Eric Dumazet <edumazet@google.com>

My tentative patch was something like:

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4c8492401a101f1d6d43079fc70962210389763c..a53b176738b10f3b69b38c487e0c280f44990b6f
100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -918,8 +918,10 @@ struct sk_buff {
                                fclone:2,
                                peeked:1,
                                head_frag:1,
-                               pfmemalloc:1,
-                               pp_recycle:1; /* page_pool recycle indicator */
+#ifdef CONFIG_PAGE_POOL
+                               pp_recycle:1, /* page_pool recycle indicator */
+#endif
+                               pfmemalloc:1;
 #ifdef CONFIG_SKB_EXTENSIONS
        __u8                    active_extensions;
 #endif
@@ -3388,6 +3390,15 @@ static inline void __skb_frag_unref(skb_frag_t
*frag, bool recycle)
        put_page(page);
 }

+static inline bool skb_pp_recycle(const struct sk_buff *skb)
+{
+#ifdef CONFIG_PAGE_POOL
+       return skb->pp_recycle;
+#else
+       return false;
+#endif
+}
+
 /**
  * skb_frag_unref - release a reference on a paged fragment of an skb.
  * @skb: the buffer
@@ -3400,7 +3411,7 @@ static inline void skb_frag_unref(struct sk_buff
*skb, int f)
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        if (!skb_zcopy_managed(skb))
-               __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
+               __skb_frag_unref(&shinfo->frags[f], skb_pp_recycle(skb));
 }

 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 180df58e85c72eaa16f5cb56b56d181a379b8921..7a2783a2c9608eec728a0adacea4619ab1c62791
100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -801,19 +801,13 @@ static void skb_clone_fraglist(struct sk_buff *skb)
                skb_get(list);
 }

-static bool skb_pp_recycle(struct sk_buff *skb, void *data)
-{
-       if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
-               return false;
-       return page_pool_return_skb_page(virt_to_page(data));
-}
-
 static void skb_free_head(struct sk_buff *skb)
 {
        unsigned char *head = skb->head;

        if (skb->head_frag) {
-               if (skb_pp_recycle(skb, head))
+               if (skb_pp_recycle(skb) &&
+                   page_pool_return_skb_page(virt_to_page(head)))
                        return;
                skb_free_frag(head);
        } else {
@@ -840,7 +834,7 @@ static void skb_release_data(struct sk_buff *skb,
enum skb_drop_reason reason)
        }

        for (i = 0; i < shinfo->nr_frags; i++)
-               __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+               __skb_frag_unref(&shinfo->frags[i], skb_pp_recycle(skb));

 free_head:
        if (shinfo->frag_list)
@@ -857,7 +851,10 @@ static void skb_release_data(struct sk_buff *skb,
enum skb_drop_reason reason)
         * Eventually the last SKB will have the recycling bit set and it's
         * dataref set to 0, which will trigger the recycling
         */
+#ifdef CONFIG_PAGE_POOL
        skb->pp_recycle = 0;
+#endif
+       return;
 }

 /*
@@ -1292,7 +1289,9 @@ static struct sk_buff *__skb_clone(struct
sk_buff *n, struct sk_buff *skb)
        n->nohdr = 0;
        n->peeked = 0;
        C(pfmemalloc);
+#ifdef CONFIG_PAGE_POOL
        C(pp_recycle);
+#endif
        n->destructor = NULL;
        C(tail);
        C(end);
@@ -3859,7 +3858,7 @@ int skb_shift(struct sk_buff *tgt, struct
sk_buff *skb, int shiftlen)
                fragto = &skb_shinfo(tgt)->frags[merge];

                skb_frag_size_add(fragto, skb_frag_size(fragfrom));
-               __skb_frag_unref(fragfrom, skb->pp_recycle);
+               __skb_frag_unref(fragfrom, skb_pp_recycle(skb));
        }

        /* Reposition in the original skb */
@@ -5529,7 +5528,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct
sk_buff *from,
         * references for cloned SKBs at the moment that would result in
         * inconsistent reference counts.
         */
-       if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from)))
+       if (skb_pp_recycle(to) != (skb_pp_recycle(from) && !skb_cloned(from)))
                return false;

        if (len <= skb_tailroom(to)) {

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-28  5:26                                         ` Jakub Kicinski
  2023-01-28  7:08                                           ` Eric Dumazet
@ 2023-01-28  7:15                                           ` Eric Dumazet
  2023-01-28 17:08                                             ` Alexander Duyck
  1 sibling, 1 reply; 43+ messages in thread
From: Eric Dumazet @ 2023-01-28  7:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Yunsheng Lin, Alexander Duyck, nbd, davem, hawk,
	ilias.apalodimas, linux-kernel, lorenzo, netdev, pabeni

On Sat, Jan 28, 2023 at 6:26 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Sat, 28 Jan 2023 10:37:47 +0800 Yunsheng Lin wrote:
> > If we are not allowing gro for the above case, setting NAPI_GRO_CB(p)->flush
> > to 1 in gro_list_prepare() seems to be making more sense so that the above
> > case has the same handling as skb_has_frag_list() handling?
> > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/core/gro.c#L503
> >
> > As it seems to avoid some unnecessary operation according to comment
> > in tcp4_gro_receive():
> > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/ipv4/tcp_offload.c#L322
>
> The frag_list case can be determined with just the input skb.
> For pp_recycle we need to compare input skb's pp_recycle with
> the pp_recycle of the skb already held by GRO.
>
> I'll hold off with applying a bit longer tho, in case Eric
> wants to chime in with an ack or opinion.

Doing the test only if the final step (once all headers have been
verified) seems less costly
for the vast majority of the cases the driver cooks skbs with a
consistent pp_recycle bit ?

So Alex patch seems less expensive to me than adding the check very early.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 19:06                                     ` [net PATCH] skb: Do mix page pool and page referenced frags in GRO Alexander Duyck
                                                         ` (2 preceding siblings ...)
  2023-01-28  2:37                                       ` Yunsheng Lin
@ 2023-01-28  7:50                                       ` patchwork-bot+netdevbpf
  3 siblings, 0 replies; 43+ messages in thread
From: patchwork-bot+netdevbpf @ 2023-01-28  7:50 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: nbd, davem, edumazet, hawk, ilias.apalodimas, kuba, linux-kernel,
	linyunsheng, lorenzo, netdev, pabeni

Hello:

This patch was applied to netdev/net.git (master)
by Jakub Kicinski <kuba@kernel.org>:

On Thu, 26 Jan 2023 11:06:59 -0800 you wrote:
> From: Alexander Duyck <alexanderduyck@fb.com>
> 
> GSO should not merge page pool recycled frames with standard reference
> counted frames. Traditionally this didn't occur, at least not often.
> However as we start looking at adding support for wireless adapters there
> becomes the potential to mix the two due to A-MSDU repartitioning frames in
> the receive path. There are possibly other places where this may have
> occurred however I suspect they must be few and far between as we have not
> seen this issue until now.
> 
> [...]

Here is the summary with links:
  - [net] skb: Do mix page pool and page referenced frags in GRO
    https://git.kernel.org/netdev/net/c/7d2c89b32587

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-28  7:15                                           ` Eric Dumazet
@ 2023-01-28 17:08                                             ` Alexander Duyck
  0 siblings, 0 replies; 43+ messages in thread
From: Alexander Duyck @ 2023-01-28 17:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jakub Kicinski, Yunsheng Lin, nbd, davem, hawk, ilias.apalodimas,
	linux-kernel, lorenzo, netdev, pabeni

On Fri, Jan 27, 2023 at 11:16 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sat, Jan 28, 2023 at 6:26 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Sat, 28 Jan 2023 10:37:47 +0800 Yunsheng Lin wrote:
> > > If we are not allowing gro for the above case, setting NAPI_GRO_CB(p)->flush
> > > to 1 in gro_list_prepare() seems to be making more sense so that the above
> > > case has the same handling as skb_has_frag_list() handling?
> > > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/core/gro.c#L503
> > >
> > > As it seems to avoid some unnecessary operation according to comment
> > > in tcp4_gro_receive():
> > > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/ipv4/tcp_offload.c#L322
> >
> > The frag_list case can be determined with just the input skb.
> > For pp_recycle we need to compare input skb's pp_recycle with
> > the pp_recycle of the skb already held by GRO.
> >
> > I'll hold off with applying a bit longer tho, in case Eric
> > wants to chime in with an ack or opinion.
>
> Doing the test only if the final step (once all headers have been
> verified) seems less costly
> for the vast majority of the cases the driver cooks skbs with a
> consistent pp_recycle bit ?
>
> So Alex patch seems less expensive to me than adding the check very early.

That was the general idea. Basically there is no need to look into
this until we are looking at merging the skb and it is very unlikely
that we will see a mix of page pool and non-page pool skbs. I
considered this check to be something equivalent to discovering there
is no space in the skb to store the frags so that is one of the
reasons why I had picked the spot that I did.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-28  7:08                                           ` Eric Dumazet
@ 2023-01-30  8:50                                             ` Paolo Abeni
  2023-01-30 16:17                                               ` Alexander Duyck
  0 siblings, 1 reply; 43+ messages in thread
From: Paolo Abeni @ 2023-01-30  8:50 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski
  Cc: Yunsheng Lin, Alexander Duyck, nbd, davem, hawk,
	ilias.apalodimas, linux-kernel, lorenzo, netdev

On Sat, 2023-01-28 at 08:08 +0100, Eric Dumazet wrote:
> On Sat, Jan 28, 2023 at 6:26 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > 
> > On Sat, 28 Jan 2023 10:37:47 +0800 Yunsheng Lin wrote:
> > > If we are not allowing gro for the above case, setting NAPI_GRO_CB(p)->flush
> > > to 1 in gro_list_prepare() seems to be making more sense so that the above
> > > case has the same handling as skb_has_frag_list() handling?
> > > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/core/gro.c#L503
> > > 
> > > As it seems to avoid some unnecessary operation according to comment
> > > in tcp4_gro_receive():
> > > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/ipv4/tcp_offload.c#L322
> > 
> > The frag_list case can be determined with just the input skb.
> > For pp_recycle we need to compare input skb's pp_recycle with
> > the pp_recycle of the skb already held by GRO.
> > 
> > I'll hold off with applying a bit longer tho, in case Eric
> > wants to chime in with an ack or opinion.
> 
> We can say that we are adding in the fast path an expensive check
> about an unlikely condition.
> 
> GRO is by far the most expensive component in our stack.

Slightly related to the above: currently the GRO engine performs the
skb metadata check for every packet. My understanding is that even with
XDP enabled and ebpf running on the given packet, the skb should 
usually have meta_len == 0. 

What about setting 'skb->slow_gro' together with meta_len and moving
the skb_metadata_differs() check under the slow_gro guard?

Cheers,

Paolo


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-30  8:50                                             ` Paolo Abeni
@ 2023-01-30 16:17                                               ` Alexander Duyck
  0 siblings, 0 replies; 43+ messages in thread
From: Alexander Duyck @ 2023-01-30 16:17 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Eric Dumazet, Jakub Kicinski, Yunsheng Lin, nbd, davem, hawk,
	ilias.apalodimas, linux-kernel, lorenzo, netdev

On Mon, Jan 30, 2023 at 12:50 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Sat, 2023-01-28 at 08:08 +0100, Eric Dumazet wrote:
> > On Sat, Jan 28, 2023 at 6:26 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > >
> > > On Sat, 28 Jan 2023 10:37:47 +0800 Yunsheng Lin wrote:
> > > > If we are not allowing gro for the above case, setting NAPI_GRO_CB(p)->flush
> > > > to 1 in gro_list_prepare() seems to be making more sense so that the above
> > > > case has the same handling as skb_has_frag_list() handling?
> > > > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/core/gro.c#L503
> > > >
> > > > As it seems to avoid some unnecessary operation according to comment
> > > > in tcp4_gro_receive():
> > > > https://elixir.bootlin.com/linux/v6.2-rc4/source/net/ipv4/tcp_offload.c#L322
> > >
> > > The frag_list case can be determined with just the input skb.
> > > For pp_recycle we need to compare input skb's pp_recycle with
> > > the pp_recycle of the skb already held by GRO.
> > >
> > > I'll hold off with applying a bit longer tho, in case Eric
> > > wants to chime in with an ack or opinion.
> >
> > We can say that we are adding in the fast path an expensive check
> > about an unlikely condition.
> >
> > GRO is by far the most expensive component in our stack.
>
> Slightly related to the above: currently the GRO engine performs the
> skb metadata check for every packet. My understanding is that even with
> XDP enabled and ebpf running on the given packet, the skb should
> usually have meta_len == 0.
>
> What about setting 'skb->slow_gro' together with meta_len and moving
> the skb_metadata_differs() check under the slow_gro guard?

Makes sense to me, especially since we have to do a pointer chase to
get the metadata length out of the shared info.

Looking at the code one thing I was wondering about is if we should be
flagging frames where one is slow_gro and one is not as having a diff
and just skipping the checks since we know the slow_gro checks are
expensive and if they differ based on that flag odds are one will have
a field present that the other doesn't.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [net PATCH] skb: Do mix page pool and page referenced frags in GRO
  2023-01-26 23:13                                       ` Jakub Kicinski
  2023-01-27  7:15                                         ` Ilias Apalodimas
  2023-01-27  7:21                                         ` Felix Fietkau
@ 2023-01-30 16:49                                         ` Jesper Dangaard Brouer
  2 siblings, 0 replies; 43+ messages in thread
From: Jesper Dangaard Brouer @ 2023-01-30 16:49 UTC (permalink / raw)
  To: Jakub Kicinski, Alexander Duyck
  Cc: brouer, nbd, davem, edumazet, hawk, ilias.apalodimas,
	linux-kernel, linyunsheng, lorenzo, netdev, pabeni



On 27/01/2023 00.13, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 11:06:59 -0800 Alexander Duyck wrote:
>> From: Alexander Duyck<alexanderduyck@fb.com>
>>
>> GSO should not merge page pool recycled frames with standard reference
>> counted frames. Traditionally this didn't occur, at least not often.
>> However as we start looking at adding support for wireless adapters there
>> becomes the potential to mix the two due to A-MSDU repartitioning frames in
>> the receive path. There are possibly other places where this may have
>> occurred however I suspect they must be few and far between as we have not
>> seen this issue until now.
>>
>> Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool")
>> Reported-by: Felix Fietkau<nbd@nbd.name>
>> Signed-off-by: Alexander Duyck<alexanderduyck@fb.com>
> Exciting investigation!
> Felix, out of curiosity - the impact of loosing GRO on performance is
> not significant enough to care?  We could possibly try to switch to
> using the frag list if we can't merge into frags safely.

Using the frag list sounds scary, because we recently learned that
kfree_skb_list requires all SKBs on the list to have same refcnt (else
the walking of the list can lead to other bugs).

--Jesper


^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2023-01-30 16:50 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-24 12:43 [PATCH] net: page_pool: fix refcounting issues with fragmented allocation Felix Fietkau
2023-01-24 14:11 ` Ilias Apalodimas
2023-01-24 15:57   ` Alexander H Duyck
2023-01-24 16:59     ` Felix Fietkau
2023-01-26 10:31     ` Ilias Apalodimas
2023-01-26 15:41       ` Alexander Duyck
2023-01-26 16:05         ` Ilias Apalodimas
2023-01-24 17:22   ` Felix Fietkau
2023-01-24 21:10     ` Alexander H Duyck
2023-01-24 21:30       ` Felix Fietkau
2023-01-25 17:11         ` Alexander H Duyck
2023-01-25 17:32           ` Felix Fietkau
2023-01-25 18:26             ` Alexander H Duyck
2023-01-25 18:42               ` Felix Fietkau
2023-01-25 19:02                 ` Alexander H Duyck
2023-01-25 19:10                   ` Felix Fietkau
2023-01-25 19:40                     ` Felix Fietkau
2023-01-25 20:02                       ` Felix Fietkau
2023-01-25 22:14                       ` Alexander H Duyck
2023-01-26  6:12                         ` Felix Fietkau
2023-01-26  9:14                           ` Felix Fietkau
2023-01-26 16:08                             ` Alexander Duyck
2023-01-26 16:40                               ` Alexander Duyck
2023-01-26 17:44                               ` Felix Fietkau
2023-01-26 18:38                                 ` Alexander H Duyck
2023-01-26 18:43                                   ` Felix Fietkau
2023-01-26 19:06                                     ` [net PATCH] skb: Do mix page pool and page referenced frags in GRO Alexander Duyck
2023-01-26 19:14                                       ` Toke Høiland-Jørgensen
2023-01-26 19:48                                         ` Alexander Duyck
2023-01-26 21:35                                           ` Toke Høiland-Jørgensen
2023-01-26 23:13                                       ` Jakub Kicinski
2023-01-27  7:15                                         ` Ilias Apalodimas
2023-01-27  7:21                                         ` Felix Fietkau
2023-01-30 16:49                                         ` Jesper Dangaard Brouer
2023-01-28  2:37                                       ` Yunsheng Lin
2023-01-28  5:26                                         ` Jakub Kicinski
2023-01-28  7:08                                           ` Eric Dumazet
2023-01-30  8:50                                             ` Paolo Abeni
2023-01-30 16:17                                               ` Alexander Duyck
2023-01-28  7:15                                           ` Eric Dumazet
2023-01-28 17:08                                             ` Alexander Duyck
2023-01-28  7:50                                       ` patchwork-bot+netdevbpf
2023-01-26 10:32     ` [PATCH] net: page_pool: fix refcounting issues with fragmented allocation Ilias Apalodimas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).