linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Yunsheng Lin <linyunsheng@huawei.com>
To: <davem@davemloft.net>, <kuba@kernel.org>
Cc: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<linuxarm@openeuler.org>, <hawk@kernel.org>,
	<ilias.apalodimas@linaro.org>, <jonathan.lemon@gmail.com>,
	<alobakin@pm.me>, <willemb@google.com>, <cong.wang@bytedance.com>,
	<pabeni@redhat.com>, <haokexin@gmail.com>, <nogikh@google.com>,
	<elver@google.com>, <memxor@gmail.com>, <edumazet@google.com>,
	<alexander.duyck@gmail.com>, <dsahern@gmail.com>
Subject: [PATCH net-next v2 3/3] skbuff: keep track of pp page when __skb_frag_ref() is called
Date: Tue, 14 Sep 2021 20:11:14 +0800	[thread overview]
Message-ID: <20210914121114.28559-4-linyunsheng@huawei.com> (raw)
In-Reply-To: <20210914121114.28559-1-linyunsheng@huawei.com>

As the skb->pp_recycle and page->pp_magic may not be enough
to track if a frag page is from page pool after the calling
of __skb_frag_ref(), mostly because of a data race, see:
commit 2cc3aeb5eccc ("skbuff: Fix a potential race while
recycling page_pool packets").

There may be clone and expand head case that might lose the
track if a frag page is from page pool or not.

And not being able to keep track of pp page may cause problem
for the skb_split() case in tso_fragment() too:
Supposing a skb has 3 frag pages, all coming from a page pool,
and is split to skb1 and skb2:
skb1: first frag page + first half of second frag page
skb2: second half of second frag page + third frag page

How do we set the skb->pp_recycle of skb1 and skb2?
1. If we set both of them to 1, then we may have a similar
   race as the above commit for second frag page.
2. If we set only one of them to 1, then we may have resource
   leaking problem as both first frag page and third frag page
   are indeed from page pool.

So increment the frag count when __skb_frag_ref() is called,
and only use page->pp_magic to indicate if a frag page is from
page pool, to avoid the above data race.

For 32 bit systems with 64 bit dma, we preserve the orginial
behavior as frag count is used to trace how many time does a
frag page is called with __skb_frag_ref().

We still use both skb->pp_recycle and page->pp_magic to decide
the head page for a skb is from page pool or not.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/skbuff.h  | 40 ++++++++++++++++++++++++++++++++++++----
 include/net/page_pool.h | 28 +++++++++++++++++++++++++++-
 net/core/page_pool.c    | 16 ++--------------
 3 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 35eebc2310a5..4d975ab27078 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3073,7 +3073,16 @@ static inline struct page *skb_frag_page(const skb_frag_t *frag)
  */
 static inline void __skb_frag_ref(skb_frag_t *frag)
 {
-	get_page(skb_frag_page(frag));
+	struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+	if (page_pool_is_pp_page(page)) {
+		page_pool_atomic_inc_frag_count(page);
+		return;
+	}
+#endif
+
+	get_page(page);
 }
 
 /**
@@ -3088,6 +3097,22 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+/**
+ * skb_frag_is_pp_page - decide if a page is recyclable.
+ * @page: frag page
+ * @recycle: skb->pp_recycle
+ *
+ * For 32 bit systems with 64 bit dma, the skb->pp_recycle is
+ * also used to decide if a page can be recycled to the page
+ * pool.
+ */
+static inline bool skb_frag_is_pp_page(struct page *page,
+				       bool recycle)
+{
+	return page_pool_is_pp_page(page) ||
+		(recycle && __page_pool_is_pp_page(page));
+}
+
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3101,8 +3126,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 	struct page *page = skb_frag_page(frag);
 
 #ifdef CONFIG_PAGE_POOL
-	if (recycle && page_pool_return_skb_page(page))
+	if (skb_frag_is_pp_page(page, recycle)) {
+		page_pool_return_skb_page(page);
 		return;
+	}
 #endif
 	put_page(page);
 }
@@ -4720,9 +4747,14 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
 
 static inline bool skb_pp_recycle(struct sk_buff *skb, void *data)
 {
-	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
+	struct page *page = virt_to_head_page(data);
+
+	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle ||
+	    !__page_pool_is_pp_page(page))
 		return false;
-	return page_pool_return_skb_page(virt_to_head_page(data));
+
+	page_pool_return_skb_page(page);
+	return true;
 }
 
 #endif	/* __KERNEL__ */
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 2ad0706566c5..eb103d86f453 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -164,7 +164,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 	return pool->p.dma_dir;
 }
 
-bool page_pool_return_skb_page(struct page *page);
+void page_pool_return_skb_page(struct page *page);
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
@@ -244,6 +244,32 @@ static inline void page_pool_set_frag_count(struct page *page, long nr)
 	atomic_long_set(&page->pp_frag_count, nr);
 }
 
+static inline void page_pool_atomic_inc_frag_count(struct page *page)
+{
+	atomic_long_inc(&page->pp_frag_count);
+}
+
+static inline bool __page_pool_is_pp_page(struct page *page)
+{
+	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+	 * in order to preserve any existing bits, such as bit 0 for the
+	 * head page of compound page and bit 1 for pfmemalloc page, so
+	 * mask those bits for freeing side when doing below checking,
+	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+	 * to avoid recycling the pfmemalloc page.
+	 */
+	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
+}
+
+static inline bool page_pool_is_pp_page(struct page *page)
+{
+	/* For systems with the same dma addr as the bus addr, we can use
+	 * page->pp_magic to indicate a pp page uniquely.
+	 */
+	return !PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
+			__page_pool_is_pp_page(page);
+}
+
 static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
 							  long nr)
 {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 09d7b8614ef5..3a419871d4bc 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -24,7 +24,7 @@
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
 
-#define BIAS_MAX	LONG_MAX
+#define BIAS_MAX	(LONG_MAX / 2)
 
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
@@ -736,20 +736,10 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 }
 EXPORT_SYMBOL(page_pool_update_nid);
 
-bool page_pool_return_skb_page(struct page *page)
+void page_pool_return_skb_page(struct page *page)
 {
 	struct page_pool *pp;
 
-	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
-	 * in order to preserve any existing bits, such as bit 0 for the
-	 * head page of compound page and bit 1 for pfmemalloc page, so
-	 * mask those bits for freeing side when doing below checking,
-	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
-	 * to avoid recycling the pfmemalloc page.
-	 */
-	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
-		return false;
-
 	pp = page->pp;
 
 	/* Driver set this to memory recycling info. Reset it on recycle.
@@ -758,7 +748,5 @@ bool page_pool_return_skb_page(struct page *page)
 	 * 'flipped' fragment being in use or not.
 	 */
 	page_pool_put_full_page(pp, page, false);
-
-	return true;
 }
 EXPORT_SYMBOL(page_pool_return_skb_page);
-- 
2.33.0


  parent reply	other threads:[~2021-09-14 12:12 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-14 12:11 [PATCH net-next v2 0/3] some optimization for page pool Yunsheng Lin
2021-09-14 12:11 ` [PATCH net-next v2 1/3] page_pool: support non-split page with PP_FLAG_PAGE_FRAG Yunsheng Lin
2021-09-14 12:11 ` [PATCH net-next v2 2/3] pool_pool: avoid calling compound_head() for skb frag page Yunsheng Lin
2021-09-14 12:11 ` Yunsheng Lin [this message]
2021-09-15  0:59   ` [PATCH net-next v2 3/3] skbuff: keep track of pp page when __skb_frag_ref() is called Alexander Duyck
2021-09-15  9:07     ` Yunsheng Lin
2021-09-15 12:56       ` Ilias Apalodimas
2021-09-15 15:42         ` Jesper Dangaard Brouer
2021-09-16  2:05           ` [Linuxarm] " Yunsheng Lin
2021-09-16  8:44             ` Ilias Apalodimas
2021-09-16  9:33               ` Yunsheng Lin
2021-09-16 10:38                 ` Ilias Apalodimas
2021-09-16 11:04                   ` Yunsheng Lin
2021-09-16 11:21                     ` Yunsheng Lin
2021-09-16 11:57                     ` [Linuxarm] " Ilias Apalodimas
2021-09-17  3:57                       ` Yunsheng Lin
2021-09-17  6:38                         ` Ilias Apalodimas
2021-09-17  9:17                           ` Yunsheng Lin
2021-09-17 15:01                             ` Ilias Apalodimas
2021-09-18  1:43                               ` Yunsheng Lin
2021-09-18  9:23                                 ` Ilias Apalodimas
2021-09-22  3:38                                   ` Yunsheng Lin
2021-09-17 17:15             ` [Linuxarm] " Eric Dumazet
2021-09-18  2:42               ` Yunsheng Lin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210914121114.28559-4-linyunsheng@huawei.com \
    --to=linyunsheng@huawei.com \
    --cc=alexander.duyck@gmail.com \
    --cc=alobakin@pm.me \
    --cc=cong.wang@bytedance.com \
    --cc=davem@davemloft.net \
    --cc=dsahern@gmail.com \
    --cc=edumazet@google.com \
    --cc=elver@google.com \
    --cc=haokexin@gmail.com \
    --cc=hawk@kernel.org \
    --cc=ilias.apalodimas@linaro.org \
    --cc=jonathan.lemon@gmail.com \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linuxarm@openeuler.org \
    --cc=memxor@gmail.com \
    --cc=netdev@vger.kernel.org \
    --cc=nogikh@google.com \
    --cc=pabeni@redhat.com \
    --cc=willemb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).