From: Mina Almasry <almasrymina@google.com>
To: Shailend Chand <shailend@google.com>,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org, linux-arch@vger.kernel.org,
linux-kselftest@vger.kernel.org, bpf@vger.kernel.org,
linux-media@vger.kernel.org, dri-devel@lists.freedesktop.org
Cc: "Mina Almasry" <almasrymina@google.com>,
"David S. Miller" <davem@davemloft.net>,
"Eric Dumazet" <edumazet@google.com>,
"Jakub Kicinski" <kuba@kernel.org>,
"Paolo Abeni" <pabeni@redhat.com>,
"Jonathan Corbet" <corbet@lwn.net>,
"Jeroen de Borst" <jeroendb@google.com>,
"Praveen Kaligineedi" <pkaligineedi@google.com>,
"Jesper Dangaard Brouer" <hawk@kernel.org>,
"Ilias Apalodimas" <ilias.apalodimas@linaro.org>,
"Arnd Bergmann" <arnd@arndb.de>,
"David Ahern" <dsahern@kernel.org>,
"Willem de Bruijn" <willemdebruijn.kernel@gmail.com>,
"Shuah Khan" <shuah@kernel.org>,
"Sumit Semwal" <sumit.semwal@linaro.org>,
"Christian König" <christian.koenig@amd.com>,
"Yunsheng Lin" <linyunsheng@huawei.com>,
"Harshitha Ramamurthy" <hramamurthy@google.com>,
"Shakeel Butt" <shakeelb@google.com>
Subject: [net-next v1 09/16] page_pool: device memory support
Date: Thu, 7 Dec 2023 16:52:40 -0800 [thread overview]
Message-ID: <20231208005250.2910004-10-almasrymina@google.com> (raw)
In-Reply-To: <20231208005250.2910004-1-almasrymina@google.com>
Overload the LSB of struct page* to indicate that it's a page_pool_iov.
Refactor mm calls on struct page* into helpers, and add page_pool_iov
handling to those helpers. Convert callers of these mm APIs to call
these helpers instead.
In areas where struct page* is dereferenced, add a check for special
handling of page_pool_iov.
Signed-off-by: Mina Almasry <almasrymina@google.com>
---
v1:
- Disable fragmentation support for iov properly.
- fix napi_pp_put_page() path (Yunsheng).
---
include/net/page_pool/helpers.h | 78 ++++++++++++++++++++++++++++++++-
net/core/page_pool.c | 67 ++++++++++++++++++++--------
net/core/skbuff.c | 28 +++++++-----
3 files changed, 141 insertions(+), 32 deletions(-)
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 00197f14aa87..2d4e0a2c5620 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -154,6 +154,64 @@ static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
return NULL;
}
+static inline int page_pool_page_ref_count(struct page *page)
+{
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_refcount(page_to_page_pool_iov(page));
+
+ return page_ref_count(page);
+}
+
+static inline void page_pool_page_get_many(struct page *page,
+ unsigned int count)
+{
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_get_many(page_to_page_pool_iov(page),
+ count);
+
+ return page_ref_add(page, count);
+}
+
+static inline void page_pool_page_put_many(struct page *page,
+ unsigned int count)
+{
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_put_many(page_to_page_pool_iov(page),
+ count);
+
+ if (count > 1)
+ page_ref_sub(page, count - 1);
+
+ put_page(page);
+}
+
+static inline bool page_pool_page_is_pfmemalloc(struct page *page)
+{
+ if (page_is_page_pool_iov(page))
+ return false;
+
+ return page_is_pfmemalloc(page);
+}
+
+static inline bool page_pool_page_is_pref_nid(struct page *page, int pref_nid)
+{
+ /* Assume page_pool_iov are on the preferred node without actually
+ * checking...
+ *
+ * This check is only used to check for recycling memory in the page
+ * pool's fast paths. Currently the only implementation of page_pool_iov
+ * is dmabuf device memory. It's a deliberate decision by the user to
+ * bind a certain dmabuf to a certain netdev, and the netdev rx queue
+ * would not be able to reallocate memory from another dmabuf that
+ * exists on the preferred node, so, this check doesn't make much sense
+ * in this case. Assume all page_pool_iovs can be recycled for now.
+ */
+ if (page_is_page_pool_iov(page))
+ return true;
+
+ return page_to_nid(page) == pref_nid;
+}
+
/**
* page_pool_dev_alloc_pages() - allocate a page.
* @pool: pool from which to allocate
@@ -304,6 +362,10 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
{
long ret;
+ /* fragmentation support hasn't been added to ppiov yet */
+ if (WARN_ON_ONCE(page_is_page_pool_iov(page)))
+ return 0;
+
/* If nr == pp_frag_count then we have cleared all remaining
* references to the page:
* 1. 'n == 1': no need to actually overwrite it.
@@ -347,7 +409,8 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
static inline bool page_pool_is_last_frag(struct page *page)
{
/* If page_pool_defrag_page() returns 0, we were the last user */
- return page_pool_defrag_page(page, 1) == 0;
+ return page_is_page_pool_iov(page) ||
+ page_pool_defrag_page(page, 1) == 0;
}
/**
@@ -434,7 +497,12 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va,
*/
static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
{
- dma_addr_t ret = page->dma_addr;
+ dma_addr_t ret;
+
+ if (page_is_page_pool_iov(page))
+ return page_pool_iov_dma_addr(page_to_page_pool_iov(page));
+
+ ret = page->dma_addr;
if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
ret <<= PAGE_SHIFT;
@@ -444,6 +512,12 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
{
+ /* page_pool_iovs are mapped and their dma-addr can't be modified. */
+ if (page_is_page_pool_iov(page)) {
+ DEBUG_NET_WARN_ON_ONCE(true);
+ return false;
+ }
+
if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
page->dma_addr = addr >> PAGE_SHIFT;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 423c88564a00..f0148d66371b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -346,7 +346,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
if (unlikely(!page))
break;
- if (likely(page_to_nid(page) == pref_nid)) {
+ if (likely(page_pool_page_is_pref_nid(page, pref_nid))) {
pool->alloc.cache[pool->alloc.count++] = page;
} else {
/* NUMA mismatch;
@@ -391,7 +391,15 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
struct page *page,
unsigned int dma_sync_size)
{
- dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+ dma_addr_t dma_addr;
+
+ /* page_pool_iov memory providers do not support PP_FLAG_DMA_SYNC_DEV */
+ if (page_is_page_pool_iov(page)) {
+ DEBUG_NET_WARN_ON_ONCE(true);
+ return;
+ }
+
+ dma_addr = page_pool_get_dma_addr(page);
dma_sync_size = min(dma_sync_size, pool->p.max_len);
dma_sync_single_range_for_device(pool->p.dev, dma_addr,
@@ -403,6 +411,12 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
+ if (page_is_page_pool_iov(page)) {
+ /* page_pool_iovs are already mapped */
+ DEBUG_NET_WARN_ON_ONCE(true);
+ return true;
+ }
+
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
* into page private data (i.e 32bit cpu with 64bit DMA caps)
@@ -434,22 +448,33 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
static void page_pool_set_pp_info(struct page_pool *pool,
struct page *page)
{
- page->pp = pool;
- page->pp_magic |= PP_SIGNATURE;
-
- /* Ensuring all pages have been split into one fragment initially:
- * page_pool_set_pp_info() is only called once for every page when it
- * is allocated from the page allocator and page_pool_fragment_page()
- * is dirtying the same cache line as the page->pp_magic above, so
- * the overhead is negligible.
- */
- page_pool_fragment_page(page, 1);
+ if (!page_is_page_pool_iov(page)) {
+ page->pp = pool;
+ page->pp_magic |= PP_SIGNATURE;
+
+ /* Ensuring all pages have been split into one fragment
+ * initially:
+ * page_pool_set_pp_info() is only called once for every page
+ * when it is allocated from the page allocator and
+ * page_pool_fragment_page() is dirtying the same cache line as
+ * the page->pp_magic above, so the overhead is negligible.
+ */
+ page_pool_fragment_page(page, 1);
+ } else {
+ page_to_page_pool_iov(page)->pp = pool;
+ }
+
if (pool->has_init_callback)
pool->slow.init_callback(page, pool->slow.init_arg);
}
static void page_pool_clear_pp_info(struct page *page)
{
+ if (page_is_page_pool_iov(page)) {
+ page_to_page_pool_iov(page)->pp = NULL;
+ return;
+ }
+
page->pp_magic = 0;
page->pp = NULL;
}
@@ -664,7 +689,7 @@ static bool page_pool_recycle_in_cache(struct page *page,
return false;
}
- /* Caller MUST have verified/know (page_ref_count(page) == 1) */
+ /* Caller MUST have verified/know (page_pool_page_ref_count(page) == 1) */
pool->alloc.cache[pool->alloc.count++] = page;
recycle_stat_inc(pool, cached);
return true;
@@ -689,9 +714,10 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* refcnt == 1 means page_pool owns page, and can recycle it.
*
* page is NOT reusable when allocated when system is under
- * some pressure. (page_is_pfmemalloc)
+ * some pressure. (page_pool_page_is_pfmemalloc)
*/
- if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
+ if (likely(page_pool_page_ref_count(page) == 1 &&
+ !page_pool_page_is_pfmemalloc(page))) {
/* Read barrier done in page_ref_count / READ_ONCE */
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
@@ -806,7 +832,8 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
if (likely(page_pool_defrag_page(page, drain_count)))
return NULL;
- if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
+ if (page_pool_page_ref_count(page) == 1 &&
+ !page_pool_page_is_pfmemalloc(page)) {
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page, -1);
@@ -840,6 +867,10 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
if (WARN_ON(size > max_size))
return NULL;
+ /* page_pool_iovs don't currently support fragmentation */
+ if (WARN_ON_ONCE(pool->mp_ops == &dmabuf_devmem_ops))
+ return NULL;
+
size = ALIGN(size, dma_get_cache_alignment());
*offset = pool->frag_offset;
@@ -882,9 +913,9 @@ static void page_pool_empty_ring(struct page_pool *pool)
/* Empty recycle ring */
while ((page = ptr_ring_consume_bh(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
- if (!(page_ref_count(page) == 1))
+ if (!(page_pool_page_ref_count(page) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
- __func__, page_ref_count(page));
+ __func__, page_pool_page_ref_count(page));
page_pool_return_page(pool, page);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b157efea5dea..07f802f1adf1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -896,19 +896,23 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
bool allow_direct = false;
struct page_pool *pp;
- page = compound_head(page);
-
- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
- return false;
+ if (!page_is_page_pool_iov(page)) {
+ page = compound_head(page);
+
+ /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+ * in order to preserve any existing bits, such as bit 0 for the
+ * head page of compound page and bit 1 for pfmemalloc page, so
+ * mask those bits on the freeing side when doing the check below,
+ * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+ * to avoid recycling the pfmemalloc page.
+ */
+ if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ return false;
- pp = page->pp;
+ pp = page->pp;
+ } else {
+ pp = page_to_page_pool_iov(page)->pp;
+ }
/* Allow direct recycle if we have reasons to believe that we are
* in the same context as the consumer would run, so there's
--
2.43.0.472.g3155946c3a-goog
next prev parent reply other threads:[~2023-12-08 0:53 UTC|newest]
Thread overview: 75+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-12-08 0:52 [net-next v1 00/16] Device Memory TCP Mina Almasry
2023-12-08 0:52 ` [net-next v1 01/16] net: page_pool: factor out releasing DMA from releasing the page Mina Almasry
2023-12-10 3:49 ` Shakeel Butt
2023-12-12 8:11 ` Ilias Apalodimas
2023-12-08 0:52 ` [net-next v1 02/16] net: page_pool: create hooks for custom page providers Mina Almasry
2023-12-12 8:07 ` Ilias Apalodimas
2023-12-12 14:47 ` Mina Almasry
2023-12-08 0:52 ` [net-next v1 03/16] queue_api: define queue api Mina Almasry
2023-12-14 1:15 ` Jakub Kicinski
2023-12-08 0:52 ` [net-next v1 04/16] gve: implement " Mina Almasry
2024-03-05 11:45 ` Arnd Bergmann
2023-12-08 0:52 ` [net-next v1 05/16] net: netdev netlink api to bind dma-buf to a net device Mina Almasry
2023-12-14 1:17 ` Jakub Kicinski
2023-12-08 0:52 ` [net-next v1 06/16] netdev: support binding dma-buf to netdevice Mina Almasry
2023-12-08 15:40 ` kernel test robot
2023-12-08 16:02 ` kernel test robot
2023-12-08 17:48 ` David Ahern
2023-12-08 19:22 ` Mina Almasry
2023-12-08 20:32 ` Mina Almasry
2023-12-09 23:29 ` David Ahern
2023-12-11 2:19 ` Mina Almasry
2023-12-08 0:52 ` [net-next v1 07/16] netdev: netdevice devmem allocator Mina Almasry
2023-12-08 17:56 ` David Ahern
2023-12-08 19:27 ` Mina Almasry
2023-12-08 0:52 ` [net-next v1 08/16] memory-provider: dmabuf devmem memory provider Mina Almasry
2023-12-08 22:48 ` Pavel Begunkov
2023-12-08 23:25 ` Mina Almasry
2023-12-10 3:03 ` Pavel Begunkov
2023-12-11 2:30 ` Mina Almasry
2023-12-11 20:35 ` Pavel Begunkov
2023-12-14 20:03 ` Mina Almasry
2023-12-19 23:55 ` Pavel Begunkov
2023-12-08 23:05 ` Pavel Begunkov
2023-12-12 12:25 ` Jason Gunthorpe
2023-12-12 13:07 ` Christoph Hellwig
2023-12-12 14:26 ` Mina Almasry
2023-12-12 14:39 ` Jason Gunthorpe
2023-12-12 14:58 ` Mina Almasry
2023-12-12 15:08 ` Jason Gunthorpe
2023-12-13 1:09 ` Mina Almasry
2023-12-13 2:19 ` David Ahern
2023-12-13 7:49 ` Yinjun Zhang
2023-12-08 0:52 ` Mina Almasry [this message]
2023-12-08 9:30 ` [net-next v1 09/16] page_pool: device memory support Yunsheng Lin
2023-12-08 16:05 ` Mina Almasry
2023-12-11 2:04 ` Yunsheng Lin
2023-12-11 2:26 ` Mina Almasry
2023-12-11 4:04 ` Mina Almasry
2023-12-11 11:51 ` Yunsheng Lin
2023-12-11 18:14 ` Mina Almasry
2023-12-12 11:17 ` Yunsheng Lin
2023-12-12 14:28 ` Mina Almasry
2023-12-13 11:48 ` Yunsheng Lin
2023-12-13 7:52 ` Mina Almasry
2023-12-08 0:52 ` [net-next v1 10/16] page_pool: don't release iov on elevanted refcount Mina Almasry
2023-12-08 0:52 ` [net-next v1 11/16] net: support non paged skb frags Mina Almasry
2023-12-08 0:52 ` [net-next v1 12/16] net: add support for skbs with unreadable frags Mina Almasry
2023-12-08 0:52 ` [net-next v1 13/16] tcp: RX path for devmem TCP Mina Almasry
2023-12-08 15:40 ` kernel test robot
2023-12-08 17:55 ` David Ahern
2023-12-08 19:23 ` Mina Almasry
2023-12-08 0:52 ` [net-next v1 14/16] net: add SO_DEVMEM_DONTNEED setsockopt to release RX frags Mina Almasry
2023-12-12 19:08 ` Simon Horman
2023-12-08 0:52 ` [net-next v1 15/16] net: add devmem TCP documentation Mina Almasry
2023-12-12 19:14 ` Simon Horman
2023-12-08 0:52 ` [net-next v1 16/16] selftests: add ncdevmem, netcat for devmem TCP Mina Almasry
2023-12-08 1:47 ` [net-next v1 00/16] Device Memory TCP Mina Almasry
2023-12-08 17:57 ` David Ahern
2023-12-08 19:31 ` Mina Almasry
2023-12-10 3:48 ` Shakeel Butt
2023-12-12 5:58 ` Christoph Hellwig
2023-12-14 6:20 ` patchwork-bot+netdevbpf
2023-12-14 6:48 ` Christoph Hellwig
2023-12-14 6:51 ` Mina Almasry
2023-12-14 6:59 ` Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20231208005250.2910004-10-almasrymina@google.com \
--to=almasrymina@google.com \
--cc=arnd@arndb.de \
--cc=bpf@vger.kernel.org \
--cc=christian.koenig@amd.com \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=dri-devel@lists.freedesktop.org \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=hawk@kernel.org \
--cc=hramamurthy@google.com \
--cc=ilias.apalodimas@linaro.org \
--cc=jeroendb@google.com \
--cc=kuba@kernel.org \
--cc=linux-arch@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-media@vger.kernel.org \
--cc=linyunsheng@huawei.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=pkaligineedi@google.com \
--cc=shailend@google.com \
--cc=shakeelb@google.com \
--cc=shuah@kernel.org \
--cc=sumit.semwal@linaro.org \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).