io-uring.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: David Wei <dw@davidwei.uk>
To: Jens Axboe <axboe@kernel.dk>, Pavel Begunkov <asml.silence@gmail.com>
Cc: io-uring@vger.kernel.org, netdev@vger.kernel.org,
	Mina Almasry <almasrymina@google.com>,
	Jakub Kicinski <kuba@kernel.org>
Subject: [PATCH 05/11] io_uring: add ZC buf and pool
Date: Fri, 25 Aug 2023 18:19:48 -0700	[thread overview]
Message-ID: <20230826011954.1801099-6-dw@davidwei.uk> (raw)
In-Reply-To: <20230826011954.1801099-1-dw@davidwei.uk>

From: David Wei <davidhwei@meta.com>

This patch adds two objects:

* Zero copy buffer representation, holding a page, its mapped dma_addr,
  and a refcount for lifetime management.
* Zero copy pool, spiritually similar to page pool, that holds ZC bufs
  and hands them out to net devices.

The ZC pool is tiered with currently two tiers: a fast lockless cache
that should only be accessed from the NAPI context of a single RX queue,
and a freelist. When a ZC pool region is first mapped, it is added to
the freelist.

During normal operation, bufs are moved from the freelist into the cache
in POOL_CACHE_SIZE blocks before being given out.

Pool regions are registered with io_uring using the registered buffer API,
with a 1:1 mapping between region and nr_iovec in
io_uring_register_buffers. This does the heavy lifting of pinning the
pages and chunking them into bvecs in a struct io_mapped_ubuf for us.

For now, as there is only one pool region per ifq, there is no separate
API for adding/removing regions yet; the region is mapped implicitly
during ifq registration.

Signed-off-by: David Wei <davidhwei@meta.com>
Co-developed-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 include/linux/io_uring.h |   6 ++
 io_uring/zc_rx.c         | 173 ++++++++++++++++++++++++++++++++++++++-
 io_uring/zc_rx.h         |   1 +
 3 files changed, 179 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 7fe31b2cd02f..cf1993befa6a 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -41,6 +41,12 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
 	return sqe->cmd;
 }
 
+struct io_zc_rx_buf {
+	dma_addr_t	dma;		/* DMA address the page is mapped at */
+	struct page	*page;		/* backing page of this buffer */
+	atomic_t	refcount;	/* lifetime refcount of the buffer */
+};
+
 #if defined(CONFIG_IO_URING)
 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd);
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 8cc66731af5b..317127d0d4e7 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -4,13 +4,43 @@
 #include <linux/mm.h>
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
+#include <linux/nospec.h>
 
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
 #include "kbuf.h"
+#include "rsrc.h"
 #include "zc_rx.h"
 
+#define POOL_CACHE_SIZE	128
+
+struct io_zc_rx_pool {
+	struct io_zc_rx_ifq  	*ifq;		/* ifq this pool is attached to */
+	struct io_zc_rx_buf	*bufs;		/* array of nr_pages buffers */
+	u16			pool_id;	/* index of the backing registered buffer */
+	u32			nr_pages;	/* number of bvecs (pages) in the region */
+
+	/* fast cache: lockless, only for the NAPI context of a single RX queue */
+	u32			cache_count;	/* presumably valid entries in cache[] */
+	u32			cache[POOL_CACHE_SIZE];
+
+	/* freelist: every buffer of a newly mapped region starts out here */
+	spinlock_t		freelist_lock;	/* presumably guards the two fields below */
+	u32			free_count;	/* entries on the freelist */
+	u32			freelist[];	/* indices into bufs[], one slot per page */
+};
+
+static struct device *netdev2dev(struct net_device *dev)
+{
+	return dev->dev.parent;	/* parent device; callers hand it to the DMA API */
+}
+
+static u64 mk_page_info(u16 pool_id, u32 pgid)
+{
+	return ((u64)0xface << 48) | ((u64)pool_id << 32) | pgid; /* tag:16|pool:16|pgid:32 */
+}
+
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
 static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq,
@@ -40,6 +70,143 @@ static int io_close_zc_rxq(struct io_zc_rx_ifq *ifq)
 	return __io_queue_mgmt(ifq->dev, NULL, ifq->if_rxq_id);
 }
 
+static int io_zc_rx_map_buf(struct device *dev, struct page *page, u16 pool_id,
+			    u32 pgid, struct io_zc_rx_buf *buf)
+{
+	dma_addr_t addr;
+
+	SetPagePrivate(page);	/* tag the page with its (pool_id, pgid) cookie */
+	set_page_private(page, mk_page_info(pool_id, pgid));
+	/* Map one full page for DMA; on failure undo the tagging and bail out. */
+	addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
+				  DMA_BIDIRECTIONAL,
+				  DMA_ATTR_SKIP_CPU_SYNC);
+	if (dma_mapping_error(dev, addr)) {
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		return -ENOMEM;
+	}
+	/* Success: fill in @buf and take a page reference (dropped on unmap). */
+	buf->dma = addr;
+	buf->page = page;
+	atomic_set(&buf->refcount, 0);
+	get_page(page);
+
+	return 0;
+}
+
+static void io_zc_rx_unmap_buf(struct device *dev, struct io_zc_rx_buf *buf)
+{
+	struct page *page;
+
+	page = buf->page;
+	set_page_private(page, 0);	/* clear the cookie set by io_zc_rx_map_buf() */
+	ClearPagePrivate(page);
+	dma_unmap_page_attrs(dev, buf->dma, PAGE_SIZE,
+			     DMA_BIDIRECTIONAL,
+			     DMA_ATTR_SKIP_CPU_SYNC);
+	put_page(page);			/* drop the reference taken when mapping */
+}
+
+static int io_zc_rx_map_pool(struct io_zc_rx_pool *pool,
+			     struct io_mapped_ubuf *imu,
+			     struct device *dev)
+{
+	struct io_zc_rx_buf *buf;
+	struct page *page;
+	int i, ret;
+
+	for (i = 0; i < imu->nr_bvecs; i++) {	/* map each page of the region */
+		page = imu->bvec[i].bv_page;
+		if (PagePrivate(page)) {	/* page private is already in use */
+			ret = -EEXIST;
+			goto err;
+		}
+		/* NOTE(review): reads pool->pool_id, the caller must set it first */
+		buf = &pool->bufs[i];
+		ret = io_zc_rx_map_buf(dev, page, pool->pool_id, i, buf);
+		if (ret)
+			goto err;
+		/* every buffer starts out on the freelist, identified by index */
+		pool->freelist[i] = i;
+	}
+
+	return 0;
+err:
+	while (i--) {	/* unwind only the buffers mapped before the failure */
+		buf = &pool->bufs[i];
+		io_zc_rx_unmap_buf(dev, buf);
+	}
+
+	return ret;
+}
+
+/*
+ * Build @ifq's ZC pool from registered buffer @id; returns 0 or -errno.
+ */
+int io_zc_rx_create_pool(struct io_ring_ctx *ctx, struct io_zc_rx_ifq *ifq,
+			 u16 id)
+{
+	struct device *dev = netdev2dev(ifq->dev);
+	struct io_mapped_ubuf *imu;
+	struct io_zc_rx_pool *pool;
+	int nr_pages, ret;
+
+	if (ifq->pool)		/* only one pool per ifq */
+		return -EFAULT;
+	if (unlikely(id >= ctx->nr_user_bufs))
+		return -EFAULT;
+	id = array_index_nospec(id, ctx->nr_user_bufs);
+	imu = ctx->user_bufs[id];
+	if (imu->ubuf & ~PAGE_MASK || imu->ubuf_end & ~PAGE_MASK)
+		return -EFAULT;	/* region must start and end page aligned */
+
+	ret = -ENOMEM;
+	nr_pages = imu->nr_bvecs;
+	pool = kvmalloc(struct_size(pool, freelist, nr_pages), GFP_KERNEL);
+	if (!pool)
+		return ret;
+
+	pool->bufs = kvmalloc_array(nr_pages, sizeof(*pool->bufs), GFP_KERNEL);
+	if (!pool->bufs)
+		goto err_buf;
+
+	/* kvmalloc() does not zero and map_pool reads pool_id: init before mapping */
+	pool->ifq = ifq;
+	pool->pool_id = id;
+	pool->nr_pages = nr_pages;
+	pool->cache_count = 0;
+	spin_lock_init(&pool->freelist_lock);
+	pool->free_count = nr_pages;
+
+	ret = io_zc_rx_map_pool(pool, imu, dev);
+	if (ret)
+		goto err_map;
+
+	ifq->pool = pool;	/* publish only once fully mapped */
+	return 0;
+
+err_map:
+	kvfree(pool->bufs);
+err_buf:
+	kvfree(pool);
+	return ret;
+}
+
+/* Tear down a pool: unmap every buffer, then free the arrays and the pool. */
+static void io_zc_rx_destroy_pool(struct io_zc_rx_pool *pool)
+{
+	struct device *dev = netdev2dev(pool->ifq->dev);
+	u32 idx;
+
+	/* walk all nr_pages buffers, undoing io_zc_rx_map_buf() on each */
+	for (idx = 0; idx < pool->nr_pages; idx++)
+		io_zc_rx_unmap_buf(dev, &pool->bufs[idx]);
+
+	kvfree(pool->bufs);
+	kvfree(pool);
+}
+
 static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_zc_rx_ifq *ifq;
@@ -58,6 +225,8 @@ static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
 {
 	if (ifq->if_rxq_id != -1)
 		io_close_zc_rxq(ifq);
+	if (ifq->pool)
+		io_zc_rx_destroy_pool(ifq->pool);
 	if (ifq->dev)
 		dev_put(ifq->dev);
 	io_free_rbuf_ring(ifq);
@@ -92,7 +261,9 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 	if (!ifq->dev)
 		goto err;
 
-	/* TODO: map zc region and initialise zc pool */
+	ret = io_zc_rx_create_pool(ctx, ifq, reg.region_id);
+	if (ret)
+		goto err;
 
 	ifq->rq_entries = reg.rq_entries;
 	ifq->cq_entries = reg.cq_entries;
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 340ececa9f9c..3cd0e730115d 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -18,5 +18,6 @@ struct io_zc_rx_ifq {
 int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zc_rx_ifq_reg __user *arg);
 int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx);
+int io_zc_rx_create_pool(struct io_ring_ctx *ctx, struct io_zc_rx_ifq *ifq, u16 id);
 
 #endif
-- 
2.39.3


  parent reply	other threads:[~2023-08-26  1:22 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-26  1:19 [RFC RESEND 00/11] Zero copy network RX using io_uring David Wei
2023-08-26  1:19 ` [PATCH 01/11] io_uring: add interface queue David Wei
2023-08-26  1:19 ` [PATCH 02/11] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-08-26  1:19 ` [PATCH 03/11] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-08-26  2:21   ` David Ahern
2023-08-26 21:37     ` David Wei
2023-08-26  1:19 ` [PATCH 04/11] io_uring: setup ZC for an RX queue when registering an ifq David Wei
2023-08-26  2:26   ` David Ahern
2023-08-26 22:00     ` David Wei
2023-08-26  1:19 ` David Wei [this message]
2023-08-26  1:19 ` [PATCH 06/11] io_uring: add ZC pool API David Wei
2023-08-26  1:19 ` [PATCH 07/11] skbuff: add SKBFL_FIXED_FRAG and skb_fixed() David Wei
2023-08-26  1:19 ` [PATCH 08/11] io_uring: allocate a uarg for freeing zero copy skbs David Wei
2023-08-26  1:19 ` [PATCH 09/11] io_uring: delay ZC pool destruction David Wei
2023-08-26  1:19 ` [PATCH 10/11] netdev/bnxt: add data pool and use it in BNXT driver David Wei
2023-08-26  1:19 ` [PATCH 11/11] io_uring: add io_recvzc request David Wei
2023-10-22 19:06 ` [RFC RESEND 00/11] Zero copy network RX using io_uring Gal Pressman
2023-10-23  3:35   ` David Wei
  -- strict thread matches above, loose matches on Subject: below --
2023-08-25 22:55 [RFC PATCH " David Wei
2023-08-25 22:55 ` [PATCH 05/11] io_uring: add ZC buf and pool David Wei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230826011954.1801099-6-dw@davidwei.uk \
    --to=dw@davidwei.uk \
    --cc=almasrymina@google.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).