All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yunjian Wang <wangyunjian@huawei.com>
To: <mst@redhat.com>, <willemdebruijn.kernel@gmail.com>,
	<jasowang@redhat.com>, <kuba@kernel.org>, <bjorn@kernel.org>,
	<magnus.karlsson@intel.com>, <maciej.fijalkowski@intel.com>,
	<jonathan.lemon@gmail.com>, <davem@davemloft.net>
Cc: <bpf@vger.kernel.org>, <netdev@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>, <kvm@vger.kernel.org>,
	<virtualization@lists.linux.dev>, <xudingke@huawei.com>,
	<liwei395@huawei.com>, Yunjian Wang <wangyunjian@huawei.com>
Subject: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
Date: Wed, 28 Feb 2024 19:05:56 +0800	[thread overview]
Message-ID: <1709118356-133960-1-git-send-email-wangyunjian@huawei.com> (raw)

This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
which can significantly reduce CPU utilization for XDP programs.

Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
ring has been utilized to queue different types of pointers by encoding
the type into the lower bits. Therefore, we introduce a new flag,
TUN_XDP_DESC_FLAG(0x2UL), which allows us to enqueue XDP descriptors
and differentiate them from XDP buffers and sk_buffs. Additionally, a
spin lock is added for enabling and disabling operations on the xsk pool.

The performance testing was performed on an Intel E5-2620 2.40GHz machine.
Traffic was generated and sent through TUN (testpmd txonly with AF_XDP)
to a VM (testpmd rxonly in guest).

+------+---------+---------+---------+
|      |   copy  |zero-copy| speedup |
+------+---------+---------+---------+
| UDP  |   Mpps  |   Mpps  |    %    |
| 64   |   2.5   |   4.0   |   60%   |
| 512  |   2.1   |   3.6   |   71%   |
| 1024 |   1.9   |   3.3   |   73%   |
+------+---------+---------+---------+

Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
---
 drivers/net/tun.c      | 177 +++++++++++++++++++++++++++++++++++++++--
 drivers/vhost/net.c    |   4 +
 include/linux/if_tun.h |  32 ++++++++
 3 files changed, 208 insertions(+), 5 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bc80fc1d576e..7f4ff50b532c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -63,6 +63,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <net/xdp.h>
+#include <net/xdp_sock_drv.h>
 #include <net/ip_tunnels.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
@@ -86,6 +87,7 @@ static void tun_default_link_ksettings(struct net_device *dev,
 				       struct ethtool_link_ksettings *cmd);
 
 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_XDP_BATCH 64
 
 /* TUN device flags */
 
@@ -146,6 +148,9 @@ struct tun_file {
 	struct tun_struct *detached;
 	struct ptr_ring tx_ring;
 	struct xdp_rxq_info xdp_rxq;
+	struct xsk_buff_pool *xsk_pool;
+	spinlock_t pool_lock;	/* Protects xsk pool enable/disable */
+	u32 nb_descs;
 };
 
 struct tun_page {
@@ -614,6 +619,8 @@ void tun_ptr_free(void *ptr)
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
 		xdp_return_frame(xdpf);
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		return;
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -631,6 +638,37 @@ static void tun_queue_purge(struct tun_file *tfile)
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
+static void tun_set_xsk_pool(struct tun_file *tfile, struct xsk_buff_pool *pool)
+{
+	/* A NULL pool is ignored; otherwise bind it under pool_lock. */
+	if (pool) {
+		spin_lock(&tfile->pool_lock);
+		xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
+		tfile->xsk_pool = pool;
+		spin_unlock(&tfile->pool_lock);
+	}
+}
+
+static void tun_clean_xsk_pool(struct tun_file *tfile)
+{
+	void *entry;
+
+	spin_lock(&tfile->pool_lock);
+	if (!tfile->xsk_pool)	/* nothing bound, nothing to undo */
+		goto out_unlock;
+	while ((entry = ptr_ring_consume(&tfile->tx_ring)) != NULL)
+		tun_ptr_free(entry);
+	if (tfile->nb_descs) {	/* complete the batch still outstanding */
+		xsk_tx_completed(tfile->xsk_pool, tfile->nb_descs);
+		if (xsk_uses_need_wakeup(tfile->xsk_pool))
+			xsk_set_tx_need_wakeup(tfile->xsk_pool);
+		tfile->nb_descs = 0;
+	}
+	tfile->xsk_pool = NULL;
+out_unlock:
+	spin_unlock(&tfile->pool_lock);
+}
+
 static void __tun_detach(struct tun_file *tfile, bool clean)
 {
 	struct tun_file *ntfile;
@@ -648,6 +686,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		u16 index = tfile->queue_index;
 		BUG_ON(index >= tun->numqueues);
 
+		ntfile = rtnl_dereference(tun->tfiles[tun->numqueues - 1]);
+		/* Stop xsk zc xmit */
+		tun_clean_xsk_pool(tfile);
+		tun_clean_xsk_pool(ntfile);
+
 		rcu_assign_pointer(tun->tfiles[index],
 				   tun->tfiles[tun->numqueues - 1]);
 		ntfile = rtnl_dereference(tun->tfiles[index]);
@@ -668,6 +711,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
+		tun_set_xsk_pool(ntfile, xsk_get_pool_from_qid(tun->dev, index));
 		tun_set_real_num_queues(tun);
 	} else if (tfile->detached && clean) {
 		tun = tun_enable_queue(tfile);
@@ -801,6 +845,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 
 		if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
 			tfile->xdp_rxq.queue_index = tfile->queue_index;
+		tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev, tfile->queue_index));
 	} else {
 		/* Setup XDP RX-queue info, for new tfile getting attached */
 		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
@@ -1221,11 +1266,50 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	return 0;
 }
 
+/* Bind @pool to queue @qid; -EINVAL when @qid >= tun->numqueues. */
+static int tun_xsk_pool_enable(struct net_device *netdev,
+			       struct xsk_buff_pool *pool, u16 qid)
+{
+	struct tun_struct *priv = netdev_priv(netdev);
+	struct tun_file *queue;
+
+	if (qid < priv->numqueues) {
+		queue = rtnl_dereference(priv->tfiles[qid]);
+		tun_set_xsk_pool(queue, pool);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Unbind the xsk pool of queue @qid when a tfile sits in that slot. */
+static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
+{
+	struct tun_struct *priv = netdev_priv(netdev);
+	struct tun_file *queue;
+
+	if (qid >= MAX_TAP_QUEUES)
+		return -EINVAL;
+	queue = rtnl_dereference(priv->tfiles[qid]);
+	if (queue)
+		tun_clean_xsk_pool(queue);
+	return 0;
+}
+
+static int tun_xsk_pool_setup(struct net_device *dev,
+			      struct xsk_buff_pool *pool, u16 qid)
+{
+	/* A NULL pool selects the disable path. */
+	return !pool ? tun_xsk_pool_disable(dev, qid) : tun_xsk_pool_enable(dev, pool, qid);
+}
+
 static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return tun_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_XSK_POOL:
+		return tun_xsk_pool_setup(dev, xdp->xsk.pool, xdp->xsk.queue_id);
 	default:
 		return -EINVAL;
 	}
@@ -1330,6 +1414,28 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	return nxmit;
 }
 
+/* ndo_xsk_wakeup hook: flush the tfile attached at queue @qid, if any.
+ * Returns -EINVAL when @qid is not below the current number of queues.
+ */
+static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+
+	/* Mirror the range check done in tun_xsk_pool_enable() before
+	 * indexing tfiles[]; @qid is otherwise used unchecked.
+	 */
+	if (qid >= READ_ONCE(tun->numqueues))
+		return -EINVAL;
+
+	rcu_read_lock();
+	tfile = rcu_dereference(tun->tfiles[qid]);
+	if (tfile)
+		__tun_xdp_flush_tfile(tfile);
+	rcu_read_unlock();
+	return 0;
+}
+
 static const struct net_device_ops tap_netdev_ops = {
 	.ndo_init		= tun_net_init,
 	.ndo_uninit		= tun_net_uninit,
@@ -1346,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_xsk_wakeup		= tun_xsk_wakeup,
 	.ndo_change_carrier	= tun_net_change_carrier,
 };
 
@@ -1403,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
 		/* Currently tun does not support XDP, only tap does. */
 		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
 				    NETDEV_XDP_ACT_REDIRECT |
-				    NETDEV_XDP_ACT_NDO_XMIT;
+				    NETDEV_XDP_ACT_NDO_XMIT |
+				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
 
 		break;
 	}
@@ -2058,11 +2157,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 				struct tun_file *tfile,
-				struct xdp_frame *xdp_frame,
+				void *addr,
+				size_t size,
 				struct iov_iter *iter)
 {
 	int vnet_hdr_sz = 0;
-	size_t size = xdp_frame->len;
 	size_t ret;
 
 	if (tun->flags & IFF_VNET_HDR) {
@@ -2077,7 +2176,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
 	}
 
-	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
+	ret = copy_to_iter(addr, size, iter) + vnet_hdr_sz;
 
 	preempt_disable();
 	dev_sw_netstats_tx_add(tun->dev, 1, ret);
@@ -2240,8 +2339,20 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	if (tun_is_xdp_frame(ptr)) {
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+		ret = tun_put_user_xdp(tun, tfile, xdpf->data, xdpf->len, to);
 		xdp_return_frame(xdpf);
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+		void *data;
+
+		spin_lock(&tfile->pool_lock);
+		if (tfile->xsk_pool) {
+			data = xsk_buff_raw_get_data(tfile->xsk_pool, desc->addr);
+			ret = tun_put_user_xdp(tun, tfile, data, desc->len, to);
+		} else {
+			ret = 0;
+		}
+		spin_unlock(&tfile->pool_lock);
 	} else {
 		struct sk_buff *skb = ptr;
 
@@ -2654,6 +2765,10 @@ static int tun_ptr_peek_len(void *ptr)
 			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
 			return xdpf->len;
+		} else if (tun_is_xdp_desc_frame(ptr)) {
+			struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+			return desc->len;
 		}
 		return __skb_array_len_with_tag(ptr);
 	} else {
@@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
 	}
 }
 
+/* Refill an empty tx_ring with up to min(tx_ring.size, TUN_XDP_BATCH) tagged
+ * xsk Tx descriptors, after completing the batch recorded in nb_descs.
+ */
+static void tun_peek_xsk(struct tun_file *tfile)
+{
+	struct xsk_buff_pool *pool;
+	u32 i, batch, budget;
+	void *frame;
+
+	if (!ptr_ring_empty(&tfile->tx_ring))
+		return;
+
+	spin_lock(&tfile->pool_lock);
+	pool = tfile->xsk_pool;
+	if (!pool) {
+		spin_unlock(&tfile->pool_lock);
+		return;
+	}
+
+	/* Report the batch queued by the previous call as completed. */
+	if (tfile->nb_descs) {
+		xsk_tx_completed(pool, tfile->nb_descs);
+		if (xsk_uses_need_wakeup(pool))
+			xsk_set_tx_need_wakeup(pool);
+	}
+
+	spin_lock(&tfile->tx_ring.producer_lock);
+	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
+
+	batch = xsk_tx_peek_release_desc_batch(pool, budget);
+	if (!batch) {
+		tfile->nb_descs = 0;
+		spin_unlock(&tfile->tx_ring.producer_lock);
+		spin_unlock(&tfile->pool_lock);
+		return;
+	}
+
+	tfile->nb_descs = batch;
+	for (i = 0; i < batch; i++) {
+		/* Tag with TUN_XDP_DESC_FLAG (0x2, i.e. bit 1, not bit 0). */
+		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
+		/* budget <= tx_ring.size. NOTE(review): return value is ignored. */
+		__ptr_ring_produce(&tfile->tx_ring, frame);
+	}
+	spin_unlock(&tfile->tx_ring.producer_lock);
+	spin_unlock(&tfile->pool_lock);
+}
+
 static int tun_peek_len(struct socket *sock)
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
@@ -2671,6 +2834,9 @@ static int tun_peek_len(struct socket *sock)
 	if (!tun)
 		return 0;
 
+	if (sock_flag(&tfile->sk, SOCK_XDP))
+		tun_peek_xsk(tfile);
+
 	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
 	tun_put(tun);
 
@@ -3473,6 +3639,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	}
 
 	mutex_init(&tfile->napi_mutex);
+	spin_lock_init(&tfile->pool_lock);
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 077e74421558..eb83764be26c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -202,6 +202,10 @@ static int vhost_net_buf_peek_len(void *ptr)
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
 		return xdpf->len;
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+		return desc->len;
 	}
 
 	return __skb_array_len_with_tag(ptr);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 043d442994b0..4142453b5e52 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -6,10 +6,12 @@
 #ifndef __IF_TUN_H
 #define __IF_TUN_H
 
+#include <uapi/linux/if_xdp.h>
 #include <uapi/linux/if_tun.h>
 #include <uapi/linux/virtio_net.h>
 
 #define TUN_XDP_FLAG 0x1UL
+#define TUN_XDP_DESC_FLAG 0x2UL
 
 #define TUN_MSG_UBUF 1
 #define TUN_MSG_PTR  2
@@ -43,6 +45,21 @@ static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
 }
 
+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+	return ((unsigned long)ptr & TUN_XDP_DESC_FLAG) != 0;	/* tag bit set? */
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+	return (void *)(TUN_XDP_DESC_FLAG | (unsigned long)desc);	/* add tag */
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+	return (struct xdp_desc *)((unsigned long)ptr & ~TUN_XDP_DESC_FLAG);
+}
+
 void tun_ptr_free(void *ptr);
 #else
 #include <linux/err.h>
@@ -75,6 +92,21 @@ static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 	return NULL;
 }
 
+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+	return false;	/* stub for the #else build */
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+	return NULL;	/* stub for the #else build */
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+	return NULL;	/* stub; type matches the real helper (xdp_desc) */
+}
+
 static inline void tun_ptr_free(void *ptr)
 {
 }
-- 
2.41.0


             reply	other threads:[~2024-02-28 11:06 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-28 11:05 Yunjian Wang [this message]
2024-02-29 10:24 ` [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support kernel test robot
2024-02-29 11:12 ` Paolo Abeni
2024-02-29 13:15   ` wangyunjian
2024-03-01 11:45   ` wangyunjian
2024-03-01 11:53     ` Michael S. Tsirkin
2024-03-04 13:45       ` wangyunjian
2024-03-11  4:00         ` Jason Wang
2024-03-11 13:27           ` wangyunjian
2024-03-12  6:07             ` Jason Wang
2024-02-29 15:44 ` kernel test robot
2024-03-01 14:11 ` Maciej Fijalkowski
2024-03-01 18:40   ` Willem de Bruijn
2024-03-06  5:32     ` Jason Wang
2024-03-04  6:55 ` Jason Wang
2024-03-04 11:23   ` wangyunjian
2024-03-06  2:11     ` Jason Wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1709118356-133960-1-git-send-email-wangyunjian@huawei.com \
    --to=wangyunjian@huawei.com \
    --cc=bjorn@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=davem@davemloft.net \
    --cc=jasowang@redhat.com \
    --cc=jonathan.lemon@gmail.com \
    --cc=kuba@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=liwei395@huawei.com \
    --cc=maciej.fijalkowski@intel.com \
    --cc=magnus.karlsson@intel.com \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=virtualization@lists.linux.dev \
    --cc=willemdebruijn.kernel@gmail.com \
    --cc=xudingke@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.