From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
To: bpf@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net,
	andrii@kernel.org
Cc: netdev@vger.kernel.org, magnus.karlsson@intel.com,
	bjorn@kernel.org, tirthendu.sarkar@intel.com,
	maciej.fijalkowski@intel.com, simon.horman@corigine.com,
	toke@kernel.org
Subject: [PATCH v4 bpf-next 08/22] xsk: add support for AF_XDP multi-buffer on Tx path
Date: Thu, 15 Jun 2023 19:25:52 +0200	[thread overview]
Message-ID: <20230615172606.349557-9-maciej.fijalkowski@intel.com> (raw)
In-Reply-To: <20230615172606.349557-1-maciej.fijalkowski@intel.com>

From: Tirthendu Sarkar <tirthendu.sarkar@intel.com>

For transmitting an AF_XDP packet, allocate an skb while processing the
first descriptor and copy the data into it. The XDP_PKT_CONTD flag in the
'options' field of the descriptor indicates the EOP status of the packet.
If the current descriptor is not EOP, store the skb, release the current
descriptor and go on to read the following ones.
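
For context, the EOP test used throughout this patch is the xp_mb_desc()
helper added earlier in this series (patch 07/22): a descriptor has a
successor when XDP_PKT_CONTD is set. A sketch of the helper, reproduced
here for reference:

  static inline bool xp_mb_desc(struct xdp_desc *desc)
  {
  	return desc->options & XDP_PKT_CONTD;
  }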

Allocate a page for each subsequent descriptor, copy the data into it and
add it as a frag to the skb stored in the xsk. On processing EOP, transmit
the skb with its frags. The addresses contained in the descriptors have
already been queued into the completion queue, and the skb destructor
updates the completion count when the skb is freed.
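
The per-frag copy boils down to the following steps (a condensed sketch
of the xsk_build_skb() hunk below, with error handling elided):

  /* buffer points at the descriptor's payload inside the umem */
  page = alloc_page(xs->sk.sk_allocation);
  vaddr = kmap_local_page(page);
  memcpy(vaddr, buffer, len);
  kunmap_local(vaddr);
  /* attach the copy as the next frag of the skb under construction */
  skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, 0, len, 0);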

On transmit failure, cancel the descriptor releases, clear the reserved
entries from the completion queue and consume the skb, so that the packet
transmission can be retried.
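
Concretely, the NETDEV_TX_BUSY handling rolls back the consumer ring and
the completion queue so nothing is lost (annotated sketch of the
__xsk_generic_xmit() hunk below):

  err = __dev_direct_xmit(skb, xs->queue_id);
  if (err == NETDEV_TX_BUSY) {
  	/* un-release every Tx descriptor backing this skb */
  	xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
  	/* cancel the cq reservations and free the skb; descs stay queued */
  	xsk_consume_skb(skb);
  	err = -EAGAIN;	/* tell user space to retry the send */
  }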

For any invalid descriptor (invalid length/address/options) in the middle
of a packet, all pending descriptors are dropped by the xsk core along
with the invalid one, and the next descriptor is treated as the start of
a new packet.
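
In __xsk_generic_xmit() this is the tail handling: when descriptor
reading stops while the ring still holds entries, the stop was caused by
an invalid descriptor rather than an empty ring, so any half-built skb is
dropped and the consumer is advanced past the bad descriptor (annotated
sketch of the hunk below):

  if (xskq_has_descs(xs->tx)) {
  	/* ring not empty, so an invalid descriptor stopped us */
  	if (xs->skb)
  		xsk_drop_skb(xs->skb);	/* drop the partial packet */
  	xskq_cons_release(xs->tx);	/* skip the invalid descriptor */
  }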

The maximum number of frames supported per packet is MAX_SKB_FRAGS + 1.
If this limit is exceeded, all descriptors accumulated so far are dropped.
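
From the application side, a multi-buffer Tx packet is simply a run of
descriptors with XDP_PKT_CONTD set on all but the last one. A minimal
user-space sketch using the libxdp/libbpf ring accessors (illustrative
only; xsk, nb_frags, frame_addr[] and frame_len[] are assumed to be set
up by the application):

  __u32 idx, i;

  /* reserve nb_frags consecutive Tx descriptors, or bail out */
  if (xsk_ring_prod__reserve(&xsk->tx, nb_frags, &idx) == nb_frags) {
  	for (i = 0; i < nb_frags; i++) {
  		struct xdp_desc *d = xsk_ring_prod__tx_desc(&xsk->tx, idx + i);

  		d->addr = frame_addr[i];
  		d->len = frame_len[i];
  		/* every fragment except the last carries XDP_PKT_CONTD */
  		d->options = (i < nb_frags - 1) ? XDP_PKT_CONTD : 0;
  	}
  	xsk_ring_prod__submit(&xsk->tx, nb_frags);
  }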

Signed-off-by: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
---
 net/xdp/xsk.c       | 117 +++++++++++++++++++++++++++++++++-----------
 net/xdp/xsk_queue.h |  13 +++--
 2 files changed, 97 insertions(+), 33 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 29bda8452e2c..1f20618df5dd 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -393,7 +393,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 	rcu_read_lock();
 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
-			xs->tx->queue_empty_descs++;
+			if (xskq_has_descs(xs->tx))
+				xskq_cons_release(xs->tx);
 			continue;
 		}
 
@@ -539,24 +540,32 @@ static void xsk_consume_skb(struct sk_buff *skb)
 	xs->skb = NULL;
 }
 
+static void xsk_drop_skb(struct sk_buff *skb)
+{
+	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
+	xsk_consume_skb(skb);
+}
+
 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 					      struct xdp_desc *desc)
 {
 	struct xsk_buff_pool *pool = xs->pool;
 	u32 hr, len, ts, offset, copy, copied;
-	struct sk_buff *skb;
+	struct sk_buff *skb = xs->skb;
 	struct page *page;
 	void *buffer;
 	int err, i;
 	u64 addr;
 
-	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+	if (!skb) {
+		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
 
-	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
-	if (unlikely(!skb))
-		return ERR_PTR(err);
+		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+		if (unlikely(!skb))
+			return ERR_PTR(err);
 
-	skb_reserve(skb, hr);
+		skb_reserve(skb, hr);
+	}
 
 	addr = desc->addr;
 	len = desc->len;
@@ -566,7 +575,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 	offset = offset_in_page(buffer);
 	addr = buffer - pool->addrs;
 
-	for (copied = 0, i = 0; copied < len; i++) {
+	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
+		if (unlikely(i >= MAX_SKB_FRAGS))
+			return ERR_PTR(-EFAULT);
+
 		page = pool->umem->pgs[addr >> PAGE_SHIFT];
 		get_page(page);
 
@@ -591,33 +603,56 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 				     struct xdp_desc *desc)
 {
 	struct net_device *dev = xs->dev;
-	struct sk_buff *skb;
+	struct sk_buff *skb = xs->skb;
+	int err;
 
 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
 		skb = xsk_build_skb_zerocopy(xs, desc);
-		if (IS_ERR(skb))
-			return skb;
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
+			goto free_err;
+		}
 	} else {
 		u32 hr, tr, len;
 		void *buffer;
-		int err;
 
-		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
-		tr = dev->needed_tailroom;
+		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
 		len = desc->len;
 
-		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
-		if (unlikely(!skb))
-			return ERR_PTR(err);
+		if (!skb) {
+			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+			tr = dev->needed_tailroom;
+			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+			if (unlikely(!skb))
+				goto free_err;
 
-		skb_reserve(skb, hr);
-		skb_put(skb, len);
+			skb_reserve(skb, hr);
+			skb_put(skb, len);
 
-		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
-		err = skb_store_bits(skb, 0, buffer, len);
-		if (unlikely(err)) {
-			kfree_skb(skb);
-			return ERR_PTR(err);
+			err = skb_store_bits(skb, 0, buffer, len);
+			if (unlikely(err))
+				goto free_err;
+		} else {
+			int nr_frags = skb_shinfo(skb)->nr_frags;
+			struct page *page;
+			u8 *vaddr;
+
+			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
+				err = -EFAULT;
+				goto free_err;
+			}
+
+			page = alloc_page(xs->sk.sk_allocation);
+			if (unlikely(!page)) {
+				err = -EAGAIN;
+				goto free_err;
+			}
+
+			vaddr = kmap_local_page(page);
+			memcpy(vaddr, buffer, len);
+			kunmap_local(vaddr);
+
+			skb_add_rx_frag(skb, nr_frags, page, 0, len, 0);
 		}
 	}
 
@@ -628,6 +663,17 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	xsk_set_destructor_arg(skb);
 
 	return skb;
+
+free_err:
+	if (err == -EAGAIN) {
+		xsk_cq_cancel_locked(xs, 1);
+	} else {
+		xsk_set_destructor_arg(skb);
+		xsk_drop_skb(skb);
+		xskq_cons_release(xs->tx);
+	}
+
+	return ERR_PTR(err);
 }
 
 static int __xsk_generic_xmit(struct sock *sk)
@@ -667,30 +713,45 @@ static int __xsk_generic_xmit(struct sock *sk)
 		skb = xsk_build_skb(xs, &desc);
 		if (IS_ERR(skb)) {
 			err = PTR_ERR(skb);
-			xsk_cq_cancel_locked(xs, 1);
-			goto out;
+			if (err == -EAGAIN)
+				goto out;
+			err = 0;
+			continue;
+		}
+
+		xskq_cons_release(xs->tx);
+
+		if (xp_mb_desc(&desc)) {
+			xs->skb = skb;
+			continue;
 		}
 
 		err = __dev_direct_xmit(skb, xs->queue_id);
 		if  (err == NETDEV_TX_BUSY) {
 			/* Tell user-space to retry the send */
+			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
 			xsk_consume_skb(skb);
 			err = -EAGAIN;
 			goto out;
 		}
 
-		xskq_cons_release(xs->tx);
 		/* Ignore NET_XMIT_CN as packet might have been sent */
 		if (err == NET_XMIT_DROP) {
 			/* SKB completed but not sent */
 			err = -EBUSY;
+			xs->skb = NULL;
 			goto out;
 		}
 
 		sent_frame = true;
+		xs->skb = NULL;
 	}
 
-	xs->tx->queue_empty_descs++;
+	if (xskq_has_descs(xs->tx)) {
+		if (xs->skb)
+			xsk_drop_skb(xs->skb);
+		xskq_cons_release(xs->tx);
+	}
 
 out:
 	if (sent_frame)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 4190f43ce0b0..2d2af9fc2744 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -175,6 +175,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
 		xp_aligned_validate_desc(pool, desc);
 }
 
+static inline bool xskq_has_descs(struct xsk_queue *q)
+{
+	return q->cached_cons != q->cached_prod;
+}
+
 static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
 					   struct xdp_desc *d,
 					   struct xsk_buff_pool *pool)
@@ -190,17 +195,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
 				       struct xdp_desc *desc,
 				       struct xsk_buff_pool *pool)
 {
-	while (q->cached_cons != q->cached_prod) {
+	if (q->cached_cons != q->cached_prod) {
 		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 		u32 idx = q->cached_cons & q->ring_mask;
 
 		*desc = ring->desc[idx];
-		if (xskq_cons_is_valid_desc(q, desc, pool))
-			return true;
-
-		q->cached_cons++;
+		return xskq_cons_is_valid_desc(q, desc, pool);
 	}
 
+	q->queue_empty_descs++;
 	return false;
 }
 
-- 
2.34.1

