All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5 net-next 0/3] add multi-buff support for xdp running in generic mode
@ 2023-12-14 14:29 Lorenzo Bianconi
  2023-12-14 14:29 ` [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct Lorenzo Bianconi
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Lorenzo Bianconi @ 2023-12-14 14:29 UTC (permalink / raw)
  To: netdev
  Cc: lorenzo.bianconi, davem, kuba, edumazet, pabeni, bpf, hawk, toke,
	willemdebruijn.kernel, jasowang, sdf

Introduce multi-buffer support for xdp running in generic mode not always
linearizing the skb in netif_receive_generic_xdp routine.
Introduce page_pool in softnet_data structure

Changes since v4:
- fix compilation error if page_pools are not enabled
Changes since v3:
- introduce page_pool in softnet_data structure
- rely on page_pools for xdp_generic code
Changes since v2:
- rely on napi_alloc_frag() and napi_build_skb() to build the new skb
Changes since v1:
- explicitly keep the skb segmented in netif_skb_check_for_generic_xdp() and
  do not rely on pskb_expand_head()

Lorenzo Bianconi (3):
  net: introduce page_pool pointer in softnet_data percpu struct
  xdp: rely on skb pointer reference in do_xdp_generic and
    netif_receive_generic_xdp
  xdp: add multi-buff support for xdp running in generic mode

 drivers/net/tun.c               |   4 +-
 include/linux/netdevice.h       |   3 +-
 include/net/page_pool/helpers.h |   5 +
 include/net/page_pool/types.h   |   1 +
 net/core/dev.c                  | 208 +++++++++++++++++++++++++++-----
 net/core/page_pool.c            |   5 +
 net/core/skbuff.c               |   5 +-
 7 files changed, 199 insertions(+), 32 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2023-12-14 14:29 [PATCH v5 net-next 0/3] add multi-buff support for xdp running in generic mode Lorenzo Bianconi
@ 2023-12-14 14:29 ` Lorenzo Bianconi
  2023-12-19 15:23   ` Paolo Abeni
  2023-12-19 16:29   ` Eric Dumazet
  2023-12-14 14:29 ` [PATCH v5 net-next 2/3] xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp Lorenzo Bianconi
  2023-12-14 14:29 ` [PATCH v5 net-next 3/3] xdp: add multi-buff support for xdp running in generic mode Lorenzo Bianconi
  2 siblings, 2 replies; 13+ messages in thread
From: Lorenzo Bianconi @ 2023-12-14 14:29 UTC (permalink / raw)
  To: netdev
  Cc: lorenzo.bianconi, davem, kuba, edumazet, pabeni, bpf, hawk, toke,
	willemdebruijn.kernel, jasowang, sdf

Allocate percpu page_pools in softnet_data.
Moreover add cpuid field in page_pool struct in order to recycle the
page in the page_pool "hot" cache if napi_pp_put_page() is running on
the same cpu.
This is a preliminary patch to add xdp multi-buff support for xdp running
in generic mode.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/linux/netdevice.h       |  1 +
 include/net/page_pool/helpers.h |  5 +++++
 include/net/page_pool/types.h   |  1 +
 net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
 net/core/page_pool.c            |  5 +++++
 net/core/skbuff.c               |  5 +++--
 6 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1b935ee341b4..30b6a3f601fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3319,6 +3319,7 @@ struct softnet_data {
 	int			defer_count;
 	int			defer_ipi_scheduled;
 	struct sk_buff		*defer_list;
+	struct page_pool	*page_pool;
 	call_single_data_t	defer_csd;
 };
 
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 841e0a930bd7..6ae735804b40 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -401,4 +401,9 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
 		page_pool_update_nid(pool, new_nid);
 }
 
+static inline void page_pool_set_cpuid(struct page_pool *pool, int cpuid)
+{
+	pool->cpuid = cpuid;
+}
+
 #endif /* _NET_PAGE_POOL_HELPERS_H */
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 76481c465375..f63dadf2a6d4 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -128,6 +128,7 @@ struct page_pool_stats {
 struct page_pool {
 	struct page_pool_params_fast p;
 
+	int cpuid;
 	bool has_init_callback;
 
 	long frag_users;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0432b04cf9b0..d600e3a6ec2c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,6 +153,8 @@
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -11670,12 +11672,33 @@ static void __init net_dev_struct_check(void)
  *
  */
 
+#define SD_PAGE_POOL_RING_SIZE	256
+static int net_sd_page_pool_alloc(struct softnet_data *sd, int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	struct page_pool_params page_pool_params = {
+		.pool_size = SD_PAGE_POOL_RING_SIZE,
+		.nid = NUMA_NO_NODE,
+	};
+
+	sd->page_pool = page_pool_create(&page_pool_params);
+	if (IS_ERR(sd->page_pool)) {
+		sd->page_pool = NULL;
+		return -ENOMEM;
+	}
+
+	page_pool_set_cpuid(sd->page_pool, cpuid);
+#endif
+	return 0;
+}
+
 /*
  *       This is called single threaded during boot, so no need
  *       to take the rtnl semaphore.
  */
 static int __init net_dev_init(void)
 {
+	struct softnet_data *sd;
 	int i, rc = -ENOMEM;
 
 	BUG_ON(!dev_boot_phase);
@@ -11701,10 +11724,10 @@ static int __init net_dev_init(void)
 
 	for_each_possible_cpu(i) {
 		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
-		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
 		INIT_WORK(flush, flush_backlog);
 
+		sd = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&sd->input_pkt_queue);
 		skb_queue_head_init(&sd->process_queue);
 #ifdef CONFIG_XFRM_OFFLOAD
@@ -11722,6 +11745,9 @@ static int __init net_dev_init(void)
 		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
+
+		if (net_sd_page_pool_alloc(sd, i))
+			goto out;
 	}
 
 	dev_boot_phase = 0;
@@ -11749,6 +11775,17 @@ static int __init net_dev_init(void)
 	WARN_ON(rc < 0);
 	rc = 0;
 out:
+	if (rc < 0) {
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (!sd->page_pool)
+				continue;
+
+			page_pool_destroy(sd->page_pool);
+			sd->page_pool = NULL;
+		}
+	}
+
 	return rc;
 }
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dd5a72533f2b..275b8572a82b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -178,6 +178,11 @@ static int page_pool_init(struct page_pool *pool,
 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 
+	/* It is up to the consumer to set cpuid if we are using percpu
+	 * page_pool so initialize it to an invalid value.
+	 */
+	pool->cpuid = -1;
+
 	/* Validate only known flags were used */
 	if (pool->p.flags & ~(PP_FLAG_ALL))
 		return -EINVAL;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b157efea5dea..4bc0a7f98241 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -918,9 +918,10 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
 	 */
 	if (napi_safe || in_softirq()) {
 		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
+		unsigned int cpuid = smp_processor_id();
 
-		allow_direct = napi &&
-			READ_ONCE(napi->list_owner) == smp_processor_id();
+		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
+		allow_direct |= (pp->cpuid == cpuid);
 	}
 
 	/* Driver set this to memory recycling info. Reset it on recycle.
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v5 net-next 2/3] xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp
  2023-12-14 14:29 [PATCH v5 net-next 0/3] add multi-buff support for xdp running in generic mode Lorenzo Bianconi
  2023-12-14 14:29 ` [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct Lorenzo Bianconi
@ 2023-12-14 14:29 ` Lorenzo Bianconi
  2023-12-14 14:29 ` [PATCH v5 net-next 3/3] xdp: add multi-buff support for xdp running in generic mode Lorenzo Bianconi
  2 siblings, 0 replies; 13+ messages in thread
From: Lorenzo Bianconi @ 2023-12-14 14:29 UTC (permalink / raw)
  To: netdev
  Cc: lorenzo.bianconi, davem, kuba, edumazet, pabeni, bpf, hawk, toke,
	willemdebruijn.kernel, jasowang, sdf

Rely on skb pointer reference instead of the skb pointer in do_xdp_generic and
netif_receive_generic_xdp routine signatures. This is a preliminary patch to add
multi-buff support for xdp running in generic mode.

Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/tun.c         |  4 ++--
 include/linux/netdevice.h |  2 +-
 net/core/dev.c            | 16 +++++++++-------
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index afa5497f7c35..206adddff699 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1921,7 +1921,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		rcu_read_lock();
 		xdp_prog = rcu_dereference(tun->xdp_prog);
 		if (xdp_prog) {
-			ret = do_xdp_generic(xdp_prog, skb);
+			ret = do_xdp_generic(xdp_prog, &skb);
 			if (ret != XDP_PASS) {
 				rcu_read_unlock();
 				local_bh_enable();
@@ -2511,7 +2511,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 	skb_record_rx_queue(skb, tfile->queue_index);
 
 	if (skb_xdp) {
-		ret = do_xdp_generic(xdp_prog, skb);
+		ret = do_xdp_generic(xdp_prog, &skb);
 		if (ret != XDP_PASS) {
 			ret = 0;
 			goto out;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 30b6a3f601fe..d850ead7f9cd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3953,7 +3953,7 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
 u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 			     struct bpf_prog *xdp_prog);
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb);
 int netif_rx(struct sk_buff *skb);
 int __netif_rx(struct sk_buff *skb);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d600e3a6ec2c..d7857de03dba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4916,10 +4916,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	return act;
 }
 
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 				     struct xdp_buff *xdp,
 				     struct bpf_prog *xdp_prog)
 {
+	struct sk_buff *skb = *pskb;
 	u32 act = XDP_DROP;
 
 	/* Reinjected packets coming from act_mirred or similar should
@@ -5000,24 +5001,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 
 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
 {
 	if (xdp_prog) {
 		struct xdp_buff xdp;
 		u32 act;
 		int err;
 
-		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
+		act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
 		if (act != XDP_PASS) {
 			switch (act) {
 			case XDP_REDIRECT:
-				err = xdp_do_generic_redirect(skb->dev, skb,
+				err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
 							      &xdp, xdp_prog);
 				if (err)
 					goto out_redir;
 				break;
 			case XDP_TX:
-				generic_xdp_tx(skb, xdp_prog);
+				generic_xdp_tx(*pskb, xdp_prog);
 				break;
 			}
 			return XDP_DROP;
@@ -5025,7 +5026,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 	}
 	return XDP_PASS;
 out_redir:
-	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
+	kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
 	return XDP_DROP;
 }
 EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -5348,7 +5349,8 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 		int ret2;
 
 		migrate_disable();
-		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+				      &skb);
 		migrate_enable();
 
 		if (ret2 != XDP_PASS) {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v5 net-next 3/3] xdp: add multi-buff support for xdp running in generic mode
  2023-12-14 14:29 [PATCH v5 net-next 0/3] add multi-buff support for xdp running in generic mode Lorenzo Bianconi
  2023-12-14 14:29 ` [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct Lorenzo Bianconi
  2023-12-14 14:29 ` [PATCH v5 net-next 2/3] xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp Lorenzo Bianconi
@ 2023-12-14 14:29 ` Lorenzo Bianconi
  2023-12-20 16:01   ` Jesper Dangaard Brouer
  2 siblings, 1 reply; 13+ messages in thread
From: Lorenzo Bianconi @ 2023-12-14 14:29 UTC (permalink / raw)
  To: netdev
  Cc: lorenzo.bianconi, davem, kuba, edumazet, pabeni, bpf, hawk, toke,
	willemdebruijn.kernel, jasowang, sdf

Similar to native xdp, do not always linearize the skb in
netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
processed by the eBPF program. This allows adding multi-buffer support
for xdp running in generic mode.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/core/dev.c | 153 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 19 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d7857de03dba..47164acc3268 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4854,6 +4854,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
 	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
 			 skb_headlen(skb) + mac_len, true);
+	if (skb_is_nonlinear(skb)) {
+		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+		xdp_buff_set_frags_flag(xdp);
+	} else {
+		xdp_buff_clear_frags_flag(xdp);
+	}
 
 	orig_data_end = xdp->data_end;
 	orig_data = xdp->data;
@@ -4883,6 +4889,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 		skb->len += off; /* positive on grow, negative on shrink */
 	}
 
+	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+	 */
+	if (xdp_buff_has_frags(xdp))
+		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+	else
+		skb->data_len = 0;
+
 	/* check if XDP changed eth hdr such SKB needs update */
 	eth = (struct ethhdr *)xdp->data;
 	if ((orig_eth_type != eth->h_proto) ||
@@ -4916,12 +4930,118 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	return act;
 }
 
+static int netif_skb_segment_for_xdp(struct sk_buff **pskb,
+				     struct bpf_prog *prog)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+	u32 size, truesize, len, max_head_size, off;
+	struct sk_buff *skb = *pskb, *nskb;
+	int err, i, head_off;
+	void *data;
+
+	/* XDP does not support fraglist so we need to linearize
+	 * the skb.
+	 */
+	if (skb_has_frag_list(skb) || !prog->aux->xdp_has_frags)
+		return -EOPNOTSUPP;
+
+	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - XDP_PACKET_HEADROOM);
+	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
+		return -ENOMEM;
+
+	size = min_t(u32, skb->len, max_head_size);
+	truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
+	data = page_pool_dev_alloc_va(sd->page_pool, &truesize);
+	if (!data)
+		return -ENOMEM;
+
+	nskb = napi_build_skb(data, truesize);
+	if (!nskb) {
+		page_pool_free_va(sd->page_pool, data, true);
+		return -ENOMEM;
+	}
+
+	skb_reserve(nskb, XDP_PACKET_HEADROOM);
+	skb_copy_header(nskb, skb);
+	skb_mark_for_recycle(nskb);
+
+	err = skb_copy_bits(skb, 0, nskb->data, size);
+	if (err) {
+		consume_skb(nskb);
+		return err;
+	}
+	skb_put(nskb, size);
+
+	head_off = skb_headroom(nskb) - skb_headroom(skb);
+	skb_headers_offset_update(nskb, head_off);
+
+	off = size;
+	len = skb->len - off;
+	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+		struct page *page;
+		u32 page_off;
+
+		size = min_t(u32, len, PAGE_SIZE);
+		truesize = size;
+
+		page = page_pool_dev_alloc(sd->page_pool, &page_off,
+					   &truesize);
+		if (!page) {
+			consume_skb(nskb);
+			return -ENOMEM;
+		}
+
+		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
+		err = skb_copy_bits(skb, off, page_address(page) + page_off,
+				    size);
+		if (err) {
+			consume_skb(nskb);
+			return err;
+		}
+
+		len -= size;
+		off += size;
+	}
+
+	consume_skb(skb);
+	*pskb = nskb;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int netif_skb_check_for_xdp(struct sk_buff **pskb,
+				   struct bpf_prog *prog)
+{
+	struct sk_buff *skb = *pskb;
+	int err, hroom, troom;
+
+	if (!netif_skb_segment_for_xdp(pskb, prog))
+		return 0;
+
+	/* In case we have to go down the path and also linearize,
+	 * then lets do the pskb_expand_head() work just once here.
+	 */
+	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+	troom = skb->tail + skb->data_len - skb->end;
+	err = pskb_expand_head(skb,
+			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+	if (err)
+		return err;
+
+	return skb_linearize(skb);
+}
+
 static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 				     struct xdp_buff *xdp,
 				     struct bpf_prog *xdp_prog)
 {
 	struct sk_buff *skb = *pskb;
-	u32 act = XDP_DROP;
+	u32 mac_len, act = XDP_DROP;
 
 	/* Reinjected packets coming from act_mirred or similar should
 	 * not get XDP generic processing.
@@ -4929,41 +5049,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 	if (skb_is_redirected(skb))
 		return XDP_PASS;
 
-	/* XDP packets must be linear and must have sufficient headroom
-	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
-	 * native XDP provides, thus we need to do it here as well.
+	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+	 * bytes. This is the guarantee that also native XDP provides,
+	 * thus we need to do it here as well.
 	 */
+	mac_len = skb->data - skb_mac_header(skb);
+	__skb_push(skb, mac_len);
+
 	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
-		int troom = skb->tail + skb->data_len - skb->end;
-
-		/* In case we have to go down the path and also linearize,
-		 * then lets do the pskb_expand_head() work just once here.
-		 */
-		if (pskb_expand_head(skb,
-				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
-				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
-			goto do_drop;
-		if (skb_linearize(skb))
+		if (netif_skb_check_for_xdp(pskb, xdp_prog))
 			goto do_drop;
 	}
 
-	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+	__skb_pull(*pskb, mac_len);
+
+	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
 	switch (act) {
 	case XDP_REDIRECT:
 	case XDP_TX:
 	case XDP_PASS:
 		break;
 	default:
-		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_ABORTED:
-		trace_xdp_exception(skb->dev, xdp_prog, act);
+		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_DROP:
 	do_drop:
-		kfree_skb(skb);
+		kfree_skb(*pskb);
 		break;
 	}
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2023-12-14 14:29 ` [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct Lorenzo Bianconi
@ 2023-12-19 15:23   ` Paolo Abeni
  2023-12-20 12:00     ` Jesper Dangaard Brouer
  2023-12-19 16:29   ` Eric Dumazet
  1 sibling, 1 reply; 13+ messages in thread
From: Paolo Abeni @ 2023-12-19 15:23 UTC (permalink / raw)
  To: Ilias Apalodimas, hawk
  Cc: lorenzo.bianconi, davem, kuba, edumazet, bpf, toke,
	willemdebruijn.kernel, jasowang, sdf, netdev, Lorenzo Bianconi

On Thu, 2023-12-14 at 15:29 +0100, Lorenzo Bianconi wrote:
> Allocate percpu page_pools in softnet_data.
> Moreover add cpuid filed in page_pool struct in order to recycle the
> page in the page_pool "hot" cache if napi_pp_put_page() is running on
> the same cpu.
> This is a preliminary patch to add xdp multi-buff support for xdp running
> in generic mode.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/linux/netdevice.h       |  1 +
>  include/net/page_pool/helpers.h |  5 +++++
>  include/net/page_pool/types.h   |  1 +
>  net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
>  net/core/page_pool.c            |  5 +++++
>  net/core/skbuff.c               |  5 +++--
>  6 files changed, 53 insertions(+), 3 deletions(-)

@Jesper, @Ilias: could you please have a look at the pp bits?

Thanks!

Paolo


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2023-12-14 14:29 ` [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct Lorenzo Bianconi
  2023-12-19 15:23   ` Paolo Abeni
@ 2023-12-19 16:29   ` Eric Dumazet
  2023-12-19 17:32     ` Lorenzo Bianconi
  1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2023-12-19 16:29 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: netdev, lorenzo.bianconi, davem, kuba, pabeni, bpf, hawk, toke,
	willemdebruijn.kernel, jasowang, sdf

On Thu, Dec 14, 2023 at 3:30 PM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
>
> Allocate percpu page_pools in softnet_data.
> Moreover add cpuid filed in page_pool struct in order to recycle the
> page in the page_pool "hot" cache if napi_pp_put_page() is running on
> the same cpu.
> This is a preliminary patch to add xdp multi-buff support for xdp running
> in generic mode.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/linux/netdevice.h       |  1 +
>  include/net/page_pool/helpers.h |  5 +++++
>  include/net/page_pool/types.h   |  1 +
>  net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
>  net/core/page_pool.c            |  5 +++++
>  net/core/skbuff.c               |  5 +++--
>  6 files changed, 53 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 1b935ee341b4..30b6a3f601fe 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3319,6 +3319,7 @@ struct softnet_data {
>         int                     defer_count;
>         int                     defer_ipi_scheduled;
>         struct sk_buff          *defer_list;
> +       struct page_pool        *page_pool;
>         call_single_data_t      defer_csd;
>  };

This field should be put elsewhere, not in this contended cache line.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2023-12-19 16:29   ` Eric Dumazet
@ 2023-12-19 17:32     ` Lorenzo Bianconi
  0 siblings, 0 replies; 13+ messages in thread
From: Lorenzo Bianconi @ 2023-12-19 17:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, lorenzo.bianconi, davem, kuba, pabeni, bpf, hawk, toke,
	willemdebruijn.kernel, jasowang, sdf

[-- Attachment #1: Type: text/plain, Size: 1518 bytes --]

> On Thu, Dec 14, 2023 at 3:30 PM Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> >
> > Allocate percpu page_pools in softnet_data.
> > Moreover add cpuid filed in page_pool struct in order to recycle the
> > page in the page_pool "hot" cache if napi_pp_put_page() is running on
> > the same cpu.
> > This is a preliminary patch to add xdp multi-buff support for xdp running
> > in generic mode.
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  include/linux/netdevice.h       |  1 +
> >  include/net/page_pool/helpers.h |  5 +++++
> >  include/net/page_pool/types.h   |  1 +
> >  net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
> >  net/core/page_pool.c            |  5 +++++
> >  net/core/skbuff.c               |  5 +++--
> >  6 files changed, 53 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 1b935ee341b4..30b6a3f601fe 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -3319,6 +3319,7 @@ struct softnet_data {
> >         int                     defer_count;
> >         int                     defer_ipi_scheduled;
> >         struct sk_buff          *defer_list;
> > +       struct page_pool        *page_pool;
> >         call_single_data_t      defer_csd;
> >  };
> 
> This field should be put elsewhere, not in this contended cache line.

ack, I think we could add a percpu dedicated pointer for it.

Regards,
Lorenzo

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2023-12-19 15:23   ` Paolo Abeni
@ 2023-12-20 12:00     ` Jesper Dangaard Brouer
  2024-01-17 17:36       ` Lorenzo Bianconi
  0 siblings, 1 reply; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2023-12-20 12:00 UTC (permalink / raw)
  To: Paolo Abeni, Ilias Apalodimas, kuba, Lorenzo Bianconi, netdev
  Cc: Sebastian Andrzej Siewior, willemdebruijn.kernel, toke, davem,
	edumazet, bpf, lorenzo.bianconi, sdf, jasowang



On 19/12/2023 16.23, Paolo Abeni wrote:
> On Thu, 2023-12-14 at 15:29 +0100, Lorenzo Bianconi wrote:
>> Allocate percpu page_pools in softnet_data.
>> Moreover add cpuid filed in page_pool struct in order to recycle the
>> page in the page_pool "hot" cache if napi_pp_put_page() is running on
>> the same cpu.
>> This is a preliminary patch to add xdp multi-buff support for xdp running
>> in generic mode.
>>
>> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>> ---
>>   include/linux/netdevice.h       |  1 +
>>   include/net/page_pool/helpers.h |  5 +++++
>>   include/net/page_pool/types.h   |  1 +
>>   net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
>>   net/core/page_pool.c            |  5 +++++
>>   net/core/skbuff.c               |  5 +++--
>>   6 files changed, 53 insertions(+), 3 deletions(-)
> 
> @Jesper, @Ilias: could you please have a look at the pp bits?
> 

I have some concerns... I'm still entertaining the idea, but we need to
be aware of the tradeoffs we are making.

(1)
Adding PP to softnet_data means per CPU caching 256 pages in the
ptr_ring (plus likely 64 in the alloc-cache).   Fortunately, PP starts
out empty, so as long as this PP isn't used they don't get cached. But
if used, then PP don't have a MM shrinker that removes these cached
pages, in case system is under MM pressure.  I guess, you can argue that
keeping this per netdev rx-queue would make memory usage even higher.
This is a tradeoff, we are trading memory (waste) for speed.


(2) (Question to Jakub I guess)
How does this connect with Jakub's PP netlink stats interface?
E.g. I find it very practical that this allow us get PP stats per
netdev, but in this case there isn't a netdev.


(3) (Implicit locking)
PP has lockless "alloc" because it relies on drivers NAPI context.
The places where netstack access softnet_data provide similar protection
that we can rely on for PP, so this is likely correct implementation
wise.  But it will give people like Sebastian (Cc) more gray hair when
figuring out how PREEMPT_RT handle these cases.

(4)
The optimization is needed for the case where we need to re-allocate and
copy SKB fragments.  I think we should focus on avoiding this code path,
instead of optimizing it.  For UDP it should be fairly easy, but for TCP
this is harder.

--Jesper

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 3/3] xdp: add multi-buff support for xdp running in generic mode
  2023-12-14 14:29 ` [PATCH v5 net-next 3/3] xdp: add multi-buff support for xdp running in generic mode Lorenzo Bianconi
@ 2023-12-20 16:01   ` Jesper Dangaard Brouer
  2023-12-21  8:23     ` Lorenzo Bianconi
  0 siblings, 1 reply; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2023-12-20 16:01 UTC (permalink / raw)
  To: Lorenzo Bianconi, netdev
  Cc: lorenzo.bianconi, davem, kuba, edumazet, pabeni, bpf, toke,
	willemdebruijn.kernel, jasowang, sdf, Yan Zhai



On 14/12/2023 15.29, Lorenzo Bianconi wrote:
> Similar to native xdp, do not always linearize the skb in
> netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
> processed by the eBPF program. This allow to add  multi-buffer support
> for xdp running in generic mode.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>   net/core/dev.c | 153 +++++++++++++++++++++++++++++++++++++++++++------
>   1 file changed, 134 insertions(+), 19 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d7857de03dba..47164acc3268 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4854,6 +4854,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>   	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
>   	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
>   			 skb_headlen(skb) + mac_len, true);
> +	if (skb_is_nonlinear(skb)) {
> +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else {
> +		xdp_buff_clear_frags_flag(xdp);
> +	}
>   
>   	orig_data_end = xdp->data_end;
>   	orig_data = xdp->data;
> @@ -4883,6 +4889,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>   		skb->len += off; /* positive on grow, negative on shrink */
>   	}
>   
> +	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
> +	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
> +	 */
> +	if (xdp_buff_has_frags(xdp))
> +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> +	else
> +		skb->data_len = 0;
> +
>   	/* check if XDP changed eth hdr such SKB needs update */
>   	eth = (struct ethhdr *)xdp->data;
>   	if ((orig_eth_type != eth->h_proto) ||
> @@ -4916,12 +4930,118 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>   	return act;
>   }
>   
> +static int netif_skb_segment_for_xdp(struct sk_buff **pskb,

This function "...segment_for_xdp" always reallocates the SKB and copies all
bits over.
Should it have been named "skb_realloc_for_xdp" ?

I was really hoping we can find a design to avoid doing this realloc.

If the BPF-prog doesn't write into any of the fragments, then we can
avoid this realloc (+copy) dance. We designed XDP multi-buff to have
exactly the same layout+location as SKB in skb_shared_info, exactly to
avoid having to reallocated.

More comments inline below...

> +				     struct bpf_prog *prog)
> +{
> +#if IS_ENABLED(CONFIG_PAGE_POOL)
> +	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
> +	u32 size, truesize, len, max_head_size, off;
> +	struct sk_buff *skb = *pskb, *nskb;
> +	int err, i, head_off;
> +	void *data;
> +
> +	/* XDP does not support fraglist so we need to linearize
> +	 * the skb.
> +	 */
> +	if (skb_has_frag_list(skb) || !prog->aux->xdp_has_frags)
> +		return -EOPNOTSUPP;
> +
> +	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - XDP_PACKET_HEADROOM);
> +	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
> +		return -ENOMEM;
> +
> +	size = min_t(u32, skb->len, max_head_size);
> +	truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
> +	data = page_pool_dev_alloc_va(sd->page_pool, &truesize);
> +	if (!data)
> +		return -ENOMEM;
> +
> +	nskb = napi_build_skb(data, truesize);
> +	if (!nskb) {
> +		page_pool_free_va(sd->page_pool, data, true);
> +		return -ENOMEM;
> +	}
> +
> +	skb_reserve(nskb, XDP_PACKET_HEADROOM);
> +	skb_copy_header(nskb, skb);
> +	skb_mark_for_recycle(nskb);
> +
> +	err = skb_copy_bits(skb, 0, nskb->data, size);

This will likely copy part of the "frags" into the SKB "head" area.

Especially for netstack generated TCP packets, this will change the
segmentation layout significantly.  I wonder what (performance) effects
this will have on further handling of these SKBs.



> +	if (err) {
> +		consume_skb(nskb);
> +		return err;
> +	}
> +	skb_put(nskb, size);
> +
> +	head_off = skb_headroom(nskb) - skb_headroom(skb);
> +	skb_headers_offset_update(nskb, head_off);
> +
> +	off = size;
> +	len = skb->len - off;
> +	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> +		struct page *page;
> +		u32 page_off;
> +
> +		size = min_t(u32, len, PAGE_SIZE);
> +		truesize = size;
> +
> +		page = page_pool_dev_alloc(sd->page_pool, &page_off,
> +					   &truesize);
> +		if (!data) {
> +			consume_skb(nskb);
> +			return -ENOMEM;
> +		}
> +
> +		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
> +		err = skb_copy_bits(skb, off, page_address(page) + page_off,
> +				    size);

I think it is correct, but we can easily end up with the new SKB (nskb)
having a different nskb->nr_frags.


> +		if (err) {
> +			consume_skb(nskb);
> +			return err;
> +		}
> +
> +		len -= size;
> +		off += size;
> +	}
> +
> +	consume_skb(skb);
> +	*pskb = nskb;
> +
> +	return 0;
> +#else
> +	return -EOPNOTSUPP;
> +#endif
> +}
> +
> +static int netif_skb_check_for_xdp(struct sk_buff **pskb,
> +				   struct bpf_prog *prog)
> +{
> +	struct sk_buff *skb = *pskb;
> +	int err, hroom, troom;
> +
> +	if (!netif_skb_segment_for_xdp(pskb, prog))
> +		return 0;

IMHO the code call logic does not make it easy to add cases where we
can avoid the realloc.  With this patch, it feels like the realloc+copy
code path is the "main" code path for XDP-generic.

Our goal should be to avoid realloc.

My goal for XDP multi-buff was/is that it can co-exist with GSO/GRO
packets.  This patchset is a step in the direction of enabling GRO on
devices with XDP (generic) loaded.  And I was really excited about this,
but the overhead is going to be so massive compared to normal GRO (without
realloc+copy) that XDP end-users are going to be disappointed.


> +
> +	/* In case we have to go down the path and also linearize,
> +	 * then lets do the pskb_expand_head() work just once here.
> +	 */
> +	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> +	troom = skb->tail + skb->data_len - skb->end;
> +	err = pskb_expand_head(skb,
> +			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> +			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
> +	if (err)
> +		return err;
> +
> +	return skb_linearize(skb);
> +}
> +
>   static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
>   				     struct xdp_buff *xdp,
>   				     struct bpf_prog *xdp_prog)
>   {
>   	struct sk_buff *skb = *pskb;
> -	u32 act = XDP_DROP;
> +	u32 mac_len, act = XDP_DROP;
>   
>   	/* Reinjected packets coming from act_mirred or similar should
>   	 * not get XDP generic processing.
> @@ -4929,41 +5049,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
>   	if (skb_is_redirected(skb))
>   		return XDP_PASS;
>   
> -	/* XDP packets must be linear and must have sufficient headroom
> -	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
> -	 * native XDP provides, thus we need to do it here as well.
> +	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
> +	 * bytes. This is the guarantee that also native XDP provides,
> +	 * thus we need to do it here as well.

Some "native" XDP providers only have 192 bytes as HEADROOM, and XDP code
can handle this not being static (256 bytes).  So, perhaps it is time to allow
XDP generic to only require 192 bytes?

>   	 */
> +	mac_len = skb->data - skb_mac_header(skb);
> +	__skb_push(skb, mac_len);
> +
>   	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
>   	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> -		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> -		int troom = skb->tail + skb->data_len - skb->end;
> -
> -		/* In case we have to go down the path and also linearize,
> -		 * then lets do the pskb_expand_head() work just once here.
> -		 */
> -		if (pskb_expand_head(skb,
> -				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> -				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
> -			goto do_drop;
> -		if (skb_linearize(skb))
> +		if (netif_skb_check_for_xdp(pskb, xdp_prog))
>   			goto do_drop;
>   	}
>   
> -	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
> +	__skb_pull(*pskb, mac_len);
> +
> +	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
>   	switch (act) {
>   	case XDP_REDIRECT:
>   	case XDP_TX:
>   	case XDP_PASS:
>   		break;
>   	default:
> -		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
> +		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
>   		fallthrough;
>   	case XDP_ABORTED:
> -		trace_xdp_exception(skb->dev, xdp_prog, act);
> +		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
>   		fallthrough;
>   	case XDP_DROP:
>   	do_drop:
> -		kfree_skb(skb);
> +		kfree_skb(*pskb);
>   		break;
>   	}
>   

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 3/3] xdp: add multi-buff support for xdp running in generic mode
  2023-12-20 16:01   ` Jesper Dangaard Brouer
@ 2023-12-21  8:23     ` Lorenzo Bianconi
  0 siblings, 0 replies; 13+ messages in thread
From: Lorenzo Bianconi @ 2023-12-21  8:23 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, lorenzo.bianconi, davem, kuba, edumazet, pabeni, bpf,
	toke, willemdebruijn.kernel, jasowang, sdf, Yan Zhai

[-- Attachment #1: Type: text/plain, Size: 10008 bytes --]

> 
> 
> On 14/12/2023 15.29, Lorenzo Bianconi wrote:
> > Similar to native xdp, do not always linearize the skb in
> > netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
> > processed by the eBPF program. This allow to add  multi-buffer support
> > for xdp running in generic mode.
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >   net/core/dev.c | 153 +++++++++++++++++++++++++++++++++++++++++++------
> >   1 file changed, 134 insertions(+), 19 deletions(-)
> > 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index d7857de03dba..47164acc3268 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -4854,6 +4854,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >   	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
> >   	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
> >   			 skb_headlen(skb) + mac_len, true);
> > +	if (skb_is_nonlinear(skb)) {
> > +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> > +		xdp_buff_set_frags_flag(xdp);
> > +	} else {
> > +		xdp_buff_clear_frags_flag(xdp);
> > +	}
> >   	orig_data_end = xdp->data_end;
> >   	orig_data = xdp->data;
> > @@ -4883,6 +4889,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >   		skb->len += off; /* positive on grow, negative on shrink */
> >   	}
> > +	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
> > +	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
> > +	 */
> > +	if (xdp_buff_has_frags(xdp))
> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> > +	else
> > +		skb->data_len = 0;
> > +
> >   	/* check if XDP changed eth hdr such SKB needs update */
> >   	eth = (struct ethhdr *)xdp->data;
> >   	if ((orig_eth_type != eth->h_proto) ||
> > @@ -4916,12 +4930,118 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >   	return act;
> >   }
> > +static int netif_skb_segment_for_xdp(struct sk_buff **pskb,
> 
> This function "...segment_for_xdp" always reallocate SKB and copies all
> bits over.
> Should it have been named "skb_realloc_for_xdp" ?

Hi Jesper,

ack, naming is always hard :)

> 
> I was really hopeing we can find a design to avoid doing this realloc.
> 
> If the BPF-prog doesn't write into any of the fragments, then we can
> avoid this realloc (+copy) dance. We designed XDP multi-buff to have
> exactly the same layout+location as SKB in skb_shared_info, exactly to
> avoid having to reallocated.

I 100% agree with you, but we will need a similar copy fallback approach
anyway, right? It is just a matter of understanding if we should implement it
with page_pool or page_frag_cache (or something different).

> 
> More comments inline below...
> 
> > +				     struct bpf_prog *prog)
> > +{
> > +#if IS_ENABLED(CONFIG_PAGE_POOL)
> > +	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
> > +	u32 size, truesize, len, max_head_size, off;
> > +	struct sk_buff *skb = *pskb, *nskb;
> > +	int err, i, head_off;
> > +	void *data;
> > +
> > +	/* XDP does not support fraglist so we need to linearize
> > +	 * the skb.
> > +	 */
> > +	if (skb_has_frag_list(skb) || !prog->aux->xdp_has_frags)
> > +		return -EOPNOTSUPP;
> > +
> > +	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - XDP_PACKET_HEADROOM);
> > +	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
> > +		return -ENOMEM;
> > +
> > +	size = min_t(u32, skb->len, max_head_size);
> > +	truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
> > +	data = page_pool_dev_alloc_va(sd->page_pool, &truesize);
> > +	if (!data)
> > +		return -ENOMEM;
> > +
> > +	nskb = napi_build_skb(data, truesize);
> > +	if (!nskb) {
> > +		page_pool_free_va(sd->page_pool, data, true);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	skb_reserve(nskb, XDP_PACKET_HEADROOM);
> > +	skb_copy_header(nskb, skb);
> > +	skb_mark_for_recycle(nskb);
> > +
> > +	err = skb_copy_bits(skb, 0, nskb->data, size);
> 
> This will likely copy part of the "frags" into the SKB "head" area.
> 
> Especially for netstack generated TCP packets, this will change the
> segmentation layout significantly.  I wonder what (performance) effects
> this will have on further handling of these SKBs.

Do you think it can be a problem? I think in this way we can reduce the
number of allocated pages. Moreover, what about the case when the skb head is
bigger than a single page? Do you think we should use a bigger allocation order?

> 
> 
> 
> > +	if (err) {
> > +		consume_skb(nskb);
> > +		return err;
> > +	}
> > +	skb_put(nskb, size);
> > +
> > +	head_off = skb_headroom(nskb) - skb_headroom(skb);
> > +	skb_headers_offset_update(nskb, head_off);
> > +
> > +	off = size;
> > +	len = skb->len - off;
> > +	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> > +		struct page *page;
> > +		u32 page_off;
> > +
> > +		size = min_t(u32, len, PAGE_SIZE);
> > +		truesize = size;
> > +
> > +		page = page_pool_dev_alloc(sd->page_pool, &page_off,
> > +					   &truesize);
> > +		if (!data) {
> > +			consume_skb(nskb);
> > +			return -ENOMEM;
> > +		}
> > +
> > +		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
> > +		err = skb_copy_bits(skb, off, page_address(page) + page_off,
> > +				    size);
> 
> I think it is correct, but we can easily endup with the new SKB (nskb)
> having a different nskb->nr_frags.

see above

> 
> 
> > +		if (err) {
> > +			consume_skb(nskb);
> > +			return err;
> > +		}
> > +
> > +		len -= size;
> > +		off += size;
> > +	}
> > +
> > +	consume_skb(skb);
> > +	*pskb = nskb;
> > +
> > +	return 0;
> > +#else
> > +	return -EOPNOTSUPP;
> > +#endif
> > +}
> > +
> > +static int netif_skb_check_for_xdp(struct sk_buff **pskb,
> > +				   struct bpf_prog *prog)
> > +{
> > +	struct sk_buff *skb = *pskb;
> > +	int err, hroom, troom;
> > +
> > +	if (!netif_skb_segment_for_xdp(pskb, prog))
> > +		return 0;
> 
> IMHO the code call logic, does not make it easy to add cases where we
> can avoid the realloc.  With this patch, it feels like the realloc+copy
> code path is the "main" code path for XDP-generic.

ack, I think it would depend on the logic to avoid realloc+copy. We could
re-evaluate it when we have this logic in place. What do you think?

> 
> Our goal should be to avoid realloc.
> 
> My goal for XDP multi-buff was/is that it can co-exist with GSO/GRO
> packets.  This patchset is a step in the direction of enabling GRO on
> devices with XDP (generic) loaded.  And I was really excited about this,
> but the overhead is going to be massive compared to normal GRO (without
> realloc+copy) that XDP end-users are going to be disappointed.
> 
> 
> > +
> > +	/* In case we have to go down the path and also linearize,
> > +	 * then lets do the pskb_expand_head() work just once here.
> > +	 */
> > +	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> > +	troom = skb->tail + skb->data_len - skb->end;
> > +	err = pskb_expand_head(skb,
> > +			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> > +			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
> > +	if (err)
> > +		return err;
> > +
> > +	return skb_linearize(skb);
> > +}
> > +
> >   static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
> >   				     struct xdp_buff *xdp,
> >   				     struct bpf_prog *xdp_prog)
> >   {
> >   	struct sk_buff *skb = *pskb;
> > -	u32 act = XDP_DROP;
> > +	u32 mac_len, act = XDP_DROP;
> >   	/* Reinjected packets coming from act_mirred or similar should
> >   	 * not get XDP generic processing.
> > @@ -4929,41 +5049,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
> >   	if (skb_is_redirected(skb))
> >   		return XDP_PASS;
> > -	/* XDP packets must be linear and must have sufficient headroom
> > -	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
> > -	 * native XDP provides, thus we need to do it here as well.
> > +	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
> > +	 * bytes. This is the guarantee that also native XDP provides,
> > +	 * thus we need to do it here as well.
> 
> Some "native" XDP provider only have 192 bytes as HEADROOM and XDP code
> can this not being static (256 bytes).  So, perhaps it is time to allow
> XDP generic to only require 192 bytes?

ack, agree. I think this can be added a separated patch.

Regards,
Lorenzo

> 
> >   	 */
> > +	mac_len = skb->data - skb_mac_header(skb);
> > +	__skb_push(skb, mac_len);
> > +
> >   	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
> >   	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> > -		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> > -		int troom = skb->tail + skb->data_len - skb->end;
> > -
> > -		/* In case we have to go down the path and also linearize,
> > -		 * then lets do the pskb_expand_head() work just once here.
> > -		 */
> > -		if (pskb_expand_head(skb,
> > -				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> > -				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
> > -			goto do_drop;
> > -		if (skb_linearize(skb))
> > +		if (netif_skb_check_for_xdp(pskb, xdp_prog))
> >   			goto do_drop;
> >   	}
> > -	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
> > +	__skb_pull(*pskb, mac_len);
> > +
> > +	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
> >   	switch (act) {
> >   	case XDP_REDIRECT:
> >   	case XDP_TX:
> >   	case XDP_PASS:
> >   		break;
> >   	default:
> > -		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
> > +		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
> >   		fallthrough;
> >   	case XDP_ABORTED:
> > -		trace_xdp_exception(skb->dev, xdp_prog, act);
> > +		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
> >   		fallthrough;
> >   	case XDP_DROP:
> >   	do_drop:
> > -		kfree_skb(skb);
> > +		kfree_skb(*pskb);
> >   		break;
> >   	}

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2023-12-20 12:00     ` Jesper Dangaard Brouer
@ 2024-01-17 17:36       ` Lorenzo Bianconi
  2024-01-18  1:47         ` Jakub Kicinski
  0 siblings, 1 reply; 13+ messages in thread
From: Lorenzo Bianconi @ 2024-01-17 17:36 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, kuba
  Cc: Paolo Abeni, Ilias Apalodimas, netdev, Sebastian Andrzej Siewior,
	willemdebruijn.kernel, toke, davem, edumazet, bpf,
	lorenzo.bianconi, sdf, jasowang

[-- Attachment #1: Type: text/plain, Size: 2844 bytes --]

> 
> 
> On 19/12/2023 16.23, Paolo Abeni wrote:
> > On Thu, 2023-12-14 at 15:29 +0100, Lorenzo Bianconi wrote:
> > > Allocate percpu page_pools in softnet_data.
> > > Moreover add cpuid filed in page_pool struct in order to recycle the
> > > page in the page_pool "hot" cache if napi_pp_put_page() is running on
> > > the same cpu.
> > > This is a preliminary patch to add xdp multi-buff support for xdp running
> > > in generic mode.
> > > 
> > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > > ---
> > >   include/linux/netdevice.h       |  1 +
> > >   include/net/page_pool/helpers.h |  5 +++++
> > >   include/net/page_pool/types.h   |  1 +
> > >   net/core/dev.c                  | 39 ++++++++++++++++++++++++++++++++-
> > >   net/core/page_pool.c            |  5 +++++
> > >   net/core/skbuff.c               |  5 +++--
> > >   6 files changed, 53 insertions(+), 3 deletions(-)
> > 
> > @Jesper, @Ilias: could you please have a look at the pp bits?
> > 
> 
> I have some concerns... I'm still entertaining the idea, but we need to
> be aware of the tradeoffs we are making.
> 
> (1)
> Adding PP to softnet_data means per CPU caching 256 pages in the
> ptr_ring (plus likely 64 in the alloc-cache).   Fortunately, PP starts
> out empty, so as long as this PP isn't used they don't get cached. But
> if used, then PP don't have a MM shrinker that removes these cached
> pages, in case system is under MM pressure.  I guess, you can argue that
> keeping this per netdev rx-queue would make memory usage even higher.
> This is a tradeoff, we are trading memory (waste) for speed.
> 
> 
> (2) (Question to Jakub I guess)
> How does this connect with Jakub's PP netlink stats interface?
> E.g. I find it very practical that this allow us get PP stats per
> netdev, but in this case there isn't a netdev.
> 
> 
> (3) (Implicit locking)
> PP have lockless "alloc" because it it relies on drivers NAPI context.
> The places where netstack access softnet_data provide similar protection
> that we can rely on for PP, so this is likely correct implementation
> wise.  But it will give people like Sebastian (Cc) more gray hair when
> figuring out how PREEMPT_RT handle these cases.
> 
> (4)
> The optimization is needed for the case where we need to re-allocate and
> copy SKB fragments.  I think we should focus on avoiding this code path,
> instead of optimizing it.  For UDP it should be fairly easy, but for TCP
> this is harder.

Hi all,

I would resume this activity and it seems to me there is no clear direction
about where we should add the page_pool (in a per_cpu pointer or in
netdev_rx_queue struct) or if we can rely on page_frag_cache instead.

@Jakub: what do you think? Should we add a page_pool in a per_cpu pointer?

Regards,
Lorenzo

> 
> --Jesper

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2024-01-17 17:36       ` Lorenzo Bianconi
@ 2024-01-18  1:47         ` Jakub Kicinski
  2024-01-22 11:19           ` Lorenzo Bianconi
  0 siblings, 1 reply; 13+ messages in thread
From: Jakub Kicinski @ 2024-01-18  1:47 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Jesper Dangaard Brouer, Paolo Abeni, Ilias Apalodimas, netdev,
	Sebastian Andrzej Siewior, willemdebruijn.kernel, toke, davem,
	edumazet, bpf, lorenzo.bianconi, sdf, jasowang

On Wed, 17 Jan 2024 18:36:25 +0100 Lorenzo Bianconi wrote:
> I would resume this activity and it seems to me there is no a clear direction
> about where we should add the page_pool (in a per_cpu pointer or in
> netdev_rx_queue struct) or if we can rely on page_frag_cache instead.
> 
> @Jakub: what do you think? Should we add a page_pool in a per_cpu pointer?

Let's try to summarize. We want skb reallocation without linearization
for XDP generic. We need some fast-ish way to get pages for the payload.

First, options for placing the allocator:
 - struct netdev_rx_queue
 - per-CPU

IMO per-CPU has better scaling properties - you're less likely to
increase the CPU count to infinity than spawn extra netdev queues.

The second question is:
 - page_frag_cache
 - page_pool

I like the page pool because we have an increasing amount of infra for
it, and page pool is already used in veth, which we can hopefully also
de-duplicate if we have a per-CPU one, one day. But I do agree that
it's not a perfect fit.

To answer Jesper's questions:
 ad1. cache size - we can lower the cache to match page_frag_cache, 
      so I think 8 entries? page frag cache can give us bigger frags 
      and therefore lower frag count, so that's a minus for using 
      page pool
 ad2. nl API - we can extend netlink to dump unbound page pools fairly
      easily, I didn't want to do it without a clear use case, but I
      don't think there are any blockers
 ad3. locking - a bit independent of allocator but fair point, we assume
      XDP generic or Rx path for now, so sirq context / bh locked out
 ad4. right, well, right, IDK what real workloads need, and whether 
      XDP generic should be optimized at all.. I personally lean
      towards "no"
 
Sorry if I haven't helped much to clarify the direction :)
I have no strong preference on question #2, I would prefer to not add
per-queue state for something that's in no way tied to the device
(question #1 -> per-CPU). 

You did good perf analysis of the options, could you share it here
again?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct
  2024-01-18  1:47         ` Jakub Kicinski
@ 2024-01-22 11:19           ` Lorenzo Bianconi
  0 siblings, 0 replies; 13+ messages in thread
From: Lorenzo Bianconi @ 2024-01-22 11:19 UTC (permalink / raw)
  To: Jakub Kicinski, Jesper Dangaard Brouer, Ilias Apalodimas
  Cc: Lorenzo Bianconi, Paolo Abeni, netdev, Sebastian Andrzej Siewior,
	willemdebruijn.kernel, toke, davem, edumazet, bpf, sdf, jasowang

[-- Attachment #1: Type: text/plain, Size: 3826 bytes --]

> On Wed, 17 Jan 2024 18:36:25 +0100 Lorenzo Bianconi wrote:
> > I would resume this activity and it seems to me there is no a clear direction
> > about where we should add the page_pool (in a per_cpu pointer or in
> > netdev_rx_queue struct) or if we can rely on page_frag_cache instead.
> > 
> > @Jakub: what do you think? Should we add a page_pool in a per_cpu pointer?

Hi Jakub,

> 
> Let's try to summarize. We want skb reallocation without linearization
> for XDP generic. We need some fast-ish way to get pages for the payload.

correct

> 
> First, options for placing the allocator:
>  - struct netdev_rx_queue
>  - per-CPU
> 
> IMO per-CPU has better scaling properties - you're less likely to
> increase the CPU count to infinity than spawn extra netdev queues.

ack

> 
> The second question is:
>  - page_frag_cache
>  - page_pool
> 
> I like the page pool because we have an increasing amount of infra for
> it, and page pool is already used in veth, which we can hopefully also
> de-duplicate if we have a per-CPU one, one day. But I do agree that
> it's not a perfect fit.
> 
> To answer Jesper's questions:
>  ad1. cache size - we can lower the cache to match page_frag_cache, 
>       so I think 8 entries? page frag cache can give us bigger frags 
>       and therefore lower frag count, so that's a minus for using 
>       page pool
>  ad2. nl API - we can extend netlink to dump unbound page pools fairly
>       easily, I didn't want to do it without a clear use case, but I
>       don't think there are any blockers
>  ad3. locking - a bit independent of allocator but fair point, we assume
>       XDP generic or Rx path for now, so sirq context / bh locked out
>  ad4. right, well, right, IDK what real workloads need, and whether 
>       XDP generic should be optimized at all.. I personally lean
>       towards "no"
>  
> Sorry if I haven't helped much to clarify the direction :)
> I have no strong preference on question #2, I would prefer to not add
> per-queue state for something that's in no way tied to the device
> (question #1 -> per-CPU). 

Relying on netdev_alloc_cache/napi_alloc_cache will have the upside of reusing
current infrastructure (iirc my first revision used this approach).
The downside is we can't unify the code with the veth driver.
It is the other way around if we add per-cpu page_pools :). Anyway I am fine with having a
per-cpu page_pool similar to netdev_alloc_cache/napi_alloc_cache.

@Jesper/Ilias: what do you think?

> 
> You did good perf analysis of the options, could you share it here
> again?
> 

copying them out from my previous tests:

v00 (NS:ns0 - 192.168.0.1/24) <---> (NS:ns1 - 192.168.0.2/24) v01 ==(XDP_REDIRECT)==> v10 (NS:ns1 - 192.168.1.1/24) <---> (NS:ns2 - 192.168.1.2/24) v11

- v00: iperf3 client (pinned on core 0)
- v11: iperf3 server (pinned on core 7)

net-next veth codebase (page_pool APIs):
=======================================
- MTU  1500: ~ 5.42 Gbps
- MTU  8000: ~ 14.1 Gbps
- MTU 64000: ~ 18.4 Gbps

net-next veth codebase + netdev_alloc_cache/napi_alloc_cache:
=============================================================
- MTU  1500: ~ 6.62 Gbps
- MTU  8000: ~ 14.7 Gbps
- MTU 64000: ~ 19.7 Gbps

xdp_generic codebase + netdev_alloc_cache/napi_alloc_cache:
===========================================================
- MTU  1500: ~ 6.41 Gbps
- MTU  8000: ~ 14.2 Gbps
- MTU 64000: ~ 19.8 Gbps

xdp_generic codebase + page_pool in netdev_rx_queue:
====================================================
- MTU  1500: ~ 5.75 Gbps
- MTU  8000: ~ 15.3 Gbps
- MTU 64000: ~ 21.2 Gbps

IIRC relying on per-cpu page_pool has similar results of adding them in netdev_rx_queue,
but I can test them again with an updated kernel.

Regards,
Lorenzo

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2024-01-22 11:19 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-14 14:29 [PATCH v5 net-next 0/3] add multi-buff support for xdp running in generic mode Lorenzo Bianconi
2023-12-14 14:29 ` [PATCH v5 net-next 1/3] net: introduce page_pool pointer in softnet_data percpu struct Lorenzo Bianconi
2023-12-19 15:23   ` Paolo Abeni
2023-12-20 12:00     ` Jesper Dangaard Brouer
2024-01-17 17:36       ` Lorenzo Bianconi
2024-01-18  1:47         ` Jakub Kicinski
2024-01-22 11:19           ` Lorenzo Bianconi
2023-12-19 16:29   ` Eric Dumazet
2023-12-19 17:32     ` Lorenzo Bianconi
2023-12-14 14:29 ` [PATCH v5 net-next 2/3] xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp Lorenzo Bianconi
2023-12-14 14:29 ` [PATCH v5 net-next 3/3] xdp: add multi-buff support for xdp running in generic mode Lorenzo Bianconi
2023-12-20 16:01   ` Jesper Dangaard Brouer
2023-12-21  8:23     ` Lorenzo Bianconi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.