All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf-next] xsk: support AF_PACKET
@ 2021-05-28  6:08 Xuan Zhuo
  2021-05-28  8:27   ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Xuan Zhuo @ 2021-05-28  6:08 UTC (permalink / raw)
  To: netdev, bpf
  Cc: Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Willem de Bruijn, Xie He, Eric Dumazet, John Ogness,
	Wang Hai, Xuan Zhuo, Tanner Love, Eyal Birger, Menglong Dong

In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
rx/tx data packets. This feature is very important in many cases. So
this patch allows AF_PACKET to obtain xsk packages.

By default, AF_PACKET is based on ptype_base/ptype_all in dev.c to
obtain data packets. But xsk is not suitable for calling these
callbacks, because it may send the packet to other protocol stacks. So
the method I used is to let AF_PACKET get the data packet from xsk
alone.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 include/net/xdp_sock.h |  15 +++++
 net/packet/af_packet.c |  35 +++++++++--
 net/packet/internal.h  |   7 +++
 net/xdp/Makefile       |   2 +-
 net/xdp/xsk.c          |   9 +++
 net/xdp/xsk_packet.c   | 129 +++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_packet.h   |  44 ++++++++++++++
 7 files changed, 234 insertions(+), 7 deletions(-)
 create mode 100644 net/xdp/xsk_packet.c
 create mode 100644 net/xdp/xsk_packet.h

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 9c0722c6d7ac..b0acf0293132 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -17,6 +17,11 @@ struct net_device;
 struct xsk_queue;
 struct xdp_buff;
 
+struct xsk_packet {
+	struct list_head list;
+	struct packet_type *pt;
+};
+
 struct xdp_umem {
 	void *addrs;
 	u64 size;
@@ -79,6 +84,8 @@ struct xdp_sock {
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(void);
+void xsk_add_pack(struct xsk_packet *xpt);
+void __xsk_remove_pack(struct xsk_packet *xpt);
 
 #else
 
@@ -96,6 +103,14 @@ static inline void __xsk_map_flush(void)
 {
 }
 
+void xsk_add_pack(struct xsk_packet *xpt)
+{
+}
+
+void __xsk_remove_pack(struct xsk_packet *xpt)
+{
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 597d798ac0a5..2720b51d13a6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -303,10 +303,14 @@ static void __register_prot_hook(struct sock *sk)
 	struct packet_sock *po = pkt_sk(sk);
 
 	if (!po->running) {
-		if (po->fanout)
+		if (po->fanout) {
 			__fanout_link(sk, po);
-		else
+		} else {
 			dev_add_pack(&po->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+			xsk_add_pack(&po->xsk_pt);
+#endif
+		}
 
 		sock_hold(sk);
 		po->running = 1;
@@ -333,10 +337,14 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
 
 	po->running = 0;
 
-	if (po->fanout)
+	if (po->fanout) {
 		__fanout_unlink(sk, po);
-	else
+	} else {
 		__dev_remove_pack(&po->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+		__xsk_remove_pack(&po->xsk_pt);
+#endif
+	}
 
 	__sock_put(sk);
 
@@ -1483,8 +1491,12 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
 	rcu_assign_pointer(f->arr[f->num_members], sk);
 	smp_wmb();
 	f->num_members++;
-	if (f->num_members == 1)
+	if (f->num_members == 1) {
 		dev_add_pack(&f->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+		xsk_add_pack(&f->xsk_pt);
+#endif
+	}
 	spin_unlock(&f->lock);
 }
 
@@ -1504,8 +1516,12 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
 			   rcu_dereference_protected(f->arr[f->num_members - 1],
 						     lockdep_is_held(&f->lock)));
 	f->num_members--;
-	if (f->num_members == 0)
+	if (f->num_members == 0) {
 		__dev_remove_pack(&f->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+		__xsk_remove_pack(&po->xsk_pt);
+#endif
+	}
 	spin_unlock(&f->lock);
 }
 
@@ -1737,6 +1753,10 @@ static int fanout_add(struct sock *sk, struct fanout_args *args)
 		match->prot_hook.af_packet_priv = match;
 		match->prot_hook.id_match = match_fanout_group;
 		match->max_num_members = args->max_num_members;
+#ifdef CONFIG_XDP_SOCKETS
+		match->xsk_pt.pt = &match->prot_hook;
+#endif
+
 		list_add(&match->list, &fanout_list);
 	}
 	err = -EINVAL;
@@ -3315,6 +3335,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 		po->prot_hook.func = packet_rcv_spkt;
 
 	po->prot_hook.af_packet_priv = sk;
+#ifdef CONFIG_XDP_SOCKETS
+	po->xsk_pt.pt = &po->prot_hook;
+#endif
 
 	if (proto) {
 		po->prot_hook.type = proto;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 48af35b1aed2..d224b926588a 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -3,6 +3,7 @@
 #define __PACKET_INTERNAL_H__
 
 #include <linux/refcount.h>
+#include <net/xdp_sock.h>
 
 struct packet_mclist {
 	struct packet_mclist	*next;
@@ -94,6 +95,9 @@ struct packet_fanout {
 	spinlock_t		lock;
 	refcount_t		sk_ref;
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xsk_packet	xsk_pt;
+#endif
 	struct sock	__rcu	*arr[];
 };
 
@@ -136,6 +140,9 @@ struct packet_sock {
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xsk_packet	xsk_pt;
+#endif
 	atomic_t		tp_drops ____cacheline_aligned_in_smp;
 };
 
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index 30cdc4315f42..bcac0591879b 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o xsk_packet.o
 obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
 obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cd62d4ba87a9..fc97e7f9e4cb 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -28,6 +28,7 @@
 
 #include "xsk_queue.h"
 #include "xdp_umem.h"
+#include "xsk_packet.h"
 #include "xsk.h"
 
 #define TX_BATCH_SIZE 32
@@ -156,6 +157,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 	int err;
 
 	addr = xp_get_handle(xskb);
+	xsk_rx_packet_deliver(xs, addr, len);
 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
 	if (err) {
 		xs->rx_queue_full++;
@@ -347,6 +349,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
 			goto out;
 
+		xsk_tx_zc_packet_deliver(xs, desc);
+
 		xskq_cons_release(xs->tx);
 		rcu_read_unlock();
 		return true;
@@ -576,6 +580,8 @@ static int xsk_generic_xmit(struct sock *sk)
 		}
 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 
+		xsk_tx_packet_deliver(xs, &desc, skb);
+
 		err = __dev_direct_xmit(skb, xs->queue_id);
 		if  (err == NETDEV_TX_BUSY) {
 			/* Tell user-space to retry the send */
@@ -1467,6 +1473,9 @@ static int __init xsk_init(void)
 
 	for_each_possible_cpu(cpu)
 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
+
+	INIT_LIST_HEAD(&xsk_pt);
+
 	return 0;
 
 out_pernet:
diff --git a/net/xdp/xsk_packet.c b/net/xdp/xsk_packet.c
new file mode 100644
index 000000000000..41005f214d6d
--- /dev/null
+++ b/net/xdp/xsk_packet.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets packet api
+ *
+ * Author: Xuan Zhuo <xuanzhuo.dxf@linux.alibaba.com>
+ */
+
+#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
+#include "xsk.h"
+#include "xsk_packet.h"
+
+struct list_head xsk_pt __read_mostly;
+static DEFINE_SPINLOCK(pt_lock);
+
+static struct sk_buff *xsk_pt_alloc_skb(struct xdp_sock *xs,
+					struct xdp_desc *desc)
+{
+	struct sk_buff *skb;
+	void *buffer;
+	int err;
+
+	skb = alloc_skb(desc->len, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	skb_put(skb, desc->len);
+
+	buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+	err = skb_store_bits(skb, 0, buffer, desc->len);
+	if (unlikely(err)) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	return skb;
+}
+
+static struct sk_buff *xsk_pt_get_skb(struct xdp_sock *xs,
+				      struct xdp_desc *desc,
+				      struct sk_buff *skb,
+				      bool rx)
+{
+	struct net_device *dev = xs->dev;
+
+	/* We must copy the data, because skb may exist for a long time
+	 * on AF_PACKET. If the buffer of the xsk is used by skb, the
+	 * release of xsk and the reuse of the buffer will be affected.
+	 */
+	if (!skb || (dev->priv_flags & IFF_TX_SKB_NO_LINEAR))
+		skb = xsk_pt_alloc_skb(xs, desc);
+	else
+		skb = skb_clone(skb, GFP_ATOMIC);
+
+	if (!skb)
+		return NULL;
+
+	skb->protocol = eth_type_trans(skb, dev);
+	skb_reset_network_header(skb);
+	skb->transport_header = skb->network_header;
+	__net_timestamp(skb);
+
+	if (!rx)
+		skb->pkt_type = PACKET_OUTGOING;
+
+	return skb;
+}
+
+void __xsk_pt_deliver(struct xdp_sock *xs, struct sk_buff *skb,
+		      struct xdp_desc *desc, bool rx)
+{
+	struct packet_type *pt_prev = NULL;
+	struct packet_type *ptype;
+	struct xsk_packet *xpt;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xpt, &xsk_pt, list) {
+		ptype = xpt->pt;
+
+		if (!rx && ptype->ignore_outgoing)
+			continue;
+
+		if (pt_prev) {
+			refcount_inc(&skb->users);
+			pt_prev->func(skb, skb->dev, pt_prev, skb->dev);
+			pt_prev = ptype;
+			continue;
+		}
+
+		skb = xsk_pt_get_skb(xs, desc, skb, rx);
+		if (unlikely(!skb))
+			goto out_unlock;
+
+		pt_prev = ptype;
+	}
+
+	if (pt_prev)
+		pt_prev->func(skb, skb->dev, pt_prev, skb->dev);
+
+out_unlock:
+	rcu_read_unlock();
+}
+
+void xsk_add_pack(struct xsk_packet *xpt)
+{
+	if (xpt->pt->type != htons(ETH_P_ALL))
+		return;
+
+	spin_lock(&pt_lock);
+	list_add_rcu(&xpt->list, &xsk_pt);
+	spin_unlock(&pt_lock);
+}
+
+void __xsk_remove_pack(struct xsk_packet *xpt)
+{
+	struct xsk_packet *xpt1;
+
+	spin_lock(&pt_lock);
+
+	list_for_each_entry(xpt1, &xsk_pt, list) {
+		if (xpt1 == xpt) {
+			list_del_rcu(&xpt1->list);
+			goto out;
+		}
+	}
+
+	pr_warn("xsk_remove_pack: %p not found\n", xpt);
+out:
+	spin_unlock(&pt_lock);
+}
diff --git a/net/xdp/xsk_packet.h b/net/xdp/xsk_packet.h
new file mode 100644
index 000000000000..55d30fa8828b
--- /dev/null
+++ b/net/xdp/xsk_packet.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __XSK_PACKET_H__
+#define __XSK_PACKET_H__
+extern struct list_head xsk_pt __read_mostly;
+
+void __xsk_pt_deliver(struct xdp_sock *xs, struct sk_buff *skb,
+		      struct xdp_desc *desc, bool rx);
+
+static inline void xsk_tx_packet_deliver(struct xdp_sock *xs,
+					 struct xdp_desc *desc,
+					 struct sk_buff *skb)
+{
+	if (likely(list_empty(&xsk_pt)))
+		return;
+
+	local_bh_disable();
+	__xsk_pt_deliver(xs, skb, desc, false);
+	local_bh_enable();
+}
+
+static inline void xsk_tx_zc_packet_deliver(struct xdp_sock *xs,
+					    struct xdp_desc *desc)
+{
+	if (likely(list_empty(&xsk_pt)))
+		return;
+
+	__xsk_pt_deliver(xs, NULL, desc, false);
+}
+
+static inline void xsk_rx_packet_deliver(struct xdp_sock *xs, u64 addr, u32 len)
+{
+	struct xdp_desc desc;
+
+	if (likely(list_empty(&xsk_pt)))
+		return;
+
+	desc.addr = addr;
+	desc.len = len;
+
+	__xsk_pt_deliver(xs, NULL, &desc, true);
+}
+
+#endif /* __XSK_PACKET_H__ */
-- 
2.31.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28  6:08 [PATCH bpf-next] xsk: support AF_PACKET Xuan Zhuo
@ 2021-05-28  8:27   ` kernel test robot
  2021-05-28  8:34   ` kernel test robot
  2021-05-28  8:55 ` Toke Høiland-Jørgensen
  2 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2021-05-28  8:27 UTC (permalink / raw)
  To: Xuan Zhuo, netdev, bpf
  Cc: kbuild-all, Björn Töpel, Magnus Karlsson,
	Jonathan Lemon, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend

[-- Attachment #1: Type: text/plain, Size: 2469 bytes --]

Hi Xuan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: h8300-randconfig-r012-20210528 (attached as .config)
compiler: h8300-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/814526a7683da5d39bbe21f94f18df3bb85a07e7
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
        git checkout 814526a7683da5d39bbe21f94f18df3bb85a07e7
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=h8300 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   h8300-linux-ld: net/core/xdp.o: in function `xsk_add_pack':
>> xdp.c:(.text+0x204): multiple definition of `xsk_add_pack'; net/core/filter.o:filter.c:(.text+0xe01c): first defined here
   h8300-linux-ld: net/core/xdp.o: in function `__xsk_remove_pack':
>> xdp.c:(.text+0x210): multiple definition of `__xsk_remove_pack'; net/core/filter.o:filter.c:(.text+0xe028): first defined here
   h8300-linux-ld: net/ethtool/ioctl.o: in function `xsk_add_pack':
   ioctl.c:(.text+0x23dc): multiple definition of `xsk_add_pack'; net/core/filter.o:filter.c:(.text+0xe01c): first defined here
   h8300-linux-ld: net/ethtool/ioctl.o: in function `__xsk_remove_pack':
   ioctl.c:(.text+0x23e8): multiple definition of `__xsk_remove_pack'; net/core/filter.o:filter.c:(.text+0xe028): first defined here
   h8300-linux-ld: net/ethtool/channels.o: in function `xsk_add_pack':
   channels.c:(.text+0x250): multiple definition of `xsk_add_pack'; net/core/filter.o:filter.c:(.text+0xe01c): first defined here
   h8300-linux-ld: net/ethtool/channels.o: in function `__xsk_remove_pack':
   channels.c:(.text+0x25c): multiple definition of `__xsk_remove_pack'; net/core/filter.o:filter.c:(.text+0xe028): first defined here

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27814 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
@ 2021-05-28  8:27   ` kernel test robot
  0 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2021-05-28  8:27 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2512 bytes --]

Hi Xuan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: h8300-randconfig-r012-20210528 (attached as .config)
compiler: h8300-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/814526a7683da5d39bbe21f94f18df3bb85a07e7
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
        git checkout 814526a7683da5d39bbe21f94f18df3bb85a07e7
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=h8300 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   h8300-linux-ld: net/core/xdp.o: in function `xsk_add_pack':
>> xdp.c:(.text+0x204): multiple definition of `xsk_add_pack'; net/core/filter.o:filter.c:(.text+0xe01c): first defined here
   h8300-linux-ld: net/core/xdp.o: in function `__xsk_remove_pack':
>> xdp.c:(.text+0x210): multiple definition of `__xsk_remove_pack'; net/core/filter.o:filter.c:(.text+0xe028): first defined here
   h8300-linux-ld: net/ethtool/ioctl.o: in function `xsk_add_pack':
   ioctl.c:(.text+0x23dc): multiple definition of `xsk_add_pack'; net/core/filter.o:filter.c:(.text+0xe01c): first defined here
   h8300-linux-ld: net/ethtool/ioctl.o: in function `__xsk_remove_pack':
   ioctl.c:(.text+0x23e8): multiple definition of `__xsk_remove_pack'; net/core/filter.o:filter.c:(.text+0xe028): first defined here
   h8300-linux-ld: net/ethtool/channels.o: in function `xsk_add_pack':
   channels.c:(.text+0x250): multiple definition of `xsk_add_pack'; net/core/filter.o:filter.c:(.text+0xe01c): first defined here
   h8300-linux-ld: net/ethtool/channels.o: in function `__xsk_remove_pack':
   channels.c:(.text+0x25c): multiple definition of `__xsk_remove_pack'; net/core/filter.o:filter.c:(.text+0xe028): first defined here

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 27814 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28  6:08 [PATCH bpf-next] xsk: support AF_PACKET Xuan Zhuo
@ 2021-05-28  8:34   ` kernel test robot
  2021-05-28  8:34   ` kernel test robot
  2021-05-28  8:55 ` Toke Høiland-Jørgensen
  2 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2021-05-28  8:34 UTC (permalink / raw)
  To: Xuan Zhuo, netdev, bpf
  Cc: kbuild-all, clang-built-linux, Björn Töpel,
	Magnus Karlsson, Jonathan Lemon, Jakub Kicinski,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend

[-- Attachment #1: Type: text/plain, Size: 2543 bytes --]

Hi Xuan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-a013-20210528 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 6505c630407c5feec818f0bb1c284f9eeebf2071)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/814526a7683da5d39bbe21f94f18df3bb85a07e7
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
        git checkout 814526a7683da5d39bbe21f94f18df3bb85a07e7
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from net/packet/af_packet.c:93:
   In file included from net/packet/internal.h:6:
>> include/net/xdp_sock.h:106:6: warning: no previous prototype for function 'xsk_add_pack' [-Wmissing-prototypes]
   void xsk_add_pack(struct xsk_packet *xpt)
        ^
   include/net/xdp_sock.h:106:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void xsk_add_pack(struct xsk_packet *xpt)
   ^
   static 
>> include/net/xdp_sock.h:110:6: warning: no previous prototype for function '__xsk_remove_pack' [-Wmissing-prototypes]
   void __xsk_remove_pack(struct xsk_packet *xpt)
        ^
   include/net/xdp_sock.h:110:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __xsk_remove_pack(struct xsk_packet *xpt)
   ^
   static 
   2 warnings generated.


vim +/xsk_add_pack +106 include/net/xdp_sock.h

   105	
 > 106	void xsk_add_pack(struct xsk_packet *xpt)
   107	{
   108	}
   109	
 > 110	void __xsk_remove_pack(struct xsk_packet *xpt)
   111	{
   112	}
   113	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 33495 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
@ 2021-05-28  8:34   ` kernel test robot
  0 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2021-05-28  8:34 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2606 bytes --]

Hi Xuan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-a013-20210528 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 6505c630407c5feec818f0bb1c284f9eeebf2071)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/814526a7683da5d39bbe21f94f18df3bb85a07e7
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Xuan-Zhuo/xsk-support-AF_PACKET/20210528-140828
        git checkout 814526a7683da5d39bbe21f94f18df3bb85a07e7
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from net/packet/af_packet.c:93:
   In file included from net/packet/internal.h:6:
>> include/net/xdp_sock.h:106:6: warning: no previous prototype for function 'xsk_add_pack' [-Wmissing-prototypes]
   void xsk_add_pack(struct xsk_packet *xpt)
        ^
   include/net/xdp_sock.h:106:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void xsk_add_pack(struct xsk_packet *xpt)
   ^
   static 
>> include/net/xdp_sock.h:110:6: warning: no previous prototype for function '__xsk_remove_pack' [-Wmissing-prototypes]
   void __xsk_remove_pack(struct xsk_packet *xpt)
        ^
   include/net/xdp_sock.h:110:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __xsk_remove_pack(struct xsk_packet *xpt)
   ^
   static 
   2 warnings generated.


vim +/xsk_add_pack +106 include/net/xdp_sock.h

   105	
 > 106	void xsk_add_pack(struct xsk_packet *xpt)
   107	{
   108	}
   109	
 > 110	void __xsk_remove_pack(struct xsk_packet *xpt)
   111	{
   112	}
   113	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 33495 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28  6:08 [PATCH bpf-next] xsk: support AF_PACKET Xuan Zhuo
  2021-05-28  8:27   ` kernel test robot
  2021-05-28  8:34   ` kernel test robot
@ 2021-05-28  8:55 ` Toke Høiland-Jørgensen
       [not found]   ` <1622192521.5931044-1-xuanzhuo@linux.alibaba.com>
  2 siblings, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-05-28  8:55 UTC (permalink / raw)
  To: Xuan Zhuo, netdev, bpf
  Cc: Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Willem de Bruijn, Xie He, Eric Dumazet, John Ogness,
	Wang Hai, Xuan Zhuo, Tanner Love, Eyal Birger, Menglong Dong

Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:

> In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
> rx/tx data packets. This feature is very important in many cases. So
> this patch allows AF_PACKET to obtain xsk packages.

You can use xdpdump to dump the packets from the XDP program before it
gets redirected into the XSK:
https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump

Doens't currently work on egress, but if/when we get a proper TX hook
that should be doable as well.

Wiring up XSK to AF_PACKET sounds a bit nonsensical: XSK is already a
transport to userspace, why would you need a second one?

-Toke


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
       [not found]   ` <1622192521.5931044-1-xuanzhuo@linux.alibaba.com>
@ 2021-05-28  9:25     ` Toke Høiland-Jørgensen
  2021-05-28  9:32       ` Maciej Fijalkowski
  2021-05-28  9:50     ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-05-28  9:25 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Willem de Bruijn, Xie He, Eric Dumazet, John Ogness,
	Wang Hai, Tanner Love, Eyal Birger, Menglong Dong, netdev, bpf

Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:

> On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
>>
>> > In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
>> > rx/tx data packets. This feature is very important in many cases. So
>> > this patch allows AF_PACKET to obtain xsk packages.
>>
>> You can use xdpdump to dump the packets from the XDP program before it
>> gets redirected into the XSK:
>> https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump
>
> Wow, this is a good idea.
>
>>
>> Doens't currently work on egress, but if/when we get a proper TX hook
>> that should be doable as well.
>>
>> Wiring up XSK to AF_PACKET sounds a bit nonsensical: XSK is already a
>> transport to userspace, why would you need a second one?
>
> I have some different ideas. In my opinion, just like AF_PACKET can monitor
> tcp/udp packets, AF_PACKET monitors xsk packets is the same.

But you're adding code in the fast path to do this, in a code path where
others have been working quite hard to squeeze out every drop of
performance (literally chasing single nanoseconds). So I'm sorry, but
this approach is just not going to fly.

What is your use case anyway? Yes, being able to run tcpdump and see the
packets is nice and convenient, but what do you actually want to use
this for? Just for debugging your application? System monitoring?
Something else?

-Toke


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28  9:25     ` Toke Høiland-Jørgensen
@ 2021-05-28  9:32       ` Maciej Fijalkowski
  0 siblings, 0 replies; 15+ messages in thread
From: Maciej Fijalkowski @ 2021-05-28  9:32 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Xuan Zhuo, Björn Töpel, Magnus Karlsson,
	Jonathan Lemon, David S. Miller, Jakub Kicinski,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, KP Singh, Willem de Bruijn, Xie He, Eric Dumazet,
	John Ogness, Wang Hai, Tanner Love, Eyal Birger, Menglong Dong,
	netdev, bpf

On Fri, May 28, 2021 at 11:25:56AM +0200, Toke Høiland-Jørgensen wrote:
> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
> 
> > On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
> >>
> >> > In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
> >> > rx/tx data packets. This feature is very important in many cases. So
> >> > this patch allows AF_PACKET to obtain xsk packages.
> >>
> >> You can use xdpdump to dump the packets from the XDP program before it
> >> gets redirected into the XSK:
> >> https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump
> >
> > Wow, this is a good idea.
> >
> >>
> >> Doens't currently work on egress, but if/when we get a proper TX hook
> >> that should be doable as well.
> >>
> >> Wiring up XSK to AF_PACKET sounds a bit nonsensical: XSK is already a
> >> transport to userspace, why would you need a second one?
> >
> > I have some different ideas. In my opinion, just like AF_PACKET can monitor
> > tcp/udp packets, AF_PACKET monitors xsk packets is the same.
> 
> But you're adding code in the fast path to do this, in a code path where
> others have been working quite hard to squeeze out every drop of
> performance (literally chasing single nanoseconds). So I'm sorry, but
> this approach is just not going to fly.

+1. Probably would be better for everyone if Xuan started a thread on list
what is his need.

> 
> What is your use case anyway? Yes, being able to run tcpdump and see the
> packets is nice and convenient, but what do you actually want to use
> this for? Just for debugging your application? System monitoring?
> Something else?
> 
> -Toke
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
       [not found]   ` <1622192521.5931044-1-xuanzhuo@linux.alibaba.com>
  2021-05-28  9:25     ` Toke Høiland-Jørgensen
@ 2021-05-28  9:50     ` Jesper Dangaard Brouer
  2021-05-28 10:00       ` Magnus Karlsson
  1 sibling, 1 reply; 15+ messages in thread
From: Jesper Dangaard Brouer @ 2021-05-28  9:50 UTC (permalink / raw)
  To: Xuan Zhuo, Eelco Chaudron, Lorenzo Bianconi
  Cc: brouer, Toke Høiland-Jørgensen, Björn Töpel,
	Magnus Karlsson, Jonathan Lemon, David S. Miller, Jakub Kicinski,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Willem de Bruijn, Xie He, Eric Dumazet, John Ogness,
	Wang Hai, Tanner Love, Eyal Birger, Menglong Dong, netdev, bpf

On Fri, 28 May 2021 17:02:01 +0800
Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:

> On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> > Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
> >  
> > > In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
> > > rx/tx data packets. This feature is very important in many cases. So
> > > this patch allows AF_PACKET to obtain xsk packages.  
> >
> > You can use xdpdump to dump the packets from the XDP program before it
> > gets redirected into the XSK:
> > https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump  
> 
> Wow, this is a good idea.

Yes, it is rather cool (credit to Eelco).  Notice the extra info you
can capture from 'exit', like XDP return codes, if_index, rx_queue.

The tool uses the perf ring-buffer to send/copy data to userspace.
This is actually surprisingly fast, but I still think AF_XDP will be
faster (but it usually 'steals' the packet).

Another (crazy?) idea is to extend this (and xdpdump), is to leverage
Hangbin's recent XDP_REDIRECT extension e624d4ed4aa8 ("xdp: Extend
xdp_redirect_map with broadcast support").  We now have a
xdp_redirect_map flag BPF_F_BROADCAST, what if we create a
BPF_F_CLONE_PASS flag?

The semantic meaning of BPF_F_CLONE_PASS flag is to copy/clone the
packet for the specified map target index (e.g AF_XDP map), but
afterwards it does like veth/cpumap and creates an SKB from the
xdp_frame (see __xdp_build_skb_from_frame()) and send to netstack.
(Feel free to kick me if this doesn't make any sense)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28  9:50     ` Jesper Dangaard Brouer
@ 2021-05-28 10:00       ` Magnus Karlsson
  2021-05-28 10:22         ` Daniel Borkmann
  0 siblings, 1 reply; 15+ messages in thread
From: Magnus Karlsson @ 2021-05-28 10:00 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Xuan Zhuo, Eelco Chaudron, Lorenzo Bianconi,
	Toke Høiland-Jørgensen, Björn Töpel,
	Magnus Karlsson, Jonathan Lemon, David S. Miller, Jakub Kicinski,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Willem de Bruijn, Xie He, Eric Dumazet, John Ogness,
	Wang Hai, Tanner Love, Eyal Birger, Menglong Dong,
	Network Development, bpf

On Fri, May 28, 2021 at 11:52 AM Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Fri, 28 May 2021 17:02:01 +0800
> Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> > On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> > > Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
> > >
> > > > In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
> > > > rx/tx data packets. This feature is very important in many cases. So
> > > > this patch allows AF_PACKET to obtain xsk packages.
> > >
> > > You can use xdpdump to dump the packets from the XDP program before it
> > > gets redirected into the XSK:
> > > https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump
> >
> > Wow, this is a good idea.
>
> Yes, it is rather cool (credit to Eelco).  Notice the extra info you
> can capture from 'exit', like XDP return codes, if_index, rx_queue.
>
> The tool uses the perf ring-buffer to send/copy data to userspace.
> This is actually surprisingly fast, but I still think AF_XDP will be
> faster (but it usually 'steals' the packet).
>
> Another (crazy?) idea is to extend this (and xdpdump), is to leverage
> Hangbin's recent XDP_REDIRECT extension e624d4ed4aa8 ("xdp: Extend
> xdp_redirect_map with broadcast support").  We now have a
> xdp_redirect_map flag BPF_F_BROADCAST, what if we create a
> BPF_F_CLONE_PASS flag?
>
> The semantic meaning of BPF_F_CLONE_PASS flag is to copy/clone the
> packet for the specified map target index (e.g AF_XDP map), but
> afterwards it does like veth/cpumap and creates an SKB from the
> xdp_frame (see __xdp_build_skb_from_frame()) and send to netstack.
> (Feel free to kick me if this doesn't make any sense)

This would be a smooth way to implement clone support for AF_XDP. If
we had this and someone added AF_XDP support to libpcap, we could both
capture AF_XDP traffic with tcpdump (using this clone functionality in
the XDP program) and speed up tcpdump for dumping traffic destined for
regular sockets. Would that solve your use case Xuan? Note that I have
not looked into the BPF_F_CLONE_PASS code, so do not know at this
point what it would take to support this for XSKMAPs.

> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28 10:00       ` Magnus Karlsson
@ 2021-05-28 10:22         ` Daniel Borkmann
  2021-05-28 10:54           ` Toke Høiland-Jørgensen
  2021-05-28 12:23           ` Jesper Dangaard Brouer
  0 siblings, 2 replies; 15+ messages in thread
From: Daniel Borkmann @ 2021-05-28 10:22 UTC (permalink / raw)
  To: Magnus Karlsson, Jesper Dangaard Brouer
  Cc: Xuan Zhuo, Eelco Chaudron, Lorenzo Bianconi,
	Toke Høiland-Jørgensen, Björn Töpel,
	Magnus Karlsson, Jonathan Lemon, David S. Miller, Jakub Kicinski,
	Alexei Starovoitov, John Fastabend, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
	Willem de Bruijn, Xie He, Eric Dumazet, John Ogness, Wang Hai,
	Tanner Love, Eyal Birger, Menglong Dong, Network Development,
	bpf

On 5/28/21 12:00 PM, Magnus Karlsson wrote:
> On Fri, May 28, 2021 at 11:52 AM Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
>> On Fri, 28 May 2021 17:02:01 +0800
>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>> On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
>>>>
>>>>> In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
>>>>> rx/tx data packets. This feature is very important in many cases. So
>>>>> this patch allows AF_PACKET to obtain xsk packages.
>>>>
>>>> You can use xdpdump to dump the packets from the XDP program before it
>>>> gets redirected into the XSK:
>>>> https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump
>>>
>>> Wow, this is a good idea.
>>
>> Yes, it is rather cool (credit to Eelco).  Notice the extra info you
>> can capture from 'exit', like XDP return codes, if_index, rx_queue.
>>
>> The tool uses the perf ring-buffer to send/copy data to userspace.
>> This is actually surprisingly fast, but I still think AF_XDP will be
>> faster (but it usually 'steals' the packet).
>>
>> Another (crazy?) idea is to extend this (and xdpdump), is to leverage
>> Hangbin's recent XDP_REDIRECT extension e624d4ed4aa8 ("xdp: Extend
>> xdp_redirect_map with broadcast support").  We now have a
>> xdp_redirect_map flag BPF_F_BROADCAST, what if we create a
>> BPF_F_CLONE_PASS flag?
>>
>> The semantic meaning of BPF_F_CLONE_PASS flag is to copy/clone the
>> packet for the specified map target index (e.g AF_XDP map), but
>> afterwards it does like veth/cpumap and creates an SKB from the
>> xdp_frame (see __xdp_build_skb_from_frame()) and send to netstack.
>> (Feel free to kick me if this doesn't make any sense)
> 
> This would be a smooth way to implement clone support for AF_XDP. If
> we had this and someone added AF_XDP support to libpcap, we could both
> capture AF_XDP traffic with tcpdump (using this clone functionality in
> the XDP program) and speed up tcpdump for dumping traffic destined for
> regular sockets. Would that solve your use case Xuan? Note that I have
> not looked into the BPF_F_CLONE_PASS code, so do not know at this
> point what it would take to support this for XSKMAPs.

Recently also ended up with something similar for our XDP LB to record pcaps [0] ;)
My question is.. tcpdump doesn't really care where the packet data comes from,
so why not extending libpcap's Linux-related internals to either capture from
perf RB or BPF ringbuf rather than AF_PACKET sockets? Cloning is slow, and if
you need to end up creating an skb which is then cloned once again inside AF_PACKET
it's even worse. Just relying and reading out, say, perf RB you don't need any
clones at all.

Thanks,
Daniel

   [0] https://cilium.io/blog/2021/05/20/cilium-110#pcap

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28 10:22         ` Daniel Borkmann
@ 2021-05-28 10:54           ` Toke Høiland-Jørgensen
  2021-05-28 11:29             ` Daniel Borkmann
  2021-05-28 12:23           ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-05-28 10:54 UTC (permalink / raw)
  To: Daniel Borkmann, Magnus Karlsson, Jesper Dangaard Brouer
  Cc: Xuan Zhuo, Eelco Chaudron, Lorenzo Bianconi,
	Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, KP Singh, Willem de Bruijn, Xie He, Eric Dumazet,
	John Ogness, Wang Hai, Tanner Love, Eyal Birger, Menglong Dong,
	Network Development, bpf

Daniel Borkmann <daniel@iogearbox.net> writes:

> On 5/28/21 12:00 PM, Magnus Karlsson wrote:
>> On Fri, May 28, 2021 at 11:52 AM Jesper Dangaard Brouer
>> <brouer@redhat.com> wrote:
>>> On Fri, 28 May 2021 17:02:01 +0800
>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>> On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
>>>>>
>>>>>> In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
>>>>>> rx/tx data packets. This feature is very important in many cases. So
>>>>>> this patch allows AF_PACKET to obtain xsk packages.
>>>>>
>>>>> You can use xdpdump to dump the packets from the XDP program before it
>>>>> gets redirected into the XSK:
>>>>> https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump
>>>>
>>>> Wow, this is a good idea.
>>>
>>> Yes, it is rather cool (credit to Eelco).  Notice the extra info you
>>> can capture from 'exit', like XDP return codes, if_index, rx_queue.
>>>
>>> The tool uses the perf ring-buffer to send/copy data to userspace.
>>> This is actually surprisingly fast, but I still think AF_XDP will be
>>> faster (but it usually 'steals' the packet).
>>>
>>> Another (crazy?) idea is to extend this (and xdpdump), is to leverage
>>> Hangbin's recent XDP_REDIRECT extension e624d4ed4aa8 ("xdp: Extend
>>> xdp_redirect_map with broadcast support").  We now have a
>>> xdp_redirect_map flag BPF_F_BROADCAST, what if we create a
>>> BPF_F_CLONE_PASS flag?
>>>
>>> The semantic meaning of BPF_F_CLONE_PASS flag is to copy/clone the
>>> packet for the specified map target index (e.g AF_XDP map), but
>>> afterwards it does like veth/cpumap and creates an SKB from the
>>> xdp_frame (see __xdp_build_skb_from_frame()) and send to netstack.
>>> (Feel free to kick me if this doesn't make any sense)
>> 
>> This would be a smooth way to implement clone support for AF_XDP. If
>> we had this and someone added AF_XDP support to libpcap, we could both
>> capture AF_XDP traffic with tcpdump (using this clone functionality in
>> the XDP program) and speed up tcpdump for dumping traffic destined for
>> regular sockets. Would that solve your use case Xuan? Note that I have
>> not looked into the BPF_F_CLONE_PASS code, so do not know at this
>> point what it would take to support this for XSKMAPs.
>
> Recently also ended up with something similar for our XDP LB to record pcaps [0] ;)
> My question is.. tcpdump doesn't really care where the packet data comes from,
> so why not extending libpcap's Linux-related internals to either capture from
> perf RB or BPF ringbuf rather than AF_PACKET sockets? Cloning is slow, and if
> you need to end up creating an skb which is then cloned once again inside AF_PACKET
> it's even worse. Just relying and reading out, say, perf RB you don't need any
> clones at all.

We discussed this when creating xdpdump and decided to keep it as a
separate tool for the time being. I forget the details of the
discussion, maybe Eelco remembers.

Anyway, xdpdump does have a "pipe pcap to stdout" feature so you can do
`xdpdump | tcpdump` and get the interactive output; and it will also
save pcap information to disk, of course (using pcap-ng so it can also
save metadata like XDP program name and return code).

-Toke


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28 10:54           ` Toke Høiland-Jørgensen
@ 2021-05-28 11:29             ` Daniel Borkmann
  2021-05-28 12:35               ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 15+ messages in thread
From: Daniel Borkmann @ 2021-05-28 11:29 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, Magnus Karlsson,
	Jesper Dangaard Brouer
  Cc: Xuan Zhuo, Eelco Chaudron, Lorenzo Bianconi,
	Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, KP Singh, Willem de Bruijn, Xie He, Eric Dumazet,
	John Ogness, Wang Hai, Tanner Love, Eyal Birger, Menglong Dong,
	Network Development, bpf

On 5/28/21 12:54 PM, Toke Høiland-Jørgensen wrote:
> Daniel Borkmann <daniel@iogearbox.net> writes:
>> On 5/28/21 12:00 PM, Magnus Karlsson wrote:
>>> On Fri, May 28, 2021 at 11:52 AM Jesper Dangaard Brouer
>>> <brouer@redhat.com> wrote:
>>>> On Fri, 28 May 2021 17:02:01 +0800
>>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>>> On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
>>>>>>
>>>>>>> In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
>>>>>>> rx/tx data packets. This feature is very important in many cases. So
>>>>>>> this patch allows AF_PACKET to obtain xsk packages.
>>>>>>
>>>>>> You can use xdpdump to dump the packets from the XDP program before it
>>>>>> gets redirected into the XSK:
>>>>>> https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump
>>>>>
>>>>> Wow, this is a good idea.
>>>>
>>>> Yes, it is rather cool (credit to Eelco).  Notice the extra info you
>>>> can capture from 'exit', like XDP return codes, if_index, rx_queue.
>>>>
>>>> The tool uses the perf ring-buffer to send/copy data to userspace.
>>>> This is actually surprisingly fast, but I still think AF_XDP will be
>>>> faster (but it usually 'steals' the packet).
>>>>
>>>> Another (crazy?) idea is to extend this (and xdpdump), is to leverage
>>>> Hangbin's recent XDP_REDIRECT extension e624d4ed4aa8 ("xdp: Extend
>>>> xdp_redirect_map with broadcast support").  We now have a
>>>> xdp_redirect_map flag BPF_F_BROADCAST, what if we create a
>>>> BPF_F_CLONE_PASS flag?
>>>>
>>>> The semantic meaning of BPF_F_CLONE_PASS flag is to copy/clone the
>>>> packet for the specified map target index (e.g AF_XDP map), but
>>>> afterwards it does like veth/cpumap and creates an SKB from the
>>>> xdp_frame (see __xdp_build_skb_from_frame()) and send to netstack.
>>>> (Feel free to kick me if this doesn't make any sense)
>>>
>>> This would be a smooth way to implement clone support for AF_XDP. If
>>> we had this and someone added AF_XDP support to libpcap, we could both
>>> capture AF_XDP traffic with tcpdump (using this clone functionality in
>>> the XDP program) and speed up tcpdump for dumping traffic destined for
>>> regular sockets. Would that solve your use case Xuan? Note that I have
>>> not looked into the BPF_F_CLONE_PASS code, so do not know at this
>>> point what it would take to support this for XSKMAPs.
>>
>> Recently also ended up with something similar for our XDP LB to record pcaps [0] ;)
>> My question is.. tcpdump doesn't really care where the packet data comes from,
>> so why not extending libpcap's Linux-related internals to either capture from
>> perf RB or BPF ringbuf rather than AF_PACKET sockets? Cloning is slow, and if
>> you need to end up creating an skb which is then cloned once again inside AF_PACKET
>> it's even worse. Just relying and reading out, say, perf RB you don't need any
>> clones at all.
> 
> We discussed this when creating xdpdump and decided to keep it as a
> separate tool for the time being. I forget the details of the
> discussion, maybe Eelco remembers.
> 
> Anyway, xdpdump does have a "pipe pcap to stdout" feature so you can do
> `xdpdump | tcpdump` and get the interactive output; and it will also
> save pcap information to disk, of course (using pcap-ng so it can also
> save metadata like XDP program name and return code).

Right, and this should yield a significantly better performance compared to
cloning & pushing traffic into AF_PACKET. I presume not many folks are aware
of xdpdump (yet) which is probably why such patch was created here.. a native
libpcap implementation could solve that aspect fwiw and additionally hook at
the same points as AF_PACKET via BPF but without the hassle/overhead of things
like dev_queue_xmit_nit() in fast path. (Maybe another option could be to have
a drop-in replacement libpcap.so for tcpdump using it transparently.)

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28 10:22         ` Daniel Borkmann
  2021-05-28 10:54           ` Toke Høiland-Jørgensen
@ 2021-05-28 12:23           ` Jesper Dangaard Brouer
  1 sibling, 0 replies; 15+ messages in thread
From: Jesper Dangaard Brouer @ 2021-05-28 12:23 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Magnus Karlsson, Xuan Zhuo, Eelco Chaudron, Lorenzo Bianconi,
	Toke Høiland-Jørgensen, Björn Töpel,
	Magnus Karlsson, Jonathan Lemon, David S. Miller, Jakub Kicinski,
	Alexei Starovoitov, John Fastabend, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
	Willem de Bruijn, Xie He, Eric Dumazet, John Ogness, Wang Hai,
	Tanner Love, Eyal Birger, Menglong Dong, Network Development,
	bpf, brouer

On Fri, 28 May 2021 12:22:40 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> On 5/28/21 12:00 PM, Magnus Karlsson wrote:
> > On Fri, May 28, 2021 at 11:52 AM Jesper Dangaard Brouer
> > <brouer@redhat.com> wrote:  
> >> On Fri, 28 May 2021 17:02:01 +0800
> >> Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:  
> >>> On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:  
> >>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
> >>>>  
> >>>>> In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
> >>>>> rx/tx data packets. This feature is very important in many cases. So
> >>>>> this patch allows AF_PACKET to obtain xsk packages.  
> >>>>
> >>>> You can use xdpdump to dump the packets from the XDP program before it
> >>>> gets redirected into the XSK:
> >>>> https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump  
> >>>
> >>> Wow, this is a good idea.  
> >>
> >> Yes, it is rather cool (credit to Eelco).  Notice the extra info you
> >> can capture from 'exit', like XDP return codes, if_index, rx_queue.
> >>
> >> The tool uses the perf ring-buffer to send/copy data to userspace.
> >> This is actually surprisingly fast, but I still think AF_XDP will be
> >> faster (but it usually 'steals' the packet).
> >>
> >> Another (crazy?) idea is to extend this (and xdpdump), is to leverage
> >> Hangbin's recent XDP_REDIRECT extension e624d4ed4aa8 ("xdp: Extend
> >> xdp_redirect_map with broadcast support").  We now have a
> >> xdp_redirect_map flag BPF_F_BROADCAST, what if we create a
> >> BPF_F_CLONE_PASS flag?
> >>
> >> The semantic meaning of BPF_F_CLONE_PASS flag is to copy/clone the
> >> packet for the specified map target index (e.g AF_XDP map), but
> >> afterwards it does like veth/cpumap and creates an SKB from the
> >> xdp_frame (see __xdp_build_skb_from_frame()) and send to netstack.
> >> (Feel free to kick me if this doesn't make any sense)  
> > 
> > This would be a smooth way to implement clone support for AF_XDP. If
> > we had this and someone added AF_XDP support to libpcap, we could both
> > capture AF_XDP traffic with tcpdump (using this clone functionality in
> > the XDP program) and speed up tcpdump for dumping traffic destined for
> > regular sockets. Would that solve your use case Xuan? Note that I have
> > not looked into the BPF_F_CLONE_PASS code, so do not know at this
> > point what it would take to support this for XSKMAPs.  

There is no spoon... the BPF_F_CLONE_PASS code is an idea.

> 
> Recently also ended up with something similar for our XDP LB to record pcaps [0] ;)
> My question is.. tcpdump doesn't really care where the packet data comes from,
> so why not extending libpcap's Linux-related internals to either capture from
> perf RB or BPF ringbuf 

Just want to first mention, that I do like adding a perf ring-buffer
(BPF ringbuf) interface to AF_PACKET.  But this is basically what
xdpdump already does.  The cool thing is that it is super flexible for
adding extra info like xdpdump does with XDP-return codes.


> rather than AF_PACKET sockets? Cloning is slow, and if
> you need to end up creating an skb which is then cloned once again inside AF_PACKET
> it's even worse. Just relying and reading out, say, perf RB you don't need any
> clones at all.

Well, this is exactly what we avoid with my idea of BPF_F_CLONE_PASS
when combined with AF_XDP. 

I should explain this idea better.  The trick is that AF_XDP have
preallocated all the packets it will every use (at setup time).   Thus,
the AF_XDP copy-mode does no allocations, which is why it is fast
(of-cause ZC mode is faster, but copy-mode AF_XDP is also VERY fast!).

(Details and step with AF_XDP code notes:)
When the xdp_do_redirect happens with ri->flags BPF_F_CLONE_PASS, then
the map specific enqueue (e.g. __xsk_map_redirect), will do a copy of
the xdp_buff (AF_XDP calls xsk_copy_xdp()) and for AF_XDP we don't need
to do a (real) allocation.  Instead of freeing the xdp_buff in xsk_rcv()
(see call to xdp_return_buff()) then we do the xdp_frame to SKB work.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH bpf-next] xsk: support AF_PACKET
  2021-05-28 11:29             ` Daniel Borkmann
@ 2021-05-28 12:35               ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 15+ messages in thread
From: Toke Høiland-Jørgensen @ 2021-05-28 12:35 UTC (permalink / raw)
  To: Daniel Borkmann, Magnus Karlsson, Jesper Dangaard Brouer
  Cc: Xuan Zhuo, Eelco Chaudron, Lorenzo Bianconi,
	Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
	Yonghong Song, KP Singh, Willem de Bruijn, Xie He, Eric Dumazet,
	John Ogness, Wang Hai, Tanner Love, Eyal Birger, Menglong Dong,
	Network Development, bpf

Daniel Borkmann <daniel@iogearbox.net> writes:

> On 5/28/21 12:54 PM, Toke Høiland-Jørgensen wrote:
>> Daniel Borkmann <daniel@iogearbox.net> writes:
>>> On 5/28/21 12:00 PM, Magnus Karlsson wrote:
>>>> On Fri, May 28, 2021 at 11:52 AM Jesper Dangaard Brouer
>>>> <brouer@redhat.com> wrote:
>>>>> On Fri, 28 May 2021 17:02:01 +0800
>>>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>>>> On Fri, 28 May 2021 10:55:58 +0200, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>>>>> Xuan Zhuo <xuanzhuo@linux.alibaba.com> writes:
>>>>>>>
>>>>>>>> In xsk mode, users cannot use AF_PACKET(tcpdump) to observe the current
>>>>>>>> rx/tx data packets. This feature is very important in many cases. So
>>>>>>>> this patch allows AF_PACKET to obtain xsk packages.
>>>>>>>
>>>>>>> You can use xdpdump to dump the packets from the XDP program before it
>>>>>>> gets redirected into the XSK:
>>>>>>> https://github.com/xdp-project/xdp-tools/tree/master/xdp-dump
>>>>>>
>>>>>> Wow, this is a good idea.
>>>>>
>>>>> Yes, it is rather cool (credit to Eelco).  Notice the extra info you
>>>>> can capture from 'exit', like XDP return codes, if_index, rx_queue.
>>>>>
>>>>> The tool uses the perf ring-buffer to send/copy data to userspace.
>>>>> This is actually surprisingly fast, but I still think AF_XDP will be
>>>>> faster (but it usually 'steals' the packet).
>>>>>
>>>>> Another (crazy?) idea is to extend this (and xdpdump), is to leverage
>>>>> Hangbin's recent XDP_REDIRECT extension e624d4ed4aa8 ("xdp: Extend
>>>>> xdp_redirect_map with broadcast support").  We now have a
>>>>> xdp_redirect_map flag BPF_F_BROADCAST, what if we create a
>>>>> BPF_F_CLONE_PASS flag?
>>>>>
>>>>> The semantic meaning of BPF_F_CLONE_PASS flag is to copy/clone the
>>>>> packet for the specified map target index (e.g AF_XDP map), but
>>>>> afterwards it does like veth/cpumap and creates an SKB from the
>>>>> xdp_frame (see __xdp_build_skb_from_frame()) and send to netstack.
>>>>> (Feel free to kick me if this doesn't make any sense)
>>>>
>>>> This would be a smooth way to implement clone support for AF_XDP. If
>>>> we had this and someone added AF_XDP support to libpcap, we could both
>>>> capture AF_XDP traffic with tcpdump (using this clone functionality in
>>>> the XDP program) and speed up tcpdump for dumping traffic destined for
>>>> regular sockets. Would that solve your use case Xuan? Note that I have
>>>> not looked into the BPF_F_CLONE_PASS code, so do not know at this
>>>> point what it would take to support this for XSKMAPs.
>>>
>>> Recently also ended up with something similar for our XDP LB to record pcaps [0] ;)
>>> My question is.. tcpdump doesn't really care where the packet data comes from,
>>> so why not extending libpcap's Linux-related internals to either capture from
>>> perf RB or BPF ringbuf rather than AF_PACKET sockets? Cloning is slow, and if
>>> you need to end up creating an skb which is then cloned once again inside AF_PACKET
>>> it's even worse. Just relying and reading out, say, perf RB you don't need any
>>> clones at all.
>> 
>> We discussed this when creating xdpdump and decided to keep it as a
>> separate tool for the time being. I forget the details of the
>> discussion, maybe Eelco remembers.
>> 
>> Anyway, xdpdump does have a "pipe pcap to stdout" feature so you can do
>> `xdpdump | tcpdump` and get the interactive output; and it will also
>> save pcap information to disk, of course (using pcap-ng so it can also
>> save metadata like XDP program name and return code).
>
> Right, and this should yield a significantly better performance compared to
> cloning & pushing traffic into AF_PACKET. I presume not many folks are aware
> of xdpdump (yet) which is probably why such patch was created here..

What, are you implying we haven't achieved world domination yet?
Inconceivable! ;)

> a native libpcap implementation could solve that aspect fwiw and
> additionally hook at the same points as AF_PACKET via BPF but without
> the hassle/overhead of things like dev_queue_xmit_nit() in fast path.
> (Maybe another option could be to have a drop-in replacement
> libpcap.so for tcpdump using it transparently.)

I do believe that Michael was open to adding something like this to
tcpdump/libpcap when I last talked to him about it; and I'm certainly
not opposed to it either! Hooking up tcpdump like this may be a bit of a
firehose, though, so it would be nice to be able to carry over the
kernel-side filtering as well. I suppose it should be possible to write
an eBPF bytecode generator that does a bit of setup and then just
translates the cBPF packet filtering ops, no? This would be cool to have
in any case; IIRC Cloudflare did something like that but took a detour
through C code generation?

-Toke


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2021-05-28 12:37 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-28  6:08 [PATCH bpf-next] xsk: support AF_PACKET Xuan Zhuo
2021-05-28  8:27 ` kernel test robot
2021-05-28  8:27   ` kernel test robot
2021-05-28  8:34 ` kernel test robot
2021-05-28  8:34   ` kernel test robot
2021-05-28  8:55 ` Toke Høiland-Jørgensen
     [not found]   ` <1622192521.5931044-1-xuanzhuo@linux.alibaba.com>
2021-05-28  9:25     ` Toke Høiland-Jørgensen
2021-05-28  9:32       ` Maciej Fijalkowski
2021-05-28  9:50     ` Jesper Dangaard Brouer
2021-05-28 10:00       ` Magnus Karlsson
2021-05-28 10:22         ` Daniel Borkmann
2021-05-28 10:54           ` Toke Høiland-Jørgensen
2021-05-28 11:29             ` Daniel Borkmann
2021-05-28 12:35               ` Toke Høiland-Jørgensen
2021-05-28 12:23           ` Jesper Dangaard Brouer

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.