All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf-next] xsk: support AF_PACKET
@ 2021-05-28  6:08 Xuan Zhuo
  2021-05-28  8:27   ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Xuan Zhuo @ 2021-05-28  6:08 UTC (permalink / raw)
  To: netdev, bpf
  Cc: Björn Töpel, Magnus Karlsson, Jonathan Lemon,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Willem de Bruijn, Xie He, Eric Dumazet, John Ogness,
	Wang Hai, Xuan Zhuo, Tanner Love, Eyal Birger, Menglong Dong

In xsk mode, users cannot use AF_PACKET (e.g. tcpdump) to observe the
rx/tx packets of the socket, which is an important debugging aid in
many cases. This patch allows AF_PACKET to obtain xsk packets.

By default, AF_PACKET obtains packets through the ptype_base/ptype_all
hooks in dev.c. xsk cannot simply call those callbacks, because doing
so may also hand the packet to other protocol stacks. Instead, this
patch lets AF_PACKET register a separate hook with xsk and receive the
xsk packets from there alone.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 include/net/xdp_sock.h |  15 +++++
 net/packet/af_packet.c |  35 +++++++++--
 net/packet/internal.h  |   7 +++
 net/xdp/Makefile       |   2 +-
 net/xdp/xsk.c          |   9 +++
 net/xdp/xsk_packet.c   | 129 +++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_packet.h   |  44 ++++++++++++++
 7 files changed, 234 insertions(+), 7 deletions(-)
 create mode 100644 net/xdp/xsk_packet.c
 create mode 100644 net/xdp/xsk_packet.h

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 9c0722c6d7ac..b0acf0293132 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -17,6 +17,11 @@ struct net_device;
 struct xsk_queue;
 struct xdp_buff;
 
+struct xsk_packet {
+	struct list_head list;	/* node on the global xsk_pt tap list */
+	struct packet_type *pt;	/* AF_PACKET hook whose ->func gets frames */
+};
+
 struct xdp_umem {
 	void *addrs;
 	u64 size;
@@ -79,6 +84,8 @@ struct xdp_sock {
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(void);
+void xsk_add_pack(struct xsk_packet *xpt);
+void __xsk_remove_pack(struct xsk_packet *xpt);
 
 #else
 
@@ -96,6 +103,14 @@ static inline void __xsk_map_flush(void)
 {
 }
 
+/* !CONFIG_XDP_SOCKETS stubs. They are static inline because a plain
+ * definition in a header is emitted by every .c file that includes it
+ * and then clashes at link time.
+ */
+static inline void xsk_add_pack(struct xsk_packet *xpt) { }
+
+static inline void __xsk_remove_pack(struct xsk_packet *xpt) { }
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 597d798ac0a5..2720b51d13a6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -303,10 +303,14 @@ static void __register_prot_hook(struct sock *sk)
 	struct packet_sock *po = pkt_sk(sk);
 
 	if (!po->running) {
-		if (po->fanout)
+		if (po->fanout) {
 			__fanout_link(sk, po);
-		else
+		} else {
 			dev_add_pack(&po->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+			xsk_add_pack(&po->xsk_pt);
+#endif
+		}
 
 		sock_hold(sk);
 		po->running = 1;
@@ -333,10 +337,14 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
 
 	po->running = 0;
 
-	if (po->fanout)
+	if (po->fanout) {
 		__fanout_unlink(sk, po);
-	else
+	} else {
 		__dev_remove_pack(&po->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+		__xsk_remove_pack(&po->xsk_pt);
+#endif
+	}
 
 	__sock_put(sk);
 
@@ -1483,8 +1491,12 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
 	rcu_assign_pointer(f->arr[f->num_members], sk);
 	smp_wmb();
 	f->num_members++;
-	if (f->num_members == 1)
+	if (f->num_members == 1) {
 		dev_add_pack(&f->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+		xsk_add_pack(&f->xsk_pt);
+#endif
+	}
 	spin_unlock(&f->lock);
 }
 
@@ -1504,8 +1516,12 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
 			   rcu_dereference_protected(f->arr[f->num_members - 1],
 						     lockdep_is_held(&f->lock)));
 	f->num_members--;
-	if (f->num_members == 0)
+	if (f->num_members == 0) {
 		__dev_remove_pack(&f->prot_hook);
+#ifdef CONFIG_XDP_SOCKETS
+		__xsk_remove_pack(&f->xsk_pt);
+#endif
+	}
 	spin_unlock(&f->lock);
 }
 
@@ -1737,6 +1753,10 @@ static int fanout_add(struct sock *sk, struct fanout_args *args)
 		match->prot_hook.af_packet_priv = match;
 		match->prot_hook.id_match = match_fanout_group;
 		match->max_num_members = args->max_num_members;
+#ifdef CONFIG_XDP_SOCKETS
+		match->xsk_pt.pt = &match->prot_hook;
+#endif
+
 		list_add(&match->list, &fanout_list);
 	}
 	err = -EINVAL;
@@ -3315,6 +3335,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 		po->prot_hook.func = packet_rcv_spkt;
 
 	po->prot_hook.af_packet_priv = sk;
+#ifdef CONFIG_XDP_SOCKETS
+	po->xsk_pt.pt = &po->prot_hook;
+#endif
 
 	if (proto) {
 		po->prot_hook.type = proto;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 48af35b1aed2..d224b926588a 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -3,6 +3,7 @@
 #define __PACKET_INTERNAL_H__
 
 #include <linux/refcount.h>
+#include <net/xdp_sock.h>
 
 struct packet_mclist {
 	struct packet_mclist	*next;
@@ -94,6 +95,9 @@ struct packet_fanout {
 	spinlock_t		lock;
 	refcount_t		sk_ref;
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xsk_packet	xsk_pt;
+#endif
 	struct sock	__rcu	*arr[];
 };
 
@@ -136,6 +140,9 @@ struct packet_sock {
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xsk_packet	xsk_pt;
+#endif
 	atomic_t		tp_drops ____cacheline_aligned_in_smp;
 };
 
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index 30cdc4315f42..bcac0591879b 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o xsk_packet.o
 obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
 obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cd62d4ba87a9..fc97e7f9e4cb 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -28,6 +28,7 @@
 
 #include "xsk_queue.h"
 #include "xdp_umem.h"
+#include "xsk_packet.h"
 #include "xsk.h"
 
 #define TX_BATCH_SIZE 32
@@ -156,6 +157,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 	int err;
 
 	addr = xp_get_handle(xskb);
+	xsk_rx_packet_deliver(xs, addr, len);
 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
 	if (err) {
 		xs->rx_queue_full++;
@@ -347,6 +349,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
 			goto out;
 
+		xsk_tx_zc_packet_deliver(xs, desc);
+
 		xskq_cons_release(xs->tx);
 		rcu_read_unlock();
 		return true;
@@ -576,6 +580,8 @@ static int xsk_generic_xmit(struct sock *sk)
 		}
 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 
+		xsk_tx_packet_deliver(xs, &desc, skb);
+
 		err = __dev_direct_xmit(skb, xs->queue_id);
 		if  (err == NETDEV_TX_BUSY) {
 			/* Tell user-space to retry the send */
@@ -1467,6 +1473,9 @@ static int __init xsk_init(void)
 
 	for_each_possible_cpu(cpu)
 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
+
+	INIT_LIST_HEAD(&xsk_pt);
+
 	return 0;
 
 out_pernet:
diff --git a/net/xdp/xsk_packet.c b/net/xdp/xsk_packet.c
new file mode 100644
index 000000000000..41005f214d6d
--- /dev/null
+++ b/net/xdp/xsk_packet.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets packet api
+ *
+ * Author: Xuan Zhuo <xuanzhuo.dxf@linux.alibaba.com>
+ */
+
+#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
+#include "xsk.h"
+#include "xsk_packet.h"
+
+struct list_head xsk_pt __read_mostly;
+static DEFINE_SPINLOCK(pt_lock);
+
+/* Copy the frame described by @desc into a freshly allocated skb. */
+static struct sk_buff *xsk_pt_alloc_skb(struct xdp_sock *xs,
+					struct xdp_desc *desc)
+{
+	unsigned int len = desc->len;
+	struct sk_buff *copy;
+	void *frame;
+
+	copy = alloc_skb(len, GFP_ATOMIC);
+	if (!copy)
+		return NULL;
+
+	skb_put(copy, len);
+	frame = xsk_buff_raw_get_data(xs->pool, desc->addr);
+	/* Drop the half-built skb if the copy fails. */
+	if (unlikely(skb_store_bits(copy, 0, frame, len))) {
+		kfree_skb(copy);
+		return NULL;
+	}
+
+	return copy;
+}
+
+static struct sk_buff *xsk_pt_get_skb(struct xdp_sock *xs,
+				      struct xdp_desc *desc,
+				      struct sk_buff *skb,
+				      bool rx)
+{
+	struct net_device *dev = xs->dev;
+	struct sk_buff *tap;
+
+	/* AF_PACKET may keep the skb for a long time, so it must not pin an
+	 * xsk buffer that xsk wants to release or reuse: clone only when the
+	 * caller supplied an skb and IFF_TX_SKB_NO_LINEAR is clear, else copy.
+	 */
+	if (skb && !(dev->priv_flags & IFF_TX_SKB_NO_LINEAR))
+		tap = skb_clone(skb, GFP_ATOMIC);
+	else
+		tap = xsk_pt_alloc_skb(xs, desc);
+
+	if (!tap)
+		return NULL;
+
+	tap->protocol = eth_type_trans(tap, dev);
+	skb_reset_network_header(tap);
+	tap->transport_header = tap->network_header;
+	__net_timestamp(tap);
+	if (!rx)
+		tap->pkt_type = PACKET_OUTGOING;
+
+	return tap;
+}
+
+/* Deliver a private copy of one xsk frame to each interested tap. */
+void __xsk_pt_deliver(struct xdp_sock *xs, struct sk_buff *skb,
+		      struct xdp_desc *desc, bool rx)
+{
+	struct packet_type *pt_prev = NULL;
+	struct packet_type *ptype;
+	struct xsk_packet *xpt;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xpt, &xsk_pt, list) {
+		ptype = xpt->pt;
+		/* Skip taps bound to another device or ignoring TX frames. */
+		if ((ptype->dev && ptype->dev != xs->dev) ||
+		    (!rx && ptype->ignore_outgoing))
+			continue;
+
+		if (!pt_prev) {
+			/* First taker: build the copy only now. */
+			skb = xsk_pt_get_skb(xs, desc, skb, rx);
+			if (unlikely(!skb))
+				break;
+		} else {
+			/* Take an extra ref for the previous tap's handler. */
+			refcount_inc(&skb->users);
+			pt_prev->func(skb, skb->dev, pt_prev, skb->dev);
+		}
+		pt_prev = ptype;
+	}
+
+	if (pt_prev)
+		pt_prev->func(skb, skb->dev, pt_prev, skb->dev);
+
+	rcu_read_unlock();
+}
+
+/* Put @xpt on the xsk tap list; hooks other than ETH_P_ALL are ignored. */
+void xsk_add_pack(struct xsk_packet *xpt)
+{
+	if (xpt->pt->type == htons(ETH_P_ALL)) {
+		spin_lock(&pt_lock);
+		list_add_rcu(&xpt->list, &xsk_pt);
+		spin_unlock(&pt_lock);
+	}
+}
+
+void __xsk_remove_pack(struct xsk_packet *xpt)
+{
+	struct xsk_packet *pos;
+
+	spin_lock(&pt_lock);
+	/* xsk_add_pack() skips hooks that are not ETH_P_ALL, so not finding
+	 * @xpt here is the normal case for such hooks and is not reported.
+	 */
+	list_for_each_entry(pos, &xsk_pt, list) {
+		if (pos == xpt) {
+			list_del_rcu(&pos->list);
+			break;
+		}
+	}
+
+	spin_unlock(&pt_lock);
+}
diff --git a/net/xdp/xsk_packet.h b/net/xdp/xsk_packet.h
new file mode 100644
index 000000000000..55d30fa8828b
--- /dev/null
+++ b/net/xdp/xsk_packet.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __XSK_PACKET_H__
+#define __XSK_PACKET_H__
+extern struct list_head xsk_pt __read_mostly;
+
+void __xsk_pt_deliver(struct xdp_sock *xs, struct sk_buff *skb,
+		      struct xdp_desc *desc, bool rx);
+
+static inline void xsk_tx_packet_deliver(struct xdp_sock *xs,
+					 struct xdp_desc *desc,
+					 struct sk_buff *skb)
+{
+	/* Only pay for delivery when at least one tap is registered. */
+	if (unlikely(!list_empty(&xsk_pt))) {
+		local_bh_disable();
+		__xsk_pt_deliver(xs, skb, desc, false);
+		local_bh_enable();
+	}
+}
+
+static inline void xsk_tx_zc_packet_deliver(struct xdp_sock *xs,
+					    struct xdp_desc *desc)
+{
+	/* No skb on this path: NULL makes the helper copy the frame at @desc. */
+	if (unlikely(!list_empty(&xsk_pt)))
+		__xsk_pt_deliver(xs, NULL, desc, false);
+
+}
+
+static inline void xsk_rx_packet_deliver(struct xdp_sock *xs, u64 addr, u32 len)
+{
+	/* Wrap addr/len in a descriptor so the shared helper can be used. */
+	if (unlikely(!list_empty(&xsk_pt))) {
+		struct xdp_desc desc;
+
+		desc.addr = addr;
+		desc.len = len;
+
+		__xsk_pt_deliver(xs, NULL, &desc, true);
+	}
+}
+
+#endif /* __XSK_PACKET_H__ */
-- 
2.31.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2021-05-28 12:37 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-28  6:08 [PATCH bpf-next] xsk: support AF_PACKET Xuan Zhuo
2021-05-28  8:27 ` kernel test robot
2021-05-28  8:27   ` kernel test robot
2021-05-28  8:34 ` kernel test robot
2021-05-28  8:34   ` kernel test robot
2021-05-28  8:55 ` Toke Høiland-Jørgensen
     [not found]   ` <1622192521.5931044-1-xuanzhuo@linux.alibaba.com>
2021-05-28  9:25     ` Toke Høiland-Jørgensen
2021-05-28  9:32       ` Maciej Fijalkowski
2021-05-28  9:50     ` Jesper Dangaard Brouer
2021-05-28 10:00       ` Magnus Karlsson
2021-05-28 10:22         ` Daniel Borkmann
2021-05-28 10:54           ` Toke Høiland-Jørgensen
2021-05-28 11:29             ` Daniel Borkmann
2021-05-28 12:35               ` Toke Høiland-Jørgensen
2021-05-28 12:23           ` Jesper Dangaard Brouer

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.