* [bpf-next RFCv2 0/3] AF_XDP support for veth.
@ 2018-12-19  1:04 William Tu
  2018-12-19  1:04 ` [bpf-next RFCv2 1/3] xsk: add xsk_umem_consume_tx_virtual William Tu
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: William Tu @ 2018-12-19  1:04 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, ast, daniel, netdev,
	makita.toshiaki, yihung.wei

The patch series adds AF_XDP async xmit support for veth devices.
The first patch adds a new API that lets a non-physical NIC device get
a packet's virtual address.  The second patch implements the async
xmit, and the last patch adds example use cases.

I tested with two namespaces, one as the sender and the other as the
receiver.  The packet rate is measured at the receiver side.
  ip netns add at_ns0
  ip link add p0 type veth peer name p1
  ip link set p0 netns at_ns0
  ip link set dev p1 up
  ip netns exec at_ns0 ip link set dev p0 up
  
  # receiver
  ip netns exec at_ns0 xdp_rxq_info --dev p0 --action XDP_DROP
  
  # sender with AF_XDP
  xdpsock -i p1 -t -N -z
  
  # or sender without AF_XDP
  xdpsock -i p1 -t -S

Without AF_XDP: 724 Kpps
RXQ stats       RXQ:CPU pps         issue-pps  
rx_queue_index    0:1   724339      0          
rx_queue_index    0:sum 724339     

With AF_XDP: 1.1 Mpps (with ksoftirqd 100% cpu)
RXQ stats       RXQ:CPU pps         issue-pps  
rx_queue_index    0:3   1188181     0          
rx_queue_index    0:sum 1188181    

v1->v2:
- refactor xsk_umem_consume_tx_virtual
- use the umems provided by the netdev
- fix a bug when locating the peer side rq by qid

William Tu (3):
  xsk: add xsk_umem_consume_tx_virtual.
  veth: support AF_XDP.
  samples: bpf: add veth AF_XDP example.

 drivers/net/veth.c             | 199 ++++++++++++++++++++++++++++++++++++++++-
 include/net/xdp_sock.h         |   7 ++
 net/xdp/xdp_umem.c             |   1 +
 net/xdp/xsk.c                  |  21 ++++-
 samples/bpf/test_veth_afxdp.sh |  67 ++++++++++++++
 5 files changed, 291 insertions(+), 4 deletions(-)
 create mode 100755 samples/bpf/test_veth_afxdp.sh

-- 
2.7.4


* [bpf-next RFCv2 1/3] xsk: add xsk_umem_consume_tx_virtual.
  2018-12-19  1:04 [bpf-next RFCv2 0/3] AF_XDP support for veth William Tu
@ 2018-12-19  1:04 ` William Tu
  2018-12-20 19:48   ` Björn Töpel
  2018-12-19  1:04 ` [bpf-next RFCv2 2/3] veth: support AF_XDP William Tu
  2018-12-19  1:04 ` [bpf-next RFCv2 3/3] samples: bpf: add veth AF_XDP example William Tu
  2 siblings, 1 reply; 8+ messages in thread
From: William Tu @ 2018-12-19  1:04 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, ast, daniel, netdev,
	makita.toshiaki, yihung.wei

Currently xsk_umem_consume_tx expects only physical NICs, so the API
returns a DMA address.  This patch introduces a new function that
returns the virtual address instead, for when XSK is used by a virtual
device.
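
As a rough illustration (not part of this patch), a virtual driver's
send path could use the helper roughly like this, assuming "umem" has
already been set up for the queue and deliver_frame() is a hypothetical
placeholder for handing the data to the peer:

	void *vaddr;
	u32 len;

	while (xsk_umem_consume_tx_virtual(umem, &vaddr, &len)) {
		/* vaddr points into the umem; copy the payload out or
		 * build a frame from it before completing the descriptor.
		 */
		deliver_frame(vaddr, len);	/* hypothetical */
		xsk_umem_complete_tx(umem, 1);
	}
	xsk_umem_consume_tx_done(umem);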

Signed-off-by: William Tu <u9012063@gmail.com>
---
 include/net/xdp_sock.h |  7 +++++++
 net/xdp/xdp_umem.c     |  1 +
 net/xdp/xsk.c          | 21 +++++++++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 13acb9803a6d..7fefe74f7fb5 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -81,6 +81,7 @@ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
 void xsk_umem_discard_addr(struct xdp_umem *umem);
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
 bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
+bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **addr, u32 *len);
 void xsk_umem_consume_tx_done(struct xdp_umem *umem);
 struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
 struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
@@ -165,6 +166,12 @@ static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
 	return false;
 }
 
+static inline bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem,
+					       void **vaddr, u32 *len)
+{
+	return false;
+}
+
 static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
 {
 }
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index a264cf2accd0..424ae2538f9f 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -60,6 +60,7 @@ struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
 
 	return NULL;
 }
+EXPORT_SYMBOL(xdp_get_umem_from_qid);
 
 static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
 {
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 07156f43d295..a6ca0c0f0330 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -170,7 +170,8 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_consume_tx_done);
 
-bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+static bool __xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
+			     void **vaddr, u32 *len)
 {
 	struct xdp_desc desc;
 	struct xdp_sock *xs;
@@ -183,7 +184,12 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
 		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
 			goto out;
 
-		*dma = xdp_umem_get_dma(umem, desc.addr);
+		if (dma)
+			*dma = xdp_umem_get_dma(umem, desc.addr);
+
+		if (vaddr)
+			*vaddr = xdp_umem_get_data(umem, desc.addr);
+
 		*len = desc.len;
 
 		xskq_discard_desc(xs->tx);
@@ -195,8 +201,19 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
 	rcu_read_unlock();
 	return false;
 }
+
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+{
+	return __xsk_umem_consume_tx(umem, dma, NULL, len);
+}
 EXPORT_SYMBOL(xsk_umem_consume_tx);
 
+bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **addr, u32 *len)
+{
+	return __xsk_umem_consume_tx(umem, NULL, addr, len);
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx_virtual);
+
 static int xsk_zc_xmit(struct sock *sk)
 {
 	struct xdp_sock *xs = xdp_sk(sk);
-- 
2.7.4


* [bpf-next RFCv2 2/3] veth: support AF_XDP.
  2018-12-19  1:04 [bpf-next RFCv2 0/3] AF_XDP support for veth William Tu
  2018-12-19  1:04 ` [bpf-next RFCv2 1/3] xsk: add xsk_umem_consume_tx_virtual William Tu
@ 2018-12-19  1:04 ` William Tu
  2018-12-21  8:38   ` Björn Töpel
  2018-12-19  1:04 ` [bpf-next RFCv2 3/3] samples: bpf: add veth AF_XDP example William Tu
  2 siblings, 1 reply; 8+ messages in thread
From: William Tu @ 2018-12-19  1:04 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, ast, daniel, netdev,
	makita.toshiaki, yihung.wei

The patch adds support for AF_XDP async xmit.  Users can use AF_XDP
on both sides of the veth pair and get better performance, at the cost
of ksoftirqd doing the xmit.  veth_xsk_async_xmit simply kicks the
napi handler, veth_poll, to run; the transmit logic is implemented
there.
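
For context, the kick comes from the normal AF_XDP send path in
userspace; a minimal sketch (error handling omitted, xsk_fd is an
already-bound AF_XDP socket with zero-copy requested):

	#include <sys/socket.h>

	/* After filling the Tx ring, tell the kernel to transmit; for a
	 * zero-copy socket this reaches ndo_xsk_async_xmit, i.e.
	 * veth_xsk_async_xmit here, which schedules the napi poll.
	 */
	sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);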

Tested using two namespaces, one running xdpsock and the other running
xdp_rxq_info.  A simple script comparing the performance with and
without AF_XDP shows an improvement from 724 Kpps to 1.1 Mpps.

  ip netns add at_ns0
  ip link add p0 type veth peer name p1
  ip link set p0 netns at_ns0
  ip link set dev p1 up
  ip netns exec at_ns0 ip link set dev p0 up

  # receiver
  ip netns exec at_ns0 xdp_rxq_info --dev p0 --action XDP_DROP

  # sender
  xdpsock -i p1 -t -N -z
  or
  xdpsock -i p1 -t -S

Signed-off-by: William Tu <u9012063@gmail.com>
---
 drivers/net/veth.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 197 insertions(+), 2 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f412ea1cef18..0ce89820ce70 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -25,6 +25,10 @@
 #include <linux/ptr_ring.h>
 #include <linux/bpf_trace.h>
 #include <linux/net_tstamp.h>
+#include <net/xdp_sock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <net/page_pool.h>
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
@@ -53,6 +57,8 @@ struct veth_rq {
 	bool			rx_notify_masked;
 	struct ptr_ring		xdp_ring;
 	struct xdp_rxq_info	xdp_rxq;
+	struct xdp_umem *xsk_umem;
+	u16 qid;
 };
 
 struct veth_priv {
@@ -737,15 +743,108 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
 	return done;
 }
 
+static int veth_xsk_send(struct napi_struct *napi, int budget)
+{
+	struct veth_rq *rq =
+		container_of(napi, struct veth_rq, xdp_napi);
+	int done = 0;
+
+	/* tx: use netif_tx_napi_add? */
+	while (rq->xsk_umem && budget--) {
+		struct veth_priv *priv, *peer_priv;
+		struct net_device *dev, *peer_dev;
+		unsigned int inner_xdp_xmit = 0;
+		unsigned int metasize = 0;
+		struct veth_rq *peer_rq;
+		struct xdp_frame *xdpf;
+		bool dropped = false;
+		struct sk_buff *skb;
+		struct page *page;
+		void *vaddr;
+		void *addr;
+		u32 len;
+
+		if (!xsk_umem_consume_tx_virtual(rq->xsk_umem, &vaddr, &len))
+			break;
+
+		page = dev_alloc_page();
+		if (!page) {
+			pr_warn("veth: page allocation fails\n");
+			xsk_umem_complete_tx(rq->xsk_umem, 1);
+			xsk_umem_consume_tx_done(rq->xsk_umem);
+			return -ENOMEM;
+		}
+
+		addr = page_to_virt(page);
+		xdpf = addr;
+		memset(xdpf, 0, sizeof(*xdpf));
+
+		addr += sizeof(*xdpf);
+		memcpy(addr, vaddr, len);
+
+		xdpf->data = addr + metasize;
+		xdpf->len = len;
+		xdpf->headroom = 0;
+		xdpf->metasize = metasize;
+		xdpf->mem.type = MEM_TYPE_PAGE_SHARED;
+
+		/* Invoke peer rq to rcv */
+		dev = rq->dev;
+		priv = netdev_priv(dev);
+		peer_dev = priv->peer;
+		peer_priv = netdev_priv(peer_dev);
+		peer_rq = &peer_priv->rq[rq->qid];
+
+		/* put into peer rq */
+		skb = veth_xdp_rcv_one(peer_rq, xdpf, &inner_xdp_xmit);
+		if (!skb) {
+			/* Peer side has XDP program attached */
+			if (inner_xdp_xmit & VETH_XDP_TX) {
+				/* Not supported */
+				pr_warn("veth: peer XDP_TX not supported\n");
+				xdp_return_frame(xdpf);
+				dropped = true;
+				goto skip_tx;
+			} else if (inner_xdp_xmit & VETH_XDP_REDIR) {
+				xdp_do_flush_map();
+			} else {
+				dropped = true;
+			}
+		} else {
+			/* Peer side has no XDP attached */
+			napi_gro_receive(&peer_rq->xdp_napi, skb);
+		}
+skip_tx:
+		xsk_umem_complete_tx(rq->xsk_umem, 1);
+		xsk_umem_consume_tx_done(rq->xsk_umem);
+
+		/* update peer stats */
+		u64_stats_update_begin(&peer_rq->stats.syncp);
+		peer_rq->stats.xdp_packets++;
+		peer_rq->stats.xdp_bytes += len;
+		if (dropped)
+			rq->stats.xdp_drops++;
+		u64_stats_update_end(&peer_rq->stats.syncp);
+		done++;
+	}
+
+	return done;
+}
+
 static int veth_poll(struct napi_struct *napi, int budget)
 {
 	struct veth_rq *rq =
 		container_of(napi, struct veth_rq, xdp_napi);
 	unsigned int xdp_xmit = 0;
-	int done;
+	int done = 0;
+	int tx_done;
+
+	tx_done = veth_xsk_send(napi, budget);
+	if (tx_done > 0)
+		done += tx_done;
 
 	xdp_set_return_frame_no_direct();
-	done = veth_xdp_rcv(rq, budget, &xdp_xmit);
+	done += veth_xdp_rcv(rq, budget, &xdp_xmit);
 
 	if (done < budget && napi_complete_done(napi, done)) {
 		/* Write rx_notify_masked before reading ptr_ring */
@@ -776,6 +875,7 @@ static int veth_napi_add(struct net_device *dev)
 		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
 		if (err)
 			goto err_xdp_ring;
+		rq->qid = i;
 	}
 
 	for (i = 0; i < dev->real_num_rx_queues; i++) {
@@ -812,6 +912,7 @@ static void veth_napi_del(struct net_device *dev)
 		netif_napi_del(&rq->xdp_napi);
 		rq->rx_notify_masked = false;
 		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
+		rq->qid = -1;
 	}
 }
 
@@ -836,6 +937,7 @@ static int veth_enable_xdp(struct net_device *dev)
 
 			/* Save original mem info as it can be overwritten */
 			rq->xdp_mem = rq->xdp_rxq.mem;
+			rq->qid = i;
 		}
 
 		err = veth_napi_add(dev);
@@ -1115,6 +1217,78 @@ static u32 veth_xdp_query(struct net_device *dev)
 	return 0;
 }
 
+int veth_xsk_umem_query(struct net_device *dev, struct xdp_umem **umem,
+			u16 qid)
+{
+	struct xdp_umem *queried_umem;
+
+	queried_umem = xdp_get_umem_from_qid(dev, qid);
+
+	if (!queried_umem)
+		return -EINVAL;
+
+	*umem = queried_umem;
+	return 0;
+}
+
+static int veth_xsk_umem_enable(struct net_device *dev,
+				struct xdp_umem *umem,
+				u16 qid)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct xdp_umem_fq_reuse *reuseq;
+	int err = 0;
+
+	if (qid >= dev->real_num_rx_queues)
+		return -EINVAL;
+
+	reuseq = xsk_reuseq_prepare(priv->rq[0].xdp_ring.size);
+	if (!reuseq)
+		return -ENOMEM;
+
+	xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
+
+	priv->rq[qid].xsk_umem = umem;
+	return err;
+}
+
+static int veth_xsk_umem_disable(struct net_device *dev,
+				 u16 qid)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct xdp_umem *umem;
+
+	umem = xdp_get_umem_from_qid(dev, qid);
+	if (!umem)
+		return -EINVAL;
+
+	priv->rq[qid].xsk_umem = NULL;
+	return 0;
+}
+
+int veth_xsk_umem_setup(struct net_device *dev, struct xdp_umem *umem,
+			u16 qid)
+{
+	return umem ? veth_xsk_umem_enable(dev, umem, qid) :
+		      veth_xsk_umem_disable(dev, qid);
+}
+
+int veth_xsk_async_xmit(struct net_device *dev, u32 qid)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct veth_rq *rq;
+
+	rq = &priv->rq[qid];
+
+	if (qid >= dev->real_num_rx_queues)
+		return -ENXIO;
+
+	if (!napi_if_scheduled_mark_missed(&rq->xdp_napi))
+		napi_schedule(&rq->xdp_napi);
+
+	return 0;
+}
+
 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	switch (xdp->command) {
@@ -1123,6 +1297,26 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	case XDP_QUERY_PROG:
 		xdp->prog_id = veth_xdp_query(dev);
 		return 0;
+	case XDP_QUERY_XSK_UMEM:
+		return veth_xsk_umem_query(dev, &xdp->xsk.umem,
+					   xdp->xsk.queue_id);
+	case XDP_SETUP_XSK_UMEM: {
+		struct veth_priv *priv;
+		int err;
+
+		/* Enable XDP on both sides */
+		err = veth_enable_xdp(dev);
+		if (err)
+			return err;
+
+		priv = netdev_priv(dev);
+		err = veth_enable_xdp(priv->peer);
+		if (err)
+			return err;
+
+		return veth_xsk_umem_setup(dev, xdp->xsk.umem,
+					   xdp->xsk.queue_id);
+	}
 	default:
 		return -EINVAL;
 	}
@@ -1145,6 +1339,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
 	.ndo_bpf		= veth_xdp,
 	.ndo_xdp_xmit		= veth_xdp_xmit,
+	.ndo_xsk_async_xmit	= veth_xsk_async_xmit,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
-- 
2.7.4


* [bpf-next RFCv2 3/3] samples: bpf: add veth AF_XDP example.
  2018-12-19  1:04 [bpf-next RFCv2 0/3] AF_XDP support for veth William Tu
  2018-12-19  1:04 ` [bpf-next RFCv2 1/3] xsk: add xsk_umem_consume_tx_virtual William Tu
  2018-12-19  1:04 ` [bpf-next RFCv2 2/3] veth: support AF_XDP William Tu
@ 2018-12-19  1:04 ` William Tu
  2 siblings, 0 replies; 8+ messages in thread
From: William Tu @ 2018-12-19  1:04 UTC (permalink / raw)
  To: bjorn.topel, magnus.karlsson, ast, daniel, netdev,
	makita.toshiaki, yihung.wei

Add example use cases for an AF_XDP socket across two namespaces.
The script runs the sender in the root namespace and the receiver in
the at_ns0 namespace, with different XDP actions.
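
As a usage note (assuming the bpf samples, including xdpsock and
xdp_rxq_info, have already been built), the script is meant to be run
from the samples/bpf directory:

  cd samples/bpf
  sudo ./test_veth_afxdp.sh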

Signed-off-by: William Tu <u9012063@gmail.com>
---
 samples/bpf/test_veth_afxdp.sh | 67 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100755 samples/bpf/test_veth_afxdp.sh

diff --git a/samples/bpf/test_veth_afxdp.sh b/samples/bpf/test_veth_afxdp.sh
new file mode 100755
index 000000000000..b65311cf15f7
--- /dev/null
+++ b/samples/bpf/test_veth_afxdp.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# The script runs the sender at the root namespace, and
+# the receiver at the namespace at_ns0, with different mode
+#  1. XDP_DROP
+#  2. XDP_TX
+#  3. XDP_PASS
+#  4. XDP_REDIRECT
+#  5. Generic XDP
+
+XDPSOCK=./xdpsock
+XDP_RXQ_INFO=./xdp_rxq_info
+
+ip netns add at_ns0
+ip link add p0 type veth peer name p1
+ip link set p0 netns at_ns0
+ip link set dev p1 up
+ip netns exec at_ns0 ip link set dev p0 up
+
+# send at root namespace
+test_xdp_drop()
+{
+	echo "[Peer] XDP_DROP"
+	ip netns exec at_ns0 $XDP_RXQ_INFO --dev p0 --action XDP_DROP &
+	$XDPSOCK -i p1 -t -N -z &> /tmp/t &> /dev/null &
+}
+
+test_xdp_pass()
+{
+	echo "[Peer] XDP_PASS"
+	ip netns exec at_ns0 $XDP_RXQ_INFO --dev p0 --action XDP_PASS &
+	$XDPSOCK -i p1 -t -N -z &> /tmp/t &> /dev/null &
+}
+
+test_xdp_tx()
+{
+	echo "[Peer] XDP_TX"
+	ip netns exec at_ns0 $XDP_RXQ_INFO --dev p0 --action XDP_TX &
+	$XDPSOCK -i p1 -t -N -z &> /tmp/t &> /dev/null &
+}
+
+test_generic_xdp()
+{
+	echo "[Peer] Generic XDP"
+	ip netns exec at_ns0 $XDPSOCK -i p0 -r -S &
+	$XDPSOCK -i p1 -t -N -z &> /tmp/t &> /dev/null &
+}
+
+test_xdp_redirect()
+{
+	echo "[Peer] XDP_REDIRECT"
+	ip netns exec at_ns0 $XDPSOCK -i p0 -r -N &
+	$XDPSOCK -i p1 -t -N -z &> /tmp/t &> /dev/null &
+}
+
+cleanup() {
+    killall xdpsock
+    killall xdp_rxq_info
+    ip netns del at_ns0
+    ip link del p1
+}
+
+trap cleanup 0 3 6
+
+test_xdp_drop
+cleanup
-- 
2.7.4


* Re: [bpf-next RFCv2 1/3] xsk: add xsk_umem_consume_tx_virtual.
  2018-12-19  1:04 ` [bpf-next RFCv2 1/3] xsk: add xsk_umem_consume_tx_virtual William Tu
@ 2018-12-20 19:48   ` Björn Töpel
  2018-12-21 20:02     ` William Tu
  0 siblings, 1 reply; 8+ messages in thread
From: Björn Töpel @ 2018-12-20 19:48 UTC (permalink / raw)
  To: William Tu
  Cc: Magnus Karlsson, ast, Daniel Borkmann, Netdev, makita.toshiaki,
	yihung.wei, Karlsson, Magnus

On Wed, 19 Dec 2018 at 01:55, William Tu <u9012063@gmail.com> wrote:
>
> Currently the xsk_umem_consume_tx expects only the physical NICs so
> the api returns a dma address.  This patch introduce the new function
> to return the virtual address, when XSK is used by a virtual device.
>
> Signed-off-by: William Tu <u9012063@gmail.com>
> ---
>  include/net/xdp_sock.h |  7 +++++++
>  net/xdp/xdp_umem.c     |  1 +
>  net/xdp/xsk.c          | 21 +++++++++++++++++++--
>  3 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 13acb9803a6d..7fefe74f7fb5 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -81,6 +81,7 @@ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
>  void xsk_umem_discard_addr(struct xdp_umem *umem);
>  void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
>  bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
> +bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **addr, u32 *len);
>  void xsk_umem_consume_tx_done(struct xdp_umem *umem);
>  struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
>  struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
> @@ -165,6 +166,12 @@ static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
>         return false;
>  }
>
> +static inline bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem,
> +                                              void **vaddr, u32 *len)
> +{
> +       return false;
> +}
> +
>  static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
>  {
>  }
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index a264cf2accd0..424ae2538f9f 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -60,6 +60,7 @@ struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
>
>         return NULL;
>  }
> +EXPORT_SYMBOL(xdp_get_umem_from_qid);
>
>  static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
>  {
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 07156f43d295..a6ca0c0f0330 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -170,7 +170,8 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
>  }
>  EXPORT_SYMBOL(xsk_umem_consume_tx_done);
>
> -bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> +static bool __xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
> +                            void **vaddr, u32 *len)
>  {
>         struct xdp_desc desc;
>         struct xdp_sock *xs;
> @@ -183,7 +184,12 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
>                 if (xskq_produce_addr_lazy(umem->cq, desc.addr))
>                         goto out;
>
> -               *dma = xdp_umem_get_dma(umem, desc.addr);
> +               if (dma)
> +                       *dma = xdp_umem_get_dma(umem, desc.addr);
> +
> +               if (vaddr)
> +                       *vaddr = xdp_umem_get_data(umem, desc.addr);
> +

This function is in the fast path, so I'm reluctant to introduce the
branching above. What do you think about something like this instead?

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 80ca48cefc42..458f0977b437 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -170,22 +170,19 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_consume_tx_done);

-bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+static __always_inline bool __xsk_umem_consume_tx(struct xdp_umem *umem,
+                          struct xdp_desc *desc)
 {
-    struct xdp_desc desc;
     struct xdp_sock *xs;

     rcu_read_lock();
     list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
-        if (!xskq_peek_desc(xs->tx, &desc))
+        if (!xskq_peek_desc(xs->tx, desc))
             continue;

-        if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+        if (xskq_produce_addr_lazy(umem->cq, desc->addr))
             goto out;

-        *dma = xdp_umem_get_dma(umem, desc.addr);
-        *len = desc.len;
-
         xskq_discard_desc(xs->tx);
         rcu_read_unlock();
         return true;
@@ -195,8 +192,35 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
     rcu_read_unlock();
     return false;
 }
+
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+{
+    struct xdp_desc desc;
+
+    if (!__xsk_umem_consume_tx(umem, &desc))
+        return false;
+
+    *dma = xdp_umem_get_dma(umem, desc.addr);
+    *len = desc.len;
+
+    return true;
+}
 EXPORT_SYMBOL(xsk_umem_consume_tx);

+bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **vaddr, u32 *len)
+{
+    struct xdp_desc desc;
+
+    if (!__xsk_umem_consume_tx(umem, &desc))
+        return false;
+
+    *vaddr = xdp_umem_get_data(umem, desc.addr);
+    *len = desc.len;
+
+    return true;
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx_virtual);
+
 static int xsk_zc_xmit(struct sock *sk)
 {
     struct xdp_sock *xs = xdp_sk(sk);


Björn



>                 *len = desc.len;
>
>                 xskq_discard_desc(xs->tx);
> @@ -195,8 +201,19 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
>         rcu_read_unlock();
>         return false;
>  }
> +
> +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> +{
> +       return __xsk_umem_consume_tx(umem, dma, NULL, len);
> +}
>  EXPORT_SYMBOL(xsk_umem_consume_tx);
>
> +bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **addr, u32 *len)
> +{
> +       return __xsk_umem_consume_tx(umem, NULL, addr, len);
> +}
> +EXPORT_SYMBOL(xsk_umem_consume_tx_virtual);
> +
>  static int xsk_zc_xmit(struct sock *sk)
>  {
>         struct xdp_sock *xs = xdp_sk(sk);
> --
> 2.7.4
>


* Re: [bpf-next RFCv2 2/3] veth: support AF_XDP.
  2018-12-19  1:04 ` [bpf-next RFCv2 2/3] veth: support AF_XDP William Tu
@ 2018-12-21  8:38   ` Björn Töpel
  2018-12-22  0:04     ` William Tu
  0 siblings, 1 reply; 8+ messages in thread
From: Björn Töpel @ 2018-12-21  8:38 UTC (permalink / raw)
  To: William Tu
  Cc: Magnus Karlsson, ast, Daniel Borkmann, Netdev, makita.toshiaki,
	yihung.wei, Karlsson, Magnus

On Wed, 19 Dec 2018 at 01:55, William Tu <u9012063@gmail.com> wrote:
>
> The patch adds support for AF_XDP async xmit.  Users can use
> AF_XDP on both side of the veth and get better performance, with
> the cost of ksoftirqd doing the xmit.  The veth_xsk_async_xmit
> simply kicks the napi function, veth_poll, to run, and the transmit
> logic is implemented there.
>
> Tested using two namespaces, one runs xdpsock and the other runs
> xdp_rxq_info.  A simple script comparing the performance with/without
> AF_XDP shows improvement from 724Kpps to 1.1Mpps.
>
>   ip netns add at_ns0
>   ip link add p0 type veth peer name p1
>   ip link set p0 netns at_ns0
>   ip link set dev p1 up
>   ip netns exec at_ns0 ip link set dev p0 up
>
>   # receiver
>   ip netns exec at_ns0 xdp_rxq_info --dev p0 --action XDP_DROP
>
>   # sender
>   xdpsock -i p1 -t -N -z
>   or
>   xdpsock -i p1 -t -S
>
> Signed-off-by: William Tu <u9012063@gmail.com>
> ---
>  drivers/net/veth.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 197 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index f412ea1cef18..0ce89820ce70 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -25,6 +25,10 @@
>  #include <linux/ptr_ring.h>
>  #include <linux/bpf_trace.h>
>  #include <linux/net_tstamp.h>
> +#include <net/xdp_sock.h>
> +#include <linux/mm.h>
> +#include <linux/slab.h>
> +#include <net/page_pool.h>
>
>  #define DRV_NAME       "veth"
>  #define DRV_VERSION    "1.0"
> @@ -53,6 +57,8 @@ struct veth_rq {
>         bool                    rx_notify_masked;
>         struct ptr_ring         xdp_ring;
>         struct xdp_rxq_info     xdp_rxq;
> +       struct xdp_umem *xsk_umem;
> +       u16 qid;
>  };
>
>  struct veth_priv {
> @@ -737,15 +743,108 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
>         return done;
>  }
>
> +static int veth_xsk_send(struct napi_struct *napi, int budget)
> +{
> +       struct veth_rq *rq =
> +               container_of(napi, struct veth_rq, xdp_napi);
> +       int done = 0;
> +
> +       /* tx: use netif_tx_napi_add? */
> +       while (rq->xsk_umem && budget--) {
> +               struct veth_priv *priv, *peer_priv;
> +               struct net_device *dev, *peer_dev;
> +               unsigned int inner_xdp_xmit = 0;
> +               unsigned int metasize = 0;
> +               struct veth_rq *peer_rq;
> +               struct xdp_frame *xdpf;
> +               bool dropped = false;
> +               struct sk_buff *skb;
> +               struct page *page;
> +               void *vaddr;
> +               void *addr;
> +               u32 len;
> +
> +               if (!xsk_umem_consume_tx_virtual(rq->xsk_umem, &vaddr, &len))
> +                       break;
> +
> +               page = dev_alloc_page();
> +               if (!page) {
> +                       pr_warn("veth: page allocation fails\n");
> +                       xsk_umem_complete_tx(rq->xsk_umem, 1);
> +                       xsk_umem_consume_tx_done(rq->xsk_umem);
> +                       return -ENOMEM;
> +               }
> +
> +               addr = page_to_virt(page);
> +               xdpf = addr;
> +               memset(xdpf, 0, sizeof(*xdpf));
> +
> +               addr += sizeof(*xdpf);
> +               memcpy(addr, vaddr, len);
> +
> +               xdpf->data = addr + metasize;
> +               xdpf->len = len;
> +               xdpf->headroom = 0;
> +               xdpf->metasize = metasize;
> +               xdpf->mem.type = MEM_TYPE_PAGE_SHARED;
> +
> +               /* Invoke peer rq to rcv */
> +               dev = rq->dev;
> +               priv = netdev_priv(dev);
> +               peer_dev = priv->peer;
> +               peer_priv = netdev_priv(peer_dev);
> +               peer_rq = &peer_priv->rq[rq->qid];
> +
> +               /* put into peer rq */
> +               skb = veth_xdp_rcv_one(peer_rq, xdpf, &inner_xdp_xmit);
> +               if (!skb) {
> +                       /* Peer side has XDP program attached */
> +                       if (inner_xdp_xmit & VETH_XDP_TX) {
> +                               /* Not supported */
> +                               pr_warn("veth: peer XDP_TX not supported\n");
> +                               xdp_return_frame(xdpf);
> +                               dropped = true;
> +                               goto skip_tx;
> +                       } else if (inner_xdp_xmit & VETH_XDP_REDIR) {
> +                               xdp_do_flush_map();
> +                       } else {
> +                               dropped = true;
> +                       }
> +               } else {
> +                       /* Peer side has no XDP attached */
> +                       napi_gro_receive(&peer_rq->xdp_napi, skb);
> +               }
> +skip_tx:
> +               xsk_umem_complete_tx(rq->xsk_umem, 1);
> +               xsk_umem_consume_tx_done(rq->xsk_umem);
> +
> +               /* update peer stats */
> +               u64_stats_update_begin(&peer_rq->stats.syncp);
> +               peer_rq->stats.xdp_packets++;
> +               peer_rq->stats.xdp_bytes += len;
> +               if (dropped)
> +                       rq->stats.xdp_drops++;
> +               u64_stats_update_end(&peer_rq->stats.syncp);
> +               done++;
> +       }
> +
> +       return done;
> +}
> +
>  static int veth_poll(struct napi_struct *napi, int budget)
>  {
>         struct veth_rq *rq =
>                 container_of(napi, struct veth_rq, xdp_napi);
>         unsigned int xdp_xmit = 0;
> -       int done;
> +       int done = 0;
> +       int tx_done;
> +
> +       tx_done = veth_xsk_send(napi, budget);
> +       if (tx_done > 0)
> +               done += tx_done;
>
>         xdp_set_return_frame_no_direct();
> -       done = veth_xdp_rcv(rq, budget, &xdp_xmit);
> +       done += veth_xdp_rcv(rq, budget, &xdp_xmit);
>
>         if (done < budget && napi_complete_done(napi, done)) {
>                 /* Write rx_notify_masked before reading ptr_ring */
> @@ -776,6 +875,7 @@ static int veth_napi_add(struct net_device *dev)
>                 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
>                 if (err)
>                         goto err_xdp_ring;
> +               rq->qid = i;
>         }
>
>         for (i = 0; i < dev->real_num_rx_queues; i++) {
> @@ -812,6 +912,7 @@ static void veth_napi_del(struct net_device *dev)
>                 netif_napi_del(&rq->xdp_napi);
>                 rq->rx_notify_masked = false;
>                 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
> +               rq->qid = -1;
>         }
>  }
>
> @@ -836,6 +937,7 @@ static int veth_enable_xdp(struct net_device *dev)
>
>                         /* Save original mem info as it can be overwritten */
>                         rq->xdp_mem = rq->xdp_rxq.mem;
> +                       rq->qid = i;
>                 }
>
>                 err = veth_napi_add(dev);
> @@ -1115,6 +1217,78 @@ static u32 veth_xdp_query(struct net_device *dev)
>         return 0;
>  }
>
> +int veth_xsk_umem_query(struct net_device *dev, struct xdp_umem **umem,
> +                       u16 qid)
> +{
> +       struct xdp_umem *queried_umem;
> +
> +       queried_umem = xdp_get_umem_from_qid(dev, qid);
> +
> +       if (!queried_umem)
> +               return -EINVAL;
> +
> +       *umem = queried_umem;
> +       return 0;
> +}
> +
> +static int veth_xsk_umem_enable(struct net_device *dev,
> +                               struct xdp_umem *umem,
> +                               u16 qid)
> +{
> +       struct veth_priv *priv = netdev_priv(dev);
> +       struct xdp_umem_fq_reuse *reuseq;
> +       int err = 0;
> +
> +       if (qid >= dev->real_num_rx_queues)
> +               return -EINVAL;
> +
> +       reuseq = xsk_reuseq_prepare(priv->rq[0].xdp_ring.size);
> +       if (!reuseq)
> +               return -ENOMEM;
> +
> +       xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
> +
> +       priv->rq[qid].xsk_umem = umem;
> +       return err;
> +}
> +
> +static int veth_xsk_umem_disable(struct net_device *dev,
> +                                u16 qid)
> +{
> +       struct veth_priv *priv = netdev_priv(dev);
> +       struct xdp_umem *umem;
> +
> +       umem = xdp_get_umem_from_qid(dev, qid);
> +       if (!umem)
> +               return -EINVAL;
> +
> +       priv->rq[qid].xsk_umem = NULL;
> +       return 0;
> +}
> +
> +int veth_xsk_umem_setup(struct net_device *dev, struct xdp_umem *umem,
> +                       u16 qid)
> +{
> +       return umem ? veth_xsk_umem_enable(dev, umem, qid) :
> +                     veth_xsk_umem_disable(dev, qid);
> +}
> +
> +int veth_xsk_async_xmit(struct net_device *dev, u32 qid)
> +{
> +       struct veth_priv *priv = netdev_priv(dev);
> +       struct veth_rq *rq;
> +
> +       rq = &priv->rq[qid];
> +
> +       if (qid >= dev->real_num_rx_queues)
> +               return -ENXIO;
> +
> +       if (!napi_if_scheduled_mark_missed(&rq->xdp_napi))
> +               napi_schedule(&rq->xdp_napi);
> +
> +       return 0;
> +}
> +
>  static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  {
>         switch (xdp->command) {
> @@ -1123,6 +1297,26 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>         case XDP_QUERY_PROG:
>                 xdp->prog_id = veth_xdp_query(dev);
>                 return 0;
> +       case XDP_QUERY_XSK_UMEM:
> +               return veth_xsk_umem_query(dev, &xdp->xsk.umem,
> +                                          xdp->xsk.queue_id);
> +       case XDP_SETUP_XSK_UMEM: {
> +               struct veth_priv *priv;
> +               int err;
> +
> +               /* Enable XDP on both sides */
> +               err = veth_enable_xdp(dev);
> +               if (err)
> +                       return err;
> +
> +               priv = netdev_priv(dev);
> +               err = veth_enable_xdp(priv->peer);

Hmm, setting the umem on one peer enables XDP on both ends? Why?

I think there's some inconsistency with this patch -- then again, I
might be missing something. To get some clarity, here are a couple of
questions/statements/thoughts:

For a veth pair A and B, where B is XDP enabled: transmissions from A,
or XDP_REDIRECT to A, will put the xdpf/skb onto B's Rx ring and
eventually schedule B's napi context (from the flush). B's napi poll
will then execute, draining the queue and running the XDP program.

In this patch, if A runs AF_XDP xmit, you schedule A's napi context,
and execute B's receive path in that context. This is racy, right?

What you'd like is that, if B is running in napi mode, the sendmsg
schedules *B's* napi and drains A's AF_XDP Tx ring in B's receive
path. What would be the method to drain A's AF_XDP Tx ring if B is
*not* running in napi (i.e. in XDP mode)? Schedule A's napi, build an
skb and pass it to the peer B? Another approach is that when A has
AF_XDP enabled, enable napi mode on B and drain from B's napi poll.
Is this why you "enable XDP on both sides", even if one peer doesn't
have an XDP program set (enabling XDP -> enabling napi for that peer)?


Björn



> +               if (err)
> +                       return err;
> +
> +               return veth_xsk_umem_setup(dev, xdp->xsk.umem,
> +                                          xdp->xsk.queue_id);
> +       }
>         default:
>                 return -EINVAL;
>         }
> @@ -1145,6 +1339,7 @@ static const struct net_device_ops veth_netdev_ops = {
>         .ndo_set_rx_headroom    = veth_set_rx_headroom,
>         .ndo_bpf                = veth_xdp,
>         .ndo_xdp_xmit           = veth_xdp_xmit,
> +       .ndo_xsk_async_xmit     = veth_xsk_async_xmit,
>  };
>
>  #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
> --
> 2.7.4
>


* Re: [bpf-next RFCv2 1/3] xsk: add xsk_umem_consume_tx_virtual.
  2018-12-20 19:48   ` Björn Töpel
@ 2018-12-21 20:02     ` William Tu
  0 siblings, 0 replies; 8+ messages in thread
From: William Tu @ 2018-12-21 20:02 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Magnus Karlsson, Alexei Starovoitov, Daniel Borkmann, Netdev,
	makita.toshiaki, Yi-Hung Wei, Karlsson, Magnus

On Thu, Dec 20, 2018 at 11:48 AM Björn Töpel <bjorn.topel@gmail.com> wrote:
>
> On Wed, 19 Dec 2018 at 01:55, William Tu <u9012063@gmail.com> wrote:
> >
> > Currently the xsk_umem_consume_tx expects only the physical NICs so
> > the api returns a dma address.  This patch introduce the new function
> > to return the virtual address, when XSK is used by a virtual device.
> >
> > Signed-off-by: William Tu <u9012063@gmail.com>
> > ---
> >  include/net/xdp_sock.h |  7 +++++++
> >  net/xdp/xdp_umem.c     |  1 +
> >  net/xdp/xsk.c          | 21 +++++++++++++++++++--
> >  3 files changed, 27 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> > index 13acb9803a6d..7fefe74f7fb5 100644
> > --- a/include/net/xdp_sock.h
> > +++ b/include/net/xdp_sock.h
> > @@ -81,6 +81,7 @@ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
> >  void xsk_umem_discard_addr(struct xdp_umem *umem);
> >  void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
> >  bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
> > +bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **addr, u32 *len);
> >  void xsk_umem_consume_tx_done(struct xdp_umem *umem);
> >  struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
> >  struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
> > @@ -165,6 +166,12 @@ static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
> >         return false;
> >  }
> >
> > +static inline bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem,
> > +                                              void **vaddr, u32 *len)
> > +{
> > +       return false;
> > +}
> > +
> >  static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
> >  {
> >  }
> > diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> > index a264cf2accd0..424ae2538f9f 100644
> > --- a/net/xdp/xdp_umem.c
> > +++ b/net/xdp/xdp_umem.c
> > @@ -60,6 +60,7 @@ struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
> >
> >         return NULL;
> >  }
> > +EXPORT_SYMBOL(xdp_get_umem_from_qid);
> >
> >  static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
> >  {
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 07156f43d295..a6ca0c0f0330 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -170,7 +170,8 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
> >  }
> >  EXPORT_SYMBOL(xsk_umem_consume_tx_done);
> >
> > -bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> > +static bool __xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
> > +                            void **vaddr, u32 *len)
> >  {
> >         struct xdp_desc desc;
> >         struct xdp_sock *xs;
> > @@ -183,7 +184,12 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> >                 if (xskq_produce_addr_lazy(umem->cq, desc.addr))
> >                         goto out;
> >
> > -               *dma = xdp_umem_get_dma(umem, desc.addr);
> > +               if (dma)
> > +                       *dma = xdp_umem_get_dma(umem, desc.addr);
> > +
> > +               if (vaddr)
> > +                       *vaddr = xdp_umem_get_data(umem, desc.addr);
> > +
>
> This function is in the fast-path, so I'm reluctant to introduce the
> branching above. What do think about something like this instead?
>
Yes, makes sense. I will merge it into my next patch set.
Thanks
William

> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 80ca48cefc42..458f0977b437 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -170,22 +170,19 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
>  }
>  EXPORT_SYMBOL(xsk_umem_consume_tx_done);
>
> -bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> +static __always_inline bool __xsk_umem_consume_tx(struct xdp_umem *umem,
> +                          struct xdp_desc *desc)
>  {
> -    struct xdp_desc desc;
>      struct xdp_sock *xs;
>
>      rcu_read_lock();
>      list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
> -        if (!xskq_peek_desc(xs->tx, &desc))
> +        if (!xskq_peek_desc(xs->tx, desc))
>              continue;
>
> -        if (xskq_produce_addr_lazy(umem->cq, desc.addr))
> +        if (xskq_produce_addr_lazy(umem->cq, desc->addr))
>              goto out;
>
> -        *dma = xdp_umem_get_dma(umem, desc.addr);
> -        *len = desc.len;
> -
>          xskq_discard_desc(xs->tx);
>          rcu_read_unlock();
>          return true;
> @@ -195,8 +192,35 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem,
> dma_addr_t *dma, u32 *len)
>      rcu_read_unlock();
>      return false;
>  }
> +
> +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> +{
> +    struct xdp_desc desc;
> +
> +    if (!__xsk_umem_consume_tx(umem, &desc))
> +        return false;
> +
> +    *dma = xdp_umem_get_dma(umem, desc.addr);
> +    *len = desc.len;
> +
> +    return true;
> +}
>  EXPORT_SYMBOL(xsk_umem_consume_tx);
>
> +bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **vaddr, u32 *len)
> +{
> +    struct xdp_desc desc;
> +
> +    if (!__xsk_umem_consume_tx(umem, &desc))
> +        return false;
> +
> +    *vaddr = xdp_umem_get_data(umem, desc.addr);
> +    *len = desc.len;
> +
> +    return true;
> +}
> +EXPORT_SYMBOL(xsk_umem_consume_tx_virtual);
> +
>  static int xsk_zc_xmit(struct sock *sk)
>  {
>      struct xdp_sock *xs = xdp_sk(sk);
>
>
> Björn
>
>
>
> >                 *len = desc.len;
> >
> >                 xskq_discard_desc(xs->tx);
> > @@ -195,8 +201,19 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> >         rcu_read_unlock();
> >         return false;
> >  }
> > +
> > +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
> > +{
> > +       return __xsk_umem_consume_tx(umem, dma, NULL, len);
> > +}
> >  EXPORT_SYMBOL(xsk_umem_consume_tx);
> >
> > +bool xsk_umem_consume_tx_virtual(struct xdp_umem *umem, void **addr, u32 *len)
> > +{
> > +       return __xsk_umem_consume_tx(umem, NULL, addr, len);
> > +}
> > +EXPORT_SYMBOL(xsk_umem_consume_tx_virtual);
> > +
> >  static int xsk_zc_xmit(struct sock *sk)
> >  {
> >         struct xdp_sock *xs = xdp_sk(sk);
> > --
> > 2.7.4
> >


* Re: [bpf-next RFCv2 2/3] veth: support AF_XDP.
  2018-12-21  8:38   ` Björn Töpel
@ 2018-12-22  0:04     ` William Tu
  0 siblings, 0 replies; 8+ messages in thread
From: William Tu @ 2018-12-22  0:04 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Magnus Karlsson, Alexei Starovoitov, Daniel Borkmann, Netdev,
	makita.toshiaki, Yi-Hung Wei, Karlsson, Magnus

On Fri, Dec 21, 2018 at 12:38 AM Björn Töpel <bjorn.topel@gmail.com> wrote:
>
> On Wed, 19 Dec 2018 at 01:55, William Tu <u9012063@gmail.com> wrote:
> >
> > The patch adds support for AF_XDP async xmit.  Users can use
> > AF_XDP on both side of the veth and get better performance, with
> > the cost of ksoftirqd doing the xmit.  The veth_xsk_async_xmit
> > simply kicks the napi function, veth_poll, to run, and the transmit
> > logic is implemented there.
> >
> > Tested using two namespaces, one runs xdpsock and the other runs
> > xdp_rxq_info.  A simple script comparing the performance with/without
> > AF_XDP shows improvement from 724Kpps to 1.1Mpps.
> >
> >   ip netns add at_ns0
> >   ip link add p0 type veth peer name p1
> >   ip link set p0 netns at_ns0
> >   ip link set dev p1 up
> >   ip netns exec at_ns0 ip link set dev p0 up
> >
> >   # receiver
> >   ip netns exec at_ns0 xdp_rxq_info --dev p0 --action XDP_DROP
> >
> >   # sender
> >   xdpsock -i p1 -t -N -z
> >   or
> >   xdpsock -i p1 -t -S
> >
> > Signed-off-by: William Tu <u9012063@gmail.com>
> > ---
> >  drivers/net/veth.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 197 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index f412ea1cef18..0ce89820ce70 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -25,6 +25,10 @@
> >  #include <linux/ptr_ring.h>
> >  #include <linux/bpf_trace.h>
> >  #include <linux/net_tstamp.h>
> > +#include <net/xdp_sock.h>
> > +#include <linux/mm.h>
> > +#include <linux/slab.h>
> > +#include <net/page_pool.h>
> >
> >  #define DRV_NAME       "veth"
> >  #define DRV_VERSION    "1.0"
> > @@ -53,6 +57,8 @@ struct veth_rq {
> >         bool                    rx_notify_masked;
> >         struct ptr_ring         xdp_ring;
> >         struct xdp_rxq_info     xdp_rxq;
> > +       struct xdp_umem *xsk_umem;
> > +       u16 qid;
> >  };
> >
> >  struct veth_priv {
> > @@ -737,15 +743,108 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
> >         return done;
> >  }
> >
> > +static int veth_xsk_send(struct napi_struct *napi, int budget)
> > +{
> > +       struct veth_rq *rq =
> > +               container_of(napi, struct veth_rq, xdp_napi);
> > +       int done = 0;
> > +
> > +       /* tx: use netif_tx_napi_add? */
> > +       while (rq->xsk_umem && budget--) {
> > +               struct veth_priv *priv, *peer_priv;
> > +               struct net_device *dev, *peer_dev;
> > +               unsigned int inner_xdp_xmit = 0;
> > +               unsigned int metasize = 0;
> > +               struct veth_rq *peer_rq;
> > +               struct xdp_frame *xdpf;
> > +               bool dropped = false;
> > +               struct sk_buff *skb;
> > +               struct page *page;
> > +               void *vaddr;
> > +               void *addr;
> > +               u32 len;
> > +
> > +               if (!xsk_umem_consume_tx_virtual(rq->xsk_umem, &vaddr, &len))
> > +                       break;
> > +
> > +               page = dev_alloc_page();
> > +               if (!page) {
> > +                       pr_warn("veth: page allocation fails\n");
> > +                       xsk_umem_complete_tx(rq->xsk_umem, 1);
> > +                       xsk_umem_consume_tx_done(rq->xsk_umem);
> > +                       return -ENOMEM;
> > +               }
> > +
> > +               addr = page_to_virt(page);
> > +               xdpf = addr;
> > +               memset(xdpf, 0, sizeof(*xdpf));
> > +
> > +               addr += sizeof(*xdpf);
> > +               memcpy(addr, vaddr, len);
> > +
> > +               xdpf->data = addr + metasize;
> > +               xdpf->len = len;
> > +               xdpf->headroom = 0;
> > +               xdpf->metasize = metasize;
> > +               xdpf->mem.type = MEM_TYPE_PAGE_SHARED;
> > +
> > +               /* Invoke peer rq to rcv */
> > +               dev = rq->dev;
> > +               priv = netdev_priv(dev);
> > +               peer_dev = priv->peer;
> > +               peer_priv = netdev_priv(peer_dev);
> > +               peer_rq = &peer_priv->rq[rq->qid];
> > +
> > +               /* put into peer rq */
> > +               skb = veth_xdp_rcv_one(peer_rq, xdpf, &inner_xdp_xmit);
> > +               if (!skb) {
> > +                       /* Peer side has XDP program attached */
> > +                       if (inner_xdp_xmit & VETH_XDP_TX) {
> > +                               /* Not supported */
> > +                               pr_warn("veth: peer XDP_TX not supported\n");
> > +                               xdp_return_frame(xdpf);
> > +                               dropped = true;
> > +                               goto skip_tx;
> > +                       } else if (inner_xdp_xmit & VETH_XDP_REDIR) {
> > +                               xdp_do_flush_map();
> > +                       } else {
> > +                               dropped = true;
> > +                       }
> > +               } else {
> > +                       /* Peer side has no XDP attached */
> > +                       napi_gro_receive(&peer_rq->xdp_napi, skb);
> > +               }
> > +skip_tx:
> > +               xsk_umem_complete_tx(rq->xsk_umem, 1);
> > +               xsk_umem_consume_tx_done(rq->xsk_umem);
> > +
> > +               /* update peer stats */
> > +               u64_stats_update_begin(&peer_rq->stats.syncp);
> > +               peer_rq->stats.xdp_packets++;
> > +               peer_rq->stats.xdp_bytes += len;
> > +               if (dropped)
> > +                       rq->stats.xdp_drops++;
> > +               u64_stats_update_end(&peer_rq->stats.syncp);
> > +               done++;
> > +       }
> > +
> > +       return done;
> > +}
> > +
> >  static int veth_poll(struct napi_struct *napi, int budget)
> >  {
> >         struct veth_rq *rq =
> >                 container_of(napi, struct veth_rq, xdp_napi);
> >         unsigned int xdp_xmit = 0;
> > -       int done;
> > +       int done = 0;
> > +       int tx_done;
> > +
> > +       tx_done = veth_xsk_send(napi, budget);
> > +       if (tx_done > 0)
> > +               done += tx_done;
> >
> >         xdp_set_return_frame_no_direct();
> > -       done = veth_xdp_rcv(rq, budget, &xdp_xmit);
> > +       done += veth_xdp_rcv(rq, budget, &xdp_xmit);
> >
> >         if (done < budget && napi_complete_done(napi, done)) {
> >                 /* Write rx_notify_masked before reading ptr_ring */
> > @@ -776,6 +875,7 @@ static int veth_napi_add(struct net_device *dev)
> >                 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
> >                 if (err)
> >                         goto err_xdp_ring;
> > +               rq->qid = i;
> >         }
> >
> >         for (i = 0; i < dev->real_num_rx_queues; i++) {
> > @@ -812,6 +912,7 @@ static void veth_napi_del(struct net_device *dev)
> >                 netif_napi_del(&rq->xdp_napi);
> >                 rq->rx_notify_masked = false;
> >                 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
> > +               rq->qid = -1;
> >         }
> >  }
> >
> > @@ -836,6 +937,7 @@ static int veth_enable_xdp(struct net_device *dev)
> >
> >                         /* Save original mem info as it can be overwritten */
> >                         rq->xdp_mem = rq->xdp_rxq.mem;
> > +                       rq->qid = i;
> >                 }
> >
> >                 err = veth_napi_add(dev);
> > @@ -1115,6 +1217,78 @@ static u32 veth_xdp_query(struct net_device *dev)
> >         return 0;
> >  }
> >
> > +int veth_xsk_umem_query(struct net_device *dev, struct xdp_umem **umem,
> > +                       u16 qid)
> > +{
> > +       struct xdp_umem *queried_umem;
> > +
> > +       queried_umem = xdp_get_umem_from_qid(dev, qid);
> > +
> > +       if (!queried_umem)
> > +               return -EINVAL;
> > +
> > +       *umem = queried_umem;
> > +       return 0;
> > +}
> > +
> > +static int veth_xsk_umem_enable(struct net_device *dev,
> > +                               struct xdp_umem *umem,
> > +                               u16 qid)
> > +{
> > +       struct veth_priv *priv = netdev_priv(dev);
> > +       struct xdp_umem_fq_reuse *reuseq;
> > +       int err = 0;
> > +
> > +       if (qid >= dev->real_num_rx_queues)
> > +               return -EINVAL;
> > +
> > +       reuseq = xsk_reuseq_prepare(priv->rq[0].xdp_ring.size);
> > +       if (!reuseq)
> > +               return -ENOMEM;
> > +
> > +       xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
> > +
> > +       priv->rq[qid].xsk_umem = umem;
> > +       return err;
> > +}
> > +
> > +static int veth_xsk_umem_disable(struct net_device *dev,
> > +                                u16 qid)
> > +{
> > +       struct veth_priv *priv = netdev_priv(dev);
> > +       struct xdp_umem *umem;
> > +
> > +       umem = xdp_get_umem_from_qid(dev, qid);
> > +       if (!umem)
> > +               return -EINVAL;
> > +
> > +       priv->rq[qid].xsk_umem = NULL;
> > +       return 0;
> > +}
> > +
> > +int veth_xsk_umem_setup(struct net_device *dev, struct xdp_umem *umem,
> > +                       u16 qid)
> > +{
> > +       return umem ? veth_xsk_umem_enable(dev, umem, qid) :
> > +                     veth_xsk_umem_disable(dev, qid);
> > +}
> > +
> > +int veth_xsk_async_xmit(struct net_device *dev, u32 qid)
> > +{
> > +       struct veth_priv *priv = netdev_priv(dev);
> > +       struct veth_rq *rq;
> > +
> > +       rq = &priv->rq[qid];
> > +
> > +       if (qid >= dev->real_num_rx_queues)
> > +               return -ENXIO;
> > +
> > +       if (!napi_if_scheduled_mark_missed(&rq->xdp_napi))
> > +               napi_schedule(&rq->xdp_napi);
> > +
> > +       return 0;
> > +}
> > +
> >  static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> >  {
> >         switch (xdp->command) {
> > @@ -1123,6 +1297,26 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> >         case XDP_QUERY_PROG:
> >                 xdp->prog_id = veth_xdp_query(dev);
> >                 return 0;
> > +       case XDP_QUERY_XSK_UMEM:
> > +               return veth_xsk_umem_query(dev, &xdp->xsk.umem,
> > +                                          xdp->xsk.queue_id);
> > +       case XDP_SETUP_XSK_UMEM: {
> > +               struct veth_priv *priv;
> > +               int err;
> > +
> > +               /* Enable XDP on both sides */
> > +               err = veth_enable_xdp(dev);
> > +               if (err)
> > +                       return err;
> > +
> > +               priv = netdev_priv(dev);
> > +               err = veth_enable_xdp(priv->peer);
>
> Hmm, setting the umem on one peer, enables XDP on both ends? Why?
>
> I'm think there's some inconsistency with this patch -- again I might
> be missing something. To get some clarity, here are a couple of
> questions/statements/thoughts:
>
> For a veth pair A and B, B is XDP enabled. Transmissions from A or
> XDP_REDIRECT to A, will put the xdpf/skb onto B's Rx ring, and
> eventually schedule B napi context (from flush). B's napi poll will
> execute, draining the queue and execute the XDP program.
>
> In this patch, if A runs AF_XDP xmit, you schedule A's napi context,
> and execute B's receive path in that context. This is racy, right?

Yes, I think you're right.
>
> What you'd like is that, if B is running in a napi-mode, the sendmsg
> schedules *B's* napi, and drain A's AF_XDP Tx ring in B's receive

OK, it makes sense to use B's napi to receive. Then, for the async
xmit, it will kick B's napi instead of A's. I will make the change in
the next patch.
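
Purely as an illustration of that direction (a hypothetical sketch,
not a final API -- names reused from this patch, and locking around
priv->peer glossed over), the async xmit hook would then kick the
peer's napi rather than its own:

	int veth_xsk_async_xmit(struct net_device *dev, u32 qid)
	{
		struct veth_priv *priv = netdev_priv(dev);
		struct veth_priv *peer_priv = netdev_priv(priv->peer);
		struct veth_rq *peer_rq;

		if (qid >= dev->real_num_rx_queues)
			return -ENXIO;

		peer_rq = &peer_priv->rq[qid];

		/* schedule B's (the peer's) napi so that A's AF_XDP Tx
		 * ring is drained in B's receive path
		 */
		if (!napi_if_scheduled_mark_missed(&peer_rq->xdp_napi))
			napi_schedule(&peer_rq->xdp_napi);

		return 0;
	}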

> path. What would be the method to drain A's AF_XDP Tx ring if B is
> *not* running in napi (i.e. in XDP mode)? Schedule A's napi, build skb
> and pass it to the peer B? Another approach is that when a A has
> AF_XDP enabled, enable napi B to mode, and drain from B's napi poll.
> Is this why you "enable XDP on both sides", even if one peer hasn't an
> XDP program set (enabling XDP -> enable napi for that peer)?

Right, I want to make sure both sides have NAPI, so I enable XDP on both.

Thanks for the review.
William

>
>
> Björn
>
>
>
> > +               if (err)
> > +                       return err;
> > +
> > +               return veth_xsk_umem_setup(dev, xdp->xsk.umem,
> > +                                          xdp->xsk.queue_id);
> > +       }
> >         default:
> >                 return -EINVAL;
> >         }
> > @@ -1145,6 +1339,7 @@ static const struct net_device_ops veth_netdev_ops = {
> >         .ndo_set_rx_headroom    = veth_set_rx_headroom,
> >         .ndo_bpf                = veth_xdp,
> >         .ndo_xdp_xmit           = veth_xdp_xmit,
> > +       .ndo_xsk_async_xmit     = veth_xsk_async_xmit,
> >  };
> >
> >  #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
> > --
> > 2.7.4
> >


