* [net-next, PATCH 0/2, v3] net: socionext: XDP support
@ 2018-09-29 11:28 Ilias Apalodimas
  2018-09-29 11:28 ` [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA Ilias Apalodimas
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Ilias Apalodimas @ 2018-09-29 11:28 UTC (permalink / raw)
  To: netdev, jaswinder.singh
  Cc: ard.biesheuvel, masami.hiramatsu, arnd, bjorn.topel,
	magnus.karlsson, brouer, daniel, ast, jesus.sanchez-palencia,
	vinicius.gomes, makita.toshiaki, Ilias Apalodimas

This patch series adds AF_XDP support to the socionext netsec driver.
In addition, the new DMA allocation scheme offers a 10% boost in Rx
pps rate with 64b packets.

- patch [1/2]: Use a different allocation scheme for Rx DMA buffers to 
  prepare the driver for AF_XDP support
- patch [2/2]: Add XDP support without zero-copy

Test and performance numbers (64b packets):
-------------------------------------------
- Normal SKBs on Rx: ~217kpps

Test: pktgen -> Intel i210 -> netsec -> XDP_TX/XDP_REDIRECT
- XDP_TX: 320kpps
- XDP_REDIRECT: 320kpps

qemu -> pktgen -> virtio -> ndo_xdp_xmit -> netsec
- ndo_xdp_xmit: Could not send more than 120kpps; the interface forwarded
                that rate successfully

Changes since v2:
 - Always allocate Rx buffers with XDP_PACKET_HEADROOM
 
 Björn Töpel:
 - Added locking in the Tx queue

 Jesper Dangaard Brouer:
 - Added support for .ndo_xdp_xmit
 - XDP_TX does not flush every packet

Changes since v1:
- patch [2/2]:
 Toshiaki Makita:
 - Added XDP_PACKET_HEADROOM
 - Fixed a bug on XDP_PASS case
 - Using extack for error messages instead of netdev_warn when
   trying to set up XDP

Ilias Apalodimas (2):
  net: socionext: different approach on DMA
  net: socionext: add XDP support

 drivers/net/ethernet/socionext/netsec.c | 541 +++++++++++++++++++++++++-------
 1 file changed, 426 insertions(+), 115 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-09-29 11:28 [net-next, PATCH 0/2, v3] net: socionext: XDP support Ilias Apalodimas
@ 2018-09-29 11:28 ` Ilias Apalodimas
  2018-10-01  9:26   ` Jesper Dangaard Brouer
  2018-09-29 11:28 ` [net-next, PATCH 2/2, v3] net: socionext: add XDP support Ilias Apalodimas
  2018-10-01 12:48 ` [net-next, PATCH 0/2, v3] net: socionext: " Björn Töpel
  2 siblings, 1 reply; 13+ messages in thread
From: Ilias Apalodimas @ 2018-09-29 11:28 UTC (permalink / raw)
  To: netdev, jaswinder.singh
  Cc: ard.biesheuvel, masami.hiramatsu, arnd, bjorn.topel,
	magnus.karlsson, brouer, daniel, ast, jesus.sanchez-palencia,
	vinicius.gomes, makita.toshiaki, Ilias Apalodimas

The current driver dynamically allocates an skb and maps it as a DMA Rx
buffer. A following patch introduces XDP functionality, so we need a
different allocation scheme. Buffers are allocated dynamically and
mapped into hardware. During Rx operation the driver uses
build_skb() to produce the necessary buffers for the network stack.

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/socionext/netsec.c | 238 +++++++++++++++++---------------
 1 file changed, 129 insertions(+), 109 deletions(-)

diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 7aa5ebb..8f788a1 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -296,6 +296,11 @@ struct netsec_rx_pkt_info {
 	bool err_flag;
 };
 
+static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num);
+
+static void *netsec_alloc_rx_data(struct netsec_priv *priv,
+				  dma_addr_t *dma_addr, u16 *len);
+
 static void netsec_write(struct netsec_priv *priv, u32 reg_addr, u32 val)
 {
 	writel(val, priv->ioaddr + reg_addr);
@@ -556,34 +561,10 @@ static const struct ethtool_ops netsec_ethtool_ops = {
 
 /************* NETDEV_OPS FOLLOW *************/
 
-static struct sk_buff *netsec_alloc_skb(struct netsec_priv *priv,
-					struct netsec_desc *desc)
-{
-	struct sk_buff *skb;
-
-	if (device_get_dma_attr(priv->dev) == DEV_DMA_COHERENT) {
-		skb = netdev_alloc_skb_ip_align(priv->ndev, desc->len);
-	} else {
-		desc->len = L1_CACHE_ALIGN(desc->len);
-		skb = netdev_alloc_skb(priv->ndev, desc->len);
-	}
-	if (!skb)
-		return NULL;
-
-	desc->addr = skb->data;
-	desc->dma_addr = dma_map_single(priv->dev, desc->addr, desc->len,
-					DMA_FROM_DEVICE);
-	if (dma_mapping_error(priv->dev, desc->dma_addr)) {
-		dev_kfree_skb_any(skb);
-		return NULL;
-	}
-	return skb;
-}
 
 static void netsec_set_rx_de(struct netsec_priv *priv,
 			     struct netsec_desc_ring *dring, u16 idx,
-			     const struct netsec_desc *desc,
-			     struct sk_buff *skb)
+			     const struct netsec_desc *desc)
 {
 	struct netsec_de *de = dring->vaddr + DESC_SZ * idx;
 	u32 attr = (1 << NETSEC_RX_PKT_OWN_FIELD) |
@@ -602,59 +583,6 @@ static void netsec_set_rx_de(struct netsec_priv *priv,
 	dring->desc[idx].dma_addr = desc->dma_addr;
 	dring->desc[idx].addr = desc->addr;
 	dring->desc[idx].len = desc->len;
-	dring->desc[idx].skb = skb;
-}
-
-static struct sk_buff *netsec_get_rx_de(struct netsec_priv *priv,
-					struct netsec_desc_ring *dring,
-					u16 idx,
-					struct netsec_rx_pkt_info *rxpi,
-					struct netsec_desc *desc, u16 *len)
-{
-	struct netsec_de de = {};
-
-	memcpy(&de, dring->vaddr + DESC_SZ * idx, DESC_SZ);
-
-	*len = de.buf_len_info >> 16;
-
-	rxpi->err_flag = (de.attr >> NETSEC_RX_PKT_ER_FIELD) & 1;
-	rxpi->rx_cksum_result = (de.attr >> NETSEC_RX_PKT_CO_FIELD) & 3;
-	rxpi->err_code = (de.attr >> NETSEC_RX_PKT_ERR_FIELD) &
-							NETSEC_RX_PKT_ERR_MASK;
-	*desc = dring->desc[idx];
-	return desc->skb;
-}
-
-static struct sk_buff *netsec_get_rx_pkt_data(struct netsec_priv *priv,
-					      struct netsec_rx_pkt_info *rxpi,
-					      struct netsec_desc *desc,
-					      u16 *len)
-{
-	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	struct sk_buff *tmp_skb, *skb = NULL;
-	struct netsec_desc td;
-	int tail;
-
-	*rxpi = (struct netsec_rx_pkt_info){};
-
-	td.len = priv->ndev->mtu + 22;
-
-	tmp_skb = netsec_alloc_skb(priv, &td);
-
-	tail = dring->tail;
-
-	if (!tmp_skb) {
-		netsec_set_rx_de(priv, dring, tail, &dring->desc[tail],
-				 dring->desc[tail].skb);
-	} else {
-		skb = netsec_get_rx_de(priv, dring, tail, rxpi, desc, len);
-		netsec_set_rx_de(priv, dring, tail, &td, tmp_skb);
-	}
-
-	/* move tail ahead */
-	dring->tail = (dring->tail + 1) % DESC_NUM;
-
-	return skb;
 }
 
 static int netsec_clean_tx_dring(struct netsec_priv *priv, int budget)
@@ -721,19 +649,28 @@ static int netsec_process_tx(struct netsec_priv *priv, int budget)
 	return done;
 }
 
+static void netsec_adv_desc(u16 *idx)
+{
+	*idx = *idx + 1;
+	if (unlikely(*idx >= DESC_NUM))
+		*idx = 0;
+}
+
 static int netsec_process_rx(struct netsec_priv *priv, int budget)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
 	struct net_device *ndev = priv->ndev;
-	struct netsec_rx_pkt_info rx_info;
-	int done = 0;
-	struct netsec_desc desc;
 	struct sk_buff *skb;
-	u16 len;
+	int done = 0;
 
 	while (done < budget) {
 		u16 idx = dring->tail;
 		struct netsec_de *de = dring->vaddr + (DESC_SZ * idx);
+		struct netsec_desc *desc = &dring->desc[idx];
+		struct netsec_rx_pkt_info rpi;
+		u16 pkt_len, desc_len;
+		dma_addr_t dma_handle;
+		void *buf_addr;
 
 		if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD))
 			break;
@@ -744,28 +681,62 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 		 */
 		dma_rmb();
 		done++;
-		skb = netsec_get_rx_pkt_data(priv, &rx_info, &desc, &len);
-		if (unlikely(!skb) || rx_info.err_flag) {
+
+		pkt_len = de->buf_len_info >> 16;
+		rpi.err_code = (de->attr >> NETSEC_RX_PKT_ERR_FIELD) &
+			NETSEC_RX_PKT_ERR_MASK;
+		rpi.err_flag = (de->attr >> NETSEC_RX_PKT_ER_FIELD) & 1;
+		if (rpi.err_flag) {
 			netif_err(priv, drv, priv->ndev,
-				  "%s: rx fail err(%d)\n",
-				  __func__, rx_info.err_code);
+				  "%s: rx fail err(%d)\n", __func__,
+				  rpi.err_code);
 			ndev->stats.rx_dropped++;
+			netsec_adv_desc(&dring->tail);
+			/* reuse buffer page frag */
+			netsec_rx_fill(priv, idx, 1);
 			continue;
 		}
+		rpi.rx_cksum_result = (de->attr >> NETSEC_RX_PKT_CO_FIELD) & 3;
 
-		dma_unmap_single(priv->dev, desc.dma_addr, desc.len,
-				 DMA_FROM_DEVICE);
-		skb_put(skb, len);
+		buf_addr = netsec_alloc_rx_data(priv, &dma_handle, &desc_len);
+		if (unlikely(!buf_addr))
+			break;
+
+		dma_sync_single_for_cpu(priv->dev, desc->dma_addr, pkt_len,
+					DMA_FROM_DEVICE);
+		prefetch(desc->addr);
+
+		skb = build_skb(desc->addr, desc->len);
+		if (unlikely(!skb)) {
+			dma_unmap_single(priv->dev, dma_handle, desc_len,
+					 DMA_TO_DEVICE);
+			skb_free_frag(buf_addr);
+			netif_err(priv, drv, priv->ndev,
+				  "rx failed to alloc skb\n");
+			break;
+		}
+		dma_unmap_single_attrs(priv->dev, desc->dma_addr, desc->len,
+				       DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+
+		/* Update the descriptor with fresh buffers */
+		desc->len = desc_len;
+		desc->dma_addr = dma_handle;
+		desc->addr = buf_addr;
+
+		skb_put(skb, pkt_len);
 		skb->protocol = eth_type_trans(skb, priv->ndev);
 
 		if (priv->rx_cksum_offload_flag &&
-		    rx_info.rx_cksum_result == NETSEC_RX_CKSUM_OK)
+		    rpi.rx_cksum_result == NETSEC_RX_CKSUM_OK)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 		if (napi_gro_receive(&priv->napi, skb) != GRO_DROP) {
 			ndev->stats.rx_packets++;
-			ndev->stats.rx_bytes += len;
+			ndev->stats.rx_bytes += pkt_len;
 		}
+
+		netsec_rx_fill(priv, idx, 1);
+		netsec_adv_desc(&dring->tail);
 	}
 
 	return done;
@@ -928,7 +899,10 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id)
 		dma_unmap_single(priv->dev, desc->dma_addr, desc->len,
 				 id == NETSEC_RING_RX ? DMA_FROM_DEVICE :
 							      DMA_TO_DEVICE);
-		dev_kfree_skb(desc->skb);
+		if (id == NETSEC_RING_RX)
+			skb_free_frag(desc->addr);
+		else if (id == NETSEC_RING_TX)
+			dev_kfree_skb(desc->skb);
 	}
 
 	memset(dring->desc, 0, sizeof(struct netsec_desc) * DESC_NUM);
@@ -953,50 +927,96 @@ static void netsec_free_dring(struct netsec_priv *priv, int id)
 	dring->desc = NULL;
 }
 
+static void *netsec_alloc_rx_data(struct netsec_priv *priv,
+				  dma_addr_t *dma_handle, u16 *desc_len)
+{
+	size_t len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN + NET_SKB_PAD +
+		NET_IP_ALIGN;
+	dma_addr_t mapping;
+	void *buf;
+
+	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	len = SKB_DATA_ALIGN(len);
+
+	buf = napi_alloc_frag(len);
+	if (!buf)
+		return NULL;
+
+	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(priv->dev, mapping)))
+		goto err_out;
+
+	*dma_handle = mapping;
+	*desc_len = len;
+
+	return buf;
+
+err_out:
+	skb_free_frag(buf);
+	return NULL;
+}
+
+static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num)
+{
+	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
+	u16 idx = from;
+
+	while (num) {
+		netsec_set_rx_de(priv, dring, idx, &dring->desc[idx]);
+		idx++;
+		if (idx >= DESC_NUM)
+			idx = 0;
+		num--;
+	}
+}
+
 static int netsec_alloc_dring(struct netsec_priv *priv, enum ring_id id)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[id];
-	int ret = 0;
 
 	dring->vaddr = dma_zalloc_coherent(priv->dev, DESC_SZ * DESC_NUM,
 					   &dring->desc_dma, GFP_KERNEL);
-	if (!dring->vaddr) {
-		ret = -ENOMEM;
+	if (!dring->vaddr)
 		goto err;
-	}
 
 	dring->desc = kcalloc(DESC_NUM, sizeof(*dring->desc), GFP_KERNEL);
-	if (!dring->desc) {
-		ret = -ENOMEM;
+	if (!dring->desc)
 		goto err;
-	}
 
 	return 0;
 err:
 	netsec_free_dring(priv, id);
 
-	return ret;
+	return -ENOMEM;
 }
 
 static int netsec_setup_rx_dring(struct netsec_priv *priv)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	struct netsec_desc desc;
-	struct sk_buff *skb;
-	int n;
+	int i;
 
-	desc.len = priv->ndev->mtu + 22;
+	for (i = 0; i < DESC_NUM; i++) {
+		struct netsec_desc *desc = &dring->desc[i];
+		dma_addr_t dma_handle;
+		void *buf;
+		u16 len;
 
-	for (n = 0; n < DESC_NUM; n++) {
-		skb = netsec_alloc_skb(priv, &desc);
-		if (!skb) {
+		buf = netsec_alloc_rx_data(priv, &dma_handle, &len);
+		if (!buf) {
 			netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
-			return -ENOMEM;
+			goto err_out;
 		}
-		netsec_set_rx_de(priv, dring, n, &desc, skb);
+		desc->dma_addr = dma_handle;
+		desc->addr = buf;
+		desc->len = len;
 	}
 
+	netsec_rx_fill(priv, 0, DESC_NUM);
+
 	return 0;
+
+err_out:
+	return -ENOMEM;
 }
 
 static int netsec_netdev_load_ucode_region(struct netsec_priv *priv, u32 reg,
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [net-next, PATCH 2/2, v3] net: socionext: add XDP support
  2018-09-29 11:28 [net-next, PATCH 0/2, v3] net: socionext: XDP support Ilias Apalodimas
  2018-09-29 11:28 ` [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA Ilias Apalodimas
@ 2018-09-29 11:28 ` Ilias Apalodimas
  2018-10-01 12:48 ` [net-next, PATCH 0/2, v3] net: socionext: " Björn Töpel
  2 siblings, 0 replies; 13+ messages in thread
From: Ilias Apalodimas @ 2018-09-29 11:28 UTC (permalink / raw)
  To: netdev, jaswinder.singh
  Cc: ard.biesheuvel, masami.hiramatsu, arnd, bjorn.topel,
	magnus.karlsson, brouer, daniel, ast, jesus.sanchez-palencia,
	vinicius.gomes, makita.toshiaki, Ilias Apalodimas

Add basic XDP support. The interface only supports one Tx queue for now,
so locking is introduced on the Tx queue when XDP is enabled to make sure
.ndo_start_xmit and .ndo_xdp_xmit do not corrupt the Tx ring.

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/socionext/netsec.c | 345 +++++++++++++++++++++++++++++---
 1 file changed, 318 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 8f788a1..2b29363 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -9,6 +9,9 @@
 #include <linux/etherdevice.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/netlink.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 #include <net/tcp.h>
 #include <net/ip6_checksum.h>
@@ -238,6 +241,15 @@
 
 #define NETSEC_F_NETSEC_VER_MAJOR_NUM(x)	((x) & 0xffff0000)
 
+#define NETSEC_XDP_PASS          0
+#define NETSEC_XDP_CONSUMED      BIT(0)
+#define NETSEC_XDP_TX            BIT(1)
+#define NETSEC_XDP_REDIR         BIT(2)
+#define NETSEC_XDP_RX_OK (NETSEC_XDP_PASS | NETSEC_XDP_TX | NETSEC_XDP_REDIR)
+
+#define NETSEC_RXBUF_HEADROOM (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \
+			       NET_IP_ALIGN)
+
 enum ring_id {
 	NETSEC_RING_TX = 0,
 	NETSEC_RING_RX
@@ -256,11 +268,16 @@ struct netsec_desc_ring {
 	void *vaddr;
 	u16 pkt_cnt;
 	u16 head, tail;
+	u16 xdp_xmit; /* netsec_xdp_xmit packets */
+	bool is_xdp;
+	struct xdp_rxq_info xdp_rxq;
+	spinlock_t lock; /* XDP tx queue locking */
 };
 
 struct netsec_priv {
 	struct netsec_desc_ring desc_ring[NETSEC_RING_MAX];
 	struct ethtool_coalesce et_coalesce;
+	struct bpf_prog *xdp_prog;
 	spinlock_t reglock; /* protect reg access */
 	struct napi_struct napi;
 	phy_interface_t phy_interface;
@@ -297,6 +314,8 @@ struct netsec_rx_pkt_info {
 };
 
 static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num);
+static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog,
+			  struct xdp_buff *xdp);
 
 static void *netsec_alloc_rx_data(struct netsec_priv *priv,
 				  dma_addr_t *dma_addr, u16 *len);
@@ -590,6 +609,8 @@ static int netsec_clean_tx_dring(struct netsec_priv *priv, int budget)
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_TX];
 	unsigned int pkts, bytes;
 
+	if (dring->is_xdp)
+		spin_lock(&dring->lock);
 	dring->pkt_cnt += netsec_read(priv, NETSEC_REG_NRM_TX_DONE_PKTCNT);
 
 	if (dring->pkt_cnt < budget)
@@ -615,13 +636,23 @@ static int netsec_clean_tx_dring(struct netsec_priv *priv, int budget)
 
 		dma_unmap_single(priv->dev, desc->dma_addr, desc->len,
 				 DMA_TO_DEVICE);
-		if (eop) {
-			pkts++;
+
+		if (!eop) {
+			*desc = (struct netsec_desc){};
+			continue;
+		}
+
+		if (!desc->skb) {
+			skb_free_frag(desc->addr);
+		} else {
 			bytes += desc->skb->len;
 			dev_kfree_skb(desc->skb);
 		}
+		pkts++;
 		*desc = (struct netsec_desc){};
 	}
+	if (dring->is_xdp)
+		spin_unlock(&dring->lock);
 	dring->pkt_cnt -= budget;
 
 	priv->ndev->stats.tx_packets += budget;
@@ -656,11 +687,30 @@ static void netsec_adv_desc(u16 *idx)
 		*idx = 0;
 }
 
+static void netsec_xdp_ring_tx_db(struct netsec_priv *priv, u16 pkts)
+{
+	if (likely(pkts))
+		netsec_write(priv, NETSEC_REG_NRM_TX_PKTCNT, pkts);
+}
+
+static void netsec_finalize_xdp_rx(struct netsec_priv *priv, u32 xdp_res,
+				   u16 pkts)
+{
+	if (xdp_res & NETSEC_XDP_REDIR)
+		xdp_do_flush_map();
+
+	if (xdp_res & NETSEC_XDP_TX)
+		netsec_xdp_ring_tx_db(priv, pkts);
+}
+
 static int netsec_process_rx(struct netsec_priv *priv, int budget)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
+	struct bpf_prog *xdp_prog = READ_ONCE(priv->xdp_prog);
 	struct net_device *ndev = priv->ndev;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
+	u16 xdp_xmit = 0;
+	u32 xdp_act = 0;
 	int done = 0;
 
 	while (done < budget) {
@@ -668,8 +718,10 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 		struct netsec_de *de = dring->vaddr + (DESC_SZ * idx);
 		struct netsec_desc *desc = &dring->desc[idx];
 		struct netsec_rx_pkt_info rpi;
-		u16 pkt_len, desc_len;
+		u32 xdp_result = XDP_PASS;
 		dma_addr_t dma_handle;
+		u16 pkt_len, desc_len;
+		struct xdp_buff xdp;
 		void *buf_addr;
 
 		if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD))
@@ -706,7 +758,23 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 					DMA_FROM_DEVICE);
 		prefetch(desc->addr);
 
-		skb = build_skb(desc->addr, desc->len);
+		xdp.data_hard_start = desc->addr;
+		xdp.data = desc->addr + NETSEC_RXBUF_HEADROOM;
+		xdp_set_data_meta_invalid(&xdp);
+		xdp.data_end = xdp.data + pkt_len;
+		xdp.rxq = &dring->xdp_rxq;
+
+		if (xdp_prog) {
+			xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp);
+			if (xdp_result != NETSEC_XDP_PASS) {
+				xdp_act |= xdp_result;
+				if (xdp_result == NETSEC_XDP_TX)
+					xdp_xmit++;
+				goto next;
+			}
+		}
+
+		skb = build_skb(xdp.data_hard_start, desc->len);
 		if (unlikely(!skb)) {
 			dma_unmap_single(priv->dev, dma_handle, desc_len,
 					 DMA_TO_DEVICE);
@@ -715,30 +783,35 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 				  "rx failed to alloc skb\n");
 			break;
 		}
-		dma_unmap_single_attrs(priv->dev, desc->dma_addr, desc->len,
-				       DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
 
-		/* Update the descriptor with fresh buffers */
-		desc->len = desc_len;
-		desc->dma_addr = dma_handle;
-		desc->addr = buf_addr;
-
-		skb_put(skb, pkt_len);
+		skb_reserve(skb, xdp.data - xdp.data_hard_start);
+		skb_put(skb, xdp.data_end - xdp.data);
 		skb->protocol = eth_type_trans(skb, priv->ndev);
 
 		if (priv->rx_cksum_offload_flag &&
 		    rpi.rx_cksum_result == NETSEC_RX_CKSUM_OK)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-		if (napi_gro_receive(&priv->napi, skb) != GRO_DROP) {
+next:
+		if ((skb && napi_gro_receive(&priv->napi, skb) != GRO_DROP) ||
+		    xdp_result & NETSEC_XDP_RX_OK) {
 			ndev->stats.rx_packets++;
-			ndev->stats.rx_bytes += pkt_len;
+			ndev->stats.rx_bytes += xdp.data_end - xdp.data;
 		}
 
+		dma_unmap_single_attrs(priv->dev, desc->dma_addr, desc->len,
+				       DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+
+		/* Update the descriptor with fresh buffers */
+		desc->len = desc_len;
+		desc->dma_addr = dma_handle;
+		desc->addr = buf_addr;
 		netsec_rx_fill(priv, idx, 1);
 		netsec_adv_desc(&dring->tail);
 	}
 
+	netsec_finalize_xdp_rx(priv, xdp_act, xdp_xmit);
+
 	return done;
 }
 
@@ -805,7 +878,9 @@ static void netsec_set_tx_de(struct netsec_priv *priv,
 	de->data_buf_addr_lw = lower_32_bits(desc->dma_addr);
 	de->buf_len_info = (tx_ctrl->tcp_seg_len << 16) | desc->len;
 	de->attr = attr;
-	dma_wmb();
+	/* under spin_lock if using XDP */
+	if (!dring->is_xdp)
+		dma_wmb();
 
 	dring->desc[idx] = *desc;
 	dring->desc[idx].skb = skb;
@@ -824,6 +899,8 @@ static netdev_tx_t netsec_netdev_start_xmit(struct sk_buff *skb,
 	u16 tso_seg_len = 0;
 	int filled;
 
+	if (dring->is_xdp)
+		spin_lock_bh(&dring->lock);
 	/* differentiate between full/emtpy ring */
 	if (dring->head >= dring->tail)
 		filled = dring->head - dring->tail;
@@ -831,6 +908,8 @@ static netdev_tx_t netsec_netdev_start_xmit(struct sk_buff *skb,
 		filled = dring->head + DESC_NUM - dring->tail;
 
 	if (DESC_NUM - filled < 2) { /* if less than 2 available */
+		if (dring->is_xdp)
+			spin_unlock_bh(&dring->lock);
 		netif_err(priv, drv, priv->ndev, "%s: TxQFull!\n", __func__);
 		netif_stop_queue(priv->ndev);
 		dma_wmb();
@@ -864,6 +943,8 @@ static netdev_tx_t netsec_netdev_start_xmit(struct sk_buff *skb,
 	tx_desc.dma_addr = dma_map_single(priv->dev, skb->data,
 					  skb_headlen(skb), DMA_TO_DEVICE);
 	if (dma_mapping_error(priv->dev, tx_desc.dma_addr)) {
+		if (dring->is_xdp)
+			spin_unlock_bh(&dring->lock);
 		netif_err(priv, drv, priv->ndev,
 			  "%s: DMA mapping failed\n", __func__);
 		ndev->stats.tx_dropped++;
@@ -877,6 +958,8 @@ static netdev_tx_t netsec_netdev_start_xmit(struct sk_buff *skb,
 	netdev_sent_queue(priv->ndev, skb->len);
 
 	netsec_set_tx_de(priv, dring, &tx_ctrl, &tx_desc, skb);
+	if (dring->is_xdp)
+		spin_unlock_bh(&dring->lock);
 	netsec_write(priv, NETSEC_REG_NRM_TX_PKTCNT, 1); /* submit another tx */
 
 	return NETDEV_TX_OK;
@@ -891,6 +974,9 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id)
 	if (!dring->vaddr || !dring->desc)
 		return;
 
+	if (xdp_rxq_info_is_reg(&dring->xdp_rxq))
+		xdp_rxq_info_unreg(&dring->xdp_rxq);
+
 	for (idx = 0; idx < DESC_NUM; idx++) {
 		desc = &dring->desc[idx];
 		if (!desc->addr)
@@ -930,24 +1016,24 @@ static void netsec_free_dring(struct netsec_priv *priv, int id)
 static void *netsec_alloc_rx_data(struct netsec_priv *priv,
 				  dma_addr_t *dma_handle, u16 *desc_len)
 {
-	size_t len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN + NET_SKB_PAD +
-		NET_IP_ALIGN;
+	size_t total_len = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	size_t payload_len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN;
 	dma_addr_t mapping;
 	void *buf;
 
-	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	len = SKB_DATA_ALIGN(len);
+	total_len += SKB_DATA_ALIGN(payload_len + NETSEC_RXBUF_HEADROOM);
 
-	buf = napi_alloc_frag(len);
+	buf = napi_alloc_frag(total_len);
 	if (!buf)
 		return NULL;
 
-	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);
+	mapping = dma_map_single(priv->dev, buf + NETSEC_RXBUF_HEADROOM,
+				 payload_len, DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(priv->dev, mapping)))
 		goto err_out;
 
 	*dma_handle = mapping;
-	*desc_len = len;
+	*desc_len = total_len;
 
 	return buf;
 
@@ -990,10 +1076,27 @@ static int netsec_alloc_dring(struct netsec_priv *priv, enum ring_id id)
 	return -ENOMEM;
 }
 
+static void netsec_setup_tx_dring(struct netsec_priv *priv)
+{
+	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_TX];
+	struct bpf_prog *xdp_prog = READ_ONCE(priv->xdp_prog);
+
+	if (xdp_prog)
+		dring->is_xdp = true;
+	else
+		dring->is_xdp = false;
+}
+
 static int netsec_setup_rx_dring(struct netsec_priv *priv)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	int i;
+	struct bpf_prog *xdp_prog = READ_ONCE(priv->xdp_prog);
+	int i, err;
+
+	if (xdp_prog)
+		dring->is_xdp = true;
+	else
+		dring->is_xdp = false;
 
 	for (i = 0; i < DESC_NUM; i++) {
 		struct netsec_desc *desc = &dring->desc[i];
@@ -1002,20 +1105,29 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv)
 		u16 len;
 
 		buf = netsec_alloc_rx_data(priv, &dma_handle, &len);
-		if (!buf) {
-			netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
+		if (!buf)
 			goto err_out;
-		}
 		desc->dma_addr = dma_handle;
 		desc->addr = buf;
 		desc->len = len;
 	}
 
 	netsec_rx_fill(priv, 0, DESC_NUM);
+	err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0);
+	if (err)
+		goto err_out;
+
+	err = xdp_rxq_info_reg_mem_model(&dring->xdp_rxq, MEM_TYPE_PAGE_SHARED,
+					 NULL);
+	if (err) {
+		xdp_rxq_info_unreg(&dring->xdp_rxq);
+		goto err_out;
+	}
 
 	return 0;
 
 err_out:
+	netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
 	return -ENOMEM;
 }
 
@@ -1290,6 +1402,7 @@ static int netsec_netdev_open(struct net_device *ndev)
 
 	pm_runtime_get_sync(priv->dev);
 
+	netsec_setup_tx_dring(priv);
 	ret = netsec_setup_rx_dring(priv);
 	if (ret) {
 		netif_err(priv, probe, priv->ndev,
@@ -1387,6 +1500,9 @@ static int netsec_netdev_init(struct net_device *ndev)
 	if (ret)
 		goto err2;
 
+	spin_lock_init(&priv->desc_ring[NETSEC_RING_TX].lock);
+	spin_lock_init(&priv->desc_ring[NETSEC_RING_RX].lock);
+
 	return 0;
 err2:
 	netsec_free_dring(priv, NETSEC_RING_RX);
@@ -1419,6 +1535,179 @@ static int netsec_netdev_ioctl(struct net_device *ndev, struct ifreq *ifr,
 	return phy_mii_ioctl(ndev->phydev, ifr, cmd);
 }
 
+/* The current driver only supports 1 Txq, this should run under spin_lock() */
+static u32 netsec_xdp_queue_one(struct netsec_priv *priv,
+				struct xdp_frame *xdpf)
+
+{
+	struct netsec_desc_ring *tx_ring = &priv->desc_ring[NETSEC_RING_TX];
+	struct netsec_tx_pkt_ctrl tx_ctrl = {};
+	struct netsec_desc tx_desc;
+	dma_addr_t dma_handle;
+	u16 filled;
+
+	if (tx_ring->head >= tx_ring->tail)
+		filled = tx_ring->head - tx_ring->tail;
+	else
+		filled = tx_ring->head + DESC_NUM - tx_ring->tail;
+
+	if (DESC_NUM - filled <= 1)
+		return NETSEC_XDP_CONSUMED;
+
+	dma_handle = dma_map_single(priv->dev, xdpf->data, xdpf->len,
+				    DMA_TO_DEVICE);
+	if (dma_mapping_error(priv->dev, dma_handle))
+		return NETSEC_XDP_CONSUMED;
+
+	tx_ctrl.cksum_offload_flag = false;
+	tx_ctrl.tcp_seg_offload_flag = false;
+	tx_ctrl.tcp_seg_len = 0;
+
+	tx_desc.dma_addr = dma_handle;
+	tx_desc.addr = xdpf->data;
+	tx_desc.len = xdpf->len;
+
+	netsec_set_tx_de(priv, tx_ring, &tx_ctrl, &tx_desc, NULL);
+
+	return NETSEC_XDP_TX;
+}
+
+static int netsec_xdp_xmit(struct net_device *ndev, int n,
+			   struct xdp_frame **frames, u32 flags)
+{
+	struct netsec_priv *priv = netdev_priv(ndev);
+	struct netsec_desc_ring *tx_ring = &priv->desc_ring[NETSEC_RING_TX];
+	int drops = 0;
+	int i;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	spin_lock(&tx_ring->lock);
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
+
+		err = netsec_xdp_queue_one(priv, xdpf);
+		if (err != NETSEC_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		} else {
+			tx_ring->xdp_xmit++;
+		}
+	}
+	spin_unlock(&tx_ring->lock);
+
+	if (unlikely(flags & XDP_XMIT_FLUSH)) {
+		netsec_xdp_ring_tx_db(priv, tx_ring->xdp_xmit);
+		tx_ring->xdp_xmit = 0;
+	}
+
+	return n - drops;
+}
+
+static u32 netsec_xdp_xmit_back(struct netsec_priv *priv, struct xdp_buff *xdp)
+{
+	struct netsec_desc_ring *tx_ring = &priv->desc_ring[NETSEC_RING_TX];
+	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);
+	u32 ret;
+
+	if (unlikely(!xdpf))
+		return NETSEC_XDP_CONSUMED;
+
+	spin_lock(&tx_ring->lock);
+	ret = netsec_xdp_queue_one(priv, xdpf);
+	spin_unlock(&tx_ring->lock);
+
+	return ret;
+}
+
+static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog,
+			  struct xdp_buff *xdp)
+{
+	u32 ret = NETSEC_XDP_PASS;
+	int err;
+	u32 act;
+
+	rcu_read_lock();
+	act = bpf_prog_run_xdp(prog, xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		ret = NETSEC_XDP_PASS;
+		break;
+	case XDP_TX:
+		ret = netsec_xdp_xmit_back(priv, xdp);
+		if (ret != NETSEC_XDP_TX)
+			xdp_return_buff(xdp);
+		break;
+	case XDP_REDIRECT:
+		err = xdp_do_redirect(priv->ndev, xdp, prog);
+		if (!err) {
+			ret = NETSEC_XDP_REDIR;
+		} else {
+			ret = NETSEC_XDP_CONSUMED;
+			xdp_return_buff(xdp);
+		}
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		/* fall through */
+	case XDP_ABORTED:
+		trace_xdp_exception(priv->ndev, prog, act);
+		/* fall through -- handle aborts by dropping packet */
+	case XDP_DROP:
+		ret = NETSEC_XDP_CONSUMED;
+		xdp_return_buff(xdp);
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int netsec_xdp_setup(struct netsec_priv *priv, struct bpf_prog *prog,
+			    struct netlink_ext_ack *extack)
+{
+	struct net_device *dev = priv->ndev;
+	struct bpf_prog *old_prog;
+
+	/* For now just support only the usual MTU sized frames */
+	if (prog && dev->mtu > 1500) {
+		NL_SET_ERR_MSG_MOD(extack, "Jumbo frames not supported on XDP");
+		return -EOPNOTSUPP;
+	}
+
+	if (netif_running(dev))
+		netsec_netdev_stop(dev);
+
+	/* Detach old prog, if any */
+	old_prog = xchg(&priv->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	if (netif_running(dev))
+		netsec_netdev_open(dev);
+
+	return 0;
+}
+
+static int netsec_xdp(struct net_device *ndev, struct netdev_bpf *xdp)
+{
+	struct netsec_priv *priv = netdev_priv(ndev);
+
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return netsec_xdp_setup(priv, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops netsec_netdev_ops = {
 	.ndo_init		= netsec_netdev_init,
 	.ndo_uninit		= netsec_netdev_uninit,
@@ -1429,6 +1718,8 @@ static const struct net_device_ops netsec_netdev_ops = {
 	.ndo_set_mac_address    = eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_do_ioctl		= netsec_netdev_ioctl,
+	.ndo_xdp_xmit		= netsec_xdp_xmit,
+	.ndo_bpf		= netsec_xdp,
 };
 
 static int netsec_of_probe(struct platform_device *pdev,
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-09-29 11:28 ` [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA Ilias Apalodimas
@ 2018-10-01  9:26   ` Jesper Dangaard Brouer
  2018-10-01  9:44     ` Ilias Apalodimas
  0 siblings, 1 reply; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2018-10-01  9:26 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki, brouer,
	Tariq Toukan, Tariq Toukan


On Sat, 29 Sep 2018 14:28:01 +0300 Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> +static void *netsec_alloc_rx_data(struct netsec_priv *priv,
> +				  dma_addr_t *dma_handle, u16 *desc_len)
> +{
> +	size_t len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN + NET_SKB_PAD +
> +		NET_IP_ALIGN;
> +	dma_addr_t mapping;
> +	void *buf;
> +
> +	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	len = SKB_DATA_ALIGN(len);
> +
> +	buf = napi_alloc_frag(len);

Using napi_alloc_frag here ^^^^

> +	if (!buf)
> +		return NULL;
> +
> +	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);
> +	if (unlikely(dma_mapping_error(priv->dev, mapping)))
> +		goto err_out;
> +
> +	*dma_handle = mapping;
> +	*desc_len = len;
> +
> +	return buf;
> +
> +err_out:
> +	skb_free_frag(buf);
> +	return NULL;
> +}

Hmmm, you are using napi_alloc_frag() in the above code, which behind
your back allocates order-3 pages (32 Kbytes of memory in 8 order-0 pages).

This violates at least two XDP principles:

#1: You are NOT using order-0 page based allocations for XDP.

Notice, I'm not saying 1-page per packet, as ixgbe + i40e violated
this, and it is now "per-practical-code-example" acceptable to split up
the order-0 page, and store two RX frames per order-0 page (4096 bytes).
(To make this fit you have to reduce XDP_HEADROOM to 192 bytes, which
killed the idea of placing the SKB in this area).
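
As a rough sketch of the buffer budget this split implies on 4K pages
(the macro names below are made up for illustration, not taken from
ixgbe/i40e or netsec, and SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
is taken to be roughly 320 bytes on 64-bit):

  /* Two RX frames per order-0 page, with the reduced 192-byte headroom */
  #define RX_HALF_PAGE      (PAGE_SIZE / 2)  /* 2048 bytes on 4K pages */
  #define RX_XDP_HEADROOM   192              /* instead of XDP_PACKET_HEADROOM (256) */
  #define RX_DATA_MAX       (RX_HALF_PAGE - RX_XDP_HEADROOM - \
                             SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
  /* roughly 2048 - 192 - 320 = 1536 bytes left for data, enough for a
   * 1514-byte MTU-sized frame; with the full 256-byte headroom only
   * ~1472 bytes would remain.
   */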

#2: You have allocations on the XDP fast-path.

The REAL secret behind the XDP performance is to avoid allocations on
the fast-path.  While I just told you to use the page-allocator and
order-0 pages, this will actually kill performance.  Thus, to make this
fast, you need a driver local recycle scheme that avoids going through
the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
For the XDP_REDIRECT action (which you seem to be interested in, as
this is needed for AF_XDP), there is a xdp_return_frame() API that can
make this fast.

To avoid every driver inventing its own driver-local page-recycle
cache (which many do today), we/I have created the page pool API.
See include/net/page_pool.h, and look at how the mlx5 driver uses it
in v4.18, links [1][2][3].  Do notice that mlx5 ALSO has a driver
recycle scheme on top, which Tariq is working on removing or
generalizing, AND also that mlx5 does not yet use the DMA mapping
feature that page_pool provides. (Contact me if you want to use
page_pool for handling DMA mapping; we might need to export
__page_pool_clean_page and call it before the XDP_PASS action).


[1] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L226
[2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L255
[3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_main.c#L598-L618
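
For reference, a minimal sketch of what hooking the netsec Rx ring up to
page_pool could look like, assuming the v4.18-era API from
include/net/page_pool.h and the xdp_rxq added in patch 2/2 (the page_pool
member on the ring and the helper name are hypothetical, and teardown
handling is elided):

  #include <net/page_pool.h>

  /* Create a pool of DESC_NUM order-0 pages that page_pool keeps
   * DMA-mapped for us, and register it as the memory model of the Rx
   * ring so xdp_return_frame() recycles pages back into the pool.
   */
  static int netsec_create_page_pool(struct netsec_priv *priv,
                                     struct netsec_desc_ring *dring)
  {
          struct page_pool_params pp_params = {
                  .flags     = PP_FLAG_DMA_MAP,
                  .order     = 0,
                  .pool_size = DESC_NUM,
                  .nid       = NUMA_NO_NODE,
                  .dev       = priv->dev,
                  .dma_dir   = DMA_FROM_DEVICE,
          };

          dring->page_pool = page_pool_create(&pp_params); /* hypothetical member */
          if (IS_ERR(dring->page_pool))
                  return PTR_ERR(dring->page_pool);

          /* Rx buffers then come from page_pool_dev_alloc_pages()
           * instead of napi_alloc_frag().
           */
          return xdp_rxq_info_reg_mem_model(&dring->xdp_rxq,
                                            MEM_TYPE_PAGE_POOL,
                                            dring->page_pool);
  }
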
-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-10-01  9:26   ` Jesper Dangaard Brouer
@ 2018-10-01  9:44     ` Ilias Apalodimas
  2018-10-01  9:56       ` Ilias Apalodimas
  0 siblings, 1 reply; 13+ messages in thread
From: Ilias Apalodimas @ 2018-10-01  9:44 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki,
	Tariq Toukan, Tariq Toukan

On Mon, Oct 01, 2018 at 11:26:31AM +0200, Jesper Dangaard Brouer wrote:
> 
> On Sat, 29 Sep 2018 14:28:01 +0300 Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> 
> > +static void *netsec_alloc_rx_data(struct netsec_priv *priv,
> > +				  dma_addr_t *dma_handle, u16 *desc_len)
> > +{
> > +	size_t len = priv->ndev->mtu + ETH_HLEN + 2 * VLAN_HLEN + NET_SKB_PAD +
> > +		NET_IP_ALIGN;
> > +	dma_addr_t mapping;
> > +	void *buf;
> > +
> > +	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > +	len = SKB_DATA_ALIGN(len);
> > +
> > +	buf = napi_alloc_frag(len);
> 
> Using napi_alloc_frag here ^^^^
> 
> > +	if (!buf)
> > +		return NULL;
> > +
> > +	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);
> > +	if (unlikely(dma_mapping_error(priv->dev, mapping)))
> > +		goto err_out;
> > +
> > +	*dma_handle = mapping;
> > +	*desc_len = len;
> > +
> > +	return buf;
> > +
> > +err_out:
> > +	skb_free_frag(buf);
> > +	return NULL;
> > +}
> 
> Hmmm, you are using napi_alloc_frag() in above code, which behind
> your-back allocates order-3 pages (32 Kbytes memory in 8 order-0 pages).
> 
> This violates at-least two XDP principals:
> 
> #1: You are NOT using order-0 page based allocations for XDP.
> 
> Notice, I'm not saying 1-page per packet, as ixgbe + i40e violated
> this, and it is now "per-practical-code-example" acceptable to split up
> the order-0 page, and store two RX frames per order-0 page (4096 bytes).
> (To make this fit you have to reduce XDP_HEADROOM to 192 bytes, which
> killed the idea of placing the SKB in this area).
Yes, I saw the Intel implementation. I just thought it wasn't worth the hassle
for a 1Gbit interface (but I wasn't aware it violates an XDP principle).
I also noticed that Netronome (and others) are allocating 1 page per packet
when using XDP.
> 
> #2: You have allocations on the XDP fast-path.
> 
> The REAL secret behind the XDP performance is to avoid allocations on
> the fast-path.  While I just told you to use the page-allocator and
> order-0 pages, this will actually kill performance.  Thus, to make this
> fast, you need a driver local recycle scheme that avoids going through
> the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
> For the XDP_REDIRECT action (which you seems to be interested in, as
> this is needed for AF_XDP), there is a xdp_return_frame() API that can
> make this fast.
I had an initial implementation that did exactly that (that's why the
dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case
of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
buffers back to the hardware only when your packets have been processed by
your userspace application
> 
> To avoid every driver inventing their own driver local page-recycle
> cache (which many does today), we/I have created the page pool API.
> See include/net/page_pool.h, and look at how mlx5 driver uses it
> in v4.18 links[1][2][3].  Do notice, that mlx5 ALSO have a driver
> recycle scheme on top, which Tariq is working on removing or
> generalizing.  AND also that mlx5 does not use the DMA mapping feature
> that page_pool also provide yet. (Contact me if you want to use
> page_pool for handing DMA mapping, we might need to export
> __page_pool_clean_page and call it before XDP_PASS action).
Ok, I'll have a look at that and let you know.

P.S.: A few months back we reported that Chelsio is using a different
'memory scheme' for incoming packets. Essentially they just feed the hardware
with unstructured pages and it decides where to place
the packet. Maybe that's worth considering for the page pool API?

> 
> 
> [1] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L226
> [2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L255
> [3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_main.c#L598-L618


Thanks
/Ilias

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-10-01  9:44     ` Ilias Apalodimas
@ 2018-10-01  9:56       ` Ilias Apalodimas
  2018-10-01 11:03         ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 13+ messages in thread
From: Ilias Apalodimas @ 2018-10-01  9:56 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki,
	Tariq Toukan, Tariq Toukan

> > #2: You have allocations on the XDP fast-path.
> > 
> > The REAL secret behind the XDP performance is to avoid allocations on
> > the fast-path.  While I just told you to use the page-allocator and
> > order-0 pages, this will actually kill performance.  Thus, to make this
> > fast, you need a driver local recycle scheme that avoids going through
> > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
> > For the XDP_REDIRECT action (which you seems to be interested in, as
> > this is needed for AF_XDP), there is a xdp_return_frame() API that can
> > make this fast.
> I had an initial implementation that did exactly that (that's why you the
> dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 
> of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
> buffers back to the hardware only when your packets have been processed from
> your userspace application
Just a clarification here. This is the case if ZC is implemented. In my case
the buffers will be 'ok' to be passed back to the hardware once the
userspace payload has been copied by xdp_do_redirect()

/Ilias

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-10-01  9:56       ` Ilias Apalodimas
@ 2018-10-01 11:03         ` Jesper Dangaard Brouer
  2018-10-01 11:20           ` Ilias Apalodimas
  0 siblings, 1 reply; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2018-10-01 11:03 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki,
	Tariq Toukan, Tariq Toukan, brouer

On Mon, 1 Oct 2018 12:56:58 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> > > #2: You have allocations on the XDP fast-path.
> > > 
> > > The REAL secret behind the XDP performance is to avoid allocations on
> > > the fast-path.  While I just told you to use the page-allocator and
> > > order-0 pages, this will actually kill performance.  Thus, to make this
> > > fast, you need a driver local recycle scheme that avoids going through
> > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
> > > For the XDP_REDIRECT action (which you seems to be interested in, as
> > > this is needed for AF_XDP), there is a xdp_return_frame() API that can
> > > make this fast.  
> >
> > I had an initial implementation that did exactly that (that's why you the
> > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 
> > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
> > buffers back to the hardware only when your packets have been processed from
> > your userspace application 
>
> Just a clarification here. This is the case if ZC is implemented. In my case
> the buffers will be 'ok' to be passed back to the hardware once the use
> userspace payload has been copied by xdp_do_redirect()

Thanks for clarifying.  But no, this is not introducing a 'bottleneck'
for AF_XDP.

For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or
"returned" very quickly after it is copied.  The code is a bit hard to
follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.
Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.

For (2) the zero-copy-AF_XDP, you need to implement a new
allocator of type MEM_TYPE_ZERO_COPY.  The performance trick here is
that all DMA-map/unmap and allocations go away, given everything is
preallocated by userspace.  The 4 rings (SPSC) are then used for
recycling the ZC-umem frames (read Documentation/networking/af_xdp.rst).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-10-01 11:03         ` Jesper Dangaard Brouer
@ 2018-10-01 11:20           ` Ilias Apalodimas
  2018-10-01 13:48             ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 13+ messages in thread
From: Ilias Apalodimas @ 2018-10-01 11:20 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki,
	Tariq Toukan, Tariq Toukan

On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 1 Oct 2018 12:56:58 +0300
> Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> 
> > > > #2: You have allocations on the XDP fast-path.
> > > > 
> > > > The REAL secret behind the XDP performance is to avoid allocations on
> > > > the fast-path.  While I just told you to use the page-allocator and
> > > > order-0 pages, this will actually kill performance.  Thus, to make this
> > > > fast, you need a driver local recycle scheme that avoids going through
> > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
> > > > For the XDP_REDIRECT action (which you seems to be interested in, as
> > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can
> > > > make this fast.  
> > >
> > > I had an initial implementation that did exactly that (that's why you the
> > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 
> > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
> > > buffers back to the hardware only when your packets have been processed from
> > > your userspace application 
> >
> > Just a clarification here. This is the case if ZC is implemented. In my case
> > the buffers will be 'ok' to be passed back to the hardware once the use
> > userspace payload has been copied by xdp_do_redirect()
> 
> Thanks for clarifying.  But no, this is not introducing a 'bottleneck'
> for AF_XDP.
> 
> For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or
> "returned" very quickly after it is copied.  The code is a bit hard to
> follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.
> Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.
Ok, makes sense. I'll send a v4 with page reuse, while using your API for page
allocation

> 
> For (2) the zero-copy-AF_XDP, then you need to implement a new
> allocator of type MEM_TYPE_ZERO_COPY.  The performance trick here is
> that all DMA-map/unmap and allocations go away, given everything is
> preallocated by userspace.  Through the 4 rings (SPSC) are used for
> recycling the ZC-umem frames (read Documentation/networking/af_xdp.rst).
Noted in case we implement ZC support

Thanks
/Ilias

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 0/2, v3] net: socionext: XDP support
  2018-09-29 11:28 [net-next, PATCH 0/2, v3] net: socionext: XDP support Ilias Apalodimas
  2018-09-29 11:28 ` [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA Ilias Apalodimas
  2018-09-29 11:28 ` [net-next, PATCH 2/2, v3] net: socionext: add XDP support Ilias Apalodimas
@ 2018-10-01 12:48 ` Björn Töpel
  2018-10-01 13:59   ` Ilias Apalodimas
  2 siblings, 1 reply; 13+ messages in thread
From: Björn Töpel @ 2018-10-01 12:48 UTC (permalink / raw)
  To: Ilias Apalodimas, netdev, jaswinder.singh
  Cc: ard.biesheuvel, masami.hiramatsu, arnd, magnus.karlsson, brouer,
	daniel, ast, jesus.sanchez-palencia, vinicius.gomes,
	makita.toshiaki

On 2018-09-29 13:28, Ilias Apalodimas wrote:
> This patch series adds AF_XDP support socionext netsec driver

This series adds *XDP* support and as a result, the AF_XDP batteries
are included. ;-)


Björn

> In addition the new dma allocation scheme offers a 10% boost on Rx
> pps rate using 64b packets
> 
> - patch [1/2]: Use a different allocation scheme for Rx DMA buffers to
>    prepare the driver for AF_XDP support
> - patch [2/2]: Add XDP support without zero-copy
> 
> test and performance numbers (64b packets):
> -------------------------------------------
> - Normal SKBs on Rx: ~217kpps
> test: pktgen -> intel i210 -> netsec -> XDP_TX/XDP_REDIRECT
> - XDP_TX: 320kpps
> - XDP_REDIRECT: 320kpps
> 
> qemu -> pktgen -> virtio -> ndo_xdp_xmit -> netsec
> - ndo_xdp_xmit: Could not send more than 120kpps. Interface forwarded that
>                  with success
> 
> Changes since v2:
>   - Always allocate Rx buffers with XDP_PACKET_HEADROOM
>   
>   Björn Töpel:
>   - Added locking in the Tx queue
> 
>   Jesper Dangaard Brouer:
>   - Added support for .ndo_xdp_xmit
>   - XDP_TX does not flush every packet
> 
> Changes since v1:
> - patch [2/2]:
>   Toshiaki Makita:
>   - Added XDP_PACKET_HEADROOM
>   - Fixed a bug on XDP_PASS case
>   - Using extact for error messaging instead of netdev_warn, when
>     trying to setup XDP
> 
> Ilias Apalodimas (2):
>    net: socionext: different approach on DMA
>    net: socionext: add XDP support
> 
>   drivers/net/ethernet/socionext/netsec.c | 541 +++++++++++++++++++++++++-------
>   1 file changed, 426 insertions(+), 115 deletions(-)
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-10-01 11:20           ` Ilias Apalodimas
@ 2018-10-01 13:48             ` Jesper Dangaard Brouer
  2018-10-01 14:37               ` Ilias Apalodimas
  0 siblings, 1 reply; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2018-10-01 13:48 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki,
	Tariq Toukan, Tariq Toukan, brouer

On Mon, 1 Oct 2018 14:20:21 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:
> > On Mon, 1 Oct 2018 12:56:58 +0300
> > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> >   
> > > > > #2: You have allocations on the XDP fast-path.
> > > > > 
> > > > > The REAL secret behind the XDP performance is to avoid allocations on
> > > > > the fast-path.  While I just told you to use the page-allocator and
> > > > > order-0 pages, this will actually kill performance.  Thus, to make this
> > > > > fast, you need a driver local recycle scheme that avoids going through
> > > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
> > > > > For the XDP_REDIRECT action (which you seems to be interested in, as
> > > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can
> > > > > make this fast.    
> > > >
> > > > I had an initial implementation that did exactly that (that's why you the
> > > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 
> > > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
> > > > buffers back to the hardware only when your packets have been processed from
> > > > your userspace application   
> > >
> > > Just a clarification here. This is the case if ZC is implemented. In my case
> > > the buffers will be 'ok' to be passed back to the hardware once the use
> > > userspace payload has been copied by xdp_do_redirect()  
> > 
> > Thanks for clarifying.  But no, this is not introducing a 'bottleneck'
> > for AF_XDP.
> > 
> > For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or
> > "returned" very quickly after it is copied.  The code is a bit hard to
> > follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.
> > Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.
>  
> Ok makes sense. I'll send a v4 with page re-usage, while using your
> API for page allocation

Sounds good, BUT do notice that using the bare page_pool will/should
give you increased XDP performance, but might slow down normal network
stack delivery, because netstack will not call xdp_return_frame() and
instead falls back to returning the pages through the page allocator.

I'm very interested in knowing what performance increase you see with
XDP_DROP, with just a "bare" page_pool implementation.

The mlx5 driver does not see this netstack slowdown, because it has a
hybrid approach of maintaining a recycle ring for frames going into
netstack, by bumping the refcnt.  I think Tariq is cleaning this up.
The mlx5 code is hard to follow... in mlx5e_xdp_handle()[1] the
refcnt==1 and a bit is set. And in [2] the refcnt is page_ref_inc(),
and the bit is caught in [3].  (This really needs to be cleaned up and
generalized).



[1] https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c#L83-L88
    https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L952-L959

[2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1015-L1025

[3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1094-L1098

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 0/2, v3] net: socionext: XDP support
  2018-10-01 12:48 ` [net-next, PATCH 0/2, v3] net: socionext: " Björn Töpel
@ 2018-10-01 13:59   ` Ilias Apalodimas
  0 siblings, 0 replies; 13+ messages in thread
From: Ilias Apalodimas @ 2018-10-01 13:59 UTC (permalink / raw)
  To: Björn Töpel
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	magnus.karlsson, brouer, daniel, ast, jesus.sanchez-palencia,
	vinicius.gomes, makita.toshiaki

On Mon, Oct 01, 2018 at 02:48:54PM +0200, Björn Töpel wrote:
> On 2018-09-29 13:28, Ilias Apalodimas wrote:
> >This patch series adds AF_XDP support socionext netsec driver
> 
> This series adds *XDP* support and as a result, the AF_XDP batteries
> are included. ;-)
> 
Noted, thanks for catching this. I replaced all the other AF_XDP -> XDP but it
seems like I missed this one!


/Ilias

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-10-01 13:48             ` Jesper Dangaard Brouer
@ 2018-10-01 14:37               ` Ilias Apalodimas
  2018-10-01 15:58                 ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 13+ messages in thread
From: Ilias Apalodimas @ 2018-10-01 14:37 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki,
	Tariq Toukan, Tariq Toukan

On Mon, Oct 01, 2018 at 03:48:45PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 1 Oct 2018 14:20:21 +0300
> Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> 
> > On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:
> > > On Mon, 1 Oct 2018 12:56:58 +0300
> > > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> > >   
> > > > > > #2: You have allocations on the XDP fast-path.
> > > > > > 
> > > > > > The REAL secret behind the XDP performance is to avoid allocations on
> > > > > > the fast-path.  While I just told you to use the page-allocator and
> > > > > > order-0 pages, this will actually kill performance.  Thus, to make this
> > > > > > fast, you need a driver local recycle scheme that avoids going through
> > > > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
> > > > > > For the XDP_REDIRECT action (which you seems to be interested in, as
> > > > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can
> > > > > > make this fast.    
> > > > >
> > > > > I had an initial implementation that did exactly that (that's why you the
> > > > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case 
> > > > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
> > > > > buffers back to the hardware only when your packets have been processed from
> > > > > your userspace application   
> > > >
> > > > Just a clarification here. This is the case if ZC is implemented. In my case
> > > > the buffers will be 'ok' to be passed back to the hardware once the use
> > > > userspace payload has been copied by xdp_do_redirect()  
> > > 
> > > Thanks for clarifying.  But no, this is not introducing a 'bottleneck'
> > > for AF_XDP.
> > > 
> > > For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or
> > > "returned" very quickly after it is copied.  The code is a bit hard to
> > > follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.
> > > Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.
> >  
> > Ok makes sense. I'll send a v4 with page re-usage, while using your
> > API for page allocation
> 
> Sound good, BUT do notice that using the bare page_pool, will/should
> give you increased XDP performance, but might slow-down normal network
> stack delivery, because netstack will not call xdp_return_frame() and
> instead falls back to returning the pages through the page-allocator.
> 
> I'm very interested in knowing what performance increase you see with
> XDP_DROP, with just a "bare" page_pool implementation.
When i was just syncing the page fragments instead of unmap -> alloc -> map i
was getting ~340kpps (with XDP_REDIRECT). I ended up with 320kpps on this patch.
I did a couple of more changes though (like the dma mapping when allocating 
the buffers) so i am not 100% sure what caused that difference
I'll let you know once i finish up the code using the API for page allocation

Regarding the change and the 'bottleneck' discussion we had: XDP_REDIRECT is
straightforward (non-ZC mode). I agree with you that since the payload is
pretty much immediately copied before being flushed to userspace, it's
unlikely you'll end up delaying the hardware (starving it of buffers).
Do you think that's the same for XDP_TX? The DMA buffer will need to be synced
for the CPU, then you ring a doorbell with X packets. After that you'll have to
wait for the Tx completion and resync the buffers to the device. So you actually
make your Rx descriptors depend on your Tx completion (and keep in mind this
NIC only has 1 queue per direction).

Now for the measurements part, I'll have to check with the vendor whether the
interface can do more than 340kpps and whether we are missing something
performance-wise.
Have you done any tests with the IOMMU enabled/disabled? In theory the DMA recycling
will shine against map/unmap when the IOMMU is on (and the IOMMU is stressed, i.e.
a different NIC is doing a traffic test at the same time).

> 
> The mlx5 driver does not see this netstack slowdown, because it has a
> hybrid approach of maintaining a recycle ring for frames going into
> netstack, by bumping the refcnt.  I think Tariq is cleaning this up.
> The mlx5 code is hard to follow... in mlx5e_xdp_handle()[1] the
> refcnt==1 and a bit is set. And in [2] the refcnt is page_ref_inc(),
> and the bit is caught in [3].  (This really needs to be cleaned up and
> generalized).
I've read most of the XDP-related code in the Intel/Mellanox drivers
before starting my patch series. I'll have a closer look now, thanks!
> 
> 
> 
> [1] https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c#L83-L88
>     https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L952-L959
> 
> [2] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1015-L1025
> 
> [3] https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1094-L1098
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA
  2018-10-01 14:37               ` Ilias Apalodimas
@ 2018-10-01 15:58                 ` Jesper Dangaard Brouer
  0 siblings, 0 replies; 13+ messages in thread
From: Jesper Dangaard Brouer @ 2018-10-01 15:58 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	bjorn.topel, magnus.karlsson, daniel, ast,
	jesus.sanchez-palencia, vinicius.gomes, makita.toshiaki,
	Tariq Toukan, Tariq Toukan, brouer

On Mon, 1 Oct 2018 17:37:06 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> On Mon, Oct 01, 2018 at 03:48:45PM +0200, Jesper Dangaard Brouer wrote:
> > On Mon, 1 Oct 2018 14:20:21 +0300
> > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> >   
> > > On Mon, Oct 01, 2018 at 01:03:13PM +0200, Jesper Dangaard Brouer wrote:  
> > > > On Mon, 1 Oct 2018 12:56:58 +0300
> > > > Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> > > >     
> > > > > > > #2: You have allocations on the XDP fast-path.
> > > > > > > 
> > > > > > > The REAL secret behind the XDP performance is to avoid allocations on
> > > > > > > the fast-path.  While I just told you to use the page-allocator and
> > > > > > > order-0 pages, this will actually kill performance.  Thus, to make this
> > > > > > > fast, you need a driver local recycle scheme that avoids going through
> > > > > > > the page allocator, which makes XDP_DROP and XDP_TX extremely fast.
> > > > > > > For the XDP_REDIRECT action (which you seem to be interested in, as
> > > > > > > this is needed for AF_XDP), there is a xdp_return_frame() API that can
> > > > > > > make this fast.      
> > > > > >
> > > > > > I had an initial implementation that did exactly that (that's why the
> > > > > > dma_sync_single_for_cpu() -> dma_unmap_single_attrs() is there). In the case
> > > > > > of AF_XDP isn't that introducing a 'bottleneck' though? I mean you'll feed fresh
> > > > > > buffers back to the hardware only when your packets have been processed by
> > > > > > your userspace application
> > > > >
> > > > > Just a clarification here. This is the case if ZC is implemented. In my case
> > > > > the buffers will be 'ok' to be passed back to the hardware once the
> > > > > userspace payload has been copied by xdp_do_redirect()
> > > > 
> > > > Thanks for clarifying.  But no, this is not introducing a 'bottleneck'
> > > > for AF_XDP.
> > > > 
> > > > For (1) the copy-mode-AF_XDP the frame (as you noticed) is "freed" or
> > > > "returned" very quickly after it is copied.  The code is a bit hard to
> > > > follow, but in __xsk_rcv() it calls xdp_return_buff() after the memcpy.
> > > > Thus, the frame can be kept DMA mapped and reused in RX-ring quickly.  
> > >  
> > > Ok makes sense. I'll send a v4 with page re-usage, while using your
> > > API for page allocation  
> > 
> > Sounds good, BUT do notice that using the bare page_pool will/should
> > give you increased XDP performance, but might slow down normal network
> > stack delivery, because netstack will not call xdp_return_frame() and
> > instead falls back to returning the pages through the page allocator.
> > 
> > I'm very interested in knowing what performance increase you see with
> > XDP_DROP, with just a "bare" page_pool implementation.  
>
> When I was just syncing the page fragments instead of unmap -> alloc -> map I
> was getting ~340kpps (with XDP_REDIRECT). I ended up with 320kpps on this patch.

I explicitly asked for the XDP_DROP performance... because it will tell
me/us if your hardware is actually the bottleneck.  For your specific
hardware, you might be limited by the cost of DMA-sync.  It might be
faster to use the DMA-map/unmap calls(?).

I'm hinting that you should take one step at a time, and measure.  Knowing
and identifying the limits is essential; otherwise you are doing blind
optimizations. If you don't know the HW limit, then you don't know what
the gap to the optimum is (and then you don't know when to stop optimizing).
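
For reference, the XDP_DROP baseline needs nothing more than a trivial program
dropping everything on the netsec interface. A minimal sketch (where exactly
bpf_helpers.h lives depends on your tree; samples/bpf and the bpf selftests
both carry a copy):

/* xdp_drop.c - minimal XDP_DROP baseline */
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* provides SEC() */

SEC("xdp")
int xdp_drop_prog(struct xdp_md *ctx)
{
	return XDP_DROP;	/* drop every frame as early as possible */
}

char _license[] SEC("license") = "GPL";

Attach it with something like "ip link set dev <netsec-iface> xdp obj
xdp_drop.o sec xdp" and compare the pps you reach against the pktgen rate;
that number is (close to) the HW/driver ceiling you are optimizing towards.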


> I did a couple more changes though (like the DMA mapping when allocating
> the buffers), so I am not 100% sure what caused that difference.
> I'll let you know once I finish up the code using the API for page allocation.
>
> Regarding the change and the 'bottleneck' discussion we had: XDP_REDIRECT is
> straightforward (non-ZC mode). I agree with you that since the payload is
> pretty much immediately copied before being flushed to userspace, it's
> unlikely you'll end up delaying the hardware (starving it of buffers).
> Do you think that's the same for XDP_TX? The DMA buffer will need to be synced
> for the CPU, then you ring a doorbell with X packets. After that you'll have to
> wait for the Tx completion and resync the buffers to the device. So you actually
> make your Rx descriptors depend on your Tx completion (and keep in mind this
> NIC only has 1 queue per direction).

The page_pool will cache pages (it has a pool of pages) that should be
large enough to handle some pages being outstanding in the TX completion
queue.
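
In code terms, the sizing I have in mind looks roughly like this (a sketch
against the current page_pool API, not the actual netsec patch; the names and
the tx_inflight headroom are illustrative):

#include <linux/err.h>
#include <linux/numa.h>
#include <linux/dma-mapping.h>
#include <net/page_pool.h>
#include <net/xdp.h>

/* Sketch: size the pool for the whole Rx ring plus headroom for pages that
 * are still sitting in the Tx completion queue after XDP_TX.
 */
static struct page_pool *rx_page_pool_create(struct device *dev,
					     struct xdp_rxq_info *rxq,
					     unsigned int rx_ring_size,
					     unsigned int tx_inflight)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP,	/* pool does the DMA mapping */
		.order		= 0,			/* order-0 pages only */
		.pool_size	= rx_ring_size + tx_inflight,
		.nid		= NUMA_NO_NODE,
		.dev		= dev,
		.dma_dir	= DMA_BIDIRECTIONAL,	/* Rx and XDP_TX */
	};
	struct page_pool *pool = page_pool_create(&pp_params);

	if (IS_ERR(pool))
		return pool;

	/* Route xdp_return_frame() for this rxq back into the pool instead of
	 * the page allocator (error handling elided in this sketch).
	 */
	xdp_rxq_info_reg_mem_model(rxq, MEM_TYPE_PAGE_POOL, pool);
	return pool;
}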

>
> Now for the measurements part, I'll have to check with the vendor whether the
> interface can do more than 340kpps and whether we are missing something
> performance-wise.

Yes, please. The XDP_DROP test I requested above is exactly an attempt
to determine what the NIC HW limits are... else you are working blindly.


> Have you done any tests with the IOMMU enabled/disabled? In theory the DMA recycling
> will shine against map/unmap when the IOMMU is on (and the IOMMU is stressed, i.e.
> a different NIC is doing a traffic test at the same time).

Nope, I could not get the IOMMU working on my testlab the last time I
tried to activate it.  Hence why I have not implemented/optimized the DMA
map/unmap stuff too much (e.g. mlx5 currently does a DMA unmap for
XDP_REDIRECT, which should be fixed).
 
 
> > The mlx5 driver does not see this netstack slowdown, because it
> > has a hybrid approach of maintaining a recycle ring for frames
> > going into netstack, by bumping the refcnt.  I think Tariq is
> > cleaning this up. The mlx5 code is hard to follow... in
> > mlx5e_xdp_handle()[1] the refcnt==1 and a bit is set. And in [2]
> > the refcnt is page_ref_inc(), and the bit is caught in [3].  (This
> > really needs to be cleaned up and generalized).
>
> I've read most of the XDP-related code in the Intel/Mellanox drivers
> before starting my patch series. I'll have a closer look now, thanks!
> > 
> > 
> > 
> > [1]
> > https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c#L83-L88
> > https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L952-L959
> > 
> > [2]
> > https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1015-L1025
> > 
> > [3]
> > https://github.com/torvalds/linux/blob/v4.18/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L1094-L1098



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-10-01 22:36 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-29 11:28 [net-next, PATCH 0/2, v3] net: socionext: XDP support Ilias Apalodimas
2018-09-29 11:28 ` [net-next, PATCH 1/2, v3] net: socionext: different approach on DMA Ilias Apalodimas
2018-10-01  9:26   ` Jesper Dangaard Brouer
2018-10-01  9:44     ` Ilias Apalodimas
2018-10-01  9:56       ` Ilias Apalodimas
2018-10-01 11:03         ` Jesper Dangaard Brouer
2018-10-01 11:20           ` Ilias Apalodimas
2018-10-01 13:48             ` Jesper Dangaard Brouer
2018-10-01 14:37               ` Ilias Apalodimas
2018-10-01 15:58                 ` Jesper Dangaard Brouer
2018-09-29 11:28 ` [net-next, PATCH 2/2, v3] net: socionext: add XDP support Ilias Apalodimas
2018-10-01 12:48 ` [net-next, PATCH 0/2, v3] net: socionext: " Björn Töpel
2018-10-01 13:59   ` Ilias Apalodimas
