* Add AF_XDP zero-copy support for NFP
@ 2022-03-04 10:22 Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 1/5] nfp: expose common functions to be used for AF_XDP Simon Horman
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Simon Horman @ 2022-03-04 10:22 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski, Niklas Söderlund; +Cc: netdev, oss-drivers

Niklas Söderlund says:

This series adds AF_XDP zero-copy support for the NFP driver. The series
is based on previous work done by Jakub Kicinski.

Patches 1/5 and 2/5 prepare the driver for AF_XDP support by refactoring
functions that will act differently depending on whether AF_XDP is active,
making the driver easier to read, and by preparing some functions to be
reused outside the local file scope. Patches 3/5 and 4/5 prepare the driver
for dealing with the UMEM, while finally patch 5/5 adds AF_XDP support.

Based on work by Jakub Kicinski.

Jakub Kicinski (1):
  nfp: wrap napi add/del logic

Niklas Söderlund (4):
  nfp: expose common functions to be used for AF_XDP
  nfp: xsk: add an array of xsk buffer pools to each data path
  nfp: xsk: add configuration check for XSK socket chunk size
  nfp: xsk: add AF_XDP zero-copy Rx and Tx support

 drivers/net/ethernet/netronome/nfp/Makefile   |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_net.h  |  47 +-
 .../ethernet/netronome/nfp/nfp_net_common.c   | 207 ++++--
 .../ethernet/netronome/nfp/nfp_net_debugfs.c  |  33 +-
 .../net/ethernet/netronome/nfp/nfp_net_xsk.c  | 592 ++++++++++++++++++
 .../net/ethernet/netronome/nfp/nfp_net_xsk.h  |  29 +
 6 files changed, 855 insertions(+), 54 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.h

-- 
2.20.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH net-next 1/5] nfp: expose common functions to be used for AF_XDP
  2022-03-04 10:22 Add AF_XDP zero-copy support for NFP Simon Horman
@ 2022-03-04 10:22 ` Simon Horman
  2022-03-04 13:10   ` patchwork-bot+netdevbpf
  2022-03-04 10:22 ` [PATCH net-next 2/5] nfp: wrap napi add/del logic Simon Horman
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 9+ messages in thread
From: Simon Horman @ 2022-03-04 10:22 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski, Niklas Söderlund; +Cc: netdev, oss-drivers

From: Niklas Söderlund <niklas.soderlund@corigine.com>

There is some common functionality that can be reused in the upcoming
AF_XDP support. Expose those functions in the header. While at it, mark
some arguments of nfp_net_rx_csum() as const.

Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h     | 14 ++++++++++++++
 .../net/ethernet/netronome/nfp/nfp_net_common.c  | 16 ++++++++--------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 0b1865e9f0b5..fa40d339df8d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -965,6 +965,7 @@ int nfp_net_mbox_reconfig_and_unlock(struct nfp_net *nn, u32 mbox_cmd);
 void nfp_net_mbox_reconfig_post(struct nfp_net *nn, u32 update);
 int nfp_net_mbox_reconfig_wait_posted(struct nfp_net *nn);
 
+void nfp_net_irq_unmask(struct nfp_net *nn, unsigned int entry_nr);
 unsigned int
 nfp_net_irqs_alloc(struct pci_dev *pdev, struct msix_entry *irq_entries,
 		   unsigned int min_irqs, unsigned int want_irqs);
@@ -973,6 +974,19 @@ void
 nfp_net_irqs_assign(struct nfp_net *nn, struct msix_entry *irq_entries,
 		    unsigned int n);
 
+void nfp_net_tx_xmit_more_flush(struct nfp_net_tx_ring *tx_ring);
+void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring, int budget);
+
+bool
+nfp_net_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta,
+		   void *data, void *pkt, unsigned int pkt_len, int meta_len);
+
+void nfp_net_rx_csum(const struct nfp_net_dp *dp,
+		     struct nfp_net_r_vector *r_vec,
+		     const struct nfp_net_rx_desc *rxd,
+		     const struct nfp_meta_parsed *meta,
+		     struct sk_buff *skb);
+
 struct nfp_net_dp *nfp_net_clone_dp(struct nfp_net *nn);
 int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *new,
 			  struct netlink_ext_ack *extack);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 79257ec41987..edf7b8716a70 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -381,7 +381,7 @@ int nfp_net_mbox_reconfig_and_unlock(struct nfp_net *nn, u32 mbox_cmd)
  *
  * Clear the ICR for the IRQ entry.
  */
-static void nfp_net_irq_unmask(struct nfp_net *nn, unsigned int entry_nr)
+void nfp_net_irq_unmask(struct nfp_net *nn, unsigned int entry_nr)
 {
 	nn_writeb(nn, NFP_NET_CFG_ICR(entry_nr), NFP_NET_CFG_ICR_UNMASKED);
 	nn_pci_flush(nn);
@@ -923,7 +923,7 @@ static void nfp_net_tls_tx_undo(struct sk_buff *skb, u64 tls_handle)
 #endif
 }
 
-static void nfp_net_tx_xmit_more_flush(struct nfp_net_tx_ring *tx_ring)
+void nfp_net_tx_xmit_more_flush(struct nfp_net_tx_ring *tx_ring)
 {
 	wmb();
 	nfp_qcp_wr_ptr_add(tx_ring->qcp_q, tx_ring->wr_ptr_add);
@@ -1142,7 +1142,7 @@ static netdev_tx_t nfp_net_tx(struct sk_buff *skb, struct net_device *netdev)
  * @tx_ring:	TX ring structure
  * @budget:	NAPI budget (only used as bool to determine if in NAPI context)
  */
-static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring, int budget)
+void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring, int budget)
 {
 	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
 	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
@@ -1587,10 +1587,10 @@ static int nfp_net_rx_csum_has_errors(u16 flags)
  * @meta: Parsed metadata prepend
  * @skb: Pointer to SKB
  */
-static void nfp_net_rx_csum(struct nfp_net_dp *dp,
-			    struct nfp_net_r_vector *r_vec,
-			    struct nfp_net_rx_desc *rxd,
-			    struct nfp_meta_parsed *meta, struct sk_buff *skb)
+void nfp_net_rx_csum(const struct nfp_net_dp *dp,
+		     struct nfp_net_r_vector *r_vec,
+		     const struct nfp_net_rx_desc *rxd,
+		     const struct nfp_meta_parsed *meta, struct sk_buff *skb)
 {
 	skb_checksum_none_assert(skb);
 
@@ -1668,7 +1668,7 @@ nfp_net_set_hash_desc(struct net_device *netdev, struct nfp_meta_parsed *meta,
 			 &rx_hash->hash);
 }
 
-static bool
+bool
 nfp_net_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta,
 		   void *data, void *pkt, unsigned int pkt_len, int meta_len)
 {
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 2/5] nfp: wrap napi add/del logic
  2022-03-04 10:22 Add AF_XDP zero-copy support for NFP Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 1/5] nfp: expose common functions to be used for AF_XDP Simon Horman
@ 2022-03-04 10:22 ` Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 3/5] nfp: xsk: add an array of xsk buffer pools to each data path Simon Horman
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Simon Horman @ 2022-03-04 10:22 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski, Niklas Söderlund; +Cc: netdev, oss-drivers

From: Jakub Kicinski <jakub.kicinski@netronome.com>

There will be more NAPI registration logic once AF_XDP support is
added, so wrap our already conditional napi add/del into helpers.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
---
 .../ethernet/netronome/nfp/nfp_net_common.c   | 38 +++++++++++--------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index edf7b8716a70..e6a17af731ba 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2638,6 +2638,25 @@ static void nfp_net_rx_rings_free(struct nfp_net_dp *dp)
 	kfree(dp->rx_rings);
 }
 
+static void
+nfp_net_napi_add(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec)
+{
+	if (dp->netdev)
+		netif_napi_add(dp->netdev, &r_vec->napi,
+			       nfp_net_poll, NAPI_POLL_WEIGHT);
+	else
+		tasklet_enable(&r_vec->tasklet);
+}
+
+static void
+nfp_net_napi_del(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec)
+{
+	if (dp->netdev)
+		netif_napi_del(&r_vec->napi);
+	else
+		tasklet_disable(&r_vec->tasklet);
+}
+
 static void
 nfp_net_vector_assign_rings(struct nfp_net_dp *dp,
 			    struct nfp_net_r_vector *r_vec, int idx)
@@ -2656,23 +2675,14 @@ nfp_net_prepare_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
 {
 	int err;
 
-	/* Setup NAPI */
-	if (nn->dp.netdev)
-		netif_napi_add(nn->dp.netdev, &r_vec->napi,
-			       nfp_net_poll, NAPI_POLL_WEIGHT);
-	else
-		tasklet_enable(&r_vec->tasklet);
+	nfp_net_napi_add(&nn->dp, r_vec);
 
 	snprintf(r_vec->name, sizeof(r_vec->name),
 		 "%s-rxtx-%d", nfp_net_name(nn), idx);
 	err = request_irq(r_vec->irq_vector, r_vec->handler, 0, r_vec->name,
 			  r_vec);
 	if (err) {
-		if (nn->dp.netdev)
-			netif_napi_del(&r_vec->napi);
-		else
-			tasklet_disable(&r_vec->tasklet);
-
+		nfp_net_napi_del(&nn->dp, r_vec);
 		nn_err(nn, "Error requesting IRQ %d\n", r_vec->irq_vector);
 		return err;
 	}
@@ -2690,11 +2700,7 @@ static void
 nfp_net_cleanup_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec)
 {
 	irq_set_affinity_hint(r_vec->irq_vector, NULL);
-	if (nn->dp.netdev)
-		netif_napi_del(&r_vec->napi);
-	else
-		tasklet_disable(&r_vec->tasklet);
-
+	nfp_net_napi_del(&nn->dp, r_vec);
 	free_irq(r_vec->irq_vector, r_vec);
 }
 
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 3/5] nfp: xsk: add an array of xsk buffer pools to each data path
  2022-03-04 10:22 Add AF_XDP zero-copy support for NFP Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 1/5] nfp: expose common functions to be used for AF_XDP Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 2/5] nfp: wrap napi add/del logic Simon Horman
@ 2022-03-04 10:22 ` Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 4/5] nfp: xsk: add configuration check for XSK socket chunk size Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support Simon Horman
  4 siblings, 0 replies; 9+ messages in thread
From: Simon Horman @ 2022-03-04 10:22 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski, Niklas Söderlund; +Cc: netdev, oss-drivers

From: Niklas Söderlund <niklas.soderlund@corigine.com>

Each data path needs an array of xsk pools to track if an xsk socket is
in use. Add this array and make sure it's handled correctly when the
data path is duplicated.

Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h  |  4 ++++
 .../ethernet/netronome/nfp/nfp_net_common.c   | 19 +++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index fa40d339df8d..12f403d004ee 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -109,6 +109,7 @@ struct nfp_eth_table_port;
 struct nfp_net;
 struct nfp_net_r_vector;
 struct nfp_port;
+struct xsk_buff_pool;
 
 /* Convenience macro for wrapping descriptor index on ring size */
 #define D_IDX(ring, idx)	((idx) & ((ring)->cnt - 1))
@@ -501,6 +502,7 @@ struct nfp_stat_pair {
  * @num_stack_tx_rings:	Number of TX rings used by the stack (not XDP)
  * @num_rx_rings:	Currently configured number of RX rings
  * @mtu:		Device MTU
+ * @xsk_pools:		AF_XDP UMEM table (@num_r_vecs in size)
  */
 struct nfp_net_dp {
 	struct device *dev;
@@ -537,6 +539,8 @@ struct nfp_net_dp {
 	unsigned int num_rx_rings;
 
 	unsigned int mtu;
+
+	struct xsk_buff_pool **xsk_pools;
 };
 
 /**
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e6a17af731ba..abfc4f3963c5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3302,6 +3302,15 @@ struct nfp_net_dp *nfp_net_clone_dp(struct nfp_net *nn)
 
 	*new = nn->dp;
 
+	new->xsk_pools = kmemdup(new->xsk_pools,
+				 array_size(nn->max_r_vecs,
+					    sizeof(new->xsk_pools)),
+				 GFP_KERNEL);
+	if (!new->xsk_pools) {
+		kfree(new);
+		return NULL;
+	}
+
 	/* Clear things which need to be recomputed */
 	new->fl_bufsz = 0;
 	new->tx_rings = NULL;
@@ -3312,6 +3321,12 @@ struct nfp_net_dp *nfp_net_clone_dp(struct nfp_net *nn)
 	return new;
 }
 
+static void nfp_net_free_dp(struct nfp_net_dp *dp)
+{
+	kfree(dp->xsk_pools);
+	kfree(dp);
+}
+
 static int
 nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp,
 		     struct netlink_ext_ack *extack)
@@ -3395,7 +3410,7 @@ int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *dp,
 
 	nfp_net_open_stack(nn);
 exit_free_dp:
-	kfree(dp);
+	nfp_net_free_dp(dp);
 
 	return err;
 
@@ -3404,7 +3419,7 @@ int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *dp,
 err_cleanup_vecs:
 	for (r = dp->num_r_vecs - 1; r >= nn->dp.num_r_vecs; r--)
 		nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
-	kfree(dp);
+	nfp_net_free_dp(dp);
 	return err;
 }
 
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 4/5] nfp: xsk: add configuration check for XSK socket chunk size
  2022-03-04 10:22 Add AF_XDP zero-copy support for NFP Simon Horman
                   ` (2 preceding siblings ...)
  2022-03-04 10:22 ` [PATCH net-next 3/5] nfp: xsk: add an array of xsk buffer pools to each data path Simon Horman
@ 2022-03-04 10:22 ` Simon Horman
  2022-03-04 10:22 ` [PATCH net-next 5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support Simon Horman
  4 siblings, 0 replies; 9+ messages in thread
From: Simon Horman @ 2022-03-04 10:22 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski, Niklas Söderlund; +Cc: netdev, oss-drivers

From: Niklas Söderlund <niklas.soderlund@corigine.com>

In preparation for adding AF_XDP support, add a configuration check to
make sure the buffer size cannot be set to a larger value than the XSK
socket chunk size.

Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
---
 .../ethernet/netronome/nfp/nfp_net_common.c   | 42 +++++++++++++++++--
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index abfc4f3963c5..c10e977d2472 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -38,6 +38,7 @@
 
 #include <net/tls.h>
 #include <net/vxlan.h>
+#include <net/xdp_sock_drv.h>
 
 #include "nfpcore/nfp_nsp.h"
 #include "ccm.h"
@@ -1338,24 +1339,43 @@ static void nfp_net_tx_timeout(struct net_device *netdev, unsigned int txqueue)
 /* Receive processing
  */
 static unsigned int
-nfp_net_calc_fl_bufsz(struct nfp_net_dp *dp)
+nfp_net_calc_fl_bufsz_data(struct nfp_net_dp *dp)
 {
-	unsigned int fl_bufsz;
+	unsigned int fl_bufsz = 0;
 
-	fl_bufsz = NFP_NET_RX_BUF_HEADROOM;
-	fl_bufsz += dp->rx_dma_off;
 	if (dp->rx_offset == NFP_NET_CFG_RX_OFFSET_DYNAMIC)
 		fl_bufsz += NFP_NET_MAX_PREPEND;
 	else
 		fl_bufsz += dp->rx_offset;
 	fl_bufsz += ETH_HLEN + VLAN_HLEN * 2 + dp->mtu;
 
+	return fl_bufsz;
+}
+
+static unsigned int nfp_net_calc_fl_bufsz(struct nfp_net_dp *dp)
+{
+	unsigned int fl_bufsz;
+
+	fl_bufsz = NFP_NET_RX_BUF_HEADROOM;
+	fl_bufsz += dp->rx_dma_off;
+	fl_bufsz += nfp_net_calc_fl_bufsz_data(dp);
+
 	fl_bufsz = SKB_DATA_ALIGN(fl_bufsz);
 	fl_bufsz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	return fl_bufsz;
 }
 
+static unsigned int nfp_net_calc_fl_bufsz_xsk(struct nfp_net_dp *dp)
+{
+	unsigned int fl_bufsz;
+
+	fl_bufsz = XDP_PACKET_HEADROOM;
+	fl_bufsz += nfp_net_calc_fl_bufsz_data(dp);
+
+	return fl_bufsz;
+}
+
 static void
 nfp_net_free_frag(void *frag, bool xdp)
 {
@@ -3331,6 +3351,8 @@ static int
 nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp,
 		     struct netlink_ext_ack *extack)
 {
+	unsigned int r, xsk_min_fl_bufsz;
+
 	/* XDP-enabled tests */
 	if (!dp->xdp_prog)
 		return 0;
@@ -3343,6 +3365,18 @@ nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp,
 		return -EINVAL;
 	}
 
+	xsk_min_fl_bufsz = nfp_net_calc_fl_bufsz_xsk(dp);
+	for (r = 0; r < nn->max_r_vecs; r++) {
+		if (!dp->xsk_pools[r])
+			continue;
+
+		if (xsk_pool_get_rx_frame_size(dp->xsk_pools[r]) < xsk_min_fl_bufsz) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "XSK buffer pool chunk size too small\n");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support
  2022-03-04 10:22 Add AF_XDP zero-copy support for NFP Simon Horman
                   ` (3 preceding siblings ...)
  2022-03-04 10:22 ` [PATCH net-next 4/5] nfp: xsk: add configuration check for XSK socket chunk size Simon Horman
@ 2022-03-04 10:22 ` Simon Horman
  2022-03-04 13:53   ` Maciej Fijalkowski
  4 siblings, 1 reply; 9+ messages in thread
From: Simon Horman @ 2022-03-04 10:22 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski, Niklas Söderlund; +Cc: netdev, oss-drivers

From: Niklas Söderlund <niklas.soderlund@corigine.com>

This patch adds zero-copy Rx and Tx support for AF_XDP sockets. It does
so by adding a separate NAPI poll function that is attached to a channel
when an XSK socket is attached with XDP_SETUP_XSK_POOL, and restored when
the XSK socket is terminated; this is done per channel.

Support for XDP_TX is implemented and the XDP buffer can safely be moved
from the Rx to the Tx queue and correctly freed and returned to the XSK
pool once it's transmitted.

Note that when AF_XDP zero-copy is enabled, the XDP action XDP_PASS
will allocate a new buffer and copy the zero-copy frame prior to
passing it to the kernel stack.

This patch is based on previous work by Jakub Kicinski.
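
For reference, user space requests zero-copy mode in the usual AF_XDP way;
a minimal, untested sketch using libbpf's xsk.h helpers (interface name,
queue index and sizing are placeholders, error handling trimmed) might look
like this:

  #include <errno.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <linux/if_xdp.h>
  #include <bpf/xsk.h>

  #define NUM_FRAMES	4096
  #define FRAME_SIZE	XSK_UMEM__DEFAULT_FRAME_SIZE

  /* Bind an AF_XDP socket in zero-copy mode to one queue of an nfp netdev. */
  static int open_zc_socket(const char *ifname, __u32 queue,
  			  struct xsk_socket **xsk, struct xsk_umem **umem,
  			  struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
  			  struct xsk_ring_prod *fq, struct xsk_ring_cons *cq)
  {
  	struct xsk_socket_config cfg = {
  		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
  		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
  		/* Fail instead of silently falling back to copy mode. */
  		.bind_flags = XDP_ZEROCOPY,
  	};
  	void *bufs;
  	int err;

  	if (posix_memalign(&bufs, getpagesize(),
  			   (size_t)NUM_FRAMES * FRAME_SIZE))
  		return -ENOMEM;

  	err = xsk_umem__create(umem, bufs, (size_t)NUM_FRAMES * FRAME_SIZE,
  			       fq, cq, NULL);
  	if (err)
  		return err;

  	/* This triggers XDP_SETUP_XSK_POOL for the queue; the driver then
  	 * swaps in the XSK NAPI poll function added by this patch.
  	 */
  	return xsk_socket__create(xsk, ifname, queue, *umem, rx, tx, &cfg);
  }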

Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
---
 drivers/net/ethernet/netronome/nfp/Makefile   |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_net.h  |  31 +-
 .../ethernet/netronome/nfp/nfp_net_common.c   |  98 ++-
 .../ethernet/netronome/nfp/nfp_net_debugfs.c  |  33 +-
 .../net/ethernet/netronome/nfp/nfp_net_xsk.c  | 592 ++++++++++++++++++
 .../net/ethernet/netronome/nfp/nfp_net_xsk.h  |  29 +
 6 files changed, 756 insertions(+), 28 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.h

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile
index 9cff3d48acbc..9c72b43c1581 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -31,6 +31,7 @@ nfp-objs := \
 	    nfp_net_main.o \
 	    nfp_net_repr.o \
 	    nfp_net_sriov.o \
+	    nfp_net_xsk.o \
 	    nfp_netvf_main.o \
 	    nfp_port.o \
 	    nfp_shared_buf.o \
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 12f403d004ee..437a19722fcf 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -171,11 +171,14 @@ struct nfp_net_tx_desc {
  * struct nfp_net_tx_buf - software TX buffer descriptor
  * @skb:	normal ring, sk_buff associated with this buffer
  * @frag:	XDP ring, page frag associated with this buffer
+ * @xdp:	XSK buffer pool handle (for AF_XDP)
  * @dma_addr:	DMA mapping address of the buffer
  * @fidx:	Fragment index (-1 for the head and [0..nr_frags-1] for frags)
  * @pkt_cnt:	Number of packets to be produced out of the skb associated
  *		with this buffer (valid only on the head's buffer).
  *		Will be 1 for all non-TSO packets.
+ * @is_xsk_tx:	Flag if buffer is a RX buffer after a XDP_TX action and not a
+ *		buffer from the TX queue (for AF_XDP).
  * @real_len:	Number of bytes which to be produced out of the skb (valid only
  *		on the head's buffer). Equal to skb->len for non-TSO packets.
  */
@@ -183,10 +186,18 @@ struct nfp_net_tx_buf {
 	union {
 		struct sk_buff *skb;
 		void *frag;
+		struct xdp_buff *xdp;
 	};
 	dma_addr_t dma_addr;
-	short int fidx;
-	u16 pkt_cnt;
+	union {
+		struct {
+			short int fidx;
+			u16 pkt_cnt;
+		};
+		struct {
+			bool is_xsk_tx;
+		};
+	};
 	u32 real_len;
 };
 
@@ -315,6 +326,16 @@ struct nfp_net_rx_buf {
 	dma_addr_t dma_addr;
 };
 
+/**
+ * struct nfp_net_xsk_rx_buf - software RX XSK buffer descriptor
+ * @dma_addr:	DMA mapping address of the buffer
+ * @xdp:	XSK buffer pool handle (for AF_XDP)
+ */
+struct nfp_net_xsk_rx_buf {
+	dma_addr_t dma_addr;
+	struct xdp_buff *xdp;
+};
+
 /**
  * struct nfp_net_rx_ring - RX ring structure
  * @r_vec:      Back pointer to ring vector structure
@@ -325,6 +346,7 @@ struct nfp_net_rx_buf {
  * @fl_qcidx:   Queue Controller Peripheral (QCP) queue index for the freelist
  * @qcp_fl:     Pointer to base of the QCP freelist queue
  * @rxbufs:     Array of transmitted FL/RX buffers
+ * @xsk_rxbufs: Array of transmitted FL/RX buffers (for AF_XDP)
  * @rxds:       Virtual address of FL/RX ring in host memory
  * @xdp_rxq:    RX-ring info avail for XDP
  * @dma:        DMA address of the FL/RX ring
@@ -343,6 +365,7 @@ struct nfp_net_rx_ring {
 	u8 __iomem *qcp_fl;
 
 	struct nfp_net_rx_buf *rxbufs;
+	struct nfp_net_xsk_rx_buf *xsk_rxbufs;
 	struct nfp_net_rx_desc *rxds;
 
 	struct xdp_rxq_info xdp_rxq;
@@ -361,6 +384,7 @@ struct nfp_net_rx_ring {
  * @tx_ring:        Pointer to TX ring
  * @rx_ring:        Pointer to RX ring
  * @xdp_ring:	    Pointer to an extra TX ring for XDP
+ * @xsk_pool:	    XSK buffer pool active on vector queue pair (for AF_XDP)
  * @irq_entry:      MSI-X table entry (use for talking to the device)
  * @event_ctr:	    Number of interrupt
  * @rx_dim:	    Dynamic interrupt moderation structure for RX
@@ -432,6 +456,7 @@ struct nfp_net_r_vector {
 	u64 rx_replace_buf_alloc_fail;
 
 	struct nfp_net_tx_ring *xdp_ring;
+	struct xsk_buff_pool *xsk_pool;
 
 	struct u64_stats_sync tx_sync;
 	u64 tx_pkts;
@@ -502,7 +527,7 @@ struct nfp_stat_pair {
  * @num_stack_tx_rings:	Number of TX rings used by the stack (not XDP)
  * @num_rx_rings:	Currently configured number of RX rings
  * @mtu:		Device MTU
- * @xsk_pools:		AF_XDP UMEM table (@num_r_vecs in size)
+ * @xsk_pools:		XSK buffer pools, @max_r_vecs in size (for AF_XDP).
  */
 struct nfp_net_dp {
 	struct device *dev;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index c10e977d2472..00a09b9e0aee 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -46,6 +46,7 @@
 #include "nfp_net_ctrl.h"
 #include "nfp_net.h"
 #include "nfp_net_sriov.h"
+#include "nfp_net_xsk.h"
 #include "nfp_port.h"
 #include "crypto/crypto.h"
 #include "crypto/fw.h"
@@ -1316,6 +1317,9 @@ nfp_net_tx_ring_reset(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
 		tx_ring->rd_p++;
 	}
 
+	if (tx_ring->is_xdp)
+		nfp_net_xsk_tx_bufs_free(tx_ring);
+
 	memset(tx_ring->txds, 0, tx_ring->size);
 	tx_ring->wr_p = 0;
 	tx_ring->rd_p = 0;
@@ -1504,10 +1508,14 @@ static void nfp_net_rx_ring_reset(struct nfp_net_rx_ring *rx_ring)
 	/* Move the empty entry to the end of the list */
 	wr_idx = D_IDX(rx_ring, rx_ring->wr_p);
 	last_idx = rx_ring->cnt - 1;
-	rx_ring->rxbufs[wr_idx].dma_addr = rx_ring->rxbufs[last_idx].dma_addr;
-	rx_ring->rxbufs[wr_idx].frag = rx_ring->rxbufs[last_idx].frag;
-	rx_ring->rxbufs[last_idx].dma_addr = 0;
-	rx_ring->rxbufs[last_idx].frag = NULL;
+	if (rx_ring->r_vec->xsk_pool) {
+		rx_ring->xsk_rxbufs[wr_idx] = rx_ring->xsk_rxbufs[last_idx];
+		memset(&rx_ring->xsk_rxbufs[last_idx], 0,
+		       sizeof(*rx_ring->xsk_rxbufs));
+	} else {
+		rx_ring->rxbufs[wr_idx] = rx_ring->rxbufs[last_idx];
+		memset(&rx_ring->rxbufs[last_idx], 0, sizeof(*rx_ring->rxbufs));
+	}
 
 	memset(rx_ring->rxds, 0, rx_ring->size);
 	rx_ring->wr_p = 0;
@@ -1529,6 +1537,9 @@ nfp_net_rx_ring_bufs_free(struct nfp_net_dp *dp,
 {
 	unsigned int i;
 
+	if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
+		return;
+
 	for (i = 0; i < rx_ring->cnt - 1; i++) {
 		/* NULL skb can only happen when initial filling of the ring
 		 * fails to allocate enough buffers and calls here to free
@@ -1556,6 +1567,9 @@ nfp_net_rx_ring_bufs_alloc(struct nfp_net_dp *dp,
 	struct nfp_net_rx_buf *rxbufs;
 	unsigned int i;
 
+	if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
+		return 0;
+
 	rxbufs = rx_ring->rxbufs;
 
 	for (i = 0; i < rx_ring->cnt - 1; i++) {
@@ -1580,6 +1594,9 @@ nfp_net_rx_ring_fill_freelist(struct nfp_net_dp *dp,
 {
 	unsigned int i;
 
+	if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
+		return nfp_net_xsk_rx_ring_fill_freelist(rx_ring);
+
 	for (i = 0; i < rx_ring->cnt - 1; i++)
 		nfp_net_rx_give_one(dp, rx_ring, rx_ring->rxbufs[i].frag,
 				    rx_ring->rxbufs[i].dma_addr);
@@ -2560,7 +2577,11 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
 
 	if (dp->netdev)
 		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
-	kvfree(rx_ring->rxbufs);
+
+	if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
+		kvfree(rx_ring->xsk_rxbufs);
+	else
+		kvfree(rx_ring->rxbufs);
 
 	if (rx_ring->rxds)
 		dma_free_coherent(dp->dev, rx_ring->size,
@@ -2568,6 +2589,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
 
 	rx_ring->cnt = 0;
 	rx_ring->rxbufs = NULL;
+	rx_ring->xsk_rxbufs = NULL;
 	rx_ring->rxds = NULL;
 	rx_ring->dma = 0;
 	rx_ring->size = 0;
@@ -2583,8 +2605,18 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
 static int
 nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
 {
+	enum xdp_mem_type mem_type;
+	size_t rxbuf_sw_desc_sz;
 	int err;
 
+	if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx)) {
+		mem_type = MEM_TYPE_XSK_BUFF_POOL;
+		rxbuf_sw_desc_sz = sizeof(*rx_ring->xsk_rxbufs);
+	} else {
+		mem_type = MEM_TYPE_PAGE_ORDER0;
+		rxbuf_sw_desc_sz = sizeof(*rx_ring->rxbufs);
+	}
+
 	if (dp->netdev) {
 		err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev,
 				       rx_ring->idx, rx_ring->r_vec->napi.napi_id);
@@ -2592,6 +2624,10 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
 			return err;
 	}
 
+	err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, mem_type, NULL);
+	if (err)
+		goto err_alloc;
+
 	rx_ring->cnt = dp->rxd_cnt;
 	rx_ring->size = array_size(rx_ring->cnt, sizeof(*rx_ring->rxds));
 	rx_ring->rxds = dma_alloc_coherent(dp->dev, rx_ring->size,
@@ -2603,10 +2639,17 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
 		goto err_alloc;
 	}
 
-	rx_ring->rxbufs = kvcalloc(rx_ring->cnt, sizeof(*rx_ring->rxbufs),
-				   GFP_KERNEL);
-	if (!rx_ring->rxbufs)
-		goto err_alloc;
+	if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx)) {
+		rx_ring->xsk_rxbufs = kvcalloc(rx_ring->cnt, rxbuf_sw_desc_sz,
+					       GFP_KERNEL);
+		if (!rx_ring->xsk_rxbufs)
+			goto err_alloc;
+	} else {
+		rx_ring->rxbufs = kvcalloc(rx_ring->cnt, rxbuf_sw_desc_sz,
+					   GFP_KERNEL);
+		if (!rx_ring->rxbufs)
+			goto err_alloc;
+	}
 
 	return 0;
 
@@ -2659,11 +2702,13 @@ static void nfp_net_rx_rings_free(struct nfp_net_dp *dp)
 }
 
 static void
-nfp_net_napi_add(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec)
+nfp_net_napi_add(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec, int idx)
 {
 	if (dp->netdev)
 		netif_napi_add(dp->netdev, &r_vec->napi,
-			       nfp_net_poll, NAPI_POLL_WEIGHT);
+			       nfp_net_has_xsk_pool_slow(dp, idx) ?
+			       nfp_net_xsk_poll : nfp_net_poll,
+			       NAPI_POLL_WEIGHT);
 	else
 		tasklet_enable(&r_vec->tasklet);
 }
@@ -2687,6 +2732,17 @@ nfp_net_vector_assign_rings(struct nfp_net_dp *dp,
 
 	r_vec->xdp_ring = idx < dp->num_tx_rings - dp->num_stack_tx_rings ?
 		&dp->tx_rings[dp->num_stack_tx_rings + idx] : NULL;
+
+	if (nfp_net_has_xsk_pool_slow(dp, idx) || r_vec->xsk_pool) {
+		r_vec->xsk_pool = dp->xdp_prog ? dp->xsk_pools[idx] : NULL;
+
+		if (r_vec->xsk_pool)
+			xsk_pool_set_rxq_info(r_vec->xsk_pool,
+					      &r_vec->rx_ring->xdp_rxq);
+
+		nfp_net_napi_del(dp, r_vec);
+		nfp_net_napi_add(dp, r_vec, idx);
+	}
 }
 
 static int
@@ -2695,7 +2751,7 @@ nfp_net_prepare_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
 {
 	int err;
 
-	nfp_net_napi_add(&nn->dp, r_vec);
+	nfp_net_napi_add(&nn->dp, r_vec, idx);
 
 	snprintf(r_vec->name, sizeof(r_vec->name),
 		 "%s-rxtx-%d", nfp_net_name(nn), idx);
@@ -2834,8 +2890,11 @@ static void nfp_net_clear_config_and_disable(struct nfp_net *nn)
 	if (err)
 		nn_err(nn, "Could not disable device: %d\n", err);
 
-	for (r = 0; r < nn->dp.num_rx_rings; r++)
+	for (r = 0; r < nn->dp.num_rx_rings; r++) {
 		nfp_net_rx_ring_reset(&nn->dp.rx_rings[r]);
+		if (nfp_net_has_xsk_pool_slow(&nn->dp, nn->dp.rx_rings[r].idx))
+			nfp_net_xsk_rx_bufs_free(&nn->dp.rx_rings[r]);
+	}
 	for (r = 0; r < nn->dp.num_tx_rings; r++)
 		nfp_net_tx_ring_reset(&nn->dp, &nn->dp.tx_rings[r]);
 	for (r = 0; r < nn->dp.num_r_vecs; r++)
@@ -3771,6 +3830,9 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 		return nfp_net_xdp_setup_drv(nn, xdp);
 	case XDP_SETUP_PROG_HW:
 		return nfp_net_xdp_setup_hw(nn, xdp);
+	case XDP_SETUP_XSK_POOL:
+		return nfp_net_xsk_setup_pool(netdev, xdp->xsk.pool,
+					      xdp->xsk.queue_id);
 	default:
 		return nfp_app_bpf(nn->app, nn, xdp);
 	}
@@ -3821,6 +3883,7 @@ const struct net_device_ops nfp_net_netdev_ops = {
 	.ndo_features_check	= nfp_net_features_check,
 	.ndo_get_phys_port_name	= nfp_net_get_phys_port_name,
 	.ndo_bpf		= nfp_net_xdp,
+	.ndo_xsk_wakeup		= nfp_net_xsk_wakeup,
 	.ndo_get_devlink_port	= nfp_devlink_get_devlink_port,
 };
 
@@ -3948,6 +4011,14 @@ nfp_net_alloc(struct pci_dev *pdev, void __iomem *ctrl_bar, bool needs_netdev,
 	nn->dp.num_r_vecs = max(nn->dp.num_tx_rings, nn->dp.num_rx_rings);
 	nn->dp.num_r_vecs = min_t(unsigned int,
 				  nn->dp.num_r_vecs, num_online_cpus());
+	nn->max_r_vecs = nn->dp.num_r_vecs;
+
+	nn->dp.xsk_pools = kcalloc(nn->max_r_vecs, sizeof(nn->dp.xsk_pools),
+				   GFP_KERNEL);
+	if (!nn->dp.xsk_pools) {
+		err = -ENOMEM;
+		goto err_free_nn;
+	}
 
 	nn->dp.txd_cnt = NFP_NET_TX_DESCS_DEFAULT;
 	nn->dp.rxd_cnt = NFP_NET_RX_DESCS_DEFAULT;
@@ -3987,6 +4058,7 @@ void nfp_net_free(struct nfp_net *nn)
 	WARN_ON(timer_pending(&nn->reconfig_timer) || nn->reconfig_posted);
 	nfp_ccm_mbox_free(nn);
 
+	kfree(nn->dp.xsk_pools);
 	if (nn->dp.netdev)
 		free_netdev(nn->dp.netdev);
 	else
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c b/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
index 553c708694e8..2c74b3c5aef9 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
@@ -42,13 +42,19 @@ static int nfp_rx_q_show(struct seq_file *file, void *data)
 		seq_printf(file, "%04d: 0x%08x 0x%08x", i,
 			   rxd->vals[0], rxd->vals[1]);
 
-		frag = READ_ONCE(rx_ring->rxbufs[i].frag);
-		if (frag)
-			seq_printf(file, " frag=%p", frag);
-
-		if (rx_ring->rxbufs[i].dma_addr)
-			seq_printf(file, " dma_addr=%pad",
-				   &rx_ring->rxbufs[i].dma_addr);
+		if (!r_vec->xsk_pool) {
+			frag = READ_ONCE(rx_ring->rxbufs[i].frag);
+			if (frag)
+				seq_printf(file, " frag=%p", frag);
+
+			if (rx_ring->rxbufs[i].dma_addr)
+				seq_printf(file, " dma_addr=%pad",
+					   &rx_ring->rxbufs[i].dma_addr);
+		} else {
+			if (rx_ring->xsk_rxbufs[i].dma_addr)
+				seq_printf(file, " dma_addr=%pad",
+					   &rx_ring->xsk_rxbufs[i].dma_addr);
+		}
 
 		if (i == rx_ring->rd_p % rxd_cnt)
 			seq_puts(file, " H_RD ");
@@ -103,20 +109,23 @@ static int nfp_tx_q_show(struct seq_file *file, void *data)
 		   tx_ring->rd_p, tx_ring->wr_p, d_rd_p, d_wr_p);
 
 	for (i = 0; i < txd_cnt; i++) {
+		struct xdp_buff *xdp;
+		struct sk_buff *skb;
+
 		txd = &tx_ring->txds[i];
 		seq_printf(file, "%04d: 0x%08x 0x%08x 0x%08x 0x%08x", i,
 			   txd->vals[0], txd->vals[1],
 			   txd->vals[2], txd->vals[3]);
 
-		if (tx_ring == r_vec->tx_ring) {
-			struct sk_buff *skb = READ_ONCE(tx_ring->txbufs[i].skb);
-
+		if (!tx_ring->is_xdp) {
+			skb = READ_ONCE(tx_ring->txbufs[i].skb);
 			if (skb)
 				seq_printf(file, " skb->head=%p skb->data=%p",
 					   skb->head, skb->data);
 		} else {
-			seq_printf(file, " frag=%p",
-				   READ_ONCE(tx_ring->txbufs[i].frag));
+			xdp = READ_ONCE(tx_ring->txbufs[i].xdp);
+			if (xdp)
+				seq_printf(file, " xdp->data=%p", xdp->data);
 		}
 
 		if (tx_ring->txbufs[i].dma_addr)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_xsk.c b/drivers/net/ethernet/netronome/nfp/nfp_net_xsk.c
new file mode 100644
index 000000000000..ab7243277efa
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_xsk.c
@@ -0,0 +1,592 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2018 Netronome Systems, Inc */
+/* Copyright (C) 2021 Corigine, Inc */
+
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <net/xdp_sock_drv.h>
+#include <trace/events/xdp.h>
+
+#include "nfp_app.h"
+#include "nfp_net.h"
+#include "nfp_net_xsk.h"
+
+static int nfp_net_tx_space(struct nfp_net_tx_ring *tx_ring)
+{
+	return tx_ring->cnt - tx_ring->wr_p + tx_ring->rd_p - 1;
+}
+
+static void nfp_net_xsk_tx_free(struct nfp_net_tx_buf *txbuf)
+{
+	xsk_buff_free(txbuf->xdp);
+
+	txbuf->dma_addr = 0;
+	txbuf->xdp = NULL;
+}
+
+void nfp_net_xsk_tx_bufs_free(struct nfp_net_tx_ring *tx_ring)
+{
+	struct nfp_net_tx_buf *txbuf;
+	unsigned int idx;
+
+	while (tx_ring->rd_p != tx_ring->wr_p) {
+		idx = D_IDX(tx_ring, tx_ring->rd_p);
+		txbuf = &tx_ring->txbufs[idx];
+
+		txbuf->real_len = 0;
+
+		tx_ring->qcp_rd_p++;
+		tx_ring->rd_p++;
+
+		if (tx_ring->r_vec->xsk_pool) {
+			if (txbuf->is_xsk_tx)
+				nfp_net_xsk_tx_free(txbuf);
+
+			xsk_tx_completed(tx_ring->r_vec->xsk_pool, 1);
+		}
+	}
+}
+
+static bool nfp_net_xsk_complete(struct nfp_net_tx_ring *tx_ring)
+{
+	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
+	u32 done_pkts = 0, done_bytes = 0, reused = 0;
+	bool done_all;
+	int idx, todo;
+	u32 qcp_rd_p;
+
+	if (tx_ring->wr_p == tx_ring->rd_p)
+		return true;
+
+	/* Work out how many descriptors have been transmitted. */
+	qcp_rd_p = nfp_qcp_rd_ptr_read(tx_ring->qcp_q);
+
+	if (qcp_rd_p == tx_ring->qcp_rd_p)
+		return true;
+
+	todo = D_IDX(tx_ring, qcp_rd_p - tx_ring->qcp_rd_p);
+
+	done_all = todo <= NFP_NET_XDP_MAX_COMPLETE;
+	todo = min(todo, NFP_NET_XDP_MAX_COMPLETE);
+
+	tx_ring->qcp_rd_p = D_IDX(tx_ring, tx_ring->qcp_rd_p + todo);
+
+	done_pkts = todo;
+	while (todo--) {
+		struct nfp_net_tx_buf *txbuf;
+
+		idx = D_IDX(tx_ring, tx_ring->rd_p);
+		tx_ring->rd_p++;
+
+		txbuf = &tx_ring->txbufs[idx];
+		if (unlikely(!txbuf->real_len))
+			continue;
+
+		done_bytes += txbuf->real_len;
+		txbuf->real_len = 0;
+
+		if (txbuf->is_xsk_tx) {
+			nfp_net_xsk_tx_free(txbuf);
+			reused++;
+		}
+	}
+
+	u64_stats_update_begin(&r_vec->tx_sync);
+	r_vec->tx_bytes += done_bytes;
+	r_vec->tx_pkts += done_pkts;
+	u64_stats_update_end(&r_vec->tx_sync);
+
+	xsk_tx_completed(r_vec->xsk_pool, done_pkts - reused);
+
+	WARN_ONCE(tx_ring->wr_p - tx_ring->rd_p > tx_ring->cnt,
+		  "XDP TX ring corruption rd_p=%u wr_p=%u cnt=%u\n",
+		  tx_ring->rd_p, tx_ring->wr_p, tx_ring->cnt);
+
+	return done_all;
+}
+
+static void nfp_net_xsk_tx(struct nfp_net_tx_ring *tx_ring)
+{
+	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
+	struct xdp_desc desc[NFP_NET_XSK_TX_BATCH];
+	struct xsk_buff_pool *xsk_pool;
+	struct nfp_net_tx_desc *txd;
+	u32 pkts = 0, wr_idx;
+	u32 i, got;
+
+	xsk_pool = r_vec->xsk_pool;
+
+	while (nfp_net_tx_space(tx_ring) >= NFP_NET_XSK_TX_BATCH) {
+		for (i = 0; i < NFP_NET_XSK_TX_BATCH; i++)
+			if (!xsk_tx_peek_desc(xsk_pool, &desc[i]))
+				break;
+		got = i;
+		if (!got)
+			break;
+
+		wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);
+		prefetchw(&tx_ring->txds[wr_idx]);
+
+		for (i = 0; i < got; i++)
+			xsk_buff_raw_dma_sync_for_device(xsk_pool, desc[i].addr,
+							 desc[i].len);
+
+		for (i = 0; i < got; i++) {
+			wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);
+
+			tx_ring->txbufs[wr_idx].real_len = desc[i].len;
+			tx_ring->txbufs[wr_idx].is_xsk_tx = false;
+
+			/* Build TX descriptor. */
+			txd = &tx_ring->txds[wr_idx];
+			nfp_desc_set_dma_addr(txd,
+					      xsk_buff_raw_get_dma(xsk_pool,
+								   desc[i].addr
+								   ));
+			txd->offset_eop = PCIE_DESC_TX_EOP;
+			txd->dma_len = cpu_to_le16(desc[i].len);
+			txd->data_len = cpu_to_le16(desc[i].len);
+		}
+
+		tx_ring->wr_p += got;
+		pkts += got;
+	}
+
+	if (!pkts)
+		return;
+
+	xsk_tx_release(xsk_pool);
+	/* Ensure all records are visible before incrementing write counter. */
+	wmb();
+	nfp_qcp_wr_ptr_add(tx_ring->qcp_q, pkts);
+}
+
+static bool
+nfp_net_xsk_tx_xdp(const struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
+		   struct nfp_net_rx_ring *rx_ring,
+		   struct nfp_net_tx_ring *tx_ring,
+		   struct nfp_net_xsk_rx_buf *xrxbuf, unsigned int pkt_len,
+		   int pkt_off)
+{
+	struct xsk_buff_pool *pool = r_vec->xsk_pool;
+	struct nfp_net_tx_buf *txbuf;
+	struct nfp_net_tx_desc *txd;
+	unsigned int wr_idx;
+
+	if (nfp_net_tx_space(tx_ring) < 1)
+		return false;
+
+	xsk_buff_raw_dma_sync_for_device(pool, xrxbuf->dma_addr + pkt_off, pkt_len);
+
+	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
+
+	txbuf = &tx_ring->txbufs[wr_idx];
+	txbuf->xdp = xrxbuf->xdp;
+	txbuf->real_len = pkt_len;
+	txbuf->is_xsk_tx = true;
+
+	/* Build TX descriptor */
+	txd = &tx_ring->txds[wr_idx];
+	txd->offset_eop = PCIE_DESC_TX_EOP;
+	txd->dma_len = cpu_to_le16(pkt_len);
+	nfp_desc_set_dma_addr(txd, xrxbuf->dma_addr + pkt_off);
+	txd->data_len = cpu_to_le16(pkt_len);
+
+	txd->flags = 0;
+	txd->mss = 0;
+	txd->lso_hdrlen = 0;
+
+	tx_ring->wr_ptr_add++;
+	tx_ring->wr_p++;
+
+	return true;
+}
+
+static int nfp_net_rx_space(struct nfp_net_rx_ring *rx_ring)
+{
+	return rx_ring->cnt - rx_ring->wr_p + rx_ring->rd_p - 1;
+}
+
+static void
+nfp_net_xsk_rx_bufs_stash(struct nfp_net_rx_ring *rx_ring, unsigned int idx,
+			  struct xdp_buff *xdp)
+{
+	unsigned int headroom;
+
+	headroom = xsk_pool_get_headroom(rx_ring->r_vec->xsk_pool);
+
+	rx_ring->rxds[idx].fld.reserved = 0;
+	rx_ring->rxds[idx].fld.meta_len_dd = 0;
+
+	rx_ring->xsk_rxbufs[idx].xdp = xdp;
+	rx_ring->xsk_rxbufs[idx].dma_addr =
+		xsk_buff_xdp_get_frame_dma(xdp) + headroom;
+}
+
+static void nfp_net_xsk_rx_unstash(struct nfp_net_xsk_rx_buf *rxbuf)
+{
+	rxbuf->dma_addr = 0;
+	rxbuf->xdp = NULL;
+}
+
+static void nfp_net_xsk_rx_free(struct nfp_net_xsk_rx_buf *rxbuf)
+{
+	if (rxbuf->xdp)
+		xsk_buff_free(rxbuf->xdp);
+
+	nfp_net_xsk_rx_unstash(rxbuf);
+}
+
+void nfp_net_xsk_rx_bufs_free(struct nfp_net_rx_ring *rx_ring)
+{
+	unsigned int i;
+
+	if (!rx_ring->cnt)
+		return;
+
+	for (i = 0; i < rx_ring->cnt - 1; i++)
+		nfp_net_xsk_rx_free(&rx_ring->xsk_rxbufs[i]);
+}
+
+void nfp_net_xsk_rx_ring_fill_freelist(struct nfp_net_rx_ring *rx_ring)
+{
+	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
+	struct xsk_buff_pool *pool = r_vec->xsk_pool;
+	unsigned int wr_idx, wr_ptr_add = 0;
+	struct xdp_buff *xdp;
+
+	while (nfp_net_rx_space(rx_ring)) {
+		wr_idx = D_IDX(rx_ring, rx_ring->wr_p);
+
+		xdp = xsk_buff_alloc(pool);
+		if (!xdp)
+			break;
+
+		nfp_net_xsk_rx_bufs_stash(rx_ring, wr_idx, xdp);
+
+		nfp_desc_set_dma_addr(&rx_ring->rxds[wr_idx].fld,
+				      rx_ring->xsk_rxbufs[wr_idx].dma_addr);
+
+		rx_ring->wr_p++;
+		wr_ptr_add++;
+	}
+
+	/* Ensure all records are visible before incrementing write counter. */
+	wmb();
+	nfp_qcp_wr_ptr_add(rx_ring->qcp_fl, wr_ptr_add);
+}
+
+static void nfp_net_xsk_rx_drop(struct nfp_net_r_vector *r_vec,
+				struct nfp_net_xsk_rx_buf *xrxbuf)
+{
+	u64_stats_update_begin(&r_vec->rx_sync);
+	r_vec->rx_drops++;
+	u64_stats_update_end(&r_vec->rx_sync);
+
+	nfp_net_xsk_rx_free(xrxbuf);
+}
+
+static void nfp_net_xsk_rx_skb(struct nfp_net_rx_ring *rx_ring,
+			       const struct nfp_net_rx_desc *rxd,
+			       struct nfp_net_xsk_rx_buf *xrxbuf,
+			       const struct nfp_meta_parsed *meta,
+			       unsigned int pkt_len,
+			       bool meta_xdp,
+			       unsigned int *skbs_polled)
+{
+	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
+	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
+	struct net_device *netdev;
+	struct sk_buff *skb;
+
+	if (likely(!meta->portid)) {
+		netdev = dp->netdev;
+	} else {
+		struct nfp_net *nn = netdev_priv(dp->netdev);
+
+		netdev = nfp_app_dev_get(nn->app, meta->portid, NULL);
+		if (unlikely(!netdev)) {
+			nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+			return;
+		}
+		nfp_repr_inc_rx_stats(netdev, pkt_len);
+	}
+
+	skb = napi_alloc_skb(&r_vec->napi, pkt_len);
+	if (!skb) {
+		nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+		return;
+	}
+	memcpy(skb_put(skb, pkt_len), xrxbuf->xdp->data, pkt_len);
+
+	skb->mark = meta->mark;
+	skb_set_hash(skb, meta->hash, meta->hash_type);
+
+	skb_record_rx_queue(skb, rx_ring->idx);
+	skb->protocol = eth_type_trans(skb, netdev);
+
+	nfp_net_rx_csum(dp, r_vec, rxd, meta, skb);
+
+	if (rxd->rxd.flags & PCIE_DESC_RX_VLAN)
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+				       le16_to_cpu(rxd->rxd.vlan));
+	if (meta_xdp)
+		skb_metadata_set(skb,
+				 xrxbuf->xdp->data - xrxbuf->xdp->data_meta);
+
+	napi_gro_receive(&rx_ring->r_vec->napi, skb);
+
+	nfp_net_xsk_rx_free(xrxbuf);
+
+	(*skbs_polled)++;
+}
+
+static unsigned int
+nfp_net_xsk_rx(struct nfp_net_rx_ring *rx_ring, int budget,
+	       unsigned int *skbs_polled)
+{
+	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
+	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
+	struct nfp_net_tx_ring *tx_ring;
+	struct bpf_prog *xdp_prog;
+	bool xdp_redir = false;
+	int pkts_polled = 0;
+
+	xdp_prog = READ_ONCE(dp->xdp_prog);
+	tx_ring = r_vec->xdp_ring;
+
+	while (pkts_polled < budget) {
+		unsigned int meta_len, data_len, pkt_len, pkt_off;
+		struct nfp_net_xsk_rx_buf *xrxbuf;
+		struct nfp_net_rx_desc *rxd;
+		struct nfp_meta_parsed meta;
+		int idx, act;
+
+		idx = D_IDX(rx_ring, rx_ring->rd_p);
+
+		rxd = &rx_ring->rxds[idx];
+		if (!(rxd->rxd.meta_len_dd & PCIE_DESC_RX_DD))
+			break;
+
+		rx_ring->rd_p++;
+		pkts_polled++;
+
+		xrxbuf = &rx_ring->xsk_rxbufs[idx];
+
+		/* If starved of buffers "drop" it and scream. */
+		if (rx_ring->rd_p >= rx_ring->wr_p) {
+			nn_dp_warn(dp, "Starved of RX buffers\n");
+			nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+			break;
+		}
+
+		/* Memory barrier to ensure that we won't do other reads
+		 * before the DD bit.
+		 */
+		dma_rmb();
+
+		memset(&meta, 0, sizeof(meta));
+
+		/* Only supporting AF_XDP with dynamic metadata so buffer layout
+		 * is always:
+		 *
+		 *  ---------------------------------------------------------
+		 * |  off | metadata  |             packet           | XXXX  |
+		 *  ---------------------------------------------------------
+		 */
+		meta_len = rxd->rxd.meta_len_dd & PCIE_DESC_RX_META_LEN_MASK;
+		data_len = le16_to_cpu(rxd->rxd.data_len);
+		pkt_len = data_len - meta_len;
+
+		if (unlikely(meta_len > NFP_NET_MAX_PREPEND)) {
+			nn_dp_warn(dp, "Oversized RX packet metadata %u\n",
+				   meta_len);
+			nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+			continue;
+		}
+
+		/* Stats update. */
+		u64_stats_update_begin(&r_vec->rx_sync);
+		r_vec->rx_pkts++;
+		r_vec->rx_bytes += pkt_len;
+		u64_stats_update_end(&r_vec->rx_sync);
+
+		xrxbuf->xdp->data += meta_len;
+		xrxbuf->xdp->data_end = xrxbuf->xdp->data + pkt_len;
+		xdp_set_data_meta_invalid(xrxbuf->xdp);
+		xsk_buff_dma_sync_for_cpu(xrxbuf->xdp, r_vec->xsk_pool);
+		net_prefetch(xrxbuf->xdp->data);
+
+		if (meta_len) {
+			if (unlikely(nfp_net_parse_meta(dp->netdev, &meta,
+							xrxbuf->xdp->data -
+							meta_len,
+							xrxbuf->xdp->data,
+							pkt_len, meta_len))) {
+				nn_dp_warn(dp, "Invalid RX packet metadata\n");
+				nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+				continue;
+			}
+
+			if (unlikely(meta.portid)) {
+				struct nfp_net *nn = netdev_priv(dp->netdev);
+
+				if (meta.portid != NFP_META_PORT_ID_CTRL) {
+					nfp_net_xsk_rx_skb(rx_ring, rxd, xrxbuf,
+							   &meta, pkt_len,
+							   false, skbs_polled);
+					continue;
+				}
+
+				nfp_app_ctrl_rx_raw(nn->app, xrxbuf->xdp->data,
+						    pkt_len);
+				nfp_net_xsk_rx_free(xrxbuf);
+				continue;
+			}
+		}
+
+		act = bpf_prog_run_xdp(xdp_prog, xrxbuf->xdp);
+
+		pkt_len = xrxbuf->xdp->data_end - xrxbuf->xdp->data;
+		pkt_off = xrxbuf->xdp->data - xrxbuf->xdp->data_hard_start;
+
+		switch (act) {
+		case XDP_PASS:
+			nfp_net_xsk_rx_skb(rx_ring, rxd, xrxbuf, &meta, pkt_len,
+					   true, skbs_polled);
+			break;
+		case XDP_TX:
+			if (!nfp_net_xsk_tx_xdp(dp, r_vec, rx_ring, tx_ring,
+						xrxbuf, pkt_len, pkt_off))
+				nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+			else
+				nfp_net_xsk_rx_unstash(xrxbuf);
+			break;
+		case XDP_REDIRECT:
+			if (xdp_do_redirect(dp->netdev, xrxbuf->xdp, xdp_prog)) {
+				nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+			} else {
+				nfp_net_xsk_rx_unstash(xrxbuf);
+				xdp_redir = true;
+			}
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(dp->netdev, xdp_prog, act);
+			fallthrough;
+		case XDP_ABORTED:
+			trace_xdp_exception(dp->netdev, xdp_prog, act);
+			fallthrough;
+		case XDP_DROP:
+			nfp_net_xsk_rx_drop(r_vec, xrxbuf);
+			break;
+		}
+	}
+
+	nfp_net_xsk_rx_ring_fill_freelist(r_vec->rx_ring);
+
+	if (xdp_redir)
+		xdp_do_flush_map();
+
+	if (tx_ring->wr_ptr_add)
+		nfp_net_tx_xmit_more_flush(tx_ring);
+
+	return pkts_polled;
+}
+
+static void nfp_net_xsk_pool_unmap(struct device *dev,
+				   struct xsk_buff_pool *pool)
+{
+	return xsk_pool_dma_unmap(pool, 0);
+}
+
+static int nfp_net_xsk_pool_map(struct device *dev, struct xsk_buff_pool *pool)
+{
+	return xsk_pool_dma_map(pool, dev, 0);
+}
+
+int nfp_net_xsk_setup_pool(struct net_device *netdev,
+			   struct xsk_buff_pool *pool, u16 queue_id)
+{
+	struct nfp_net *nn = netdev_priv(netdev);
+
+	struct xsk_buff_pool *prev_pool;
+	struct nfp_net_dp *dp;
+	int err;
+
+	/* Reject on old FWs so we can drop some checks on datapath. */
+	if (nn->dp.rx_offset != NFP_NET_CFG_RX_OFFSET_DYNAMIC)
+		return -EOPNOTSUPP;
+	if (!nn->dp.chained_metadata_format)
+		return -EOPNOTSUPP;
+
+	/* Install */
+	if (pool) {
+		err = nfp_net_xsk_pool_map(nn->dp.dev, pool);
+		if (err)
+			return err;
+	}
+
+	/* Reconfig/swap */
+	dp = nfp_net_clone_dp(nn);
+	if (!dp) {
+		err = -ENOMEM;
+		goto err_unmap;
+	}
+
+	prev_pool = dp->xsk_pools[queue_id];
+	dp->xsk_pools[queue_id] = pool;
+
+	err = nfp_net_ring_reconfig(nn, dp, NULL);
+	if (err)
+		goto err_unmap;
+
+	/* Uninstall */
+	if (prev_pool)
+		nfp_net_xsk_pool_unmap(nn->dp.dev, prev_pool);
+
+	return 0;
+err_unmap:
+	if (pool)
+		nfp_net_xsk_pool_unmap(nn->dp.dev, pool);
+
+	return err;
+}
+
+int nfp_net_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags)
+{
+	struct nfp_net *nn = netdev_priv(netdev);
+
+	/* queue_id comes from a zero-copy socket, installed with XDP_SETUP_XSK_POOL,
+	 * so it must be within our vector range.  Moreover, our napi structs
+	 * are statically allocated, so we can always kick them without worrying
+	 * if reconfig is in progress or interface down.
+	 */
+	napi_schedule(&nn->r_vecs[queue_id].napi);
+
+	return 0;
+}
+
+int nfp_net_xsk_poll(struct napi_struct *napi, int budget)
+{
+	struct nfp_net_r_vector *r_vec =
+		container_of(napi, struct nfp_net_r_vector, napi);
+	unsigned int pkts_polled, skbs = 0;
+
+	pkts_polled = nfp_net_xsk_rx(r_vec->rx_ring, budget, &skbs);
+
+	if (pkts_polled < budget) {
+		if (r_vec->tx_ring)
+			nfp_net_tx_complete(r_vec->tx_ring, budget);
+
+		if (!nfp_net_xsk_complete(r_vec->xdp_ring))
+			pkts_polled = budget;
+
+		nfp_net_xsk_tx(r_vec->xdp_ring);
+
+		if (pkts_polled < budget && napi_complete_done(napi, skbs))
+			nfp_net_irq_unmask(r_vec->nfp_net, r_vec->irq_entry);
+	}
+
+	return pkts_polled;
+}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_xsk.h b/drivers/net/ethernet/netronome/nfp/nfp_net_xsk.h
new file mode 100644
index 000000000000..5c8549cb3543
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_xsk.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (C) 2018 Netronome Systems, Inc */
+/* Copyright (C) 2021 Corigine, Inc */
+
+#ifndef _NFP_XSK_H_
+#define _NFP_XSK_H_
+
+#include <net/xdp_sock_drv.h>
+
+#define NFP_NET_XSK_TX_BATCH 16		/* XSK TX transmission batch size. */
+
+static inline bool nfp_net_has_xsk_pool_slow(struct nfp_net_dp *dp,
+					     unsigned int qid)
+{
+	return dp->xdp_prog && dp->xsk_pools[qid];
+}
+
+int nfp_net_xsk_setup_pool(struct net_device *netdev, struct xsk_buff_pool *pool,
+			   u16 queue_id);
+
+void nfp_net_xsk_tx_bufs_free(struct nfp_net_tx_ring *tx_ring);
+void nfp_net_xsk_rx_bufs_free(struct nfp_net_rx_ring *rx_ring);
+
+void nfp_net_xsk_rx_ring_fill_freelist(struct nfp_net_rx_ring *rx_ring);
+
+int nfp_net_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags);
+int nfp_net_xsk_poll(struct napi_struct *napi, int budget);
+
+#endif /* _NFP_XSK_H_ */
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/5] nfp: expose common functions to be used for AF_XDP
  2022-03-04 10:22 ` [PATCH net-next 1/5] nfp: expose common functions to be used for AF_XDP Simon Horman
@ 2022-03-04 13:10   ` patchwork-bot+netdevbpf
  0 siblings, 0 replies; 9+ messages in thread
From: patchwork-bot+netdevbpf @ 2022-03-04 13:10 UTC (permalink / raw)
  To: Simon Horman; +Cc: davem, kuba, niklas.soderlund, netdev, oss-drivers

Hello:

This series was applied to netdev/net-next.git (master)
by David S. Miller <davem@davemloft.net>:

On Fri,  4 Mar 2022 11:22:10 +0100 you wrote:
> From: Niklas Söderlund <niklas.soderlund@corigine.com>
> 
> There is some common functionality that can be reused in the upcoming
> AF_XDP support. Expose those functions in the header. While at it, mark
> some arguments of nfp_net_rx_csum() as const.
> 
> Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
> Signed-off-by: Simon Horman <simon.horman@corigine.com>
> 
> [...]

Here is the summary with links:
  - [net-next,1/5] nfp: expose common functions to be used for AF_XDP
    https://git.kernel.org/netdev/net-next/c/3cdb35fb9cd5
  - [net-next,2/5] nfp: wrap napi add/del logic
    https://git.kernel.org/netdev/net-next/c/58eb43635344
  - [net-next,3/5] nfp: xsk: add an array of xsk buffer pools to each data path
    https://git.kernel.org/netdev/net-next/c/543bd14fc8f6
  - [net-next,4/5] nfp: xsk: add configuration check for XSK socket chunk size
    https://git.kernel.org/netdev/net-next/c/9c91a3653fbb
  - [net-next,5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support
    https://git.kernel.org/netdev/net-next/c/6402528b7a0b

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support
  2022-03-04 10:22 ` [PATCH net-next 5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support Simon Horman
@ 2022-03-04 13:53   ` Maciej Fijalkowski
  2022-03-04 18:42     ` Niklas Söderlund
  0 siblings, 1 reply; 9+ messages in thread
From: Maciej Fijalkowski @ 2022-03-04 13:53 UTC (permalink / raw)
  To: Simon Horman
  Cc: David Miller, Jakub Kicinski, Niklas Söderlund, netdev, oss-drivers

On Fri, Mar 04, 2022 at 11:22:14AM +0100, Simon Horman wrote:
> From: Niklas Söderlund <niklas.soderlund@corigine.com>
> 
> This patch adds zero-copy Rx and Tx support for AF_XDP sockets. It does
> so by adding a separate NAPI poll function that is attached to a channel
> when an XSK socket is attached with XDP_SETUP_XSK_POOL, and restored when
> the XSK socket is terminated; this is done per channel.
> 
> Support for XDP_TX is implemented and the XDP buffer can safely be moved
> from the Rx to the Tx queue and correctly freed and returned to the XSK
> pool once it's transmitted.
> 
> Note that when AF_XDP zero-copy is enabled, the XDP action XDP_PASS
> will allocate a new buffer and copy the zero-copy frame prior to
> passing it to the kernel stack.
> 
> This patch is based on previous work by Jakub Kicinski.
> 
> Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
> Signed-off-by: Simon Horman <simon.horman@corigine.com>
> ---
>  drivers/net/ethernet/netronome/nfp/Makefile   |   1 +
>  drivers/net/ethernet/netronome/nfp/nfp_net.h  |  31 +-
>  .../ethernet/netronome/nfp/nfp_net_common.c   |  98 ++-
>  .../ethernet/netronome/nfp/nfp_net_debugfs.c  |  33 +-
>  .../net/ethernet/netronome/nfp/nfp_net_xsk.c  | 592 ++++++++++++++++++
>  .../net/ethernet/netronome/nfp/nfp_net_xsk.h  |  29 +
>  6 files changed, 756 insertions(+), 28 deletions(-)
>  create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.c
>  create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.h

(...)

> +
> +static void nfp_net_xsk_tx(struct nfp_net_tx_ring *tx_ring)
> +{
> +	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
> +	struct xdp_desc desc[NFP_NET_XSK_TX_BATCH];
> +	struct xsk_buff_pool *xsk_pool;
> +	struct nfp_net_tx_desc *txd;
> +	u32 pkts = 0, wr_idx;
> +	u32 i, got;
> +
> +	xsk_pool = r_vec->xsk_pool;
> +
> +	while (nfp_net_tx_space(tx_ring) >= NFP_NET_XSK_TX_BATCH) {
> +		for (i = 0; i < NFP_NET_XSK_TX_BATCH; i++)
> +			if (!xsk_tx_peek_desc(xsk_pool, &desc[i]))
> +				break;

Please use xsk_tx_peek_release_desc_batch() and avoid your own internal
batching mechanisms. We introduced the array of xdp_descs on xsk_buff_pool
side just for that purpose, so there's no need to have this array here on
the stack.

I suppose this will also simplify this ZC support.
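
Roughly something like this (untested sketch; assumes the current net-next
variant where the descriptor array lives in the pool as pool->tx_descs):

	while (nfp_net_tx_space(tx_ring) >= NFP_NET_XSK_TX_BATCH) {
		u32 i, got;

		/* Peek and release in one go; descs land in xsk_pool->tx_descs. */
		got = xsk_tx_peek_release_desc_batch(xsk_pool,
						     NFP_NET_XSK_TX_BATCH);
		if (!got)
			break;

		for (i = 0; i < got; i++) {
			struct xdp_desc *desc = &xsk_pool->tx_descs[i];

			wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);
			xsk_buff_raw_dma_sync_for_device(xsk_pool, desc->addr,
							 desc->len);
			/* fill txbufs[wr_idx] / txds[wr_idx] as in the patch */
		}

		tx_ring->wr_p += got;
		pkts += got;
	}
	/* no separate xsk_tx_release() afterwards, only the wmb() + QCP update */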

Dave, could you give us some time for review? It was on the list for only a
few hours or so and I see it's already applied :<

> +		got = i;
> +		if (!got)
> +			break;
> +
> +		wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);
> +		prefetchw(&tx_ring->txds[wr_idx]);
> +
> +		for (i = 0; i < got; i++)
> +			xsk_buff_raw_dma_sync_for_device(xsk_pool, desc[i].addr,
> +							 desc[i].len);
> +
> +		for (i = 0; i < got; i++) {
> +			wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);
> +
> +			tx_ring->txbufs[wr_idx].real_len = desc[i].len;
> +			tx_ring->txbufs[wr_idx].is_xsk_tx = false;
> +
> +			/* Build TX descriptor. */
> +			txd = &tx_ring->txds[wr_idx];
> +			nfp_desc_set_dma_addr(txd,
> +					      xsk_buff_raw_get_dma(xsk_pool,
> +								   desc[i].addr
> +								   ));
> +			txd->offset_eop = PCIE_DESC_TX_EOP;
> +			txd->dma_len = cpu_to_le16(desc[i].len);
> +			txd->data_len = cpu_to_le16(desc[i].len);
> +		}
> +
> +		tx_ring->wr_p += got;
> +		pkts += got;
> +	}
> +
> +	if (!pkts)
> +		return;
> +
> +	xsk_tx_release(xsk_pool);
> +	/* Ensure all records are visible before incrementing write counter. */
> +	wmb();
> +	nfp_qcp_wr_ptr_add(tx_ring->qcp_q, pkts);
> +}
> +
> +static bool
> +nfp_net_xsk_tx_xdp(const struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
> +		   struct nfp_net_rx_ring *rx_ring,
> +		   struct nfp_net_tx_ring *tx_ring,
> +		   struct nfp_net_xsk_rx_buf *xrxbuf, unsigned int pkt_len,
> +		   int pkt_off)
> +{
> +	struct xsk_buff_pool *pool = r_vec->xsk_pool;
> +	struct nfp_net_tx_buf *txbuf;
> +	struct nfp_net_tx_desc *txd;
> +	unsigned int wr_idx;
> +
> +	if (nfp_net_tx_space(tx_ring) < 1)
> +		return false;
> +
> +	xsk_buff_raw_dma_sync_for_device(pool, xrxbuf->dma_addr + pkt_off, pkt_len);
> +
> +	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
> +
> +	txbuf = &tx_ring->txbufs[wr_idx];
> +	txbuf->xdp = xrxbuf->xdp;
> +	txbuf->real_len = pkt_len;
> +	txbuf->is_xsk_tx = true;
> +
> +	/* Build TX descriptor */
> +	txd = &tx_ring->txds[wr_idx];
> +	txd->offset_eop = PCIE_DESC_TX_EOP;
> +	txd->dma_len = cpu_to_le16(pkt_len);
> +	nfp_desc_set_dma_addr(txd, xrxbuf->dma_addr + pkt_off);
> +	txd->data_len = cpu_to_le16(pkt_len);
> +
> +	txd->flags = 0;
> +	txd->mss = 0;
> +	txd->lso_hdrlen = 0;
> +
> +	tx_ring->wr_ptr_add++;
> +	tx_ring->wr_p++;
> +
> +	return true;
> +}
> +
> +static int nfp_net_rx_space(struct nfp_net_rx_ring *rx_ring)
> +{
> +	return rx_ring->cnt - rx_ring->wr_p + rx_ring->rd_p - 1;
> +}
> +
> +static void
> +nfp_net_xsk_rx_bufs_stash(struct nfp_net_rx_ring *rx_ring, unsigned int idx,
> +			  struct xdp_buff *xdp)
> +{
> +	unsigned int headroom;
> +
> +	headroom = xsk_pool_get_headroom(rx_ring->r_vec->xsk_pool);
> +
> +	rx_ring->rxds[idx].fld.reserved = 0;
> +	rx_ring->rxds[idx].fld.meta_len_dd = 0;
> +
> +	rx_ring->xsk_rxbufs[idx].xdp = xdp;
> +	rx_ring->xsk_rxbufs[idx].dma_addr =
> +		xsk_buff_xdp_get_frame_dma(xdp) + headroom;
> +}
> +
> +static void nfp_net_xsk_rx_unstash(struct nfp_net_xsk_rx_buf *rxbuf)
> +{
> +	rxbuf->dma_addr = 0;
> +	rxbuf->xdp = NULL;
> +}
> +
> +static void nfp_net_xsk_rx_free(struct nfp_net_xsk_rx_buf *rxbuf)
> +{
> +	if (rxbuf->xdp)
> +		xsk_buff_free(rxbuf->xdp);
> +
> +	nfp_net_xsk_rx_unstash(rxbuf);
> +}
> +
> +void nfp_net_xsk_rx_bufs_free(struct nfp_net_rx_ring *rx_ring)
> +{
> +	unsigned int i;
> +
> +	if (!rx_ring->cnt)
> +		return;
> +
> +	for (i = 0; i < rx_ring->cnt - 1; i++)
> +		nfp_net_xsk_rx_free(&rx_ring->xsk_rxbufs[i]);
> +}
> +
> +void nfp_net_xsk_rx_ring_fill_freelist(struct nfp_net_rx_ring *rx_ring)
> +{
> +	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
> +	struct xsk_buff_pool *pool = r_vec->xsk_pool;
> +	unsigned int wr_idx, wr_ptr_add = 0;
> +	struct xdp_buff *xdp;
> +
> +	while (nfp_net_rx_space(rx_ring)) {
> +		wr_idx = D_IDX(rx_ring, rx_ring->wr_p);
> +
> +		xdp = xsk_buff_alloc(pool);

Could you use xsk_buff_alloc_batch() with nfp_net_rx_space(rx_ring) passed
as the requested count of xdp_bufs instead of calling xsk_buff_alloc() in a
loop?

> +		if (!xdp)
> +			break;
> +
> +		nfp_net_xsk_rx_bufs_stash(rx_ring, wr_idx, xdp);
> +
> +		nfp_desc_set_dma_addr(&rx_ring->rxds[wr_idx].fld,
> +				      rx_ring->xsk_rxbufs[wr_idx].dma_addr);
> +
> +		rx_ring->wr_p++;
> +		wr_ptr_add++;
> +	}
> +
> +	/* Ensure all records are visible before incrementing write counter. */
> +	wmb();
> +	nfp_qcp_wr_ptr_add(rx_ring->qcp_fl, wr_ptr_add);
> +}
> +

(...)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support
  2022-03-04 13:53   ` Maciej Fijalkowski
@ 2022-03-04 18:42     ` Niklas Söderlund
  0 siblings, 0 replies; 9+ messages in thread
From: Niklas Söderlund @ 2022-03-04 18:42 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: Simon Horman, David Miller, Jakub Kicinski, netdev, oss-drivers

Hi Maciej,

Thanks for your feedback.

On 2022-03-04 14:53:30 +0100, Maciej Fijalkowski wrote:
> On Fri, Mar 04, 2022 at 11:22:14AM +0100, Simon Horman wrote:
> > From: Niklas Söderlund <niklas.soderlund@corigine.com>
> > 
> > This patch adds zero-copy Rx and Tx support for AF_XDP sockets. It does
> > so by adding a separate NAPI poll function that is attached to each
> > channel when an XSK socket is attached with XDP_SETUP_XSK_POOL, and
> > restored when the XSK socket is terminated; this is done per channel.
> > 
> > Support for XDP_TX is implemented and the XDP buffer can safely be moved
> > from the Rx to the Tx queue and correctly freed and returned to the XSK
> > pool once it's transmitted.
> > 
> > Note that when AF_XDP zero-copy is enabled, the XDP action XDP_PASS
> > will allocate a new buffer and copy the zero-copy frame before passing
> > it to the kernel stack.
> > 
> > This patch is based on previous work by Jakub Kicinski.
> > 
> > Signed-off-by: Niklas Söderlund <niklas.soderlund@corigine.com>
> > Signed-off-by: Simon Horman <simon.horman@corigine.com>
> > ---
> >  drivers/net/ethernet/netronome/nfp/Makefile   |   1 +
> >  drivers/net/ethernet/netronome/nfp/nfp_net.h  |  31 +-
> >  .../ethernet/netronome/nfp/nfp_net_common.c   |  98 ++-
> >  .../ethernet/netronome/nfp/nfp_net_debugfs.c  |  33 +-
> >  .../net/ethernet/netronome/nfp/nfp_net_xsk.c  | 592 ++++++++++++++++++
> >  .../net/ethernet/netronome/nfp/nfp_net_xsk.h  |  29 +
> >  6 files changed, 756 insertions(+), 28 deletions(-)
> >  create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.c
> >  create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_net_xsk.h
> 
> (...)
> 
> > +
> > +static void nfp_net_xsk_tx(struct nfp_net_tx_ring *tx_ring)
> > +{
> > +	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
> > +	struct xdp_desc desc[NFP_NET_XSK_TX_BATCH];
> > +	struct xsk_buff_pool *xsk_pool;
> > +	struct nfp_net_tx_desc *txd;
> > +	u32 pkts = 0, wr_idx;
> > +	u32 i, got;
> > +
> > +	xsk_pool = r_vec->xsk_pool;
> > +
> > +	while (nfp_net_tx_space(tx_ring) >= NFP_NET_XSK_TX_BATCH) {
> > +		for (i = 0; i < NFP_NET_XSK_TX_BATCH; i++)
> > +			if (!xsk_tx_peek_desc(xsk_pool, &desc[i]))
> > +				break;
> 
> Please use xsk_tx_peek_release_desc_batch() and avoid your own internal
> batching mechanism. We introduced the array of xdp_descs on the
> xsk_buff_pool side just for this purpose, so there is no need to keep
> this batch array on the stack here.
> 
> I suppose this will also simplify the ZC support.

This is a great suggestion. I will create a patch to use 
xsk_tx_peek_release_desc_batch() here.
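
Roughly what I have in mind, as a completely untested sketch -- it assumes
the net-next variant of xsk_tx_peek_release_desc_batch() that takes just
the pool and a budget and leaves the peeked descriptors in pool->tx_descs:

static void nfp_net_xsk_tx(struct nfp_net_tx_ring *tx_ring)
{
	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
	struct xsk_buff_pool *xsk_pool = r_vec->xsk_pool;
	struct xdp_desc *descs = xsk_pool->tx_descs;
	struct nfp_net_tx_desc *txd;
	u32 pkts, i, wr_idx;

	/* Peek and release in one call; the pool-side array replaces the
	 * on-stack batch and the budget caps us to the free TX space.
	 */
	pkts = xsk_tx_peek_release_desc_batch(xsk_pool,
					      nfp_net_tx_space(tx_ring));
	if (!pkts)
		return;

	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
	prefetchw(&tx_ring->txds[wr_idx]);

	for (i = 0; i < pkts; i++) {
		wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);

		xsk_buff_raw_dma_sync_for_device(xsk_pool, descs[i].addr,
						 descs[i].len);

		tx_ring->txbufs[wr_idx].real_len = descs[i].len;
		tx_ring->txbufs[wr_idx].is_xsk_tx = false;

		/* Build TX descriptor. */
		txd = &tx_ring->txds[wr_idx];
		nfp_desc_set_dma_addr(txd,
				      xsk_buff_raw_get_dma(xsk_pool,
							   descs[i].addr));
		txd->offset_eop = PCIE_DESC_TX_EOP;
		txd->dma_len = cpu_to_le16(descs[i].len);
		txd->data_len = cpu_to_le16(descs[i].len);
	}

	tx_ring->wr_p += pkts;

	/* Ensure all records are visible before incrementing write counter. */
	wmb();
	nfp_qcp_wr_ptr_add(tx_ring->qcp_q, pkts);
}

If the helper looks different on current net-next I will adjust, but the
idea is to let the pool do the batching instead of the driver.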

> 
> Dave, could you give us some time for review? It was on the list for only
> a few hours or so and I see it's already applied :<
> 
> > +		got = i;
> > +		if (!got)
> > +			break;
> > +
> > +		wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);
> > +		prefetchw(&tx_ring->txds[wr_idx]);
> > +
> > +		for (i = 0; i < got; i++)
> > +			xsk_buff_raw_dma_sync_for_device(xsk_pool, desc[i].addr,
> > +							 desc[i].len);
> > +
> > +		for (i = 0; i < got; i++) {
> > +			wr_idx = D_IDX(tx_ring, tx_ring->wr_p + i);
> > +
> > +			tx_ring->txbufs[wr_idx].real_len = desc[i].len;
> > +			tx_ring->txbufs[wr_idx].is_xsk_tx = false;
> > +
> > +			/* Build TX descriptor. */
> > +			txd = &tx_ring->txds[wr_idx];
> > +			nfp_desc_set_dma_addr(txd,
> > +					      xsk_buff_raw_get_dma(xsk_pool,
> > +								   desc[i].addr
> > +								   ));
> > +			txd->offset_eop = PCIE_DESC_TX_EOP;
> > +			txd->dma_len = cpu_to_le16(desc[i].len);
> > +			txd->data_len = cpu_to_le16(desc[i].len);
> > +		}
> > +
> > +		tx_ring->wr_p += got;
> > +		pkts += got;
> > +	}
> > +
> > +	if (!pkts)
> > +		return;
> > +
> > +	xsk_tx_release(xsk_pool);
> > +	/* Ensure all records are visible before incrementing write counter. */
> > +	wmb();
> > +	nfp_qcp_wr_ptr_add(tx_ring->qcp_q, pkts);
> > +}
> > +
> > +static bool
> > +nfp_net_xsk_tx_xdp(const struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
> > +		   struct nfp_net_rx_ring *rx_ring,
> > +		   struct nfp_net_tx_ring *tx_ring,
> > +		   struct nfp_net_xsk_rx_buf *xrxbuf, unsigned int pkt_len,
> > +		   int pkt_off)
> > +{
> > +	struct xsk_buff_pool *pool = r_vec->xsk_pool;
> > +	struct nfp_net_tx_buf *txbuf;
> > +	struct nfp_net_tx_desc *txd;
> > +	unsigned int wr_idx;
> > +
> > +	if (nfp_net_tx_space(tx_ring) < 1)
> > +		return false;
> > +
> > +	xsk_buff_raw_dma_sync_for_device(pool, xrxbuf->dma_addr + pkt_off, pkt_len);
> > +
> > +	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
> > +
> > +	txbuf = &tx_ring->txbufs[wr_idx];
> > +	txbuf->xdp = xrxbuf->xdp;
> > +	txbuf->real_len = pkt_len;
> > +	txbuf->is_xsk_tx = true;
> > +
> > +	/* Build TX descriptor */
> > +	txd = &tx_ring->txds[wr_idx];
> > +	txd->offset_eop = PCIE_DESC_TX_EOP;
> > +	txd->dma_len = cpu_to_le16(pkt_len);
> > +	nfp_desc_set_dma_addr(txd, xrxbuf->dma_addr + pkt_off);
> > +	txd->data_len = cpu_to_le16(pkt_len);
> > +
> > +	txd->flags = 0;
> > +	txd->mss = 0;
> > +	txd->lso_hdrlen = 0;
> > +
> > +	tx_ring->wr_ptr_add++;
> > +	tx_ring->wr_p++;
> > +
> > +	return true;
> > +}
> > +
> > +static int nfp_net_rx_space(struct nfp_net_rx_ring *rx_ring)
> > +{
> > +	return rx_ring->cnt - rx_ring->wr_p + rx_ring->rd_p - 1;
> > +}
> > +
> > +static void
> > +nfp_net_xsk_rx_bufs_stash(struct nfp_net_rx_ring *rx_ring, unsigned int idx,
> > +			  struct xdp_buff *xdp)
> > +{
> > +	unsigned int headroom;
> > +
> > +	headroom = xsk_pool_get_headroom(rx_ring->r_vec->xsk_pool);
> > +
> > +	rx_ring->rxds[idx].fld.reserved = 0;
> > +	rx_ring->rxds[idx].fld.meta_len_dd = 0;
> > +
> > +	rx_ring->xsk_rxbufs[idx].xdp = xdp;
> > +	rx_ring->xsk_rxbufs[idx].dma_addr =
> > +		xsk_buff_xdp_get_frame_dma(xdp) + headroom;
> > +}
> > +
> > +static void nfp_net_xsk_rx_unstash(struct nfp_net_xsk_rx_buf *rxbuf)
> > +{
> > +	rxbuf->dma_addr = 0;
> > +	rxbuf->xdp = NULL;
> > +}
> > +
> > +static void nfp_net_xsk_rx_free(struct nfp_net_xsk_rx_buf *rxbuf)
> > +{
> > +	if (rxbuf->xdp)
> > +		xsk_buff_free(rxbuf->xdp);
> > +
> > +	nfp_net_xsk_rx_unstash(rxbuf);
> > +}
> > +
> > +void nfp_net_xsk_rx_bufs_free(struct nfp_net_rx_ring *rx_ring)
> > +{
> > +	unsigned int i;
> > +
> > +	if (!rx_ring->cnt)
> > +		return;
> > +
> > +	for (i = 0; i < rx_ring->cnt - 1; i++)
> > +		nfp_net_xsk_rx_free(&rx_ring->xsk_rxbufs[i]);
> > +}
> > +
> > +void nfp_net_xsk_rx_ring_fill_freelist(struct nfp_net_rx_ring *rx_ring)
> > +{
> > +	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
> > +	struct xsk_buff_pool *pool = r_vec->xsk_pool;
> > +	unsigned int wr_idx, wr_ptr_add = 0;
> > +	struct xdp_buff *xdp;
> > +
> > +	while (nfp_net_rx_space(rx_ring)) {
> > +		wr_idx = D_IDX(rx_ring, rx_ring->wr_p);
> > +
> > +		xdp = xsk_buff_alloc(pool);
> 
> Could you use xsk_buff_alloc_batch() with nfp_net_rx_space(rx_ring) passed
> as the requested count of xdp_bufs instead of calling xsk_buff_alloc() in a
> loop?

I think I can; I will give it a try and post a patch. Thanks for the
pointer.
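
For reference, a rough and equally untested sketch of what I will try; the
NFP_NET_XSK_RX_BATCH define sizing the on-stack xdp_buff pointer array is
made up for this sketch:

void nfp_net_xsk_rx_ring_fill_freelist(struct nfp_net_rx_ring *rx_ring)
{
	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
	struct xsk_buff_pool *pool = r_vec->xsk_pool;
	struct xdp_buff *xdp_bufs[NFP_NET_XSK_RX_BATCH];
	unsigned int wr_idx, wr_ptr_add = 0;
	u32 i, got;

	while (nfp_net_rx_space(rx_ring)) {
		/* Fill the freelist one batch at a time instead of one
		 * buffer per xsk_buff_alloc() call.
		 */
		got = xsk_buff_alloc_batch(pool, xdp_bufs,
					   min_t(u32, nfp_net_rx_space(rx_ring),
						 NFP_NET_XSK_RX_BATCH));
		if (!got)
			break;

		for (i = 0; i < got; i++) {
			wr_idx = D_IDX(rx_ring, rx_ring->wr_p);

			nfp_net_xsk_rx_bufs_stash(rx_ring, wr_idx, xdp_bufs[i]);
			nfp_desc_set_dma_addr(&rx_ring->rxds[wr_idx].fld,
					      rx_ring->xsk_rxbufs[wr_idx].dma_addr);

			rx_ring->wr_p++;
			wr_ptr_add++;
		}
	}

	if (!wr_ptr_add)
		return;

	/* Ensure all records are visible before incrementing write counter. */
	wmb();
	nfp_qcp_wr_ptr_add(rx_ring->qcp_fl, wr_ptr_add);
}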

> 
> > +		if (!xdp)
> > +			break;
> > +
> > +		nfp_net_xsk_rx_bufs_stash(rx_ring, wr_idx, xdp);
> > +
> > +		nfp_desc_set_dma_addr(&rx_ring->rxds[wr_idx].fld,
> > +				      rx_ring->xsk_rxbufs[wr_idx].dma_addr);
> > +
> > +		rx_ring->wr_p++;
> > +		wr_ptr_add++;
> > +	}
> > +
> > +	/* Ensure all records are visible before incrementing write counter. */
> > +	wmb();
> > +	nfp_qcp_wr_ptr_add(rx_ring->qcp_fl, wr_ptr_add);
> > +}
> > +
> 
> (...)

-- 
Regards,
Niklas Söderlund

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-03-04 18:42 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-04 10:22 Add AF_XDP zero-copy support for NFP Simon Horman
2022-03-04 10:22 ` [PATCH net-next 1/5] nfp: expose common functions to be used for AF_XDP Simon Horman
2022-03-04 13:10   ` patchwork-bot+netdevbpf
2022-03-04 10:22 ` [PATCH net-next 2/5] nfp: wrap napi add/del logic Simon Horman
2022-03-04 10:22 ` [PATCH net-next 3/5] nfp: xsk: add an array of xsk buffer pools to each data path Simon Horman
2022-03-04 10:22 ` [PATCH net-next 4/5] nfp: xsk: add configuration check for XSK socket chunk size Simon Horman
2022-03-04 10:22 ` [PATCH net-next 5/5] nfp: xsk: add AF_XDP zero-copy Rx and Tx support Simon Horman
2022-03-04 13:53   ` Maciej Fijalkowski
2022-03-04 18:42     ` Niklas Söderlund
