From: "Björn Töpel" <bjorn.topel@gmail.com>
To: bjorn.topel@gmail.com, magnus.karlsson@intel.com,
	magnus.karlsson@gmail.com, alexander.h.duyck@intel.com,
	alexander.duyck@gmail.com, ast@kernel.org, brouer@redhat.com,
	daniel@iogearbox.net, netdev@vger.kernel.org,
	jesse.brandeburg@intel.com, anjali.singhai@intel.com,
	peter.waskiewicz.jr@intel.com
Cc: michael.lundkvist@ericsson.com, willemdebruijn.kernel@gmail.com,
	john.fastabend@gmail.com, jakub.kicinski@netronome.com,
	neerav.parikh@intel.com, mykyta.iziumtsev@linaro.org,
	francois.ozog@linaro.org, ilias.apalodimas@linaro.org,
	brian.brooks@linaro.org, u9012063@gmail.com,
	pavel@fastnetmon.com, qi.z.zhang@intel.com
Subject: [PATCH bpf-next 10/11] i40e: add AF_XDP zero-copy Tx support
Date: Tue, 28 Aug 2018 14:44:34 +0200	[thread overview]
Message-ID: <20180828124435.30578-11-bjorn.topel@gmail.com> (raw)
In-Reply-To: <20180828124435.30578-1-bjorn.topel@gmail.com>

From: Magnus Karlsson <magnus.karlsson@intel.com>

This patch adds zero-copy Tx support for AF_XDP sockets. It implements
the ndo_xsk_async_xmit netdev ndo and performs all the Tx logic from a
NAPI context. This means pulling egress packets from the Tx ring,
placing the frames on the NIC HW descriptor ring and completing sent
frames back to the application via the completion ring.
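
In condensed form, the flow added in this patch looks as follows (a
sketch only: i40e_fill_tx_hw_desc() is a made-up stand-in for the
descriptor setup in i40e_xmit_zc(), and ring wrap, stats and the
ring-full case are elided):

    while (budget-- > 0) {
            /* Pull the next frame the application has posted on
             * the AF_XDP Tx ring.
             */
            if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
                    break;

            /* Place the frame on the NIC HW descriptor ring; a
             * stand-in for the descriptor setup in i40e_xmit_zc().
             */
            i40e_fill_tx_hw_desc(xdp_ring, dma, len);
    }

    /* Later, i40e_clean_xdp_tx_irq() hands completed frames back
     * to the application via xsk_umem_complete_tx().
     */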

The regular XDP Tx ring is used for AF_XDP as well. The rationale for
this is as follows: XDP_REDIRECT guarantees mutual exclusion between
different NAPI contexts based on CPU id. In other words, a netdev can
XDP_REDIRECT to another netdev with a different NAPI context, since
the operation is bound to a specific core and each core has its own
hardware ring.

Since the AF_XDP Tx action runs in the same NAPI context and uses the
same ring, it is protected from concurrent XDP_REDIRECT actions by the
exact same mechanism.
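
For reference, the existing XDP_REDIRECT transmit path selects the
ring like this (condensed from i40e_xdp_xmit() in i40e_txrx.c; the
real code also bounds-checks the queue index against the number of
queue pairs):

    /* The XDP Tx ring is selected by CPU id, so two CPUs never
     * post to the same hardware ring concurrently.
     */
    unsigned int queue_index = smp_processor_id();
    struct i40e_ring *xdp_ring = vsi->xdp_rings[queue_index];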

As with AF_XDP Rx, all AF_XDP Tx specific functions are added to
i40e_xsk.c.
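
From user space, this path is driven by issuing a send on the AF_XDP
socket once descriptors have been produced on the Tx ring; the xsk
core then invokes ndo_xsk_async_xmit for the bound queue. A minimal
sketch of the kick (mirroring kick_tx() in samples/bpf/xdpsock_user.c;
xsk_fd is assumed to be a bound AF_XDP socket):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/socket.h>

    /* Kick the kernel Tx path for an AF_XDP socket whose Tx ring
     * already holds produced descriptors. The zero-length sendto()
     * ends up in i40e_xsk_async_xmit() for the queue the socket is
     * bound to.
     */
    static void kick_tx(int xsk_fd)
    {
            int ret = sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);

            if (ret < 0 && errno != EAGAIN)
                    perror("sendto");
    }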

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |   4 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |   6 +-
 drivers/net/ethernet/intel/i40e/i40e_xsk.c  | 174 ++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_xsk.h  |   4 +
 4 files changed, 187 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 848eea7c84db..5da7eb0fe4ae 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3074,6 +3074,9 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring)
 	i40e_status err = 0;
 	u32 qtx_ctl = 0;
 
+	if (ring_is_xdp(ring))
+		ring->xsk_umem = i40e_xsk_umem(ring);
+
 	/* some ATR related tx ring init */
 	if (vsi->back->flags & I40E_FLAG_FD_ATR_ENABLED) {
 		ring->atr_sample_rate = vsi->back->atr_sample_rate;
@@ -12185,6 +12188,7 @@ static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_bridge_setlink	= i40e_ndo_bridge_setlink,
 	.ndo_bpf		= i40e_xdp,
 	.ndo_xdp_xmit		= i40e_xdp_xmit,
+	.ndo_xsk_async_xmit	= i40e_xsk_async_xmit,
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 11e201fcb57a..37bd4e50ccde 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2597,7 +2597,11 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
 	 * budget and be more aggressive about cleaning up the Tx descriptors.
 	 */
 	i40e_for_each_ring(ring, q_vector->tx) {
-		if (!i40e_clean_tx_irq(vsi, ring, budget)) {
+		bool wd = ring->xsk_umem ?
+			  i40e_clean_xdp_tx_irq(vsi, ring, budget) :
+			  i40e_clean_tx_irq(vsi, ring, budget);
+
+		if (!wd) {
 			clean_complete = false;
 			continue;
 		}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index bf502f2307c2..94947a826bc3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -659,3 +659,177 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	return failure ? budget : (int)total_rx_packets;
 }
 
+/**
+ * i40e_xmit_zc - Performs AF_XDP zero-copy Tx
+ * @xdp_ring: XDP Tx ring
+ * @budget: NAPI budget
+ *
+ * Returns true if the work is finished.
+ **/
+static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
+{
+	unsigned int total_packets = 0;
+	struct i40e_tx_buffer *tx_bi;
+	struct i40e_tx_desc *tx_desc;
+	bool work_done = true;
+	dma_addr_t dma;
+	u32 len;
+
+	while (budget-- > 0) {
+		if (unlikely(!I40E_DESC_UNUSED(xdp_ring))) {
+			xdp_ring->tx_stats.tx_busy++;
+			work_done = false;
+			break;
+		}
+
+		if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
+			break;
+
+		dma_sync_single_for_device(xdp_ring->dev, dma, len,
+					   DMA_BIDIRECTIONAL);
+
+		tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
+		tx_bi->bytecount = len;
+
+		tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
+		tx_desc->buffer_addr = cpu_to_le64(dma);
+		tx_desc->cmd_type_offset_bsz =
+			build_ctob(I40E_TX_DESC_CMD_ICRC
+				   | I40E_TX_DESC_CMD_EOP,
+				   0, len, 0);
+		total_packets++;
+
+		xdp_ring->next_to_use++;
+		if (xdp_ring->next_to_use == xdp_ring->count)
+			xdp_ring->next_to_use = 0;
+	}
+
+	if (total_packets > 0) {
+		/* Request an interrupt for the last frame and bump tail ptr. */
+		tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
+						 I40E_TXD_QW1_CMD_SHIFT);
+		i40e_xdp_ring_update_tail(xdp_ring);
+
+		xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
+	}
+
+	return !!budget && work_done;
+}
+
+/**
+ * i40e_clean_xdp_tx_buffer - Frees and unmaps an XDP Tx entry
+ * @tx_ring: XDP Tx ring
+ * @tx_bi: Tx buffer info to clean
+ **/
+static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring,
+				     struct i40e_tx_buffer *tx_bi)
+{
+	xdp_return_frame(tx_bi->xdpf);
+	dma_unmap_single(tx_ring->dev,
+			 dma_unmap_addr(tx_bi, dma),
+			 dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
+	dma_unmap_len_set(tx_bi, len, 0);
+}
+
+/**
+ * i40e_clean_xdp_tx_irq - Completes AF_XDP entries, and cleans XDP entries
+ * @vsi: Current VSI
+ * @tx_ring: XDP Tx ring
+ * @napi_budget: NAPI budget
+ *
+ * Returns true if cleanup/transmission is done.
+ **/
+bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
+			   struct i40e_ring *tx_ring, int napi_budget)
+{
+	unsigned int ntc, total_bytes = 0, budget = vsi->work_limit;
+	u32 i, completed_frames, frames_ready, xsk_frames = 0;
+	struct xdp_umem *umem = tx_ring->xsk_umem;
+	u32 head_idx = i40e_get_head(tx_ring);
+	bool work_done = true, xmit_done;
+	struct i40e_tx_buffer *tx_bi;
+
+	if (head_idx < tx_ring->next_to_clean)
+		head_idx += tx_ring->count;
+	frames_ready = head_idx - tx_ring->next_to_clean;
+
+	if (frames_ready == 0) {
+		goto out_xmit;
+	} else if (frames_ready > budget) {
+		completed_frames = budget;
+		work_done = false;
+	} else {
+		completed_frames = frames_ready;
+	}
+
+	ntc = tx_ring->next_to_clean;
+
+	for (i = 0; i < completed_frames; i++) {
+		tx_bi = &tx_ring->tx_bi[ntc];
+
+		if (tx_bi->xdpf)
+			i40e_clean_xdp_tx_buffer(tx_ring, tx_bi);
+		else
+			xsk_frames++;
+
+		tx_bi->xdpf = NULL;
+		total_bytes += tx_bi->bytecount;
+
+		if (++ntc >= tx_ring->count)
+			ntc = 0;
+	}
+
+	tx_ring->next_to_clean += completed_frames;
+	if (unlikely(tx_ring->next_to_clean >= tx_ring->count))
+		tx_ring->next_to_clean -= tx_ring->count;
+
+	if (xsk_frames)
+		xsk_umem_complete_tx(umem, xsk_frames);
+
+	i40e_arm_wb(tx_ring, vsi, budget);
+	i40e_update_tx_stats(tx_ring, completed_frames, total_bytes);
+
+out_xmit:
+	xmit_done = i40e_xmit_zc(tx_ring, budget);
+
+	return work_done && xmit_done;
+}
+
+/**
+ * i40e_xsk_async_xmit - Implements the ndo_xsk_async_xmit
+ * @dev: the netdevice
+ * @queue_id: queue id to wake up
+ *
+ * Returns <0 for errors, 0 otherwise.
+ **/
+int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id)
+{
+	struct i40e_netdev_priv *np = netdev_priv(dev);
+	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_ring *ring;
+
+	if (test_bit(__I40E_VSI_DOWN, vsi->state))
+		return -ENETDOWN;
+
+	if (!i40e_enabled_xdp_vsi(vsi))
+		return -ENXIO;
+
+	if (queue_id >= vsi->num_queue_pairs)
+		return -ENXIO;
+
+	if (!vsi->xdp_rings[queue_id]->xsk_umem)
+		return -ENXIO;
+
+	ring = vsi->xdp_rings[queue_id];
+
+	/* The idea here is that if NAPI is running, mark a miss, so
+	 * it will run again. If not, trigger an interrupt and
+	 * schedule the NAPI from interrupt context. If NAPI were
+	 * scheduled here, the interrupt affinity would not be
+	 * honored.
+	 */
+	if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi))
+		i40e_force_wb(vsi, ring->q_vector);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
index 427a844f78a7..9038c5d5cf08 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
@@ -18,4 +18,8 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
 bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
 int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
 
+bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
+			   struct i40e_ring *tx_ring, int napi_budget);
+int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id);
+
 #endif /* _I40E_XSK_H_ */
-- 
2.17.1
