netdev.vger.kernel.org archive mirror
* [PATCH RFC] xdp: Support zero-copy XDP_TX from AF_XDP sockets.
@ 2019-04-17 19:58 Jonathan Lemon
  2019-04-18 10:10 ` Björn Töpel
From: Jonathan Lemon @ 2019-04-17 19:58 UTC (permalink / raw)
  To: netdev; +Cc: bjorn.topel, magnus.karlsson, kernel-team

When the XDP program attached to a zero-copy AF_XDP socket returns XDP_TX,
queue the umem frame on the XDP TX ring, and arrange for it to be released
via the ZCA free routine, which should place it back onto the reuseq.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c  | 52 +++++++++++++++++++--
 include/net/xdp.h                           | 20 ++++++--
 3 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 100e92d2982f..3e7954277737 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -274,6 +274,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
 #define I40E_TX_FLAGS_TSYN		BIT(8)
 #define I40E_TX_FLAGS_FD_SB		BIT(9)
 #define I40E_TX_FLAGS_UDP_TUNNEL	BIT(10)
+#define I40E_TX_FLAGS_ZC_FRAME		BIT(11)
 #define I40E_TX_FLAGS_VLAN_MASK		0xffff0000
 #define I40E_TX_FLAGS_VLAN_PRIO_MASK	0xe0000000
 #define I40E_TX_FLAGS_VLAN_PRIO_SHIFT	29
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index d2e212d007c3..16a31c57906a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -188,7 +188,6 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
 static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
 {
 	int err, result = I40E_XDP_PASS;
-	struct i40e_ring *xdp_ring;
 	struct bpf_prog *xdp_prog;
 	u32 act;
 
@@ -202,9 +201,8 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
 	switch (act) {
 	case XDP_PASS:
 		break;
-	case XDP_TX:
-		xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
-		result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
+	case XDP_TX:
+		result = i40e_xmit_rcvd_zc(rx_ring, xdp);
 		break;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
@@ -623,6 +621,48 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	return failure ? budget : (int)total_rx_packets;
 }
 
+static int i40e_xmit_rcvd_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
+{
+	struct i40e_ring *xdp_ring;
+	struct i40e_tx_desc *tx_desc;
+	struct i40e_tx_buffer *tx_bi;
+	struct xdp_frame *xdpf;
+	dma_addr_t dma;
+
+	xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
+
+	if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
+		xdp_ring->tx_stats.tx_busy++;
+		return I40E_XDP_CONSUMED;
+	}
+	xdpf = convert_to_xdp_frame_keep_zc(xdp);
+	if (unlikely(!xdpf))
+		return I40E_XDP_CONSUMED;
+	xdpf->handle = xdp->handle;
+
+	dma = xdp_umem_get_dma(rx_ring->xsk_umem, xdp->handle);
+	tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
+	tx_bi->bytecount = xdpf->len;
+	tx_bi->gso_segs = 1;
+	tx_bi->xdpf = xdpf;
+	tx_bi->tx_flags = I40E_TX_FLAGS_ZC_FRAME;
+
+	tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
+	tx_desc->buffer_addr = cpu_to_le64(dma);
+	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
+						  I40E_TX_DESC_CMD_EOP,
+						  0, xdpf->len, 0);
+	smp_wmb();
+
+	xdp_ring->next_to_use++;
+	if (xdp_ring->next_to_use == xdp_ring->count)
+		xdp_ring->next_to_use = 0;
+
+	tx_bi->next_to_watch = tx_desc;
+
+	return I40E_XDP_TX;
+}
+
 /**
  * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
  * @xdp_ring: XDP Tx ring
@@ -689,6 +729,10 @@ static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring,
 				     struct i40e_tx_buffer *tx_bi)
 {
 	xdp_return_frame(tx_bi->xdpf);
+	if (tx_bi->tx_flags & I40E_TX_FLAGS_ZC_FRAME) {
+		tx_bi->tx_flags = 0;
+		return;
+	}
 	dma_unmap_single(tx_ring->dev,
 			 dma_unmap_addr(tx_bi, dma),
 			 dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0f25b3675c5c..191359c5ebdd 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -82,6 +82,7 @@ struct xdp_frame {
 	 */
 	struct xdp_mem_info mem;
 	struct net_device *dev_rx; /* used by cpumap */
+	unsigned long handle;
 };
 
 /* Clear kernel pointers in xdp_frame */
@@ -95,15 +96,12 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
 
 /* Convert xdp_buff to xdp_frame */
 static inline
-struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+struct xdp_frame *__convert_to_xdp_frame(struct xdp_buff *xdp)
 {
 	struct xdp_frame *xdp_frame;
 	int metasize;
 	int headroom;
 
-	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
-		return xdp_convert_zc_to_xdp_frame(xdp);
-
 	/* Assure headroom is available for storing info */
 	headroom = xdp->data - xdp->data_hard_start;
 	metasize = xdp->data - xdp->data_meta;
@@ -125,6 +123,20 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 	return xdp_frame;
 }
 
+static inline
+struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+{
+	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+		return xdp_convert_zc_to_xdp_frame(xdp);
+	return __convert_to_xdp_frame(xdp);
+}
+
+static inline
+struct xdp_frame *convert_to_xdp_frame_keep_zc(struct xdp_buff *xdp)
+{
+	return __convert_to_xdp_frame(xdp);
+}
+
 void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
-- 
2.17.1



* Re: [PATCH RFC] xdp: Support zero-copy XDP_TX from AF_XDP sockets.
  2019-04-17 19:58 [PATCH RFC] xdp: Support zero-copy XDP_TX from AF_XDP sockets Jonathan Lemon
@ 2019-04-18 10:10 ` Björn Töpel
  2019-04-18 17:55   ` Jonathan Lemon
From: Björn Töpel @ 2019-04-18 10:10 UTC (permalink / raw)
  To: Jonathan Lemon
  Cc: Netdev, Karlsson, Magnus, kernel-team, Jesper Dangaard Brouer,
	maciej.fijalkowski

On Wed, 17 Apr 2019 at 21:58, Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>
> When the XDP program attached to a zero-copy AF_XDP socket returns XDP_TX,
> queue the umem frame on the XDP TX ring, and arrange for it to be released
> via the ZCA free routine, which should place it back onto the reuseq.
>

There are a bunch of compiler errors, but I'll leave them out of the comments!

> Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>  drivers/net/ethernet/intel/i40e/i40e_xsk.c  | 52 +++++++++++++++++++--
>  include/net/xdp.h                           | 20 ++++++--
>  3 files changed, 65 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index 100e92d2982f..3e7954277737 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -274,6 +274,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
>  #define I40E_TX_FLAGS_TSYN             BIT(8)
>  #define I40E_TX_FLAGS_FD_SB            BIT(9)
>  #define I40E_TX_FLAGS_UDP_TUNNEL       BIT(10)
> +#define I40E_TX_FLAGS_ZC_FRAME         BIT(11)
>  #define I40E_TX_FLAGS_VLAN_MASK                0xffff0000
>  #define I40E_TX_FLAGS_VLAN_PRIO_MASK   0xe0000000
>  #define I40E_TX_FLAGS_VLAN_PRIO_SHIFT  29
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> index d2e212d007c3..16a31c57906a 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> @@ -188,7 +188,6 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
>  static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
>  {
>         int err, result = I40E_XDP_PASS;
> -       struct i40e_ring *xdp_ring;
>         struct bpf_prog *xdp_prog;
>         u32 act;
>
> @@ -202,9 +201,8 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
>         switch (act) {
>         case XDP_PASS:
>                 break;
> -       case XDP_TX:
> -               xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> -               result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
> +       case XDP_TX
> +               result = i40e_xmit_rcvd_zc(rx_ring, xdp);
>                 break;
>         case XDP_REDIRECT:
>                 err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
> @@ -623,6 +621,48 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
>         return failure ? budget : (int)total_rx_packets;
>  }
>
> +static int i40e_xmit_rcvd_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
> +{
> +       struct i40e_ring *xdp_ring;
> +       struct i40e_tx_desc *tx_desc;
> +       struct i40e_tx_buffer *tx_bi;
> +       struct xdp_frame *xdpf;
> +       dma_addr_t dma;
> +
> +       xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> +
> +       if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
> +               xdp_ring->tx_stats.tx_busy++;
> +               return I40E_XDP_CONSUMED;
> +       }
> +       xpdf = convert_to_xdp_frame_keep_zc(xdp);
> +       if (unlikely(!xpdf)
> +               return I40E_XDP_CONSUMED;
> +       xpdf->handle = xdp->handle;
> +
> +       dma = xdp_umem_get_dma(rx_ring->xsk_umem, xdp->handle);
> +       tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
> +       tx_bi->bytecount = xpdf->len;
> +       tx_bi->gso_segs = 1;
> +       tx_bi->xdpf = xdpf;
> +       tx_bi->tx_flags = I40E_TX_FLAGS_ZC_FRAME;
> +
> +       tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
> +       tx_desc->buffer_addr = cpu_to_le64(dma);
> +       tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
> +                                                 I40E_TX_DESC_CMD_EOP,
> +                                                 0, xpdf->len, 0);
> +       smp_wmb();
> +
> +       xdp_ring->next_to_use++;
> +       if (xdp_ring->next_to_use == xdp_ring->count)
> +               xdp_ring->next_to_use = 0;
> +
> +       tx_bi->next_to_watch = tx_desc;
> +
> +       return I40E_XDP_TX;
> +}

What you're basically doing here is an AF_XDP Tx, but triggered from
the Rx path, and instead of being completed to the completion ring
(after the packet is sent), the buffer is recycled to the Rx HW ring
(via zca_free). I like the idea, but we need more plumbing first. Let
me expand: unfortunately, the current recycle mechanism requires that
at the point of recycling there is space in the Rx ring. In the XDP_TX
case, there's no completion ring, and we cannot guarantee that there's
space on the Rx ring (since Rx and Tx are asynchronous). IOW, Rx
recycling can currently *only* be done in an Rx context.
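
To make the constraint concrete, the recycle path today does roughly
this (simplified sketch from memory, not verbatim driver code):

/* Recycling writes the freed umem handle straight back into the Rx
 * ring's software array at next_to_alloc, i.e. it assumes that slot
 * is free.  There is no fallback if the Rx ring is already full.
 */
static void i40e_zca_free(struct zero_copy_allocator *alloc,
			  unsigned long handle)
{
	struct i40e_ring *rx_ring = container_of(alloc, struct i40e_ring, zca);
	struct i40e_rx_buffer *bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
	struct xdp_umem *umem = rx_ring->xsk_umem;
	u16 nta = rx_ring->next_to_alloc + 1;

	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;

	handle &= umem->chunk_mask;
	bi->dma = xdp_umem_get_dma(umem, handle);
	bi->addr = xdp_umem_get_data(umem, handle);
	bi->handle = handle + umem->headroom;
}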

What I would like to do is move i40e-xsk to Jesper's page_pool,
instead of the existing recycle mechanism. Then we could return the
descriptor to the pool if the Rx ring doesn't have space for the
completed/sent buffer.

TL;DR version: Passing zc-frames in XDP_TX cannot be done properly
until the Rx recycle mechanism is more robust. :-(

(I think Maciej is looking into using the page_pool on the ice driver.)

But again, I like the idea!


Thanks,
Björn


> +
>  /**
>   * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
>   * @xdp_ring: XDP Tx ring
> @@ -689,6 +729,10 @@ static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring,
>                                      struct i40e_tx_buffer *tx_bi)
>  {
>         xdp_return_frame(tx_bi->xdpf);
> +       if (tx_bi->tx_flags & I40E_TX_FLAGS_ZC_FRAME) {
> +               tx_bi->tx_flags = 0;
> +               return;
> +       }
>         dma_unmap_single(tx_ring->dev,
>                          dma_unmap_addr(tx_bi, dma),
>                          dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 0f25b3675c5c..191359c5ebdd 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -82,6 +82,7 @@ struct xdp_frame {
>          */
>         struct xdp_mem_info mem;
>         struct net_device *dev_rx; /* used by cpumap */
> +       unsigned long handle;
>  };
>
>  /* Clear kernel pointers in xdp_frame */
> @@ -95,15 +96,12 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
>
>  /* Convert xdp_buff to xdp_frame */
>  static inline
> -struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +struct xdp_frame *__convert_to_xdp_frame(struct xdp_buff *xdp)
>  {
>         struct xdp_frame *xdp_frame;
>         int metasize;
>         int headroom;
>
> -       if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> -               return xdp_convert_zc_to_xdp_frame(xdp);
> -
>         /* Assure headroom is available for storing info */
>         headroom = xdp->data - xdp->data_hard_start;
>         metasize = xdp->data - xdp->data_meta;
> @@ -125,6 +123,20 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
>         return xdp_frame;
>  }
>
> +static inline
> +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +{
> +       if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> +               return xdp_convert_zc_to_xdp_frame(xdp);
> +       return __convert_to_xdp_frame(xdp);
> +}
> +
> +static inline
> +struct xdp_frame *convert_to_xdp_frame_keep_zc(struct xdp_buff *xdp)
> +{
> +       return __convert_to_xdp_frame(xdp);
> +}
> +
>  void xdp_return_frame(struct xdp_frame *xdpf);
>  void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
>  void xdp_return_buff(struct xdp_buff *xdp);
> --
> 2.17.1
>


* Re: [PATCH RFC] xdp: Support zero-copy XDP_TX from AF_XDP sockets.
  2019-04-18 10:10 ` Björn Töpel
@ 2019-04-18 17:55   ` Jonathan Lemon
  2019-04-23  6:14     ` Björn Töpel
From: Jonathan Lemon @ 2019-04-18 17:55 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Netdev, Karlsson, Magnus, kernel-team, Jesper Dangaard Brouer,
	maciej.fijalkowski

On 18 Apr 2019, at 3:10, Björn Töpel wrote:

> On Wed, 17 Apr 2019 at 21:58, Jonathan Lemon 
> <jonathan.lemon@gmail.com> wrote:
>>
>> When the XDP program attached to a zero-copy AF_XDP socket returns 
>> XDP_TX,
>> queue the umem frame on the XDP TX ring, and arrange for it to be 
>> released
>> via the ZCA free routine, which should place it back onto the reuseq.
>>
>
> There are a bunch of compiler errors, but I'll leave them out from the 
> comments!

Yes, sigh - generated from the wrong tree. I did compile it, honest!


>> +static int i40e_xmit_rcvd_zc(struct i40e_ring *rx_ring, struct 
>> xdp_buff *xdp)
>> +{
>> +       struct i40e_ring *xdp_ring;
>> +       struct i40e_tx_desc *tx_desc;
>> +       struct i40e_tx_buffer *tx_bi;
>> +       struct xdp_frame *xdpf;
>> +       dma_addr_t dma;
>> +
>> +       xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
>> +
>> +       if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
>> +               xdp_ring->tx_stats.tx_busy++;
>> +               return I40E_XDP_CONSUMED;
>> +       }
>> +       xpdf = convert_to_xdp_frame_keep_zc(xdp);
>> +       if (unlikely(!xpdf)
>> +               return I40E_XDP_CONSUMED;
>> +       xpdf->handle = xdp->handle;
>> +
>> +       dma = xdp_umem_get_dma(rx_ring->xsk_umem, xdp->handle);
>> +       tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
>> +       tx_bi->bytecount = xpdf->len;
>> +       tx_bi->gso_segs = 1;
>> +       tx_bi->xdpf = xdpf;
>> +       tx_bi->tx_flags = I40E_TX_FLAGS_ZC_FRAME;
>> +
>> +       tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
>> +       tx_desc->buffer_addr = cpu_to_le64(dma);
>> +       tx_desc->cmd_type_offset_bsz = 
>> build_ctob(I40E_TX_DESC_CMD_ICRC |
>> +                                                 
>> I40E_TX_DESC_CMD_EOP,
>> +                                                 0, xpdf->len, 0);
>> +       smp_wmb();
>> +
>> +       xdp_ring->next_to_use++;
>> +       if (xdp_ring->next_to_use == xdp_ring->count)
>> +               xdp_ring->next_to_use = 0;
>> +
>> +       tx_bi->next_to_watch = tx_desc;
>> +
>> +       return I40E_XDP_TX;
>> +}
>
> What you're basically doing here is a AF_XDP Tx, but triggered from
> the Rx path, and instead of completion (after the packet is sent) to
> the completion ring, it's recycled to the Rx HW ring (via zca_free). I
> like the idea but we need more plumbing first. Let me expand;
> Unfortunately, the current recycle mechanism requires that at the
> point of recycling, there has to be space in Rx ring. In the XDP_TX
> case, there's no completion ring, and we cannot guarantee that there's
> space on Rx ring (since Rx and Tx are asynchronous).

Yes - this is why it's an RFC. Your current design keeps the RX reuse
context in the driver itself, instead of using the reuse queue provided
by umem. I believe the driver only uses it when the ring is torn down,
so this needs to be addressed.

Since we're transmitting something that was just received, on TX
completion the buffer should logically be placed back into the fill
queue, but with the current SPSC mechanism, this isn't possible. Hence
the ReuseQ. I think this should transparently be made part of the FQ
mechanism, so the RQ is always checked before using the FQ. I believe
xsk_umem_peek_addr_rq() was added for this purpose, but it should
really be the main API.
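
A sketch from memory (not verbatim) of what that helper does today --
it is already the "check RQ first, fall back to FQ" path:

/* Pop from the per-umem reuse queue if it has entries, otherwise peek
 * the fill queue as usual.  Making this the only API would hide the
 * RQ from drivers entirely.
 */
static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
{
	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;

	if (!rq->length)
		return xsk_umem_peek_addr(umem, addr);

	*addr = rq->handles[rq->length - 1];
	return addr;
}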


> IOW, Rx recycling
> can currently *only* be done in an Rx context.

I was assuming that the Tx was part of the same VSI, so the Rx ring
would be accessible from the napi_poll() routine.


> What I would like to do, is moving i40e-xsk to Jesper's page-pool,
> instead of the existing recycle mechanism. Then we could return the
> descriptor to the pool, if the Rx ring doesn't have space for the
> completed/sent buffer.

Hm, not sure how this would help? The xdp_return() goes to the ZCA free
routine for zero-copy buffers, bypassing the page pool, doesn't it?


> TL;DR version: Passing zc-frames in XDP_TX cannot be done properly
> until the Rx recycle mechanism is more robust. :-(

Agree. So i40e_zca_free() would look more like:

   if (space in rx_bi)
     /* locally cache buffer */
     i40e_reuse_rx_buffer_zc(rx, buf);
   else
     xsk_umem_fq_reuse(rx->umem, buf)

and the two alloc_rx_buffers_*_zc() would need to be adjusted?
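
In C, a rough, untested sketch of the above (i40e_rx_ring_has_space()
and i40e_recycle_zc_handle() are hypothetical helpers standing in for
"space in rx_bi" / "locally cache buffer"):

static void i40e_zca_free(struct zero_copy_allocator *alloc,
			  unsigned long handle)
{
	struct i40e_ring *rx_ring = container_of(alloc, struct i40e_ring, zca);
	u64 addr = handle & rx_ring->xsk_umem->chunk_mask;

	if (i40e_rx_ring_has_space(rx_ring))
		i40e_recycle_zc_handle(rx_ring, addr);      /* locally cache buffer */
	else
		xsk_umem_fq_reuse(rx_ring->xsk_umem, addr); /* park on the umem RQ */
}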
-- 
Jonathan


* Re: [PATCH RFC] xdp: Support zero-copy XDP_TX from AF_XDP sockets.
  2019-04-18 17:55   ` Jonathan Lemon
@ 2019-04-23  6:14     ` Björn Töpel
From: Björn Töpel @ 2019-04-23  6:14 UTC (permalink / raw)
  To: Jonathan Lemon
  Cc: Netdev, Karlsson, Magnus, kernel-team, Jesper Dangaard Brouer,
	maciej.fijalkowski

On Thu, 18 Apr 2019 at 19:55, Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>
> On 18 Apr 2019, at 3:10, Björn Töpel wrote:
>
> > On Wed, 17 Apr 2019 at 21:58, Jonathan Lemon
> > <jonathan.lemon@gmail.com> wrote:
> >>
> >> When the XDP program attached to a zero-copy AF_XDP socket returns
> >> XDP_TX,
> >> queue the umem frame on the XDP TX ring, and arrange for it to be
> >> released
> >> via the ZCA free routine, which should place it back onto the reuseq.
> >>
> >
> > There are a bunch of compiler errors, but I'll leave them out from the
> > comments!
>
> Yes, sigh - generated from the wrong tree. I did compile it, honest!
>
>
> >> +static int i40e_xmit_rcvd_zc(struct i40e_ring *rx_ring, struct
> >> xdp_buff *xdp)
> >> +{
> >> +       struct i40e_ring *xdp_ring;
> >> +       struct i40e_tx_desc *tx_desc;
> >> +       struct i40e_tx_buffer *tx_bi;
> >> +       struct xdp_frame *xdpf;
> >> +       dma_addr_t dma;
> >> +
> >> +       xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> >> +
> >> +       if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
> >> +               xdp_ring->tx_stats.tx_busy++;
> >> +               return I40E_XDP_CONSUMED;
> >> +       }
> >> +       xpdf = convert_to_xdp_frame_keep_zc(xdp);
> >> +       if (unlikely(!xpdf)
> >> +               return I40E_XDP_CONSUMED;
> >> +       xpdf->handle = xdp->handle;
> >> +
> >> +       dma = xdp_umem_get_dma(rx_ring->xsk_umem, xdp->handle);
> >> +       tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
> >> +       tx_bi->bytecount = xpdf->len;
> >> +       tx_bi->gso_segs = 1;
> >> +       tx_bi->xdpf = xdpf;
> >> +       tx_bi->tx_flags = I40E_TX_FLAGS_ZC_FRAME;
> >> +
> >> +       tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
> >> +       tx_desc->buffer_addr = cpu_to_le64(dma);
> >> +       tx_desc->cmd_type_offset_bsz =
> >> build_ctob(I40E_TX_DESC_CMD_ICRC |
> >> +
> >> I40E_TX_DESC_CMD_EOP,
> >> +                                                 0, xpdf->len, 0);
> >> +       smp_wmb();
> >> +
> >> +       xdp_ring->next_to_use++;
> >> +       if (xdp_ring->next_to_use == xdp_ring->count)
> >> +               xdp_ring->next_to_use = 0;
> >> +
> >> +       tx_bi->next_to_watch = tx_desc;
> >> +
> >> +       return I40E_XDP_TX;
> >> +}
> >
> > What you're basically doing here is a AF_XDP Tx, but triggered from
> > the Rx path, and instead of completion (after the packet is sent) to
> > the completion ring, it's recycled to the Rx HW ring (via zca_free). I
> > like the idea but we need more plumbing first. Let me expand;
> > Unfortunately, the current recycle mechanism requires that at the
> > point of recycling, there has to be space in Rx ring. In the XDP_TX
> > case, there's no completion ring, and we cannot guarantee that there's
> > space on Rx ring (since Rx and Tx are asynchronous).
>
> Yes - this is why it's an RFC. Your current design keeps the RX reuse
> context in the driver itself, instead of using the reuse queue provided
> by umem. I believe the driver only uses it when the ring is torn down,
> so this needs to be addressed.
>

Yes, correct, it's *only* used to avoid leaking descriptors on teardown.

> Since we're transmitting something that was just received, on TX
> completion
> the buffer should logically be placed back into the fill queue, but with
> the
> current SPSC mechanism, this isn't possible. Hence the ReuseQ. I think
> this
> should be transparently be made part of the FQ mechanism, so the RQ is
> always
> checked before using the FQ. I believe xsk_umem_peek_addr_rq() was added
> for this purpose, but should really be the main API.
>

...and here is why I'd like to use the pool rather than the (slower)
_rq() functions. More below. Further, the current _rq functions are
also bounded in size, so we could end up in the same scenario here as
well: the _rq queue is full *and* the Rx ring is full, and hence we
leak. A more robust mechanism is needed (page pool! :-)).
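
For reference, the RQ is just a fixed-size array set up via
xsk_reuseq_prepare(); roughly (from memory, not verbatim):

struct xdp_umem_fq_reuse {
	u32 nentries;
	u32 length;
	u64 handles[];
};

/* No bounds check here -- the caller must have sized the RQ so that
 * it can never overflow.  With XDP_TX in the mix that sizing becomes
 * much harder to guarantee.
 */
static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
{
	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;

	rq->handles[rq->length++] = addr;
}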

>
> > IOW, Rx recycling
> > can currently *only* be done in an Rx context.
>
> I was assuming that the Tx was part of the same VSI, so the Rx ring
> would
> be accessible from the napi_poll() routine.
>

Yeah, that's right. "Context" was not the best wording. The problem is
that napi_poll is broken up into the "Rx part" and the "Tx part", and
the current design doesn't guarantee that there's space for recycling
an Rx descriptor from the "Tx part" (even though they are executed in
the same napi).

>
> > What I would like to do, is moving i40e-xsk to Jesper's page-pool,
> > instead of the existing recycle mechanism. Then we could return the
> > descriptor to the pool, if the Rx ring doesn't have space for the
> > completed/sent buffer.
>
> Hm, not sure how this would help? The xdp_return() goes to the ZCA free
> routine for zero-copy buffers, bypassing the page pool, doesn't it?
>

When page pool usage is added, this would be changed to use the page
pool free mechanism instead, leaving per-core caching/freeing
synchronization and such to the page pool. Again, this is a bigger
task to implement...
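
Hand-waving sketch of what the Tx-completion side could then look
like (rx_ring->page_pool and i40e_rx_page() are hypothetical here --
the page_pool conversion itself is the real work):

#include <net/page_pool.h>

static void i40e_recycle_zc_tx_buffer(struct i40e_ring *rx_ring,
				      unsigned long handle)
{
	/* i40e_rx_page(): hypothetical lookup from umem handle to the
	 * backing page.
	 */
	struct page *page = i40e_rx_page(rx_ring, handle);

	/* We're in napi context, so the pool's lockless direct cache
	 * can be used; the pool absorbs the buffer even if the Rx ring
	 * is currently full.
	 */
	page_pool_recycle_direct(rx_ring->page_pool, page);
}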

>
> > TL;DR version: Passing zc-frames in XDP_TX cannot be done properly
> > until the Rx recycle mechanism is more robust. :-(
>
> Agree. So i40e_zca_free() would look more like:
>
>    if (space in rx_bi)
>      /* locally cache buffer */
>      i40e_reuse_rx_buffer_zc(rx, buf);
>    else
>      xsk_umem_fq_reuse(rx->umem, buf)
>
> and the the two alloc_rx_buffers_*_zc() would need to be adjusted?

...and if the _rq queue is full... :-(



Björn

> --
> Jonathan

