All of lore.kernel.org
 help / color / mirror / Atom feed
From: Boris Pismenny <borispismenny@gmail.com>
To: David Ahern <dsahern@gmail.com>,
	Boris Pismenny <borisp@mellanox.com>,
	kuba@kernel.org, davem@davemloft.net, saeedm@nvidia.com,
	hch@lst.de, sagi@grimberg.me, axboe@fb.com, kbusch@kernel.org,
	viro@zeniv.linux.org.uk, edumazet@google.com, smalin@marvell.com
Cc: boris.pismenny@gmail.com, linux-nvme@lists.infradead.org,
	netdev@vger.kernel.org, benishay@nvidia.com, ogerlitz@nvidia.com,
	yorayz@nvidia.com, Or Gerlitz <ogerlitz@mellanox.com>,
	Yoray Zack <yorayz@mellanox.com>
Subject: Re: [PATCH v2 net-next 19/21] net/mlx5e: NVMEoTCP, data-path for DDP offload
Date: Sun, 17 Jan 2021 10:42:20 +0200	[thread overview]
Message-ID: <15248743-82bf-4283-d8c6-99f2210e42ae@gmail.com> (raw)
In-Reply-To: <10c28b01-49e5-c512-8670-bf8332b24b1b@gmail.com>

On 16/01/2021 6:57, David Ahern wrote:
> I have not had time to review this version of the patches, but this
> patch seems very similar to 13 of 15 from v1 and you did not respond to
> my question on it ...
> 
> On 1/14/21 8:10 AM, Boris Pismenny wrote:
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c
>> new file mode 100644
>> index 000000000000..f446b5d56d64
>> --- /dev/null
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c
>> @@ -0,0 +1,243 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/* Copyright (c) 2021 Mellanox Technologies. */
>> +
>> +#include "en_accel/nvmeotcp_rxtx.h"
>> +#include "en_accel/nvmeotcp.h"
>> +#include <linux/mlx5/mlx5_ifc.h>
>> +
>> +#define	MLX5E_TC_FLOW_ID_MASK  0x00ffffff
>> +static void nvmeotcp_update_resync(struct mlx5e_nvmeotcp_queue *queue,
>> +				   struct mlx5e_cqe128 *cqe128)
>> +{
>> +	const struct tcp_ddp_ulp_ops *ulp_ops;
>> +	u32 seq;
>> +
>> +	seq = be32_to_cpu(cqe128->resync_tcp_sn);
>> +	ulp_ops = inet_csk(queue->sk)->icsk_ulp_ddp_ops;
>> +	if (ulp_ops && ulp_ops->resync_request)
>> +		ulp_ops->resync_request(queue->sk, seq, TCP_DDP_RESYNC_REQ);
>> +}
>> +
>> +static void mlx5e_nvmeotcp_advance_sgl_iter(struct mlx5e_nvmeotcp_queue *queue)
>> +{
>> +	struct nvmeotcp_queue_entry *nqe = &queue->ccid_table[queue->ccid];
>> +
>> +	queue->ccoff += nqe->sgl[queue->ccsglidx].length;
>> +	queue->ccoff_inner = 0;
>> +	queue->ccsglidx++;
>> +}
>> +
>> +static inline void
>> +mlx5e_nvmeotcp_add_skb_frag(struct net_device *netdev, struct sk_buff *skb,
>> +			    struct mlx5e_nvmeotcp_queue *queue,
>> +			    struct nvmeotcp_queue_entry *nqe, u32 fragsz)
>> +{
>> +	dma_sync_single_for_cpu(&netdev->dev,
>> +				nqe->sgl[queue->ccsglidx].offset + queue->ccoff_inner,
>> +				fragsz, DMA_FROM_DEVICE);
>> +	page_ref_inc(compound_head(sg_page(&nqe->sgl[queue->ccsglidx])));
>> +	// XXX: consider reducing the truesize, as no new memory is consumed
>> +	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>> +			sg_page(&nqe->sgl[queue->ccsglidx]),
>> +			nqe->sgl[queue->ccsglidx].offset + queue->ccoff_inner,
>> +			fragsz,
>> +			fragsz);
>> +}
>> +
>> +static struct sk_buff*
>> +mlx5_nvmeotcp_add_tail_nonlinear(struct mlx5e_nvmeotcp_queue *queue,
>> +				 struct sk_buff *skb, skb_frag_t *org_frags,
>> +				 int org_nr_frags, int frag_index)
>> +{
>> +	struct mlx5e_priv *priv = queue->priv;
>> +
>> +	while (org_nr_frags != frag_index) {
>> +		if (skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) {
>> +			dev_kfree_skb_any(skb);
>> +			return NULL;
>> +		}
>> +		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>> +				skb_frag_page(&org_frags[frag_index]),
>> +				skb_frag_off(&org_frags[frag_index]),
>> +				skb_frag_size(&org_frags[frag_index]),
>> +				skb_frag_size(&org_frags[frag_index]));
>> +		page_ref_inc(skb_frag_page(&org_frags[frag_index]));
>> +		frag_index++;
>> +	}
>> +	return skb;
>> +}
>> +
>> +static struct sk_buff*
>> +mlx5_nvmeotcp_add_tail(struct mlx5e_nvmeotcp_queue *queue, struct sk_buff *skb,
>> +		       int offset, int len)
>> +{
>> +	struct mlx5e_priv *priv = queue->priv;
>> +
>> +	if (skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) {
>> +		dev_kfree_skb_any(skb);
>> +		return NULL;
>> +	}
>> +	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>> +			virt_to_page(skb->data),
>> +			offset,
>> +			len,
>> +			len);
>> +	page_ref_inc(virt_to_page(skb->data));
>> +	return skb;
>> +}
>> +
>> +static void mlx5_nvmeotcp_trim_nonlinear(struct sk_buff *skb,
>> +					 skb_frag_t *org_frags,
>> +					 int *frag_index,
>> +					 int remaining)
>> +{
>> +	unsigned int frag_size;
>> +	int nr_frags;
>> +
>> +	/* skip @remaining bytes in frags */
>> +	*frag_index = 0;
>> +	while (remaining) {
>> +		frag_size = skb_frag_size(&skb_shinfo(skb)->frags[*frag_index]);
>> +		if (frag_size > remaining) {
>> +			skb_frag_off_add(&skb_shinfo(skb)->frags[*frag_index],
>> +					 remaining);
>> +			skb_frag_size_sub(&skb_shinfo(skb)->frags[*frag_index],
>> +					  remaining);
>> +			remaining = 0;
>> +		} else {
>> +			remaining -= frag_size;
>> +			skb_frag_unref(skb, *frag_index);
>> +			*frag_index += 1;
>> +		}
>> +	}
>> +
>> +	/* save original frags for the tail and unref */
>> +	nr_frags = skb_shinfo(skb)->nr_frags;
>> +	memcpy(&org_frags[*frag_index], &skb_shinfo(skb)->frags[*frag_index],
>> +	       (nr_frags - *frag_index) * sizeof(skb_frag_t));
>> +	while (--nr_frags >= *frag_index)
>> +		skb_frag_unref(skb, nr_frags);
>> +
>> +	/* remove frags from skb */
>> +	skb_shinfo(skb)->nr_frags = 0;
>> +	skb->len -= skb->data_len;
>> +	skb->truesize -= skb->data_len;
>> +	skb->data_len = 0;
>> +}
>> +
>> +struct sk_buff*
>> +mlx5e_nvmeotcp_handle_rx_skb(struct net_device *netdev, struct sk_buff *skb,
>> +			     struct mlx5_cqe64 *cqe, u32 cqe_bcnt,
>> +			     bool linear)
>> +{
>> +	int ccoff, cclen, hlen, ccid, remaining, fragsz, to_copy = 0;
>> +	struct mlx5e_priv *priv = netdev_priv(netdev);
>> +	skb_frag_t org_frags[MAX_SKB_FRAGS];
>> +	struct mlx5e_nvmeotcp_queue *queue;
>> +	struct nvmeotcp_queue_entry *nqe;
>> +	int org_nr_frags, frag_index;
>> +	struct mlx5e_cqe128 *cqe128;
>> +	u32 queue_id;
>> +
>> +	queue_id = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
>> +	queue = mlx5e_nvmeotcp_get_queue(priv->nvmeotcp, queue_id);
>> +	if (unlikely(!queue)) {
>> +		dev_kfree_skb_any(skb);
>> +		return NULL;
>> +	}
>> +
>> +	cqe128 = container_of(cqe, struct mlx5e_cqe128, cqe64);
>> +	if (cqe_is_nvmeotcp_resync(cqe)) {
>> +		nvmeotcp_update_resync(queue, cqe128);
>> +		mlx5e_nvmeotcp_put_queue(queue);
>> +		return skb;
>> +	}
>> +
>> +#ifdef CONFIG_TCP_DDP_CRC
>> +	/* If a resync occurred in the previous cqe,
>> +	 * the current cqe.crcvalid bit may not be valid,
>> +	 * so we will treat it as 0
>> +	 */
>> +	skb->ddp_crc = queue->after_resync_cqe ? 0 :
>> +		cqe_is_nvmeotcp_crcvalid(cqe);
>> +	queue->after_resync_cqe = 0;
>> +#endif
>> +	if (!cqe_is_nvmeotcp_zc(cqe)) {
>> +		mlx5e_nvmeotcp_put_queue(queue);
>> +		return skb;
>> +	}
>> +
>> +	/* cc ddp from cqe */
>> +	ccid = be16_to_cpu(cqe128->ccid);
>> +	ccoff = be32_to_cpu(cqe128->ccoff);
>> +	cclen = be16_to_cpu(cqe128->cclen);
>> +	hlen  = be16_to_cpu(cqe128->hlen);
>> +
>> +	/* carve a hole in the skb for DDP data */
>> +	if (linear) {
>> +		skb_trim(skb, hlen);
>> +	} else {
>> +		org_nr_frags = skb_shinfo(skb)->nr_frags;
>> +		mlx5_nvmeotcp_trim_nonlinear(skb, org_frags, &frag_index,
>> +					     cclen);
>> +	}
>> +
>> +	nqe = &queue->ccid_table[ccid];
>> +
>> +	/* packet starts new ccid? */
>> +	if (queue->ccid != ccid || queue->ccid_gen != nqe->ccid_gen) {
>> +		queue->ccid = ccid;
>> +		queue->ccoff = 0;
>> +		queue->ccoff_inner = 0;
>> +		queue->ccsglidx = 0;
>> +		queue->ccid_gen = nqe->ccid_gen;
>> +	}
>> +
>> +	/* skip inside cc until the ccoff in the cqe */
>> +	while (queue->ccoff + queue->ccoff_inner < ccoff) {
>> +		remaining = nqe->sgl[queue->ccsglidx].length - queue->ccoff_inner;
>> +		fragsz = min_t(off_t, remaining,
>> +			       ccoff - (queue->ccoff + queue->ccoff_inner));
>> +
>> +		if (fragsz == remaining)
>> +			mlx5e_nvmeotcp_advance_sgl_iter(queue);
>> +		else
>> +			queue->ccoff_inner += fragsz;
>> +	}
>> +
>> +	/* adjust the skb according to the cqe cc */
>> +	while (to_copy < cclen) {
>> +		if (skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) {
>> +			dev_kfree_skb_any(skb);
>> +			mlx5e_nvmeotcp_put_queue(queue);
>> +			return NULL;
>> +		}
>> +
>> +		remaining = nqe->sgl[queue->ccsglidx].length - queue->ccoff_inner;
>> +		fragsz = min_t(int, remaining, cclen - to_copy);
>> +
>> +		mlx5e_nvmeotcp_add_skb_frag(netdev, skb, queue, nqe, fragsz);
>> +		to_copy += fragsz;
>> +		if (fragsz == remaining)
>> +			mlx5e_nvmeotcp_advance_sgl_iter(queue);
>> +		else
>> +			queue->ccoff_inner += fragsz;
>> +	}
>> +
>> +	if (cqe_bcnt > hlen + cclen) {
>> +		remaining = cqe_bcnt - hlen - cclen;
>> +		if (linear)
>> +			skb = mlx5_nvmeotcp_add_tail(queue, skb,
>> +						     offset_in_page(skb->data) +
>> +								hlen + cclen,
>> +						     remaining);
>> +		else
>> +			skb = mlx5_nvmeotcp_add_tail_nonlinear(queue, skb,
>> +							       org_frags,
>> +							       org_nr_frags,
>> +							       frag_index);
>> +	}
>> +
>> +	mlx5e_nvmeotcp_put_queue(queue);
>> +	return skb;
>> +}
> 
> 
> 
> ... I'll copy and paste my question here:
> 
> "mlx5e_skb_from_cqe_mpwrq_linear and mlx5e_skb_from_cqe_mpwrq_nolinear
> create an skb and then this function comes behind it, strips any frags
> originally added to the skb, adds the frags for the sgls, and then
> re-adds the original frags.
> 
> Why is this needed? Why can't the skb be created with all of the frags
> in proper order?
> 
> It seems like this dance is not needed if you had generic header/payload
> splits with the payload written to less retrictive SGLs."
> 
> This patch seems to be something very similar, and it is really
> complicated way to create each skb for DDP. The patch description does
> little to explain why it is needed.
> 

This is the same patch as before.

I'll start by explaining why this is needed. Then, clarify why generic
header-data split is not enough.

This is needed for a few reasons that are explained in detail
in the tcp-ddp offload documentation. See patch 21 overview
and rx-data-path sections. Our reasons are as follows:
1) Each SKB may contain multiple PDUs. DDP offload doesn't operate on
PDU headers, so these are written in the receive ring. Therefore, we
need to rebuild the SKB to account for it. Additionally, due to HW
limitations, we will only offload the first PDU in the SKB.
2) The newly constructed SKB represents the original data as it is on
the wire, such that the network stack is oblivious to the offload.
3) We decided not to modify all of the mlx5e_skb_from_cqe* functions
because it would make the offload harder to distinguish, and it would
add overhead to the existing data-path functions. Therefore, we opted
for this modular approach.

If we only had generic header-data split, then we just couldn't
provide this offload. It is not enough to place payload into some
buffer without TCP headers because RPC protocols and advanced storage
protocols, such as nvme-tcp, reorder their responses and require data
to be placed into application/pagecache buffers, which are anything
but anonymous. In other words, header-data split alone writes data
to the wrong buffers (reordering), or to anonymous buffers that
can't be page-flipped to replace application/pagecache buffers.


WARNING: multiple messages have this Message-ID (diff)
From: Boris Pismenny <borispismenny@gmail.com>
To: David Ahern <dsahern@gmail.com>,
	Boris Pismenny <borisp@mellanox.com>,
	kuba@kernel.org, davem@davemloft.net, saeedm@nvidia.com,
	hch@lst.de, sagi@grimberg.me, axboe@fb.com, kbusch@kernel.org,
	viro@zeniv.linux.org.uk, edumazet@google.com, smalin@marvell.com
Cc: Yoray Zack <yorayz@mellanox.com>,
	yorayz@nvidia.com, boris.pismenny@gmail.com, benishay@nvidia.com,
	linux-nvme@lists.infradead.org, netdev@vger.kernel.org,
	Or Gerlitz <ogerlitz@mellanox.com>,
	ogerlitz@nvidia.com
Subject: Re: [PATCH v2 net-next 19/21] net/mlx5e: NVMEoTCP, data-path for DDP offload
Date: Sun, 17 Jan 2021 10:42:20 +0200	[thread overview]
Message-ID: <15248743-82bf-4283-d8c6-99f2210e42ae@gmail.com> (raw)
In-Reply-To: <10c28b01-49e5-c512-8670-bf8332b24b1b@gmail.com>

On 16/01/2021 6:57, David Ahern wrote:
> I have not had time to review this version of the patches, but this
> patch seems very similar to 13 of 15 from v1 and you did not respond to
> my question on it ...
> 
> On 1/14/21 8:10 AM, Boris Pismenny wrote:
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c
>> new file mode 100644
>> index 000000000000..f446b5d56d64
>> --- /dev/null
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c
>> @@ -0,0 +1,243 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +/* Copyright (c) 2021 Mellanox Technologies. */
>> +
>> +#include "en_accel/nvmeotcp_rxtx.h"
>> +#include "en_accel/nvmeotcp.h"
>> +#include <linux/mlx5/mlx5_ifc.h>
>> +
>> +#define	MLX5E_TC_FLOW_ID_MASK  0x00ffffff
>> +static void nvmeotcp_update_resync(struct mlx5e_nvmeotcp_queue *queue,
>> +				   struct mlx5e_cqe128 *cqe128)
>> +{
>> +	const struct tcp_ddp_ulp_ops *ulp_ops;
>> +	u32 seq;
>> +
>> +	seq = be32_to_cpu(cqe128->resync_tcp_sn);
>> +	ulp_ops = inet_csk(queue->sk)->icsk_ulp_ddp_ops;
>> +	if (ulp_ops && ulp_ops->resync_request)
>> +		ulp_ops->resync_request(queue->sk, seq, TCP_DDP_RESYNC_REQ);
>> +}
>> +
>> +static void mlx5e_nvmeotcp_advance_sgl_iter(struct mlx5e_nvmeotcp_queue *queue)
>> +{
>> +	struct nvmeotcp_queue_entry *nqe = &queue->ccid_table[queue->ccid];
>> +
>> +	queue->ccoff += nqe->sgl[queue->ccsglidx].length;
>> +	queue->ccoff_inner = 0;
>> +	queue->ccsglidx++;
>> +}
>> +
>> +static inline void
>> +mlx5e_nvmeotcp_add_skb_frag(struct net_device *netdev, struct sk_buff *skb,
>> +			    struct mlx5e_nvmeotcp_queue *queue,
>> +			    struct nvmeotcp_queue_entry *nqe, u32 fragsz)
>> +{
>> +	dma_sync_single_for_cpu(&netdev->dev,
>> +				nqe->sgl[queue->ccsglidx].offset + queue->ccoff_inner,
>> +				fragsz, DMA_FROM_DEVICE);
>> +	page_ref_inc(compound_head(sg_page(&nqe->sgl[queue->ccsglidx])));
>> +	// XXX: consider reducing the truesize, as no new memory is consumed
>> +	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>> +			sg_page(&nqe->sgl[queue->ccsglidx]),
>> +			nqe->sgl[queue->ccsglidx].offset + queue->ccoff_inner,
>> +			fragsz,
>> +			fragsz);
>> +}
>> +
>> +static struct sk_buff*
>> +mlx5_nvmeotcp_add_tail_nonlinear(struct mlx5e_nvmeotcp_queue *queue,
>> +				 struct sk_buff *skb, skb_frag_t *org_frags,
>> +				 int org_nr_frags, int frag_index)
>> +{
>> +	struct mlx5e_priv *priv = queue->priv;
>> +
>> +	while (org_nr_frags != frag_index) {
>> +		if (skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) {
>> +			dev_kfree_skb_any(skb);
>> +			return NULL;
>> +		}
>> +		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>> +				skb_frag_page(&org_frags[frag_index]),
>> +				skb_frag_off(&org_frags[frag_index]),
>> +				skb_frag_size(&org_frags[frag_index]),
>> +				skb_frag_size(&org_frags[frag_index]));
>> +		page_ref_inc(skb_frag_page(&org_frags[frag_index]));
>> +		frag_index++;
>> +	}
>> +	return skb;
>> +}
>> +
>> +static struct sk_buff*
>> +mlx5_nvmeotcp_add_tail(struct mlx5e_nvmeotcp_queue *queue, struct sk_buff *skb,
>> +		       int offset, int len)
>> +{
>> +	struct mlx5e_priv *priv = queue->priv;
>> +
>> +	if (skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) {
>> +		dev_kfree_skb_any(skb);
>> +		return NULL;
>> +	}
>> +	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
>> +			virt_to_page(skb->data),
>> +			offset,
>> +			len,
>> +			len);
>> +	page_ref_inc(virt_to_page(skb->data));
>> +	return skb;
>> +}
>> +
>> +static void mlx5_nvmeotcp_trim_nonlinear(struct sk_buff *skb,
>> +					 skb_frag_t *org_frags,
>> +					 int *frag_index,
>> +					 int remaining)
>> +{
>> +	unsigned int frag_size;
>> +	int nr_frags;
>> +
>> +	/* skip @remaining bytes in frags */
>> +	*frag_index = 0;
>> +	while (remaining) {
>> +		frag_size = skb_frag_size(&skb_shinfo(skb)->frags[*frag_index]);
>> +		if (frag_size > remaining) {
>> +			skb_frag_off_add(&skb_shinfo(skb)->frags[*frag_index],
>> +					 remaining);
>> +			skb_frag_size_sub(&skb_shinfo(skb)->frags[*frag_index],
>> +					  remaining);
>> +			remaining = 0;
>> +		} else {
>> +			remaining -= frag_size;
>> +			skb_frag_unref(skb, *frag_index);
>> +			*frag_index += 1;
>> +		}
>> +	}
>> +
>> +	/* save original frags for the tail and unref */
>> +	nr_frags = skb_shinfo(skb)->nr_frags;
>> +	memcpy(&org_frags[*frag_index], &skb_shinfo(skb)->frags[*frag_index],
>> +	       (nr_frags - *frag_index) * sizeof(skb_frag_t));
>> +	while (--nr_frags >= *frag_index)
>> +		skb_frag_unref(skb, nr_frags);
>> +
>> +	/* remove frags from skb */
>> +	skb_shinfo(skb)->nr_frags = 0;
>> +	skb->len -= skb->data_len;
>> +	skb->truesize -= skb->data_len;
>> +	skb->data_len = 0;
>> +}
>> +
>> +struct sk_buff*
>> +mlx5e_nvmeotcp_handle_rx_skb(struct net_device *netdev, struct sk_buff *skb,
>> +			     struct mlx5_cqe64 *cqe, u32 cqe_bcnt,
>> +			     bool linear)
>> +{
>> +	int ccoff, cclen, hlen, ccid, remaining, fragsz, to_copy = 0;
>> +	struct mlx5e_priv *priv = netdev_priv(netdev);
>> +	skb_frag_t org_frags[MAX_SKB_FRAGS];
>> +	struct mlx5e_nvmeotcp_queue *queue;
>> +	struct nvmeotcp_queue_entry *nqe;
>> +	int org_nr_frags, frag_index;
>> +	struct mlx5e_cqe128 *cqe128;
>> +	u32 queue_id;
>> +
>> +	queue_id = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
>> +	queue = mlx5e_nvmeotcp_get_queue(priv->nvmeotcp, queue_id);
>> +	if (unlikely(!queue)) {
>> +		dev_kfree_skb_any(skb);
>> +		return NULL;
>> +	}
>> +
>> +	cqe128 = container_of(cqe, struct mlx5e_cqe128, cqe64);
>> +	if (cqe_is_nvmeotcp_resync(cqe)) {
>> +		nvmeotcp_update_resync(queue, cqe128);
>> +		mlx5e_nvmeotcp_put_queue(queue);
>> +		return skb;
>> +	}
>> +
>> +#ifdef CONFIG_TCP_DDP_CRC
>> +	/* If a resync occurred in the previous cqe,
>> +	 * the current cqe.crcvalid bit may not be valid,
>> +	 * so we will treat it as 0
>> +	 */
>> +	skb->ddp_crc = queue->after_resync_cqe ? 0 :
>> +		cqe_is_nvmeotcp_crcvalid(cqe);
>> +	queue->after_resync_cqe = 0;
>> +#endif
>> +	if (!cqe_is_nvmeotcp_zc(cqe)) {
>> +		mlx5e_nvmeotcp_put_queue(queue);
>> +		return skb;
>> +	}
>> +
>> +	/* cc ddp from cqe */
>> +	ccid = be16_to_cpu(cqe128->ccid);
>> +	ccoff = be32_to_cpu(cqe128->ccoff);
>> +	cclen = be16_to_cpu(cqe128->cclen);
>> +	hlen  = be16_to_cpu(cqe128->hlen);
>> +
>> +	/* carve a hole in the skb for DDP data */
>> +	if (linear) {
>> +		skb_trim(skb, hlen);
>> +	} else {
>> +		org_nr_frags = skb_shinfo(skb)->nr_frags;
>> +		mlx5_nvmeotcp_trim_nonlinear(skb, org_frags, &frag_index,
>> +					     cclen);
>> +	}
>> +
>> +	nqe = &queue->ccid_table[ccid];
>> +
>> +	/* packet starts new ccid? */
>> +	if (queue->ccid != ccid || queue->ccid_gen != nqe->ccid_gen) {
>> +		queue->ccid = ccid;
>> +		queue->ccoff = 0;
>> +		queue->ccoff_inner = 0;
>> +		queue->ccsglidx = 0;
>> +		queue->ccid_gen = nqe->ccid_gen;
>> +	}
>> +
>> +	/* skip inside cc until the ccoff in the cqe */
>> +	while (queue->ccoff + queue->ccoff_inner < ccoff) {
>> +		remaining = nqe->sgl[queue->ccsglidx].length - queue->ccoff_inner;
>> +		fragsz = min_t(off_t, remaining,
>> +			       ccoff - (queue->ccoff + queue->ccoff_inner));
>> +
>> +		if (fragsz == remaining)
>> +			mlx5e_nvmeotcp_advance_sgl_iter(queue);
>> +		else
>> +			queue->ccoff_inner += fragsz;
>> +	}
>> +
>> +	/* adjust the skb according to the cqe cc */
>> +	while (to_copy < cclen) {
>> +		if (skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) {
>> +			dev_kfree_skb_any(skb);
>> +			mlx5e_nvmeotcp_put_queue(queue);
>> +			return NULL;
>> +		}
>> +
>> +		remaining = nqe->sgl[queue->ccsglidx].length - queue->ccoff_inner;
>> +		fragsz = min_t(int, remaining, cclen - to_copy);
>> +
>> +		mlx5e_nvmeotcp_add_skb_frag(netdev, skb, queue, nqe, fragsz);
>> +		to_copy += fragsz;
>> +		if (fragsz == remaining)
>> +			mlx5e_nvmeotcp_advance_sgl_iter(queue);
>> +		else
>> +			queue->ccoff_inner += fragsz;
>> +	}
>> +
>> +	if (cqe_bcnt > hlen + cclen) {
>> +		remaining = cqe_bcnt - hlen - cclen;
>> +		if (linear)
>> +			skb = mlx5_nvmeotcp_add_tail(queue, skb,
>> +						     offset_in_page(skb->data) +
>> +								hlen + cclen,
>> +						     remaining);
>> +		else
>> +			skb = mlx5_nvmeotcp_add_tail_nonlinear(queue, skb,
>> +							       org_frags,
>> +							       org_nr_frags,
>> +							       frag_index);
>> +	}
>> +
>> +	mlx5e_nvmeotcp_put_queue(queue);
>> +	return skb;
>> +}
> 
> 
> 
> ... I'll copy and paste my question here:
> 
> "mlx5e_skb_from_cqe_mpwrq_linear and mlx5e_skb_from_cqe_mpwrq_nolinear
> create an skb and then this function comes behind it, strips any frags
> originally added to the skb, adds the frags for the sgls, and then
> re-adds the original frags.
> 
> Why is this needed? Why can't the skb be created with all of the frags
> in proper order?
> 
> It seems like this dance is not needed if you had generic header/payload
> splits with the payload written to less retrictive SGLs."
> 
> This patch seems to be something very similar, and it is really
> complicated way to create each skb for DDP. The patch description does
> little to explain why it is needed.
> 

This is the same patch as before.

I'll start by explaining why this is needed. Then, clarify why generic
header-data split is not enough.

This is needed for a few reasons that are explained in detail
in the tcp-ddp offload documentation. See patch 21 overview
and rx-data-path sections. Our reasons are as follows:
1) Each SKB may contain multiple PDUs. DDP offload doesn't operate on
PDU headers, so these are written in the receive ring. Therefore, we
need to rebuild the SKB to account for it. Additionally, due to HW
limitations, we will only offload the first PDU in the SKB.
2) The newly constructed SKB represents the original data as it is on
the wire, such that the network stack is oblivious to the offload.
3) We decided not to modify all of the mlx5e_skb_from_cqe* functions
because it would make the offload harder to distinguish, and it would
add overhead to the existing data-path functions. Therefore, we opted
for this modular approach.

If we only had generic header-data split, then we just couldn't
provide this offload. It is not enough to place payload into some
buffer without TCP headers because RPC protocols and advanced storage
protocols, such as nvme-tcp, reorder their responses and require data
to be placed into application/pagecache buffers, which are anything
but anonymous. In other words, header-data split alone writes data
to the wrong buffers (reordering), or to anonymous buffers that
can't be page-flipped to replace application/pagecache buffers.


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

  reply	other threads:[~2021-01-17  8:52 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-14 15:10 [PATCH v2 net-next 00/21] nvme-tcp receive offloads Boris Pismenny
2021-01-14 15:10 ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 01/21] iov_iter: Introduce new procedures for copy to iter/pages Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 02/21] net: Introduce direct data placement tcp offload Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:57   ` Eric Dumazet
2021-01-14 15:57     ` Eric Dumazet
2021-01-14 20:19     ` Boris Pismenny
2021-01-14 20:19       ` Boris Pismenny
2021-01-14 20:43       ` Eric Dumazet
2021-01-14 20:43         ` Eric Dumazet
2021-01-31 10:40         ` Boris Pismenny
2021-01-31 10:40           ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 03/21] net: Introduce crc offload for tcp ddp ulp Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 04/21] net: SKB copy(+hash) iterators for DDP offloads Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 05/21] net/tls: expose get_netdev_for_sock Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 06/21] nvme-tcp: Add DDP offload control path Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-19  3:47   ` David Ahern
2021-01-19  3:47     ` David Ahern
2021-01-31  7:51     ` Boris Pismenny
2021-01-31  7:51       ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 07/21] nvme-tcp: Add DDP data-path Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-19  4:18   ` David Ahern
2021-01-19  4:18     ` David Ahern
2021-01-31  8:44     ` Boris Pismenny
2021-01-31  8:44       ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 08/21] nvme-tcp : Recalculate crc in the end of the capsule Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 09/21] nvme-tcp: Deal with netdevice DOWN events Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 10/21] net/mlx5: Header file changes for nvme-tcp offload Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 11/21] net/mlx5: Add 128B CQE for NVMEoTCP offload Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 12/21] net/mlx5e: TCP flow steering for nvme-tcp Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 13/21] net/mlx5e: NVMEoTCP offload initialization Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 14/21] net/mlx5e: KLM UMR helper macros Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 15/21] net/mlx5e: NVMEoTCP use KLM UMRs Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 16/21] net/mlx5e: NVMEoTCP queue init/teardown Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 17/21] net/mlx5e: NVMEoTCP async ddp invalidation Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 18/21] net/mlx5e: NVMEoTCP ddp setup and resync Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 19/21] net/mlx5e: NVMEoTCP, data-path for DDP offload Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-16  4:57   ` David Ahern
2021-01-16  4:57     ` David Ahern
2021-01-17  8:42     ` Boris Pismenny [this message]
2021-01-17  8:42       ` Boris Pismenny
2021-01-19  4:36       ` David Ahern
2021-01-19  4:36         ` David Ahern
2021-01-31  9:27         ` Boris Pismenny
2021-01-31  9:27           ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 20/21] net/mlx5e: NVMEoTCP statistics Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 21/21] Documentation: add TCP DDP offload documentation Boris Pismenny
2021-01-14 15:10   ` Boris Pismenny

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=15248743-82bf-4283-d8c6-99f2210e42ae@gmail.com \
    --to=borispismenny@gmail.com \
    --cc=axboe@fb.com \
    --cc=benishay@nvidia.com \
    --cc=boris.pismenny@gmail.com \
    --cc=borisp@mellanox.com \
    --cc=davem@davemloft.net \
    --cc=dsahern@gmail.com \
    --cc=edumazet@google.com \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=kuba@kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=netdev@vger.kernel.org \
    --cc=ogerlitz@mellanox.com \
    --cc=ogerlitz@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=sagi@grimberg.me \
    --cc=smalin@marvell.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=yorayz@mellanox.com \
    --cc=yorayz@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.