All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: "David S . Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>
Cc: netdev <netdev@vger.kernel.org>, Coco Li <lixiaoyan@google.com>,
	Eric Dumazet <edumazet@google.com>,
	Eric Dumazet <eric.dumazet@gmail.com>,
	Tariq Toukan <tariqt@nvidia.com>
Subject: [PATCH v4 net-next 11/12] mlx4: support BIG TCP packets
Date: Fri,  6 May 2022 08:30:47 -0700	[thread overview]
Message-ID: <20220506153048.3695721-12-eric.dumazet@gmail.com> (raw)
In-Reply-To: <20220506153048.3695721-1-eric.dumazet@gmail.com>

From: Eric Dumazet <edumazet@google.com>

mlx4 supports LSOv2 just fine.

IPv6 stack inserts a temporary Hop-by-Hop header
with JUMBO TLV for big packets.

We need to ignore the HBH header when populating TX descriptor.

Tested:

Before: (not enabling bigger TSO/GRO packets)

ip link set dev eth0 gso_ipv6_max_size 65536 gro_ipv6_max_size 65536

netperf -H lpaa18 -t TCP_RR -T2,2 -l 10 -Cc -- -r 70000,70000
MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to lpaa18.prod.google.com () port 0 AF_INET6 : first burst 0 : cpu bind
Local /Remote
Socket Size   Request Resp.  Elapsed Trans.   CPU    CPU    S.dem   S.dem
Send   Recv   Size    Size   Time    Rate     local  remote local   remote
bytes  bytes  bytes   bytes  secs.   per sec  % S    % S    us/Tr   us/Tr

262144 540000 70000   70000  10.00   6591.45  0.86   1.34   62.490  97.446
262144 540000

After: (enabling bigger TSO/GRO packets)

ip link set dev eth0 gso_ipv6_max_size 185000 gro_ipv6_max_size 185000

netperf -H lpaa18 -t TCP_RR -T2,2 -l 10 -Cc -- -r 70000,70000
MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to lpaa18.prod.google.com () port 0 AF_INET6 : first burst 0 : cpu bind
Local /Remote
Socket Size   Request Resp.  Elapsed Trans.   CPU    CPU    S.dem   S.dem
Send   Recv   Size    Size   Time    Rate     local  remote local   remote
bytes  bytes  bytes   bytes  secs.   per sec  % S    % S    us/Tr   us/Tr

262144 540000 70000   70000  10.00   8383.95  0.95   1.01   54.432  57.584
262144 540000

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx4/en_netdev.c    |  3 ++
 drivers/net/ethernet/mellanox/mlx4/en_tx.c    | 47 +++++++++++++++----
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index c61dc7ae0c056a4dbcf24297549f6b1b5cc25d92..e9b854c563ab7cf7a5f037b65d7f734577369f52 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -3417,6 +3417,9 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 	dev->min_mtu = ETH_MIN_MTU;
 	dev->max_mtu = priv->max_mtu;
 
+	/* supports LSOv2 packets, 512KB limit has been tested. */
+	netif_set_tso_max_size(dev, 512 * 1024);
+
 	mdev->pndev[port] = dev;
 	mdev->upper[port] = NULL;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index f777151d226fb601f52366850f8c86358e214032..af3b2b59a2a6940a2839b277815ec7c3b4af1008 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -43,6 +43,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/indirect_call_wrapper.h>
+#include <net/ipv6.h>
 
 #include "mlx4_en.h"
 
@@ -634,19 +635,28 @@ static int get_real_size(const struct sk_buff *skb,
 			 struct net_device *dev,
 			 int *lso_header_size,
 			 bool *inline_ok,
-			 void **pfrag)
+			 void **pfrag,
+			 int *hopbyhop)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	int real_size;
 
 	if (shinfo->gso_size) {
 		*inline_ok = false;
-		if (skb->encapsulation)
+		*hopbyhop = 0;
+		if (skb->encapsulation) {
 			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
-		else
+		} else {
+			/* Detects large IPV6 TCP packets and prepares for removal of
+			 * HBH header that has been pushed by ip6_xmit(),
+			 * mainly so that tcpdump can dissect them.
+			 */
+			if (ipv6_has_hopopt_jumbo(skb))
+				*hopbyhop = sizeof(struct hop_jumbo_hdr);
 			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		}
 		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
-			ALIGN(*lso_header_size + 4, DS_SIZE);
+			ALIGN(*lso_header_size - *hopbyhop + 4, DS_SIZE);
 		if (unlikely(*lso_header_size != skb_headlen(skb))) {
 			/* We add a segment for the skb linear buffer only if
 			 * it contains data */
@@ -873,6 +883,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	int desc_size;
 	int real_size;
 	u32 index, bf_index;
+	struct ipv6hdr *h6;
 	__be32 op_own;
 	int lso_header_size;
 	void *fragptr = NULL;
@@ -881,6 +892,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	bool stop_queue;
 	bool inline_ok;
 	u8 data_offset;
+	int hopbyhop;
 	bool bf_ok;
 
 	tx_ind = skb_get_queue_mapping(skb);
@@ -890,7 +902,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_drop;
 
 	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
-				  &inline_ok, &fragptr);
+				  &inline_ok, &fragptr, &hopbyhop);
 	if (unlikely(!real_size))
 		goto tx_drop_count;
 
@@ -943,7 +955,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		data = &tx_desc->data;
 		data_offset = offsetof(struct mlx4_en_tx_desc, data);
 	} else {
-		int lso_align = ALIGN(lso_header_size + 4, DS_SIZE);
+		int lso_align = ALIGN(lso_header_size - hopbyhop + 4, DS_SIZE);
 
 		data = (void *)&tx_desc->lso + lso_align;
 		data_offset = offsetof(struct mlx4_en_tx_desc, lso) + lso_align;
@@ -1008,14 +1020,31 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 			((ring->prod & ring->size) ?
 				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
 
+		lso_header_size -= hopbyhop;
 		/* Fill in the LSO prefix */
 		tx_desc->lso.mss_hdr_size = cpu_to_be32(
 			shinfo->gso_size << 16 | lso_header_size);
 
-		/* Copy headers;
-		 * note that we already verified that it is linear */
-		memcpy(tx_desc->lso.header, skb->data, lso_header_size);
 
+		if (unlikely(hopbyhop)) {
+			/* remove the HBH header.
+			 * Layout: [Ethernet header][IPv6 header][HBH][TCP header]
+			 */
+			memcpy(tx_desc->lso.header, skb->data, ETH_HLEN + sizeof(*h6));
+			h6 = (struct ipv6hdr *)((char *)tx_desc->lso.header + ETH_HLEN);
+			h6->nexthdr = IPPROTO_TCP;
+			/* Copy the TCP header after the IPv6 one */
+			memcpy(h6 + 1,
+			       skb->data + ETH_HLEN + sizeof(*h6) +
+					sizeof(struct hop_jumbo_hdr),
+			       tcp_hdrlen(skb));
+			/* Leave ipv6 payload_len set to 0, as LSO v2 specs request. */
+		} else {
+			/* Copy headers;
+			 * note that we already verified that it is linear
+			 */
+			memcpy(tx_desc->lso.header, skb->data, lso_header_size);
+		}
 		ring->tso_packets++;
 
 		i = shinfo->gso_segs;
-- 
2.36.0.512.ge40c2bad7a-goog


  parent reply	other threads:[~2022-05-06 15:32 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-06 15:30 [PATCH v4 net-next 00/12] tcp: BIG TCP implementation Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 01/12] net: add IFLA_TSO_{MAX_SIZE|SEGS} attributes Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 02/12] ipv6: add IFLA_GSO_IPV6_MAX_SIZE Eric Dumazet
2022-05-06 20:48   ` Alexander H Duyck
2022-05-06 21:20     ` Eric Dumazet
2022-05-06 21:37       ` Alexander Duyck
2022-05-06 21:50         ` Eric Dumazet
2022-05-06 22:16           ` Alexander Duyck
2022-05-06 22:25             ` Eric Dumazet
2022-05-06 22:26             ` Jakub Kicinski
2022-05-06 22:46               ` Alexander Duyck
2022-05-06 15:30 ` [PATCH v4 net-next 03/12] tcp_cubic: make hystart_ack_delay() aware of BIG TCP Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 04/12] ipv6: add struct hop_jumbo_hdr definition Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 05/12] ipv6/gso: remove temporary HBH/jumbo header Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 06/12] ipv6/gro: insert " Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 07/12] ipv6: add IFLA_GRO_IPV6_MAX_SIZE Eric Dumazet
2022-05-06 21:06   ` Alexander H Duyck
2022-05-06 21:22     ` Eric Dumazet
2022-05-06 22:01       ` Alexander Duyck
2022-05-06 22:08         ` Eric Dumazet
2022-05-09 18:17       ` [PATCH 0/2] Replacements for patches 2 and 7 in Big TCP series Alexander Duyck
2022-05-09 18:17         ` [PATCH 1/2] net: Allow gso_max_size to exceed 65536 Alexander Duyck
2022-05-09 18:17         ` [PATCH 2/2] net: Allow gro_max_size " Alexander Duyck
2022-05-09 18:54         ` [PATCH 0/2] Replacements for patches 2 and 7 in Big TCP series Eric Dumazet
2022-05-09 20:21           ` Alexander H Duyck
2022-05-09 20:31             ` Eric Dumazet
2022-05-09 21:05               ` Alexander Duyck
2022-05-06 15:30 ` [PATCH v4 net-next 08/12] ipv6: Add hop-by-hop header to jumbograms in ip6_output Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 09/12] net: loopback: enable BIG TCP packets Eric Dumazet
2022-05-06 15:30 ` [PATCH v4 net-next 10/12] veth: " Eric Dumazet
2022-05-06 22:33   ` Jakub Kicinski
2022-05-06 15:30 ` Eric Dumazet [this message]
2022-05-06 15:30 ` [PATCH v4 net-next 12/12] mlx5: support " Eric Dumazet
2022-05-06 22:34   ` Jakub Kicinski
2022-05-07  0:32     ` Eric Dumazet
2022-05-07  1:54       ` Jakub Kicinski
2022-05-07  1:54         ` Jakub Kicinski
2022-05-07  2:10         ` Eric Dumazet
2022-05-07  2:37           ` Jakub Kicinski
2022-05-07  2:43             ` Eric Dumazet
2022-05-07  7:16               ` Kees Cook
2022-05-07  7:23             ` Kees Cook
2022-05-07  6:57         ` Kees Cook
2022-05-07  7:46         ` Kees Cook
2022-05-07 11:19           ` Eric Dumazet
2022-05-09  8:05             ` David Laight
2022-05-09 23:20             ` Kees Cook

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220506153048.3695721-12-eric.dumazet@gmail.com \
    --to=eric.dumazet@gmail.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kuba@kernel.org \
    --cc=lixiaoyan@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=tariqt@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.