netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
@ 2023-07-17 15:29 Eric Dumazet
  2023-07-17 16:52 ` Soheil Hassas Yeganeh
                   ` (4 more replies)
  0 siblings, 5 replies; 23+ messages in thread
From: Eric Dumazet @ 2023-07-17 15:29 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet, Eric Dumazet

With modern NIC drivers shifting to full page allocations per
received frame, we face the following issue:

TCP has one per-netns sysctl used to tweak how to translate
a memory use into an expected payload (RWIN), in RX path.

tcp_win_from_space() implementation is limited to a few cases.

For hosts dealing with various MSS, we either underestimate
or overestimate the RWIN we send to the remote peers.

For instance with the default sysctl_tcp_adv_win_scale value,
we expect to store 50% of payload per allocated chunk of memory.

For the typical use of MTU=1500 traffic, and order-0 pages allocations
by NIC drivers, we are sending too big RWIN, leading to potential
tcp collapse operations, which are extremely expensive and source
of latency spikes.

This patch makes sysctl_tcp_adv_win_scale obsolete, and instead
uses a per socket scaling factor, so that we can precisely
adjust the RWIN based on effective skb->len/skb->truesize ratio.

This patch alone can double TCP receive performance when receivers
are too slow to drain their receive queue, or by allowing
a bigger RWIN when MSS is close to PAGE_SIZE.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 Documentation/networking/ip-sysctl.rst |  1 +
 include/linux/tcp.h                    |  4 +++-
 include/net/netns/ipv4.h               |  2 +-
 include/net/tcp.h                      | 24 ++++++++++++++++++++----
 net/ipv4/tcp.c                         | 11 ++++++-----
 net/ipv4/tcp_input.c                   | 19 ++++++++++++-------
 6 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 4a010a7cde7f8085db5ba6f1b9af53e9e5223cd5..82f2117cf2b36a834e5e391feda0210d916bff8b 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -321,6 +321,7 @@ tcp_abort_on_overflow - BOOLEAN
 	option can harm clients of your server.
 
 tcp_adv_win_scale - INTEGER
+	Obsolete since linux-6.6
 	Count buffering overhead as bytes/2^tcp_adv_win_scale
 	(if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
 	if it is <= 0.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b4c08ac86983568a9511258708724da15d0b999e..fbcb0ce13171d46aa3697abcd48482b08e78e5e0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 	return (struct tcp_request_sock *)req;
 }
 
+#define TCP_RMEM_TO_WIN_SCALE 8
+
 struct tcp_sock {
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
@@ -238,7 +240,7 @@ struct tcp_sock {
 
 	u32	window_clamp;	/* Maximal window to advertise		*/
 	u32	rcv_ssthresh;	/* Current window clamp			*/
-
+	u8	scaling_ratio;	/* see tcp_win_from_space() */
 	/* Information of the most recently (s)acked skb */
 	struct tcp_rack {
 		u64 mstamp; /* (Re)sent time of the skb */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index f003747181593559a4efe1838be719d445417041..7a41c4791536732005cedbb80c223b86aa43249e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -152,7 +152,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_abort_on_overflow;
 	u8 sysctl_tcp_fack; /* obsolete */
 	int sysctl_tcp_max_reordering;
-	int sysctl_tcp_adv_win_scale;
+	int sysctl_tcp_adv_win_scale; /* obsolete */
 	u8 sysctl_tcp_dsack;
 	u8 sysctl_tcp_app_win;
 	u8 sysctl_tcp_frto;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 226bce6d1e8c30185260baadec449b67323db91c..2104a71c75ba7eee40612395be4103ae370b3c03 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1434,11 +1434,27 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
 
 static inline int tcp_win_from_space(const struct sock *sk, int space)
 {
-	int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
+	s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio;
 
-	return tcp_adv_win_scale <= 0 ?
-		(space>>(-tcp_adv_win_scale)) :
-		space - (space>>tcp_adv_win_scale);
+	return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
+}
+
+/* inverse of tcp_win_from_space() */
+static inline int tcp_space_from_win(const struct sock *sk, int win)
+{
+	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
+
+	do_div(val, tcp_sk(sk)->scaling_ratio);
+	return val;
+}
+
+static inline void tcp_scaling_ratio_init(struct sock *sk)
+{
+	/* Assume a conservative default of 1200 bytes of payload per 4K page.
+	 * This may be adjusted later in tcp_measure_rcv_mss().
+	 */
+	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
+				    SKB_TRUESIZE(4096);
 }
 
 /* Note: caller must be prepared to deal with negative returns */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e03e08745308189c9d64509c2cff94da56c86a0c..88f4ebab12acc11d5f3feb6b13974a0b8e565671 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk)
 
 	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
 	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+	tcp_scaling_ratio_init(sk);
 
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
@@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len);
 /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
 int tcp_set_rcvlowat(struct sock *sk, int val)
 {
-	int cap;
+	int space, cap;
 
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 		cap = sk->sk_rcvbuf >> 1;
@@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 		return 0;
 
-	val <<= 1;
-	if (val > sk->sk_rcvbuf) {
-		WRITE_ONCE(sk->sk_rcvbuf, val);
-		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
+	space = tcp_space_from_win(sk, val);
+	if (space > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, space);
+		tcp_sk(sk)->window_clamp = val;
 	}
 	return 0;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 57c8af1859c16eba5e952a23ea959b628006f9c1..3cd92035e0902298baa8afd89ae5edcbfce300e5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 	 */
 	len = skb_shinfo(skb)->gso_size ? : skb->len;
 	if (len >= icsk->icsk_ack.rcv_mss) {
+		/* Note: divides are still a bit expensive.
+		 * For the moment, only adjust scaling_ratio
+		 * when we update icsk_ack.rcv_mss.
+		 */
+		if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
+			u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+
+			do_div(val, skb->truesize);
+			tcp_sk(sk)->scaling_ratio = val ? val : 1;
+		}
 		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
 					       tcp_sk(sk)->advmss);
 		/* Account for possibly-removed options */
@@ -727,8 +737,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		int rcvmem, rcvbuf;
 		u64 rcvwin, grow;
+		int rcvbuf;
 
 		/* minimal window to cope with packet losses, assuming
 		 * steady state. Add some cushion because of small variations.
@@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
 		do_div(grow, tp->rcvq_space.space);
 		rcvwin += (grow << 1);
 
-		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
-			rcvmem += 128;
-
-		do_div(rcvwin, tp->advmss);
-		rcvbuf = min_t(u64, rcvwin * rcvmem,
+		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
 			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 		if (rcvbuf > sk->sk_rcvbuf) {
 			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-- 
2.41.0.255.g8b1d071c50-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-17 15:29 [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale Eric Dumazet
@ 2023-07-17 16:52 ` Soheil Hassas Yeganeh
  2023-07-17 17:17   ` Eric Dumazet
  2023-07-19  2:10 ` patchwork-bot+netdevbpf
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 23+ messages in thread
From: Soheil Hassas Yeganeh @ 2023-07-17 16:52 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Neal Cardwell, Yuchung Cheng, eric.dumazet

On Mon, Jul 17, 2023 at 11:29 AM Eric Dumazet <edumazet@google.com> wrote:
>
> With modern NIC drivers shifting to full page allocations per
> received frame, we face the following issue:
>
> TCP has one per-netns sysctl used to tweak how to translate
> a memory use into an expected payload (RWIN), in RX path.
>
> tcp_win_from_space() implementation is limited to few cases.
>
> For hosts dealing with various MSS, we either under estimate
> or over estimate the RWIN we send to the remote peers.
>
> For instance with the default sysctl_tcp_adv_win_scale value,
> we expect to store 50% of payload per allocated chunk of memory.
>
> For the typical use of MTU=1500 traffic, and order-0 pages allocations
> by NIC drivers, we are sending too big RWIN, leading to potential
> tcp collapse operations, which are extremely expensive and source
> of latency spikes.
>
> This patch makes sysctl_tcp_adv_win_scale obsolete, and instead
> uses a per socket scaling factor, so that we can precisely
> adjust the RWIN based on effective skb->len/skb->truesize ratio.
>
> This patch alone can double TCP receive performance when receivers
> are too slow to drain their receive queue, or by allowing
> a bigger RWIN when MSS is close to PAGE_SIZE.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Great idea!

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

> ---
>  Documentation/networking/ip-sysctl.rst |  1 +
>  include/linux/tcp.h                    |  4 +++-
>  include/net/netns/ipv4.h               |  2 +-
>  include/net/tcp.h                      | 24 ++++++++++++++++++++----
>  net/ipv4/tcp.c                         | 11 ++++++-----
>  net/ipv4/tcp_input.c                   | 19 ++++++++++++-------
>  6 files changed, 43 insertions(+), 18 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 4a010a7cde7f8085db5ba6f1b9af53e9e5223cd5..82f2117cf2b36a834e5e391feda0210d916bff8b 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -321,6 +321,7 @@ tcp_abort_on_overflow - BOOLEAN
>         option can harm clients of your server.
>
>  tcp_adv_win_scale - INTEGER
> +       Obsolete since linux-6.6
>         Count buffering overhead as bytes/2^tcp_adv_win_scale
>         (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
>         if it is <= 0.
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index b4c08ac86983568a9511258708724da15d0b999e..fbcb0ce13171d46aa3697abcd48482b08e78e5e0 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
>         return (struct tcp_request_sock *)req;
>  }
>
> +#define TCP_RMEM_TO_WIN_SCALE 8
> +
>  struct tcp_sock {
>         /* inet_connection_sock has to be the first member of tcp_sock */
>         struct inet_connection_sock     inet_conn;
> @@ -238,7 +240,7 @@ struct tcp_sock {
>
>         u32     window_clamp;   /* Maximal window to advertise          */
>         u32     rcv_ssthresh;   /* Current window clamp                 */
> -
> +       u8      scaling_ratio;  /* see tcp_win_from_space() */
>         /* Information of the most recently (s)acked skb */
>         struct tcp_rack {
>                 u64 mstamp; /* (Re)sent time of the skb */
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index f003747181593559a4efe1838be719d445417041..7a41c4791536732005cedbb80c223b86aa43249e 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -152,7 +152,7 @@ struct netns_ipv4 {
>         u8 sysctl_tcp_abort_on_overflow;
>         u8 sysctl_tcp_fack; /* obsolete */
>         int sysctl_tcp_max_reordering;
> -       int sysctl_tcp_adv_win_scale;
> +       int sysctl_tcp_adv_win_scale; /* obsolete */
>         u8 sysctl_tcp_dsack;
>         u8 sysctl_tcp_app_win;
>         u8 sysctl_tcp_frto;
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 226bce6d1e8c30185260baadec449b67323db91c..2104a71c75ba7eee40612395be4103ae370b3c03 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1434,11 +1434,27 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
>
>  static inline int tcp_win_from_space(const struct sock *sk, int space)
>  {
> -       int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
> +       s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio;
>
> -       return tcp_adv_win_scale <= 0 ?
> -               (space>>(-tcp_adv_win_scale)) :
> -               space - (space>>tcp_adv_win_scale);
> +       return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
> +}
> +
> +/* inverse of tcp_win_from_space() */
> +static inline int tcp_space_from_win(const struct sock *sk, int win)
> +{
> +       u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
> +
> +       do_div(val, tcp_sk(sk)->scaling_ratio);
> +       return val;
> +}
> +
> +static inline void tcp_scaling_ratio_init(struct sock *sk)
> +{
> +       /* Assume a conservative default of 1200 bytes of payload per 4K page.
> +        * This may be adjusted later in tcp_measure_rcv_mss().
> +        */
> +       tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
> +                                   SKB_TRUESIZE(4096);

Should we use PAGE_SIZE instead of 4096?

>  }
>
>  /* Note: caller must be prepared to deal with negative returns */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index e03e08745308189c9d64509c2cff94da56c86a0c..88f4ebab12acc11d5f3feb6b13974a0b8e565671 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk)
>
>         WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
>         WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
> +       tcp_scaling_ratio_init(sk);
>
>         set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
>         sk_sockets_allocated_inc(sk);
> @@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len);
>  /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
>  int tcp_set_rcvlowat(struct sock *sk, int val)
>  {
> -       int cap;
> +       int space, cap;
>
>         if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
>                 cap = sk->sk_rcvbuf >> 1;
> @@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
>         if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
>                 return 0;
>
> -       val <<= 1;
> -       if (val > sk->sk_rcvbuf) {
> -               WRITE_ONCE(sk->sk_rcvbuf, val);
> -               tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
> +       space = tcp_space_from_win(sk, val);
> +       if (space > sk->sk_rcvbuf) {
> +               WRITE_ONCE(sk->sk_rcvbuf, space);
> +               tcp_sk(sk)->window_clamp = val;
>         }
>         return 0;
>  }
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 57c8af1859c16eba5e952a23ea959b628006f9c1..3cd92035e0902298baa8afd89ae5edcbfce300e5 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
>          */
>         len = skb_shinfo(skb)->gso_size ? : skb->len;
>         if (len >= icsk->icsk_ack.rcv_mss) {
> +               /* Note: divides are still a bit expensive.
> +                * For the moment, only adjust scaling_ratio
> +                * when we update icsk_ack.rcv_mss.
> +                */
> +               if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
> +                       u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
> +
> +                       do_div(val, skb->truesize);
> +                       tcp_sk(sk)->scaling_ratio = val ? val : 1;
> +               }
>                 icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
>                                                tcp_sk(sk)->advmss);
>                 /* Account for possibly-removed options */
> @@ -727,8 +737,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
>
>         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
>             !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
> -               int rcvmem, rcvbuf;
>                 u64 rcvwin, grow;
> +               int rcvbuf;
>
>                 /* minimal window to cope with packet losses, assuming
>                  * steady state. Add some cushion because of small variations.
> @@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
>                 do_div(grow, tp->rcvq_space.space);
>                 rcvwin += (grow << 1);
>
> -               rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
> -               while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
> -                       rcvmem += 128;
> -
> -               do_div(rcvwin, tp->advmss);
> -               rcvbuf = min_t(u64, rcvwin * rcvmem,
> +               rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
>                                READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
>                 if (rcvbuf > sk->sk_rcvbuf) {
>                         WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
> --
> 2.41.0.255.g8b1d071c50-goog
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-17 16:52 ` Soheil Hassas Yeganeh
@ 2023-07-17 17:17   ` Eric Dumazet
  2023-07-17 17:20     ` Soheil Hassas Yeganeh
  0 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2023-07-17 17:17 UTC (permalink / raw)
  To: Soheil Hassas Yeganeh
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Neal Cardwell, Yuchung Cheng, eric.dumazet

On Mon, Jul 17, 2023 at 6:52 PM Soheil Hassas Yeganeh <soheil@google.com> wrote:
>
>
> Should we use PAGE_SIZE instead of 4096?

Great question. I thought of using 50%, but used this formula to kind
of document the problem.

Using PAGE_SIZE would probably hurt arches with large pages, for the
initial round trip.

Given that TCP normally uses IW10, there is almost no risk of tcp collapsing
for the first RTT.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-17 17:17   ` Eric Dumazet
@ 2023-07-17 17:20     ` Soheil Hassas Yeganeh
  0 siblings, 0 replies; 23+ messages in thread
From: Soheil Hassas Yeganeh @ 2023-07-17 17:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Neal Cardwell, Yuchung Cheng, eric.dumazet

On Mon, Jul 17, 2023 at 1:17 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Jul 17, 2023 at 6:52 PM Soheil Hassas Yeganeh <soheil@google.com> wrote:
> >
> >
> > Should we use PAGE_SIZE instead of 4096?
>
> Great question. I thought of using 50%, but used this formula to kind
> of document the problem.
>
> Using PAGE_SIZE would probably hurt arches with large pages, for the
> initial round trip.
>
> Given that TCP normally uses IW10, there is almost no risk of tcp collapsing
> for the first RTT.

This is a very good point. No change needed then.  Thank you!

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-17 15:29 [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale Eric Dumazet
  2023-07-17 16:52 ` Soheil Hassas Yeganeh
@ 2023-07-19  2:10 ` patchwork-bot+netdevbpf
  2023-07-20 15:41 ` Paolo Abeni
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 23+ messages in thread
From: patchwork-bot+netdevbpf @ 2023-07-19  2:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, netdev, soheil, ncardwell, ycheng, eric.dumazet

Hello:

This patch was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Mon, 17 Jul 2023 15:29:17 +0000 you wrote:
> With modern NIC drivers shifting to full page allocations per
> received frame, we face the following issue:
> 
> TCP has one per-netns sysctl used to tweak how to translate
> a memory use into an expected payload (RWIN), in RX path.
> 
> tcp_win_from_space() implementation is limited to few cases.
> 
> [...]

Here is the summary with links:
  - [net-next] tcp: get rid of sysctl_tcp_adv_win_scale
    https://git.kernel.org/netdev/net-next/c/dfa2f0483360

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-17 15:29 [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale Eric Dumazet
  2023-07-17 16:52 ` Soheil Hassas Yeganeh
  2023-07-19  2:10 ` patchwork-bot+netdevbpf
@ 2023-07-20 15:41 ` Paolo Abeni
  2023-07-20 15:50   ` Eric Dumazet
  2023-11-03  6:10 ` Jiri Slaby
  2024-04-02 15:28 ` shironeko
  4 siblings, 1 reply; 23+ messages in thread
From: Paolo Abeni @ 2023-07-20 15:41 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski
  Cc: netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

Hi,

On Mon, 2023-07-17 at 15:29 +0000, Eric Dumazet wrote:
> +static inline void tcp_scaling_ratio_init(struct sock *sk)
> +{
> +	/* Assume a conservative default of 1200 bytes of payload per 4K page.
> +	 * This may be adjusted later in tcp_measure_rcv_mss().
> +	 */
> +	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
> +				    SKB_TRUESIZE(4096);

I'm giving this patch a closer look because mptcp_rcv_space_adjust
needs to be updated on top of it. Should SKB_TRUESIZE(4096) be replaced
with:

4096 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) 

to be more accurate? The page should already include the shared info,
right?

Likely not very relevant as the ratio is updated to a more accurate
value with the first packet, mostly to try to understand the code
correctly.

Thanks!

Paolo


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-20 15:41 ` Paolo Abeni
@ 2023-07-20 15:50   ` Eric Dumazet
  0 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2023-07-20 15:50 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: David S . Miller, Jakub Kicinski, netdev, Soheil Hassas Yeganeh,
	Neal Cardwell, Yuchung Cheng, eric.dumazet

On Thu, Jul 20, 2023 at 5:43 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hi,
>
> On Mon, 2023-07-17 at 15:29 +0000, Eric Dumazet wrote:
> > +static inline void tcp_scaling_ratio_init(struct sock *sk)
> > +{
> > +     /* Assume a conservative default of 1200 bytes of payload per 4K page.
> > +      * This may be adjusted later in tcp_measure_rcv_mss().
> > +      */
> > +     tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
> > +                                 SKB_TRUESIZE(4096);
>
> I'm giving this patch a closer look because mptcp_rcv_space_adjust
> needs to be updated on top of it. Should SKB_TRUESIZE(4096) be replaced
> with:
>
> 4096 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
>
> to be more accurate? The page should already include the shared info,
> right?
>
> Likely not very relevant as the ratio is updated to a more accurate
> value with the first packet, mostly to try to understand the code
> correctly.

Hi Paolo.

As discussed with Soheil, I do not think the initial value for
tp->scaling_ratio is very important,
as long as it is not too small of course (otherwise we would have to
increase tcp_rmem[1] to avoid regressions for the first RTT)

Rationale for not adding sizeof(struct skb_shared_info) is because
with GRO, this extra cost tends to disappear.
(Same idea used in truesize_adjust())

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-17 15:29 [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale Eric Dumazet
                   ` (2 preceding siblings ...)
  2023-07-20 15:41 ` Paolo Abeni
@ 2023-11-03  6:10 ` Jiri Slaby
  2023-11-03  6:56   ` Jiri Slaby
  2024-04-02 15:28 ` shironeko
  4 siblings, 1 reply; 23+ messages in thread
From: Jiri Slaby @ 2023-11-03  6:10 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On 17. 07. 23, 17:29, Eric Dumazet wrote:
> With modern NIC drivers shifting to full page allocations per
> received frame, we face the following issue:
> 
> TCP has one per-netns sysctl used to tweak how to translate
> a memory use into an expected payload (RWIN), in RX path.
> 
> tcp_win_from_space() implementation is limited to few cases.
> 
> For hosts dealing with various MSS, we either under estimate
> or over estimate the RWIN we send to the remote peers.
> 
> For instance with the default sysctl_tcp_adv_win_scale value,
> we expect to store 50% of payload per allocated chunk of memory.
> 
> For the typical use of MTU=1500 traffic, and order-0 pages allocations
> by NIC drivers, we are sending too big RWIN, leading to potential
> tcp collapse operations, which are extremely expensive and source
> of latency spikes.
> 
> This patch makes sysctl_tcp_adv_win_scale obsolete, and instead
> uses a per socket scaling factor, so that we can precisely
> adjust the RWIN based on effective skb->len/skb->truesize ratio.
> 
> This patch alone can double TCP receive performance when receivers
> are too slow to drain their receive queue, or by allowing
> a bigger RWIN when MSS is close to PAGE_SIZE.

Hi,

I bisected a python-eventlet test failure:
 > =================================== FAILURES 
===================================
 > _______________________ TestGreenSocket.test_full_duplex 
_______________________
 >
 > self = <tests.greenio_test.TestGreenSocket testMethod=test_full_duplex>
 >
 >     def test_full_duplex(self):
 > ...
 > >       large_evt.wait()
 >
 > tests/greenio_test.py:424:
 > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _
 > eventlet/greenthread.py:181: in wait
 >     return self._exit_event.wait()
 > eventlet/event.py:125: in wait
 >     result = hub.switch()
...
 > E       tests.TestIsTakingTooLong: 1
 >
 > eventlet/hubs/hub.py:313: TestIsTakingTooLong

to this commit. With the commit, the test takes > 1.5 s. Without the 
commit it takes only < 300 ms. And they set timeout to 1 s.

The reduced self-standing test case:
#!/usr/bin/python3
import eventlet
from eventlet.green import select, socket, time, ssl

def bufsized(sock, size=1):
     """ Resize both send and receive buffers on a socket.
     Useful for testing trampoline.  Returns the socket.

     >>> import socket
     >>> sock = bufsized(socket.socket(socket.AF_INET, socket.SOCK_STREAM))
     """
     sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, size)
     sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, size)
     return sock

def min_buf_size():
     """Return the minimum buffer size that the platform supports."""
     test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 1)
     return test_sock.getsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF)

def test_full_duplex():
     large_data = b'*' * 10 * min_buf_size()
     listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
     listener.bind(('127.0.0.1', 0))
     listener.listen(50)
     bufsized(listener)

     def send_large(sock):
         sock.sendall(large_data)

     def read_large(sock):
         result = sock.recv(len(large_data))
         while len(result) < len(large_data):
             result += sock.recv(len(large_data))
         assert result == large_data

     def server():
         (sock, addr) = listener.accept()
         sock = bufsized(sock)
         send_large_coro = eventlet.spawn(send_large, sock)
         eventlet.sleep(0)
         result = sock.recv(10)
         expected = b'hello world'
         while len(result) < len(expected):
             result += sock.recv(10)
         assert result == expected
         send_large_coro.wait()

     server_evt = eventlet.spawn(server)
     client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     client.connect(('127.0.0.1', listener.getsockname()[1]))
     bufsized(client)
     large_evt = eventlet.spawn(read_large, client)
     eventlet.sleep(0)
     client.sendall(b'hello world')
     server_evt.wait()
     large_evt.wait()
     client.close()

test_full_duplex()

=====================================

I speak neither python nor networking, so any ideas :)? Is the test 
simply wrong?

Thanks.

> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>   Documentation/networking/ip-sysctl.rst |  1 +
>   include/linux/tcp.h                    |  4 +++-
>   include/net/netns/ipv4.h               |  2 +-
>   include/net/tcp.h                      | 24 ++++++++++++++++++++----
>   net/ipv4/tcp.c                         | 11 ++++++-----
>   net/ipv4/tcp_input.c                   | 19 ++++++++++++-------
>   6 files changed, 43 insertions(+), 18 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 4a010a7cde7f8085db5ba6f1b9af53e9e5223cd5..82f2117cf2b36a834e5e391feda0210d916bff8b 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -321,6 +321,7 @@ tcp_abort_on_overflow - BOOLEAN
>   	option can harm clients of your server.
>   
>   tcp_adv_win_scale - INTEGER
> +	Obsolete since linux-6.6
>   	Count buffering overhead as bytes/2^tcp_adv_win_scale
>   	(if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
>   	if it is <= 0.
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index b4c08ac86983568a9511258708724da15d0b999e..fbcb0ce13171d46aa3697abcd48482b08e78e5e0 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
>   	return (struct tcp_request_sock *)req;
>   }
>   
> +#define TCP_RMEM_TO_WIN_SCALE 8
> +
>   struct tcp_sock {
>   	/* inet_connection_sock has to be the first member of tcp_sock */
>   	struct inet_connection_sock	inet_conn;
> @@ -238,7 +240,7 @@ struct tcp_sock {
>   
>   	u32	window_clamp;	/* Maximal window to advertise		*/
>   	u32	rcv_ssthresh;	/* Current window clamp			*/
> -
> +	u8	scaling_ratio;	/* see tcp_win_from_space() */
>   	/* Information of the most recently (s)acked skb */
>   	struct tcp_rack {
>   		u64 mstamp; /* (Re)sent time of the skb */
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index f003747181593559a4efe1838be719d445417041..7a41c4791536732005cedbb80c223b86aa43249e 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -152,7 +152,7 @@ struct netns_ipv4 {
>   	u8 sysctl_tcp_abort_on_overflow;
>   	u8 sysctl_tcp_fack; /* obsolete */
>   	int sysctl_tcp_max_reordering;
> -	int sysctl_tcp_adv_win_scale;
> +	int sysctl_tcp_adv_win_scale; /* obsolete */
>   	u8 sysctl_tcp_dsack;
>   	u8 sysctl_tcp_app_win;
>   	u8 sysctl_tcp_frto;
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 226bce6d1e8c30185260baadec449b67323db91c..2104a71c75ba7eee40612395be4103ae370b3c03 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1434,11 +1434,27 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
>   
>   static inline int tcp_win_from_space(const struct sock *sk, int space)
>   {
> -	int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
> +	s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio;
>   
> -	return tcp_adv_win_scale <= 0 ?
> -		(space>>(-tcp_adv_win_scale)) :
> -		space - (space>>tcp_adv_win_scale);
> +	return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
> +}
> +
> +/* inverse of tcp_win_from_space() */
> +static inline int tcp_space_from_win(const struct sock *sk, int win)
> +{
> +	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
> +
> +	do_div(val, tcp_sk(sk)->scaling_ratio);
> +	return val;
> +}
> +
> +static inline void tcp_scaling_ratio_init(struct sock *sk)
> +{
> +	/* Assume a conservative default of 1200 bytes of payload per 4K page.
> +	 * This may be adjusted later in tcp_measure_rcv_mss().
> +	 */
> +	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
> +				    SKB_TRUESIZE(4096);
>   }
>   
>   /* Note: caller must be prepared to deal with negative returns */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index e03e08745308189c9d64509c2cff94da56c86a0c..88f4ebab12acc11d5f3feb6b13974a0b8e565671 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk)
>   
>   	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
>   	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
> +	tcp_scaling_ratio_init(sk);
>   
>   	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
>   	sk_sockets_allocated_inc(sk);
> @@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len);
>   /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
>   int tcp_set_rcvlowat(struct sock *sk, int val)
>   {
> -	int cap;
> +	int space, cap;
>   
>   	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
>   		cap = sk->sk_rcvbuf >> 1;
> @@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
>   	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
>   		return 0;
>   
> -	val <<= 1;
> -	if (val > sk->sk_rcvbuf) {
> -		WRITE_ONCE(sk->sk_rcvbuf, val);
> -		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
> +	space = tcp_space_from_win(sk, val);
> +	if (space > sk->sk_rcvbuf) {
> +		WRITE_ONCE(sk->sk_rcvbuf, space);
> +		tcp_sk(sk)->window_clamp = val;
>   	}
>   	return 0;
>   }
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 57c8af1859c16eba5e952a23ea959b628006f9c1..3cd92035e0902298baa8afd89ae5edcbfce300e5 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
>   	 */
>   	len = skb_shinfo(skb)->gso_size ? : skb->len;
>   	if (len >= icsk->icsk_ack.rcv_mss) {
> +		/* Note: divides are still a bit expensive.
> +		 * For the moment, only adjust scaling_ratio
> +		 * when we update icsk_ack.rcv_mss.
> +		 */
> +		if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
> +			u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
> +
> +			do_div(val, skb->truesize);
> +			tcp_sk(sk)->scaling_ratio = val ? val : 1;
> +		}
>   		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
>   					       tcp_sk(sk)->advmss);
>   		/* Account for possibly-removed options */
> @@ -727,8 +737,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
>   
>   	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
>   	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
> -		int rcvmem, rcvbuf;
>   		u64 rcvwin, grow;
> +		int rcvbuf;
>   
>   		/* minimal window to cope with packet losses, assuming
>   		 * steady state. Add some cushion because of small variations.
> @@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
>   		do_div(grow, tp->rcvq_space.space);
>   		rcvwin += (grow << 1);
>   
> -		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
> -		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
> -			rcvmem += 128;
> -
> -		do_div(rcvwin, tp->advmss);
> -		rcvbuf = min_t(u64, rcvwin * rcvmem,
> +		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
>   			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
>   		if (rcvbuf > sk->sk_rcvbuf) {
>   			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

-- 
js
suse labs


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03  6:10 ` Jiri Slaby
@ 2023-11-03  6:56   ` Jiri Slaby
  2023-11-03  7:07     ` Jiri Slaby
  0 siblings, 1 reply; 23+ messages in thread
From: Jiri Slaby @ 2023-11-03  6:56 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On 03. 11. 23, 7:10, Jiri Slaby wrote:
> On 17. 07. 23, 17:29, Eric Dumazet wrote:
>> With modern NIC drivers shifting to full page allocations per
>> received frame, we face the following issue:
>>
>> TCP has one per-netns sysctl used to tweak how to translate
>> a memory use into an expected payload (RWIN), in RX path.
>>
>> tcp_win_from_space() implementation is limited to few cases.
>>
>> For hosts dealing with various MSS, we either under estimate
>> or over estimate the RWIN we send to the remote peers.
>>
>> For instance with the default sysctl_tcp_adv_win_scale value,
>> we expect to store 50% of payload per allocated chunk of memory.
>>
>> For the typical use of MTU=1500 traffic, and order-0 pages allocations
>> by NIC drivers, we are sending too big RWIN, leading to potential
>> tcp collapse operations, which are extremely expensive and source
>> of latency spikes.
>>
>> This patch makes sysctl_tcp_adv_win_scale obsolete, and instead
>> uses a per socket scaling factor, so that we can precisely
>> adjust the RWIN based on effective skb->len/skb->truesize ratio.
>>
>> This patch alone can double TCP receive performance when receivers
>> are too slow to drain their receive queue, or by allowing
>> a bigger RWIN when MSS is close to PAGE_SIZE.
> 
> Hi,
> 
> I bisected a python-eventlet test failure:
>  > =================================== FAILURES 
> ===================================
>  > _______________________ TestGreenSocket.test_full_duplex 
> _______________________
>  >
>  > self = <tests.greenio_test.TestGreenSocket testMethod=test_full_duplex>
>  >
>  >     def test_full_duplex(self):
>  > ...
>  > >       large_evt.wait()
>  >
>  > tests/greenio_test.py:424:
>  > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
> _ _ _ _ _
>  > eventlet/greenthread.py:181: in wait
>  >     return self._exit_event.wait()
>  > eventlet/event.py:125: in wait
>  >     result = hub.switch()
> ...
>  > E       tests.TestIsTakingTooLong: 1
>  >
>  > eventlet/hubs/hub.py:313: TestIsTakingTooLong
> 
> to this commit. With the commit, the test takes > 1.5 s. Without the 
> commit it takes only < 300 ms. And they set timeout to 1 s.
> 
> The reduced self-standing test case:
> #!/usr/bin/python3
> import eventlet
> from eventlet.green import select, socket, time, ssl
> 
> def bufsized(sock, size=1):
>      """ Resize both send and receive buffers on a socket.
>      Useful for testing trampoline.  Returns the socket.
> 
>      >>> import socket
>      >>> sock = bufsized(socket.socket(socket.AF_INET, socket.SOCK_STREAM))
>      """
>      sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, size)
>      sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, size)
>      return sock
> 
> def min_buf_size():
>      """Return the minimum buffer size that the platform supports."""
>      test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
>      test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 1)
>      return test_sock.getsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF)
> 
> def test_full_duplex():
>      large_data = b'*' * 10 * min_buf_size()
>      listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
>      listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
>      listener.bind(('127.0.0.1', 0))
>      listener.listen(50)
>      bufsized(listener)
> 
>      def send_large(sock):
>          sock.sendall(large_data)
> 
>      def read_large(sock):
>          result = sock.recv(len(large_data))
>          while len(result) < len(large_data):
>              result += sock.recv(len(large_data))
>          assert result == large_data
> 
>      def server():
>          (sock, addr) = listener.accept()
>          sock = bufsized(sock)
>          send_large_coro = eventlet.spawn(send_large, sock)
>          eventlet.sleep(0)
>          result = sock.recv(10)
>          expected = b'hello world'
>          while len(result) < len(expected):
>              result += sock.recv(10)
>          assert result == expected
>          send_large_coro.wait()
> 
>      server_evt = eventlet.spawn(server)
>      client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
>      client.connect(('127.0.0.1', listener.getsockname()[1]))
>      bufsized(client)
>      large_evt = eventlet.spawn(read_large, client)
>      eventlet.sleep(0)
>      client.sendall(b'hello world')
>      server_evt.wait()
>      large_evt.wait()
>      client.close()
> 
> test_full_duplex()
> 
> =====================================
> 
> I speak neither python nor networking, so any ideas :)? Is the test 
> simply wrong?

strace -rT -e trace=network:

GOOD:
 > 0.000000 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
<0.000063>
 > 0.000406 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000042>
 > 0.000097 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [4608], [4]) = 0 
<0.000012>
 > 0.000101 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
<0.000015>
 > 0.000058 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 <0.000009>
 > 0.000035 bind(3, {sa_family=AF_INET, sin_port=htons(0), 
sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000027>
 > 0.000058 listen(3, 50)       = 0 <0.000014>
 > 0.000029 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
 > 0.000023 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000008>
 > 0.000052 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 5 
<0.000014>
 > 0.000050 getsockname(3, {sa_family=AF_INET, sin_port=htons(44313), 
sin_addr=inet_addr("127.0.0.1")}, [16]) = 0 <0.000011>
 > 0.000037 connect(5, {sa_family=AF_INET, sin_port=htons(44313), 
sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in 
progress) <0.000070>
 > 0.000210 accept4(3, {sa_family=AF_INET, sin_port=htons(56062), 
sin_addr=inet_addr("127.0.0.1")}, [16], SOCK_CLOEXEC) = 6 <0.000012>
 > 0.000040 getsockname(6, {sa_family=AF_INET, sin_port=htons(44313), 
sin_addr=inet_addr("127.0.0.1")}, [128 => 16]) = 0 <0.000007>
 > 0.000062 setsockopt(6, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000007>
 > 0.000020 setsockopt(6, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000007>
 > 0.000082 getsockopt(5, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 <0.000007>
 > 0.000023 connect(5, {sa_family=AF_INET, sin_port=htons(44313), 
sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000008>
 > 0.000022 setsockopt(5, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000020>
 > 0.000036 setsockopt(5, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000007>
 > 0.000061 sendto(6, "********************************"..., 46080, 0, 
NULL, 0) = 32768 <0.000049>
 > 0.000135 sendto(6, "********************************"..., 13312, 0, 
NULL, 0) = 13312 <0.000017>
 > 0.000087 recvfrom(6, 0x7f78e58af890, 10, 0, NULL, NULL) = -1 EAGAIN 
(Resource temporarily unavailable) <0.000010>
 > 0.000125 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 32768 <0.000032>
 > 0.000066 recvfrom(5, 0x55fdb41bb880, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000011>
 > 0.000075 sendto(5, "hello world", 11, 0, NULL, 0) = 11 <0.000023>
 > 0.000117 recvfrom(6, "hello worl", 10, 0, NULL, NULL) = 10 <0.000015>
 > 0.000050 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000011>
 > 0.000212 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 13312 <0.000019>
 > 0.050676 +++ exited with 0 +++


BAD:
 > 0.000000 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
<0.000045>
 > 0.000244 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000015>
 > 0.000057 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [4608], [4]) = 0 <0.000013>
 > 0.000104 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
<0.000016>
 > 0.000065 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 <0.000010>
 > 0.000038 bind(3, {sa_family=AF_INET, sin_port=htons(0), 
sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000031>
 > 0.000068 listen(3, 50)       = 0 <0.000014>
 > 0.000032 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000010>
 > 0.000030 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000018>
 > 0.000060 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 5 
<0.000023>
 > 0.000071 getsockname(3, {sa_family=AF_INET, sin_port=htons(45901), 
sin_addr=inet_addr("127.0.0.1")}, [16]) = 0 <0.000019>
 > 0.000068 connect(5, {sa_family=AF_INET, sin_port=htons(45901), 
sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in 
progress) <0.000074>
 > 0.000259 accept4(3, {sa_family=AF_INET, sin_port=htons(35002), 
sin_addr=inet_addr("127.0.0.1")}, [16], SOCK_CLOEXEC) = 6 <0.000014>
 > 0.000051 getsockname(6, {sa_family=AF_INET, sin_port=htons(45901), 
sin_addr=inet_addr("127.0.0.1")}, [128 => 16]) = 0 <0.000010>
 > 0.000082 setsockopt(6, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
 > 0.000040 setsockopt(6, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000009>
 > 0.000104 getsockopt(5, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 <0.000009>
 > 0.000028 connect(5, {sa_family=AF_INET, sin_port=htons(45901), 
sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000009>
 > 0.000026 setsockopt(5, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
 > 0.000024 setsockopt(5, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000008>
 > 0.000071 sendto(6, "********************************"..., 46080, 0, 
NULL, 0) = 16640 <0.000026>
 > 0.000117 sendto(6, "********************************"..., 29440, 0, 
NULL, 0) = 16640 <0.000017>
 > 0.000041 sendto(6, "********************************"..., 12800, 0, 
NULL, 0) = -1 EAGAIN (Resource temporarily unavailable) <0.000009>
 > 0.000075 recvfrom(6, 0x7f4db88a38c0, 10, 0, NULL, NULL) = -1 EAGAIN 
(Resource temporarily unavailable) <0.000010>
 > 0.000086 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 16640 <0.000018>
 > 0.000044 recvfrom(5, 0x55a64d59b2a0, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000009>
 > 0.000059 sendto(5, "hello world", 11, 0, NULL, 0) = 11 <0.000018>
 > 0.000093 recvfrom(6, "hello worl", 10, 0, NULL, NULL) = 10 <0.000009>
 > 0.000029 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000009>
 > 0.206685 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 16640 <0.000116>
 > 0.000306 recvfrom(5, 0x55a64d5a7600, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000013>
 > 0.000208 sendto(6, "********************************"..., 12800, 0, 
NULL, 0) = 12800 <0.000025>
 > 0.206317 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000171>
 > 0.000304 recvfrom(5, 0x55a64d597170, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000029>
 > 0.206161 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000082>
 > 0.000212 recvfrom(5, 0x55a64d5a0ed0, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000034>
 > 0.206572 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000146>
 > 0.000274 recvfrom(5, 0x55a64d597170, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000029>
 > 0.206604 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000162>
 > 0.000270 recvfrom(5, 0x55a64d5a20d0, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000016>
 > 0.206164 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000116>
 > 0.000291 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 1280 <0.000038>
 > 0.052224 +++ exited with 0 +++

I.e. recvfrom() returns -EAGAIN and takes 200 ms.


-- 
js
suse labs


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03  6:56   ` Jiri Slaby
@ 2023-11-03  7:07     ` Jiri Slaby
  2023-11-03  8:17       ` Eric Dumazet
  0 siblings, 1 reply; 23+ messages in thread
From: Jiri Slaby @ 2023-11-03  7:07 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On 03. 11. 23, 7:56, Jiri Slaby wrote:
> On 03. 11. 23, 7:10, Jiri Slaby wrote:
>> On 17. 07. 23, 17:29, Eric Dumazet wrote:
>>> With modern NIC drivers shifting to full page allocations per
>>> received frame, we face the following issue:
>>>
>>> TCP has one per-netns sysctl used to tweak how to translate
>>> a memory use into an expected payload (RWIN), in RX path.
>>>
>>> tcp_win_from_space() implementation is limited to few cases.
>>>
>>> For hosts dealing with various MSS, we either under estimate
>>> or over estimate the RWIN we send to the remote peers.
>>>
>>> For instance with the default sysctl_tcp_adv_win_scale value,
>>> we expect to store 50% of payload per allocated chunk of memory.
>>>
>>> For the typical use of MTU=1500 traffic, and order-0 pages allocations
>>> by NIC drivers, we are sending too big RWIN, leading to potential
>>> tcp collapse operations, which are extremely expensive and source
>>> of latency spikes.
>>>
>>> This patch makes sysctl_tcp_adv_win_scale obsolete, and instead
>>> uses a per socket scaling factor, so that we can precisely
>>> adjust the RWIN based on effective skb->len/skb->truesize ratio.
>>>
>>> This patch alone can double TCP receive performance when receivers
>>> are too slow to drain their receive queue, or by allowing
>>> a bigger RWIN when MSS is close to PAGE_SIZE.
>>
>> Hi,
>>
>> I bisected a python-eventlet test failure:
>>  > =================================== FAILURES 
>> ===================================
>>  > _______________________ TestGreenSocket.test_full_duplex 
>> _______________________
>>  >
>>  > self = <tests.greenio_test.TestGreenSocket 
>> testMethod=test_full_duplex>
>>  >
>>  >     def test_full_duplex(self):
>>  > ...
>>  > >       large_evt.wait()
>>  >
>>  > tests/greenio_test.py:424:
>>  > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
>> _ _ _ _ _ _
>>  > eventlet/greenthread.py:181: in wait
>>  >     return self._exit_event.wait()
>>  > eventlet/event.py:125: in wait
>>  >     result = hub.switch()
>> ...
>>  > E       tests.TestIsTakingTooLong: 1
>>  >
>>  > eventlet/hubs/hub.py:313: TestIsTakingTooLong
>>
>> to this commit. With the commit, the test takes > 1.5 s. Without the 
>> commit it takes only < 300 ms. And they set timeout to 1 s.
>>
>> The reduced self-standing test case:
>> #!/usr/bin/python3
>> import eventlet
>> from eventlet.green import select, socket, time, ssl
>>
>> def bufsized(sock, size=1):
>>      """ Resize both send and receive buffers on a socket.
>>      Useful for testing trampoline.  Returns the socket.
>>
>>      >>> import socket
>>      >>> sock = bufsized(socket.socket(socket.AF_INET, 
>> socket.SOCK_STREAM))
>>      """
>>      sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, size)
>>      sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, size)
>>      return sock
>>
>> def min_buf_size():
>>      """Return the minimum buffer size that the platform supports."""
>>      test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
>>      test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 1)
>>      return test_sock.getsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF)
>>
>> def test_full_duplex():
>>      large_data = b'*' * 10 * min_buf_size()
>>      listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
>>      listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
>>      listener.bind(('127.0.0.1', 0))
>>      listener.listen(50)
>>      bufsized(listener)
>>
>>      def send_large(sock):
>>          sock.sendall(large_data)
>>
>>      def read_large(sock):
>>          result = sock.recv(len(large_data))
>>          while len(result) < len(large_data):
>>              result += sock.recv(len(large_data))
>>          assert result == large_data
>>
>>      def server():
>>          (sock, addr) = listener.accept()
>>          sock = bufsized(sock)
>>          send_large_coro = eventlet.spawn(send_large, sock)
>>          eventlet.sleep(0)
>>          result = sock.recv(10)
>>          expected = b'hello world'
>>          while len(result) < len(expected):
>>              result += sock.recv(10)
>>          assert result == expected
>>          send_large_coro.wait()
>>
>>      server_evt = eventlet.spawn(server)
>>      client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
>>      client.connect(('127.0.0.1', listener.getsockname()[1]))
>>      bufsized(client)
>>      large_evt = eventlet.spawn(read_large, client)
>>      eventlet.sleep(0)
>>      client.sendall(b'hello world')
>>      server_evt.wait()
>>      large_evt.wait()
>>      client.close()
>>
>> test_full_duplex()
>>
>> =====================================
>>
>> I speak neither python nor networking, so any ideas :)? Is the test 
>> simply wrong?
> 
> strace -rT -e trace=network:
> 
> GOOD:
>  > 0.000000 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
> <0.000063>
>  > 0.000406 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000042>
>  > 0.000097 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [4608], [4]) = 0 
> <0.000012>
>  > 0.000101 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
> <0.000015>
>  > 0.000058 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 <0.000009>
>  > 0.000035 bind(3, {sa_family=AF_INET, sin_port=htons(0), 
> sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000027>
>  > 0.000058 listen(3, 50)       = 0 <0.000014>
>  > 0.000029 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
>  > 0.000023 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000008>
>  > 0.000052 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 5 
> <0.000014>
>  > 0.000050 getsockname(3, {sa_family=AF_INET, sin_port=htons(44313), 
> sin_addr=inet_addr("127.0.0.1")}, [16]) = 0 <0.000011>
>  > 0.000037 connect(5, {sa_family=AF_INET, sin_port=htons(44313), 
> sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in 
> progress) <0.000070>
>  > 0.000210 accept4(3, {sa_family=AF_INET, sin_port=htons(56062), 
> sin_addr=inet_addr("127.0.0.1")}, [16], SOCK_CLOEXEC) = 6 <0.000012>
>  > 0.000040 getsockname(6, {sa_family=AF_INET, sin_port=htons(44313), 
> sin_addr=inet_addr("127.0.0.1")}, [128 => 16]) = 0 <0.000007>
>  > 0.000062 setsockopt(6, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000007>
>  > 0.000020 setsockopt(6, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000007>
>  > 0.000082 getsockopt(5, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 <0.000007>
>  > 0.000023 connect(5, {sa_family=AF_INET, sin_port=htons(44313), 
> sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000008>
>  > 0.000022 setsockopt(5, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000020>
>  > 0.000036 setsockopt(5, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000007>
>  > 0.000061 sendto(6, "********************************"..., 46080, 0, 
> NULL, 0) = 32768 <0.000049>
>  > 0.000135 sendto(6, "********************************"..., 13312, 0, 
> NULL, 0) = 13312 <0.000017>
>  > 0.000087 recvfrom(6, 0x7f78e58af890, 10, 0, NULL, NULL) = -1 EAGAIN 
> (Resource temporarily unavailable) <0.000010>
>  > 0.000125 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 32768 <0.000032>
>  > 0.000066 recvfrom(5, 0x55fdb41bb880, 46080, 0, NULL, NULL) = -1 
> EAGAIN (Resource temporarily unavailable) <0.000011>
>  > 0.000075 sendto(5, "hello world", 11, 0, NULL, 0) = 11 <0.000023>
>  > 0.000117 recvfrom(6, "hello worl", 10, 0, NULL, NULL) = 10 <0.000015>
>  > 0.000050 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000011>
>  > 0.000212 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 13312 <0.000019>
>  > 0.050676 +++ exited with 0 +++
> 
> 
> BAD:
>  > 0.000000 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
> <0.000045>
>  > 0.000244 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000015>
>  > 0.000057 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [4608], [4]) = 0 
> <0.000013>
>  > 0.000104 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3 
> <0.000016>
>  > 0.000065 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 <0.000010>
>  > 0.000038 bind(3, {sa_family=AF_INET, sin_port=htons(0), 
> sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000031>
>  > 0.000068 listen(3, 50)       = 0 <0.000014>
>  > 0.000032 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000010>
>  > 0.000030 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000018>
>  > 0.000060 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 5 
> <0.000023>
>  > 0.000071 getsockname(3, {sa_family=AF_INET, sin_port=htons(45901), 
> sin_addr=inet_addr("127.0.0.1")}, [16]) = 0 <0.000019>
>  > 0.000068 connect(5, {sa_family=AF_INET, sin_port=htons(45901), 
> sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in 
> progress) <0.000074>
>  > 0.000259 accept4(3, {sa_family=AF_INET, sin_port=htons(35002), 
> sin_addr=inet_addr("127.0.0.1")}, [16], SOCK_CLOEXEC) = 6 <0.000014>
>  > 0.000051 getsockname(6, {sa_family=AF_INET, sin_port=htons(45901), 
> sin_addr=inet_addr("127.0.0.1")}, [128 => 16]) = 0 <0.000010>
>  > 0.000082 setsockopt(6, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
>  > 0.000040 setsockopt(6, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000009>
>  > 0.000104 getsockopt(5, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 <0.000009>
>  > 0.000028 connect(5, {sa_family=AF_INET, sin_port=htons(45901), 
> sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000009>
>  > 0.000026 setsockopt(5, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
>  > 0.000024 setsockopt(5, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000008>
>  > 0.000071 sendto(6, "********************************"..., 46080, 0, 
> NULL, 0) = 16640 <0.000026>
>  > 0.000117 sendto(6, "********************************"..., 29440, 0, 
> NULL, 0) = 16640 <0.000017>
>  > 0.000041 sendto(6, "********************************"..., 12800, 0, 
> NULL, 0) = -1 EAGAIN (Resource temporarily unavailable) <0.000009>
>  > 0.000075 recvfrom(6, 0x7f4db88a38c0, 10, 0, NULL, NULL) = -1 EAGAIN 
> (Resource temporarily unavailable) <0.000010>
>  > 0.000086 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 16640 <0.000018>
>  > 0.000044 recvfrom(5, 0x55a64d59b2a0, 46080, 0, NULL, NULL) = -1 
> EAGAIN (Resource temporarily unavailable) <0.000009>
>  > 0.000059 sendto(5, "hello world", 11, 0, NULL, 0) = 11 <0.000018>
>  > 0.000093 recvfrom(6, "hello worl", 10, 0, NULL, NULL) = 10 <0.000009>
>  > 0.000029 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000009>
>  > 0.206685 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 16640 <0.000116>
>  > 0.000306 recvfrom(5, 0x55a64d5a7600, 46080, 0, NULL, NULL) = -1 
> EAGAIN (Resource temporarily unavailable) <0.000013>
>  > 0.000208 sendto(6, "********************************"..., 12800, 0, 
> NULL, 0) = 12800 <0.000025>
>  > 0.206317 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 2304 <0.000171>
>  > 0.000304 recvfrom(5, 0x55a64d597170, 46080, 0, NULL, NULL) = -1 
> EAGAIN (Resource temporarily unavailable) <0.000029>
>  > 0.206161 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 2304 <0.000082>
>  > 0.000212 recvfrom(5, 0x55a64d5a0ed0, 46080, 0, NULL, NULL) = -1 
> EAGAIN (Resource temporarily unavailable) <0.000034>
>  > 0.206572 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 2304 <0.000146>
>  > 0.000274 recvfrom(5, 0x55a64d597170, 46080, 0, NULL, NULL) = -1 
> EAGAIN (Resource temporarily unavailable) <0.000029>
>  > 0.206604 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 2304 <0.000162>
>  > 0.000270 recvfrom(5, 0x55a64d5a20d0, 46080, 0, NULL, NULL) = -1 
> EAGAIN (Resource temporarily unavailable) <0.000016>
>  > 0.206164 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 2304 <0.000116>
>  > 0.000291 recvfrom(5, "********************************"..., 46080, 0, 
> NULL, NULL) = 1280 <0.000038>
>  > 0.052224 +++ exited with 0 +++
> 
> I.e. recvfrom() returns -EAGAIN and takes 200 ms.

Ah, no, those 200 ms are not spent in recvfrom() but in poll:
 > 0.000029 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000009>
 > 0.000040 epoll_wait(4, [{events=EPOLLIN, data={u32=5, 
u64=139942919405573}}], 1023, 60000) = 1 <0.204038>
 > 0.204258 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b4049c) = 0 <0.000045>
 > 0.000104 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 16640 <0.000078>
 > 0.000264 recvfrom(5, 0x5603425f3550, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000025>
 > 0.000127 epoll_ctl(4, EPOLL_CTL_ADD, 5, 
{events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5, 
u64=94570884890629}}) = 0 <0.000031>
 > 0.000112 epoll_wait(4, [{events=EPOLLOUT, data={u32=6, 
u64=139942919405574}}], 1023, 60000) = 1 <0.000026>
 > 0.000083 epoll_ctl(4, EPOLL_CTL_DEL, 6, 0x7ffd49b404fc) = 0 <0.000025>
 > 0.000063 sendto(6, "********************************"..., 12800, 0, 
NULL, 0) = 12800 <0.000028>
 > 0.000226 epoll_wait(4, [], 1023, 0) = 0 <0.000007>
 > 0.000029 epoll_wait(4, [{events=EPOLLIN, data={u32=5, 
u64=94570884890629}}], 1023, 60000) = 1 <0.205476>
 > 0.205728 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000077>
 > 0.000157 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000066>
 > 0.000180 recvfrom(5, 0x5603425e30c0, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000026>
 > 0.000139 epoll_ctl(4, EPOLL_CTL_ADD, 5, 
{events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5, u64=5}}) = 0 
<0.000030>
 > 0.000104 epoll_wait(4, [{events=EPOLLIN, data={u32=5, u64=5}}], 1023, 
60000) = 1 <0.205881>
 > 0.206222 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000086>
 > 0.000189 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000027>
 > 0.000153 recvfrom(5, 0x5603425ece20, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000017>
 > 0.000074 epoll_ctl(4, EPOLL_CTL_ADD, 5, 
{events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5, u64=5}}) = 0 
<0.000018>
 > 0.000067 epoll_wait(4, [{events=EPOLLIN, data={u32=5, u64=5}}], 1023, 
60000) = 1 <0.205749>
 > 0.206088 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000197>
 > 0.000391 recvfrom(5, "********************************"..., 46080, 0, 
NULL, NULL) = 2304 <0.000080>
 > 0.000210 recvfrom(5, 0x5603425e30c0, 46080, 0, NULL, NULL) = -1 
EAGAIN (Resource temporarily unavailable) <0.000027>
 > 0.000174 epoll_ctl(4, EPOLL_CTL_ADD, 5, 
{events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5, u64=5}}) = 0 
<0.000031>
 > 0.000127 epoll_wait(4, [{events=EPOLLIN, data={u32=5, u64=5}}], 1023, 
60000) = 1 <0.205471>
 > 0.205783 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000205>


-- 
js
suse labs


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03  7:07     ` Jiri Slaby
@ 2023-11-03  8:17       ` Eric Dumazet
  2023-11-03  9:27         ` Michal Kubecek
  2023-11-03 10:14         ` Jiri Slaby
  0 siblings, 2 replies; 23+ messages in thread
From: Eric Dumazet @ 2023-11-03  8:17 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Fri, Nov 3, 2023 at 8:07 AM Jiri Slaby <jirislaby@kernel.org> wrote:
>
> On 03. 11. 23, 7:56, Jiri Slaby wrote:
> > On 03. 11. 23, 7:10, Jiri Slaby wrote:
> >> On 17. 07. 23, 17:29, Eric Dumazet wrote:
> >>> With modern NIC drivers shifting to full page allocations per
> >>> received frame, we face the following issue:
> >>>
> >>> TCP has one per-netns sysctl used to tweak how to translate
> >>> a memory use into an expected payload (RWIN), in RX path.
> >>>
> >>> tcp_win_from_space() implementation is limited to few cases.
> >>>
> >>> For hosts dealing with various MSS, we either under estimate
> >>> or over estimate the RWIN we send to the remote peers.
> >>>
> >>> For instance with the default sysctl_tcp_adv_win_scale value,
> >>> we expect to store 50% of payload per allocated chunk of memory.
> >>>
> >>> For the typical use of MTU=1500 traffic, and order-0 pages allocations
> >>> by NIC drivers, we are sending too big RWIN, leading to potential
> >>> tcp collapse operations, which are extremely expensive and source
> >>> of latency spikes.
> >>>
> >>> This patch makes sysctl_tcp_adv_win_scale obsolete, and instead
> >>> uses a per socket scaling factor, so that we can precisely
> >>> adjust the RWIN based on effective skb->len/skb->truesize ratio.
> >>>
> >>> This patch alone can double TCP receive performance when receivers
> >>> are too slow to drain their receive queue, or by allowing
> >>> a bigger RWIN when MSS is close to PAGE_SIZE.
> >>
> >> Hi,
> >>
> >> I bisected a python-eventlet test failure:
> >>  > =================================== FAILURES
> >> ===================================
> >>  > _______________________ TestGreenSocket.test_full_duplex
> >> _______________________
> >>  >
> >>  > self = <tests.greenio_test.TestGreenSocket
> >> testMethod=test_full_duplex>
> >>  >
> >>  >     def test_full_duplex(self):
> >>  > ...
> >>  > >       large_evt.wait()
> >>  >
> >>  > tests/greenio_test.py:424:
> >>  > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> >> _ _ _ _ _ _
> >>  > eventlet/greenthread.py:181: in wait
> >>  >     return self._exit_event.wait()
> >>  > eventlet/event.py:125: in wait
> >>  >     result = hub.switch()
> >> ...
> >>  > E       tests.TestIsTakingTooLong: 1
> >>  >
> >>  > eventlet/hubs/hub.py:313: TestIsTakingTooLong
> >>
> >> to this commit. With the commit, the test takes > 1.5 s. Without the
> >> commit it takes only < 300 ms. And they set timeout to 1 s.
> >>
> >> The reduced self-standing test case:
> >> #!/usr/bin/python3
> >> import eventlet
> >> from eventlet.green import select, socket, time, ssl
> >>
> >> def bufsized(sock, size=1):
> >>      """ Resize both send and receive buffers on a socket.
> >>      Useful for testing trampoline.  Returns the socket.
> >>
> >>      >>> import socket
> >>      >>> sock = bufsized(socket.socket(socket.AF_INET,
> >> socket.SOCK_STREAM))
> >>      """
> >>      sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, size)
> >>      sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, size)
> >>      return sock
> >>
> >> def min_buf_size():
> >>      """Return the minimum buffer size that the platform supports."""
> >>      test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
> >>      test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 1)
> >>      return test_sock.getsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF)
> >>
> >> def test_full_duplex():
> >>      large_data = b'*' * 10 * min_buf_size()
> >>      listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
> >>      listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
> >>      listener.bind(('127.0.0.1', 0))
> >>      listener.listen(50)
> >>      bufsized(listener)
> >>
> >>      def send_large(sock):
> >>          sock.sendall(large_data)
> >>
> >>      def read_large(sock):
> >>          result = sock.recv(len(large_data))
> >>          while len(result) < len(large_data):
> >>              result += sock.recv(len(large_data))
> >>          assert result == large_data
> >>
> >>      def server():
> >>          (sock, addr) = listener.accept()
> >>          sock = bufsized(sock)
> >>          send_large_coro = eventlet.spawn(send_large, sock)
> >>          eventlet.sleep(0)
> >>          result = sock.recv(10)
> >>          expected = b'hello world'
> >>          while len(result) < len(expected):
> >>              result += sock.recv(10)
> >>          assert result == expected
> >>          send_large_coro.wait()
> >>
> >>      server_evt = eventlet.spawn(server)
> >>      client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
> >>      client.connect(('127.0.0.1', listener.getsockname()[1]))
> >>      bufsized(client)
> >>      large_evt = eventlet.spawn(read_large, client)
> >>      eventlet.sleep(0)
> >>      client.sendall(b'hello world')
> >>      server_evt.wait()
> >>      large_evt.wait()
> >>      client.close()
> >>
> >> test_full_duplex()
> >>
> >> =====================================
> >>
> >> I speak neither python nor networking, so any ideas :)? Is the test
> >> simply wrong?
> >
> > strace -rT -e trace=network:
> >
> > GOOD:
> >  > 0.000000 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3
> > <0.000063>
> >  > 0.000406 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000042>
> >  > 0.000097 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [4608], [4]) = 0
> > <0.000012>
> >  > 0.000101 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3
> > <0.000015>
> >  > 0.000058 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 <0.000009>
> >  > 0.000035 bind(3, {sa_family=AF_INET, sin_port=htons(0),
> > sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000027>
> >  > 0.000058 listen(3, 50)       = 0 <0.000014>
> >  > 0.000029 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
> >  > 0.000023 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000008>
> >  > 0.000052 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 5
> > <0.000014>
> >  > 0.000050 getsockname(3, {sa_family=AF_INET, sin_port=htons(44313),
> > sin_addr=inet_addr("127.0.0.1")}, [16]) = 0 <0.000011>
> >  > 0.000037 connect(5, {sa_family=AF_INET, sin_port=htons(44313),
> > sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in
> > progress) <0.000070>
> >  > 0.000210 accept4(3, {sa_family=AF_INET, sin_port=htons(56062),
> > sin_addr=inet_addr("127.0.0.1")}, [16], SOCK_CLOEXEC) = 6 <0.000012>
> >  > 0.000040 getsockname(6, {sa_family=AF_INET, sin_port=htons(44313),
> > sin_addr=inet_addr("127.0.0.1")}, [128 => 16]) = 0 <0.000007>
> >  > 0.000062 setsockopt(6, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000007>
> >  > 0.000020 setsockopt(6, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000007>
> >  > 0.000082 getsockopt(5, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 <0.000007>
> >  > 0.000023 connect(5, {sa_family=AF_INET, sin_port=htons(44313),
> > sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000008>
> >  > 0.000022 setsockopt(5, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000020>
> >  > 0.000036 setsockopt(5, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000007>
> >  > 0.000061 sendto(6, "********************************"..., 46080, 0,
> > NULL, 0) = 32768 <0.000049>
> >  > 0.000135 sendto(6, "********************************"..., 13312, 0,
> > NULL, 0) = 13312 <0.000017>
> >  > 0.000087 recvfrom(6, 0x7f78e58af890, 10, 0, NULL, NULL) = -1 EAGAIN
> > (Resource temporarily unavailable) <0.000010>
> >  > 0.000125 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 32768 <0.000032>
> >  > 0.000066 recvfrom(5, 0x55fdb41bb880, 46080, 0, NULL, NULL) = -1
> > EAGAIN (Resource temporarily unavailable) <0.000011>
> >  > 0.000075 sendto(5, "hello world", 11, 0, NULL, 0) = 11 <0.000023>
> >  > 0.000117 recvfrom(6, "hello worl", 10, 0, NULL, NULL) = 10 <0.000015>
> >  > 0.000050 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000011>
> >  > 0.000212 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 13312 <0.000019>
> >  > 0.050676 +++ exited with 0 +++
> >
> >
> > BAD:
> >  > 0.000000 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3
> > <0.000045>
> >  > 0.000244 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000015>
> >  > 0.000057 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [4608], [4]) = 0
> > <0.000013>
> >  > 0.000104 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 3
> > <0.000016>
> >  > 0.000065 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 <0.000010>
> >  > 0.000038 bind(3, {sa_family=AF_INET, sin_port=htons(0),
> > sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000031>
> >  > 0.000068 listen(3, 50)       = 0 <0.000014>
> >  > 0.000032 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000010>
> >  > 0.000030 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000018>
> >  > 0.000060 socket(AF_INET, SOCK_STREAM|SOCK_CLOEXEC, IPPROTO_IP) = 5
> > <0.000023>
> >  > 0.000071 getsockname(3, {sa_family=AF_INET, sin_port=htons(45901),
> > sin_addr=inet_addr("127.0.0.1")}, [16]) = 0 <0.000019>
> >  > 0.000068 connect(5, {sa_family=AF_INET, sin_port=htons(45901),
> > sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in
> > progress) <0.000074>
> >  > 0.000259 accept4(3, {sa_family=AF_INET, sin_port=htons(35002),
> > sin_addr=inet_addr("127.0.0.1")}, [16], SOCK_CLOEXEC) = 6 <0.000014>
> >  > 0.000051 getsockname(6, {sa_family=AF_INET, sin_port=htons(45901),
> > sin_addr=inet_addr("127.0.0.1")}, [128 => 16]) = 0 <0.000010>
> >  > 0.000082 setsockopt(6, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
> >  > 0.000040 setsockopt(6, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000009>
> >  > 0.000104 getsockopt(5, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 <0.000009>
> >  > 0.000028 connect(5, {sa_family=AF_INET, sin_port=htons(45901),
> > sin_addr=inet_addr("127.0.0.1")}, 16) = 0 <0.000009>
> >  > 0.000026 setsockopt(5, SOL_SOCKET, SO_SNDBUF, [1], 4) = 0 <0.000009>
> >  > 0.000024 setsockopt(5, SOL_SOCKET, SO_RCVBUF, [1], 4) = 0 <0.000008>
> >  > 0.000071 sendto(6, "********************************"..., 46080, 0,
> > NULL, 0) = 16640 <0.000026>
> >  > 0.000117 sendto(6, "********************************"..., 29440, 0,
> > NULL, 0) = 16640 <0.000017>
> >  > 0.000041 sendto(6, "********************************"..., 12800, 0,
> > NULL, 0) = -1 EAGAIN (Resource temporarily unavailable) <0.000009>
> >  > 0.000075 recvfrom(6, 0x7f4db88a38c0, 10, 0, NULL, NULL) = -1 EAGAIN
> > (Resource temporarily unavailable) <0.000010>
> >  > 0.000086 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 16640 <0.000018>
> >  > 0.000044 recvfrom(5, 0x55a64d59b2a0, 46080, 0, NULL, NULL) = -1
> > EAGAIN (Resource temporarily unavailable) <0.000009>
> >  > 0.000059 sendto(5, "hello world", 11, 0, NULL, 0) = 11 <0.000018>
> >  > 0.000093 recvfrom(6, "hello worl", 10, 0, NULL, NULL) = 10 <0.000009>
> >  > 0.000029 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000009>
> >  > 0.206685 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 16640 <0.000116>
> >  > 0.000306 recvfrom(5, 0x55a64d5a7600, 46080, 0, NULL, NULL) = -1
> > EAGAIN (Resource temporarily unavailable) <0.000013>
> >  > 0.000208 sendto(6, "********************************"..., 12800, 0,
> > NULL, 0) = 12800 <0.000025>
> >  > 0.206317 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 2304 <0.000171>
> >  > 0.000304 recvfrom(5, 0x55a64d597170, 46080, 0, NULL, NULL) = -1
> > EAGAIN (Resource temporarily unavailable) <0.000029>
> >  > 0.206161 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 2304 <0.000082>
> >  > 0.000212 recvfrom(5, 0x55a64d5a0ed0, 46080, 0, NULL, NULL) = -1
> > EAGAIN (Resource temporarily unavailable) <0.000034>
> >  > 0.206572 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 2304 <0.000146>
> >  > 0.000274 recvfrom(5, 0x55a64d597170, 46080, 0, NULL, NULL) = -1
> > EAGAIN (Resource temporarily unavailable) <0.000029>
> >  > 0.206604 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 2304 <0.000162>
> >  > 0.000270 recvfrom(5, 0x55a64d5a20d0, 46080, 0, NULL, NULL) = -1
> > EAGAIN (Resource temporarily unavailable) <0.000016>
> >  > 0.206164 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 2304 <0.000116>
> >  > 0.000291 recvfrom(5, "********************************"..., 46080, 0,
> > NULL, NULL) = 1280 <0.000038>
> >  > 0.052224 +++ exited with 0 +++
> >
> > I.e. recvfrom() returns -EAGAIN and takes 200 ms.
>
> Ah, no, those 200 ms are not spent in recvfrom() but in poll:
>  > 0.000029 recvfrom(6, "d", 10, 0, NULL, NULL) = 1 <0.000009>
>  > 0.000040 epoll_wait(4, [{events=EPOLLIN, data={u32=5,
> u64=139942919405573}}], 1023, 60000) = 1 <0.204038>
>  > 0.204258 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b4049c) = 0 <0.000045>
>  > 0.000104 recvfrom(5, "********************************"..., 46080, 0,
> NULL, NULL) = 16640 <0.000078>
>  > 0.000264 recvfrom(5, 0x5603425f3550, 46080, 0, NULL, NULL) = -1
> EAGAIN (Resource temporarily unavailable) <0.000025>
>  > 0.000127 epoll_ctl(4, EPOLL_CTL_ADD, 5,
> {events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5,
> u64=94570884890629}}) = 0 <0.000031>
>  > 0.000112 epoll_wait(4, [{events=EPOLLOUT, data={u32=6,
> u64=139942919405574}}], 1023, 60000) = 1 <0.000026>
>  > 0.000083 epoll_ctl(4, EPOLL_CTL_DEL, 6, 0x7ffd49b404fc) = 0 <0.000025>
>  > 0.000063 sendto(6, "********************************"..., 12800, 0,
> NULL, 0) = 12800 <0.000028>
>  > 0.000226 epoll_wait(4, [], 1023, 0) = 0 <0.000007>
>  > 0.000029 epoll_wait(4, [{events=EPOLLIN, data={u32=5,
> u64=94570884890629}}], 1023, 60000) = 1 <0.205476>
>  > 0.205728 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000077>
>  > 0.000157 recvfrom(5, "********************************"..., 46080, 0,
> NULL, NULL) = 2304 <0.000066>
>  > 0.000180 recvfrom(5, 0x5603425e30c0, 46080, 0, NULL, NULL) = -1
> EAGAIN (Resource temporarily unavailable) <0.000026>
>  > 0.000139 epoll_ctl(4, EPOLL_CTL_ADD, 5,
> {events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5, u64=5}}) = 0
> <0.000030>
>  > 0.000104 epoll_wait(4, [{events=EPOLLIN, data={u32=5, u64=5}}], 1023,
> 60000) = 1 <0.205881>
>  > 0.206222 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000086>
>  > 0.000189 recvfrom(5, "********************************"..., 46080, 0,
> NULL, NULL) = 2304 <0.000027>
>  > 0.000153 recvfrom(5, 0x5603425ece20, 46080, 0, NULL, NULL) = -1
> EAGAIN (Resource temporarily unavailable) <0.000017>
>  > 0.000074 epoll_ctl(4, EPOLL_CTL_ADD, 5,
> {events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5, u64=5}}) = 0
> <0.000018>
>  > 0.000067 epoll_wait(4, [{events=EPOLLIN, data={u32=5, u64=5}}], 1023,
> 60000) = 1 <0.205749>
>  > 0.206088 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000197>
>  > 0.000391 recvfrom(5, "********************************"..., 46080, 0,
> NULL, NULL) = 2304 <0.000080>
>  > 0.000210 recvfrom(5, 0x5603425e30c0, 46080, 0, NULL, NULL) = -1
> EAGAIN (Resource temporarily unavailable) <0.000027>
>  > 0.000174 epoll_ctl(4, EPOLL_CTL_ADD, 5,
> {events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP, data={u32=5, u64=5}}) = 0
> <0.000031>
>  > 0.000127 epoll_wait(4, [{events=EPOLLIN, data={u32=5, u64=5}}], 1023,
> 60000) = 1 <0.205471>
>  > 0.205783 epoll_ctl(4, EPOLL_CTL_DEL, 5, 0x7ffd49b404fc) = 0 <0.000205>
>

It seems the test had some expectations.

Setting a small (1 byte) RCVBUF/SNDBUF, and yet expecting to send
46080 bytes fast enough was not reasonable.
It might have relied on the fact that tcp sendmsg() can cook large GSO
packets, even if sk->sk_sndbuf is small.

With tight memory settings, it is possible TCP has to resort to RTO
timers (200ms by default) to recover from dropped packets.

What happens if you double /proc/sys/net/ipv4/tcp_wmem  and/or
/proc/sys/net/ipv4/tcp_rmem first value ?
(4096 -> 8192)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03  8:17       ` Eric Dumazet
@ 2023-11-03  9:27         ` Michal Kubecek
  2023-11-03  9:53           ` Eric Dumazet
  2023-11-03 10:14         ` Jiri Slaby
  1 sibling, 1 reply; 23+ messages in thread
From: Michal Kubecek @ 2023-11-03  9:27 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jiri Slaby, David S . Miller, Jakub Kicinski, Paolo Abeni,
	netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Fri, Nov 03, 2023 at 09:17:27AM +0100, Eric Dumazet wrote:
> 
> It seems the test had some expectations.
> 
> Setting a small (1 byte) RCVBUF/SNDBUF, and yet expecting to send
> 46080 bytes fast enough was not reasonable.
> It might have relied on the fact that tcp sendmsg() can cook large GSO
> packets, even if sk->sk_sndbuf is small.
> 
> With tight memory settings, it is possible TCP has to resort on RTO
> timers (200ms by default) to recover from dropped packets.

There seems to be one drop but somehow the sender does not recover from
it, even if the retransmit and following packets are acked quickly:

09:15:29.424017 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [S], seq 104649613, win 33280, options [mss 65495,sackOK,TS val 1319295278 ecr 0,nop,wscale 7], length 0
09:15:29.424024 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [S.], seq 1343383818, ack 104649614, win 585, options [mss 65495,sackOK,TS val 1319295278 ecr 1319295278,nop,wscale 0], length 0
09:15:29.424031 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 1, win 260, options [nop,nop,TS val 1319295278 ecr 1319295278], length 0
09:15:29.424155 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [.], seq 1:16641, ack 1, win 585, options [nop,nop,TS val 1319295279 ecr 1319295278], length 16640
09:15:29.424160 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 16641, win 130, options [nop,nop,TS val 1319295279 ecr 1319295279], length 0
09:15:29.424179 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 16641:33281, ack 1, win 585, options [nop,nop,TS val 1319295279 ecr 1319295279], length 16640
09:15:29.424183 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 16641, win 0, options [nop,nop,TS val 1319295279 ecr 1319295279], length 0
09:15:29.424280 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [P.], seq 1:12, ack 16641, win 16640, options [nop,nop,TS val 1319295279 ecr 1319295279], length 11
09:15:29.424284 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [.], ack 12, win 574, options [nop,nop,TS val 1319295279 ecr 1319295279], length 0
09:15:29.630272 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 16641:33281, ack 12, win 574, options [nop,nop,TS val 1319295485 ecr 1319295279], length 16640
09:15:29.630334 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 33281, win 2304, options [nop,nop,TS val 1319295485 ecr 1319295485], length 0
09:15:29.836938 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 33281:35585, ack 12, win 574, options [nop,nop,TS val 1319295691 ecr 1319295485], length 2304
09:15:29.836984 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 35585, win 2304, options [nop,nop,TS val 1319295691 ecr 1319295691], length 0
09:15:30.043606 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 35585:37889, ack 12, win 574, options [nop,nop,TS val 1319295898 ecr 1319295691], length 2304
09:15:30.043653 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 37889, win 2304, options [nop,nop,TS val 1319295898 ecr 1319295898], length 0
09:15:30.250270 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 37889:40193, ack 12, win 574, options [nop,nop,TS val 1319296105 ecr 1319295898], length 2304
09:15:30.250316 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 40193, win 2304, options [nop,nop,TS val 1319296105 ecr 1319296105], length 0
09:15:30.456932 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 40193:42497, ack 12, win 574, options [nop,nop,TS val 1319296311 ecr 1319296105], length 2304
09:15:30.456975 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 42497, win 2304, options [nop,nop,TS val 1319296311 ecr 1319296311], length 0
09:15:30.663598 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 42497:44801, ack 12, win 574, options [nop,nop,TS val 1319296518 ecr 1319296311], length 2304
09:15:30.663638 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 44801, win 2304, options [nop,nop,TS val 1319296518 ecr 1319296518], length 0
09:15:30.663646 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [FP.], seq 44801:46081, ack 12, win 574, options [nop,nop,TS val 1319296518 ecr 1319296518], length 1280
09:15:30.663712 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [F.], seq 12, ack 46082, win 2304, options [nop,nop,TS val 1319296518 ecr 1319296518], length 0
09:15:30.663724 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [.], ack 13, win 573, options [nop,nop,TS val 1319296518 ecr 1319296518], length 0

(window size values are scaled here). Part of the problem is that the
receiver side sets SO_RCVBUF after connect() so that the window shrinks
after sender already sent more data; when I move the bufsized() calls
in the python script before listen() and connect(), the test runs
quickly.

Michal

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03  9:27         ` Michal Kubecek
@ 2023-11-03  9:53           ` Eric Dumazet
  0 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2023-11-03  9:53 UTC (permalink / raw)
  To: Michal Kubecek
  Cc: Jiri Slaby, David S . Miller, Jakub Kicinski, Paolo Abeni,
	netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Fri, Nov 3, 2023 at 10:27 AM Michal Kubecek <mkubecek@suse.cz> wrote:
>
> On Fri, Nov 03, 2023 at 09:17:27AM +0100, Eric Dumazet wrote:
> >
> > It seems the test had some expectations.
> >
> > Setting a small (1 byte) RCVBUF/SNDBUF, and yet expecting to send
> > 46080 bytes fast enough was not reasonable.
> > It might have relied on the fact that tcp sendmsg() can cook large GSO
> > packets, even if sk->sk_sndbuf is small.
> >
> > With tight memory settings, it is possible TCP has to resort on RTO
> > timers (200ms by default) to recover from dropped packets.
>
> There seems to be one drop but somehow the sender does not recover from
> it, even if the retransmit and following packets are acked quickly:
>
> 09:15:29.424017 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [S], seq 104649613, win 33280, options [mss 65495,sackOK,TS val 1319295278 ecr 0,nop,wscale 7], length 0
> 09:15:29.424024 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [S.], seq 1343383818, ack 104649614, win 585, options [mss 65495,sackOK,TS val 1319295278 ecr 1319295278,nop,wscale 0], length 0
> 09:15:29.424031 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 1, win 260, options [nop,nop,TS val 1319295278 ecr 1319295278], length 0
> 09:15:29.424155 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [.], seq 1:16641, ack 1, win 585, options [nop,nop,TS val 1319295279 ecr 1319295278], length 16640
> 09:15:29.424160 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 16641, win 130, options [nop,nop,TS val 1319295279 ecr 1319295279], length 0
> 09:15:29.424179 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 16641:33281, ack 1, win 585, options [nop,nop,TS val 1319295279 ecr 1319295279], length 16640
> 09:15:29.424183 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 16641, win 0, options [nop,nop,TS val 1319295279 ecr 1319295279], length 0
> 09:15:29.424280 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [P.], seq 1:12, ack 16641, win 16640, options [nop,nop,TS val 1319295279 ecr 1319295279], length 11
> 09:15:29.424284 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [.], ack 12, win 574, options [nop,nop,TS val 1319295279 ecr 1319295279], length 0
> 09:15:29.630272 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 16641:33281, ack 12, win 574, options [nop,nop,TS val 1319295485 ecr 1319295279], length 16640
> 09:15:29.630334 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 33281, win 2304, options [nop,nop,TS val 1319295485 ecr 1319295485], length 0



> 09:15:29.836938 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 33281:35585, ack 12, win 574, options [nop,nop,TS val 1319295691 ecr 1319295485], length 2304
> 09:15:29.836984 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 35585, win 2304, options [nop,nop,TS val 1319295691 ecr 1319295691], length 0
> 09:15:30.043606 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 35585:37889, ack 12, win 574, options [nop,nop,TS val 1319295898 ecr 1319295691], length 2304
> 09:15:30.043653 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 37889, win 2304, options [nop,nop,TS val 1319295898 ecr 1319295898], length 0
> 09:15:30.250270 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 37889:40193, ack 12, win 574, options [nop,nop,TS val 1319296105 ecr 1319295898], length 2304
> 09:15:30.250316 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 40193, win 2304, options [nop,nop,TS val 1319296105 ecr 1319296105], length 0
> 09:15:30.456932 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 40193:42497, ack 12, win 574, options [nop,nop,TS val 1319296311 ecr 1319296105], length 2304
> 09:15:30.456975 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 42497, win 2304, options [nop,nop,TS val 1319296311 ecr 1319296311], length 0
> 09:15:30.663598 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [P.], seq 42497:44801, ack 12, win 574, options [nop,nop,TS val 1319296518 ecr 1319296311], length 2304
> 09:15:30.663638 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [.], ack 44801, win 2304, options [nop,nop,TS val 1319296518 ecr 1319296518], length 0
> 09:15:30.663646 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [FP.], seq 44801:46081, ack 12, win 574, options [nop,nop,TS val 1319296518 ecr 1319296518], length 1280
> 09:15:30.663712 IP 127.0.0.1.40386 > 127.0.0.1.42483: Flags [F.], seq 12, ack 46082, win 2304, options [nop,nop,TS val 1319296518 ecr 1319296518], length 0
> 09:15:30.663724 IP 127.0.0.1.42483 > 127.0.0.1.40386: Flags [.], ack 13, win 573, options [nop,nop,TS val 1319296518 ecr 1319296518], length 0
>
> (window size values are scaled here). Part of the problem is that the
> receiver side sets SO_RCVBUF after connect() so that the window shrinks
> after sender already sent more data; when I move the bufsized() calls
> in the python script before listen() and connect(), the test runs
> quickly.

This makes sense.

Old kernels would have instead dropped a packet, without changing test status:

09:49:49.390066 IP localhost.39710 > localhost.44173: Flags [S], seq
1464131415, win 65495, options [mss 65495,sackOK,TS val 578664891 ecr
0,nop,wscale 7], length 0
09:49:49.390078 IP localhost.44173 > localhost.39710: Flags [S.], seq
2322612108, ack 1464131416, win 1152, options [mss 65495,sackOK,TS val
578664891 ecr 578664891,nop,wscale 0], length 0
09:49:49.390088 IP localhost.39710 > localhost.44173: Flags [.], ack
1, win 512, options [nop,nop,TS val 578664891 ecr 578664891], length 0
09:49:49.390319 IP localhost.44173 > localhost.39710: Flags [.], seq
1:32769, ack 1, win 1152, options [nop,nop,TS val 578664892 ecr
578664891], length 32768
09:49:49.390325 IP localhost.39710 > localhost.44173: Flags [.], ack
32769, win 256, options [nop,nop,TS val 578664892 ecr 578664892],
length 0
09:49:49.390355 IP localhost.44173 > localhost.39710: Flags [P.], seq
32769:46081, ack 1, win 1152, options [nop,nop,TS val 578664892 ecr
578664892], length 13312
<prior packet has been dropped by receiver>
09:49:49.390479 IP localhost.39710 > localhost.44173: Flags [P.], seq
1:12, ack 32769, win 256, options [nop,nop,TS val 578664892 ecr
578664892], length 11
09:49:49.390483 IP localhost.44173 > localhost.39710: Flags [.], ack
12, win 1141, options [nop,nop,TS val 578664892 ecr 578664892], length
0
09:49:49.390547 IP localhost.44173 > localhost.39710: Flags [F.], seq
46081, ack 12, win 1141, options [nop,nop,TS val 578664892 ecr
578664892], length 0
09:49:49.390552 IP localhost.39710 > localhost.44173: Flags [.], ack
32769, win 256, options [nop,nop,TS val 578664892 ecr
578664892,nop,nop,sack 1 {46081:46082}], length 0

<packet retransmit>
09:49:49.390562 IP localhost.44173 > localhost.39710: Flags [P.], seq
32769:46081, ack 12, win 1141, options [nop,nop,TS val 578664892 ecr
578664892], length 13312
09:49:49.390567 IP localhost.39710 > localhost.44173: Flags [.], ack
46082, win 152, options [nop,nop,TS val 578664892 ecr 578664892],
length 0
09:49:49.390677 IP localhost.39710 > localhost.44173: Flags [F.], seq
12, ack 46082, win 152, options [nop,nop,TS val 578664892 ecr
578664892], length 0
09:49:49.390685 IP localhost.44173 > localhost.39710: Flags [.], ack
13, win 1141, options [nop,nop,TS val 578664892 ecr 578664892], length
0


Retracting TCP windows has always been problematic.

If we really want to be very gentle, this could add more logic,
shorter timer events for pathological cases like that,
I am not sure this is really worth it, especially if dealing with one
million TCP sockets in this state.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03  8:17       ` Eric Dumazet
  2023-11-03  9:27         ` Michal Kubecek
@ 2023-11-03 10:14         ` Jiri Slaby
  2023-11-03 10:27           ` Eric Dumazet
  1 sibling, 1 reply; 23+ messages in thread
From: Jiri Slaby @ 2023-11-03 10:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On 03. 11. 23, 9:17, Eric Dumazet wrote:
> What happens if you double /proc/sys/net/ipv4/tcp_wmem  and/or
> /proc/sys/net/ipv4/tcp_rmem first value ?
> (4096 -> 8192)

No change:
# cat /proc/sys/net/ipv4/tcp_wmem
8192    16384   4194304


But this:
# cat /proc/sys/net/ipv4/tcp_rmem
8192    16384   4194304

results in the test running for 4.7 s, so even 3 times slower still!

thanks,
-- 
js
suse labs


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03 10:14         ` Jiri Slaby
@ 2023-11-03 10:27           ` Eric Dumazet
  2023-11-03 11:07             ` Jiri Slaby
  0 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2023-11-03 10:27 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Fri, Nov 3, 2023 at 11:14 AM Jiri Slaby <jirislaby@kernel.org> wrote:
>
> On 03. 11. 23, 9:17, Eric Dumazet wrote:
> > What happens if you double /proc/sys/net/ipv4/tcp_wmem  and/or
> > /proc/sys/net/ipv4/tcp_rmem first value ?
> > (4096 -> 8192)
>
> No change:
> # cat /proc/sys/net/ipv4/tcp_wmem
> 8192    16384   4194304
>
>
> But this:
> # cat /proc/sys/net/ipv4/tcp_rmem
> 8192    16384   4194304
>
> results in test to run 4.7 s, so even 3 more times slower!


You might try setting tcp_shrink_window sysctl to one, with some side
effects for normal TCP flows.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-11-03 10:27           ` Eric Dumazet
@ 2023-11-03 11:07             ` Jiri Slaby
  0 siblings, 0 replies; 23+ messages in thread
From: Jiri Slaby @ 2023-11-03 11:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On 03. 11. 23, 11:27, Eric Dumazet wrote:
> On Fri, Nov 3, 2023 at 11:14 AM Jiri Slaby <jirislaby@kernel.org> wrote:
>>
>> On 03. 11. 23, 9:17, Eric Dumazet wrote:
>>> What happens if you double /proc/sys/net/ipv4/tcp_wmem  and/or
>>> /proc/sys/net/ipv4/tcp_rmem first value ?
>>> (4096 -> 8192)
>>
>> No change:
>> # cat /proc/sys/net/ipv4/tcp_wmem
>> 8192    16384   4194304
>>
>>
>> But this:
>> # cat /proc/sys/net/ipv4/tcp_rmem
>> 8192    16384   4194304
>>
>> results in the test running for 4.7 s, so even 3 times slower!
> 
> 
> You might try setting tcp_shrink_window sysctl to one, with some side
> effects for normal TCP flows.

That results in 3 seconds.

Anyway, it looks like they need to fix the test, right? Created:
https://github.com/eventlet/eventlet/issues/821

thanks,
-- 
js
suse labs


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2023-07-17 15:29 [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale Eric Dumazet
                   ` (3 preceding siblings ...)
  2023-11-03  6:10 ` Jiri Slaby
@ 2024-04-02 15:28 ` shironeko
  2024-04-02 15:50   ` Eric Dumazet
  4 siblings, 1 reply; 23+ messages in thread
From: shironeko @ 2024-04-02 15:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

Hi all,

These parts seem to be causing a regression for a specific USB NIC. I 
have this on one of my home servers, and its network will randomly cut out 
a few times a week.
It seems others have run into the same issue with this particular NIC as 
well: https://bugzilla.kernel.org/show_bug.cgi?id=218536

> +/* inverse of tcp_win_from_space() */
> +static inline int tcp_space_from_win(const struct sock *sk, int win)
> +{
> +	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
> +
> +	do_div(val, tcp_sk(sk)->scaling_ratio);
> +	return val;
> +}
> +
> +static inline void tcp_scaling_ratio_init(struct sock *sk)
> +{
> +	/* Assume a conservative default of 1200 bytes of payload per 4K 
> page.
> +	 * This may be adjusted later in tcp_measure_rcv_mss().
> +	 */
> +	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
> +				    SKB_TRUESIZE(4096);
>  }
...
> @@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
>  		do_div(grow, tp->rcvq_space.space);
>  		rcvwin += (grow << 1);
> 
> -		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
> -		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
> -			rcvmem += 128;
> -
> -		do_div(rcvwin, tp->advmss);
> -		rcvbuf = min_t(u64, rcvwin * rcvmem,
> +		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
>  			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
>  		if (rcvbuf > sk->sk_rcvbuf) {
>  			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

The NIC:
usb 2-2: new SuperSpeed USB device number 4 using xhci_hcd
usb 2-2: New USB device found, idVendor=0b95, idProduct=1790, bcdDevice= 
1.00
usb 2-2: New USB device strings: Mfr=1, Product=2, SerialNumber=3
usb 2-2: Product: AX88179
usb 2-2: Manufacturer: ASIX Elec. Corp.
usb 2-2: SerialNumber: 0000000000009D
ax88179_178a 2-2:1.0 eth0: register 'ax88179_178a' at 
usb-0000:00:14.0-2, ASIX AX88179 USB 3.0 Gigabit Ethernet, 
02:5e:c0:4b:a4:f7
ax88179_178a 2-2:1.0 eth0: ax88179 - Link status is: 1

The dmesg error I get:
divide error: 0000 [#1] PREEMPT SMP PTI
CPU: 6 PID: 2737 Comm: bitmagnet Tainted: P           OE      
6.8.0-76060800daily20240311-generic 
#202403110203~1711393930~22.04~331756a
Hardware name: Dell Inc. XPS 15 9560/05FFDN, BIOS 1.31.0 11/10/2022
RIP: 0010:tcp_rcv_space_adjust+0xbe/0x170
Code: f8 41 89 d0 29 d0 31 d2 49 0f af c2 49 f7 f0 45 8b 81 f0 02 00 00 
44 0f b6 8b ae 05 00 00 31 d2 49 8d 04 42 48 98 48 c1 e0 08 <49> f7 f1 
49 63 d0 48 98 48 39 d0 48 0f 47 c2 39 83 18 01 00 00 7c
RSP: 0018:ffffba4b07657ba0 EFLAGS: 00010206
RAX: 0000000001217e00 RBX: ffff948b6e0565c0 RCX: 000000002f2775fe
RDX: 0000000000000000 RSI: 000000007ee9298e RDI: 000000000000414f
RBP: ffffba4b07657ba8 R08: 0000000000600000 R09: 0000000000000000
R10: 000000000000dd1e R11: 0000000000000000 R12: ffff948b6e0565c0
R13: ffff948b6e056698 R14: 0000000000000000 R15: ffff948b6e056b70
FS:  00007a12d929db38(0000) GS:ffff9491de500000(0000) 
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000078cc12cfd000 CR3: 0000000157d54005 CR4: 00000000003706f0
Call Trace:
  <TASK>
  ? show_regs+0x6d/0x80
  ? die+0x37/0xa0
  ? do_trap+0xd4/0xf0
  ? do_error_trap+0x71/0xb0
  ? tcp_rcv_space_adjust+0xbe/0x170
  ? exc_divide_error+0x3a/0x70
  ? tcp_rcv_space_adjust+0xbe/0x170
  ? asm_exc_divide_error+0x1b/0x20
  ? tcp_rcv_space_adjust+0xbe/0x170
  tcp_recvmsg_locked+0x2d4/0x9c0
  tcp_recvmsg+0x84/0x200
  inet_recvmsg+0x54/0x140
  ? security_socket_recvmsg+0x44/0x80
  sock_recvmsg+0xc6/0xf0
  sock_read_iter+0x8f/0x100
  vfs_read+0x347/0x390
  ksys_read+0xc9/0x100
  __x64_sys_read+0x19/0x30
  do_syscall_64+0x76/0x140
  ? do_syscall_64+0x85/0x140
  ? syscall_exit_to_user_mode+0x8e/0x1e0
  ? do_syscall_64+0x85/0x140
  ? do_syscall_64+0x85/0x140
  ? do_syscall_64+0x85/0x140
  ? do_syscall_64+0x85/0x140
  entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x4087ee
Code: 48 83 ec 38 e8 13 00 00 00 48 83 c4 38 5d c3 cc cc cc cc cc cc cc 
cc cc cc cc cc cc 49 89 f2 48 89 fa 48 89 ce 48 89 df 0f 05 <48> 3d 01 
f0 ff ff 76 15 48 f7 d8 48 89 c1 48 c7 c0 ff ff ff ff 48
RSP: 002b:000000c0023a2560 EFLAGS: 00000212 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000000086 RCX: 00000000004087ee
RDX: 00000000000007f4 RSI: 000000c002b9d03c RDI: 0000000000000086
RBP: 000000c0023a25a0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000212 R12: 00007a12d929d8e8
R13: 0000000000000001 R14: 000000c0021a2380 R15: 0000000000000007
  </TASK>
Modules linked in: tls xt_mark xt_nat veth nft_chain_nat xt_MASQUERADE 
nf_nat nf_conntrack_netlink xfrm_user xfrm_algo br_netfilter bridge stp 
llc rfcomm snd_seq_dummy snd_hrtimer cmac algif_hash overlay 
algif_skcipher af_alg bnep zstd zram nvidia_uvm(POE) ip6t_REJECT 
nf_reject_ipv6 xt_hl intel_uncore_frequency 
intel_uncore_frequency_common ip6t_rt intel_tcc_cooling ipt_REJECT 
nf_reject_ipv4 snd_hda_codec_hdmi xt_LOG nf_log_syslog xt_comment 
xt_multiport snd_ctl_led nft_limit snd_soc_avs snd_soc_hda_codec 
snd_hda_ext_core snd_soc_core snd_hda_codec_realtek 
snd_hda_codec_generic snd_compress ac97_bus snd_pcm_dmaengine 
x86_pkg_temp_thermal intel_powerclamp xt_limit xt_addrtype xt_tcpudp 
joydev snd_hda_intel nvidia_drm(POE) coretemp nvidia_modeset(POE) 
snd_intel_dspcfg snd_intel_sdw_acpi xt_conntrack snd_hda_codec 
nf_conntrack snd_hda_core nf_defrag_ipv6 nf_defrag_ipv4 nft_compat 
snd_hwdep dell_laptop snd_pcm nf_tables snd_seq_midi snd_seq_midi_event 
nfnetlink binfmt_misc snd_rawmidi ath10k_pci snd_seq
  ath10k_core kvm_intel uvcvideo ath snd_seq_device nls_iso8859_1 
dm_crypt mei_wdt mei_pxp mei_hdcp intel_rapl_msr nvidia(POE) dell_wmi 
dell_smm_hwmon bfq kvm input_leds snd_timer videobuf2_vmalloc irqbypass 
dell_smbios mac80211 uvc videobuf2_memops videobuf2_v4l2 dcdbas btusb 
iTCO_wdt rapl snd btrtl ledtrig_audio videodev hid_multitouch 
intel_pmc_bxt btintel ee1004 iTCO_vendor_support intel_cstate btbcm 
intel_wmi_thunderbolt soundcore serio_raw 
processor_thermal_device_pci_legacy wmi_bmof dell_wmi_descriptor btmtk 
mxm_wmi cfg80211 videobuf2_common bluetooth processor_thermal_device 
processor_thermal_wt_hint processor_thermal_rfim mc libarc4 
processor_thermal_rapl ecdh_generic ecc mei_me intel_rapl_common mei 
processor_thermal_wt_req processor_thermal_power_floor 
processor_thermal_mbox intel_pch_thermal intel_soc_dts_iosf 
intel_pmc_core intel_vsec pmt_telemetry dell_smo8800 intel_hid 
int3400_thermal int3403_thermal pmt_class int340x_thermal_zone 
acpi_thermal_rel acpi_pad mac_hid sparse_keymap tcp_bbr
  sch_cake kyber_iosched msr parport_pc ppdev lp parport efi_pstore 
ip_tables x_tables autofs4 raid10 raid456 async_raid6_recov async_memcpy 
async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 
system76_io(OE) system76_acpi(OE) i915 ax88179_178a crct10dif_pclmul 
crc32_pclmul usbnet drm_buddy polyval_clmulni mii polyval_generic 
i2c_algo_bit ghash_clmulni_intel hid_generic rtsx_pci_sdmmc nvme ttm 
sha256_ssse3 sha1_ssse3 psmouse drm_display_helper i2c_i801 i2c_smbus 
nvme_core intel_lpss_pci cec ahci rtsx_pci intel_lpss nvme_auth xhci_pci 
libahci idma64 i2c_hid_acpi xhci_pci_renesas rc_core i2c_hid hid video 
wmi aesni_intel crypto_simd cryptd
---[ end trace 0000000000000000 ]---
RIP: 0010:tcp_rcv_space_adjust+0xbe/0x170
Code: f8 41 89 d0 29 d0 31 d2 49 0f af c2 49 f7 f0 45 8b 81 f0 02 00 00 
44 0f b6 8b ae 05 00 00 31 d2 49 8d 04 42 48 98 48 c1 e0 08 <49> f7 f1 
49 63 d0 48 98 48 39 d0 48 0f 47 c2 39 83 18 01 00 00 7c
RSP: 0018:ffffba4b07657ba0 EFLAGS: 00010206
RAX: 0000000001217e00 RBX: ffff948b6e0565c0 RCX: 000000002f2775fe
RDX: 0000000000000000 RSI: 000000007ee9298e RDI: 000000000000414f
RBP: ffffba4b07657ba8 R08: 0000000000600000 R09: 0000000000000000
R10: 000000000000dd1e R11: 0000000000000000 R12: ffff948b6e0565c0
R13: ffff948b6e056698 R14: 0000000000000000 R15: ffff948b6e056b70
FS:  00007a12d929db38(0000) GS:ffff9491de500000(0000) 
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000078cc12cfd000 CR3: 0000000157d54005 CR4: 00000000003706f0

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2024-04-02 15:28 ` shironeko
@ 2024-04-02 15:50   ` Eric Dumazet
  2024-04-02 16:23     ` Eric Dumazet
  0 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2024-04-02 15:50 UTC (permalink / raw)
  To: shironeko, Jose Alonso
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Tue, Apr 2, 2024 at 5:28 PM <shironeko@tesaguri.club> wrote:
>
> Hi all,
>
> These parts seem to be causing a regression for a specific USB NIC. I
> have this on one of my home servers, and its network will randomly cut out
> a few times a week.
> It seems others have run into the same issue with this particular NIC as
> well: https://bugzilla.kernel.org/show_bug.cgi?id=218536
>
> > +/* inverse of tcp_win_from_space() */
> > +static inline int tcp_space_from_win(const struct sock *sk, int win)
> > +{
> > +     u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
> > +
> > +     do_div(val, tcp_sk(sk)->scaling_ratio);
> > +     return val;
> > +}
> > +
> > +static inline void tcp_scaling_ratio_init(struct sock *sk)
> > +{
> > +     /* Assume a conservative default of 1200 bytes of payload per 4K
> > page.
> > +      * This may be adjusted later in tcp_measure_rcv_mss().
> > +      */
> > +     tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
> > +                                 SKB_TRUESIZE(4096);
> >  }
> ...
> > @@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
> >               do_div(grow, tp->rcvq_space.space);
> >               rcvwin += (grow << 1);
> >
> > -             rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
> > -             while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
> > -                     rcvmem += 128;
> > -
> > -             do_div(rcvwin, tp->advmss);
> > -             rcvbuf = min_t(u64, rcvwin * rcvmem,
> > +             rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
> >                              READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
> >               if (rcvbuf > sk->sk_rcvbuf) {
> >                       WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
>
> The NIC:
> usb 2-2: new SuperSpeed USB device number 4 using xhci_hcd
> usb 2-2: New USB device found, idVendor=0b95, idProduct=1790, bcdDevice=
> 1.00
> usb 2-2: New USB device strings: Mfr=1, Product=2, SerialNumber=3
> usb 2-2: Product: AX88179
> usb 2-2: Manufacturer: ASIX Elec. Corp.
> usb 2-2: SerialNumber: 0000000000009D
> ax88179_178a 2-2:1.0 eth0: register 'ax88179_178a' at
> usb-0000:00:14.0-2, ASIX AX88179 USB 3.0 Gigabit Ethernet,
> 02:5e:c0:4b:a4:f7
> ax88179_178a 2-2:1.0 eth0: ax88179 - Link status is: 1
>
> The dmesg error I get:
>

Thanks for the report. This driver is probably lying about skb->truesize.

This commit looks suspicious

commit f8ebb3ac881b17712e1d5967c97ab1806b16d3d6
Author: Jose Alonso <joalonsof@gmail.com>
Date:   Tue Jun 28 12:13:02 2022 -0300

    net: usb: ax88179_178a: Fix packet receiving

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2024-04-02 15:50   ` Eric Dumazet
@ 2024-04-02 16:23     ` Eric Dumazet
  2024-04-02 16:29       ` shironeko
  0 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2024-04-02 16:23 UTC (permalink / raw)
  To: shironeko, Jose Alonso
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Tue, Apr 2, 2024 at 5:50 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Apr 2, 2024 at 5:28 PM <shironeko@tesaguri.club> wrote:
> >
> > Hi all,
> >
> > These parts seem to be causing a regression for a specific USB NIC. I
> > have this on one of my home servers, and its network will randomly cut out
> > a few times a week.
> > It seems others have run into the same issue with this particular NIC as
> > well: https://bugzilla.kernel.org/show_bug.cgi?id=218536
> >
> > > +/* inverse of tcp_win_from_space() */
> > > +static inline int tcp_space_from_win(const struct sock *sk, int win)
> > > +{
> > > +     u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
> > > +
> > > +     do_div(val, tcp_sk(sk)->scaling_ratio);
> > > +     return val;
> > > +}
> > > +
> > > +static inline void tcp_scaling_ratio_init(struct sock *sk)
> > > +{
> > > +     /* Assume a conservative default of 1200 bytes of payload per 4K
> > > page.
> > > +      * This may be adjusted later in tcp_measure_rcv_mss().
> > > +      */
> > > +     tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
> > > +                                 SKB_TRUESIZE(4096);
> > >  }
> > ...
> > > @@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
> > >               do_div(grow, tp->rcvq_space.space);
> > >               rcvwin += (grow << 1);
> > >
> > > -             rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
> > > -             while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
> > > -                     rcvmem += 128;
> > > -
> > > -             do_div(rcvwin, tp->advmss);
> > > -             rcvbuf = min_t(u64, rcvwin * rcvmem,
> > > +             rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
> > >                              READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
> > >               if (rcvbuf > sk->sk_rcvbuf) {
> > >                       WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
> >
> > The NIC:
> > usb 2-2: new SuperSpeed USB device number 4 using xhci_hcd
> > usb 2-2: New USB device found, idVendor=0b95, idProduct=1790, bcdDevice=
> > 1.00
> > usb 2-2: New USB device strings: Mfr=1, Product=2, SerialNumber=3
> > usb 2-2: Product: AX88179
> > usb 2-2: Manufacturer: ASIX Elec. Corp.
> > usb 2-2: SerialNumber: 0000000000009D
> > ax88179_178a 2-2:1.0 eth0: register 'ax88179_178a' at
> > usb-0000:00:14.0-2, ASIX AX88179 USB 3.0 Gigabit Ethernet,
> > 02:5e:c0:4b:a4:f7
> > ax88179_178a 2-2:1.0 eth0: ax88179 - Link status is: 1
> >
> > The dmesg error I get:
> >
>
> Thanks for the report. This driver is probably lying about skb->truesize.
>
> This commit looks suspicious
>
> commit f8ebb3ac881b17712e1d5967c97ab1806b16d3d6
> Author: Jose Alonso <joalonsof@gmail.com>
> Date:   Tue Jun 28 12:13:02 2022 -0300
>
>     net: usb: ax88179_178a: Fix packet receiving

Could you try this patch ?

diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index 88e084534853dd50505fd730e7ccd07c70f2d8ee..ca33365e49cc3993a974ddbdbf68189ce4df2e82
100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -1452,21 +1452,16 @@ static int ax88179_rx_fixup(struct usbnet
*dev, struct sk_buff *skb)
                        /* Skip IP alignment pseudo header */
                        skb_pull(skb, 2);

-                       skb->truesize = SKB_TRUESIZE(pkt_len_plus_padd);
                        ax88179_rx_checksum(skb, pkt_hdr);
                        return 1;
                }

-               ax_skb = skb_clone(skb, GFP_ATOMIC);
+               ax_skb = netdev_alloc_skb_ip_align(dev->net, pkt_len);
                if (!ax_skb)
                        return 0;
-               skb_trim(ax_skb, pkt_len);
+               skb_put(ax_skb, pkt_len);
+               memcpy(ax_skb->data, skb->data + 2, pkt_len);

-               /* Skip IP alignment pseudo header */
-               skb_pull(ax_skb, 2);
-
-               skb->truesize = pkt_len_plus_padd +
-                               SKB_DATA_ALIGN(sizeof(struct sk_buff));
                ax88179_rx_checksum(ax_skb, pkt_hdr);
                usbnet_skb_return(dev, ax_skb);

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2024-04-02 16:23     ` Eric Dumazet
@ 2024-04-02 16:29       ` shironeko
  2024-04-06  0:22         ` shironeko
  0 siblings, 1 reply; 23+ messages in thread
From: shironeko @ 2024-04-02 16:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jose Alonso, David S . Miller, Jakub Kicinski, Paolo Abeni,
	netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On 2024-04-02 12:23, Eric Dumazet wrote:
> 
> Could you try this patch ?
> 
> diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
> index 88e084534853dd50505fd730e7ccd07c70f2d8ee..ca33365e49cc3993a974ddbdbf68189ce4df2e82
> 100644
> --- a/drivers/net/usb/ax88179_178a.c
> +++ b/drivers/net/usb/ax88179_178a.c
> @@ -1452,21 +1452,16 @@ static int ax88179_rx_fixup(struct usbnet
> *dev, struct sk_buff *skb)
>                         /* Skip IP alignment pseudo header */
>                         skb_pull(skb, 2);
> 
> -                       skb->truesize = SKB_TRUESIZE(pkt_len_plus_padd);
>                         ax88179_rx_checksum(skb, pkt_hdr);
>                         return 1;
>                 }
> 
> -               ax_skb = skb_clone(skb, GFP_ATOMIC);
> +               ax_skb = netdev_alloc_skb_ip_align(dev->net, pkt_len);
>                 if (!ax_skb)
>                         return 0;
> -               skb_trim(ax_skb, pkt_len);
> +               skb_put(ax_skb, pkt_len);
> +               memcpy(ax_skb->data, skb->data + 2, pkt_len);
> 
> -               /* Skip IP alignment pseudo header */
> -               skb_pull(ax_skb, 2);
> -
> -               skb->truesize = pkt_len_plus_padd +
> -                               SKB_DATA_ALIGN(sizeof(struct sk_buff));
>                 ax88179_rx_checksum(ax_skb, pkt_hdr);
>                 usbnet_skb_return(dev, ax_skb);

will report back next week :)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2024-04-02 16:29       ` shironeko
@ 2024-04-06  0:22         ` shironeko
  2024-04-06  6:11           ` Eric Dumazet
  0 siblings, 1 reply; 23+ messages in thread
From: shironeko @ 2024-04-06  0:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jose Alonso, David S . Miller, Jakub Kicinski, Paolo Abeni,
	netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

The patch seems to be working, no more dmesg errors or network cut-outs. Thank you!

There is however this line printed yesterday afternoon, so it seems there's still some weirdness.
> TCP: eth0: Driver has suspect GRO implementation, TCP performance may be compromised.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2024-04-06  0:22         ` shironeko
@ 2024-04-06  6:11           ` Eric Dumazet
  2024-04-06  7:07             ` Eric Dumazet
  0 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2024-04-06  6:11 UTC (permalink / raw)
  To: shironeko
  Cc: Jose Alonso, David S . Miller, Jakub Kicinski, Paolo Abeni,
	netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Sat, Apr 6, 2024 at 2:22 AM <shironeko@tesaguri.club> wrote:
>
> The patch seems to be working, no more dmesg errors or network cut-outs. Thank you!
>
> There is however this line printed yesterday afternoon, so it seems there's still some weirdness.
> > TCP: eth0: Driver has suspect GRO implementation, TCP performance may be compromised.

This is great, could you add the following to get some details from the skb ?

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1b6cd384001202df5f8e8e8c73adff0db89ece63..e30895a9cc8627cf423c3c5a783d525db0b2db8e
100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -209,9 +209,11 @@ static __cold void tcp_gro_dev_warn(const struct
sock *sk, const struct sk_buff

        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
-       if (!dev || len >= READ_ONCE(dev->mtu))
+       if (!dev || len >= READ_ONCE(dev->mtu)) {
+               skb_dump(KERN_ERR, skb, false);
                pr_warn("%s: Driver has suspect GRO implementation,
TCP performance may be compromised.\n",
                        dev ? dev->name : "Unknown driver");
+       }
        rcu_read_unlock();
 }

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale
  2024-04-06  6:11           ` Eric Dumazet
@ 2024-04-06  7:07             ` Eric Dumazet
  0 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2024-04-06  7:07 UTC (permalink / raw)
  To: shironeko
  Cc: Jose Alonso, David S . Miller, Jakub Kicinski, Paolo Abeni,
	netdev, Soheil Hassas Yeganeh, Neal Cardwell, Yuchung Cheng,
	eric.dumazet

On Sat, Apr 6, 2024 at 8:11 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sat, Apr 6, 2024 at 2:22 AM <shironeko@tesaguri.club> wrote:
> >
> > The patch seems to be working, no more dmesg errors or network cut-outs. Thank you!
> >
> > There is however this line printed yesterday afternoon, so it seems there's still some weirdness.
> > > TCP: eth0: Driver has suspect GRO implementation, TCP performance may be compromised.
>
> This is great, could you add the following to get some details from the skb ?
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 1b6cd384001202df5f8e8e8c73adff0db89ece63..e30895a9cc8627cf423c3c5a783d525db0b2db8e
> 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -209,9 +209,11 @@ static __cold void tcp_gro_dev_warn(const struct
> sock *sk, const struct sk_buff
>
>         rcu_read_lock();
>         dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
> -       if (!dev || len >= READ_ONCE(dev->mtu))
> +       if (!dev || len >= READ_ONCE(dev->mtu)) {
> +               skb_dump(KERN_ERR, skb, false);
>                 pr_warn("%s: Driver has suspect GRO implementation,
> TCP performance may be compromised.\n",
>                         dev ? dev->name : "Unknown driver");
> +       }
>         rcu_read_unlock();
>  }

Thinking more about this, I  think this (old) issue comes from the
large skb->head allocated by usbnet,
allowing tcp_add_backlog/skb_try_coalesce() to aggregate multiple
non-gro packets into a single skb.

We could fix this generically from usbnet_skb_return(), instead of
'fixing' all usbnet drivers.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2024-04-06  7:07 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-17 15:29 [PATCH net-next] tcp: get rid of sysctl_tcp_adv_win_scale Eric Dumazet
2023-07-17 16:52 ` Soheil Hassas Yeganeh
2023-07-17 17:17   ` Eric Dumazet
2023-07-17 17:20     ` Soheil Hassas Yeganeh
2023-07-19  2:10 ` patchwork-bot+netdevbpf
2023-07-20 15:41 ` Paolo Abeni
2023-07-20 15:50   ` Eric Dumazet
2023-11-03  6:10 ` Jiri Slaby
2023-11-03  6:56   ` Jiri Slaby
2023-11-03  7:07     ` Jiri Slaby
2023-11-03  8:17       ` Eric Dumazet
2023-11-03  9:27         ` Michal Kubecek
2023-11-03  9:53           ` Eric Dumazet
2023-11-03 10:14         ` Jiri Slaby
2023-11-03 10:27           ` Eric Dumazet
2023-11-03 11:07             ` Jiri Slaby
2024-04-02 15:28 ` shironeko
2024-04-02 15:50   ` Eric Dumazet
2024-04-02 16:23     ` Eric Dumazet
2024-04-02 16:29       ` shironeko
2024-04-06  0:22         ` shironeko
2024-04-06  6:11           ` Eric Dumazet
2024-04-06  7:07             ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).