All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
@ 2022-11-24 11:22 Breno Leitao
  2022-11-29  1:00 ` Kuniyuki Iwashima
  2022-11-29 10:18 ` Paolo Abeni
  0 siblings, 2 replies; 14+ messages in thread
From: Breno Leitao @ 2022-11-24 11:22 UTC (permalink / raw)
  To: edumazet, davem, kuba
  Cc: netdev, leit, yoshfuji, pabeni, dsahern, linux-kernel

There are cases where we need information about the socket during a
warning, so, it could help us to find bugs that happens and do not have
an easy repro.

This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
dumps more information about the TCP socket.

This new warning is not only useful to give more insight about kernel bugs, but,
it is also helpful to expose information that might be coming from buggy
BPF applications, such as BPF applications that sets invalid
tcp_sock->snd_cwnd values.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 include/net/tcp.h       |  3 ++-
 include/net/tcp_debug.h | 10 ++++++++++
 net/ipv4/tcp.c          | 30 ++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 include/net/tcp_debug.h

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 14d45661a84d..e490af8e6fdc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -40,6 +40,7 @@
 #include <net/inet_ecn.h>
 #include <net/dst.h>
 #include <net/mptcp.h>
+#include <net/tcp_debug.h>
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
@@ -1229,7 +1230,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
 
 static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
 {
-	WARN_ON_ONCE((int)val <= 0);
+	TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
 	tp->snd_cwnd = val;
 }
 
diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
new file mode 100644
index 000000000000..50e96d87d335
--- /dev/null
+++ b/include/net/tcp_debug.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TCP_DEBUG_H
+#define _LINUX_TCP_DEBUG_H
+
+void tcp_sock_warn(const struct tcp_sock *tp);
+
+#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
+		DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
+
+#endif  /* _LINUX_TCP_DEBUG_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 54836a6b81d6..dd682f60c7cb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
 }
 EXPORT_SYMBOL_GPL(tcp_abort);
 
+void tcp_sock_warn(const struct tcp_sock *tp)
+{
+	const struct sock *sk = (const struct sock *)tp;
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	WARN_ON(1);
+
+	if (!tp)
+		return;
+
+	pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
+		sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
+		ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
+			&inet->inet_daddr);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
+			&sk->sk_v6_daddr);
+		break;
+#endif
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_sock_warn);
+
 extern struct tcp_congestion_ops tcp_reno;
 
 static __initdata unsigned long thash_entries;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-11-24 11:22 [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE() Breno Leitao
@ 2022-11-29  1:00 ` Kuniyuki Iwashima
  2022-11-29 12:47   ` Breno Leitao
  2022-11-29 10:18 ` Paolo Abeni
  1 sibling, 1 reply; 14+ messages in thread
From: Kuniyuki Iwashima @ 2022-11-29  1:00 UTC (permalink / raw)
  To: leitao
  Cc: davem, dsahern, edumazet, kuba, leit, linux-kernel, netdev,
	pabeni, yoshfuji, kuniyu

From:   Breno Leitao <leitao@debian.org>
Date:   Thu, 24 Nov 2022 03:22:29 -0800
> There are cases where we need information about the socket during a
> warning, so, it could help us to find bugs that happens and do not have
> an easy repro.
> 
> This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> dumps more information about the TCP socket.
> 
> This new warning is not only useful to give more insight about kernel bugs, but,
> it is also helpful to expose information that might be coming from buggy
> BPF applications, such as BPF applications that sets invalid
> tcp_sock->snd_cwnd values.

Have you finally found a root cause on BPF or TCP side ?


> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  include/net/tcp.h       |  3 ++-
>  include/net/tcp_debug.h | 10 ++++++++++
>  net/ipv4/tcp.c          | 30 ++++++++++++++++++++++++++++++
>  3 files changed, 42 insertions(+), 1 deletion(-)
>  create mode 100644 include/net/tcp_debug.h
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 14d45661a84d..e490af8e6fdc 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -40,6 +40,7 @@
>  #include <net/inet_ecn.h>
>  #include <net/dst.h>
>  #include <net/mptcp.h>
> +#include <net/tcp_debug.h>
>  
>  #include <linux/seq_file.h>
>  #include <linux/memcontrol.h>
> @@ -1229,7 +1230,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
>  
>  static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
>  {
> -	WARN_ON_ONCE((int)val <= 0);
> +	TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
>  	tp->snd_cwnd = val;
>  }
>  
> diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
> new file mode 100644
> index 000000000000..50e96d87d335
> --- /dev/null
> +++ b/include/net/tcp_debug.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_TCP_DEBUG_H
> +#define _LINUX_TCP_DEBUG_H
> +
> +void tcp_sock_warn(const struct tcp_sock *tp);
> +
> +#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
> +		DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
> +
> +#endif  /* _LINUX_TCP_DEBUG_H */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 54836a6b81d6..dd682f60c7cb 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
>  }
>  EXPORT_SYMBOL_GPL(tcp_abort);
>  
> +void tcp_sock_warn(const struct tcp_sock *tp)
> +{
> +	const struct sock *sk = (const struct sock *)tp;
> +	struct inet_sock *inet = inet_sk(sk);
> +	struct inet_connection_sock *icsk = inet_csk(sk);
> +
> +	WARN_ON(1);
> +
> +	if (!tp)

Is this needed ?


> +		return;
> +
> +	pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> +		sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> +		ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> +
> +	switch (sk->sk_family) {
> +	case AF_INET:
> +		pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> +			&inet->inet_daddr);

As with tcp_syn_flood_action(), [address]:port format is easy
to read and consistent in kernel ?


> +		break;
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
> +			&sk->sk_v6_daddr);
> +		break;
> +#endif
> +	}
> +}
> +EXPORT_SYMBOL_GPL(tcp_sock_warn);
> +
>  extern struct tcp_congestion_ops tcp_reno;
>  
>  static __initdata unsigned long thash_entries;
> -- 
> 2.30.2

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-11-24 11:22 [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE() Breno Leitao
  2022-11-29  1:00 ` Kuniyuki Iwashima
@ 2022-11-29 10:18 ` Paolo Abeni
  2022-11-30  2:28   ` Jakub Kicinski
  1 sibling, 1 reply; 14+ messages in thread
From: Paolo Abeni @ 2022-11-29 10:18 UTC (permalink / raw)
  To: Breno Leitao, edumazet, davem, kuba
  Cc: netdev, leit, yoshfuji, dsahern, linux-kernel

Hello,

On Thu, 2022-11-24 at 03:22 -0800, Breno Leitao wrote:
> There are cases where we need information about the socket during a
> warning, so, it could help us to find bugs that happens and do not have
> an easy repro.
> 
> This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> dumps more information about the TCP socket.
> 
> This new warning is not only useful to give more insight about kernel bugs, but,
> it is also helpful to expose information that might be coming from buggy
> BPF applications, such as BPF applications that sets invalid
> tcp_sock->snd_cwnd values.

I personally find this use-case a little too tight, you could likelly
fetch the same information with a perf probe or something similar.

> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  include/net/tcp.h       |  3 ++-
>  include/net/tcp_debug.h | 10 ++++++++++
>  net/ipv4/tcp.c          | 30 ++++++++++++++++++++++++++++++
>  3 files changed, 42 insertions(+), 1 deletion(-)
>  create mode 100644 include/net/tcp_debug.h
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 14d45661a84d..e490af8e6fdc 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -40,6 +40,7 @@
>  #include <net/inet_ecn.h>
>  #include <net/dst.h>
>  #include <net/mptcp.h>
> +#include <net/tcp_debug.h>
>  
>  #include <linux/seq_file.h>
>  #include <linux/memcontrol.h>
> @@ -1229,7 +1230,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
>  
>  static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
>  {
> -	WARN_ON_ONCE((int)val <= 0);
> +	TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
>  	tp->snd_cwnd = val;
>  }
>  
> diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
> new file mode 100644
> index 000000000000..50e96d87d335
> --- /dev/null
> +++ b/include/net/tcp_debug.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_TCP_DEBUG_H
> +#define _LINUX_TCP_DEBUG_H
> +
> +void tcp_sock_warn(const struct tcp_sock *tp);
> +
> +#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
> +		DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
> +
> +#endif  /* _LINUX_TCP_DEBUG_H */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 54836a6b81d6..dd682f60c7cb 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
>  }
>  EXPORT_SYMBOL_GPL(tcp_abort);
>  
> +void tcp_sock_warn(const struct tcp_sock *tp)
> +{
> +	const struct sock *sk = (const struct sock *)tp;
> +	struct inet_sock *inet = inet_sk(sk);
> +	struct inet_connection_sock *icsk = inet_csk(sk);
> +
> +	WARN_ON(1);
> +
> +	if (!tp)
> +		return;
> +
> +	pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> +		sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> +		ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> +
> +	switch (sk->sk_family) {
> +	case AF_INET:
> +		pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> +			&inet->inet_daddr);
> +		break;
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
> +			&sk->sk_v6_daddr);
> +		break;
> +#endif

Please, adjust the output format as suggested by Kuniyuki,

thanks!

Paolo


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-11-29  1:00 ` Kuniyuki Iwashima
@ 2022-11-29 12:47   ` Breno Leitao
  2022-11-29 21:16     ` Iwashima, Kuniyuki
  0 siblings, 1 reply; 14+ messages in thread
From: Breno Leitao @ 2022-11-29 12:47 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: davem, dsahern, edumazet, kuba, leit, linux-kernel, netdev,
	pabeni, yoshfuji

On Tue, Nov 29, 2022 at 10:00:55AM +0900, Kuniyuki Iwashima wrote:
> From:   Breno Leitao <leitao@debian.org>
> Date:   Thu, 24 Nov 2022 03:22:29 -0800
> > There are cases where we need information about the socket during a
> > warning, so, it could help us to find bugs that happens and do not have
> > an easy repro.
> > 
> > This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> > dumps more information about the TCP socket.
> > 
> > This new warning is not only useful to give more insight about kernel bugs, but,
> > it is also helpful to expose information that might be coming from buggy
> > BPF applications, such as BPF applications that sets invalid
> > tcp_sock->snd_cwnd values.
> 
> Have you finally found a root cause on BPF or TCP side ?

Yes, this demonstrated to be very useful to find out BPF applications
that are doing nasty things with the congestion window.

We currently have this patch applied to Meta's infrastructure to track
BPF applications that are misbehaving, and easily track down to which
BPF application is the responsible one.

> > +#endif  /* _LINUX_TCP_DEBUG_H */
> > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > index 54836a6b81d6..dd682f60c7cb 100644
> > --- a/net/ipv4/tcp.c
> > +++ b/net/ipv4/tcp.c
> > @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
> >  }
> >  EXPORT_SYMBOL_GPL(tcp_abort);
> >  
> > +void tcp_sock_warn(const struct tcp_sock *tp)
> > +{
> > +	const struct sock *sk = (const struct sock *)tp;
> > +	struct inet_sock *inet = inet_sk(sk);
> > +	struct inet_connection_sock *icsk = inet_csk(sk);
> > +
> > +	WARN_ON(1);
> > +
> > +	if (!tp)
> 
> Is this needed ?

We are de-referencing tp/sk in the lines below, so, I think it is safe to
check if they are not NULL before the de-refencing it.

Should I do check for "ck" instead of "tp" to make the code a bit
cleaner to read?

> > +	pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> > +		sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> > +		ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> > +
> > +	switch (sk->sk_family) {
> > +	case AF_INET:
> > +		pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> > +			&inet->inet_daddr);
> 
> As with tcp_syn_flood_action(), [address]:port format is easy
> to read and consistent in kernel ?

Absolutely. I am going to fix it in v2. Thanks!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-11-29 12:47   ` Breno Leitao
@ 2022-11-29 21:16     ` Iwashima, Kuniyuki
  2022-11-30 13:18       ` Breno Leitao
  0 siblings, 1 reply; 14+ messages in thread
From: Iwashima, Kuniyuki @ 2022-11-29 21:16 UTC (permalink / raw)
  To: Breno Leitao
  Cc: davem, dsahern, edumazet, kuba, leit, linux-kernel, netdev,
	pabeni, yoshfuji



> On Nov 29, 2022, at 21:48, Breno Leitao <leitao@debian.org> wrote:
>> On Tue, Nov 29, 2022 at 10:00:55AM +0900, Kuniyuki Iwashima wrote:
>> From:   Breno Leitao <leitao@debian.org>
>> Date:   Thu, 24 Nov 2022 03:22:29 -0800
>>> There are cases where we need information about the socket during a
>>> warning, so, it could help us to find bugs that happens and do not have
>>> an easy repro.
>>> 
>>> This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
>>> dumps more information about the TCP socket.
>>> 
>>> This new warning is not only useful to give more insight about kernel bugs, but,
>>> it is also helpful to expose information that might be coming from buggy
>>> BPF applications, such as BPF applications that sets invalid
>>> tcp_sock->snd_cwnd values.
>> 
>> Have you finally found a root cause on BPF or TCP side ?
> 
> Yes, this demonstrated to be very useful to find out BPF applications
> that are doing nasty things with the congestion window.
> 
> We currently have this patch applied to Meta's infrastructure to track
> BPF applications that are misbehaving, and easily track down to which
> BPF application is the responsible one.

If you have a fix merged on the BPF side, 
it would be helpful to mention the commit to 
well understand the issue, background, 
and why other tooling is not enough as Paolo wondered.



>>> +#endif  /* _LINUX_TCP_DEBUG_H */
>>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>>> index 54836a6b81d6..dd682f60c7cb 100644
>>> --- a/net/ipv4/tcp.c
>>> +++ b/net/ipv4/tcp.c
>>> @@ -4705,6 +4705,36 @@ int tcp_abort(struct sock *sk, int err)
>>> }
>>> EXPORT_SYMBOL_GPL(tcp_abort);
>>> 
>>> +void tcp_sock_warn(const struct tcp_sock *tp)
>>> +{
>>> +   const struct sock *sk = (const struct sock *)tp;
>>> +   struct inet_sock *inet = inet_sk(sk);
>>> +   struct inet_connection_sock *icsk = inet_csk(sk);
>>> +
>>> +   WARN_ON(1);
>>> +
>>> +   if (!tp)
>> 
>> Is this needed ?
> 
> We are de-referencing tp/sk in the lines below, so, I think it is safe to
> check if they are not NULL before the de-refencing it.

tp->snd_cwnd is accessed just after this WARN, 
so I thought there were no cases where tp is NULL.
If it exists, KASAN should be complaining.
I think this additional if could confuse future readers and 
want to make sure if there is such a case.

Thank you!

> 
> Should I do check for "ck" instead of "tp" to make the code a bit
> cleaner to read?
> 
>>> +   pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
>>> +           sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
>>> +           ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
>>> +
>>> +   switch (sk->sk_family) {
>>> +   case AF_INET:
>>> +           pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
>>> +                   &inet->inet_daddr);
>> 
>> As with tcp_syn_flood_action(), [address]:port format is easy
>> to read and consistent in kernel ?
> 
> Absolutely. I am going to fix it in v2. Thanks!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-11-29 10:18 ` Paolo Abeni
@ 2022-11-30  2:28   ` Jakub Kicinski
  0 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2022-11-30  2:28 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Breno Leitao, edumazet, davem, netdev, leit, yoshfuji, dsahern,
	linux-kernel

On Tue, 29 Nov 2022 11:18:27 +0100 Paolo Abeni wrote:
> On Thu, 2022-11-24 at 03:22 -0800, Breno Leitao wrote:
> > There are cases where we need information about the socket during a
> > warning, so, it could help us to find bugs that happens and do not have
> > an easy repro.
> > 
> > This diff creates a TCP socket-specific version of WARN_ON_ONCE(), which
> > dumps more information about the TCP socket.
> > 
> > This new warning is not only useful to give more insight about kernel bugs, but,
> > it is also helpful to expose information that might be coming from buggy
> > BPF applications, such as BPF applications that sets invalid
> > tcp_sock->snd_cwnd values.  
> 
> I personally find this use-case a little too tight, you could likelly
> fetch the same information with a perf probe or something similar.

It's just the initial case, to keep the patch small.
The intent is to convert all TCP warnings to this helper.

As Breno says in the first sentence this is about having enough
relevant information to zero in on the cause of the rare crashes /
warnings (which are hit quite a lot on our "millions of machines").

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-11-29 21:16     ` Iwashima, Kuniyuki
@ 2022-11-30 13:18       ` Breno Leitao
  0 siblings, 0 replies; 14+ messages in thread
From: Breno Leitao @ 2022-11-30 13:18 UTC (permalink / raw)
  To: Iwashima, Kuniyuki
  Cc: davem, dsahern, edumazet, kuba, leit, linux-kernel, netdev,
	pabeni, yoshfuji

On Tue, Nov 29, 2022 at 09:16:16PM +0000, Iwashima, Kuniyuki wrote:
> > On Nov 29, 2022, at 21:48, Breno Leitao <leitao@debian.org> wrote:
> >> On Tue, Nov 29, 2022 at 10:00:55AM +0900, Kuniyuki Iwashima wrote:

<snip>

> >>> +void tcp_sock_warn(const struct tcp_sock *tp)
> >>> +{
> >>> +   const struct sock *sk = (const struct sock *)tp;
> >>> +   struct inet_sock *inet = inet_sk(sk);
> >>> +   struct inet_connection_sock *icsk = inet_csk(sk);
> >>> +
> >>> +   WARN_ON(1);
> >>> +
> >>> +   if (!tp)
> >> 
> >> Is this needed ?
> > 
> > We are de-referencing tp/sk in the lines below, so, I think it is safe to
> > check if they are not NULL before the de-refencing it.
> 
> tp->snd_cwnd is accessed just after this WARN, 
> so I thought there were no cases where tp is NULL.

Oh, important to say that we want to re-use this macro on other places
as well. This initial usage (on tcp_snd_cwnd_set()) is just for the
initial patch. I see value replacing some WARN_ON_*() by
TCP_SOCK_WARN_ON_ONCE() in other parts of the code, so, this check is to
protect this warning when TCP_SOCK_WARN_ON_ONCE() is called from
different places.

Anyway, I definitely can remove the check here, but, we might want to
re-add it later, as we replace some WARN_ON_* by TCP_SOCK_WARN_ON_*();

> I think this additional if could confuse future readers and 
> want to make sure if there is such a case.

How come checking if a pointer is valid before de-refencing it could
confuse readers?

Thank you for the review!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-12-08 15:44       ` Breno Leitao
@ 2022-12-09  8:39         ` Eric Dumazet
  0 siblings, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2022-12-09  8:39 UTC (permalink / raw)
  To: Breno Leitao
  Cc: David Miller, Jakub Kicinski, netdev, leit, Hideaki YOSHIFUJI,
	Paolo Abeni, David Ahern, LKML

On Thu, Dec 8, 2022 at 4:44 PM Breno Leitao <leitao@debian.org> wrote:
>
> On Wed, Dec 07, 2022 at 06:59:38PM +0100, Eric Dumazet wrote:

> > Try to give us symbols with scripts/decode_stacktrace.sh , thanks.
>
> Sorry, here it is:
>
>  [749619.538804] WARNING: CPU: 19 PID: 0 at net/ipv4/tcp.c:4552 tcp_sock_warn+0x6/0x20
>  [749619.553969] Modules linked in: sch_fq sunrpc bpf_preload tls act_gact cls_bpf tcp_diag inet_diag skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel iTCO_wdt iTCO_vendor_support kvm evdev ses irqbypass enclosure i2c_i801 i2c_smbus ipmi_si ipmi_devintf ipmi_msghandler acpi_cpufreq button tpm_crb sch_fq_codel vhost_net tun tap vhost vhost_iotlb virtio_net net_failover failover mpls_gso mpls_iptunnel mpls_router ip_tunnel fou ip6_udp_tunnel udp_tunnel fuse sg nvme mpi3mr scsi_transport_sas nvme_core xhci_pci xhci_hcd loop efivarfs autofs4
>  [749619.678066] Hardware name: XXXXX
>  [749619.695308] RIP: tcp_sock_warn+0x6/0x20
>  [749619.704034] Code: 4d 01 3e 85 c0 0f 84 57 ff ff ff 48 8b 0c 24 44 8b 01 eb 82 e8 eb b7 14 00 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 53 <0f> 0b 48 85 ff 0f 85 77 70 14 00 5b c3 66 66 2e 0f 1f 84 00 00 00
>  All code
>  ========
>     0:  4d 01 3e                add    %r15,(%r14)
>     3:  85 c0                   test   %eax,%eax
>     5:  0f 84 57 ff ff ff       je     0xffffffffffffff62
>     b:  48 8b 0c 24             mov    (%rsp),%rcx
>     f:  44 8b 01                mov    (%rcx),%r8d
>    12:  eb 82                   jmp    0xffffffffffffff96
>    14:  e8 eb b7 14 00          callq  0x14b804
>    19:  66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)
>    20:  00 00 00 00
>    24:  0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
>    29:  53                      push   %rbx
>    2a:* 0f 0b                   ud2             <-- trapping instruction
>    2c:  48 85 ff                test   %rdi,%rdi
>    2f:  0f 85 77 70 14 00       jne    0x1470ac
>    35:  5b                      pop    %rbx
>    36:  c3                      retq
>    37:  66                      data16
>    38:  66                      data16
>    39:  2e                      cs
>    3a:  0f                      .byte 0xf
>    3b:  1f                      (bad)
>    3c:  84 00                   test   %al,(%rax)
>         ...
>
>  Code starting with the faulting instruction
>  ===========================================
>     0:  0f 0b                   ud2
>     2:  48 85 ff                test   %rdi,%rdi
>     5:  0f 85 77 70 14 00       jne    0x147082
>     b:  5b                      pop    %rbx
>     c:  c3                      retq
>     d:  66                      data16
>     e:  66                      data16
>     f:  2e                      cs
>    10:  0f                      .byte 0xf
>    11:  1f                      (bad)
>    12:  84 00                   test   %al,(%rax)
>         ...
>  [749619.741779] RSP: 0018:ffffc90000d08988 EFLAGS: 00010246
>  [749619.752436] RAX: ffff88814b57f5c0 RBX: ffff8881bd2540c0 RCX: ffffc90000d08a34
>  [749619.766900] RDX: 0000000000000000 RSI: 00000000cda8f4af RDI: ffff8881bd2540c0
>  [749619.781364] RBP: 0000000000000000 R08: ffffc90000d08a38 R09: 00000000cda8f44f
>  [749619.795831] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
>  [749619.810300] R13: ffffc90000d08a34 R14: 0000000000011406 R15: 0000000000000000
>  [749619.824788] FS:  0000000000000000(0000) GS:ffff88903f8c0000(0000) knlGS:0000000000000000
>  [749619.841168] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  [749619.852857] CR2: 000000000007c2e9 CR3: 0000000b82412002 CR4: 00000000007706e0
>  [749619.867331] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  [749619.881800] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>  [749619.896260] PKRU: 55555554
>  [749619.901859] Call Trace:
>  [749619.906927]  <IRQ>
>  [749619.911129] tcp_fastretrans_alert+0x988/0x9e0
>  [749619.920222] ? kmem_cache_free+0x33c/0x3d0
>  [749619.928606] tcp_ack+0x8b4/0x1360
>  [749619.935425] ? __cgroup_bpf_run_filter_skb+0x185/0x440
>  [749619.945910] tcp_rcv_established+0x2f3/0x660
>  [749619.954639] ? sk_filter_trim_cap+0xbc/0x220
>  [749619.963358] tcp_v6_do_rcv+0xbe/0x3e0
>  [749619.970863] tcp_v6_rcv+0xc01/0xc90

Still no symbols (file name : line number).

>  [749619.978029] ip6_protocol_deliver_rcu+0xbd/0x450
>  [749619.987453] ip6_input_finish+0x3d/0x60
>  [749619.995313] ip6_input+0xb5/0xc0
>  [749620.001958] ip6_sublist_rcv_finish+0x37/0x50
>  [749620.010851] ip6_sublist_rcv+0x1dd/0x270
>  [749620.018877] ipv6_list_rcv+0x113/0x140
>  [749620.026552] __netif_receive_skb_list_core+0x1a0/0x210
>  [749620.037025] netif_receive_skb_list_internal+0x186/0x2a0
>  [749620.047838] ? napi_gro_complete+0x6c/0xd0
>  [749620.056215] gro_normal_list.part.171+0x19/0x40
>  [749620.065471] napi_complete_done+0x65/0x150
>  [749620.073856] bnxt_poll_p5+0x25b/0x2b0
>  [749620.081369] __napi_poll+0x25/0x120
>  [749620.088537] net_rx_action+0x189/0x300
>  [749620.096224] __do_softirq+0xbb/0x271
>  [749620.103571] irq_exit_rcu+0x97/0xa0
>  [749620.110732] common_interrupt+0x7f/0xa0
>  [749620.118595]  </IRQ>
>  [749620.122964] asm_common_interrupt+0x1e/0x40
>  [749620.131511] RIP: cpuidle_enter_state+0xc2/0x340
>  [749620.141621] Code: 48 89 c5 0f 1f 44 00 00 31 ff e8 f9 8d 73 ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 38 02 00 00 31 ff e8 b2 36 79 ff fb 45 85 f6 <0f> 88 e8 00 00 00 49 63 d6 48 2b 2c 24 48 6b ca 68 48 8d 04 52 48

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-12-07 17:59     ` Eric Dumazet
@ 2022-12-08 15:44       ` Breno Leitao
  2022-12-09  8:39         ` Eric Dumazet
  0 siblings, 1 reply; 14+ messages in thread
From: Breno Leitao @ 2022-12-08 15:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Jakub Kicinski, netdev, leit, Hideaki YOSHIFUJI,
	Paolo Abeni, David Ahern, LKML

On Wed, Dec 07, 2022 at 06:59:38PM +0100, Eric Dumazet wrote:
> On Wed, Dec 7, 2022 at 6:37 PM Breno Leitao <leitao@debian.org> wrote:
> >
> > On Sat, Sep 03, 2022 at 09:42:43AM -0700, Eric Dumazet wrote:
> > > On Wed, Aug 31, 2022 at 6:38 AM Breno Leitao <leitao@debian.org> wrote:
> > > >
> > > > There are cases where we need information about the socket during a
> > > > warning, so, it could help us to find bugs that happens that do not have
> > > > a easily repro.
> > > >
> > > > BPF congestion control algorithms can change socket state in unexpected
> > > > ways, leading to WARNings. Additional information about the socket state
> > > > is useful to identify the culprit.
> > >
> > > Well, this suggests we need to fix BPF side ?
> > >
> > > Not sure how this can happen, because TCP_BPF_IW has
> > >
> > > if (val <= 0 || tp->data_segs_out > tp->syn_data)
> > >      ret = -EINVAL;
> > > else
> > >     tcp_snd_cwnd_set(tp, val);
> >
> > I am not sure we are hitting this path, please check the stack below
> >
> > > It seems you already found the issue in an eBPF CC, can you share the details ?
> >
> > Sure, here is an example of what we are facing (with some obfuscation).
> > Remeber that there are users' BPF application running:
> >
> > [155375.750105] ------------[ cut here ]------------
> > [155375.759526] WARNING: CPU: 19 PID: 0 at net/ipv4/tcp.c:4552 tcp_sock_warn+0x6/0x20
> > [155375.774700] Modules linked in: vhost_net tun vhost vhost_iotlb tap virtio_net net_failover failover mpls_gso mpls_iptunnel mpls_router ip_tunnel fou ip6_udp_tunnel udp_tunnel sch_fq sunrpc bpf_preload tls act_gact cls_bpf tcp_diag inet_diag skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp iTCO_wdt kvm_intel iTCO_vendor_support kvm evdev ses irqbypass enclosure i2c_i801 i2c_smbus ipmi_si ipmi_devintf ipmi_msghandler acpi_cpufreq button tpm_crb sch_fq_codel fuse sg nvme mpi3mr scsi_transport_sas nvme_core xhci_pci xhci_hcd loop efivarfs autofs4
> > [155375.874916] CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G S                5.12.0-0_XXXXXXXX_g0fed6f189e14 #1
> > [155375.898770] Hardware name: XXXX XXX XXXX XXXXX, BIOS BS_BIOS_XXX XX/XX/2022
> > [155375.916015] RIP: 0010:tcp_sock_warn+0x6/0x20
> > [155375.924755] Code: 4d 01 3e 85 c0 0f 84 57 ff ff ff 48 8b 0c 24 44 8b 01 eb 82 e8 eb b7 14 00 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 53 <0f> 0b 48 85 ff 0f 85 77 70 14 00 5b c3 66 66 2e 0f 1f 84 00 00 00
> > [155375.962518] RSP: 0018:ffffc90000d08988 EFLAGS: 00010246
> > [155375.973157] RAX: ffff88817432b5c0 RBX: ffff88828f748000 RCX: ffffc90000d08a34
> > [155375.987614] RDX: 0000000000000000 RSI: 00000000822be8c8 RDI: ffff88828f748000
> > [155376.002074] RBP: 0000000000000040 R08: ffffc90000d08a38 R09: 00000000822be334
> > [155376.016531] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
> > [155376.030988] R13: ffffc90000d08a34 R14: 0000000000005546 R15: 0000000000000000
> > [155376.045441] FS:  0000000000000000(0000) GS:ffff88903f8c0000(0000) knlGS:0000000000000000
> > [155376.061804] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [155376.073474] CR2: 00007fe407603080 CR3: 0000000b19993006 CR4: 00000000007706e0
> > [155376.087928] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [155376.102383] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > [155376.116841] PKRU: 55555554
> > [155376.122427] Call Trace:
> > [155376.127497]  <IRQ>
> > [155376.131697]  tcp_fastretrans_alert+0x988/0x9e0
> > [155376.140774]  tcp_ack+0x8b4/0x1360
> > [155376.147581]  ? __cgroup_bpf_run_filter_skb+0x185/0x440
> > [155376.158037]  tcp_rcv_established+0x135/0x660
> > [155376.166755]  ? sk_filter_trim_cap+0xbc/0x220
> > [155376.175472]  tcp_v6_do_rcv+0xbe/0x3e0
> > [155376.182974]  tcp_v6_rcv+0xc01/0xc90
> > [155376.190128]  ip6_protocol_deliver_rcu+0xbd/0x450
> > [155376.199541]  ip6_input_finish+0x3d/0x60
> > [155376.207388]  ip6_input+0xb5/0xc0
> > [155376.214019]  ip6_sublist_rcv_finish+0x37/0x50
> > [155376.222912]  ip6_sublist_rcv+0x1dd/0x270
> > [155376.230935]  ipv6_list_rcv+0x113/0x140
> > [155376.238618]  __netif_receive_skb_list_core+0x1a0/0x210
> > [155376.249080]  netif_receive_skb_list_internal+0x186/0x2a0
> > [155376.259887]  gro_normal_list.part.171+0x19/0x40
> > [155376.269137]  napi_complete_done+0x65/0x150
> > [155376.277514]  bnxt_poll_p5+0x25b/0x2b0
> > [155376.285027]  ? tcp_write_xmit+0x278/0x1060
> > [155376.293398]  __napi_poll+0x25/0x120
> > [155376.300552]  net_rx_action+0x189/0x300
> > [155376.308227]  __do_softirq+0xbb/0x271
> > [155376.315554]  irq_exit_rcu+0x97/0xa0
> > [155376.322710]  common_interrupt+0x7f/0xa0
> > [155376.330566]  </IRQ>
> > [155376.334947]  asm_common_interrupt+0x1e/0x40
> > [155376.343499] RIP: 0010:cpuidle_enter_state+0xc2/0x340
> > [155376.353619] Code: 48 89 c5 0f 1f 44 00 00 31 ff e8 f9 8d 73 ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 38 02 00 00 31 ff e8 b2 36 79 ff fb 45 85 f6 <0f> 88 e8 00 00 00 49 63 d6 48 2b 2c 24 48 6b ca 68 48 8d 04 52 48
> > [155376.391367] RSP: 0018:ffffc90000293e90 EFLAGS: 00000202
> > [155376.402004] RAX: ffff88903f8eaa80 RBX: ffffe8ffff4c6c00 RCX: 000000000000001f
> > [155376.416456] RDX: 00008d503c63f228 RSI: 000000005ba4b680 RDI: 0000000000000000
> > [155376.430910] RBP: 00008d503c63f228 R08: 0000000000000002 R09: 000000000002a300
> > [155376.445372] R10: 002435b781e62f4a R11: ffff88903f8e9a84 R12: 0000000000000001
> > [155376.459832] R13: ffffffff83621200 R14: 0000000000000001 R15: 0000000000000000
> > [155376.474307]  cpuidle_enter+0x29/0x40
> > [155376.481654]  do_idle+0x1bb/0x200
> > [155376.488297]  cpu_startup_entry+0x19/0x20
> > [155376.496320]  start_secondary+0x104/0x140
> > [155376.504342]  secondary_startup_64_no_verify+0xb0/0xbb
> > [155376.514629] ---[ end trace 9b428a0d7901c3ff ]---
> > [155376.524041] TCP: Socket Info: family=10 state=1 sport=12345 dport=57616 ccname=ned_tcp_dctcp01 cwnd=1
> > [155376.524045] TCP: saddr=XXXX:XXXX:XXXX:XXXX:XXXX:0000:00c8:0000 daddr=XXXX:XXXX:XXXX:XXXX:XXXX:0000:0288:00
> 
> Try to give us symbols with scripts/decode_stacktrace.sh , thanks.

Sorry, here it is:

 [749619.538804] WARNING: CPU: 19 PID: 0 at net/ipv4/tcp.c:4552 tcp_sock_warn+0x6/0x20
 [749619.553969] Modules linked in: sch_fq sunrpc bpf_preload tls act_gact cls_bpf tcp_diag inet_diag skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel iTCO_wdt iTCO_vendor_support kvm evdev ses irqbypass enclosure i2c_i801 i2c_smbus ipmi_si ipmi_devintf ipmi_msghandler acpi_cpufreq button tpm_crb sch_fq_codel vhost_net tun tap vhost vhost_iotlb virtio_net net_failover failover mpls_gso mpls_iptunnel mpls_router ip_tunnel fou ip6_udp_tunnel udp_tunnel fuse sg nvme mpi3mr scsi_transport_sas nvme_core xhci_pci xhci_hcd loop efivarfs autofs4
 [749619.678066] Hardware name: XXXXX
 [749619.695308] RIP: tcp_sock_warn+0x6/0x20
 [749619.704034] Code: 4d 01 3e 85 c0 0f 84 57 ff ff ff 48 8b 0c 24 44 8b 01 eb 82 e8 eb b7 14 00 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 53 <0f> 0b 48 85 ff 0f 85 77 70 14 00 5b c3 66 66 2e 0f 1f 84 00 00 00
 All code
 ========
    0:	4d 01 3e             	add    %r15,(%r14)
    3:	85 c0                	test   %eax,%eax
    5:	0f 84 57 ff ff ff    	je     0xffffffffffffff62
    b:	48 8b 0c 24          	mov    (%rsp),%rcx
    f:	44 8b 01             	mov    (%rcx),%r8d
   12:	eb 82                	jmp    0xffffffffffffff96
   14:	e8 eb b7 14 00       	callq  0x14b804
   19:	66 66 2e 0f 1f 84 00 	data16 nopw %cs:0x0(%rax,%rax,1)
   20:	00 00 00 00
   24:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
   29:	53                   	push   %rbx
   2a:*	0f 0b                	ud2    		<-- trapping instruction
   2c:	48 85 ff             	test   %rdi,%rdi
   2f:	0f 85 77 70 14 00    	jne    0x1470ac
   35:	5b                   	pop    %rbx
   36:	c3                   	retq
   37:	66                   	data16
   38:	66                   	data16
   39:	2e                   	cs
   3a:	0f                   	.byte 0xf
   3b:	1f                   	(bad)
   3c:	84 00                	test   %al,(%rax)
 	...

 Code starting with the faulting instruction
 ===========================================
    0:	0f 0b                	ud2
    2:	48 85 ff             	test   %rdi,%rdi
    5:	0f 85 77 70 14 00    	jne    0x147082
    b:	5b                   	pop    %rbx
    c:	c3                   	retq
    d:	66                   	data16
    e:	66                   	data16
    f:	2e                   	cs
   10:	0f                   	.byte 0xf
   11:	1f                   	(bad)
   12:	84 00                	test   %al,(%rax)
 	...
 [749619.741779] RSP: 0018:ffffc90000d08988 EFLAGS: 00010246
 [749619.752436] RAX: ffff88814b57f5c0 RBX: ffff8881bd2540c0 RCX: ffffc90000d08a34
 [749619.766900] RDX: 0000000000000000 RSI: 00000000cda8f4af RDI: ffff8881bd2540c0
 [749619.781364] RBP: 0000000000000000 R08: ffffc90000d08a38 R09: 00000000cda8f44f
 [749619.795831] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
 [749619.810300] R13: ffffc90000d08a34 R14: 0000000000011406 R15: 0000000000000000
 [749619.824788] FS:  0000000000000000(0000) GS:ffff88903f8c0000(0000) knlGS:0000000000000000
 [749619.841168] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [749619.852857] CR2: 000000000007c2e9 CR3: 0000000b82412002 CR4: 00000000007706e0
 [749619.867331] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 [749619.881800] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 [749619.896260] PKRU: 55555554
 [749619.901859] Call Trace:
 [749619.906927]  <IRQ>
 [749619.911129] tcp_fastretrans_alert+0x988/0x9e0
 [749619.920222] ? kmem_cache_free+0x33c/0x3d0
 [749619.928606] tcp_ack+0x8b4/0x1360
 [749619.935425] ? __cgroup_bpf_run_filter_skb+0x185/0x440
 [749619.945910] tcp_rcv_established+0x2f3/0x660
 [749619.954639] ? sk_filter_trim_cap+0xbc/0x220
 [749619.963358] tcp_v6_do_rcv+0xbe/0x3e0
 [749619.970863] tcp_v6_rcv+0xc01/0xc90
 [749619.978029] ip6_protocol_deliver_rcu+0xbd/0x450
 [749619.987453] ip6_input_finish+0x3d/0x60
 [749619.995313] ip6_input+0xb5/0xc0
 [749620.001958] ip6_sublist_rcv_finish+0x37/0x50
 [749620.010851] ip6_sublist_rcv+0x1dd/0x270
 [749620.018877] ipv6_list_rcv+0x113/0x140
 [749620.026552] __netif_receive_skb_list_core+0x1a0/0x210
 [749620.037025] netif_receive_skb_list_internal+0x186/0x2a0
 [749620.047838] ? napi_gro_complete+0x6c/0xd0
 [749620.056215] gro_normal_list.part.171+0x19/0x40
 [749620.065471] napi_complete_done+0x65/0x150
 [749620.073856] bnxt_poll_p5+0x25b/0x2b0
 [749620.081369] __napi_poll+0x25/0x120
 [749620.088537] net_rx_action+0x189/0x300
 [749620.096224] __do_softirq+0xbb/0x271
 [749620.103571] irq_exit_rcu+0x97/0xa0
 [749620.110732] common_interrupt+0x7f/0xa0
 [749620.118595]  </IRQ>
 [749620.122964] asm_common_interrupt+0x1e/0x40
 [749620.131511] RIP: cpuidle_enter_state+0xc2/0x340
 [749620.141621] Code: 48 89 c5 0f 1f 44 00 00 31 ff e8 f9 8d 73 ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 38 02 00 00 31 ff e8 b2 36 79 ff fb 45 85 f6 <0f> 88 e8 00 00 00 49 63 d6 48 2b 2c 24 48 6b ca 68 48 8d 04 52 48
 All code
 ========
    0:	48 89 c5             	mov    %rax,%rbp
    3:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
    8:	31 ff                	xor    %edi,%edi
    a:	e8 f9 8d 73 ff       	callq  0xffffffffff738e08
    f:	45 84 ff             	test   %r15b,%r15b
   12:	74 12                	je     0x26
   14:	9c                   	pushfq
   15:	58                   	pop    %rax
   16:	f6 c4 02             	test   $0x2,%ah
   19:	0f 85 38 02 00 00    	jne    0x257
   1f:	31 ff                	xor    %edi,%edi
   21:	e8 b2 36 79 ff       	callq  0xffffffffff7936d8
   26:	fb                   	sti
   27:	45 85 f6             	test   %r14d,%r14d
   2a:*	0f 88 e8 00 00 00    	js     0x118		<-- trapping instruction
   30:	49 63 d6             	movslq %r14d,%rdx
   33:	48 2b 2c 24          	sub    (%rsp),%rbp
   37:	48 6b ca 68          	imul   $0x68,%rdx,%rcx
   3b:	48 8d 04 52          	lea    (%rdx,%rdx,2),%rax
   3f:	48                   	rex.W
 
 Code starting with the faulting instruction
 ===========================================
    0:	0f 88 e8 00 00 00    	js     0xee
    6:	49 63 d6             	movslq %r14d,%rdx
    9:	48 2b 2c 24          	sub    (%rsp),%rbp
    d:	48 6b ca 68          	imul   $0x68,%rdx,%rcx
   11:	48 8d 04 52          	lea    (%rdx,%rdx,2),%rax
   15:	48                   	rex.W
 [749620.179375] RSP: 0018:ffffc90000293e90 EFLAGS: 00000202
 [749620.190016] RAX: ffff88903f8eaa80 RBX: ffffe8ffff4c6c00 RCX: 000000000000001f
 [749620.204491] RDX: 0002a9c665c30050 RSI: 000000005ba4b680 RDI: 0000000000000000
 [749620.218949] RBP: 0002a9c665c30050 R08: 0000000000000002 R09: 000000000002a300
 [749620.233403] R10: 00219fc7cf6dd4ca R11: ffff88903f8e9a84 R12: 0000000000000001
 [749620.247854] R13: ffffffff83621200 R14: 0000000000000001 R15: 0000000000000000
 [749620.262315] cpuidle_enter+0x29/0x40
 [749620.269649] do_idle+0x1bb/0x200
 [749620.276296] cpu_startup_entry+0x19/0x20
 [749620.284333] start_secondary+0x104/0x140
 [749620.292364] secondary_startup_64_no_verify+0xb0/0xbb
 [749620.302657] ---[ end trace 44f03a0d3a4e81fd ]---

> Also 5.12 is kind of old :/

Right. Unfortunately this problem is not easily reproducible, and, we
need millions of machines running some specific workload to reproduce
it.


That said, the motivation for this patch is to dump more TCP information
when we get such warnings, so, we have relevant data to understand more
about the issues (when it happens). I.e, this patch is not this problem
above, but, providing more information when we hit such warnings.

That said, this is patch is about creating the
TCP_SOCK_WARN_ON_ONCE() macro, and eventually move some of the
WARN_ON_ONCE() to TCP_SOCK_WARN_ON_ONCE(). (I.e, this is not a patch to solve
the case above.)

Anyway, I got some feedbacks from the patch, let me send a V2 of the
patch, which I expect to clarify the problem better.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-12-07 17:37   ` Breno Leitao
@ 2022-12-07 17:59     ` Eric Dumazet
  2022-12-08 15:44       ` Breno Leitao
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2022-12-07 17:59 UTC (permalink / raw)
  To: Breno Leitao
  Cc: David Miller, Jakub Kicinski, netdev, leit, Hideaki YOSHIFUJI,
	Paolo Abeni, David Ahern, LKML

On Wed, Dec 7, 2022 at 6:37 PM Breno Leitao <leitao@debian.org> wrote:
>
> On Sat, Sep 03, 2022 at 09:42:43AM -0700, Eric Dumazet wrote:
> > On Wed, Aug 31, 2022 at 6:38 AM Breno Leitao <leitao@debian.org> wrote:
> > >
> > > There are cases where we need information about the socket during a
> > > warning, so, it could help us to find bugs that happens that do not have
> > > a easily repro.
> > >
> > > BPF congestion control algorithms can change socket state in unexpected
> > > ways, leading to WARNings. Additional information about the socket state
> > > is useful to identify the culprit.
> >
> > Well, this suggests we need to fix BPF side ?
> >
> > Not sure how this can happen, because TCP_BPF_IW has
> >
> > if (val <= 0 || tp->data_segs_out > tp->syn_data)
> >      ret = -EINVAL;
> > else
> >     tcp_snd_cwnd_set(tp, val);
>
> I am not sure we are hitting this path, please check the stack below
>
> > It seems you already found the issue in an eBPF CC, can you share the details ?
>
> Sure, here is an example of what we are facing (with some obfuscation).
> Remeber that there are users' BPF application running:
>
> [155375.750105] ------------[ cut here ]------------
> [155375.759526] WARNING: CPU: 19 PID: 0 at net/ipv4/tcp.c:4552 tcp_sock_warn+0x6/0x20
> [155375.774700] Modules linked in: vhost_net tun vhost vhost_iotlb tap virtio_net net_failover failover mpls_gso mpls_iptunnel mpls_router ip_tunnel fou ip6_udp_tunnel udp_tunnel sch_fq sunrpc bpf_preload tls act_gact cls_bpf tcp_diag inet_diag skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp iTCO_wdt kvm_intel iTCO_vendor_support kvm evdev ses irqbypass enclosure i2c_i801 i2c_smbus ipmi_si ipmi_devintf ipmi_msghandler acpi_cpufreq button tpm_crb sch_fq_codel fuse sg nvme mpi3mr scsi_transport_sas nvme_core xhci_pci xhci_hcd loop efivarfs autofs4
> [155375.874916] CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G S                5.12.0-0_XXXXXXXX_g0fed6f189e14 #1
> [155375.898770] Hardware name: XXXX XXX XXXX XXXXX, BIOS BS_BIOS_XXX XX/XX/2022
> [155375.916015] RIP: 0010:tcp_sock_warn+0x6/0x20
> [155375.924755] Code: 4d 01 3e 85 c0 0f 84 57 ff ff ff 48 8b 0c 24 44 8b 01 eb 82 e8 eb b7 14 00 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 53 <0f> 0b 48 85 ff 0f 85 77 70 14 00 5b c3 66 66 2e 0f 1f 84 00 00 00
> [155375.962518] RSP: 0018:ffffc90000d08988 EFLAGS: 00010246
> [155375.973157] RAX: ffff88817432b5c0 RBX: ffff88828f748000 RCX: ffffc90000d08a34
> [155375.987614] RDX: 0000000000000000 RSI: 00000000822be8c8 RDI: ffff88828f748000
> [155376.002074] RBP: 0000000000000040 R08: ffffc90000d08a38 R09: 00000000822be334
> [155376.016531] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
> [155376.030988] R13: ffffc90000d08a34 R14: 0000000000005546 R15: 0000000000000000
> [155376.045441] FS:  0000000000000000(0000) GS:ffff88903f8c0000(0000) knlGS:0000000000000000
> [155376.061804] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [155376.073474] CR2: 00007fe407603080 CR3: 0000000b19993006 CR4: 00000000007706e0
> [155376.087928] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [155376.102383] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [155376.116841] PKRU: 55555554
> [155376.122427] Call Trace:
> [155376.127497]  <IRQ>
> [155376.131697]  tcp_fastretrans_alert+0x988/0x9e0
> [155376.140774]  tcp_ack+0x8b4/0x1360
> [155376.147581]  ? __cgroup_bpf_run_filter_skb+0x185/0x440
> [155376.158037]  tcp_rcv_established+0x135/0x660
> [155376.166755]  ? sk_filter_trim_cap+0xbc/0x220
> [155376.175472]  tcp_v6_do_rcv+0xbe/0x3e0
> [155376.182974]  tcp_v6_rcv+0xc01/0xc90
> [155376.190128]  ip6_protocol_deliver_rcu+0xbd/0x450
> [155376.199541]  ip6_input_finish+0x3d/0x60
> [155376.207388]  ip6_input+0xb5/0xc0
> [155376.214019]  ip6_sublist_rcv_finish+0x37/0x50
> [155376.222912]  ip6_sublist_rcv+0x1dd/0x270
> [155376.230935]  ipv6_list_rcv+0x113/0x140
> [155376.238618]  __netif_receive_skb_list_core+0x1a0/0x210
> [155376.249080]  netif_receive_skb_list_internal+0x186/0x2a0
> [155376.259887]  gro_normal_list.part.171+0x19/0x40
> [155376.269137]  napi_complete_done+0x65/0x150
> [155376.277514]  bnxt_poll_p5+0x25b/0x2b0
> [155376.285027]  ? tcp_write_xmit+0x278/0x1060
> [155376.293398]  __napi_poll+0x25/0x120
> [155376.300552]  net_rx_action+0x189/0x300
> [155376.308227]  __do_softirq+0xbb/0x271
> [155376.315554]  irq_exit_rcu+0x97/0xa0
> [155376.322710]  common_interrupt+0x7f/0xa0
> [155376.330566]  </IRQ>
> [155376.334947]  asm_common_interrupt+0x1e/0x40
> [155376.343499] RIP: 0010:cpuidle_enter_state+0xc2/0x340
> [155376.353619] Code: 48 89 c5 0f 1f 44 00 00 31 ff e8 f9 8d 73 ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 38 02 00 00 31 ff e8 b2 36 79 ff fb 45 85 f6 <0f> 88 e8 00 00 00 49 63 d6 48 2b 2c 24 48 6b ca 68 48 8d 04 52 48
> [155376.391367] RSP: 0018:ffffc90000293e90 EFLAGS: 00000202
> [155376.402004] RAX: ffff88903f8eaa80 RBX: ffffe8ffff4c6c00 RCX: 000000000000001f
> [155376.416456] RDX: 00008d503c63f228 RSI: 000000005ba4b680 RDI: 0000000000000000
> [155376.430910] RBP: 00008d503c63f228 R08: 0000000000000002 R09: 000000000002a300
> [155376.445372] R10: 002435b781e62f4a R11: ffff88903f8e9a84 R12: 0000000000000001
> [155376.459832] R13: ffffffff83621200 R14: 0000000000000001 R15: 0000000000000000
> [155376.474307]  cpuidle_enter+0x29/0x40
> [155376.481654]  do_idle+0x1bb/0x200
> [155376.488297]  cpu_startup_entry+0x19/0x20
> [155376.496320]  start_secondary+0x104/0x140
> [155376.504342]  secondary_startup_64_no_verify+0xb0/0xbb
> [155376.514629] ---[ end trace 9b428a0d7901c3ff ]---
> [155376.524041] TCP: Socket Info: family=10 state=1 sport=12345 dport=57616 ccname=ned_tcp_dctcp01 cwnd=1
> [155376.524045] TCP: saddr=XXXX:XXXX:XXXX:XXXX:XXXX:0000:00c8:0000 daddr=XXXX:XXXX:XXXX:XXXX:XXXX:0000:0288:00

Try to give us symbols with scripts/decode_stacktrace.sh , thanks.

Also 5.12 is kind of old :/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-09-03 16:42 ` Eric Dumazet
  2022-09-05 13:44   ` Breno Leitao
@ 2022-12-07 17:37   ` Breno Leitao
  2022-12-07 17:59     ` Eric Dumazet
  1 sibling, 1 reply; 14+ messages in thread
From: Breno Leitao @ 2022-12-07 17:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Jakub Kicinski, netdev, leit, Hideaki YOSHIFUJI,
	Paolo Abeni, David Ahern, LKML

On Sat, Sep 03, 2022 at 09:42:43AM -0700, Eric Dumazet wrote:
> On Wed, Aug 31, 2022 at 6:38 AM Breno Leitao <leitao@debian.org> wrote:
> >
> > There are cases where we need information about the socket during a
> > warning, so, it could help us to find bugs that happens that do not have
> > a easily repro.
> >
> > BPF congestion control algorithms can change socket state in unexpected
> > ways, leading to WARNings. Additional information about the socket state
> > is useful to identify the culprit.
> 
> Well, this suggests we need to fix BPF side ?
> 
> Not sure how this can happen, because TCP_BPF_IW has
> 
> if (val <= 0 || tp->data_segs_out > tp->syn_data)
>      ret = -EINVAL;
> else
>     tcp_snd_cwnd_set(tp, val);

I am not sure we are hitting this path, please check the stack below

> It seems you already found the issue in an eBPF CC, can you share the details ?

Sure, here is an example of what we are facing (with some obfuscation).
Remeber that there are users' BPF application running:

[155375.750105] ------------[ cut here ]------------
[155375.759526] WARNING: CPU: 19 PID: 0 at net/ipv4/tcp.c:4552 tcp_sock_warn+0x6/0x20
[155375.774700] Modules linked in: vhost_net tun vhost vhost_iotlb tap virtio_net net_failover failover mpls_gso mpls_iptunnel mpls_router ip_tunnel fou ip6_udp_tunnel udp_tunnel sch_fq sunrpc bpf_preload tls act_gact cls_bpf tcp_diag inet_diag skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp iTCO_wdt kvm_intel iTCO_vendor_support kvm evdev ses irqbypass enclosure i2c_i801 i2c_smbus ipmi_si ipmi_devintf ipmi_msghandler acpi_cpufreq button tpm_crb sch_fq_codel fuse sg nvme mpi3mr scsi_transport_sas nvme_core xhci_pci xhci_hcd loop efivarfs autofs4
[155375.874916] CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G S                5.12.0-0_XXXXXXXX_g0fed6f189e14 #1
[155375.898770] Hardware name: XXXX XXX XXXX XXXXX, BIOS BS_BIOS_XXX XX/XX/2022
[155375.916015] RIP: 0010:tcp_sock_warn+0x6/0x20
[155375.924755] Code: 4d 01 3e 85 c0 0f 84 57 ff ff ff 48 8b 0c 24 44 8b 01 eb 82 e8 eb b7 14 00 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 53 <0f> 0b 48 85 ff 0f 85 77 70 14 00 5b c3 66 66 2e 0f 1f 84 00 00 00
[155375.962518] RSP: 0018:ffffc90000d08988 EFLAGS: 00010246
[155375.973157] RAX: ffff88817432b5c0 RBX: ffff88828f748000 RCX: ffffc90000d08a34
[155375.987614] RDX: 0000000000000000 RSI: 00000000822be8c8 RDI: ffff88828f748000
[155376.002074] RBP: 0000000000000040 R08: ffffc90000d08a38 R09: 00000000822be334
[155376.016531] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[155376.030988] R13: ffffc90000d08a34 R14: 0000000000005546 R15: 0000000000000000
[155376.045441] FS:  0000000000000000(0000) GS:ffff88903f8c0000(0000) knlGS:0000000000000000
[155376.061804] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[155376.073474] CR2: 00007fe407603080 CR3: 0000000b19993006 CR4: 00000000007706e0
[155376.087928] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[155376.102383] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[155376.116841] PKRU: 55555554
[155376.122427] Call Trace:
[155376.127497]  <IRQ>
[155376.131697]  tcp_fastretrans_alert+0x988/0x9e0
[155376.140774]  tcp_ack+0x8b4/0x1360
[155376.147581]  ? __cgroup_bpf_run_filter_skb+0x185/0x440
[155376.158037]  tcp_rcv_established+0x135/0x660
[155376.166755]  ? sk_filter_trim_cap+0xbc/0x220
[155376.175472]  tcp_v6_do_rcv+0xbe/0x3e0
[155376.182974]  tcp_v6_rcv+0xc01/0xc90
[155376.190128]  ip6_protocol_deliver_rcu+0xbd/0x450
[155376.199541]  ip6_input_finish+0x3d/0x60
[155376.207388]  ip6_input+0xb5/0xc0
[155376.214019]  ip6_sublist_rcv_finish+0x37/0x50
[155376.222912]  ip6_sublist_rcv+0x1dd/0x270
[155376.230935]  ipv6_list_rcv+0x113/0x140
[155376.238618]  __netif_receive_skb_list_core+0x1a0/0x210
[155376.249080]  netif_receive_skb_list_internal+0x186/0x2a0
[155376.259887]  gro_normal_list.part.171+0x19/0x40
[155376.269137]  napi_complete_done+0x65/0x150
[155376.277514]  bnxt_poll_p5+0x25b/0x2b0
[155376.285027]  ? tcp_write_xmit+0x278/0x1060
[155376.293398]  __napi_poll+0x25/0x120
[155376.300552]  net_rx_action+0x189/0x300
[155376.308227]  __do_softirq+0xbb/0x271
[155376.315554]  irq_exit_rcu+0x97/0xa0
[155376.322710]  common_interrupt+0x7f/0xa0
[155376.330566]  </IRQ>
[155376.334947]  asm_common_interrupt+0x1e/0x40
[155376.343499] RIP: 0010:cpuidle_enter_state+0xc2/0x340
[155376.353619] Code: 48 89 c5 0f 1f 44 00 00 31 ff e8 f9 8d 73 ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 38 02 00 00 31 ff e8 b2 36 79 ff fb 45 85 f6 <0f> 88 e8 00 00 00 49 63 d6 48 2b 2c 24 48 6b ca 68 48 8d 04 52 48
[155376.391367] RSP: 0018:ffffc90000293e90 EFLAGS: 00000202
[155376.402004] RAX: ffff88903f8eaa80 RBX: ffffe8ffff4c6c00 RCX: 000000000000001f
[155376.416456] RDX: 00008d503c63f228 RSI: 000000005ba4b680 RDI: 0000000000000000
[155376.430910] RBP: 00008d503c63f228 R08: 0000000000000002 R09: 000000000002a300
[155376.445372] R10: 002435b781e62f4a R11: ffff88903f8e9a84 R12: 0000000000000001
[155376.459832] R13: ffffffff83621200 R14: 0000000000000001 R15: 0000000000000000
[155376.474307]  cpuidle_enter+0x29/0x40
[155376.481654]  do_idle+0x1bb/0x200
[155376.488297]  cpu_startup_entry+0x19/0x20
[155376.496320]  start_secondary+0x104/0x140
[155376.504342]  secondary_startup_64_no_verify+0xb0/0xbb
[155376.514629] ---[ end trace 9b428a0d7901c3ff ]---
[155376.524041] TCP: Socket Info: family=10 state=1 sport=12345 dport=57616 ccname=ned_tcp_dctcp01 cwnd=1
[155376.524045] TCP: saddr=XXXX:XXXX:XXXX:XXXX:XXXX:0000:00c8:0000 daddr=XXXX:XXXX:XXXX:XXXX:XXXX:0000:0288:00

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-09-03 16:42 ` Eric Dumazet
@ 2022-09-05 13:44   ` Breno Leitao
  2022-12-07 17:37   ` Breno Leitao
  1 sibling, 0 replies; 14+ messages in thread
From: Breno Leitao @ 2022-09-05 13:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Jakub Kicinski, netdev, leit, Hideaki YOSHIFUJI,
	Paolo Abeni, David Ahern, LKML

Hello Eric,

On Sat, Sep 03, 2022 at 09:42:43AM -0700, Eric Dumazet wrote:
> On Wed, Aug 31, 2022 at 6:38 AM Breno Leitao <leitao@debian.org> wrote:
> >
> > There are cases where we need information about the socket during a
> > warning, so, it could help us to find bugs that happens that do not have
> > a easily repro.
> >
> > BPF congestion control algorithms can change socket state in unexpected
> > ways, leading to WARNings. Additional information about the socket state
> > is useful to identify the culprit.

A little bit of more context here. We hit this warning in production
several hundred times a day. We don't know exactly where it is coming
from, that is why this patch is being proposed.

> Well, this suggests we need to fix BPF side ?

This patch might help us to identify who is the culprit that is setting
the wrong value in the congestion window. If the problem is on the BPF
side, we probably need to Fix BPF side, for sure.

> It seems you already found the issue in an eBPF CC, can you share the details ?

Not really.  I've applied this patch into our internal kernel, and we
might soon find more information of what is causing this warning.

> > This diff creates a TCP socket-specific version of WARN_ON_ONCE(), and
> > attaches it to tcp_snd_cwnd_set().
> 
> Well, I feel this will need constant additions... the state of a
> custom BPF CC is opaque to core TCP stack anyway ?
> 
> >
> > Signed-off-by: Breno Leitao <leitao@debian.org>
> > ---
> >  include/net/tcp.h       |  3 ++-
> >  include/net/tcp_debug.h | 10 ++++++++++
> >  net/ipv4/tcp.c          | 30 ++++++++++++++++++++++++++++++
> >  3 files changed, 42 insertions(+), 1 deletion(-)
> >  create mode 100644 include/net/tcp_debug.h
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index d10962b9f0d0..73c3970d8839 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -40,6 +40,7 @@
> >  #include <net/inet_ecn.h>
> >  #include <net/dst.h>
> >  #include <net/mptcp.h>
> > +#include <net/tcp_debug.h>
> >
> >  #include <linux/seq_file.h>
> >  #include <linux/memcontrol.h>
> > @@ -1222,7 +1223,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
> >
> >  static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
> >  {
> > -       WARN_ON_ONCE((int)val <= 0);
> > +       TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
> >         tp->snd_cwnd = val;
> >  }
> >
> > diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
> > new file mode 100644
> > index 000000000000..50e96d87d335
> > --- /dev/null
> > +++ b/include/net/tcp_debug.h
> > @@ -0,0 +1,10 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _LINUX_TCP_DEBUG_H
> > +#define _LINUX_TCP_DEBUG_H
> > +
> > +void tcp_sock_warn(const struct tcp_sock *tp);
> > +
> > +#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
> > +               DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
> > +
> > +#endif  /* _LINUX_TCP_DEBUG_H */
> > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > index bbe218753662..71771fee72f7 100644
> > --- a/net/ipv4/tcp.c
> > +++ b/net/ipv4/tcp.c
> > @@ -4684,6 +4684,36 @@ int tcp_abort(struct sock *sk, int err)
> >  }
> >  EXPORT_SYMBOL_GPL(tcp_abort);
> >
> > +void tcp_sock_warn(const struct tcp_sock *tp)
> > +{
> > +       const struct sock *sk = (const struct sock *)tp;
> > +       struct inet_sock *inet = inet_sk(sk);
> > +       struct inet_connection_sock *icsk = inet_csk(sk);
> > +
> > +       WARN_ON(1);
> > +
> > +       if (!tp)
> > +               return;
> > +
> > +       pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> > +               sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> > +               ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> > +
> > +       switch (sk->sk_family) {
> > +       case AF_INET:
> > +               pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> > +                       &inet->inet_daddr);
> > +               break;
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +       case AF_INET6:
> > +               pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
> > +                       &sk->sk_v6_daddr);
> > +               break;
> > +#endif
> > +       }
> > +}
> > +EXPORT_SYMBOL_GPL(tcp_sock_warn);
> > +
> >  extern struct tcp_congestion_ops tcp_reno;
> >
> >  static __initdata unsigned long thash_entries;
> > --
> > 2.30.2
> >

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
  2022-08-31 13:37 Breno Leitao
@ 2022-09-03 16:42 ` Eric Dumazet
  2022-09-05 13:44   ` Breno Leitao
  2022-12-07 17:37   ` Breno Leitao
  0 siblings, 2 replies; 14+ messages in thread
From: Eric Dumazet @ 2022-09-03 16:42 UTC (permalink / raw)
  To: Breno Leitao
  Cc: David Miller, Jakub Kicinski, netdev, leit, Hideaki YOSHIFUJI,
	Paolo Abeni, David Ahern, LKML

On Wed, Aug 31, 2022 at 6:38 AM Breno Leitao <leitao@debian.org> wrote:
>
> There are cases where we need information about the socket during a
> warning, so, it could help us to find bugs that happens that do not have
> a easily repro.
>
> BPF congestion control algorithms can change socket state in unexpected
> ways, leading to WARNings. Additional information about the socket state
> is useful to identify the culprit.

Well, this suggests we need to fix BPF side ?

Not sure how this can happen, because TCP_BPF_IW has

if (val <= 0 || tp->data_segs_out > tp->syn_data)
     ret = -EINVAL;
else
    tcp_snd_cwnd_set(tp, val);

It seems you already found the issue in an eBPF CC, can you share the details ?


>
> This diff creates a TCP socket-specific version of WARN_ON_ONCE(), and
> attaches it to tcp_snd_cwnd_set().

Well, I feel this will need constant additions... the state of a
custom BPF CC is opaque to core TCP stack anyway ?

>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  include/net/tcp.h       |  3 ++-
>  include/net/tcp_debug.h | 10 ++++++++++
>  net/ipv4/tcp.c          | 30 ++++++++++++++++++++++++++++++
>  3 files changed, 42 insertions(+), 1 deletion(-)
>  create mode 100644 include/net/tcp_debug.h
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index d10962b9f0d0..73c3970d8839 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -40,6 +40,7 @@
>  #include <net/inet_ecn.h>
>  #include <net/dst.h>
>  #include <net/mptcp.h>
> +#include <net/tcp_debug.h>
>
>  #include <linux/seq_file.h>
>  #include <linux/memcontrol.h>
> @@ -1222,7 +1223,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
>
>  static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
>  {
> -       WARN_ON_ONCE((int)val <= 0);
> +       TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
>         tp->snd_cwnd = val;
>  }
>
> diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
> new file mode 100644
> index 000000000000..50e96d87d335
> --- /dev/null
> +++ b/include/net/tcp_debug.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_TCP_DEBUG_H
> +#define _LINUX_TCP_DEBUG_H
> +
> +void tcp_sock_warn(const struct tcp_sock *tp);
> +
> +#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
> +               DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
> +
> +#endif  /* _LINUX_TCP_DEBUG_H */
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index bbe218753662..71771fee72f7 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4684,6 +4684,36 @@ int tcp_abort(struct sock *sk, int err)
>  }
>  EXPORT_SYMBOL_GPL(tcp_abort);
>
> +void tcp_sock_warn(const struct tcp_sock *tp)
> +{
> +       const struct sock *sk = (const struct sock *)tp;
> +       struct inet_sock *inet = inet_sk(sk);
> +       struct inet_connection_sock *icsk = inet_csk(sk);
> +
> +       WARN_ON(1);
> +
> +       if (!tp)
> +               return;
> +
> +       pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
> +               sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
> +               ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
> +
> +       switch (sk->sk_family) {
> +       case AF_INET:
> +               pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
> +                       &inet->inet_daddr);
> +               break;
> +#if IS_ENABLED(CONFIG_IPV6)
> +       case AF_INET6:
> +               pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
> +                       &sk->sk_v6_daddr);
> +               break;
> +#endif
> +       }
> +}
> +EXPORT_SYMBOL_GPL(tcp_sock_warn);
> +
>  extern struct tcp_congestion_ops tcp_reno;
>
>  static __initdata unsigned long thash_entries;
> --
> 2.30.2
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE()
@ 2022-08-31 13:37 Breno Leitao
  2022-09-03 16:42 ` Eric Dumazet
  0 siblings, 1 reply; 14+ messages in thread
From: Breno Leitao @ 2022-08-31 13:37 UTC (permalink / raw)
  To: edumazet, davem, kuba
  Cc: netdev, leit, yoshfuji, pabeni, dsahern, linux-kernel

There are cases where we need information about the socket during a
warning, so, it could help us to find bugs that happens that do not have
a easily repro.

BPF congestion control algorithms can change socket state in unexpected
ways, leading to WARNings. Additional information about the socket state
is useful to identify the culprit.

This diff creates a TCP socket-specific version of WARN_ON_ONCE(), and
attaches it to tcp_snd_cwnd_set().

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 include/net/tcp.h       |  3 ++-
 include/net/tcp_debug.h | 10 ++++++++++
 net/ipv4/tcp.c          | 30 ++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 include/net/tcp_debug.h

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d10962b9f0d0..73c3970d8839 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -40,6 +40,7 @@
 #include <net/inet_ecn.h>
 #include <net/dst.h>
 #include <net/mptcp.h>
+#include <net/tcp_debug.h>
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
@@ -1222,7 +1223,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
 
 static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
 {
-	WARN_ON_ONCE((int)val <= 0);
+	TCP_SOCK_WARN_ON_ONCE(tp, (int)val <= 0);
 	tp->snd_cwnd = val;
 }
 
diff --git a/include/net/tcp_debug.h b/include/net/tcp_debug.h
new file mode 100644
index 000000000000..50e96d87d335
--- /dev/null
+++ b/include/net/tcp_debug.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TCP_DEBUG_H
+#define _LINUX_TCP_DEBUG_H
+
+void tcp_sock_warn(const struct tcp_sock *tp);
+
+#define TCP_SOCK_WARN_ON_ONCE(tcp_sock, condition) \
+		DO_ONCE_LITE_IF(condition, tcp_sock_warn, tcp_sock)
+
+#endif  /* _LINUX_TCP_DEBUG_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bbe218753662..71771fee72f7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4684,6 +4684,36 @@ int tcp_abort(struct sock *sk, int err)
 }
 EXPORT_SYMBOL_GPL(tcp_abort);
 
+void tcp_sock_warn(const struct tcp_sock *tp)
+{
+	const struct sock *sk = (const struct sock *)tp;
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	WARN_ON(1);
+
+	if (!tp)
+		return;
+
+	pr_warn("Socket Info: family=%u state=%d sport=%u dport=%u ccname=%s cwnd=%u",
+		sk->sk_family, sk->sk_state, ntohs(inet->inet_sport),
+		ntohs(inet->inet_dport), icsk->icsk_ca_ops->name, tcp_snd_cwnd(tp));
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		pr_warn("saddr=%pI4 daddr=%pI4", &inet->inet_saddr,
+			&inet->inet_daddr);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		pr_warn("saddr=%pI6 daddr=%pI6", &sk->sk_v6_rcv_saddr,
+			&sk->sk_v6_daddr);
+		break;
+#endif
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_sock_warn);
+
 extern struct tcp_congestion_ops tcp_reno;
 
 static __initdata unsigned long thash_entries;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-12-09  8:39 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-24 11:22 [PATCH RESEND net-next] tcp: socket-specific version of WARN_ON_ONCE() Breno Leitao
2022-11-29  1:00 ` Kuniyuki Iwashima
2022-11-29 12:47   ` Breno Leitao
2022-11-29 21:16     ` Iwashima, Kuniyuki
2022-11-30 13:18       ` Breno Leitao
2022-11-29 10:18 ` Paolo Abeni
2022-11-30  2:28   ` Jakub Kicinski
  -- strict thread matches above, loose matches on Subject: below --
2022-08-31 13:37 Breno Leitao
2022-09-03 16:42 ` Eric Dumazet
2022-09-05 13:44   ` Breno Leitao
2022-12-07 17:37   ` Breno Leitao
2022-12-07 17:59     ` Eric Dumazet
2022-12-08 15:44       ` Breno Leitao
2022-12-09  8:39         ` Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.