linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] net/mlx4: tcp_drop replace of tcp_drop_new
@ 2021-08-21 15:53 jony-one
  2021-08-23  0:21 ` Yonghong Song
  2021-08-23 19:06 ` Cong Wang
  0 siblings, 2 replies; 3+ messages in thread
From: jony-one @ 2021-08-21 15:53 UTC (permalink / raw)
  To: kuba
  Cc: edumazet, rostedt, mingo, davem, yoshfuji, dsahern, netdev,
	linux-kernel, hengqi.chen, yhs, jony-one

We never know why we are deleting a tcp packet when we delete it,
and the tcp_drop_new() function can effectively solve this problem.
The tcp_drop_new() will learn from the specified status code why the
packet was deleted, and the caller from whom the packet was deleted.
The kernel should be a little more open to data that is about to be
destroyed and useless, and users should be able to keep track of it.

Signed-off-by: jony-one <yan2228598786@gmail.com>
---
 include/trace/events/tcp.h | 51 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       | 29 ++++++++++++++--------
 2 files changed, 69 insertions(+), 11 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 521059d8d..5a0478440 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -371,6 +371,57 @@ DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
 	TP_ARGS(skb)
 );
 
+/*
+ * tcp event with arguments sk, skb, reason
+ */
+TRACE_EVENT(tcp_drop_new,
+
+		TP_PROTO(struct sock *sk, struct sk_buff *skb, const char *reason),
+
+		TP_ARGS(sk, skb, reason),
+
+		TP_STRUCT__entry(
+			__field(const void *, skbaddr)
+			__field(const void *, skaddr)
+			__string(reason, reason)
+			__field(int, state)
+			__field(__u16, sport)
+			__field(__u16, dport)
+			__array(__u8, saddr, 4)
+			__array(__u8, daddr, 4)
+			__array(__u8, saddr_v6, 16)
+			__array(__u8, daddr_v6, 16)
+		),
+
+		TP_fast_assign(
+			struct inet_sock *inet = inet_sk(sk);
+			__be32 *p32;
+
+			__assign_str(reason, reason);
+
+			__entry->skbaddr = skb;
+			__entry->skaddr = sk;
+			__entry->state = sk->sk_state;
+
+			__entry->sport = ntohs(inet->inet_sport);
+			__entry->dport = ntohs(inet->inet_dport);
+
+			p32 = (__be32 *) __entry->saddr;
+			*p32 = inet->inet_saddr;
+
+			p32 = (__be32 *) __entry->daddr;
+			*p32 =  inet->inet_daddr;
+
+			TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+				sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+		),
+
+		TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s reason=%s",
+				__entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
+				__entry->saddr_v6, __entry->daddr_v6,
+				show_tcp_state_name(__entry->state), __get_str(reason))
+);
+
 #endif /* _TRACE_TCP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 149ceb5c9..988989e25 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4649,6 +4649,13 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
+static void tcp_drop_new(struct sock *sk, struct sk_buff *skb, const char *reason)
+{
+	trace_tcp_drop_new(sk, skb, reason);
+	sk_drops_add(sk, skb);
+	__kfree_skb(skb);
+}
+
 /* This one checks to see if we can put data from the
  * out_of_order queue into the receive_queue.
  */
@@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
 		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
 
 		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
-			tcp_drop(sk, skb);
+			tcp_drop_new(sk, skb, __func__);
 			continue;
 		}
 
@@ -4732,7 +4739,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
 		sk->sk_data_ready(sk);
-		tcp_drop(sk, skb);
+		tcp_drop_new(sk, skb, __func__);
 		return;
 	}
 
@@ -4795,7 +4802,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 				/* All the bits are present. Drop. */
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				tcp_drop(sk, skb);
+				tcp_drop_new(sk, skb, __func__);
 				skb = NULL;
 				tcp_dsack_set(sk, seq, end_seq);
 				goto add_sack;
@@ -4814,7 +4821,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 						 TCP_SKB_CB(skb1)->end_seq);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				tcp_drop(sk, skb1);
+				tcp_drop_new(sk, skb1, __func__);
 				goto merge_right;
 			}
 		} else if (tcp_ooo_try_coalesce(sk, skb1,
@@ -4842,7 +4849,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
 				 TCP_SKB_CB(skb1)->end_seq);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-		tcp_drop(sk, skb1);
+		tcp_drop_new(sk, skb1, __func__);
 	}
 	/* If there is no skb after us, we are the last_skb ! */
 	if (!skb1)
@@ -5019,7 +5026,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 		inet_csk_schedule_ack(sk);
 drop:
-		tcp_drop(sk, skb);
+		tcp_drop_new(sk, skb, __func__);
 		return;
 	}
 
@@ -5276,7 +5283,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
 		goal -= rb_to_skb(node)->truesize;
-		tcp_drop(sk, rb_to_skb(node));
+		tcp_drop_new(sk, rb_to_skb(node), __func__);
 		if (!prev || goal <= 0) {
 			sk_mem_reclaim(sk);
 			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -5701,7 +5708,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 	return true;
 
 discard:
-	tcp_drop(sk, skb);
+	tcp_drop_new(sk, skb, __func__);
 	return false;
 }
 
@@ -5905,7 +5912,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
 
 discard:
-	tcp_drop(sk, skb);
+	tcp_drop_new(sk, skb, __func__);
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
@@ -6196,7 +6203,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
 discard:
-			tcp_drop(sk, skb);
+			tcp_drop_new(sk, skb, __func__);
 			return 0;
 		} else {
 			tcp_send_ack(sk);
@@ -6568,7 +6575,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
 	if (!queued) {
 discard:
-		tcp_drop(sk, skb);
+		tcp_drop_new(sk, skb, __func__);
 	}
 	return 0;
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] net/mlx4: tcp_drop replace of tcp_drop_new
  2021-08-21 15:53 [PATCH] net/mlx4: tcp_drop replace of tcp_drop_new jony-one
@ 2021-08-23  0:21 ` Yonghong Song
  2021-08-23 19:06 ` Cong Wang
  1 sibling, 0 replies; 3+ messages in thread
From: Yonghong Song @ 2021-08-23  0:21 UTC (permalink / raw)
  To: jony-one, kuba
  Cc: edumazet, rostedt, mingo, davem, yoshfuji, dsahern, netdev,
	linux-kernel, hengqi.chen



On 8/21/21 8:53 AM, jony-one wrote:
> We never know why we are deleting a tcp packet when we delete it,
> and the tcp_drop_new() function can effectively solve this problem.
> The tcp_drop_new() will learn from the specified status code why the
> packet was deleted, and the caller from whom the packet was deleted.
> The kernel should be a little more open to data that is about to be
> destroyed and useless, and users should be able to keep track of it.

For the subject, prefix "net/mlx4" is not accurate. This patch has
nothing to do with mlx4. Just use "net". The subject is not accurate
too, maybe "introduce tracepoint tcp_drop". Tracepoint name "tcp_drop"
seems more accurate compared to "tcp_drop_new".

It is worthwhile to mention the context/why we want to add this
tracepoint with bcc issue https://github.com/iovisor/bcc/issues/3533.
Mainly two reasons: (1). tcp_drop is a tiny function which
may easily get inlined, a tracepoint is more stable, and (2).
tcp_drop does not provide enough information on why it is dropped.

> 
> Signed-off-by: jony-one <yan2228598786@gmail.com>

Please use your proper name. "jony-one" is not a good one.

> ---
>   include/trace/events/tcp.h | 51 ++++++++++++++++++++++++++++++++++++++
>   net/ipv4/tcp_input.c       | 29 ++++++++++++++--------
>   2 files changed, 69 insertions(+), 11 deletions(-)
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 521059d8d..5a0478440 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -371,6 +371,57 @@ DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
>   	TP_ARGS(skb)
>   );
>   
> +/*
> + * tcp event with arguments sk, skb, reason
> + */
> +TRACE_EVENT(tcp_drop_new,

tcp_drop instead of tcp_drop_new?

> +
> +		TP_PROTO(struct sock *sk, struct sk_buff *skb, const char *reason),
> +
> +		TP_ARGS(sk, skb, reason),
> +
> +		TP_STRUCT__entry(
> +			__field(const void *, skbaddr)
> +			__field(const void *, skaddr)
> +			__string(reason, reason)

I understand we may want to print out the reason with more
elaborate reason, but for kernel internal implementation
an enum should be enough. See tracepoint sched/sched_switch.

> +			__field(int, state)
> +			__field(__u16, sport)
> +			__field(__u16, dport)
> +			__array(__u8, saddr, 4)
> +			__array(__u8, daddr, 4)
> +			__array(__u8, saddr_v6, 16)
> +			__array(__u8, daddr_v6, 16)
> +		),
> +
> +		TP_fast_assign(
> +			struct inet_sock *inet = inet_sk(sk);
> +			__be32 *p32;
> +
> +			__assign_str(reason, reason);
> +
> +			__entry->skbaddr = skb;
> +			__entry->skaddr = sk;
> +			__entry->state = sk->sk_state;
> +
> +			__entry->sport = ntohs(inet->inet_sport);
> +			__entry->dport = ntohs(inet->inet_dport);
> +
> +			p32 = (__be32 *) __entry->saddr;
> +			*p32 = inet->inet_saddr;
> +
> +			p32 = (__be32 *) __entry->daddr;
> +			*p32 =  inet->inet_daddr;
> +
> +			TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> +				sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +		),
> +
> +		TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s reason=%s",
> +				__entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> +				__entry->saddr_v6, __entry->daddr_v6,
> +				show_tcp_state_name(__entry->state), __get_str(reason))
> +);
> +
>   #endif /* _TRACE_TCP_H */
>   
>   /* This part must be outside protection */
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 149ceb5c9..988989e25 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4649,6 +4649,13 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
>   	__kfree_skb(skb);
>   }
>   
> +static void tcp_drop_new(struct sock *sk, struct sk_buff *skb, const char *reason)

You can rename funciton name to __tcp_drop(). tcp_drop_new() is not a 
good name.

> +{
> +	trace_tcp_drop_new(sk, skb, reason);
> +	sk_drops_add(sk, skb);
> +	__kfree_skb(skb);
> +}
> +
>   /* This one checks to see if we can put data from the
>    * out_of_order queue into the receive_queue.
>    */
> @@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
>   		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
>   
>   		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
> -			tcp_drop(sk, skb);
> +			tcp_drop_new(sk, skb, __func__);

use enumeration?

>   			continue;
>   		}
>   
> @@ -4732,7 +4739,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
>   		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
>   		sk->sk_data_ready(sk);
> -		tcp_drop(sk, skb);
> +		tcp_drop_new(sk, skb, __func__);
>   		return;
>   	}
>   
> @@ -4795,7 +4802,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   				/* All the bits are present. Drop. */
>   				NET_INC_STATS(sock_net(sk),
>   					      LINUX_MIB_TCPOFOMERGE);
> -				tcp_drop(sk, skb);
> +				tcp_drop_new(sk, skb, __func__);
>   				skb = NULL;
>   				tcp_dsack_set(sk, seq, end_seq);
>   				goto add_sack;
> @@ -4814,7 +4821,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   						 TCP_SKB_CB(skb1)->end_seq);
>   				NET_INC_STATS(sock_net(sk),
>   					      LINUX_MIB_TCPOFOMERGE);
> -				tcp_drop(sk, skb1);
> +				tcp_drop_new(sk, skb1, __func__);

This one and the above one are in the same function so they will have
the same reason. It would be good if we can differentiate them.

>   				goto merge_right;
>   			}
>   		} else if (tcp_ooo_try_coalesce(sk, skb1,
> @@ -4842,7 +4849,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
>   				 TCP_SKB_CB(skb1)->end_seq);
>   		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
> -		tcp_drop(sk, skb1);
> +		tcp_drop_new(sk, skb1, __func__);
>   	}
>   	/* If there is no skb after us, we are the last_skb ! */
>   	if (!skb1)
> @@ -5019,7 +5026,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
>   		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
>   		inet_csk_schedule_ack(sk);
>   drop:
> -		tcp_drop(sk, skb);
> +		tcp_drop_new(sk, skb, __func__);
>   		return;
>   	}
>   
> @@ -5276,7 +5283,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>   		prev = rb_prev(node);
>   		rb_erase(node, &tp->out_of_order_queue);
>   		goal -= rb_to_skb(node)->truesize;
> -		tcp_drop(sk, rb_to_skb(node));
> +		tcp_drop_new(sk, rb_to_skb(node), __func__);
>   		if (!prev || goal <= 0) {
>   			sk_mem_reclaim(sk);
>   			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> @@ -5701,7 +5708,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
>   	return true;
>   
>   discard:
> -	tcp_drop(sk, skb);
> +	tcp_drop_new(sk, skb, __func__);
>   	return false;
>   }
>   
> @@ -5905,7 +5912,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
>   	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
>   
>   discard:
> -	tcp_drop(sk, skb);
> +	tcp_drop_new(sk, skb, __func__);
>   }
>   EXPORT_SYMBOL(tcp_rcv_established);
>   
> @@ -6196,7 +6203,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
>   						  TCP_DELACK_MAX, TCP_RTO_MAX);
>   
>   discard:
> -			tcp_drop(sk, skb);
> +			tcp_drop_new(sk, skb, __func__);
>   			return 0;
>   		} else {
>   			tcp_send_ack(sk);
> @@ -6568,7 +6575,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
>   
>   	if (!queued) {
>   discard:
> -		tcp_drop(sk, skb);
> +		tcp_drop_new(sk, skb, __func__);
>   	}
>   	return 0;
>   }
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] net/mlx4: tcp_drop replace of tcp_drop_new
  2021-08-21 15:53 [PATCH] net/mlx4: tcp_drop replace of tcp_drop_new jony-one
  2021-08-23  0:21 ` Yonghong Song
@ 2021-08-23 19:06 ` Cong Wang
  1 sibling, 0 replies; 3+ messages in thread
From: Cong Wang @ 2021-08-23 19:06 UTC (permalink / raw)
  To: jony-one
  Cc: Jakub Kicinski, Eric Dumazet, Steven Rostedt, Ingo Molnar,
	David Miller, Hideaki YOSHIFUJI, dsahern,
	Linux Kernel Network Developers, LKML, hengqi.chen,
	Yonghong Song

On Sat, Aug 21, 2021 at 8:55 AM jony-one <yan2228598786@gmail.com> wrote:
> @@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
>                 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
>
>                 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
> -                       tcp_drop(sk, skb);
> +                       tcp_drop_new(sk, skb, __func__);

This is very similar to trace_kfree_skb():

void kfree_skb(struct sk_buff *skb)
{
        if (!skb_unref(skb))
                return;

        trace_kfree_skb(skb, __builtin_return_address(0));
        __kfree_skb(skb);
}

So... why not something like this?

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f7bd7ae7d7a..cc840e4552c9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4678,6 +4678,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 {
        sk_drops_add(sk, skb);
+       trace_kfree_skb(skb, __builtin_return_address(0));
        __kfree_skb(skb);
 }

Thanks.

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-08-23 19:07 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-21 15:53 [PATCH] net/mlx4: tcp_drop replace of tcp_drop_new jony-one
2021-08-23  0:21 ` Yonghong Song
2021-08-23 19:06 ` Cong Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).