netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] sock: add tracepoint for send recv length
@ 2023-01-07  3:59 Yunhui Cui
  2023-01-07 20:02 ` Cong Wang
  2023-01-08 21:34 ` Steven Rostedt
  0 siblings, 2 replies; 4+ messages in thread
From: Yunhui Cui @ 2023-01-07  3:59 UTC (permalink / raw)
  To: rostedt, mhiramat, davem, edumazet, kuba, pabeni, kuniyu,
	xiyou.wangcong, duanxiongchun, cuiyunhui, linux-kernel,
	linux-trace-kernel, netdev

Add 2 tracepoints to monitor the tcp/udp traffic
of per process and per cgroup.

Regarding monitoring the tcp/udp traffic of each process, there are two
existing solutions, the first one is https://www.atoptool.nl/netatop.php.
The second is via kprobe/kretprobe.

Netatop solution is implemented by registering the hook function at the
hook point provided by the netfilter framework.

These hook functions may be in the soft interrupt context and cannot
directly obtain the pid. Some data structures are added to bind packets
and processes. For example, struct taskinfobucket, struct taskinfo ...

Every time the process sends and receives packets it needs multiple
hashmaps,resulting in low performance and it has the problem fo inaccurate
tcp/udp traffic statistics(for example: multiple threads share sockets).

We can obtain the information with kretprobe, but as we know, kprobe gets
the result by trappig in an exception, which loses performance compared
to tracepoint.

We compared the performance of tracepoints with the above two methods, and
the results are as follows:

ab -n 1000000 -c 1000 -r http://127.0.0.1/index.html
without trace:
Time per request: 39.660 [ms] (mean)
Time per request: 0.040 [ms] (mean, across all concurrent requests)

netatop:
Time per request: 50.717 [ms] (mean)
Time per request: 0.051 [ms] (mean, across all concurrent requests)

kr:
Time per request: 43.168 [ms] (mean)
Time per request: 0.043 [ms] (mean, across all concurrent requests)

tracepoint:
Time per request: 41.004 [ms] (mean)
Time per request: 0.041 [ms] (mean, across all concurrent requests

It can be seen that tracepoint has better performance.

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
Signed-off-by: Xiongchun Duan <duanxiongchun@bytedance.com>
---
 include/trace/events/sock.h | 49 +++++++++++++++++++++++++++++++++++++
 net/socket.c                | 20 +++++++++++++--
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 777ee6cbe933..16092c0406a2 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -263,6 +263,55 @@ TRACE_EVENT(inet_sk_error_report,
 		  __entry->error)
 );
 
+/*
+ * sock send/recv msg length
+ */
+DECLARE_EVENT_CLASS(sock_msg_length,
+
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
+		 int flags),
+
+	TP_ARGS(sk, family, protocol, ret, flags),
+
+	TP_STRUCT__entry(
+		__field(void *, sk)
+		__field(__u16, family)
+		__field(__u16, protocol)
+		__field(int, length)
+		__field(int, error)
+		__field(int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->family = sk->sk_family;
+		__entry->protocol = sk->sk_protocol;
+		__entry->length = ret > 0 ? ret : 0;
+		__entry->error = ret < 0 ? ret : 0;
+		__entry->flags = flags;
+	),
+
+	TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
+			__entry->sk,
+			show_family_name(__entry->family),
+			show_inet_protocol_name(__entry->protocol),
+			__entry->length,
+			__entry->error, __entry->flags)
+);
+
+DEFINE_EVENT(sock_msg_length, sock_sendmsg_length,
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
+		 int flags),
+
+	TP_ARGS(sk, family, protocol, ret, flags)
+);
+
+DEFINE_EVENT(sock_msg_length, sock_recvmsg_length,
+	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
+		 int flags),
+
+	TP_ARGS(sk, family, protocol, ret, flags)
+);
 #endif /* _TRACE_SOCK_H */
 
 /* This part must be outside protection */
diff --git a/net/socket.c b/net/socket.c
index 888cd618a968..37578e8c685b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -106,6 +106,7 @@
 #include <net/busy_poll.h>
 #include <linux/errqueue.h>
 #include <linux/ptp_clock_kernel.h>
+#include <trace/events/sock.h>
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 unsigned int sysctl_net_busy_read __read_mostly;
@@ -715,6 +716,9 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
 				     inet_sendmsg, sock, msg,
 				     msg_data_left(msg));
 	BUG_ON(ret == -EIOCBQUEUED);
+
+	trace_sock_sendmsg_length(sock->sk, sock->sk->sk_family,
+				  sock->sk->sk_protocol, ret, 0);
 	return ret;
 }
 
@@ -992,9 +996,16 @@ INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 				     int flags)
 {
-	return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
+	int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
 				  inet_recvmsg, sock, msg, msg_data_left(msg),
 				  flags);
+
+	trace_sock_recvmsg_length(sock->sk, sock->sk->sk_family,
+				  sock->sk->sk_protocol,
+				  !(flags & MSG_PEEK) ? ret :
+				  (ret < 0 ? ret : 0),
+				  flags);
+	return ret;
 }
 
 /**
@@ -1044,6 +1055,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 {
 	struct socket *sock;
 	int flags;
+	int ret;
 
 	sock = file->private_data;
 
@@ -1051,7 +1063,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
 	flags |= more;
 
-	return kernel_sendpage(sock, page, offset, size, flags);
+	ret = kernel_sendpage(sock, page, offset, size, flags);
+
+	trace_sock_sendmsg_length(sock->sk, sock->sk->sk_family,
+				  sock->sk->sk_protocol, ret, 0);
+	return ret;
 }
 
 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v3] sock: add tracepoint for send recv length
  2023-01-07  3:59 [PATCH v3] sock: add tracepoint for send recv length Yunhui Cui
@ 2023-01-07 20:02 ` Cong Wang
  2023-01-08  1:34   ` [External] " 运辉崔
  2023-01-08 21:34 ` Steven Rostedt
  1 sibling, 1 reply; 4+ messages in thread
From: Cong Wang @ 2023-01-07 20:02 UTC (permalink / raw)
  To: Yunhui Cui
  Cc: rostedt, mhiramat, davem, edumazet, kuba, pabeni, kuniyu,
	duanxiongchun, linux-kernel, linux-trace-kernel, netdev

On Sat, Jan 07, 2023 at 11:59:23AM +0800, Yunhui Cui wrote:
> Add 2 tracepoints to monitor the tcp/udp traffic
> of per process and per cgroup.
> 
> Regarding monitoring the tcp/udp traffic of each process, there are two
> existing solutions, the first one is https://www.atoptool.nl/netatop.php.
> The second is via kprobe/kretprobe.
> 
> Netatop solution is implemented by registering the hook function at the
> hook point provided by the netfilter framework.
> 
> These hook functions may be in the soft interrupt context and cannot
> directly obtain the pid. Some data structures are added to bind packets
> and processes. For example, struct taskinfobucket, struct taskinfo ...
> 
> Every time the process sends and receives packets it needs multiple
> hashmaps,resulting in low performance and it has the problem fo inaccurate
> tcp/udp traffic statistics(for example: multiple threads share sockets).
> 
> We can obtain the information with kretprobe, but as we know, kprobe gets
> the result by trappig in an exception, which loses performance compared
> to tracepoint.
> 
> We compared the performance of tracepoints with the above two methods, and
> the results are as follows:
> 
> ab -n 1000000 -c 1000 -r http://127.0.0.1/index.html
> without trace:
> Time per request: 39.660 [ms] (mean)
> Time per request: 0.040 [ms] (mean, across all concurrent requests)
> 
> netatop:
> Time per request: 50.717 [ms] (mean)
> Time per request: 0.051 [ms] (mean, across all concurrent requests)
> 
> kr:
> Time per request: 43.168 [ms] (mean)
> Time per request: 0.043 [ms] (mean, across all concurrent requests)
> 
> tracepoint:
> Time per request: 41.004 [ms] (mean)
> Time per request: 0.041 [ms] (mean, across all concurrent requests
> 
> It can be seen that tracepoint has better performance.
> 
> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
> Signed-off-by: Xiongchun Duan <duanxiongchun@bytedance.com>

Thanks for the numbers, they help a lot.

Acked-by: Cong Wang <cong.wang@bytedance.com>

A few minor issues below.


> ---
>  include/trace/events/sock.h | 49 +++++++++++++++++++++++++++++++++++++
>  net/socket.c                | 20 +++++++++++++--
>  2 files changed, 67 insertions(+), 2 deletions(-)
> 
> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
> index 777ee6cbe933..16092c0406a2 100644
> --- a/include/trace/events/sock.h
> +++ b/include/trace/events/sock.h
> @@ -263,6 +263,55 @@ TRACE_EVENT(inet_sk_error_report,
>  		  __entry->error)
>  );
>  
> +/*
> + * sock send/recv msg length
> + */
> +DECLARE_EVENT_CLASS(sock_msg_length,
> +
> +	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
> +		 int flags),
> +
> +	TP_ARGS(sk, family, protocol, ret, flags),
> +
> +	TP_STRUCT__entry(
> +		__field(void *, sk)
> +		__field(__u16, family)
> +		__field(__u16, protocol)
> +		__field(int, length)
> +		__field(int, error)
> +		__field(int, flags)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->sk = sk;
> +		__entry->family = sk->sk_family;
> +		__entry->protocol = sk->sk_protocol;
> +		__entry->length = ret > 0 ? ret : 0;
> +		__entry->error = ret < 0 ? ret : 0;
> +		__entry->flags = flags;
> +	),
> +
> +	TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
> +			__entry->sk,
> +			show_family_name(__entry->family),
> +			show_inet_protocol_name(__entry->protocol),
> +			__entry->length,
> +			__entry->error, __entry->flags)

Please align and pack those parameters properly.


> +);
> +
> +DEFINE_EVENT(sock_msg_length, sock_sendmsg_length,
> +	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
> +		 int flags),
> +
> +	TP_ARGS(sk, family, protocol, ret, flags)
> +);
> +
> +DEFINE_EVENT(sock_msg_length, sock_recvmsg_length,
> +	TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
> +		 int flags),
> +
> +	TP_ARGS(sk, family, protocol, ret, flags)
> +);

It might be a better idea to remove "msg" from the tracepoint names, in
case of any confusion with "sendpage". So
s/sock_sendmsg_length/sock_send_length/ ?


>  #endif /* _TRACE_SOCK_H */
>  
>  /* This part must be outside protection */
> diff --git a/net/socket.c b/net/socket.c
> index 888cd618a968..37578e8c685b 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -106,6 +106,7 @@
>  #include <net/busy_poll.h>
>  #include <linux/errqueue.h>
>  #include <linux/ptp_clock_kernel.h>
> +#include <trace/events/sock.h>
>  
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>  unsigned int sysctl_net_busy_read __read_mostly;
> @@ -715,6 +716,9 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
>  				     inet_sendmsg, sock, msg,
>  				     msg_data_left(msg));
>  	BUG_ON(ret == -EIOCBQUEUED);
> +
> +	trace_sock_sendmsg_length(sock->sk, sock->sk->sk_family,
> +				  sock->sk->sk_protocol, ret, 0);
>  	return ret;
>  }
>  
> @@ -992,9 +996,16 @@ INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
>  static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
>  				     int flags)
>  {
> -	return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
> +	int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
>  				  inet_recvmsg, sock, msg, msg_data_left(msg),
>  				  flags);

Please adjust the indentations properly.

Thanks.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [External] Re: [PATCH v3] sock: add tracepoint for send recv length
  2023-01-07 20:02 ` Cong Wang
@ 2023-01-08  1:34   ` 运辉崔
  0 siblings, 0 replies; 4+ messages in thread
From: 运辉崔 @ 2023-01-08  1:34 UTC (permalink / raw)
  To: Cong Wang
  Cc: rostedt, mhiramat, davem, edumazet, kuba, pabeni, kuniyu,
	duanxiongchun, linux-kernel, linux-trace-kernel, netdev

On Sun, Jan 8, 2023 at 4:03 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:

>
> Thanks for the numbers, they help a lot.
>
> Acked-by: Cong Wang <cong.wang@bytedance.com>
>
> A few minor issues below.

Ok, Thanks for your suggestion, I'll update these on v4.

> > +
> > +     TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
> > +                     __entry->sk,
> > +                     show_family_name(__entry->family),
> > +                     show_inet_protocol_name(__entry->protocol),
> > +                     __entry->length,
> > +                     __entry->error, __entry->flags)
>
> Please align and pack those parameters properly.

OK, thanks.

> > +
> > +DEFINE_EVENT(sock_msg_length, sock_recvmsg_length,
> > +     TP_PROTO(struct sock *sk, __u16 family, __u16 protocol, int ret,
> > +              int flags),
> > +
> > +     TP_ARGS(sk, family, protocol, ret, flags)
> > +);
>
> It might be a better idea to remove "msg" from the tracepoint names, in
> case of any confusion with "sendpage". So
> s/sock_sendmsg_length/sock_send_length/ ?

OK, I'll rename both to sock_send_length/sock_recv_length.


> > -     return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
> > +     int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
> >                                 inet_recvmsg, sock, msg, msg_data_left(msg),
> >                                 flags);
>
> Please adjust the indentations properly.

Roger that!

Thanks,
Yunhui

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v3] sock: add tracepoint for send recv length
  2023-01-07  3:59 [PATCH v3] sock: add tracepoint for send recv length Yunhui Cui
  2023-01-07 20:02 ` Cong Wang
@ 2023-01-08 21:34 ` Steven Rostedt
  1 sibling, 0 replies; 4+ messages in thread
From: Steven Rostedt @ 2023-01-08 21:34 UTC (permalink / raw)
  To: Yunhui Cui
  Cc: mhiramat, davem, edumazet, kuba, pabeni, kuniyu, xiyou.wangcong,
	duanxiongchun, linux-kernel, linux-trace-kernel, netdev

On Sat,  7 Jan 2023 11:59:23 +0800
Yunhui Cui <cuiyunhui@bytedance.com> wrote:

> Add 2 tracepoints to monitor the tcp/udp traffic
> of per process and per cgroup.
> 
> Regarding monitoring the tcp/udp traffic of each process, there are two
> existing solutions, the first one is https://www.atoptool.nl/netatop.php.
> The second is via kprobe/kretprobe.
> 
> Netatop solution is implemented by registering the hook function at the
> hook point provided by the netfilter framework.
> 
> These hook functions may be in the soft interrupt context and cannot
> directly obtain the pid. Some data structures are added to bind packets
> and processes. For example, struct taskinfobucket, struct taskinfo ...
> 
> Every time the process sends and receives packets it needs multiple
> hashmaps,resulting in low performance and it has the problem fo inaccurate
> tcp/udp traffic statistics(for example: multiple threads share sockets).
> 
> We can obtain the information with kretprobe, but as we know, kprobe gets
> the result by trappig in an exception, which loses performance compared
> to tracepoint.
> 
> We compared the performance of tracepoints with the above two methods, and
> the results are as follows:
> 
> ab -n 1000000 -c 1000 -r http://127.0.0.1/index.html
> without trace:
> Time per request: 39.660 [ms] (mean)
> Time per request: 0.040 [ms] (mean, across all concurrent requests)
> 
> netatop:
> Time per request: 50.717 [ms] (mean)
> Time per request: 0.051 [ms] (mean, across all concurrent requests)
> 
> kr:
> Time per request: 43.168 [ms] (mean)
> Time per request: 0.043 [ms] (mean, across all concurrent requests)
> 
> tracepoint:
> Time per request: 41.004 [ms] (mean)
> Time per request: 0.041 [ms] (mean, across all concurrent requests
> 
> It can be seen that tracepoint has better performance.
> 
> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
> Signed-off-by: Xiongchun Duan <duanxiongchun@bytedance.com>
> ---


From a tracing POV:

Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-01-08 21:34 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-07  3:59 [PATCH v3] sock: add tracepoint for send recv length Yunhui Cui
2023-01-07 20:02 ` Cong Wang
2023-01-08  1:34   ` [External] " 运辉崔
2023-01-08 21:34 ` Steven Rostedt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).