All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] net: add net namespace inode for all net_dev events
@ 2021-03-09  4:43 Tony Lu
  2021-03-09 17:40 ` Steven Rostedt
  2021-03-09 20:12 ` Eric Dumazet
  0 siblings, 2 replies; 17+ messages in thread
From: Tony Lu @ 2021-03-09  4:43 UTC (permalink / raw)
  To: davem, rostedt, mingo; +Cc: netdev, linux-kernel

There are lots of net namespaces on the host runs containers like k8s.
It is very common to see the same interface names among different net
namespaces, such as eth0. It is not possible to distinguish them without
net namespace inode.

This adds net namespace inode for all net_dev events, help us
distinguish between different net devices.

Output:
  <idle>-0       [006] ..s.   133.306989: net_dev_xmit: net_inum=4026531992 dev=eth0 skbaddr=0000000011a87c68 len=54 rc=0

Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
---
 include/trace/events/net.h | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 2399073c3afc..a52f90d83411 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -35,6 +35,7 @@ TRACE_EVENT(net_dev_start_xmit,
 		__field(	u16,			gso_size	)
 		__field(	u16,			gso_segs	)
 		__field(	u16,			gso_type	)
+		__field(	unsigned int,		net_inum	)
 	),
 
 	TP_fast_assign(
@@ -56,10 +57,12 @@ TRACE_EVENT(net_dev_start_xmit,
 		__entry->gso_size = skb_shinfo(skb)->gso_size;
 		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
 		__entry->gso_type = skb_shinfo(skb)->gso_type;
+		__entry->net_inum = dev_net(skb->dev)->ns.inum;
 	),
 
-	TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
-		  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
+	TP_printk("net_inum=%u dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
+		  __entry->net_inum, __get_str(name), __entry->queue_mapping,
+		  __entry->skbaddr,
 		  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
 		  __entry->protocol, __entry->ip_summed, __entry->len,
 		  __entry->data_len,
@@ -82,6 +85,7 @@ TRACE_EVENT(net_dev_xmit,
 		__field(	unsigned int,	len		)
 		__field(	int,		rc		)
 		__string(	name,		dev->name	)
+		__field(	unsigned int,	net_inum	)
 	),
 
 	TP_fast_assign(
@@ -89,10 +93,12 @@ TRACE_EVENT(net_dev_xmit,
 		__entry->len = skb_len;
 		__entry->rc = rc;
 		__assign_str(name, dev->name);
+		__entry->net_inum = dev_net(skb->dev)->ns.inum;
 	),
 
-	TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
-		__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
+	TP_printk("net_inum=%u dev=%s skbaddr=%p len=%u rc=%d",
+		__entry->net_inum, __get_str(name), __entry->skbaddr,
+		__entry->len, __entry->rc)
 );
 
 TRACE_EVENT(net_dev_xmit_timeout,
@@ -106,16 +112,19 @@ TRACE_EVENT(net_dev_xmit_timeout,
 		__string(	name,		dev->name	)
 		__string(	driver,		netdev_drivername(dev))
 		__field(	int,		queue_index	)
+		__field(	unsigned int,	net_inum	)
 	),
 
 	TP_fast_assign(
 		__assign_str(name, dev->name);
 		__assign_str(driver, netdev_drivername(dev));
 		__entry->queue_index = queue_index;
+		__entry->net_inum = dev_net(dev)->ns.inum;
 	),
 
-	TP_printk("dev=%s driver=%s queue=%d",
-		__get_str(name), __get_str(driver), __entry->queue_index)
+	TP_printk("net_inum=%u dev=%s driver=%s queue=%d",
+		__entry->net_inum, __get_str(name), __get_str(driver),
+		__entry->queue_index)
 );
 
 DECLARE_EVENT_CLASS(net_dev_template,
@@ -128,16 +137,19 @@ DECLARE_EVENT_CLASS(net_dev_template,
 		__field(	void *,		skbaddr		)
 		__field(	unsigned int,	len		)
 		__string(	name,		skb->dev->name	)
+		__field(	unsigned int,	net_inum	)
 	),
 
 	TP_fast_assign(
 		__entry->skbaddr = skb;
 		__entry->len = skb->len;
 		__assign_str(name, skb->dev->name);
+		__entry->net_inum = dev_net(skb->dev)->ns.inum;
 	),
 
-	TP_printk("dev=%s skbaddr=%p len=%u",
-		__get_str(name), __entry->skbaddr, __entry->len)
+	TP_printk("net_inum=%u dev=%s skbaddr=%p len=%u",
+		__entry->net_inum, __get_str(name), __entry->skbaddr,
+		__entry->len)
 )
 
 DEFINE_EVENT(net_dev_template, net_dev_queue,
@@ -187,6 +199,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 		__field(	unsigned char,		nr_frags	)
 		__field(	u16,			gso_size	)
 		__field(	u16,			gso_type	)
+		__field(	unsigned int,		net_inum	)
 	),
 
 	TP_fast_assign(
@@ -213,10 +226,12 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
 		__entry->gso_size = skb_shinfo(skb)->gso_size;
 		__entry->gso_type = skb_shinfo(skb)->gso_type;
+		__entry->net_inum = dev_net(skb->dev)->ns.inum;
 	),
 
-	TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
-		  __get_str(name), __entry->napi_id, __entry->queue_mapping,
+	TP_printk("net_inum=%u dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
+		  __entry->net_inum, __get_str(name), __entry->napi_id,
+		  __entry->queue_mapping,
 		  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
 		  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
 		  __entry->hash, __entry->l4_hash, __entry->len,
-- 
2.19.1.6.gb485710b


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09  4:43 [PATCH] net: add net namespace inode for all net_dev events Tony Lu
@ 2021-03-09 17:40 ` Steven Rostedt
  2021-03-09 19:53   ` David Ahern
  2021-03-10  9:03   ` Tony Lu
  2021-03-09 20:12 ` Eric Dumazet
  1 sibling, 2 replies; 17+ messages in thread
From: Steven Rostedt @ 2021-03-09 17:40 UTC (permalink / raw)
  To: Tony Lu; +Cc: davem, mingo, netdev, linux-kernel

On Tue,  9 Mar 2021 12:43:50 +0800
Tony Lu <tonylu@linux.alibaba.com> wrote:

> There are lots of net namespaces on the host runs containers like k8s.
> It is very common to see the same interface names among different net
> namespaces, such as eth0. It is not possible to distinguish them without
> net namespace inode.
> 
> This adds net namespace inode for all net_dev events, help us
> distinguish between different net devices.
> 
> Output:
>   <idle>-0       [006] ..s.   133.306989: net_dev_xmit: net_inum=4026531992 dev=eth0 skbaddr=0000000011a87c68 len=54 rc=0
> 
> Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
> ---
>  include/trace/events/net.h | 35 +++++++++++++++++++++++++----------
>  1 file changed, 25 insertions(+), 10 deletions(-)
> 
> diff --git a/include/trace/events/net.h b/include/trace/events/net.h
> index 2399073c3afc..a52f90d83411 100644
> --- a/include/trace/events/net.h
> +++ b/include/trace/events/net.h
> @@ -35,6 +35,7 @@ TRACE_EVENT(net_dev_start_xmit,
>  		__field(	u16,			gso_size	)
>  		__field(	u16,			gso_segs	)
>  		__field(	u16,			gso_type	)
> +		__field(	unsigned int,		net_inum	)
>  	),

This patch made me take a look at the net_dev_start_xmit trace event, and I
see it's very "holy". That is, it has lots of holes in the structure.

	TP_STRUCT__entry(
		__string(	name,			dev->name	)
		__field(	u16,			queue_mapping	)
		__field(	const void *,		skbaddr		)
		__field(	bool,			vlan_tagged	)
		__field(	u16,			vlan_proto	)
		__field(	u16,			vlan_tci	)
		__field(	u16,			protocol	)
		__field(	u8,			ip_summed	)
		__field(	unsigned int,		len		)
		__field(	unsigned int,		data_len	)
		__field(	int,			network_offset	)
		__field(	bool,			transport_offset_valid)
		__field(	int,			transport_offset)
		__field(	u8,			tx_flags	)
		__field(	u16,			gso_size	)
		__field(	u16,			gso_segs	)
		__field(	u16,			gso_type	)
		__field(	unsigned int,		net_inum	)
	),

If you look at /sys/kernel/tracing/events/net/net_dev_start_xmit/format

name: net_dev_start_xmit
ID: 1581
format:
	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
	field:int common_pid;	offset:4;	size:4;	signed:1;

	field:__data_loc char[] name;	offset:8;	size:4;	signed:1;
	field:u16 queue_mapping;	offset:12;	size:2;	signed:0;
	field:const void * skbaddr;	offset:16;	size:8;	signed:0;

Notice, queue_mapping is 2 bytes at offset 12 (ends at offset 14), but
skbaddr starts at offset 16. That means there's two bytes wasted.

	field:bool vlan_tagged;	offset:24;	size:1;	signed:0;
	field:u16 vlan_proto;	offset:26;	size:2;	signed:0;

Another byte missing above (24 + 1 != 26)

	field:u16 vlan_tci;	offset:28;	size:2;	signed:0;
	field:u16 protocol;	offset:30;	size:2;	signed:0;
	field:u8 ip_summed;	offset:32;	size:1;	signed:0;
	field:unsigned int len;	offset:36;	size:4;	signed:0;

Again another three bytes missing (32 + 1 != 36)

	field:unsigned int data_len;	offset:40;	size:4;	signed:0;
	field:int network_offset;	offset:44;	size:4;	signed:1;
	field:bool transport_offset_valid;	offset:48;	size:1;	signed:0;
	field:int transport_offset;	offset:52;	size:4;	signed:1;

Again, another 3 bytes missing (48 + 1 != 52)

	field:u8 tx_flags;	offset:56;	size:1;	signed:0;
	field:u16 gso_size;	offset:58;	size:2;	signed:0;

Another byte missing (56 + 1 != 58)

	field:u16 gso_segs;	offset:60;	size:2;	signed:0;	
	field:u16 gso_type;	offset:62;	size:2;	signed:0;
	field:unsigned int net_inum;	offset:64;	size:4;	signed:0;

The above shows 10 bytes wasted for this event.

The order of the fields is important. Don't worry about breaking API by
fixing it. The parsing code uses this output to find where the binary data
is.

Hmm, I should write a script that reads all the format files and point out
areas that have holes in it.

I haven't looked at the other trace events, but they may all have the same
issues.

-- Steve



>  
>  	TP_fast_assign(
> @@ -56,10 +57,12 @@ TRACE_EVENT(net_dev_start_xmit,
>  		__entry->gso_size = skb_shinfo(skb)->gso_size;
>  		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
>  		__entry->gso_type = skb_shinfo(skb)->gso_type;
> +		__entry->net_inum = dev_net(skb->dev)->ns.inum;
>  	),
>  
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 17:40 ` Steven Rostedt
@ 2021-03-09 19:53   ` David Ahern
  2021-03-09 20:02     ` Steven Rostedt
  2021-03-10  9:03   ` Tony Lu
  1 sibling, 1 reply; 17+ messages in thread
From: David Ahern @ 2021-03-09 19:53 UTC (permalink / raw)
  To: Steven Rostedt, Tony Lu; +Cc: davem, mingo, netdev, linux-kernel

On 3/9/21 10:40 AM, Steven Rostedt wrote:
> The order of the fields is important. Don't worry about breaking API by
> fixing it. The parsing code uses this output to find where the binary data
> is.

Changing the order of the fields will impact any bpf programs expecting
the existing format.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 19:53   ` David Ahern
@ 2021-03-09 20:02     ` Steven Rostedt
  2021-03-09 20:17       ` David Ahern
  2021-03-09 20:18       ` Alexei Starovoitov
  0 siblings, 2 replies; 17+ messages in thread
From: Steven Rostedt @ 2021-03-09 20:02 UTC (permalink / raw)
  To: David Ahern; +Cc: Tony Lu, davem, mingo, netdev, linux-kernel

On Tue, 9 Mar 2021 12:53:37 -0700
David Ahern <dsahern@gmail.com> wrote:

> Changing the order of the fields will impact any bpf programs expecting
> the existing format

I thought bpf programs were not API. And why are they not parsing this
information? They have these offsets hard coded???? Why would they do that!
The information to extract the data where ever it is has been there from
day 1! Way before BPF ever had access to trace events.

Please, STOP HARD CODING FIELD OFFSETS!!!! This is why people do not want
to use trace points in the first place. Because tools do stupid things.

-- Steve

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09  4:43 [PATCH] net: add net namespace inode for all net_dev events Tony Lu
  2021-03-09 17:40 ` Steven Rostedt
@ 2021-03-09 20:12 ` Eric Dumazet
  2021-03-10  9:22   ` Lorenz Bauer
  2021-03-10  9:33   ` Tony Lu
  1 sibling, 2 replies; 17+ messages in thread
From: Eric Dumazet @ 2021-03-09 20:12 UTC (permalink / raw)
  To: Tony Lu, davem, rostedt, mingo, Lorenz Bauer; +Cc: netdev, linux-kernel



On 3/9/21 5:43 AM, Tony Lu wrote:
> There are lots of net namespaces on the host runs containers like k8s.
> It is very common to see the same interface names among different net
> namespaces, such as eth0. It is not possible to distinguish them without
> net namespace inode.
> 
> This adds net namespace inode for all net_dev events, help us
> distinguish between different net devices.
> 
> Output:
>   <idle>-0       [006] ..s.   133.306989: net_dev_xmit: net_inum=4026531992 dev=eth0 skbaddr=0000000011a87c68 len=54 rc=0
> 
> Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
> ---
>

There was a proposal from Lorenz to use netns cookies (SO_NETNS_COOKIE) instead.

They have a guarantee of being not reused.

After 3d368ab87cf6681f9 ("net: initialize net->net_cookie at netns setup")
net->net_cookie is directly available.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 20:02     ` Steven Rostedt
@ 2021-03-09 20:17       ` David Ahern
  2021-03-09 20:20         ` Alexei Starovoitov
  2021-03-09 20:35         ` Steven Rostedt
  2021-03-09 20:18       ` Alexei Starovoitov
  1 sibling, 2 replies; 17+ messages in thread
From: David Ahern @ 2021-03-09 20:17 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Tony Lu, davem, mingo, netdev, linux-kernel

On 3/9/21 1:02 PM, Steven Rostedt wrote:
> On Tue, 9 Mar 2021 12:53:37 -0700
> David Ahern <dsahern@gmail.com> wrote:
> 
>> Changing the order of the fields will impact any bpf programs expecting
>> the existing format
> 
> I thought bpf programs were not API. And why are they not parsing this
> information? They have these offsets hard coded???? Why would they do that!
> The information to extract the data where ever it is has been there from
> day 1! Way before BPF ever had access to trace events.

BPF programs attached to a tracepoint are passed a context - a structure
based on the format for the tracepoint. To take an in-tree example, look
at samples/bpf/offwaketime_kern.c:

...

/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
struct sched_switch_args {
        unsigned long long pad;
        char prev_comm[16];
        int prev_pid;
        int prev_prio;
        long long prev_state;
        char next_comm[16];
        int next_pid;
        int next_prio;
};
SEC("tracepoint/sched/sched_switch")
int oncpu(struct sched_switch_args *ctx)
{

...

Production systems do not typically have toolchains installed, so
dynamic generation of the program based on the 'format' file on the
running system is not realistic. That means creating the programs on a
development machine and installing on the production box. Further, there
is an expectation that a bpf program compiled against version X works on
version Y. Changing the order of the fields will break such programs in
non-obvious ways.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 20:02     ` Steven Rostedt
  2021-03-09 20:17       ` David Ahern
@ 2021-03-09 20:18       ` Alexei Starovoitov
  1 sibling, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2021-03-09 20:18 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: David Ahern, Tony Lu, David S. Miller, Ingo Molnar,
	Network Development, LKML

On Tue, Mar 9, 2021 at 12:03 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 9 Mar 2021 12:53:37 -0700
> David Ahern <dsahern@gmail.com> wrote:
>
> > Changing the order of the fields will impact any bpf programs expecting
> > the existing format
>
> I thought bpf programs were not API.

That is correct.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 20:17       ` David Ahern
@ 2021-03-09 20:20         ` Alexei Starovoitov
  2021-03-09 20:35         ` Steven Rostedt
  1 sibling, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2021-03-09 20:20 UTC (permalink / raw)
  To: David Ahern
  Cc: Steven Rostedt, Tony Lu, David S. Miller, Ingo Molnar,
	Network Development, LKML

On Tue, Mar 9, 2021 at 12:18 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 3/9/21 1:02 PM, Steven Rostedt wrote:
> > On Tue, 9 Mar 2021 12:53:37 -0700
> > David Ahern <dsahern@gmail.com> wrote:
> >
> >> Changing the order of the fields will impact any bpf programs expecting
> >> the existing format
> >
> > I thought bpf programs were not API. And why are they not parsing this
> > information? They have these offsets hard coded???? Why would they do that!
> > The information to extract the data where ever it is has been there from
> > day 1! Way before BPF ever had access to trace events.
>
> BPF programs attached to a tracepoint are passed a context - a structure
> based on the format for the tracepoint. To take an in-tree example, look
> at samples/bpf/offwaketime_kern.c:
>
> ...
>
> /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
> struct sched_switch_args {
>         unsigned long long pad;
>         char prev_comm[16];
>         int prev_pid;
>         int prev_prio;
>         long long prev_state;
>         char next_comm[16];
>         int next_pid;
>         int next_prio;
> };
> SEC("tracepoint/sched/sched_switch")
> int oncpu(struct sched_switch_args *ctx)
> {
>
> ...
>
> Production systems do not typically have toolchains installed, so
> dynamic generation of the program based on the 'format' file on the
> running system is not realistic. That means creating the programs on a
> development machine and installing on the production box. Further, there
> is an expectation that a bpf program compiled against version X works on
> version Y.

This is not true.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 20:17       ` David Ahern
  2021-03-09 20:20         ` Alexei Starovoitov
@ 2021-03-09 20:35         ` Steven Rostedt
  2021-03-09 20:39           ` Alexei Starovoitov
  1 sibling, 1 reply; 17+ messages in thread
From: Steven Rostedt @ 2021-03-09 20:35 UTC (permalink / raw)
  To: David Ahern; +Cc: Tony Lu, davem, mingo, netdev, linux-kernel

On Tue, 9 Mar 2021 13:17:23 -0700
David Ahern <dsahern@gmail.com> wrote:

> On 3/9/21 1:02 PM, Steven Rostedt wrote:
> > On Tue, 9 Mar 2021 12:53:37 -0700
> > David Ahern <dsahern@gmail.com> wrote:
> >   
> >> Changing the order of the fields will impact any bpf programs expecting
> >> the existing format  
> > 
> > I thought bpf programs were not API. And why are they not parsing this
> > information? They have these offsets hard coded???? Why would they do that!
> > The information to extract the data where ever it is has been there from
> > day 1! Way before BPF ever had access to trace events.  
> 
> BPF programs attached to a tracepoint are passed a context - a structure
> based on the format for the tracepoint. To take an in-tree example, look
> at samples/bpf/offwaketime_kern.c:
> 
> ...
> 
> /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
> struct sched_switch_args {
>         unsigned long long pad;
>         char prev_comm[16];
>         int prev_pid;
>         int prev_prio;
>         long long prev_state;
>         char next_comm[16];
>         int next_pid;
>         int next_prio;
> };
> SEC("tracepoint/sched/sched_switch")
> int oncpu(struct sched_switch_args *ctx)
> {
> 
> ...
> 
> Production systems do not typically have toolchains installed, so
> dynamic generation of the program based on the 'format' file on the
> running system is not realistic. That means creating the programs on a
> development machine and installing on the production box. Further, there
> is an expectation that a bpf program compiled against version X works on
> version Y. Changing the order of the fields will break such programs in
> non-obvious ways.

The size of the fields and order changes all the time in various events. I
recommend doing so *all the time*. If you upgrade a kernel, then all the bpf
programs you have for that kernel should also be updated. You can't rely on
fields being the same, size or order. The best you can do is expect the
field to continue to exist, and that's not even a guarantee.

I'm not sure how that sample is used. I can't find "oncpu()" anywhere in
that directory besides where it is defined, and I wouldn't think a bpf
program would just blindly map the fields without verifying them.


-- Steve

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 20:35         ` Steven Rostedt
@ 2021-03-09 20:39           ` Alexei Starovoitov
  0 siblings, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2021-03-09 20:39 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: David Ahern, Tony Lu, David S. Miller, Ingo Molnar,
	Network Development, LKML

On Tue, Mar 9, 2021 at 12:37 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> The size of the fields and order changes all the time in various events. I
> recommend doing so *all the time*. If you upgrade a kernel, then all the bpf
> programs you have for that kernel should also be updated. You can't rely on
> fields being the same, size or order. The best you can do is expect the
> field to continue to exist, and that's not even a guarantee.

+1. Tracing bpf progs already do that.
Old style tracing progs do it based on the kernel version.
New style is using CO-RE.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 17:40 ` Steven Rostedt
  2021-03-09 19:53   ` David Ahern
@ 2021-03-10  9:03   ` Tony Lu
  2021-03-10 16:31     ` Steven Rostedt
  2021-03-10 16:49     ` Steven Rostedt
  1 sibling, 2 replies; 17+ messages in thread
From: Tony Lu @ 2021-03-10  9:03 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: davem, mingo, netdev, linux-kernel

On Tue, Mar 09, 2021 at 12:40:11PM -0500, Steven Rostedt wrote:
> On Tue,  9 Mar 2021 12:43:50 +0800
> Tony Lu <tonylu@linux.alibaba.com> wrote:
> 
> > There are lots of net namespaces on the host runs containers like k8s.
> > It is very common to see the same interface names among different net
> > namespaces, such as eth0. It is not possible to distinguish them without
> > net namespace inode.
> > 
> > This adds net namespace inode for all net_dev events, help us
> > distinguish between different net devices.
> > 
> > Output:
> >   <idle>-0       [006] ..s.   133.306989: net_dev_xmit: net_inum=4026531992 dev=eth0 skbaddr=0000000011a87c68 len=54 rc=0
> > 
> > Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
> > ---
> >  include/trace/events/net.h | 35 +++++++++++++++++++++++++----------
> >  1 file changed, 25 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/trace/events/net.h b/include/trace/events/net.h
> > index 2399073c3afc..a52f90d83411 100644
> > --- a/include/trace/events/net.h
> > +++ b/include/trace/events/net.h
> > @@ -35,6 +35,7 @@ TRACE_EVENT(net_dev_start_xmit,
> >  		__field(	u16,			gso_size	)
> >  		__field(	u16,			gso_segs	)
> >  		__field(	u16,			gso_type	)
> > +		__field(	unsigned int,		net_inum	)
> >  	),
> 
> This patch made me take a look at the net_dev_start_xmit trace event, and I
> see it's very "holy". That is, it has lots of holes in the structure.
> 
> 	TP_STRUCT__entry(
> 		__string(	name,			dev->name	)
> 		__field(	u16,			queue_mapping	)
> 		__field(	const void *,		skbaddr		)
> 		__field(	bool,			vlan_tagged	)
> 		__field(	u16,			vlan_proto	)
> 		__field(	u16,			vlan_tci	)
> 		__field(	u16,			protocol	)
> 		__field(	u8,			ip_summed	)
> 		__field(	unsigned int,		len		)
> 		__field(	unsigned int,		data_len	)
> 		__field(	int,			network_offset	)
> 		__field(	bool,			transport_offset_valid)
> 		__field(	int,			transport_offset)
> 		__field(	u8,			tx_flags	)
> 		__field(	u16,			gso_size	)
> 		__field(	u16,			gso_segs	)
> 		__field(	u16,			gso_type	)
> 		__field(	unsigned int,		net_inum	)
> 	),
> 
> If you look at /sys/kernel/tracing/events/net/net_dev_start_xmit/format
> 
> name: net_dev_start_xmit
> ID: 1581
> format:
> 	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
> 	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
> 	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
> 	field:int common_pid;	offset:4;	size:4;	signed:1;
> 
> 	field:__data_loc char[] name;	offset:8;	size:4;	signed:1;
> 	field:u16 queue_mapping;	offset:12;	size:2;	signed:0;
> 	field:const void * skbaddr;	offset:16;	size:8;	signed:0;
> 
> Notice, queue_mapping is 2 bytes at offset 12 (ends at offset 14), but
> skbaddr starts at offset 16. That means there's two bytes wasted.
> 
> 	field:bool vlan_tagged;	offset:24;	size:1;	signed:0;
> 	field:u16 vlan_proto;	offset:26;	size:2;	signed:0;
> 
> Another byte missing above (24 + 1 != 26)
> 
> 	field:u16 vlan_tci;	offset:28;	size:2;	signed:0;
> 	field:u16 protocol;	offset:30;	size:2;	signed:0;
> 	field:u8 ip_summed;	offset:32;	size:1;	signed:0;
> 	field:unsigned int len;	offset:36;	size:4;	signed:0;
> 
> Again another three bytes missing (32 + 1 != 36)
> 
> 	field:unsigned int data_len;	offset:40;	size:4;	signed:0;
> 	field:int network_offset;	offset:44;	size:4;	signed:1;
> 	field:bool transport_offset_valid;	offset:48;	size:1;	signed:0;
> 	field:int transport_offset;	offset:52;	size:4;	signed:1;
> 
> Again, another 3 bytes missing (48 + 1 != 52)
> 
> 	field:u8 tx_flags;	offset:56;	size:1;	signed:0;
> 	field:u16 gso_size;	offset:58;	size:2;	signed:0;
> 
> Another byte missing (56 + 1 != 58)
> 
> 	field:u16 gso_segs;	offset:60;	size:2;	signed:0;	
> 	field:u16 gso_type;	offset:62;	size:2;	signed:0;
> 	field:unsigned int net_inum;	offset:64;	size:4;	signed:0;
> 
> The above shows 10 bytes wasted for this event.
> 
> The order of the fields is important. Don't worry about breaking API by
> fixing it. The parsing code uses this output to find where the binary data
> is.
> 
> Hmm, I should write a script that reads all the format files and point out
> areas that have holes in it.

I use pahole to read vmlinux.o directly with defconfig and
CONFIG_DEBUG_INFO enabled, the result shows 22 structs prefixed with
trace_event_raw_ that have at least one hole.

Command:
	# structs have at least one hole and can be packed
	pahole vmlinux.o -H 1 -R -P -y trace_event_raw_ > output.txt

	# details (result includes above)
	pahole vmlinux.o -H 1 -y trace_event_raw_ > output_detail.txt

Here is the lists (>= 1 hole and can be packed, #1 size, #2 ):

	trace_event_raw_mm_shrink_slab_start    80      72      8
	trace_event_raw_mm_shrink_slab_end      64      56      8
	trace_event_raw_percpu_alloc_percpu     56      48      8
	trace_event_raw_writeback_inode_template        48      40      8
	trace_event_raw_filelock_lock   88      80      8
	trace_event_raw_iomap_apply     72      64      8
	trace_event_raw_ext4__map_blocks_exit   56      48      8
	trace_event_raw_ext4_ext_rm_leaf        64      56      8
	trace_event_raw_ext4_ext_remove_space_done      64      56      8
	trace_event_raw_nfs_rename_event_done   56      48      8
	trace_event_raw_nfs4_rename     56      48      8
	trace_event_raw_net_dev_start_xmit      72      64      8
	trace_event_raw_net_dev_rx_verbose_template     88      72      16
	trace_event_raw_tcp_probe       120     112     8
	trace_event_raw_qdisc_dequeue   64      56      8
	trace_event_raw_rpc_xdr_alignment       88      80      8
	trace_event_raw_rdev_return_int_mesh_config     108     104     4
	trace_event_raw_rdev_update_mesh_config 128     120     8
	trace_event_raw_rdev_get_ftm_responder_stats    120     112     8
	trace_event_raw_drv_bss_info_changed    184     168     16
	trace_event_raw_drv_ampdu_action        88      80      8
	trace_event_raw_drv_tdls_recv_channel_switch    112     104     8


Here is the detail (net_dev_start_xmit as example):

	struct trace_event_raw_net_dev_start_xmit {
	        struct trace_entry         ent;                  /*     0     8 */
	        u32                        __data_loc_name;      /*     8     4 */
	        u16                        queue_mapping;        /*    12     2 */
	
	        /* XXX 2 bytes hole, try to pack */
	
	        const void  *              skbaddr;              /*    16     8 */
	        bool                       vlan_tagged;          /*    24     1 */
	
	        /* XXX 1 byte hole, try to pack */
	
	        u16                        vlan_proto;           /*    26     2 */
	        u16                        vlan_tci;             /*    28     2 */
	        u16                        protocol;             /*    30     2 */
	        u8                         ip_summed;            /*    32     1 */
	
	        /* XXX 3 bytes hole, try to pack */
	
	        unsigned int               len;                  /*    36     4 */
	        unsigned int               data_len;             /*    40     4 */
	        int                        network_offset;       /*    44     4 */
	        bool                       transport_offset_valid; /*    48     1 */
	
	        /* XXX 3 bytes hole, try to pack */
	
	        int                        transport_offset;     /*    52     4 */
	        u8                         tx_flags;             /*    56     1 */
	
	        /* XXX 1 byte hole, try to pack */
	
	        u16                        gso_size;             /*    58     2 */
	        u16                        gso_segs;             /*    60     2 */
	        u16                        gso_type;             /*    62     2 */
	        /* --- cacheline 1 boundary (64 bytes) --- */
	        unsigned int               net_inum;             /*    64     4 */
	        char                       __data[];             /*    68     0 */
	
	        /* size: 72, cachelines: 2, members: 20 */
	        /* sum members: 58, holes: 5, sum holes: 10 */
	        /* padding: 4 */
	        /* last cacheline: 8 bytes */
	};

pahole shows there are 5 holes with 10 bytes in net_dev_start_xmit event.


Cheers,
Tony Lu

> 
> I haven't looked at the other trace events, but they may all have the same
> issues.
> 
> -- Steve
> 
> 
> 
> >  
> >  	TP_fast_assign(
> > @@ -56,10 +57,12 @@ TRACE_EVENT(net_dev_start_xmit,
> >  		__entry->gso_size = skb_shinfo(skb)->gso_size;
> >  		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
> >  		__entry->gso_type = skb_shinfo(skb)->gso_type;
> > +		__entry->net_inum = dev_net(skb->dev)->ns.inum;
> >  	),
> >  
> >

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 20:12 ` Eric Dumazet
@ 2021-03-10  9:22   ` Lorenz Bauer
  2021-03-10 12:01     ` Tony Lu
  2021-03-10  9:33   ` Tony Lu
  1 sibling, 1 reply; 17+ messages in thread
From: Lorenz Bauer @ 2021-03-10  9:22 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tony Lu, David S . Miller, rostedt, mingo, Networking, LKML

On Tue, 9 Mar 2021 at 20:12, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> On 3/9/21 5:43 AM, Tony Lu wrote:
> > There are lots of net namespaces on the host runs containers like k8s.
> > It is very common to see the same interface names among different net
> > namespaces, such as eth0. It is not possible to distinguish them without
> > net namespace inode.
> >
> > This adds net namespace inode for all net_dev events, help us
> > distinguish between different net devices.
> >
> > Output:
> >   <idle>-0       [006] ..s.   133.306989: net_dev_xmit: net_inum=4026531992 dev=eth0 skbaddr=0000000011a87c68 len=54 rc=0
> >
> > Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
> > ---
> >
>
> There was a proposal from Lorenz to use netns cookies (SO_NETNS_COOKIE) instead.
>
> They have a guarantee of being not reused.
>
> After 3d368ab87cf6681f9 ("net: initialize net->net_cookie at netns setup")
> net->net_cookie is directly available.

The patch set is at
https://lore.kernel.org/bpf/20210219154330.93615-1-lmb@cloudflare.com/
but I decided to abandon it. I can work around my issue by comparing
the netns inode of two processes, which is "good enough" for now.

-- 
Lorenz Bauer  |  Systems Engineer
6th Floor, County Hall/The Riverside Building, SE1 7PB, UK

www.cloudflare.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-09 20:12 ` Eric Dumazet
  2021-03-10  9:22   ` Lorenz Bauer
@ 2021-03-10  9:33   ` Tony Lu
  1 sibling, 0 replies; 17+ messages in thread
From: Tony Lu @ 2021-03-10  9:33 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, rostedt, mingo, Lorenz Bauer, netdev, linux-kernel

On Tue, Mar 09, 2021 at 09:12:45PM +0100, Eric Dumazet wrote:
> 
> 
> On 3/9/21 5:43 AM, Tony Lu wrote:
> > There are lots of net namespaces on the host runs containers like k8s.
> > It is very common to see the same interface names among different net
> > namespaces, such as eth0. It is not possible to distinguish them without
> > net namespace inode.
> > 
> > This adds net namespace inode for all net_dev events, help us
> > distinguish between different net devices.
> > 
> > Output:
> >   <idle>-0       [006] ..s.   133.306989: net_dev_xmit: net_inum=4026531992 dev=eth0 skbaddr=0000000011a87c68 len=54 rc=0
> > 
> > Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
> > ---
> >
> 
> There was a proposal from Lorenz to use netns cookies (SO_NETNS_COOKIE) instead.
> 
> They have a guarantee of being not reused.
> 
> After 3d368ab87cf6681f9 ("net: initialize net->net_cookie at netns setup")
> net->net_cookie is directly available.

It looks better to identify ns with net_cookie rather than inode, and
get the value with NS_GET_COOKIE. I will switch net_inum to net_cookie
in the next patch.


Cheers,
Tony Lu

> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-10  9:22   ` Lorenz Bauer
@ 2021-03-10 12:01     ` Tony Lu
  0 siblings, 0 replies; 17+ messages in thread
From: Tony Lu @ 2021-03-10 12:01 UTC (permalink / raw)
  To: Lorenz Bauer
  Cc: Eric Dumazet, David S . Miller, rostedt, mingo, Networking, LKML

On Wed, Mar 10, 2021 at 09:22:34AM +0000, Lorenz Bauer wrote:
> On Tue, 9 Mar 2021 at 20:12, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > On 3/9/21 5:43 AM, Tony Lu wrote:
> > > There are lots of net namespaces on the host runs containers like k8s.
> > > It is very common to see the same interface names among different net
> > > namespaces, such as eth0. It is not possible to distinguish them without
> > > net namespace inode.
> > >
> > > This adds net namespace inode for all net_dev events, help us
> > > distinguish between different net devices.
> > >
> > > Output:
> > >   <idle>-0       [006] ..s.   133.306989: net_dev_xmit: net_inum=4026531992 dev=eth0 skbaddr=0000000011a87c68 len=54 rc=0
> > >
> > > Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
> > > ---
> > >
> >
> > There was a proposal from Lorenz to use netns cookies (SO_NETNS_COOKIE) instead.
> >
> > They have a guarantee of being not reused.
> >
> > After 3d368ab87cf6681f9 ("net: initialize net->net_cookie at netns setup")
> > net->net_cookie is directly available.
> 
> The patch set is at
> https://lore.kernel.org/bpf/20210219154330.93615-1-lmb@cloudflare.com/
> but I decided to abandon it. I can work around my issue by comparing
> the netns inode of two processes, which is "good enough" for now.

Without the patch set, it is impossible to get net_cookie from
userspace, except bpf prog. AFAIK, netns inode has been widely used to
distinguish different netns, it is easy to use for docker
(/proc/${container_pid}/ns/net). It would be better to provide a unified
approach to do so.


Cheers,
Tony Lu

> 
> -- 
> Lorenz Bauer  |  Systems Engineer
> 6th Floor, County Hall/The Riverside Building, SE1 7PB, UK
> 
> www.cloudflare.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-10  9:03   ` Tony Lu
@ 2021-03-10 16:31     ` Steven Rostedt
  2021-03-11  6:39       ` Tony Lu
  2021-03-10 16:49     ` Steven Rostedt
  1 sibling, 1 reply; 17+ messages in thread
From: Steven Rostedt @ 2021-03-10 16:31 UTC (permalink / raw)
  To: Tony Lu; +Cc: davem, mingo, netdev, linux-kernel

On Wed, 10 Mar 2021 17:03:40 +0800
Tony Lu <tonylu@linux.alibaba.com> wrote:

> I use pahole to read vmlinux.o directly with defconfig and
> CONFIG_DEBUG_INFO enabled, the result shows 22 structs prefixed with
> trace_event_raw_ that have at least one hole.

I was thinking of pahole too ;-)

But the information can also be captured from the format files with simple
scripts as well. And perhaps be more tuned to see if there's actually a fix
for them, and ignore reporting it if there is no fix, as all trace events
are 4 byte aligned, and if we are off by one, sometimes it doesn't matter.

-- Steve

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-10  9:03   ` Tony Lu
  2021-03-10 16:31     ` Steven Rostedt
@ 2021-03-10 16:49     ` Steven Rostedt
  1 sibling, 0 replies; 17+ messages in thread
From: Steven Rostedt @ 2021-03-10 16:49 UTC (permalink / raw)
  To: Tony Lu; +Cc: davem, mingo, netdev, linux-kernel

On Wed, 10 Mar 2021 17:03:40 +0800
Tony Lu <tonylu@linux.alibaba.com> wrote:

> On Tue, Mar 09, 2021 at 12:40:11PM -0500, Steven Rostedt wrote:


> > The above shows 10 bytes wasted for this event.
> > 


> 
> I use pahole to read vmlinux.o directly with defconfig and
> CONFIG_DEBUG_INFO enabled, the result shows 22 structs prefixed with
> trace_event_raw_ that have at least one hole.
> 

> 
> pahole shows there are 5 holes with 10 bytes in net_dev_start_xmit event.
> 

Oh, an I'm glad that my analysis matched with pahole ;-)

-- Steve

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: add net namespace inode for all net_dev events
  2021-03-10 16:31     ` Steven Rostedt
@ 2021-03-11  6:39       ` Tony Lu
  0 siblings, 0 replies; 17+ messages in thread
From: Tony Lu @ 2021-03-11  6:39 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: davem, mingo, netdev, linux-kernel

On Wed, Mar 10, 2021 at 11:31:12AM -0500, Steven Rostedt wrote:
> On Wed, 10 Mar 2021 17:03:40 +0800
> Tony Lu <tonylu@linux.alibaba.com> wrote:
> 
> > I use pahole to read vmlinux.o directly with defconfig and
> > CONFIG_DEBUG_INFO enabled, the result shows 22 structs prefixed with
> > trace_event_raw_ that have at least one hole.
> 
> I was thinking of pahole too ;-)
> 
> But the information can also be captured from the format files with simple
> scripts as well. And perhaps be more tuned to see if there's actually a fix
> for them, and ignore reporting it if there is no fix, as all trace events
> are 4 byte aligned, and if we are off by one, sometimes it doesn't matter.

I am going to send a patch to fix this issue later. There are many
events have more than 1 holes, I am trying to pick up the events that are
really to be fixed.


Cheers,
Tony Lu

> 
> -- Steve

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2021-03-11  6:40 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-09  4:43 [PATCH] net: add net namespace inode for all net_dev events Tony Lu
2021-03-09 17:40 ` Steven Rostedt
2021-03-09 19:53   ` David Ahern
2021-03-09 20:02     ` Steven Rostedt
2021-03-09 20:17       ` David Ahern
2021-03-09 20:20         ` Alexei Starovoitov
2021-03-09 20:35         ` Steven Rostedt
2021-03-09 20:39           ` Alexei Starovoitov
2021-03-09 20:18       ` Alexei Starovoitov
2021-03-10  9:03   ` Tony Lu
2021-03-10 16:31     ` Steven Rostedt
2021-03-11  6:39       ` Tony Lu
2021-03-10 16:49     ` Steven Rostedt
2021-03-09 20:12 ` Eric Dumazet
2021-03-10  9:22   ` Lorenz Bauer
2021-03-10 12:01     ` Tony Lu
2021-03-10  9:33   ` Tony Lu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.