All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net] bpf: expose netns inode to bpf programs
@ 2017-01-26  3:27 Alexei Starovoitov
  2017-01-26  5:46 ` Eric W. Biederman
                   ` (3 more replies)
  0 siblings, 4 replies; 35+ messages in thread
From: Alexei Starovoitov @ 2017-01-26  3:27 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, David Ahern, Tejun Heo, Andy Lutomirski,
	Eric W . Biederman, Thomas Graf, netdev

in cases where bpf programs are looking at sockets and packets
that belong to different netns, it could be useful to read netns inode,
so that programs can make intelligent decisions.
For example to disallow raw sockets in all non-init netns the program can do:
if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
  return 0;
where 0xf0000075 inode comes from /proc/pid/ns/net

Similarly TC cls_bpf/act_bpf and socket filters can do
if (skb->netns_inum == expected_inode)

The lack of netns awareness was a concern even for socket filters,
since the application can attach the same bpf program to sockets
in a different netns. Just like tc cls_bpf program can work in
different netns as well, so it has to be addressed uniformly
across all types of bpf programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
with corresponding change in 'ip vrf' that David Ahern is working on,
this will address 'malfunction' concern that Andy discovered in 'ip vrf',
hence this fix is needed for 'net'.
---
 include/uapi/linux/bpf.h      |  2 ++
 net/core/filter.c             | 27 +++++++++++++++++++++++++++
 samples/bpf/sock_flags_kern.c |  2 ++
 samples/bpf/sockex1_kern.c    |  3 +++
 4 files changed, 34 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87dbe9f..867cbe043d77 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -546,6 +546,7 @@ struct __sk_buff {
 	__u32 tc_classid;
 	__u32 data;
 	__u32 data_end;
+	__u32 netns_inum;
 };
 
 struct bpf_tunnel_key {
@@ -581,6 +582,7 @@ struct bpf_sock {
 	__u32 family;
 	__u32 type;
 	__u32 protocol;
+	__u32 netns_inum;
 };
 
 #define XDP_PACKET_HEADROOM 256
diff --git a/net/core/filter.c b/net/core/filter.c
index 1969b3f118c1..b2a15402c034 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3118,6 +3118,22 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
 		break;
 #endif
+	case offsetof(struct __sk_buff, netns_inum):
+#ifdef CONFIG_NET_NS
+		/* return dev_net(skb->dev)->ns.inum; */
+		BUILD_BUG_ON(FIELD_SIZEOF(struct net, ns.inum) != 4);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+				      dst_reg, src_reg,
+				      offsetof(struct sk_buff, dev));
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 2);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, dst_reg,
+				      offsetof(struct net_device, nd_net));
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+				      offsetof(struct net, ns.inum));
+#else
+		*insn++ = BPF_MOV64_IMM(dst_reg, 0);
+		break;
+#endif
 	}
 
 	return insn - insn_buf;
@@ -3163,6 +3179,17 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
 		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
 		break;
+	case offsetof(struct bpf_sock, netns_inum):
+#ifdef CONFIG_NET_NS
+		/* return sock_net(sk)->ns.inum; */
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
+				      offsetof(struct sock, sk_net));
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+				      offsetof(struct net, ns.inum));
+#else
+		*insn++ = BPF_MOV64_IMM(dst_reg, 0);
+		break;
+#endif
 	}
 
 	return insn - insn_buf;
diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c
index 533dd11a6baa..6d9073509b45 100644
--- a/samples/bpf/sock_flags_kern.c
+++ b/samples/bpf/sock_flags_kern.c
@@ -9,8 +9,10 @@ SEC("cgroup/sock1")
 int bpf_prog1(struct bpf_sock *sk)
 {
 	char fmt[] = "socket: family %d type %d protocol %d\n";
+	char fmt2[] = "socket: netns_inum %u\n";
 
 	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+	bpf_trace_printk(fmt2, sizeof(fmt2), sk->netns_inum);
 
 	/* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets
 	 * ie., make ping6 fail
diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c
index ed18e9a4909c..d522a70bd661 100644
--- a/samples/bpf/sockex1_kern.c
+++ b/samples/bpf/sockex1_kern.c
@@ -24,6 +24,9 @@ int bpf_prog1(struct __sk_buff *skb)
 	if (value)
 		__sync_fetch_and_add(value, skb->len);
 
+	char fmt[] = "skb %p netns inode %u\n";
+
+	bpf_trace_printk(fmt, sizeof(fmt), skb, skb->netns_inum);
 	return 0;
 }
 char _license[] SEC("license") = "GPL";
-- 
2.8.0

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26  3:27 [PATCH net] bpf: expose netns inode to bpf programs Alexei Starovoitov
@ 2017-01-26  5:46 ` Eric W. Biederman
  2017-01-26  6:00   ` Ying Xue
  2017-01-26  6:23   ` Alexei Starovoitov
  2017-01-31 18:02 ` David Miller
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 35+ messages in thread
From: Eric W. Biederman @ 2017-01-26  5:46 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Andy Lutomirski, Thomas Graf, netdev

Alexei Starovoitov <ast@fb.com> writes:

> in cases where bpf programs are looking at sockets and packets
> that belong to different netns, it could be useful to read netns inode,
> so that programs can make intelligent decisions.
> For example to disallow raw sockets in all non-init netns the program can do:
> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
>   return 0;
> where 0xf0000075 inode comes from /proc/pid/ns/net
>
> Similarly TC cls_bpf/act_bpf and socket filters can do
> if (skb->netns_inum == expected_inode)

Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>

Particularly you need to compare more than the inode number.
Further I have never guaranteed there will be exactly one inode
per network namespace, just that if the device number and the inode
number pair match they are the same.

Beyond that the entire concept is complete rubbish.

The only sane thing is to interpret whatever your bpf program in the
context of the program which installs it.

If you can't do that you have a very broken piece of userspace
interface.  Which appears to be the case here.

Eric

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26  5:46 ` Eric W. Biederman
@ 2017-01-26  6:00   ` Ying Xue
  2017-01-26  6:23   ` Alexei Starovoitov
  1 sibling, 0 replies; 35+ messages in thread
From: Ying Xue @ 2017-01-26  6:00 UTC (permalink / raw)
  To: ying.xue
  Cc: Daniel Borkmann, David Ahern, Tejun Heo, Andy Lutomirski,
	Thomas Graf, netdev

tetst teste tetet tetest tetetttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt

On 01/26/2017 01:46 PM, Eric W. Biederman wrote:
> Alexei Starovoitov <ast@fb.com> writes:
> 
>> in cases where bpf programs are looking at sockets and packets
>> that belong to different netns, it could be useful to read netns inode,
>> so that programs can make intelligent decisions.
>> For example to disallow raw sockets in all non-init netns the program can do:
>> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
>>   return 0;
>> where 0xf0000075 inode comes from /proc/pid/ns/net
>>
>> Similarly TC cls_bpf/act_bpf and socket filters can do
>> if (skb->netns_inum == expected_inode)
> 
> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
> 
> Particularly you need to compare more than the inode number.
> Further I have never guaranteed there will be exactly one inode
> per network namespace, just that if the device number and the inode
> number pair match they are the same.
> 
> Beyond that the entire concept is complete rubbish.
> 
> The only sane thing is to interpret whatever your bpf program in the
> context of the program which installs it.
> 
> If you can't do that you have a very broken piece of userspace
> interface.  Which appears to be the case here.
> 
> Eric
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26  5:46 ` Eric W. Biederman
  2017-01-26  6:00   ` Ying Xue
@ 2017-01-26  6:23   ` Alexei Starovoitov
  2017-01-26 16:37     ` Andy Lutomirski
  1 sibling, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-01-26  6:23 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Andy Lutomirski, Thomas Graf, netdev

On 1/25/17 9:46 PM, Eric W. Biederman wrote:
> Alexei Starovoitov <ast@fb.com> writes:
>
>> in cases where bpf programs are looking at sockets and packets
>> that belong to different netns, it could be useful to read netns inode,
>> so that programs can make intelligent decisions.
>> For example to disallow raw sockets in all non-init netns the program can do:
>> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
>>    return 0;
>> where 0xf0000075 inode comes from /proc/pid/ns/net
>>
>> Similarly TC cls_bpf/act_bpf and socket filters can do
>> if (skb->netns_inum == expected_inode)
>
> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>

I very much value your opinion, but your ack or nack doesn't apply here.
Exposing existing inode has nothing to do with what you maintain.
It's a bpf feature that is exposing already visible to user space id.
Period.
If I was proposing to expose some internal namespace id, then yes,
we'd need to have a discussion. ns.inum is already visible.

> Particularly you need to compare more than the inode number.
> Further I have never guaranteed there will be exactly one inode
> per network namespace, just that if the device number and the inode
> number pair match they are the same.

people already rely on inodes for _all_ namespaces.
The current implementation of
net_ns_net_init->..>ida_get_new is stuck the way it is.
We can change ids, generation algorithm, but uniqueness is
already assumed by user space.

> Beyond that the entire concept is complete rubbish.

care to explain what you think the 'entire concept' is?

> The only sane thing is to interpret whatever your bpf program in the
> context of the program which installs it.

that's impossible. The programs are operating in the context that
is disjoined from the app that installs it.

> If you can't do that you have a very broken piece of userspace
> interface.  Which appears to be the case here.

Call it rubbish, but this is how it is.
cls_bpf is operating on packets. xdp_bpf is operating on raw dma buffers
and there we might need eventually lookup sockets and net namespaces.
Think of bpf programs as safe kernel modules. They don't have
confined boundaries and program authors, if not careful, can shoot
themselves in the foot. We're not trying to prevent that because
it's impossible to check that the program is sane. Just like
it's impossible to check that kernel module is sane.
But in case of bpf we check that bpf program is _safe_ from the kernel
point of view. If it's doing some garbage, it's program's business.
Does it make more sense now?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26  6:23   ` Alexei Starovoitov
@ 2017-01-26 16:37     ` Andy Lutomirski
  2017-01-26 17:46       ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-01-26 16:37 UTC (permalink / raw)
  To: Alexei Starovoitov, Linus Torvalds
  Cc: Eric W. Biederman, David S . Miller, Daniel Borkmann,
	David Ahern, Tejun Heo, Thomas Graf, Network Development

Hi Linus-

Can you weigh in here before we get stuck in a potentially unfortunate place?

On Wed, Jan 25, 2017 at 10:23 PM, Alexei Starovoitov <ast@fb.com> wrote:
> On 1/25/17 9:46 PM, Eric W. Biederman wrote:
>>
>> Alexei Starovoitov <ast@fb.com> writes:
>>

[...]

>>> Similarly TC cls_bpf/act_bpf and socket filters can do
>>> if (skb->netns_inum == expected_inode)
>>
>>
>> Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>
>
>
> I very much value your opinion, but your ack or nack doesn't apply here.

I'm confused.  Eric, the namespace maintainer, says nack.  This is
namespace code regardless of what file it lives in.

>> The only sane thing is to interpret whatever your bpf program in the
>> context of the program which installs it.
>
>
> that's impossible. The programs are operating in the context that
> is disjoined from the app that installs it.
>
>> If you can't do that you have a very broken piece of userspace
>> interface.  Which appears to be the case here.
>
>
> Call it rubbish, but this is how it is.
> cls_bpf is operating on packets. xdp_bpf is operating on raw dma buffers
> and there we might need eventually lookup sockets and net namespaces.
> Think of bpf programs as safe kernel modules. They don't have
> confined boundaries and program authors, if not careful, can shoot
> themselves in the foot. We're not trying to prevent that because
> it's impossible to check that the program is sane. Just like
> it's impossible to check that kernel module is sane.
> But in case of bpf we check that bpf program is _safe_ from the kernel
> point of view. If it's doing some garbage, it's program's business.
> Does it make more sense now?
>

With all due respect, I think this is not an acceptable way to think
about BPF at all.  If you think of BPF this way, I think there needs
to be a real discussion at KS or similar as to whether this is okay.
The reason is simple: the kernel promises a stable ABI to userspace
but not to kernel modules.  By thinking of BPF as more like a module,
you're taking a big shortcut that will either result in ABI breakage
down the road or in committing to a problematic stable ABI.

In fact, this was discussed a bit at KS in the context of tracepoints.
The general consensus seemed to be that tracepoints should not be
considered fully stable in large part because they're a debugging
feature.  But these BPF hooks are very much not a debugging feature.

Concretely, iproute2 git contains an eBPF program.  If that program
were just a "safe kernel module", then it would be totally fine
(expected, even) for future kernel versions to break it outright.  I
don't think this is how it works.

--Andy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26 16:37     ` Andy Lutomirski
@ 2017-01-26 17:46       ` Alexei Starovoitov
  2017-01-26 18:12         ` Andy Lutomirski
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-01-26 17:46 UTC (permalink / raw)
  To: Andy Lutomirski, Linus Torvalds
  Cc: Eric W. Biederman, David S . Miller, Daniel Borkmann,
	David Ahern, Tejun Heo, Thomas Graf, Network Development

On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>> Think of bpf programs as safe kernel modules. They don't have
>> confined boundaries and program authors, if not careful, can shoot
>> themselves in the foot. We're not trying to prevent that because
>> it's impossible to check that the program is sane. Just like
>> it's impossible to check that kernel module is sane.
>> But in case of bpf we check that bpf program is _safe_ from the kernel
>> point of view. If it's doing some garbage, it's program's business.
>> Does it make more sense now?
>>
>
> With all due respect, I think this is not an acceptable way to think
> about BPF at all.  If you think of BPF this way, I think there needs
> to be a real discussion at KS or similar as to whether this is okay.
> The reason is simple: the kernel promises a stable ABI to userspace
> but not to kernel modules.  By thinking of BPF as more like a module,
> you're taking a big shortcut that will either result in ABI breakage
> down the road or in committing to a problematic stable ABI.

you misunderstood the analogy.
bpf abi is certainly stable. that's why we were careful of not
exposing anything to it that is not already stable.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26 17:46       ` Alexei Starovoitov
@ 2017-01-26 18:12         ` Andy Lutomirski
  2017-01-26 18:32           ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-01-26 18:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, David S . Miller,
	Daniel Borkmann, David Ahern, Tejun Heo, Thomas Graf,
	Network Development

On Thu, Jan 26, 2017 at 9:46 AM, Alexei Starovoitov <ast@fb.com> wrote:
> On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>>>
>>> Think of bpf programs as safe kernel modules. They don't have
>>> confined boundaries and program authors, if not careful, can shoot
>>> themselves in the foot. We're not trying to prevent that because
>>> it's impossible to check that the program is sane. Just like
>>> it's impossible to check that kernel module is sane.
>>> But in case of bpf we check that bpf program is _safe_ from the kernel
>>> point of view. If it's doing some garbage, it's program's business.
>>> Does it make more sense now?
>>>
>>
>> With all due respect, I think this is not an acceptable way to think
>> about BPF at all.  If you think of BPF this way, I think there needs
>> to be a real discussion at KS or similar as to whether this is okay.
>> The reason is simple: the kernel promises a stable ABI to userspace
>> but not to kernel modules.  By thinking of BPF as more like a module,
>> you're taking a big shortcut that will either result in ABI breakage
>> down the road or in committing to a problematic stable ABI.
>
>
> you misunderstood the analogy.
> bpf abi is certainly stable. that's why we were careful of not
> exposing anything to it that is not already stable.
>

In that case I don't understand what you're trying to say.  Eric
thinks your patch exposes a bad interface.  A bad interface for
userspace is a very different thing from a bad interface available to
kernel modules.  Are you saying that BPF is kernel-module-like in that
the ABI exposed to BPF programs doesn't need to meet the same quality
standards as userspace ABIs?

--Andy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26 18:12         ` Andy Lutomirski
@ 2017-01-26 18:32           ` Alexei Starovoitov
  2017-01-26 19:07             ` Andy Lutomirski
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-01-26 18:32 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Linus Torvalds, Eric W. Biederman, David S . Miller,
	Daniel Borkmann, David Ahern, Tejun Heo, Thomas Graf,
	Network Development

On 1/26/17 10:12 AM, Andy Lutomirski wrote:
> On Thu, Jan 26, 2017 at 9:46 AM, Alexei Starovoitov <ast@fb.com> wrote:
>> On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>>>>
>>>> Think of bpf programs as safe kernel modules. They don't have
>>>> confined boundaries and program authors, if not careful, can shoot
>>>> themselves in the foot. We're not trying to prevent that because
>>>> it's impossible to check that the program is sane. Just like
>>>> it's impossible to check that kernel module is sane.
>>>> But in case of bpf we check that bpf program is _safe_ from the kernel
>>>> point of view. If it's doing some garbage, it's program's business.
>>>> Does it make more sense now?
>>>>
>>>
>>> With all due respect, I think this is not an acceptable way to think
>>> about BPF at all.  If you think of BPF this way, I think there needs
>>> to be a real discussion at KS or similar as to whether this is okay.
>>> The reason is simple: the kernel promises a stable ABI to userspace
>>> but not to kernel modules.  By thinking of BPF as more like a module,
>>> you're taking a big shortcut that will either result in ABI breakage
>>> down the road or in committing to a problematic stable ABI.
>>
>>
>> you misunderstood the analogy.
>> bpf abi is certainly stable. that's why we were careful of not
>> exposing anything to it that is not already stable.
>>
>
> In that case I don't understand what you're trying to say.  Eric
> thinks your patch exposes a bad interface.  A bad interface for
> userspace is a very different thing from a bad interface available to
> kernel modules.  Are you saying that BPF is kernel-module-like in that
> the ABI exposed to BPF programs doesn't need to meet the same quality
> standards as userspace ABIs?

of course not.
ns.inum is already exposed to user space as a value.
This patch exposes it to bpf program in a convenient and stable way,
therefore I don't see why it's a big deal to you and Eric and why it
has anything to do with namespaces in general. It doesn't change
any existing behavior and doesn't impose any new restrictions.
Like ns.inum can be moved around. User space visible field
'netns_inum' is a shadow of kernel field. Only 'netns_inum'
has to be stable and that is my headache.

The kernel module analogy is an attempt to explain that programs
can do insane things.
Like the user can create a socket attach a program to it, change
netns, create another socket and attach the same program.
Inside the program it can do 'if (skb->ifindex == xxx)'.
This would be nonsensical program, since ifindex is obviously scoped
by netns and comparing ifindex without regard to netns is bogus.
But kernel cannot prevent users to write such programs.
Hence the kernel module analogy: the kernel cannot prevent
nonsensical modules.

With this patch the user will be able to do
if (skb->netns_inum == ... && skb->ifindex == ...)
which would be more sane thing to do, but without appropriate
control plane, it's also nonsensical, since netns inode and
dev ifindex can disappear while the program is running.
We obviously don't want to pin net_devices and netns-es for the program.
It would be debugging nightmare. Therefore the user has to write
the program understanding all this.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26 18:32           ` Alexei Starovoitov
@ 2017-01-26 19:07             ` Andy Lutomirski
  2017-01-26 19:25               ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-01-26 19:07 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eric W. Biederman, David S . Miller,
	Daniel Borkmann, David Ahern, Tejun Heo, Thomas Graf,
	Network Development

On Thu, Jan 26, 2017 at 10:32 AM, Alexei Starovoitov <ast@fb.com> wrote:
> On 1/26/17 10:12 AM, Andy Lutomirski wrote:
>>
>> On Thu, Jan 26, 2017 at 9:46 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>>
>>> On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>>>>>
>>>>>
>>>>> Think of bpf programs as safe kernel modules. They don't have
>>>>> confined boundaries and program authors, if not careful, can shoot
>>>>> themselves in the foot. We're not trying to prevent that because
>>>>> it's impossible to check that the program is sane. Just like
>>>>> it's impossible to check that kernel module is sane.
>>>>> But in case of bpf we check that bpf program is _safe_ from the kernel
>>>>> point of view. If it's doing some garbage, it's program's business.
>>>>> Does it make more sense now?
>>>>>
>>>>
>>>> With all due respect, I think this is not an acceptable way to think
>>>> about BPF at all.  If you think of BPF this way, I think there needs
>>>> to be a real discussion at KS or similar as to whether this is okay.
>>>> The reason is simple: the kernel promises a stable ABI to userspace
>>>> but not to kernel modules.  By thinking of BPF as more like a module,
>>>> you're taking a big shortcut that will either result in ABI breakage
>>>> down the road or in committing to a problematic stable ABI.
>>>
>>>
>>>
>>> you misunderstood the analogy.
>>> bpf abi is certainly stable. that's why we were careful of not
>>> exposing anything to it that is not already stable.
>>>
>>
>> In that case I don't understand what you're trying to say.  Eric
>> thinks your patch exposes a bad interface.  A bad interface for
>> userspace is a very different thing from a bad interface available to
>> kernel modules.  Are you saying that BPF is kernel-module-like in that
>> the ABI exposed to BPF programs doesn't need to meet the same quality
>> standards as userspace ABIs?
>
>
> of course not.
> ns.inum is already exposed to user space as a value.
> This patch exposes it to bpf program in a convenient and stable way,

Here's what I'm imaging Eric is thinking:

ns.inum is currently exposed to userspace via procfs.  In principle,
the value could be local to a namespace, though, which would enable
CRIU to be able to preserve namespace inode numbers across a
checkpoint+restore operation.  If this happened, the contained and
restored procfs would see a different inode number than the outermost
procfs.

If you start exposing the raw ns.inum field to BPF programs and those
programs are not themselves scoped to a namespace, then this could
create a problem for CRIU.

But you told Eric that his nack doesn't matter, and maybe it would be
nice to ask him to clarify instead.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26 19:07             ` Andy Lutomirski
@ 2017-01-26 19:25               ` Alexei Starovoitov
  2017-02-03  4:33                 ` Eric W. Biederman
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-01-26 19:25 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Linus Torvalds, Eric W. Biederman, David S . Miller,
	Daniel Borkmann, David Ahern, Tejun Heo, Thomas Graf,
	Network Development

On 1/26/17 11:07 AM, Andy Lutomirski wrote:
> On Thu, Jan 26, 2017 at 10:32 AM, Alexei Starovoitov <ast@fb.com> wrote:
>> On 1/26/17 10:12 AM, Andy Lutomirski wrote:
>>>
>>> On Thu, Jan 26, 2017 at 9:46 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>>>
>>>> On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>>>>>>
>>>>>>
>>>>>> Think of bpf programs as safe kernel modules. They don't have
>>>>>> confined boundaries and program authors, if not careful, can shoot
>>>>>> themselves in the foot. We're not trying to prevent that because
>>>>>> it's impossible to check that the program is sane. Just like
>>>>>> it's impossible to check that kernel module is sane.
>>>>>> But in case of bpf we check that bpf program is _safe_ from the kernel
>>>>>> point of view. If it's doing some garbage, it's program's business.
>>>>>> Does it make more sense now?
>>>>>>
>>>>>
>>>>> With all due respect, I think this is not an acceptable way to think
>>>>> about BPF at all.  If you think of BPF this way, I think there needs
>>>>> to be a real discussion at KS or similar as to whether this is okay.
>>>>> The reason is simple: the kernel promises a stable ABI to userspace
>>>>> but not to kernel modules.  By thinking of BPF as more like a module,
>>>>> you're taking a big shortcut that will either result in ABI breakage
>>>>> down the road or in committing to a problematic stable ABI.
>>>>
>>>>
>>>>
>>>> you misunderstood the analogy.
>>>> bpf abi is certainly stable. that's why we were careful of not
>>>> exposing anything to it that is not already stable.
>>>>
>>>
>>> In that case I don't understand what you're trying to say.  Eric
>>> thinks your patch exposes a bad interface.  A bad interface for
>>> userspace is a very different thing from a bad interface available to
>>> kernel modules.  Are you saying that BPF is kernel-module-like in that
>>> the ABI exposed to BPF programs doesn't need to meet the same quality
>>> standards as userspace ABIs?
>>
>>
>> of course not.
>> ns.inum is already exposed to user space as a value.
>> This patch exposes it to bpf program in a convenient and stable way,
>
> Here's what I'm imaging Eric is thinking:
>
> ns.inum is currently exposed to userspace via procfs.  In principle,
> the value could be local to a namespace, though, which would enable
> CRIU to be able to preserve namespace inode numbers across a
> checkpoint+restore operation.  If this happened, the contained and
> restored procfs would see a different inode number than the outermost
> procfs.

sure. there are many different ways for the program to see inode
that either was already reused or disappeared.
What I'm saying that it is expected. We cannot prevent that from
bpf side. Just like ifindex value read by the program can be bogus
as in the example I just provided.

> If you start exposing the raw ns.inum field to BPF programs and those
> programs are not themselves scoped to a namespace, then this could
> create a problem for CRIU.

criu doesn't support ebpf because maps are not snapshot-able and
programs are detached from the control plane. I cannot see how one can
criu of xdp or cls program. The ssh connection to the box might die in
the middle while criu is messing with unknown. Hence the analogy to
the kernel modules. Imagine a set of mini-kernel modules and a set
of apps that depend on them. What kind of criu can we even talk about?

> But you told Eric that his nack doesn't matter, and maybe it would be
> nice to ask him to clarify instead.

Fair enough. Eric, thoughts?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26  3:27 [PATCH net] bpf: expose netns inode to bpf programs Alexei Starovoitov
  2017-01-26  5:46 ` Eric W. Biederman
@ 2017-01-31 18:02 ` David Miller
  2017-01-31 22:11 ` David Ahern
  2017-02-03 21:56 ` Daniel Borkmann
  3 siblings, 0 replies; 35+ messages in thread
From: David Miller @ 2017-01-31 18:02 UTC (permalink / raw)
  To: ast; +Cc: daniel, dsa, tj, luto, ebiederm, tgraf, netdev


Eric, you cannot just stay silent on this thread for days at a time.

Alexei has sought your feedback in his latest post in this thread,
and your response is holding the entire discussion up.

Do not just give a terse response, which will just trigger Alexei
asking for more clarification.  Put time and effort into it so we
can move forward.

Thank you.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26  3:27 [PATCH net] bpf: expose netns inode to bpf programs Alexei Starovoitov
  2017-01-26  5:46 ` Eric W. Biederman
  2017-01-31 18:02 ` David Miller
@ 2017-01-31 22:11 ` David Ahern
  2017-02-03 21:56 ` Daniel Borkmann
  3 siblings, 0 replies; 35+ messages in thread
From: David Ahern @ 2017-01-31 22:11 UTC (permalink / raw)
  To: Alexei Starovoitov, David S . Miller
  Cc: Daniel Borkmann, Tejun Heo, Andy Lutomirski, Eric W . Biederman,
	Thomas Graf, netdev

On 1/25/17 8:27 PM, Alexei Starovoitov wrote:
> in cases where bpf programs are looking at sockets and packets
> that belong to different netns, it could be useful to read netns inode,
> so that programs can make intelligent decisions.
> For example to disallow raw sockets in all non-init netns the program can do:
> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
>   return 0;
> where 0xf0000075 inode comes from /proc/pid/ns/net
> 
> Similarly TC cls_bpf/act_bpf and socket filters can do
> if (skb->netns_inum == expected_inode)
> 
> The lack of netns awareness was a concern even for socket filters,
> since the application can attach the same bpf program to sockets
> in a different netns. Just like tc cls_bpf program can work in
> different netns as well, so it has to be addressed uniformly
> across all types of bpf programs.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
> with corresponding change in 'ip vrf' that David Ahern is working on,
> this will address 'malfunction' concern that Andy discovered in 'ip vrf',
> hence this fix is needed for 'net'.

FWIW, the iproute2 patch (along with a few other namespace related fixups) can be found here:

    https://github.com/dsahern/iproute2

vrf/ip-vrf branch.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26 19:25               ` Alexei Starovoitov
@ 2017-02-03  4:33                 ` Eric W. Biederman
  2017-02-03  6:05                   ` Alexei Starovoitov
  2017-02-03 21:00                   ` Andy Lutomirski
  0 siblings, 2 replies; 35+ messages in thread
From: Eric W. Biederman @ 2017-02-03  4:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Linus Torvalds, David S . Miller,
	Daniel Borkmann, David Ahern, Tejun Heo, Thomas Graf,
	Network Development

Alexei Starovoitov <ast@fb.com> writes:

> On 1/26/17 11:07 AM, Andy Lutomirski wrote:
>> On Thu, Jan 26, 2017 at 10:32 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>> On 1/26/17 10:12 AM, Andy Lutomirski wrote:
>>>>
>>>> On Thu, Jan 26, 2017 at 9:46 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>>>>
>>>>> On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>>>>>>>
>>>>>>>
>>>>>>> Think of bpf programs as safe kernel modules. They don't have
>>>>>>> confined boundaries and program authors, if not careful, can shoot
>>>>>>> themselves in the foot. We're not trying to prevent that because
>>>>>>> it's impossible to check that the program is sane. Just like
>>>>>>> it's impossible to check that kernel module is sane.
>>>>>>> But in case of bpf we check that bpf program is _safe_ from the kernel
>>>>>>> point of view. If it's doing some garbage, it's program's business.
>>>>>>> Does it make more sense now?
>>>>>>>
>>>>>>
>>>>>> With all due respect, I think this is not an acceptable way to think
>>>>>> about BPF at all.  If you think of BPF this way, I think there needs
>>>>>> to be a real discussion at KS or similar as to whether this is okay.
>>>>>> The reason is simple: the kernel promises a stable ABI to userspace
>>>>>> but not to kernel modules.  By thinking of BPF as more like a module,
>>>>>> you're taking a big shortcut that will either result in ABI breakage
>>>>>> down the road or in committing to a problematic stable ABI.
>>>>>
>>>>>
>>>>>
>>>>> you misunderstood the analogy.
>>>>> bpf abi is certainly stable. that's why we were careful of not
>>>>> exposing anything to it that is not already stable.
>>>>>
>>>>
>>>> In that case I don't understand what you're trying to say.  Eric
>>>> thinks your patch exposes a bad interface.  A bad interface for
>>>> userspace is a very different thing from a bad interface available to
>>>> kernel modules.  Are you saying that BPF is kernel-module-like in that
>>>> the ABI exposed to BPF programs doesn't need to meet the same quality
>>>> standards as userspace ABIs?
>>>
>>>
>>> of course not.
>>> ns.inum is already exposed to user space as a value.
>>> This patch exposes it to bpf program in a convenient and stable way,
>>
>> Here's what I'm imaging Eric is thinking:
>>
>> ns.inum is currently exposed to userspace via procfs.  In principle,
>> the value could be local to a namespace, though, which would enable
>> CRIU to be able to preserve namespace inode numbers across a
>> checkpoint+restore operation.  If this happened, the contained and
>> restored procfs would see a different inode number than the outermost
>> procfs.
>
> sure. there are many different ways for the program to see inode
> that either was already reused or disappeared.
> What I'm saying that it is expected. We cannot prevent that from
> bpf side. Just like ifindex value read by the program can be bogus
> as in the example I just provided.

The point is that we can make the inode number stable across migration
and the user space API for namespaces has been designed with that
possibility in mind.

What you have proposed is the equivalent of reporting a file name, and
instead of reporting /dir1/file1 /dir2/file1 just reporting file1 for
both cases.

That is problematic.

It doesn't matter that eBPF and CRIU do not mix.  When we implement
migration of the namespace file descriptors and can move them from
one system to another preserving the device number and inode number
so that criu of other parts of userspace can function better there will
be a problem.  There is not one unique inode number per namespace and
the proposed interface in your eBPF programs is broken.

I don't know when inode numbers are going to be the bottleneck we decide
to make migratable to make CRIU work better but things have been
designed and maintained very carefully so that we can do that.

Inode numbers are in the namespace of the filesystem they reside in.

>> But you told Eric that his nack doesn't matter, and maybe it would be
>> nice to ask him to clarify instead.
>
> Fair enough. Eric, thoughts?

In very short terms exporting just the inode number would require
implementing a namespace of namespaces, and that is NOT happening.
We are not going to design our kernel interfaces so badly that we need
to do that.

At a bare minimum you need to export the device number of the filesystem
as well as the inode number.

My expectation would be that now you are starting to look at concepts
that are namespaced the way you would proceed would be to associate a
full set of namespaces with your ebpf program.  Those namespaces would
come from the submitter of your ebpf program.  Namespaced values
would be in the terms of your associated namespaces.

That keeps things working the way userspace would expect.

The easy way to build such an association is to not allow your
contextless ebpf programs from being submitted to kernel in anything
other than the initial set of namespaces.

But please assume all global identifiers are namespaced.  If they aren't
that needs to be fixed because not having them namespaced will break
process migration at some point.

In short the fix here is to export both the inode number the device
number.  That is what it takes to uniquely identify a file.  It would be
good if you went farther and limited your contextless ebpf programs to
only being installed by programs in the initial set of namespaces.

Does that make things clearer?

Eric

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03  4:33                 ` Eric W. Biederman
@ 2017-02-03  6:05                   ` Alexei Starovoitov
  2017-02-03 10:30                     ` Eric W. Biederman
  2017-02-03 21:00                   ` Andy Lutomirski
  1 sibling, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-03  6:05 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexei Starovoitov, Andy Lutomirski, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Fri, Feb 03, 2017 at 05:33:45PM +1300, Eric W. Biederman wrote:
> 
> The point is that we can make the inode number stable across migration
> and the user space API for namespaces has been designed with that
> possibility in mind.
> 
> What you have proposed is the equivalent of reporting a file name, and
> instead of reporting /dir1/file1 /dir2/file1 just reporting file1 for
> both cases.
> 
> That is problematic.
> 
> It doesn't matter that eBPF and CRIU do not mix.  When we implement
> migration of the namespace file descriptors and can move them from
> one system to another preserving the device number and inode number
> so that criu of other parts of userspace can function better there will
> be a problem.  There is not one unique inode number per namespace and
> the proposed interface in your eBPF programs is broken.
> 
> I don't know when inode numbers are going to be the bottleneck we decide
> to make migratable to make CRIU work better but things have been
> designed and maintained very carefully so that we can do that.
> 
> Inode numbers are in the namespace of the filesystem they reside in.

I saw that iproute2 is doing:
  if ((st.st_dev == netst.st_dev) &&
      (st.st_ino == netst.st_ino)) {
but proc_alloc_inum() is using global ida,
so I figured that iproute2 extra st_dev check must have been obsolete.
So the long term plan is to make /proc to be namespace-aware?
That's fair. In such case exposing inode only will
lead to wrong assumptions.

> >> But you told Eric that his nack doesn't matter, and maybe it would be
> >> nice to ask him to clarify instead.
> >
> > Fair enough. Eric, thoughts?
> 
> In very short terms exporting just the inode number would require
> implementing a namespace of namespaces, and that is NOT happening.
> We are not going to design our kernel interfaces so badly that we need
> to do that.
> 
> At a bare minimum you need to export the device number of the filesystem
> as well as the inode number.

Agree. Will do.

> My expectation would be that now you are starting to look at concepts
> that are namespaced the way you would proceed would be to associate a
> full set of namespaces with your ebpf program.  Those namespaces would
> come from the submitter of your ebpf program.  Namespaced values
> would be in the terms of your associated namespaces.
> 
> That keeps things working the way userspace would expect.
> 
> The easy way to build such an association is to not allow your
> contextless ebpf programs from being submitted to kernel in anything
> other than the initial set of namespaces.
> 
> But please assume all global identifiers are namespaced.  If they aren't
> that needs to be fixed because not having them namespaced will break
> process migration at some point.
> 
> In short the fix here is to export both the inode number the device
> number.  That is what it takes to uniquely identify a file.  It would be

Agree. Will respin.

> good if you went farther and limited your contextless ebpf programs to
> only being installed by programs in the initial set of namespaces.

you mean to limit to init_net only? This might break existing users.

> Does that make things clearer?

yep. thanks for the feedback.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03  6:05                   ` Alexei Starovoitov
@ 2017-02-03 10:30                     ` Eric W. Biederman
  0 siblings, 0 replies; 35+ messages in thread
From: Eric W. Biederman @ 2017-02-03 10:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Andy Lutomirski, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Fri, Feb 03, 2017 at 05:33:45PM +1300, Eric W. Biederman wrote:
>> 
>> The point is that we can make the inode number stable across migration
>> and the user space API for namespaces has been designed with that
>> possibility in mind.
>> 
>> What you have proposed is the equivalent of reporting a file name, and
>> instead of reporting /dir1/file1 /dir2/file1 just reporting file1 for
>> both cases.
>> 
>> That is problematic.
>> 
>> It doesn't matter that eBPF and CRIU do not mix.  When we implement
>> migration of the namespace file descriptors and can move them from
>> one system to another preserving the device number and inode number
>> so that criu of other parts of userspace can function better there will
>> be a problem.  There is not one unique inode number per namespace and
>> the proposed interface in your eBPF programs is broken.
>> 
>> I don't know when inode numbers are going to be the bottleneck we decide
>> to make migratable to make CRIU work better but things have been
>> designed and maintained very carefully so that we can do that.
>> 
>> Inode numbers are in the namespace of the filesystem they reside in.
>
> I saw that iproute2 is doing:
>   if ((st.st_dev == netst.st_dev) &&
>       (st.st_ino == netst.st_ino)) {
> but proc_alloc_inum() is using global ida,
> so I figured that iproute2 extra st_dev check must have been obsolete.
> So the long term plan is to make /proc to be namespace-aware?

The long term plan is to make /proc more namespace aware.  It is pretty
strongly namespace aware in several senses.  Of course when those things
are well executed you don't see them in userspace so they are easy to
over look.

> That's fair. In such case exposing inode only will
> lead to wrong assumptions.

Exactly.

>> >> But you told Eric that his nack doesn't matter, and maybe it would be
>> >> nice to ask him to clarify instead.
>> >
>> > Fair enough. Eric, thoughts?
>> 
>> In very short terms exporting just the inode number would require
>> implementing a namespace of namespaces, and that is NOT happening.
>> We are not going to design our kernel interfaces so badly that we need
>> to do that.
>> 
>> At a bare minimum you need to export the device number of the filesystem
>> as well as the inode number.
>
> Agree. Will do.
>
>> My expectation would be that now you are starting to look at concepts
>> that are namespaced the way you would proceed would be to associate a
>> full set of namespaces with your ebpf program.  Those namespaces would
>> come from the submitter of your ebpf program.  Namespaced values
>> would be in the terms of your associated namespaces.
>> 
>> That keeps things working the way userspace would expect.
>> 
>> The easy way to build such an association is to not allow your
>> contextless ebpf programs from being submitted to kernel in anything
>> other than the initial set of namespaces.
>> 
>> But please assume all global identifiers are namespaced.  If they aren't
>> that needs to be fixed because not having them namespaced will break
>> process migration at some point.
>> 
>> In short the fix here is to export both the inode number the device
>> number.  That is what it takes to uniquely identify a file.  It would be
>
> Agree. Will respin.
>
>> good if you went farther and limited your contextless ebpf programs to
>> only being installed by programs in the initial set of namespaces.
>
> you mean to limit to init_net only? This might break existing users.

And for proc/pid_namespace things like the device and inum the initial
pid namespace.

I expect you can limit yourself to fields that are namespace specific
before adding such a requirement in which case that will avoid breaking
existing users.

>> Does that make things clearer?
>
> yep. thanks for the feedback.

Welcome.

Eric

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03  4:33                 ` Eric W. Biederman
  2017-02-03  6:05                   ` Alexei Starovoitov
@ 2017-02-03 21:00                   ` Andy Lutomirski
  2017-02-03 21:06                     ` Eric W. Biederman
  2017-02-03 23:08                     ` Alexei Starovoitov
  1 sibling, 2 replies; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-03 21:00 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexei Starovoitov, Linus Torvalds, David S . Miller,
	Daniel Borkmann, David Ahern, Tejun Heo, Thomas Graf,
	Network Development

On Thu, Feb 2, 2017 at 8:33 PM, Eric W. Biederman <ebiederm@xmission.com> wrote:
> Alexei Starovoitov <ast@fb.com> writes:
>
>> On 1/26/17 11:07 AM, Andy Lutomirski wrote:
>>> On Thu, Jan 26, 2017 at 10:32 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>>> On 1/26/17 10:12 AM, Andy Lutomirski wrote:
>>>>>
>>>>> On Thu, Jan 26, 2017 at 9:46 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>>>>>
>>>>>> On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> Think of bpf programs as safe kernel modules. They don't have
>>>>>>>> confined boundaries and program authors, if not careful, can shoot
>>>>>>>> themselves in the foot. We're not trying to prevent that because
>>>>>>>> it's impossible to check that the program is sane. Just like
>>>>>>>> it's impossible to check that kernel module is sane.
>>>>>>>> But in case of bpf we check that bpf program is _safe_ from the kernel
>>>>>>>> point of view. If it's doing some garbage, it's program's business.
>>>>>>>> Does it make more sense now?
>>>>>>>>
>>>>>>>
>>>>>>> With all due respect, I think this is not an acceptable way to think
>>>>>>> about BPF at all.  If you think of BPF this way, I think there needs
>>>>>>> to be a real discussion at KS or similar as to whether this is okay.
>>>>>>> The reason is simple: the kernel promises a stable ABI to userspace
>>>>>>> but not to kernel modules.  By thinking of BPF as more like a module,
>>>>>>> you're taking a big shortcut that will either result in ABI breakage
>>>>>>> down the road or in committing to a problematic stable ABI.
>>>>>>
>>>>>>
>>>>>>
>>>>>> you misunderstood the analogy.
>>>>>> bpf abi is certainly stable. that's why we were careful of not
>>>>>> exposing anything to it that is not already stable.
>>>>>>
>>>>>
>>>>> In that case I don't understand what you're trying to say.  Eric
>>>>> thinks your patch exposes a bad interface.  A bad interface for
>>>>> userspace is a very different thing from a bad interface available to
>>>>> kernel modules.  Are you saying that BPF is kernel-module-like in that
>>>>> the ABI exposed to BPF programs doesn't need to meet the same quality
>>>>> standards as userspace ABIs?
>>>>
>>>>
>>>> of course not.
>>>> ns.inum is already exposed to user space as a value.
>>>> This patch exposes it to bpf program in a convenient and stable way,
>>>
>>> Here's what I'm imaging Eric is thinking:
>>>
>>> ns.inum is currently exposed to userspace via procfs.  In principle,
>>> the value could be local to a namespace, though, which would enable
>>> CRIU to be able to preserve namespace inode numbers across a
>>> checkpoint+restore operation.  If this happened, the contained and
>>> restored procfs would see a different inode number than the outermost
>>> procfs.
>>
>> sure. there are many different ways for the program to see inode
>> that either was already reused or disappeared.
>> What I'm saying that it is expected. We cannot prevent that from
>> bpf side. Just like ifindex value read by the program can be bogus
>> as in the example I just provided.
>
> The point is that we can make the inode number stable across migration
> and the user space API for namespaces has been designed with that
> possibility in mind.

How does it help if BPF starts exposing both inode number and device number?

ISTM any ability to migrate namespaces and to migrate eBPF programs
that know about namespaces needs to have the eBPF program firmly
rooted in some namespace (or perhaps cgroup in this case) so that it
can see a namespaced view of the world.  For this to work, presumably
we need to make sure that eBPF programs that are installed by programs
that are in a container don't see traffic that isn't in that
container.  This is part of why I think that we should consider
preventing programs that aren't in the root namespace (perhaps *all*
the root namespaces) from installing bpf+cgroup programs in the first
place until there's a clearer understanding of how this all fits
together.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03 21:00                   ` Andy Lutomirski
@ 2017-02-03 21:06                     ` Eric W. Biederman
  2017-02-03 23:08                     ` Alexei Starovoitov
  1 sibling, 0 replies; 35+ messages in thread
From: Eric W. Biederman @ 2017-02-03 21:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Linus Torvalds, David S . Miller,
	Daniel Borkmann, David Ahern, Tejun Heo, Thomas Graf,
	Network Development

Andy Lutomirski <luto@amacapital.net> writes:

> On Thu, Feb 2, 2017 at 8:33 PM, Eric W. Biederman <ebiederm@xmission.com> wrote:
>> Alexei Starovoitov <ast@fb.com> writes:
>>
>>> On 1/26/17 11:07 AM, Andy Lutomirski wrote:
>>>> On Thu, Jan 26, 2017 at 10:32 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>>>> On 1/26/17 10:12 AM, Andy Lutomirski wrote:
>>>>>>
>>>>>> On Thu, Jan 26, 2017 at 9:46 AM, Alexei Starovoitov <ast@fb.com> wrote:
>>>>>>>
>>>>>>> On 1/26/17 8:37 AM, Andy Lutomirski wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Think of bpf programs as safe kernel modules. They don't have
>>>>>>>>> confined boundaries and program authors, if not careful, can shoot
>>>>>>>>> themselves in the foot. We're not trying to prevent that because
>>>>>>>>> it's impossible to check that the program is sane. Just like
>>>>>>>>> it's impossible to check that kernel module is sane.
>>>>>>>>> But in case of bpf we check that bpf program is _safe_ from the kernel
>>>>>>>>> point of view. If it's doing some garbage, it's program's business.
>>>>>>>>> Does it make more sense now?
>>>>>>>>>
>>>>>>>>
>>>>>>>> With all due respect, I think this is not an acceptable way to think
>>>>>>>> about BPF at all.  If you think of BPF this way, I think there needs
>>>>>>>> to be a real discussion at KS or similar as to whether this is okay.
>>>>>>>> The reason is simple: the kernel promises a stable ABI to userspace
>>>>>>>> but not to kernel modules.  By thinking of BPF as more like a module,
>>>>>>>> you're taking a big shortcut that will either result in ABI breakage
>>>>>>>> down the road or in committing to a problematic stable ABI.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> you misunderstood the analogy.
>>>>>>> bpf abi is certainly stable. that's why we were careful of not
>>>>>>> exposing anything to it that is not already stable.
>>>>>>>
>>>>>>
>>>>>> In that case I don't understand what you're trying to say.  Eric
>>>>>> thinks your patch exposes a bad interface.  A bad interface for
>>>>>> userspace is a very different thing from a bad interface available to
>>>>>> kernel modules.  Are you saying that BPF is kernel-module-like in that
>>>>>> the ABI exposed to BPF programs doesn't need to meet the same quality
>>>>>> standards as userspace ABIs?
>>>>>
>>>>>
>>>>> of course not.
>>>>> ns.inum is already exposed to user space as a value.
>>>>> This patch exposes it to bpf program in a convenient and stable way,
>>>>
>>>> Here's what I'm imaging Eric is thinking:
>>>>
>>>> ns.inum is currently exposed to userspace via procfs.  In principle,
>>>> the value could be local to a namespace, though, which would enable
>>>> CRIU to be able to preserve namespace inode numbers across a
>>>> checkpoint+restore operation.  If this happened, the contained and
>>>> restored procfs would see a different inode number than the outermost
>>>> procfs.
>>>
>>> sure. there are many different ways for the program to see inode
>>> that either was already reused or disappeared.
>>> What I'm saying that it is expected. We cannot prevent that from
>>> bpf side. Just like ifindex value read by the program can be bogus
>>> as in the example I just provided.
>>
>> The point is that we can make the inode number stable across migration
>> and the user space API for namespaces has been designed with that
>> possibility in mind.
>
> How does it help if BPF starts exposing both inode number and device
> number?

Adding the device number comparison helps in that it is explicit what is
being compared against.  That gives me at least a bit of a namespace
for the namespaces, and a program from a sufficiently wrong context will
have it's comparisons fail rather than having a match.

I think the operation that is exported in the BPF should be a full
comparison operation of device and inode number so that it could be
optimized/compiled to something else depending upon the context.

AKA the compilation of the bpf program would have the opportunity to
remove the namespace dependency and make the program work in a global
context.  So we don't have to carry namespace information around at run
time.

> ISTM any ability to migrate namespaces and to migrate eBPF programs
> that know about namespaces needs to have the eBPF program firmly
> rooted in some namespace (or perhaps cgroup in this case) so that it
> can see a namespaced view of the world.  For this to work, presumably
> we need to make sure that eBPF programs that are installed by programs
> that are in a container don't see traffic that isn't in that
> container.  This is part of why I think that we should consider
> preventing programs that aren't in the root namespace (perhaps *all*
> the root namespaces) from installing bpf+cgroup programs in the first
> place until there's a clearer understanding of how this all fits
> together.

Andy I agree.  At least to the point those programs are
reading attributes that are in a namespace.  Something that should be
straight forward to verify in the bpf checker when installing the
program.

Eric

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-01-26  3:27 [PATCH net] bpf: expose netns inode to bpf programs Alexei Starovoitov
                   ` (2 preceding siblings ...)
  2017-01-31 22:11 ` David Ahern
@ 2017-02-03 21:56 ` Daniel Borkmann
  2017-02-03 23:06   ` Alexei Starovoitov
  3 siblings, 1 reply; 35+ messages in thread
From: Daniel Borkmann @ 2017-02-03 21:56 UTC (permalink / raw)
  To: Alexei Starovoitov, David S . Miller
  Cc: David Ahern, Tejun Heo, Andy Lutomirski, Eric W . Biederman,
	Thomas Graf, netdev

On 01/26/2017 04:27 AM, Alexei Starovoitov wrote:
> in cases where bpf programs are looking at sockets and packets
> that belong to different netns, it could be useful to read netns inode,
> so that programs can make intelligent decisions.
> For example to disallow raw sockets in all non-init netns the program can do:
> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
>    return 0;
> where 0xf0000075 inode comes from /proc/pid/ns/net
>
> Similarly TC cls_bpf/act_bpf and socket filters can do
> if (skb->netns_inum == expected_inode)
>
> The lack of netns awareness was a concern even for socket filters,
> since the application can attach the same bpf program to sockets
> in a different netns. Just like tc cls_bpf program can work in
> different netns as well, so it has to be addressed uniformly
> across all types of bpf programs.

Sorry for jumping in late, but my question is, isn't this helper
really only relevant for BPF_PROG_TYPE_CGROUP_* typed programs?
Thus other prog types making use of bpf_convert_ctx_access()
should probably reject that in .is_valid_access() callback?

Reason why I'm asking is that for sockets or tc progs, you
already have a netns context where you're attached to, and f.e.
skbs leaving that netns context will be orphaned. Thus, why
would tc or sock filter tailor a program with such a check,
if it can only match/mismatch its own netns inum eventually?
When making this effort to lookup and hardcode the dev/inode
num into the prog, wouldn't it be easier for these types if
the managing app that loads these progs tailors the progs for
a given netns directly, so also such runtime check can generally
be avoided? Am I missing something wrt 'concerns'? The cgroup
ones are global, so there I can see that it could be used in
some way f.e. to restrict access, account, etc.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03 21:56 ` Daniel Borkmann
@ 2017-02-03 23:06   ` Alexei Starovoitov
  2017-02-03 23:42     ` Daniel Borkmann
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-03 23:06 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, David S . Miller, David Ahern, Tejun Heo,
	Andy Lutomirski, Eric W . Biederman, Thomas Graf, netdev

On Fri, Feb 03, 2017 at 10:56:43PM +0100, Daniel Borkmann wrote:
> On 01/26/2017 04:27 AM, Alexei Starovoitov wrote:
> >in cases where bpf programs are looking at sockets and packets
> >that belong to different netns, it could be useful to read netns inode,
> >so that programs can make intelligent decisions.
> >For example to disallow raw sockets in all non-init netns the program can do:
> >if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
> >   return 0;
> >where 0xf0000075 inode comes from /proc/pid/ns/net
> >
> >Similarly TC cls_bpf/act_bpf and socket filters can do
> >if (skb->netns_inum == expected_inode)
> >
> >The lack of netns awareness was a concern even for socket filters,
> >since the application can attach the same bpf program to sockets
> >in a different netns. Just like tc cls_bpf program can work in
> >different netns as well, so it has to be addressed uniformly
> >across all types of bpf programs.
> 
> Sorry for jumping in late, but my question is, isn't this helper
> really only relevant for BPF_PROG_TYPE_CGROUP_* typed programs?
> Thus other prog types making use of bpf_convert_ctx_access()
> should probably reject that in .is_valid_access() callback?
> 
> Reason why I'm asking is that for sockets or tc progs, you
> already have a netns context where you're attached to, and f.e.
> skbs leaving that netns context will be orphaned. Thus, why
> would tc or sock filter tailor a program with such a check,
> if it can only match/mismatch its own netns inum eventually?

Please see the example I provided earlier.
We can have the same cls_bpf attached to all netns-es.
Same for socket filters and everything else.
All bpf programs are global.
They can all share info via maps and so on.

> When making this effort to lookup and hardcode the dev/inode
> num into the prog, wouldn't it be easier for these types if

we cannot hardcode dev/inode. They are dynamic and depends
where program runs.
I'll send a patch shortly that exposes both.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03 21:00                   ` Andy Lutomirski
  2017-02-03 21:06                     ` Eric W. Biederman
@ 2017-02-03 23:08                     ` Alexei Starovoitov
  2017-02-04 17:07                       ` Andy Lutomirski
  1 sibling, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-03 23:08 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Fri, Feb 03, 2017 at 01:00:47PM -0800, Andy Lutomirski wrote:
> 
> ISTM any ability to migrate namespaces and to migrate eBPF programs
> that know about namespaces needs to have the eBPF program firmly
> rooted in some namespace (or perhaps cgroup in this case) so that it

programs are already global. We cannot break that.

> can see a namespaced view of the world.  For this to work, presumably
> we need to make sure that eBPF programs that are installed by programs
> that are in a container don't see traffic that isn't in that
> container.

such approach will break existing users.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03 23:06   ` Alexei Starovoitov
@ 2017-02-03 23:42     ` Daniel Borkmann
  2017-02-04  1:25       ` Alexei Starovoitov
  2017-02-04 17:08       ` Andy Lutomirski
  0 siblings, 2 replies; 35+ messages in thread
From: Daniel Borkmann @ 2017-02-03 23:42 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, David S . Miller, David Ahern, Tejun Heo,
	Andy Lutomirski, Eric W . Biederman, Thomas Graf, netdev

On 02/04/2017 12:06 AM, Alexei Starovoitov wrote:
> On Fri, Feb 03, 2017 at 10:56:43PM +0100, Daniel Borkmann wrote:
>> On 01/26/2017 04:27 AM, Alexei Starovoitov wrote:
>>> in cases where bpf programs are looking at sockets and packets
>>> that belong to different netns, it could be useful to read netns inode,
>>> so that programs can make intelligent decisions.
>>> For example to disallow raw sockets in all non-init netns the program can do:
>>> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
>>>    return 0;
>>> where 0xf0000075 inode comes from /proc/pid/ns/net
>>>
>>> Similarly TC cls_bpf/act_bpf and socket filters can do
>>> if (skb->netns_inum == expected_inode)
>>>
>>> The lack of netns awareness was a concern even for socket filters,
>>> since the application can attach the same bpf program to sockets
>>> in a different netns. Just like tc cls_bpf program can work in
>>> different netns as well, so it has to be addressed uniformly
>>> across all types of bpf programs.
>>
>> Sorry for jumping in late, but my question is, isn't this helper
>> really only relevant for BPF_PROG_TYPE_CGROUP_* typed programs?
>> Thus other prog types making use of bpf_convert_ctx_access()
>> should probably reject that in .is_valid_access() callback?
>>
>> Reason why I'm asking is that for sockets or tc progs, you
>> already have a netns context where you're attached to, and f.e.
>> skbs leaving that netns context will be orphaned. Thus, why
>> would tc or sock filter tailor a program with such a check,
>> if it can only match/mismatch its own netns inum eventually?
>
> Please see the example I provided earlier.

That example for both socket filter and tc progs specifically
wasn't quite clear to me, hence my question wrt why it's right
now a "concern" for these ones. (Again, clear to me for cgroups
progs.)

> We can have the same cls_bpf attached to all netns-es.
> Same for socket filters and everything else.

So use-case would be that someone wants to attach the very same
prog via tc to various netdevs sitting in different netns, and
that prog looks up a map, controlled by initns, with skb->netns_inum
as key and the resulting value could contain allowed feature bits
for that specific netns prog the skbs goes through? That would be
a feature, not "concern", no? At the same time, it's up to the
user or mgmt app what gets loaded so f.e. it might just as well
tailor/optimize the progs individually for the devs sitting in
netns-es to avoid such map lookup.

> All bpf programs are global.

True, but for socket filter and tc they are hooked/attached under
a given netns context.

> They can all share info via maps and so on.

>> When making this effort to lookup and hardcode the dev/inode
>> num into the prog, wouldn't it be easier for these types if
>
> we cannot hardcode dev/inode. They are dynamic and depends
> where program runs.

Was referring to the test from above provided example:

 >>> if (skb->netns_inum == expected_inode)

> I'll send a patch shortly that exposes both.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03 23:42     ` Daniel Borkmann
@ 2017-02-04  1:25       ` Alexei Starovoitov
  2017-02-04 17:08       ` Andy Lutomirski
  1 sibling, 0 replies; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-04  1:25 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, David S . Miller, David Ahern, Tejun Heo,
	Andy Lutomirski, Eric W . Biederman, Thomas Graf, netdev

On Sat, Feb 04, 2017 at 12:42:31AM +0100, Daniel Borkmann wrote:
> On 02/04/2017 12:06 AM, Alexei Starovoitov wrote:
> >On Fri, Feb 03, 2017 at 10:56:43PM +0100, Daniel Borkmann wrote:
> >>On 01/26/2017 04:27 AM, Alexei Starovoitov wrote:
> >>>in cases where bpf programs are looking at sockets and packets
> >>>that belong to different netns, it could be useful to read netns inode,
> >>>so that programs can make intelligent decisions.
> >>>For example to disallow raw sockets in all non-init netns the program can do:
> >>>if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
> >>>   return 0;
> >>>where 0xf0000075 inode comes from /proc/pid/ns/net
> >>>
> >>>Similarly TC cls_bpf/act_bpf and socket filters can do
> >>>if (skb->netns_inum == expected_inode)
> >>>
> >>>The lack of netns awareness was a concern even for socket filters,
> >>>since the application can attach the same bpf program to sockets
> >>>in a different netns. Just like tc cls_bpf program can work in
> >>>different netns as well, so it has to be addressed uniformly
> >>>across all types of bpf programs.
> >>
> >>Sorry for jumping in late, but my question is, isn't this helper
> >>really only relevant for BPF_PROG_TYPE_CGROUP_* typed programs?
> >>Thus other prog types making use of bpf_convert_ctx_access()
> >>should probably reject that in .is_valid_access() callback?
> >>
> >>Reason why I'm asking is that for sockets or tc progs, you
> >>already have a netns context where you're attached to, and f.e.
> >>skbs leaving that netns context will be orphaned. Thus, why
> >>would tc or sock filter tailor a program with such a check,
> >>if it can only match/mismatch its own netns inum eventually?
> >
> >Please see the example I provided earlier.
> 
> That example for both socket filter and tc progs specifically
> wasn't quite clear to me, hence my question wrt why it's right
> now a "concern" for these ones. (Again, clear to me for cgroups
> progs.)
> 
> >We can have the same cls_bpf attached to all netns-es.
> >Same for socket filters and everything else.
> 
> So use-case would be that someone wants to attach the very same
> prog via tc to various netdevs sitting in different netns, and
> that prog looks up a map, controlled by initns, with skb->netns_inum
> as key and the resulting value could contain allowed feature bits
> for that specific netns prog the skbs goes through? That would be
> a feature, not "concern", no? At the same time, it's up to the
> user or mgmt app what gets loaded so f.e. it might just as well
> tailor/optimize the progs individually for the devs sitting in
> netns-es to avoid such map lookup.

yes. It's partially feature and partially bugfix.
Just sent a new patch and tried to explain that bit in commit log.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03 23:08                     ` Alexei Starovoitov
@ 2017-02-04 17:07                       ` Andy Lutomirski
  2017-02-05  3:10                         ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-04 17:07 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Fri, Feb 3, 2017 at 3:08 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, Feb 03, 2017 at 01:00:47PM -0800, Andy Lutomirski wrote:
>>
>> ISTM any ability to migrate namespaces and to migrate eBPF programs
>> that know about namespaces needs to have the eBPF program firmly
>> rooted in some namespace (or perhaps cgroup in this case) so that it
>
> programs are already global. We cannot break that.

I don't know what you mean here.  It ought to be possible to have a
(privileged) program that installs and uses cgroup+bpf programs run
under CRIU and get migrated.  Maybe not yet, but some day.  This
should be doable without the program noticing, and it would be
unfortunate if the API makes this harder than necessary.

>
>> can see a namespaced view of the world.  For this to work, presumably
>> we need to make sure that eBPF programs that are installed by programs
>> that are in a container don't see traffic that isn't in that
>> container.
>
> such approach will break existing users.

What existing users?  The whole feature has never been in a released kernel.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-03 23:42     ` Daniel Borkmann
  2017-02-04  1:25       ` Alexei Starovoitov
@ 2017-02-04 17:08       ` Andy Lutomirski
  2017-02-05  3:18         ` Alexei Starovoitov
  1 sibling, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-04 17:08 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, Alexei Starovoitov, David S . Miller,
	David Ahern, Tejun Heo, Eric W . Biederman, Thomas Graf,
	Network Development

On Fri, Feb 3, 2017 at 3:42 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 02/04/2017 12:06 AM, Alexei Starovoitov wrote:
>>
>> On Fri, Feb 03, 2017 at 10:56:43PM +0100, Daniel Borkmann wrote:
>>>
>>> On 01/26/2017 04:27 AM, Alexei Starovoitov wrote:
>>>>
>>>> in cases where bpf programs are looking at sockets and packets
>>>> that belong to different netns, it could be useful to read netns inode,
>>>> so that programs can make intelligent decisions.
>>>> For example to disallow raw sockets in all non-init netns the program
>>>> can do:
>>>> if (sk->type == SOCK_RAW && sk->netns_inum != 0xf0000075)
>>>>    return 0;
>>>> where 0xf0000075 inode comes from /proc/pid/ns/net
>>>>
>>>> Similarly TC cls_bpf/act_bpf and socket filters can do
>>>> if (skb->netns_inum == expected_inode)
>>>>
>>>> The lack of netns awareness was a concern even for socket filters,
>>>> since the application can attach the same bpf program to sockets
>>>> in a different netns. Just like tc cls_bpf program can work in
>>>> different netns as well, so it has to be addressed uniformly
>>>> across all types of bpf programs.
>>>
>>>
>>> Sorry for jumping in late, but my question is, isn't this helper
>>> really only relevant for BPF_PROG_TYPE_CGROUP_* typed programs?
>>> Thus other prog types making use of bpf_convert_ctx_access()
>>> should probably reject that in .is_valid_access() callback?
>>>
>>> Reason why I'm asking is that for sockets or tc progs, you
>>> already have a netns context where you're attached to, and f.e.
>>> skbs leaving that netns context will be orphaned. Thus, why
>>> would tc or sock filter tailor a program with such a check,
>>> if it can only match/mismatch its own netns inum eventually?
>>
>>
>> Please see the example I provided earlier.
>
>
> That example for both socket filter and tc progs specifically
> wasn't quite clear to me, hence my question wrt why it's right
> now a "concern" for these ones. (Again, clear to me for cgroups
> progs.)
>
>> We can have the same cls_bpf attached to all netns-es.
>> Same for socket filters and everything else.
>
>
> So use-case would be that someone wants to attach the very same
> prog via tc to various netdevs sitting in different netns, and
> that prog looks up a map, controlled by initns, with skb->netns_inum
> as key and the resulting value could contain allowed feature bits
> for that specific netns prog the skbs goes through? That would be
> a feature, not "concern", no? At the same time, it's up to the
> user or mgmt app what gets loaded so f.e. it might just as well
> tailor/optimize the progs individually for the devs sitting in
> netns-es to avoid such map lookup.

Agreed.  I don't see why you would install the exact same program on
two sockets in different netnses if the program contains, say, an
ifindex.  Why not just install a variant with the right ifindex into
each socket?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-04 17:07                       ` Andy Lutomirski
@ 2017-02-05  3:10                         ` Alexei Starovoitov
  2017-02-05  3:27                           ` Andy Lutomirski
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-05  3:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Sat, Feb 04, 2017 at 09:07:19AM -0800, Andy Lutomirski wrote:
> >> can see a namespaced view of the world.  For this to work, presumably
> >> we need to make sure that eBPF programs that are installed by programs
> >> that are in a container don't see traffic that isn't in that
> >> container.
> >
> > such approach will break existing users.
> 
> What existing users?  The whole feature has never been in a released kernel.

the v3 patch commit log explains the use cases that work across netns.
And they would break if netns boundary is suddenly started to be
enforced for networking program types.
If you're suggesting to limit root netns for cgroup_sock program type only
then David explained weeks ago why it's not acceptable for vrf use
case which was the main reason to add that program type in the first place.
Also it would make one specific bpf program type to be different
from all others which will be confusing.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-04 17:08       ` Andy Lutomirski
@ 2017-02-05  3:18         ` Alexei Starovoitov
  2017-02-05  3:22           ` Andy Lutomirski
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-05  3:18 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Borkmann, Alexei Starovoitov, David S . Miller,
	David Ahern, Tejun Heo, Eric W . Biederman, Thomas Graf,
	Network Development

On Sat, Feb 04, 2017 at 09:08:38AM -0800, Andy Lutomirski wrote:
> > So use-case would be that someone wants to attach the very same
> > prog via tc to various netdevs sitting in different netns, and
> > that prog looks up a map, controlled by initns, with skb->netns_inum
> > as key and the resulting value could contain allowed feature bits
> > for that specific netns prog the skbs goes through? That would be
> > a feature, not "concern", no? At the same time, it's up to the
> > user or mgmt app what gets loaded so f.e. it might just as well
> > tailor/optimize the progs individually for the devs sitting in
> > netns-es to avoid such map lookup.
> 
> Agreed.  I don't see why you would install the exact same program on
> two sockets in different netnses if the program contains, say, an
> ifindex.  Why not just install a variant with the right ifindex into
> each socket?

for some use cases it may be feasible to generate different
program on the fly for every netns that is created. Some people
generate them already for things like IP addresses. Like to stop
an attack to particular vip the fastest xdp program has
that vip built-in, then compiled on the fly and loaded.
In other cases people prefer to have one program compiled once
and thoroughly tested offline, since some folks are worried
that on-the-fly compilation may cause generated code to
be rejected by the verifier.
Therefore new program for every netns may be acceptable
to some users, but not all.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  3:18         ` Alexei Starovoitov
@ 2017-02-05  3:22           ` Andy Lutomirski
  2017-02-05  3:35             ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-05  3:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, Alexei Starovoitov, David S . Miller,
	David Ahern, Tejun Heo, Eric W . Biederman, Thomas Graf,
	Network Development

On Sat, Feb 4, 2017 at 7:18 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Sat, Feb 04, 2017 at 09:08:38AM -0800, Andy Lutomirski wrote:
>> > So use-case would be that someone wants to attach the very same
>> > prog via tc to various netdevs sitting in different netns, and
>> > that prog looks up a map, controlled by initns, with skb->netns_inum
>> > as key and the resulting value could contain allowed feature bits
>> > for that specific netns prog the skbs goes through? That would be
>> > a feature, not "concern", no? At the same time, it's up to the
>> > user or mgmt app what gets loaded so f.e. it might just as well
>> > tailor/optimize the progs individually for the devs sitting in
>> > netns-es to avoid such map lookup.
>>
>> Agreed.  I don't see why you would install the exact same program on
>> two sockets in different netnses if the program contains, say, an
>> ifindex.  Why not just install a variant with the right ifindex into
>> each socket?
>
> In other cases people prefer to have one program compiled once
> and thoroughly tested offline, since some folks are worried
> that on-the-fly compilation may cause generated code to
> be rejected by the verifier.

I would be rather surprised if someone wrote a program that did had an
expression like (ifindex == 17), tested it offline, and refused to
update the 17 based on the actual ifindex in use.

--Andy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  3:10                         ` Alexei Starovoitov
@ 2017-02-05  3:27                           ` Andy Lutomirski
  2017-02-05  3:48                             ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-05  3:27 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Sat, Feb 4, 2017 at 7:10 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Sat, Feb 04, 2017 at 09:07:19AM -0800, Andy Lutomirski wrote:
>> >> can see a namespaced view of the world.  For this to work, presumably
>> >> we need to make sure that eBPF programs that are installed by programs
>> >> that are in a container don't see traffic that isn't in that
>> >> container.
>> >
>> > such approach will break existing users.
>>
>> What existing users?  The whole feature has never been in a released kernel.
>
> the v3 patch commit log explains the use cases that work across netns.
> And they would break if netns boundary is suddenly started to be
> enforced for networking program types.

This is why these issues should be addressed carefully before this
feature is stabilized in 4.10, and I'm increasingly unconvinced that
this will happen.  There may be a single week left, and the cgroup
override issue is still entirely unaddressed, for example.

Also, the v3 commit log's example seems a bit weak, since seccomp can
handle that case (block non-init-netns raw sockets) and it's not clear
to me that a filter like that serves a purpose regardless besides
kernel attack surface reduction.

> If you're suggesting to limit root netns for cgroup_sock program type only
> then David explained weeks ago why it's not acceptable for vrf use
> case which was the main reason to add that program type in the first place.
> Also it would make one specific bpf program type to be different
> from all others which will be confusing.
>

I'm suggesting limiting it to the init netns for *all* cases.
(Specifically, limiting installing programs to the init netns.  But
perhaps limiting running of programs to the netns in which they're
installed would be the best solution after all.)  If there isn't a
well-understood solution that plays well with netns and that satisfies
ip vrf, the it's worth seriously thinking about whether ip vrf is
really ready for 4.10.

--Andy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  3:22           ` Andy Lutomirski
@ 2017-02-05  3:35             ` Alexei Starovoitov
  2017-02-05  3:49               ` Andy Lutomirski
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-05  3:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Borkmann, Alexei Starovoitov, David S . Miller,
	David Ahern, Tejun Heo, Eric W . Biederman, Thomas Graf,
	Network Development

On Sat, Feb 04, 2017 at 07:22:03PM -0800, Andy Lutomirski wrote:
> On Sat, Feb 4, 2017 at 7:18 PM, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > On Sat, Feb 04, 2017 at 09:08:38AM -0800, Andy Lutomirski wrote:
> >> > So use-case would be that someone wants to attach the very same
> >> > prog via tc to various netdevs sitting in different netns, and
> >> > that prog looks up a map, controlled by initns, with skb->netns_inum
> >> > as key and the resulting value could contain allowed feature bits
> >> > for that specific netns prog the skbs goes through? That would be
> >> > a feature, not "concern", no? At the same time, it's up to the
> >> > user or mgmt app what gets loaded so f.e. it might just as well
> >> > tailor/optimize the progs individually for the devs sitting in
> >> > netns-es to avoid such map lookup.
> >>
> >> Agreed.  I don't see why you would install the exact same program on
> >> two sockets in different netnses if the program contains, say, an
> >> ifindex.  Why not just install a variant with the right ifindex into
> >> each socket?
> >
> > In other cases people prefer to have one program compiled once
> > and thoroughly tested offline, since some folks are worried
> > that on-the-fly compilation may cause generated code to
> > be rejected by the verifier.
> 
> I would be rather surprised if someone wrote a program that did had an
> expression like (ifindex == 17), tested it offline, and refused to
> update the 17 based on the actual ifindex in use.

All programs have bugs. What bpf subsytem is trying to do is
to be flexible and satisfy as many use cases as possible.
Boxing users into "one way to do things" isn't a successful
strategy in my opinion. perl vs python, if you like :)
bpf is perl like. We don't enforce spaces vs tabs :)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  3:27                           ` Andy Lutomirski
@ 2017-02-05  3:48                             ` Alexei Starovoitov
  2017-02-05  3:54                               ` Andy Lutomirski
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-05  3:48 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Sat, Feb 04, 2017 at 07:27:01PM -0800, Andy Lutomirski wrote:
> On Sat, Feb 4, 2017 at 7:10 PM, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > On Sat, Feb 04, 2017 at 09:07:19AM -0800, Andy Lutomirski wrote:
> >> >> can see a namespaced view of the world.  For this to work, presumably
> >> >> we need to make sure that eBPF programs that are installed by programs
> >> >> that are in a container don't see traffic that isn't in that
> >> >> container.
> >> >
> >> > such approach will break existing users.
> >>
> >> What existing users?  The whole feature has never been in a released kernel.
> >
> > the v3 patch commit log explains the use cases that work across netns.
> > And they would break if netns boundary is suddenly started to be
> > enforced for networking program types.
> 
> This is why these issues should be addressed carefully before this
> feature is stabilized in 4.10, and I'm increasingly unconvinced that
> this will happen.  There may be a single week left, and the cgroup
> override issue is still entirely unaddressed, for example.

imo cgroup override issue is not an issue and the api can be extended
at any time. I offered to change the default to disable override,
but I need override to be there, since it's the fastest and most
flexible way. So the sooner we can land bpf_sk_netns_id() patch
the faster I can post override disable.

> Also, the v3 commit log's example seems a bit weak, since seccomp can
> handle that case (block non-init-netns raw sockets) and it's not clear
> to me that a filter like that serves a purpose regardless besides
> kernel attack surface reduction.

seccomp doesn't even support extended bpf yet and that makes it
quite inflexible.

> > If you're suggesting to limit root netns for cgroup_sock program type only
> > then David explained weeks ago why it's not acceptable for vrf use
> > case which was the main reason to add that program type in the first place.
> > Also it would make one specific bpf program type to be different
> > from all others which will be confusing.
> >
> 
> I'm suggesting limiting it to the init netns for *all* cases.
> (Specifically, limiting installing programs to the init netns.  But
> perhaps limiting running of programs to the netns in which they're
> installed would be the best solution after all.)  If there isn't a
> well-understood solution that plays well with netns and that satisfies
> ip vrf, the it's worth seriously thinking about whether ip vrf is
> really ready for 4.10.

The proposed v3 is imo solves the ip vrf 'malfunction' that you found.
v1 was also solving it and iproute2 patch was ready, but as Eric pointed
out just ns.inum is not enough.
The 'malfunction' can also be addressed _without_ kernel changes,
but the kernel can make it easier, hence v3 patch.

Andy, can you please reply in one thread? In particular in v3 patch?
I'm not sure why you making more or less the same points in
different threads. I'm sure it's hard for everyone to follow.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  3:35             ` Alexei Starovoitov
@ 2017-02-05  3:49               ` Andy Lutomirski
  0 siblings, 0 replies; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-05  3:49 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, Alexei Starovoitov, David S . Miller,
	David Ahern, Tejun Heo, Eric W . Biederman, Thomas Graf,
	Network Development

On Sat, Feb 4, 2017 at 7:35 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Sat, Feb 04, 2017 at 07:22:03PM -0800, Andy Lutomirski wrote:
>> On Sat, Feb 4, 2017 at 7:18 PM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> > On Sat, Feb 04, 2017 at 09:08:38AM -0800, Andy Lutomirski wrote:
>> >> > So use-case would be that someone wants to attach the very same
>> >> > prog via tc to various netdevs sitting in different netns, and
>> >> > that prog looks up a map, controlled by initns, with skb->netns_inum
>> >> > as key and the resulting value could contain allowed feature bits
>> >> > for that specific netns prog the skbs goes through? That would be
>> >> > a feature, not "concern", no? At the same time, it's up to the
>> >> > user or mgmt app what gets loaded so f.e. it might just as well
>> >> > tailor/optimize the progs individually for the devs sitting in
>> >> > netns-es to avoid such map lookup.
>> >>
>> >> Agreed.  I don't see why you would install the exact same program on
>> >> two sockets in different netnses if the program contains, say, an
>> >> ifindex.  Why not just install a variant with the right ifindex into
>> >> each socket?
>> >
>> > In other cases people prefer to have one program compiled once
>> > and thoroughly tested offline, since some folks are worried
>> > that on-the-fly compilation may cause generated code to
>> > be rejected by the verifier.
>>
>> I would be rather surprised if someone wrote a program that did had an
>> expression like (ifindex == 17), tested it offline, and refused to
>> update the 17 based on the actual ifindex in use.
>
> All programs have bugs. What bpf subsytem is trying to do is
> to be flexible and satisfy as many use cases as possible.
> Boxing users into "one way to do things" isn't a successful
> strategy in my opinion. perl vs python, if you like :)
> bpf is perl like. We don't enforce spaces vs tabs :)
>

Daniel's asking what the netns query is good for in programs that are
attached to sockets.  I think your example isn't relevant here.  In
fact, I think your example of pre-tested programs is even less
relevant, since a there's no way to know what the netns id will be
prior to the creation of a netns, so you can't usefully hard-code a
netns id into a precompiled BPF program.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  3:48                             ` Alexei Starovoitov
@ 2017-02-05  3:54                               ` Andy Lutomirski
  2017-02-05  4:37                                 ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-05  3:54 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Sat, Feb 4, 2017 at 7:48 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Sat, Feb 04, 2017 at 07:27:01PM -0800, Andy Lutomirski wrote:
>> On Sat, Feb 4, 2017 at 7:10 PM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> > On Sat, Feb 04, 2017 at 09:07:19AM -0800, Andy Lutomirski wrote:
>> >> >> can see a namespaced view of the world.  For this to work, presumably
>> >> >> we need to make sure that eBPF programs that are installed by programs
>> >> >> that are in a container don't see traffic that isn't in that
>> >> >> container.
>> >> >
>> >> > such approach will break existing users.
>> >>
>> >> What existing users?  The whole feature has never been in a released kernel.
>> >
>> > the v3 patch commit log explains the use cases that work across netns.
>> > And they would break if netns boundary is suddenly started to be
>> > enforced for networking program types.
>>
>> This is why these issues should be addressed carefully before this
>> feature is stabilized in 4.10, and I'm increasingly unconvinced that
>> this will happen.  There may be a single week left, and the cgroup
>> override issue is still entirely unaddressed, for example.
>
> imo cgroup override issue is not an issue and the api can be extended
> at any time. I offered to change the default to disable override,
> but I need override to be there, since it's the fastest and most
> flexible way. So the sooner we can land bpf_sk_netns_id() patch
> the faster I can post override disable.

Override is barely the fastest way.  Long ago in one of these threads,
I showed how the full run-all-the-in-scope-programs behavior could be
implemented with a single correctly-predicted branch worth of overhead
compared to the current behavior.  The run-them-all behavior also
seems like the least surprising, most secure, and most generally
useful behavior.

I've repeatedly asked how you plan to make a "don't override" flag
have sensible semantics when someone tries to add a new flag or change
the behavior to "don't override but, rather then rejecting programs
down the hierarchy, just run them all".  You haven't answered that
question.

Given that we're doing API design and it's waaaaay past the merge
window, I just don't see how any of this is ready for 4.10.

>
>> Also, the v3 commit log's example seems a bit weak, since seccomp can
>> handle that case (block non-init-netns raw sockets) and it's not clear
>> to me that a filter like that serves a purpose regardless besides
>> kernel attack surface reduction.
>
> seccomp doesn't even support extended bpf yet and that makes it
> quite inflexible.

You can trivially write a seccomp filter that blocks raw sockets.

>
>> > If you're suggesting to limit root netns for cgroup_sock program type only
>> > then David explained weeks ago why it's not acceptable for vrf use
>> > case which was the main reason to add that program type in the first place.
>> > Also it would make one specific bpf program type to be different
>> > from all others which will be confusing.
>> >
>>
>> I'm suggesting limiting it to the init netns for *all* cases.
>> (Specifically, limiting installing programs to the init netns.  But
>> perhaps limiting running of programs to the netns in which they're
>> installed would be the best solution after all.)  If there isn't a
>> well-understood solution that plays well with netns and that satisfies
>> ip vrf, the it's worth seriously thinking about whether ip vrf is
>> really ready for 4.10.
>
> The proposed v3 is imo solves the ip vrf 'malfunction' that you found.
> v1 was also solving it and iproute2 patch was ready, but as Eric pointed
> out just ns.inum is not enough.
> The 'malfunction' can also be addressed _without_ kernel changes,
> but the kernel can make it easier, hence v3 patch.

I don't know whether Eric will be okay with your v3.  I pointed out a
possible problem.

>
> Andy, can you please reply in one thread? In particular in v3 patch?
> I'm not sure why you making more or less the same points in
> different threads. I'm sure it's hard for everyone to follow.
>

I can try.  Can you try the same thing so I can reply once?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  3:54                               ` Andy Lutomirski
@ 2017-02-05  4:37                                 ` Alexei Starovoitov
  2017-02-05  5:05                                   ` Andy Lutomirski
  0 siblings, 1 reply; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-05  4:37 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Sat, Feb 04, 2017 at 07:54:20PM -0800, Andy Lutomirski wrote:
>
> I've repeatedly asked how you plan to make a "don't override" flag
> have sensible semantics when someone tries to add a new flag or change
> the behavior to "don't override but, rather then rejecting programs
> down the hierarchy, just run them all".  You haven't answered that
> question.

I explained already that I need to do combining on the bpf side instead
of running the list, since running several programs where 90% of
the logic is the same is the overhead that is not acceptable
for production server. It may be fine for desktop, but not
when every cycle matters. With one program per cgroup I can
combine multiple of them on bpf side. In networking the most of
the prep work like packet parsing and lookups are common,
but the actions are different, so for one part of the hieararchy
I can have program A monitoring tcp and in other
part I can have program B monitoring tcp and udp.
What you're saying that for tcp and udp monitoring
run two programs. One for udp and one for tcp.
That is not efficient. Hence the need to combine
the programs on bpf side and attach only one with override.

The "dont override flag" was also explained before. Here it is again:
Without breaking above "override" scenario we can add a flag
that when the program is attached with that flag to one part of
the hierarchy the override of it will not be allowed in the descendent.
This extension can be done at any time in the future.
The only question is what is the default when such flag
is not present. The current default is override allowed.
You insist on changing the default right now.
I don't mind, therefore I'm working on such "dont override flag",
since if we need to change the default it needs to be done now,
but if it doesn't happen for 4.10, it's absolutely fine too.
For security use cases the flag will be added later
and sandboxing use cases will use that flag too.
There are no expections that if cgroup is there the program
attach command must always succeed. That's why we have error codes
and we can dissallow attach based on this flag or any future
restrictions. All of it is root now anyway and sandboxing/security
use cases need to wait until bpf side can be made unprivileged.
I see current api to have a ton of room for future extensions.

> Given that we're doing API design and it's waaaaay past the merge
> window, I just don't see how any of this is ready for 4.10.

We are NOT doing design now. Design and implementation was
done back last summer. It took many interations and lots
of discussions on public lists with netdev and cgroup lists cc-ed.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  4:37                                 ` Alexei Starovoitov
@ 2017-02-05  5:05                                   ` Andy Lutomirski
  2017-02-07  1:43                                     ` Alexei Starovoitov
  0 siblings, 1 reply; 35+ messages in thread
From: Andy Lutomirski @ 2017-02-05  5:05 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Sat, Feb 4, 2017 at 8:37 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Sat, Feb 04, 2017 at 07:54:20PM -0800, Andy Lutomirski wrote:
>>
>> I've repeatedly asked how you plan to make a "don't override" flag
>> have sensible semantics when someone tries to add a new flag or change
>> the behavior to "don't override but, rather then rejecting programs
>> down the hierarchy, just run them all".  You haven't answered that
>> question.
>
> I explained already that I need to do combining on the bpf side instead
> of running the list, since running several programs where 90% of
> the logic is the same is the overhead that is not acceptable
> for production server. It may be fine for desktop, but not
> when every cycle matters. With one program per cgroup I can
> combine multiple of them on bpf side. In networking the most of
> the prep work like packet parsing and lookups are common,
> but the actions are different, so for one part of the hieararchy
> I can have program A monitoring tcp and in other
> part I can have program B monitoring tcp and udp.
> What you're saying that for tcp and udp monitoring
> run two programs. One for udp and one for tcp.
> That is not efficient. Hence the need to combine
> the programs on bpf side and attach only one with override.

I'm not saying that at all.  I'm saying that this use case sounds
valid, but maybe it could be solved differently.  Here are some ideas:

 - Expose the actual cgroup (relative to the hooked cgroup) to the BPF
program.  Then you could parse the headers, check what cgroup you're
in, and proceed accordingly.  This could potentially be even faster
for your use case if done carefully because it will stress the
instruction cache less.

 - Have a (non-default) flag that says "overridable".  The effect is
that, if /A and /A/B both have overridable programs attached, then
sockets in /A/B don't run /A's program.  If, however, /A has a
non-overridable program and /A/B has an overridable program, then
sockets in /A/B run both programs.  IOW overridable programs override
other overridable programs, but non-overridable programs never
override anything and are never overridden by anything.

>
> The "dont override flag" was also explained before. Here it is again:
> Without breaking above "override" scenario we can add a flag
> that when the program is attached with that flag to one part of
> the hierarchy the override of it will not be allowed in the descendent.
> This extension can be done at any time in the future.
> The only question is what is the default when such flag
> is not present. The current default is override allowed.
> You insist on changing the default right now.
> I don't mind, therefore I'm working on such "dont override flag",
> since if we need to change the default it needs to be done now,
> but if it doesn't happen for 4.10, it's absolutely fine too.
> For security use cases the flag will be added later
> and sandboxing use cases will use that flag too.
> There are no expections that if cgroup is there the program
> attach command must always succeed. That's why we have error codes
> and we can dissallow attach based on this flag or any future
> restrictions. All of it is root now anyway and sandboxing/security
> use cases need to wait until bpf side can be made unprivileged.
> I see current api to have a ton of room for future extensions.
>

Suppose someone wants to make CGROUP_BPF work right when a container
and a container manager both use it.  It'll have to run both programs.
What would the semantics be if this were to be added?  This is really
a question, not an indictment of your approach.  For all I know,
you're proposing exactly what I suggested above.

And sandboxing needn't, and won't, wait until unprivileged bpf
programs are added.  Plenty of sandboxes run as root.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net] bpf: expose netns inode to bpf programs
  2017-02-05  5:05                                   ` Andy Lutomirski
@ 2017-02-07  1:43                                     ` Alexei Starovoitov
  0 siblings, 0 replies; 35+ messages in thread
From: Alexei Starovoitov @ 2017-02-07  1:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Eric W. Biederman, Alexei Starovoitov, Linus Torvalds,
	David S . Miller, Daniel Borkmann, David Ahern, Tejun Heo,
	Thomas Graf, Network Development

On Sat, Feb 04, 2017 at 09:05:29PM -0800, Andy Lutomirski wrote:
> 
> I'm not saying that at all.  I'm saying that this use case sounds
> valid, but maybe it could be solved differently.  Here are some ideas:

Great. Combining multiple threads. Replied in bpf_sk_netns_id thread.

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2017-02-07  1:44 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-26  3:27 [PATCH net] bpf: expose netns inode to bpf programs Alexei Starovoitov
2017-01-26  5:46 ` Eric W. Biederman
2017-01-26  6:00   ` Ying Xue
2017-01-26  6:23   ` Alexei Starovoitov
2017-01-26 16:37     ` Andy Lutomirski
2017-01-26 17:46       ` Alexei Starovoitov
2017-01-26 18:12         ` Andy Lutomirski
2017-01-26 18:32           ` Alexei Starovoitov
2017-01-26 19:07             ` Andy Lutomirski
2017-01-26 19:25               ` Alexei Starovoitov
2017-02-03  4:33                 ` Eric W. Biederman
2017-02-03  6:05                   ` Alexei Starovoitov
2017-02-03 10:30                     ` Eric W. Biederman
2017-02-03 21:00                   ` Andy Lutomirski
2017-02-03 21:06                     ` Eric W. Biederman
2017-02-03 23:08                     ` Alexei Starovoitov
2017-02-04 17:07                       ` Andy Lutomirski
2017-02-05  3:10                         ` Alexei Starovoitov
2017-02-05  3:27                           ` Andy Lutomirski
2017-02-05  3:48                             ` Alexei Starovoitov
2017-02-05  3:54                               ` Andy Lutomirski
2017-02-05  4:37                                 ` Alexei Starovoitov
2017-02-05  5:05                                   ` Andy Lutomirski
2017-02-07  1:43                                     ` Alexei Starovoitov
2017-01-31 18:02 ` David Miller
2017-01-31 22:11 ` David Ahern
2017-02-03 21:56 ` Daniel Borkmann
2017-02-03 23:06   ` Alexei Starovoitov
2017-02-03 23:42     ` Daniel Borkmann
2017-02-04  1:25       ` Alexei Starovoitov
2017-02-04 17:08       ` Andy Lutomirski
2017-02-05  3:18         ` Alexei Starovoitov
2017-02-05  3:22           ` Andy Lutomirski
2017-02-05  3:35             ` Alexei Starovoitov
2017-02-05  3:49               ` Andy Lutomirski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.