* [PATCH bpf-next v3 2/4] bpf: sync bpf.h to tools/ for bpf_sock_ops->netns*
2019-04-26 15:48 [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
@ 2019-04-26 15:48 ` Alban Crequy
2019-04-26 15:48 ` [PATCH bpf-next v3 3/4] selftests: bpf: read netns_ino from struct bpf_sock_ops Alban Crequy
` (3 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Alban Crequy @ 2019-04-26 15:48 UTC (permalink / raw)
To: john.fastabend, ast, daniel; +Cc: bpf, netdev, linux-kernel, alban, iago
From: Alban Crequy <alban@kinvolk.io>
The change in struct bpf_sock_ops is synchronised
from: include/uapi/linux/bpf.h
to: tools/include/uapi/linux/bpf.h
Signed-off-by: Alban Crequy <alban@kinvolk.io>
---
Changes since v2:
- standalone patch for the sync (requested by Y Song)
---
tools/include/uapi/linux/bpf.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 704bb69514a2..eb56620a9d7a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3206,6 +3206,8 @@ struct bpf_sock_ops {
__u32 sk_txhash;
__u64 bytes_received;
__u64 bytes_acked;
+ __u64 netns_dev;
+ __u64 netns_ino;
};
/* Definitions for bpf_sock_ops_cb_flags */
--
2.20.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH bpf-next v3 3/4] selftests: bpf: read netns_ino from struct bpf_sock_ops
2019-04-26 15:48 [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
2019-04-26 15:48 ` [PATCH bpf-next v3 2/4] bpf: sync bpf.h to tools/ for bpf_sock_ops->netns* Alban Crequy
@ 2019-04-26 15:48 ` Alban Crequy
2019-04-26 15:48 ` [PATCH bpf-next v3 4/4] selftests: bpf: verifier: read netns_dev and " Alban Crequy
` (2 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Alban Crequy @ 2019-04-26 15:48 UTC (permalink / raw)
To: john.fastabend, ast, daniel; +Cc: bpf, netdev, linux-kernel, alban, iago
From: Alban Crequy <alban@kinvolk.io>
This shows how a sockops program could be restricted to a specific
network namespace. The sockops program looks at the current netns via
(struct bpf_sock_ops)->netns_ino and checks if the value matches the
configuration in the new BPF map "sock_netns".
The test program ./test_sockmap accepts a new parameter "--netns"; the
default value is the current netns found by stat() on /proc/self/ns/net,
so the previous tests still pass:
sudo ./test_sockmap
...
Summary: 412 PASSED 0 FAILED
...
Summary: 824 PASSED 0 FAILED
I run my additional test in the following way:
NETNS=$(readlink /proc/self/ns/net | sed 's/^net:\[\(.*\)\]$/\1/')
CGR=/sys/fs/cgroup/unified/user.slice/user-1000.slice/session-5.scope/
sudo ./test_sockmap --cgroup $CGR --netns $NETNS &
cat /sys/kernel/debug/tracing/trace_pipe
echo foo | nc -l 127.0.0.1 8080 &
echo bar | nc 127.0.0.1 8080
=> the connection goes through the sockmap
When testing with a wrong $NETNS, I get the trace_pipe log:
> not binding connection on netns 4026531992
Signed-off-by: Alban Crequy <alban@kinvolk.io>
---
Changes since v1:
- tools/include/uapi/linux/bpf.h: update with netns_dev
- tools/testing/selftests/bpf/test_sockmap_kern.h: print debugs with
both netns_dev and netns_ino
Changes since v2:
- update commitmsg to refer to netns_ino
---
tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++++++++--
.../testing/selftests/bpf/test_sockmap_kern.h | 22 +++++++++++
2 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 3845144e2c91..5a1b9c96fca1 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -2,6 +2,7 @@
// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/select.h>
@@ -21,6 +22,7 @@
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/sendfile.h>
+#include <sys/stat.h>
#include <linux/netlink.h>
#include <linux/socket.h>
@@ -63,8 +65,8 @@ int s1, s2, c1, c2, p1, p2;
int test_cnt;
int passed;
int failed;
-int map_fd[8];
-struct bpf_map *maps[8];
+int map_fd[9];
+struct bpf_map *maps[9];
int prog_fd[11];
int txmsg_pass;
@@ -84,6 +86,7 @@ int txmsg_ingress;
int txmsg_skb;
int ktls;
int peek_flag;
+uint64_t netns_opt;
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@@ -111,6 +114,7 @@ static const struct option long_options[] = {
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
+ {"netns", required_argument, NULL, 'n'},
{0, 0, NULL, 0 }
};
@@ -1585,6 +1589,7 @@ char *map_names[] = {
"sock_bytes",
"sock_redir_flags",
"sock_skb_opts",
+ "sock_netns",
};
int prog_attach_type[] = {
@@ -1619,6 +1624,8 @@ static int populate_progs(char *bpf_file)
struct bpf_object *obj;
int i = 0;
long err;
+ struct stat netns_sb;
+ uint64_t netns_ino;
obj = bpf_object__open(bpf_file);
err = libbpf_get_error(obj);
@@ -1655,6 +1662,28 @@ static int populate_progs(char *bpf_file)
}
}
+ if (netns_opt == 0) {
+ err = stat("/proc/self/ns/net", &netns_sb);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: cannot stat network namespace: %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+ netns_ino = netns_sb.st_ino;
+ } else {
+ netns_ino = netns_opt;
+ }
+ i = 1;
+ err = bpf_map_update_elem(map_fd[8], &netns_ino, &i, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (netns): %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+
+
return 0;
}
@@ -1738,7 +1767,7 @@ int main(int argc, char **argv)
if (argc < 2)
return test_suite(-1);
- while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
+ while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:n:",
long_options, &longindex)) != -1) {
switch (opt) {
case 's':
@@ -1805,6 +1834,9 @@ int main(int argc, char **argv)
return -1;
}
break;
+ case 'n':
+ netns_opt = strtoull(optarg, NULL, 10);
+ break;
case 0:
break;
case 'h':
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
index e7639f66a941..317406dad6cf 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -91,6 +91,13 @@ struct bpf_map_def SEC("maps") sock_skb_opts = {
.max_entries = 1
};
+struct bpf_map_def SEC("maps") sock_netns = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__u64),
+ .value_size = sizeof(int),
+ .max_entries = 16
+};
+
SEC("sk_skb1")
int bpf_prog1(struct __sk_buff *skb)
{
@@ -132,9 +139,24 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
{
__u32 lport, rport;
int op, err = 0, index, key, ret;
+ int i = 0;
+ __u64 netns_dev, netns_ino;
+ int *allowed;
op = (int) skops->op;
+ netns_dev = skops->netns_dev;
+ netns_ino = skops->netns_ino;
+ bpf_printk("bpf_sockmap: netns_dev = %lu netns_ino = %lu\n",
+ netns_dev, netns_ino);
+
+ // Only allow sockmap connection on the configured network namespace
+ allowed = bpf_map_lookup_elem(&sock_netns, &netns_ino);
+ if (allowed == NULL || *allowed == 0) {
+ bpf_printk("not binding connection on netns_ino %lu\n",
+ netns_ino);
+ return 0;
+ }
switch (op) {
case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
--
2.20.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH bpf-next v3 4/4] selftests: bpf: verifier: read netns_dev and netns_ino from struct bpf_sock_ops
2019-04-26 15:48 [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
2019-04-26 15:48 ` [PATCH bpf-next v3 2/4] bpf: sync bpf.h to tools/ for bpf_sock_ops->netns* Alban Crequy
2019-04-26 15:48 ` [PATCH bpf-next v3 3/4] selftests: bpf: read netns_ino from struct bpf_sock_ops Alban Crequy
@ 2019-04-26 15:48 ` Alban Crequy
2019-04-26 21:03 ` [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Jakub Kicinski
2019-04-27 16:34 ` Y Song
4 siblings, 0 replies; 10+ messages in thread
From: Alban Crequy @ 2019-04-26 15:48 UTC (permalink / raw)
To: john.fastabend, ast, daniel; +Cc: bpf, netdev, linux-kernel, alban, iago
From: Alban Crequy <alban@kinvolk.io>
Tested with:
> $ sudo ./test_verifier
> ...
> #905/p sockops accessing bpf_sock_ops->netns_dev, ok OK
> #906/p sockops accessing bpf_sock_ops->netns_ino, ok OK
> ...
> Summary: 1421 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Alban Crequy <alban@kinvolk.io>
---
Changes since v1:
- This is a new selftest (review from Song)
Changes since v2:
- test partial reads on netns_dev (review from Y Song)
- split in two tests
---
.../testing/selftests/bpf/verifier/var_off.c | 53 +++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c
index 8504ac937809..9e4c6c78eb9d 100644
--- a/tools/testing/selftests/bpf/verifier/var_off.c
+++ b/tools/testing/selftests/bpf/verifier/var_off.c
@@ -246,3 +246,56 @@
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_LWT_IN,
},
+{
+ "sockops accessing bpf_sock_ops->netns_dev, ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 4),
+
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 4),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 6),
+
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 4),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 5),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 6),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
+{
+ "sockops accessing bpf_sock_ops->netns_ino, ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_ino)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
--
2.20.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context
2019-04-26 15:48 [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
` (2 preceding siblings ...)
2019-04-26 15:48 ` [PATCH bpf-next v3 4/4] selftests: bpf: verifier: read netns_dev and " Alban Crequy
@ 2019-04-26 21:03 ` Jakub Kicinski
2019-04-27 10:48 ` Alban Crequy
2019-04-27 16:34 ` Y Song
4 siblings, 1 reply; 10+ messages in thread
From: Jakub Kicinski @ 2019-04-26 21:03 UTC (permalink / raw)
To: Alban Crequy
Cc: john.fastabend, ast, daniel, bpf, netdev, linux-kernel, alban, iago
On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
Naive question - why return an error? init_net should always be there,
no?
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context
2019-04-26 21:03 ` [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Jakub Kicinski
@ 2019-04-27 10:48 ` Alban Crequy
2019-04-27 18:39 ` Jakub Kicinski
0 siblings, 1 reply; 10+ messages in thread
From: Alban Crequy @ 2019-04-27 10:48 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Alban Crequy, John Fastabend, Alexei Starovoitov,
Daniel Borkmann, bpf, netdev, LKML, Iago López Galeiras
On Fri, Apr 26, 2019 at 11:03 PM Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> Naive question - why return an error? init_net should always be there,
> no?
True for netns_dev. However, without CONFIG_NET_NS, we cannot access netns_ino:
(struct sock_common).possible_net_t.(struct net *):
typedef struct {
#ifdef CONFIG_NET_NS
struct net *net;
#endif
} possible_net_t;
And I don't think it would make much sense to allow access to
netns_dev but not netns_ino.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context
2019-04-27 10:48 ` Alban Crequy
@ 2019-04-27 18:39 ` Jakub Kicinski
2019-04-29 15:42 ` Alban Crequy
0 siblings, 1 reply; 10+ messages in thread
From: Jakub Kicinski @ 2019-04-27 18:39 UTC (permalink / raw)
To: Alban Crequy
Cc: Alban Crequy, John Fastabend, Alexei Starovoitov,
Daniel Borkmann, bpf, netdev, LKML, Iago López Galeiras
On Sat, 27 Apr 2019 12:48:25 +0200, Alban Crequy wrote:
> On Fri, Apr 26, 2019 at 11:03 PM Jakub Kicinski
> <jakub.kicinski@netronome.com> wrote:
> >
> > On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> > > In the unlikely case where network namespaces are not compiled in
> > > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > Naive question - why return an error? init_net should always be there,
> > no?
>
> True for netns_dev. However, without CONFIG_NET_NS, we cannot access netns_ino:
>
> (struct sock_common).possible_net_t.(struct net *):
>
> typedef struct {
> #ifdef CONFIG_NET_NS
> struct net *net;
> #endif
> } possible_net_t;
>
> And I don't think it would make much sense to allow access to
> netns_dev but not netns_ino.
Right, if CONFIG_NET_NS=n we could just take the pointer to init_net
directly, and not worry about the field. IMHO it'd be preferable to
changing the UAPI based on kernel config, but I don't feel super
strongly.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context
2019-04-27 18:39 ` Jakub Kicinski
@ 2019-04-29 15:42 ` Alban Crequy
0 siblings, 0 replies; 10+ messages in thread
From: Alban Crequy @ 2019-04-29 15:42 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Alban Crequy, John Fastabend, Alexei Starovoitov,
Daniel Borkmann, bpf, netdev, LKML, Iago López Galeiras
On Sat, Apr 27, 2019 at 8:39 PM Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Sat, 27 Apr 2019 12:48:25 +0200, Alban Crequy wrote:
> > On Fri, Apr 26, 2019 at 11:03 PM Jakub Kicinski
> > <jakub.kicinski@netronome.com> wrote:
> > >
> > > On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> > > > In the unlikely case where network namespaces are not compiled in
> > > > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> > >
> > > Naive question - why return an error? init_net should always be there,
> > > no?
> >
> > True for netns_dev. However, without CONFIG_NET_NS, we cannot access netns_ino:
> >
> > (struct sock_common).possible_net_t.(struct net *):
> >
> > typedef struct {
> > #ifdef CONFIG_NET_NS
> > struct net *net;
> > #endif
> > } possible_net_t;
> >
> > And I don't think it would make much sense to allow access to
> > netns_dev but not netns_ino.
>
> Right, if CONFIG_NET_NS=n we could just take the pointer to init_net
> directly, and not worry about the field. IMHO it'd be preferable to
> changing the UAPI based on kernel config, but I don't feel super
> strongly.
I see the point about not changing the UAPI. So I will update the patch to:
- return netns_dev unconditionally, regardless of CONFIG_NET_NS
- return netns_ino with either the correct value or zero depending on
CONFIG_NET_NS.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context
2019-04-26 15:48 [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
` (3 preceding siblings ...)
2019-04-26 21:03 ` [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context Jakub Kicinski
@ 2019-04-27 16:34 ` Y Song
2019-04-29 15:34 ` Alban Crequy
4 siblings, 1 reply; 10+ messages in thread
From: Y Song @ 2019-04-27 16:34 UTC (permalink / raw)
To: Alban Crequy
Cc: John Fastabend, Alexei Starovoitov, Daniel Borkmann, bpf, netdev,
LKML, Alban Crequy, Iago López Galeiras
On Fri, Apr 26, 2019 at 8:50 AM Alban Crequy <alban.crequy@gmail.com> wrote:
>
> From: Alban Crequy <alban@kinvolk.io>
>
> sockops programs can now access the network namespace inode and device
> via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> to apply different policies on different network namespaces.
>
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> The generated BPF bytecode for netns_ino is loading the correct inode
> number at the time of execution.
>
> However, the generated BPF bytecode for netns_dev is loading an
> immediate value determined at BPF-load-time by looking at the initial
> network namespace. In practice, this works because all netns currently
> use the same virtual device. If this was to change, this code would need
> to be updated too.
>
> Signed-off-by: Alban Crequy <alban@kinvolk.io>
>
> ---
>
> Changes since v1:
> - add netns_dev (review from Alexei)
>
> Changes since v2:
> - replace __u64 by u64 in kernel code (review from Y Song)
> - remove unneeded #else branch: program would be rejected in
> is_valid_access (review from Y Song)
> - allow partial reads (<u64) (review from Y Song)
>
> Note: I have not been able to fully test partial reads on netns_dev.
> The following patches check partial reads in the verifier but it does
> not actually execute the program to check if partial reads generate the
> correct value. I tried to write a BPF program in C and declare the
> struct bpf_sock_ops as a volatile variable and I could get llvm to
> generate the BPF instructions to do partial loads. But then, I get the
> verifier error "dereference of modified ctx ptr R2 off=184 disallowed",
> explained in https://www.spinics.net/lists/netdev/msg531582.html
> What do you think should be done here?
You added partial read tests in test_verifier with raw asm codes.
It should be good enough.
For the compiler generated code causing verifier error, will take
a detailed look later.
Also I did not see a cover letter. For a series with 4 patches, it would be
the best if you can provide a separate cover letter.
> ---
> include/uapi/linux/bpf.h | 2 +
> net/core/filter.c | 94 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 96 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index eaf2d3284248..f4f841dde42c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;
> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 2f88baf39cc2..9c77464b1501 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -75,6 +75,8 @@
> #include <net/seg6_local.h>
> #include <net/lwtunnel.h>
> #include <net/ipv6_stubs.h>
> +#include <linux/kdev_t.h>
> +#include <linux/proc_ns.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -6810,6 +6812,24 @@ static bool sock_ops_is_valid_access(int off, int size,
> }
> } else {
> switch (off) {
> + case offsetof(struct bpf_sock_ops, netns_dev) ...
> + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> +#ifdef CONFIG_NET_NS
> + if (off - offsetof(struct bpf_sock_ops, netns_dev)
> + + size > sizeof(u64))
This will allow something off = 1, size = 4. This is not what we want as
the access is not properly aligned.
You can look at function bpf_skb_is_valid_access(), esp. the two lines below:
bpf_ctx_record_field_size(info, size_default);
if (!bpf_ctx_narrow_access_ok(off, size, size_default))
return false;
> + return false;
> +#else
> + return false;
> +#endif
> + break;
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + if (size != sizeof(u64))
> + return false;
> +#else
> + return false;
> +#endif
> + break;
> case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> bytes_acked):
> if (size != sizeof(__u64))
> @@ -7727,6 +7747,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> return insn - insn_buf;
> }
>
> +static struct ns_common *sockops_netns_cb(void *private_data)
> +{
> + return &init_net.ns;
> +}
> +
> static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -7735,6 +7760,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> {
> struct bpf_insn *insn = insn_buf;
> int off;
> + struct inode *ns_inode;
> + struct path ns_path;
> + u64 netns_dev;
> + void *res;
>
> /* Helper macro for adding read access to tcp_sock or sock fields. */
> #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> @@ -7981,6 +8010,71 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> struct sock, type);
> break;
> +
> + case offsetof(struct bpf_sock_ops, netns_dev) ...
> + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> +#ifdef CONFIG_NET_NS
> + /* We get the netns_dev at BPF-load-time and not at
> + * BPF-exec-time. We assume that netns_dev is a constant.
> + */
> + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> + if (IS_ERR(res)) {
> + netns_dev = 0;
> + } else {
> + ns_inode = ns_path.dentry->d_inode;
> + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> + }
> + off = si->off;
> + off -= offsetof(struct bpf_sock_ops, netns_dev);
> + switch (BPF_LDST_BYTES(si)) {
> + case sizeof(u64):
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> + break;
> + case sizeof(u32):
> + netns_dev = *(u32 *)(((char *)&netns_dev) + off);
> + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> + break;
> + case sizeof(u16):
> + netns_dev = *(u16 *)(((char *)&netns_dev) + off);
> + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> + break;
> + case sizeof(u8):
> + netns_dev = *(u8 *)(((char *)&netns_dev) + off);
> + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> + break;
> + }
> +#endif
> + break;
> +
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> + * Type: (struct bpf_sock_ops_kern *)
> + * ->(struct sock *)
> + * ->(struct sock_common)
> + * .possible_net_t
> + * .(struct net *)
> + * ->(struct ns_common)
> + * .(unsigned int)
> + */
> + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct bpf_sock_ops_kern, sk),
> + si->dst_reg, si->src_reg,
> + offsetof(struct bpf_sock_ops_kern, sk));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + possible_net_t, net),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct sock_common, skc_net));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct ns_common, inum),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct net, ns) +
> + offsetof(struct ns_common, inum));
> +#endif
> + break;
> +
> }
> return insn - insn_buf;
> }
> --
> 2.20.1
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context
2019-04-27 16:34 ` Y Song
@ 2019-04-29 15:34 ` Alban Crequy
0 siblings, 0 replies; 10+ messages in thread
From: Alban Crequy @ 2019-04-29 15:34 UTC (permalink / raw)
To: Y Song
Cc: Alban Crequy, John Fastabend, Alexei Starovoitov,
Daniel Borkmann, bpf, netdev, LKML, Iago López Galeiras
On Sat, Apr 27, 2019 at 6:35 PM Y Song <ys114321@gmail.com> wrote:
>
> On Fri, Apr 26, 2019 at 8:50 AM Alban Crequy <alban.crequy@gmail.com> wrote:
> >
> > From: Alban Crequy <alban@kinvolk.io>
> >
> > sockops programs can now access the network namespace inode and device
> > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > to apply different policies on different network namespaces.
> >
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > The generated BPF bytecode for netns_ino is loading the correct inode
> > number at the time of execution.
> >
> > However, the generated BPF bytecode for netns_dev is loading an
> > immediate value determined at BPF-load-time by looking at the initial
> > network namespace. In practice, this works because all netns currently
> > use the same virtual device. If this was to change, this code would need
> > to be updated too.
> >
> > Signed-off-by: Alban Crequy <alban@kinvolk.io>
> >
> > ---
> >
> > Changes since v1:
> > - add netns_dev (review from Alexei)
> >
> > Changes since v2:
> > - replace __u64 by u64 in kernel code (review from Y Song)
> > - remove unneeded #else branch: program would be rejected in
> > is_valid_access (review from Y Song)
> > - allow partial reads (<u64) (review from Y Song)
> >
> > Note: I have not been able to fully test partial reads on netns_dev.
> > The following patches check partial reads in the verifier but it does
> > not actually execute the program to check if partial reads generate the
> > correct value. I tried to write a BPF program in C and declare the
> > struct bpf_sock_ops as a volatile variable and I could get llvm to
> > generate the BPF instructions to do partial loads. But then, I get the
> > verifier error "dereference of modified ctx ptr R2 off=184 disallowed",
> > explained in https://www.spinics.net/lists/netdev/msg531582.html
> > What do you think should be done here?
>
> You added partial read tests in test_verifier with raw asm codes.
> It should be good enough.
>
> For the compiler generated code causing verifier error, will take
> a detailed look later.
Thanks! To clarify my note: the patches I sent on the mailing list
don't generate a verifier error.
It only errors out when I try partial reads in C. You can see the code
of the failed attempt that generate the error here:
https://github.com/kinvolk/linux/blob/c5fe70990c897a866c7006a0068876b0fde9ee4d/tools/testing/selftests/bpf/test_sockmap_kern.h#L146-L176
So if the partial read tests in test_verifier with raw asm codes are
good enough, there is no need to investigate more on that.
> Also I did not see a cover letter. For a series with 4 patches, it would be
> the best if you can provide a separate cover letter.
Ok, I will do that for the next iteration.
> > ---
> > include/uapi/linux/bpf.h | 2 +
> > net/core/filter.c | 94 ++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 96 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index eaf2d3284248..f4f841dde42c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > __u32 sk_txhash;
> > __u64 bytes_received;
> > __u64 bytes_acked;
> > + __u64 netns_dev;
> > + __u64 netns_ino;
> > };
> >
> > /* Definitions for bpf_sock_ops_cb_flags */
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 2f88baf39cc2..9c77464b1501 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -75,6 +75,8 @@
> > #include <net/seg6_local.h>
> > #include <net/lwtunnel.h>
> > #include <net/ipv6_stubs.h>
> > +#include <linux/kdev_t.h>
> > +#include <linux/proc_ns.h>
> >
> > /**
> > * sk_filter_trim_cap - run a packet through a socket filter
> > @@ -6810,6 +6812,24 @@ static bool sock_ops_is_valid_access(int off, int size,
> > }
> > } else {
> > switch (off) {
> > + case offsetof(struct bpf_sock_ops, netns_dev) ...
> > + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> > +#ifdef CONFIG_NET_NS
> > + if (off - offsetof(struct bpf_sock_ops, netns_dev)
> > + + size > sizeof(u64))
>
> This will allow something off = 1, size = 4. This is not what we want as
> the access is not properly aligned.
sock_ops_is_valid_access() does not allow off = 1, size = 4. There is
this check at the beginning of the function:
if (off % size != 0)
return false;
> You can look at function bpf_skb_is_valid_access(), esp. the two lines below:
> bpf_ctx_record_field_size(info, size_default);
> if (!bpf_ctx_narrow_access_ok(off, size, size_default))
> return false;
Thanks for the pointer! I now see that if I use them, my code in
sock_ops_convert_ctx_access() can be simplified.
> > + return false;
> > +#else
> > + return false;
> > +#endif
> > + break;
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + if (size != sizeof(u64))
> > + return false;
> > +#else
> > + return false;
> > +#endif
> > + break;
> > case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> > bytes_acked):
> > if (size != sizeof(__u64))
> > @@ -7727,6 +7747,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> > return insn - insn_buf;
> > }
> >
> > +static struct ns_common *sockops_netns_cb(void *private_data)
> > +{
> > + return &init_net.ns;
> > +}
> > +
> > static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > const struct bpf_insn *si,
> > struct bpf_insn *insn_buf,
> > @@ -7735,6 +7760,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > {
> > struct bpf_insn *insn = insn_buf;
> > int off;
> > + struct inode *ns_inode;
> > + struct path ns_path;
> > + u64 netns_dev;
> > + void *res;
> >
> > /* Helper macro for adding read access to tcp_sock or sock fields. */
> > #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> > @@ -7981,6 +8010,71 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> > struct sock, type);
> > break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_dev) ...
> > + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> > +#ifdef CONFIG_NET_NS
> > + /* We get the netns_dev at BPF-load-time and not at
> > + * BPF-exec-time. We assume that netns_dev is a constant.
> > + */
> > + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> > + if (IS_ERR(res)) {
> > + netns_dev = 0;
> > + } else {
> > + ns_inode = ns_path.dentry->d_inode;
> > + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> > + }
> > + off = si->off;
> > + off -= offsetof(struct bpf_sock_ops, netns_dev);
> > + switch (BPF_LDST_BYTES(si)) {
> > + case sizeof(u64):
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> > + break;
> > + case sizeof(u32):
> > + netns_dev = *(u32 *)(((char *)&netns_dev) + off);
> > + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> > + break;
> > + case sizeof(u16):
> > + netns_dev = *(u16 *)(((char *)&netns_dev) + off);
> > + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> > + break;
> > + case sizeof(u8):
> > + netns_dev = *(u8 *)(((char *)&netns_dev) + off);
> > + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> > + break;
> > + }
> > +#endif
> > + break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> > + * Type: (struct bpf_sock_ops_kern *)
> > + * ->(struct sock *)
> > + * ->(struct sock_common)
> > + * .possible_net_t
> > + * .(struct net *)
> > + * ->(struct ns_common)
> > + * .(unsigned int)
> > + */
> > + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> > + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct bpf_sock_ops_kern, sk),
> > + si->dst_reg, si->src_reg,
> > + offsetof(struct bpf_sock_ops_kern, sk));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + possible_net_t, net),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct sock_common, skc_net));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct ns_common, inum),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct net, ns) +
> > + offsetof(struct ns_common, inum));
> > +#endif
> > + break;
> > +
> > }
> > return insn - insn_buf;
> > }
> > --
> > 2.20.1
> >
^ permalink raw reply [flat|nested] 10+ messages in thread