* [PATCH bpf-next v2 2/3] selftests: bpf: read netns from struct bpf_sock_ops
2019-04-18 15:56 [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
@ 2019-04-18 15:56 ` Alban Crequy
2019-04-18 21:35 ` Y Song
2019-04-18 15:56 ` [PATCH bpf-next v2 3/3] selftests: bpf: verifier: read netns_dev and netns_ino " Alban Crequy
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Alban Crequy @ 2019-04-18 15:56 UTC (permalink / raw)
To: john.fastabend, ast, daniel; +Cc: bpf, netdev, linux-kernel, alban, iago
From: Alban Crequy <alban@kinvolk.io>
This shows how a sockops program could be restricted to a specific
network namespace. The sockops program looks at the current netns via
(struct bpf_sock_ops)->netns and checks if the value matches the
configuration in the new BPF map "sock_netns".
The test program ./test_sockmap accepts a new parameter "--netns"; the
default value is the current netns found by stat() on /proc/self/ns/net,
so the previous tests still pass:
sudo ./test_sockmap
...
Summary: 412 PASSED 0 FAILED
...
Summary: 824 PASSED 0 FAILED
I run my additional test in the following way:
NETNS=$(readlink /proc/self/ns/net | sed 's/^net:\[\(.*\)\]$/\1/')
CGR=/sys/fs/cgroup/unified/user.slice/user-1000.slice/session-5.scope/
sudo ./test_sockmap --cgroup $CGR --netns $NETNS &
cat /sys/kernel/debug/tracing/trace_pipe
echo foo | nc -l 127.0.0.1 8080 &
echo bar | nc 127.0.0.1 8080
=> the connection goes through the sockmap
When testing with a wrong $NETNS, I get the trace_pipe log:
> not binding connection on netns 4026531992
Signed-off-by: Alban Crequy <alban@kinvolk.io>
---
Changes since v1:
- tools/include/uapi/linux/bpf.h: update with netns_dev
- tools/testing/selftests/bpf/test_sockmap_kern.h: print debugs with
both netns_dev and netns_ino
---
tools/include/uapi/linux/bpf.h | 2 +
tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++++++++--
.../testing/selftests/bpf/test_sockmap_kern.h | 22 +++++++++++
3 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 704bb69514a2..eb56620a9d7a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3206,6 +3206,8 @@ struct bpf_sock_ops {
__u32 sk_txhash;
__u64 bytes_received;
__u64 bytes_acked;
+ __u64 netns_dev;
+ __u64 netns_ino;
};
/* Definitions for bpf_sock_ops_cb_flags */
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 3845144e2c91..5a1b9c96fca1 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -2,6 +2,7 @@
// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/select.h>
@@ -21,6 +22,7 @@
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/sendfile.h>
+#include <sys/stat.h>
#include <linux/netlink.h>
#include <linux/socket.h>
@@ -63,8 +65,8 @@ int s1, s2, c1, c2, p1, p2;
int test_cnt;
int passed;
int failed;
-int map_fd[8];
-struct bpf_map *maps[8];
+int map_fd[9];
+struct bpf_map *maps[9];
int prog_fd[11];
int txmsg_pass;
@@ -84,6 +86,7 @@ int txmsg_ingress;
int txmsg_skb;
int ktls;
int peek_flag;
+uint64_t netns_opt;
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@@ -111,6 +114,7 @@ static const struct option long_options[] = {
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
+ {"netns", required_argument, NULL, 'n'},
{0, 0, NULL, 0 }
};
@@ -1585,6 +1589,7 @@ char *map_names[] = {
"sock_bytes",
"sock_redir_flags",
"sock_skb_opts",
+ "sock_netns",
};
int prog_attach_type[] = {
@@ -1619,6 +1624,8 @@ static int populate_progs(char *bpf_file)
struct bpf_object *obj;
int i = 0;
long err;
+ struct stat netns_sb;
+ uint64_t netns_ino;
obj = bpf_object__open(bpf_file);
err = libbpf_get_error(obj);
@@ -1655,6 +1662,28 @@ static int populate_progs(char *bpf_file)
}
}
+ if (netns_opt == 0) {
+ err = stat("/proc/self/ns/net", &netns_sb);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: cannot stat network namespace: %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+ netns_ino = netns_sb.st_ino;
+ } else {
+ netns_ino = netns_opt;
+ }
+ i = 1;
+ err = bpf_map_update_elem(map_fd[8], &netns_ino, &i, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (netns): %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+
+
return 0;
}
@@ -1738,7 +1767,7 @@ int main(int argc, char **argv)
if (argc < 2)
return test_suite(-1);
- while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
+ while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:n:",
long_options, &longindex)) != -1) {
switch (opt) {
case 's':
@@ -1805,6 +1834,9 @@ int main(int argc, char **argv)
return -1;
}
break;
+ case 'n':
+ netns_opt = strtoull(optarg, NULL, 10);
+ break;
case 0:
break;
case 'h':
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
index e7639f66a941..317406dad6cf 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -91,6 +91,13 @@ struct bpf_map_def SEC("maps") sock_skb_opts = {
.max_entries = 1
};
+struct bpf_map_def SEC("maps") sock_netns = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__u64),
+ .value_size = sizeof(int),
+ .max_entries = 16
+};
+
SEC("sk_skb1")
int bpf_prog1(struct __sk_buff *skb)
{
@@ -132,9 +139,24 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
{
__u32 lport, rport;
int op, err = 0, index, key, ret;
+ int i = 0;
+ __u64 netns_dev, netns_ino;
+ int *allowed;
op = (int) skops->op;
+ netns_dev = skops->netns_dev;
+ netns_ino = skops->netns_ino;
+ bpf_printk("bpf_sockmap: netns_dev = %lu netns_ino = %lu\n",
+ netns_dev, netns_ino);
+
+ // Only allow sockmap connection on the configured network namespace
+ allowed = bpf_map_lookup_elem(&sock_netns, &netns_ino);
+ if (allowed == NULL || *allowed == 0) {
+ bpf_printk("not binding connection on netns_ino %lu\n",
+ netns_ino);
+ return 0;
+ }
switch (op) {
case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
--
2.20.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/3] selftests: bpf: read netns from struct bpf_sock_ops
2019-04-18 15:56 ` [PATCH bpf-next v2 2/3] selftests: bpf: read netns from struct bpf_sock_ops Alban Crequy
@ 2019-04-18 21:35 ` Y Song
0 siblings, 0 replies; 9+ messages in thread
From: Y Song @ 2019-04-18 21:35 UTC (permalink / raw)
To: Alban Crequy
Cc: John Fastabend, Alexei Starovoitov, Daniel Borkmann, bpf, netdev,
LKML, Alban Crequy, Iago López Galeiras
On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <alban.crequy@gmail.com> wrote:
>
> From: Alban Crequy <alban@kinvolk.io>
>
> This shows how a sockops program could be restricted to a specific
> network namespace. The sockops program looks at the current netns via
> (struct bpf_sock_ops)->netns and checks if the value matches the
> configuration in the new BPF map "sock_netns".
>
> The test program ./test_sockmap accepts a new parameter "--netns"; the
> default value is the current netns found by stat() on /proc/self/ns/net,
> so the previous tests still pass:
>
> sudo ./test_sockmap
> ...
> Summary: 412 PASSED 0 FAILED
> ...
> Summary: 824 PASSED 0 FAILED
>
> I run my additional test in the following way:
>
> NETNS=$(readlink /proc/self/ns/net | sed 's/^net:\[\(.*\)\]$/\1/')
> CGR=/sys/fs/cgroup/unified/user.slice/user-1000.slice/session-5.scope/
> sudo ./test_sockmap --cgroup $CGR --netns $NETNS &
>
> cat /sys/kernel/debug/tracing/trace_pipe
>
> echo foo | nc -l 127.0.0.1 8080 &
> echo bar | nc 127.0.0.1 8080
>
> => the connection goes through the sockmap
>
> When testing with a wrong $NETNS, I get the trace_pipe log:
> > not binding connection on netns 4026531992
>
> Signed-off-by: Alban Crequy <alban@kinvolk.io>
>
> ---
>
> Changes since v1:
> - tools/include/uapi/linux/bpf.h: update with netns_dev
Please have a separate patch for the uapi header sync
(tools/include/uapi/linux/bpf.h).
> - tools/testing/selftests/bpf/test_sockmap_kern.h: print debugs with
> both netns_dev and netns_ino
> ---
> tools/include/uapi/linux/bpf.h | 2 +
> tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++++++++--
> .../testing/selftests/bpf/test_sockmap_kern.h | 22 +++++++++++
> 3 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 704bb69514a2..eb56620a9d7a 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -3206,6 +3206,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;
> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
> index 3845144e2c91..5a1b9c96fca1 100644
> --- a/tools/testing/selftests/bpf/test_sockmap.c
> +++ b/tools/testing/selftests/bpf/test_sockmap.c
> @@ -2,6 +2,7 @@
> // Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
> #include <stdio.h>
> #include <stdlib.h>
> +#include <stdint.h>
> #include <sys/socket.h>
> #include <sys/ioctl.h>
> #include <sys/select.h>
> @@ -21,6 +22,7 @@
> #include <sys/resource.h>
> #include <sys/types.h>
> #include <sys/sendfile.h>
> +#include <sys/stat.h>
>
> #include <linux/netlink.h>
> #include <linux/socket.h>
> @@ -63,8 +65,8 @@ int s1, s2, c1, c2, p1, p2;
> int test_cnt;
> int passed;
> int failed;
> -int map_fd[8];
> -struct bpf_map *maps[8];
> +int map_fd[9];
> +struct bpf_map *maps[9];
> int prog_fd[11];
>
> int txmsg_pass;
> @@ -84,6 +86,7 @@ int txmsg_ingress;
> int txmsg_skb;
> int ktls;
> int peek_flag;
> +uint64_t netns_opt;
>
> static const struct option long_options[] = {
> {"help", no_argument, NULL, 'h' },
> @@ -111,6 +114,7 @@ static const struct option long_options[] = {
> {"txmsg_skb", no_argument, &txmsg_skb, 1 },
> {"ktls", no_argument, &ktls, 1 },
> {"peek", no_argument, &peek_flag, 1 },
> + {"netns", required_argument, NULL, 'n'},
> {0, 0, NULL, 0 }
> };
>
> @@ -1585,6 +1589,7 @@ char *map_names[] = {
> "sock_bytes",
> "sock_redir_flags",
> "sock_skb_opts",
> + "sock_netns",
> };
>
> int prog_attach_type[] = {
> @@ -1619,6 +1624,8 @@ static int populate_progs(char *bpf_file)
> struct bpf_object *obj;
> int i = 0;
> long err;
> + struct stat netns_sb;
> + uint64_t netns_ino;
>
> obj = bpf_object__open(bpf_file);
> err = libbpf_get_error(obj);
> @@ -1655,6 +1662,28 @@ static int populate_progs(char *bpf_file)
> }
> }
>
> + if (netns_opt == 0) {
> + err = stat("/proc/self/ns/net", &netns_sb);
> + if (err) {
> + fprintf(stderr,
> + "ERROR: cannot stat network namespace: %ld (%s)\n",
> + err, strerror(errno));
> + return -1;
> + }
> + netns_ino = netns_sb.st_ino;
> + } else {
> + netns_ino = netns_opt;
> + }
> + i = 1;
> + err = bpf_map_update_elem(map_fd[8], &netns_ino, &i, BPF_ANY);
> + if (err) {
> + fprintf(stderr,
> + "ERROR: bpf_map_update_elem (netns): %ld (%s)\n",
> + err, strerror(errno));
> + return -1;
> + }
> +
> +
> return 0;
> }
>
> @@ -1738,7 +1767,7 @@ int main(int argc, char **argv)
> if (argc < 2)
> return test_suite(-1);
>
> - while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
> + while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:n:",
> long_options, &longindex)) != -1) {
> switch (opt) {
> case 's':
> @@ -1805,6 +1834,9 @@ int main(int argc, char **argv)
> return -1;
> }
> break;
> + case 'n':
> + netns_opt = strtoull(optarg, NULL, 10);
> + break;
> case 0:
> break;
> case 'h':
> diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
> index e7639f66a941..317406dad6cf 100644
> --- a/tools/testing/selftests/bpf/test_sockmap_kern.h
> +++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
> @@ -91,6 +91,13 @@ struct bpf_map_def SEC("maps") sock_skb_opts = {
> .max_entries = 1
> };
>
> +struct bpf_map_def SEC("maps") sock_netns = {
> + .type = BPF_MAP_TYPE_HASH,
> + .key_size = sizeof(__u64),
> + .value_size = sizeof(int),
> + .max_entries = 16
> +};
> +
> SEC("sk_skb1")
> int bpf_prog1(struct __sk_buff *skb)
> {
> @@ -132,9 +139,24 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
> {
> __u32 lport, rport;
> int op, err = 0, index, key, ret;
> + int i = 0;
> + __u64 netns_dev, netns_ino;
> + int *allowed;
>
>
> op = (int) skops->op;
> + netns_dev = skops->netns_dev;
> + netns_ino = skops->netns_ino;
> + bpf_printk("bpf_sockmap: netns_dev = %lu netns_ino = %lu\n",
> + netns_dev, netns_ino);
> +
> + // Only allow sockmap connection on the configured network namespace
> + allowed = bpf_map_lookup_elem(&sock_netns, &netns_ino);
> + if (allowed == NULL || *allowed == 0) {
> + bpf_printk("not binding connection on netns_ino %lu\n",
> + netns_ino);
> + return 0;
> + }
>
> switch (op) {
> case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
> --
> 2.20.1
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH bpf-next v2 3/3] selftests: bpf: verifier: read netns_dev and netns_ino from struct bpf_sock_ops
2019-04-18 15:56 [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
2019-04-18 15:56 ` [PATCH bpf-next v2 2/3] selftests: bpf: read netns from struct bpf_sock_ops Alban Crequy
@ 2019-04-18 15:56 ` Alban Crequy
2019-04-18 21:31 ` [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context Y Song
2019-04-18 21:41 ` Song Liu
3 siblings, 0 replies; 9+ messages in thread
From: Alban Crequy @ 2019-04-18 15:56 UTC (permalink / raw)
To: john.fastabend, ast, daniel; +Cc: bpf, netdev, linux-kernel, alban, iago
From: Alban Crequy <alban@kinvolk.io>
Tested with:
> $ sudo ./test_verifier
> ...
> #905/p sockops accessing bpf_sock_ops->netns_dev and netns_ino, ok OK
> ...
> Summary: 1420 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Alban Crequy <alban@kinvolk.io>
---
Changes since v1:
- This is a new selftest (review from Song)
---
tools/testing/selftests/bpf/verifier/var_off.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c
index 8504ac937809..1ad3f64145ba 100644
--- a/tools/testing/selftests/bpf/verifier/var_off.c
+++ b/tools/testing/selftests/bpf/verifier/var_off.c
@@ -246,3 +246,16 @@
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_LWT_IN,
},
+{
+ "sockops accessing bpf_sock_ops->netns_dev and netns_ino, ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_ino)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
--
2.20.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context
2019-04-18 15:56 [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
2019-04-18 15:56 ` [PATCH bpf-next v2 2/3] selftests: bpf: read netns from struct bpf_sock_ops Alban Crequy
2019-04-18 15:56 ` [PATCH bpf-next v2 3/3] selftests: bpf: verifier: read netns_dev and netns_ino " Alban Crequy
@ 2019-04-18 21:31 ` Y Song
2019-04-19 11:51 ` Alban Crequy
2019-04-18 21:41 ` Song Liu
3 siblings, 1 reply; 9+ messages in thread
From: Y Song @ 2019-04-18 21:31 UTC (permalink / raw)
To: Alban Crequy
Cc: John Fastabend, Alexei Starovoitov, Daniel Borkmann, bpf, netdev,
LKML, Alban Crequy, Iago López Galeiras
On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <alban.crequy@gmail.com> wrote:
>
> From: Alban Crequy <alban@kinvolk.io>
>
> sockops programs can now access the network namespace inode and device
> via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> to apply different policies on different network namespaces.
>
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> The generated BPF bytecode for netns_ino is loading the correct inode
> number at the time of execution.
>
> However, the generated BPF bytecode for netns_dev is loading an
> immediate value determined at BPF-load-time by looking at the initial
> network namespace. In practice, this works because all netns currently
> use the same virtual device. If this was to change, this code would need
> to be updated too.
>
> Signed-off-by: Alban Crequy <alban@kinvolk.io>
>
> ---
>
> Changes since v1:
> - add netns_dev (review from Alexei)
> ---
> include/uapi/linux/bpf.h | 2 ++
> net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 72 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index eaf2d3284248..f4f841dde42c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;
Maybe we can define netns_dev as __u32?
__u64 netns_ino;
__u32 netns_dev;
There is a hole at the end which can be used if the next
field to be added in the future is a __u32.
From
static inline u32 new_encode_dev(dev_t dev)
{
unsigned major = MAJOR(dev);
unsigned minor = MINOR(dev);
return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
}
device num is encoded in a u32.
> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1833926a63fc..93e3429603d7 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -75,6 +75,8 @@
> #include <net/seg6_local.h>
> #include <net/lwtunnel.h>
> #include <net/ipv6_stubs.h>
> +#include <linux/kdev_t.h>
> +#include <linux/proc_ns.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> }
> } else {
> switch (off) {
> + case offsetof(struct bpf_sock_ops, netns_dev):
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + if (size != sizeof(__u64))
> + return false;
for netns_dev, looks like we have encoding like
(minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12)
it is possible user may access like:
u16 dev = sk->netns_dev;
the compiler may translate into a 2-byte load
or even
u8 major = (sk->netns_dev >> 8) & 0xf
the compiler may translate to a byte load of &sk->netns_dev + 1 or 3
depends on endianness.
I suggest we relax the access pattern now to avoid patch later.
> +#else
> + return false;
> +#endif
> + break;
> case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> bytes_acked):
> if (size != sizeof(__u64))
> @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> return insn - insn_buf;
> }
>
> +static struct ns_common *sockops_netns_cb(void *private_data)
> +{
> + return &init_net.ns;
> +}
> +
> static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> {
> struct bpf_insn *insn = insn_buf;
> int off;
> + struct inode *ns_inode;
> + struct path ns_path;
> + __u64 netns_dev;
Although there are a few discrepancies, I suggest to use u64 for
kernel as __u64 is typically
used in user space.
> + void *res;
>
> /* Helper macro for adding read access to tcp_sock or sock fields. */
> #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> struct sock, type);
> break;
> +
> + case offsetof(struct bpf_sock_ops, netns_dev):
> +#ifdef CONFIG_NET_NS
> + /* We get the netns_dev at BPF-load-time and not at
> + * BPF-exec-time. We assume that netns_dev is a constant.
> + */
> + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> + if (IS_ERR(res)) {
> + netns_dev = 0;
> + } else {
> + ns_inode = ns_path.dentry->d_inode;
> + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> + }
> +#else
> + netns_dev = 0;
The #else branch is not needed. During is_valid_access check, if CONFIG_NET_NS
is not defined, the program should have been rejected.
> +#endif
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> + break;
> +
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> + * Type: (struct bpf_sock_ops_kern *)
> + * ->(struct sock *)
> + * ->(struct sock_common)
> + * .possible_net_t
> + * .(struct net *)
> + * ->(struct ns_common)
> + * .(unsigned int)
> + */
> + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct bpf_sock_ops_kern, sk),
> + si->dst_reg, si->src_reg,
> + offsetof(struct bpf_sock_ops_kern, sk));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + possible_net_t, net),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct sock_common, skc_net));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct ns_common, inum),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct net, ns) +
> + offsetof(struct ns_common, inum));
> +#else
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
The same as above, this else branch is not needed.
> +#endif
> + break;
> +
> }
> return insn - insn_buf;
> }
> --
> 2.20.1
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context
2019-04-18 21:31 ` [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context Y Song
@ 2019-04-19 11:51 ` Alban Crequy
2019-04-19 22:59 ` Y Song
0 siblings, 1 reply; 9+ messages in thread
From: Alban Crequy @ 2019-04-19 11:51 UTC (permalink / raw)
To: Y Song
Cc: Alban Crequy, John Fastabend, Alexei Starovoitov,
Daniel Borkmann, bpf, netdev, LKML, Alban Crequy,
Iago López Galeiras
On Thu, Apr 18, 2019 at 11:31 PM Y Song <ys114321@gmail.com> wrote:
>
> On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <alban.crequy@gmail.com> wrote:
> >
> > From: Alban Crequy <alban@kinvolk.io>
> >
> > sockops programs can now access the network namespace inode and device
> > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > to apply different policies on different network namespaces.
> >
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > The generated BPF bytecode for netns_ino is loading the correct inode
> > number at the time of execution.
> >
> > However, the generated BPF bytecode for netns_dev is loading an
> > immediate value determined at BPF-load-time by looking at the initial
> > network namespace. In practice, this works because all netns currently
> > use the same virtual device. If this was to change, this code would need
> > to be updated too.
> >
> > Signed-off-by: Alban Crequy <alban@kinvolk.io>
> >
> > ---
> >
> > Changes since v1:
> > - add netns_dev (review from Alexei)
> > ---
> > include/uapi/linux/bpf.h | 2 ++
> > net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 72 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index eaf2d3284248..f4f841dde42c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > __u32 sk_txhash;
> > __u64 bytes_received;
> > __u64 bytes_acked;
> > + __u64 netns_dev;
> > + __u64 netns_ino;
>
> Maybe we can define netns_dev as __u32?
> __u64 netns_ino;
> __u32 netns_dev;
>
> There is a hole at the end which can be used if the next
> field to be added in the future is a __u32.
>
> From
> static inline u32 new_encode_dev(dev_t dev)
> {
> unsigned major = MAJOR(dev);
> unsigned minor = MINOR(dev);
> return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
> }
>
> device num is encoded in a u32.
I could do that but there are already two occurrences of "__u64
netns_dev" in bpf.h:
- struct bpf_prog_info
- struct bpf_map_info
Should I keep it a u64 for consistency with the rest of bpf.h, or
change it to u32?
> > };
> >
> > /* Definitions for bpf_sock_ops_cb_flags */
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 1833926a63fc..93e3429603d7 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -75,6 +75,8 @@
> > #include <net/seg6_local.h>
> > #include <net/lwtunnel.h>
> > #include <net/ipv6_stubs.h>
> > +#include <linux/kdev_t.h>
> > +#include <linux/proc_ns.h>
> >
> > /**
> > * sk_filter_trim_cap - run a packet through a socket filter
> > @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> > }
> > } else {
> > switch (off) {
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + if (size != sizeof(__u64))
> > + return false;
>
> for netns_dev, looks like we have encoding like
> (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12)
>
> it is possible user may access like:
> u16 dev = sk->netns_dev;
> the compiler may translate into a 2-byte load
> or even
> u8 major = (sk->netns_dev >> 8) & 0xf
> the compiler may translate to a byte load of &sk->netns_dev + 1 or 3
> depends on endianness.
> I suggest we relax the access pattern now to avoid patch later.
Ok, I can do that.
My use case was to compare (struct bpf_sock_ops)->netns_dev/netns_ino
with the value from a map prepared from userspace, i.e. the value of
netns_dev/ino is an opaque buffer for the BPF program, so the problem
is unlikely. But I've got bitten by a similar thing before (struct
sk_msg_md->remote_port/local_port was read in one go as a u64) so we
can improve this a bit.
Thanks for the review, I'll do the rest of the requested changes
probably end of next week.
> > +#else
> > + return false;
> > +#endif
> > + break;
> > case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> > bytes_acked):
> > if (size != sizeof(__u64))
> > @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> > return insn - insn_buf;
> > }
> >
> > +static struct ns_common *sockops_netns_cb(void *private_data)
> > +{
> > + return &init_net.ns;
> > +}
> > +
> > static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > const struct bpf_insn *si,
> > struct bpf_insn *insn_buf,
> > @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > {
> > struct bpf_insn *insn = insn_buf;
> > int off;
> > + struct inode *ns_inode;
> > + struct path ns_path;
> > + __u64 netns_dev;
>
> Although there are a few discrepancies, I suggest to use u64 for
> kernel as __u64 is typically
> used in user space.
>
> > + void *res;
> >
> > /* Helper macro for adding read access to tcp_sock or sock fields. */
> > #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> > @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> > struct sock, type);
> > break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > +#ifdef CONFIG_NET_NS
> > + /* We get the netns_dev at BPF-load-time and not at
> > + * BPF-exec-time. We assume that netns_dev is a constant.
> > + */
> > + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> > + if (IS_ERR(res)) {
> > + netns_dev = 0;
> > + } else {
> > + ns_inode = ns_path.dentry->d_inode;
> > + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> > + }
> > +#else
> > + netns_dev = 0;
>
> The #else branch is not needed. During is_valid_access check, if CONFIG_NET_NS
> is not defined, the program should have been rejected.
>
> > +#endif
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> > + break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> > + * Type: (struct bpf_sock_ops_kern *)
> > + * ->(struct sock *)
> > + * ->(struct sock_common)
> > + * .possible_net_t
> > + * .(struct net *)
> > + * ->(struct ns_common)
> > + * .(unsigned int)
> > + */
> > + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> > + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct bpf_sock_ops_kern, sk),
> > + si->dst_reg, si->src_reg,
> > + offsetof(struct bpf_sock_ops_kern, sk));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + possible_net_t, net),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct sock_common, skc_net));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct ns_common, inum),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct net, ns) +
> > + offsetof(struct ns_common, inum));
> > +#else
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
>
> The same as above, this else branch is not needed.
>
> > +#endif
> > + break;
> > +
> > }
> > return insn - insn_buf;
> > }
> > --
> > 2.20.1
> >
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context
2019-04-19 11:51 ` Alban Crequy
@ 2019-04-19 22:59 ` Y Song
0 siblings, 0 replies; 9+ messages in thread
From: Y Song @ 2019-04-19 22:59 UTC (permalink / raw)
To: Alban Crequy
Cc: Alban Crequy, John Fastabend, Alexei Starovoitov,
Daniel Borkmann, bpf, netdev, LKML, Iago López Galeiras
On Fri, Apr 19, 2019 at 4:51 AM Alban Crequy <alban@kinvolk.io> wrote:
>
> On Thu, Apr 18, 2019 at 11:31 PM Y Song <ys114321@gmail.com> wrote:
> >
> > On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <alban.crequy@gmail.com> wrote:
> > >
> > > From: Alban Crequy <alban@kinvolk.io>
> > >
> > > sockops programs can now access the network namespace inode and device
> > > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > > to apply different policies on different network namespaces.
> > >
> > > In the unlikely case where network namespaces are not compiled in
> > > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> > >
> > > The generated BPF bytecode for netns_ino is loading the correct inode
> > > number at the time of execution.
> > >
> > > However, the generated BPF bytecode for netns_dev is loading an
> > > immediate value determined at BPF-load-time by looking at the initial
> > > network namespace. In practice, this works because all netns currently
> > > use the same virtual device. If this was to change, this code would need
> > > to be updated too.
> > >
> > > Signed-off-by: Alban Crequy <alban@kinvolk.io>
> > >
> > > ---
> > >
> > > Changes since v1:
> > > - add netns_dev (review from Alexei)
> > > ---
> > > include/uapi/linux/bpf.h | 2 ++
> > > net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> > > 2 files changed, 72 insertions(+)
> > >
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index eaf2d3284248..f4f841dde42c 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > > __u32 sk_txhash;
> > > __u64 bytes_received;
> > > __u64 bytes_acked;
> > > + __u64 netns_dev;
> > > + __u64 netns_ino;
> >
> > Maybe we can define netns_dev as __u32?
> > __u64 netns_ino;
> > __u32 netns_dev;
> >
> > There is a hole at the end which can be used if the next
> > field to be added in the future is a __u32.
> >
> > From
> > static inline u32 new_encode_dev(dev_t dev)
> > {
> > unsigned major = MAJOR(dev);
> > unsigned minor = MINOR(dev);
> > return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
> > }
> >
> > device num is encoded in a u32.
>
> I could do that but there are already two occurrences of "__u64
> netns_dev" in bpf.h:
> - struct bpf_prog_info
> - struct bpf_map_info
>
> Should I keep it a u64 for consistency with the rest of bpf.h, or
> change it to u32?
Agreed. We probably should keep it to be __u64 to be consistent with others.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context
2019-04-18 15:56 [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context Alban Crequy
` (2 preceding siblings ...)
2019-04-18 21:31 ` [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context Y Song
@ 2019-04-18 21:41 ` Song Liu
2019-04-18 21:51 ` Song Liu
3 siblings, 1 reply; 9+ messages in thread
From: Song Liu @ 2019-04-18 21:41 UTC (permalink / raw)
To: Alban Crequy
Cc: John Fastabend, Alexei Starovoitov, Daniel Borkmann, bpf,
Networking, open list, Alban Crequy (Kinvolk),
Iago López Galeiras
On Thu, Apr 18, 2019 at 8:59 AM Alban Crequy <alban.crequy@gmail.com> wrote:
>
> From: Alban Crequy <alban@kinvolk.io>
>
> sockops programs can now access the network namespace inode and device
> via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> to apply different policies on different network namespaces.
>
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> The generated BPF bytecode for netns_ino is loading the correct inode
> number at the time of execution.
>
> However, the generated BPF bytecode for netns_dev is loading an
> immediate value determined at BPF-load-time by looking at the initial
> network namespace. In practice, this works because all netns currently
> use the same virtual device. If this was to change, this code would need
> to be updated too.
>
> Signed-off-by: Alban Crequy <alban@kinvolk.io>
Acked-by: Song Liu <songliubraving@fb.com>
>
> ---
>
> Changes since v1:
> - add netns_dev (review from Alexei)
> ---
> include/uapi/linux/bpf.h | 2 ++
> net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 72 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index eaf2d3284248..f4f841dde42c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;
> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1833926a63fc..93e3429603d7 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -75,6 +75,8 @@
> #include <net/seg6_local.h>
> #include <net/lwtunnel.h>
> #include <net/ipv6_stubs.h>
> +#include <linux/kdev_t.h>
> +#include <linux/proc_ns.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> }
> } else {
> switch (off) {
> + case offsetof(struct bpf_sock_ops, netns_dev):
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + if (size != sizeof(__u64))
> + return false;
> +#else
> + return false;
> +#endif
> + break;
> case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> bytes_acked):
> if (size != sizeof(__u64))
> @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> return insn - insn_buf;
> }
>
> +static struct ns_common *sockops_netns_cb(void *private_data)
> +{
> + return &init_net.ns;
> +}
> +
> static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> {
> struct bpf_insn *insn = insn_buf;
> int off;
> + struct inode *ns_inode;
> + struct path ns_path;
> + __u64 netns_dev;
> + void *res;
>
> /* Helper macro for adding read access to tcp_sock or sock fields. */
> #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> struct sock, type);
> break;
> +
> + case offsetof(struct bpf_sock_ops, netns_dev):
> +#ifdef CONFIG_NET_NS
> + /* We get the netns_dev at BPF-load-time and not at
> + * BPF-exec-time. We assume that netns_dev is a constant.
> + */
> + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> + if (IS_ERR(res)) {
> + netns_dev = 0;
> + } else {
> + ns_inode = ns_path.dentry->d_inode;
> + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> + }
> +#else
> + netns_dev = 0;
> +#endif
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> + break;
> +
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> + * Type: (struct bpf_sock_ops_kern *)
> + * ->(struct sock *)
> + * ->(struct sock_common)
> + * .possible_net_t
> + * .(struct net *)
> + * ->(struct ns_common)
> + * .(unsigned int)
> + */
> + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct bpf_sock_ops_kern, sk),
> + si->dst_reg, si->src_reg,
> + offsetof(struct bpf_sock_ops_kern, sk));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + possible_net_t, net),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct sock_common, skc_net));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct ns_common, inum),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct net, ns) +
> + offsetof(struct ns_common, inum));
> +#else
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
> +#endif
> + break;
> +
> }
> return insn - insn_buf;
> }
> --
> 2.20.1
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context
2019-04-18 21:41 ` Song Liu
@ 2019-04-18 21:51 ` Song Liu
0 siblings, 0 replies; 9+ messages in thread
From: Song Liu @ 2019-04-18 21:51 UTC (permalink / raw)
To: Alban Crequy
Cc: John Fastabend, Alexei Starovoitov, Daniel Borkmann, bpf,
Networking, open list, Alban Crequy (Kinvolk),
Iago López Galeiras
On Thu, Apr 18, 2019 at 2:41 PM Song Liu <liu.song.a23@gmail.com> wrote:
>
> On Thu, Apr 18, 2019 at 8:59 AM Alban Crequy <alban.crequy@gmail.com> wrote:
> >
> > From: Alban Crequy <alban@kinvolk.io>
> >
> > sockops programs can now access the network namespace inode and device
> > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > to apply different policies on different network namespaces.
> >
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > The generated BPF bytecode for netns_ino is loading the correct inode
> > number at the time of execution.
> >
> > However, the generated BPF bytecode for netns_dev is loading an
> > immediate value determined at BPF-load-time by looking at the initial
> > network namespace. In practice, this works because all netns currently
> > use the same virtual device. If this was to change, this code would need
> > to be updated too.
> >
> > Signed-off-by: Alban Crequy <alban@kinvolk.io>
>
> Acked-by: Song Liu <songliubraving@fb.com>
Sorry! I didn't see Yonghong's comments. Please resolve those.
Sorry for the confusion.
Song
>
> >
> > ---
> >
> > Changes since v1:
> > - add netns_dev (review from Alexei)
> > ---
> > include/uapi/linux/bpf.h | 2 ++
> > net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 72 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index eaf2d3284248..f4f841dde42c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > __u32 sk_txhash;
> > __u64 bytes_received;
> > __u64 bytes_acked;
> > + __u64 netns_dev;
> > + __u64 netns_ino;
> > };
> >
> > /* Definitions for bpf_sock_ops_cb_flags */
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 1833926a63fc..93e3429603d7 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -75,6 +75,8 @@
> > #include <net/seg6_local.h>
> > #include <net/lwtunnel.h>
> > #include <net/ipv6_stubs.h>
> > +#include <linux/kdev_t.h>
> > +#include <linux/proc_ns.h>
> >
> > /**
> > * sk_filter_trim_cap - run a packet through a socket filter
> > @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> > }
> > } else {
> > switch (off) {
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + if (size != sizeof(__u64))
> > + return false;
> > +#else
> > + return false;
> > +#endif
> > + break;
> > case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> > bytes_acked):
> > if (size != sizeof(__u64))
> > @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> > return insn - insn_buf;
> > }
> >
> > +static struct ns_common *sockops_netns_cb(void *private_data)
> > +{
> > + return &init_net.ns;
> > +}
> > +
> > static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > const struct bpf_insn *si,
> > struct bpf_insn *insn_buf,
> > @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > {
> > struct bpf_insn *insn = insn_buf;
> > int off;
> > + struct inode *ns_inode;
> > + struct path ns_path;
> > + __u64 netns_dev;
> > + void *res;
> >
> > /* Helper macro for adding read access to tcp_sock or sock fields. */
> > #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> > @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> > struct sock, type);
> > break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > +#ifdef CONFIG_NET_NS
> > + /* We get the netns_dev at BPF-load-time and not at
> > + * BPF-exec-time. We assume that netns_dev is a constant.
> > + */
> > + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> > + if (IS_ERR(res)) {
> > + netns_dev = 0;
> > + } else {
> > + ns_inode = ns_path.dentry->d_inode;
> > + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> > + }
> > +#else
> > + netns_dev = 0;
> > +#endif
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> > + break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> > + * Type: (struct bpf_sock_ops_kern *)
> > + * ->(struct sock *)
> > + * ->(struct sock_common)
> > + * .possible_net_t
> > + * .(struct net *)
> > + * ->(struct ns_common)
> > + * .(unsigned int)
> > + */
> > + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> > + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct bpf_sock_ops_kern, sk),
> > + si->dst_reg, si->src_reg,
> > + offsetof(struct bpf_sock_ops_kern, sk));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + possible_net_t, net),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct sock_common, skc_net));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct ns_common, inum),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct net, ns) +
> > + offsetof(struct ns_common, inum));
> > +#else
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
> > +#endif
> > + break;
> > +
> > }
> > return insn - insn_buf;
> > }
> > --
> > 2.20.1
> >
^ permalink raw reply [flat|nested] 9+ messages in thread