* [PATCH 1/5] bpf: add map_copy_value hook [not found] <20200310174711.7490-1-lmb@cloudflare.com> @ 2020-03-10 17:47 ` Lorenz Bauer 2020-03-10 17:47 ` [PATCH 2/5] bpf: convert queue and stack map to map_copy_value Lorenz Bauer ` (3 subsequent siblings) 4 siblings, 0 replies; 14+ messages in thread From: Lorenz Bauer @ 2020-03-10 17:47 UTC (permalink / raw) To: Alexei Starovoitov, Daniel Borkmann Cc: kernel-team, Lorenz Bauer, netdev, bpf, linux-kernel bpf_map_copy_value has a lot of special cases for different map types that want more control than map_lookup_elem provides. On closer inspection, almost all of them follow the pattern int func(struct bpf_map *, void *, void *) Introduce a new member map_copy_value to struct bpf_map_ops, and convert the current special cases to use it. Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> --- include/linux/bpf-cgroup.h | 5 ----- include/linux/bpf.h | 21 +-------------------- include/linux/bpf_types.h | 2 +- kernel/bpf/arraymap.c | 13 ++++++++++--- kernel/bpf/bpf_struct_ops.c | 7 ++++--- kernel/bpf/hashtab.c | 10 +++++++--- kernel/bpf/local_storage.c | 14 +++++++++++++- kernel/bpf/reuseport_array.c | 5 +++-- kernel/bpf/syscall.c | 24 ++++-------------------- 9 files changed, 43 insertions(+), 58 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7cd5c7a2509..6741a6c460f6 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -162,7 +162,6 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map); -int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -370,10 +369,6 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum 
bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} -static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, - void *value) { - return 0; -} static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags) { return 0; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 94a329b9da81..ad9f3be830f0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -44,6 +44,7 @@ struct bpf_map_ops { int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); + int (*map_copy_value)(struct bpf_map *map, void *key, void *value); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_lookup_and_delete_batch)(struct bpf_map *map, @@ -741,8 +742,6 @@ const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); -int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, - void *value); static inline bool bpf_try_module_get(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) @@ -774,12 +773,6 @@ static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } -static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, - void *key, - void *value) -{ - return -EINVAL; -} #endif struct bpf_array { @@ -1082,8 +1075,6 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); -int bpf_percpu_array_copy(struct bpf_map *map, void 
*key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, @@ -1093,10 +1084,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); -int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); -int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, @@ -1437,8 +1426,6 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags); #else @@ -1447,12 +1434,6 @@ static inline void bpf_sk_reuseport_detach(struct sock *sk) } #ifdef CONFIG_BPF_SYSCALL -static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, - void *key, void *value) -{ - return -EOPNOTSUPP; -} - static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c81d4ece79a4..4949638cd049 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -81,7 +81,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif #ifdef CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, 
percpu_cgroup_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..58a0a8b3abe3 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -249,7 +249,8 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +static int percpu_array_map_copy_value(struct bpf_map *map, void *key, + void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -513,6 +514,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, + .map_copy_value = percpu_array_map_copy_value, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, @@ -550,7 +552,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) } /* only called from syscall */ -int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +static int fd_array_map_lookup_elem_sys_copy(struct bpf_map *map, void *key, + void *value) { void **elem, *ptr; int ret = 0; @@ -561,7 +564,7 @@ int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem && (ptr = READ_ONCE(*elem))) - *value = map->ops->map_fd_sys_lookup_elem(ptr); + *(u32 *)value = map->ops->map_fd_sys_lookup_elem(ptr); else ret = -ENOENT; rcu_read_unlock(); @@ -872,6 +875,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, + .map_copy_value = 
fd_array_map_lookup_elem_sys_copy, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, @@ -962,6 +966,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, + .map_copy_value = fd_array_map_lookup_elem_sys_copy, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, @@ -995,6 +1000,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, + .map_copy_value = fd_array_map_lookup_elem_sys_copy, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, @@ -1078,6 +1084,7 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, + .map_copy_value = fd_array_map_lookup_elem_sys_copy, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ca5cc8cdb6eb..cc1d7d1077c1 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -238,8 +238,8 @@ static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, return 0; } -int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, - void *value) +static int bpf_struct_ops_map_copy_value(struct bpf_map *map, void *key, + void *value) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; struct bpf_struct_ops_value *uvalue, *kvalue; @@ -509,7 +509,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, if (!value) return; - 
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + err = bpf_struct_ops_map_copy_value(map, key, value); if (!err) { btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, value, m); @@ -609,6 +609,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_free = bpf_struct_ops_map_free, .map_get_next_key = bpf_struct_ops_map_get_next_key, .map_lookup_elem = bpf_struct_ops_map_lookup_elem, + .map_copy_value = bpf_struct_ops_map_copy_value, .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..f5452a8a5177 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1664,7 +1664,8 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +static int htab_percpu_map_copy_value(struct bpf_map *map, void *key, + void *value) { struct htab_elem *l; void __percpu *pptr; @@ -1749,6 +1750,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_copy_value = htab_percpu_map_copy_value, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, @@ -1761,6 +1763,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_copy_value = htab_percpu_map_copy_value, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, @@ -1796,7 +1799,7 @@ static void fd_htab_map_free(struct bpf_map *map) } /* only called from syscall */ -int 
bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +static int fd_htab_map_copy_value(struct bpf_map *map, void *key, void *value) { void **ptr; int ret = 0; @@ -1807,7 +1810,7 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) rcu_read_lock(); ptr = htab_map_lookup_elem(map, key); if (ptr) - *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); + *(u32 *)value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); else ret = -ENOENT; rcu_read_unlock(); @@ -1893,6 +1896,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_of_map_lookup_elem, + .map_copy_value = fd_htab_map_copy_value, .map_delete_elem = htab_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 33d01866bcc2..fcc0b168dad2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -167,7 +167,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +static int percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); @@ -420,6 +420,18 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_seq_show_elem = cgroup_storage_seq_show_elem, }; +const struct bpf_map_ops percpu_cgroup_storage_map_ops = { + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = cgroup_storage_get_next_key, + .map_lookup_elem = cgroup_storage_lookup_elem, + .map_copy_value = percpu_cgroup_storage_copy, + .map_update_elem = cgroup_storage_update_elem, + .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = cgroup_storage_check_btf, + .map_seq_show_elem = cgroup_storage_seq_show_elem, +}; + int 
bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..f36ccbf2612e 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -178,8 +178,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) return &array->map; } -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value) +static int reuseport_array_copy_value(struct bpf_map *map, void *key, + void *value) { struct sock *sk; int err; @@ -350,6 +350,7 @@ const struct bpf_map_ops reuseport_array_ops = { .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, .map_lookup_elem = reuseport_array_lookup_elem, + .map_copy_value = reuseport_array_copy_value, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ce0815793dd..6503824e81e9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -218,27 +218,11 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { - err = bpf_fd_array_map_lookup_elem(map, key, value); - } else if (IS_FD_HASH(map)) { - err = bpf_fd_htab_map_lookup_elem(map, key, value); - } else if 
(map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - err = bpf_fd_reuseport_array_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { + if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_peek_elem(map, value); - } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - /* struct_ops map requires directly updating "value" */ - err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + } else if (map->ops->map_copy_value) { + err = map->ops->map_copy_value(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) -- 2.20.1 ^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 2/5] bpf: convert queue and stack map to map_copy_value [not found] <20200310174711.7490-1-lmb@cloudflare.com> 2020-03-10 17:47 ` [PATCH 1/5] bpf: add map_copy_value hook Lorenz Bauer @ 2020-03-10 17:47 ` Lorenz Bauer 2020-03-11 14:00 ` Jakub Sitnicki 2020-03-10 17:47 ` [PATCH 3/5] bpf: convert sock map and hash " Lorenz Bauer ` (2 subsequent siblings) 4 siblings, 1 reply; 14+ messages in thread From: Lorenz Bauer @ 2020-03-10 17:47 UTC (permalink / raw) To: Alexei Starovoitov, Daniel Borkmann Cc: kernel-team, Lorenz Bauer, netdev, bpf, linux-kernel Migrate BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK to map_copy_value, by introducing small wrappers that discard the (unused) key argument. Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> --- kernel/bpf/queue_stack_maps.c | 18 ++++++++++++++++++ kernel/bpf/syscall.c | 5 +---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..5c89b7583cd2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -262,11 +262,28 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } +/* Called from syscall */ +static int queue_map_copy_value(struct bpf_map *map, void *key, void *value) +{ + (void)key; + + return queue_map_peek_elem(map, value); +} + +/* Called from syscall */ +static int stack_map_copy_value(struct bpf_map *map, void *key, void *value) +{ + (void)key; + + return stack_map_peek_elem(map, value); +} + const struct bpf_map_ops queue_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, .map_lookup_elem = queue_stack_map_lookup_elem, + .map_copy_value = queue_map_copy_value, .map_update_elem = queue_stack_map_update_elem, .map_delete_elem = queue_stack_map_delete_elem, .map_push_elem = queue_stack_map_push_elem, @@ -280,6 +297,7 @@ const struct bpf_map_ops stack_map_ops = { .map_alloc = 
queue_stack_map_alloc, .map_free = queue_stack_map_free, .map_lookup_elem = queue_stack_map_lookup_elem, + .map_copy_value = stack_map_copy_value, .map_update_elem = queue_stack_map_update_elem, .map_delete_elem = queue_stack_map_delete_elem, .map_push_elem = queue_stack_map_push_elem, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6503824e81e9..20c6cdace275 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -218,10 +218,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); - if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { - err = map->ops->map_peek_elem(map, value); - } else if (map->ops->map_copy_value) { + if (map->ops->map_copy_value) { err = map->ops->map_copy_value(map, key, value); } else { rcu_read_lock(); -- 2.20.1 ^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 2/5] bpf: convert queue and stack map to map_copy_value 2020-03-10 17:47 ` [PATCH 2/5] bpf: convert queue and stack map to map_copy_value Lorenz Bauer @ 2020-03-11 14:00 ` Jakub Sitnicki 2020-03-11 22:31 ` John Fastabend 0 siblings, 1 reply; 14+ messages in thread From: Jakub Sitnicki @ 2020-03-11 14:00 UTC (permalink / raw) To: Lorenz Bauer Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team, netdev, bpf, linux-kernel On Tue, Mar 10, 2020 at 06:47 PM CET, Lorenz Bauer wrote: > Migrate BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK to map_copy_value, > by introducing small wrappers that discard the (unused) key argument. > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > --- > kernel/bpf/queue_stack_maps.c | 18 ++++++++++++++++++ > kernel/bpf/syscall.c | 5 +---- > 2 files changed, 19 insertions(+), 4 deletions(-) > > diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c > index f697647ceb54..5c89b7583cd2 100644 > --- a/kernel/bpf/queue_stack_maps.c > +++ b/kernel/bpf/queue_stack_maps.c > @@ -262,11 +262,28 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, > return -EINVAL; > } > > +/* Called from syscall */ > +static int queue_map_copy_value(struct bpf_map *map, void *key, void *value) > +{ > + (void)key; Alternatively, there's is the __always_unused compiler attribute from include/linux/compiler_attributes.h that seems to be in wide use. 
> + > + return queue_map_peek_elem(map, value); > +} > + > +/* Called from syscall */ > +static int stack_map_copy_value(struct bpf_map *map, void *key, void *value) > +{ > + (void)key; > + > + return stack_map_peek_elem(map, value); > +} > + > const struct bpf_map_ops queue_map_ops = { > .map_alloc_check = queue_stack_map_alloc_check, > .map_alloc = queue_stack_map_alloc, > .map_free = queue_stack_map_free, > .map_lookup_elem = queue_stack_map_lookup_elem, > + .map_copy_value = queue_map_copy_value, > .map_update_elem = queue_stack_map_update_elem, > .map_delete_elem = queue_stack_map_delete_elem, > .map_push_elem = queue_stack_map_push_elem, > @@ -280,6 +297,7 @@ const struct bpf_map_ops stack_map_ops = { > .map_alloc = queue_stack_map_alloc, > .map_free = queue_stack_map_free, > .map_lookup_elem = queue_stack_map_lookup_elem, > + .map_copy_value = stack_map_copy_value, > .map_update_elem = queue_stack_map_update_elem, > .map_delete_elem = queue_stack_map_delete_elem, > .map_push_elem = queue_stack_map_push_elem, > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 6503824e81e9..20c6cdace275 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -218,10 +218,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, > return bpf_map_offload_lookup_elem(map, key, value); > > bpf_disable_instrumentation(); > - if (map->map_type == BPF_MAP_TYPE_QUEUE || > - map->map_type == BPF_MAP_TYPE_STACK) { > - err = map->ops->map_peek_elem(map, value); > - } else if (map->ops->map_copy_value) { > + if (map->ops->map_copy_value) { > err = map->ops->map_copy_value(map, key, value); > } else { > rcu_read_lock(); ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 2/5] bpf: convert queue and stack map to map_copy_value 2020-03-11 14:00 ` Jakub Sitnicki @ 2020-03-11 22:31 ` John Fastabend 0 siblings, 0 replies; 14+ messages in thread From: John Fastabend @ 2020-03-11 22:31 UTC (permalink / raw) To: Jakub Sitnicki, Lorenz Bauer Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team, netdev, bpf, linux-kernel Jakub Sitnicki wrote: > On Tue, Mar 10, 2020 at 06:47 PM CET, Lorenz Bauer wrote: > > Migrate BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK to map_copy_value, > > by introducing small wrappers that discard the (unused) key argument. > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > --- > > kernel/bpf/queue_stack_maps.c | 18 ++++++++++++++++++ > > kernel/bpf/syscall.c | 5 +---- > > 2 files changed, 19 insertions(+), 4 deletions(-) > > > > diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c > > index f697647ceb54..5c89b7583cd2 100644 > > --- a/kernel/bpf/queue_stack_maps.c > > +++ b/kernel/bpf/queue_stack_maps.c > > @@ -262,11 +262,28 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, > > return -EINVAL; > > } > > > > +/* Called from syscall */ > > +static int queue_map_copy_value(struct bpf_map *map, void *key, void *value) > > +{ > > + (void)key; > > Alternatively, there's is the __always_unused compiler attribute from > include/linux/compiler_attributes.h that seems to be in wide use. > +1 use the attribute its much nicer imo. 
> > + > > + return queue_map_peek_elem(map, value); > > +} > > + > > +/* Called from syscall */ > > +static int stack_map_copy_value(struct bpf_map *map, void *key, void *value) > > +{ > > + (void)key; > > + > > + return stack_map_peek_elem(map, value); > > +} > > + > > const struct bpf_map_ops queue_map_ops = { > > .map_alloc_check = queue_stack_map_alloc_check, > > .map_alloc = queue_stack_map_alloc, > > .map_free = queue_stack_map_free, > > .map_lookup_elem = queue_stack_map_lookup_elem, > > + .map_copy_value = queue_map_copy_value, > > .map_update_elem = queue_stack_map_update_elem, > > .map_delete_elem = queue_stack_map_delete_elem, > > .map_push_elem = queue_stack_map_push_elem, > > @@ -280,6 +297,7 @@ const struct bpf_map_ops stack_map_ops = { > > .map_alloc = queue_stack_map_alloc, > > .map_free = queue_stack_map_free, > > .map_lookup_elem = queue_stack_map_lookup_elem, > > + .map_copy_value = stack_map_copy_value, > > .map_update_elem = queue_stack_map_update_elem, > > .map_delete_elem = queue_stack_map_delete_elem, > > .map_push_elem = queue_stack_map_push_elem, > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > > index 6503824e81e9..20c6cdace275 100644 > > --- a/kernel/bpf/syscall.c > > +++ b/kernel/bpf/syscall.c > > @@ -218,10 +218,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, > > return bpf_map_offload_lookup_elem(map, key, value); > > > > bpf_disable_instrumentation(); > > - if (map->map_type == BPF_MAP_TYPE_QUEUE || > > - map->map_type == BPF_MAP_TYPE_STACK) { > > - err = map->ops->map_peek_elem(map, value); > > - } else if (map->ops->map_copy_value) { > > + if (map->ops->map_copy_value) { > > err = map->ops->map_copy_value(map, key, value); > > } else { > > rcu_read_lock(); ^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 3/5] bpf: convert sock map and hash to map_copy_value [not found] <20200310174711.7490-1-lmb@cloudflare.com> 2020-03-10 17:47 ` [PATCH 1/5] bpf: add map_copy_value hook Lorenz Bauer 2020-03-10 17:47 ` [PATCH 2/5] bpf: convert queue and stack map to map_copy_value Lorenz Bauer @ 2020-03-10 17:47 ` Lorenz Bauer 2020-03-11 13:55 ` Jakub Sitnicki 2020-03-10 17:47 ` [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup Lorenz Bauer 2020-03-10 17:47 ` [PATCH 5/5] bpf: sockmap, sockhash: test looking up fds Lorenz Bauer 4 siblings, 1 reply; 14+ messages in thread From: Lorenz Bauer @ 2020-03-10 17:47 UTC (permalink / raw) To: John Fastabend, Daniel Borkmann, Jakub Sitnicki, Lorenz Bauer, David S. Miller, Jakub Kicinski, Alexei Starovoitov Cc: kernel-team, netdev, bpf, linux-kernel Migrate sockmap and sockhash to use map_copy_value instead of map_lookup_elem_sys_only. Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> --- net/core/sock_map.c | 48 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a7075b3b4489..03e04426cd21 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -344,19 +344,34 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) return __sock_map_lookup_elem(map, *(u32 *)key); } -static void *sock_map_lookup_sys(struct bpf_map *map, void *key) +static int __sock_map_copy_value(struct bpf_map *map, struct sock *sk, + void *value) +{ + switch (map->value_size) { + case sizeof(u64): + sock_gen_cookie(sk); + *(u64 *)value = atomic64_read(&sk->sk_cookie); + return 0; + + default: + return -ENOSPC; + } +} + +static int sock_map_copy_value(struct bpf_map *map, void *key, void *value) { struct sock *sk; + int ret = -ENOENT; - if (map->value_size != sizeof(u64)) - return ERR_PTR(-ENOSPC); - + rcu_read_lock(); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) - return ERR_PTR(-ENOENT); + goto out; - 
sock_gen_cookie(sk); - return &sk->sk_cookie; + ret = __sock_map_copy_value(map, sk, value); +out: + rcu_read_unlock(); + return ret; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, @@ -636,7 +651,7 @@ const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, - .map_lookup_elem_sys_only = sock_map_lookup_sys, + .map_copy_value = sock_map_copy_value, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, @@ -1030,19 +1045,20 @@ static void sock_hash_free(struct bpf_map *map) kfree(htab); } -static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) +static int sock_hash_copy_value(struct bpf_map *map, void *key, void *value) { struct sock *sk; + int ret = -ENOENT; - if (map->value_size != sizeof(u64)) - return ERR_PTR(-ENOSPC); - + rcu_read_lock(); sk = __sock_hash_lookup_elem(map, key); if (!sk) - return ERR_PTR(-ENOENT); + goto out; - sock_gen_cookie(sk); - return &sk->sk_cookie; + ret = __sock_map_copy_value(map, sk, value); +out: + rcu_read_unlock(); + return ret; } static void *sock_hash_lookup(struct bpf_map *map, void *key) @@ -1139,7 +1155,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, - .map_lookup_elem_sys_only = sock_hash_lookup_sys, + .map_copy_value = sock_hash_copy_value, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, }; -- 2.20.1 ^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 3/5] bpf: convert sock map and hash to map_copy_value 2020-03-10 17:47 ` [PATCH 3/5] bpf: convert sock map and hash " Lorenz Bauer @ 2020-03-11 13:55 ` Jakub Sitnicki 0 siblings, 0 replies; 14+ messages in thread From: Jakub Sitnicki @ 2020-03-11 13:55 UTC (permalink / raw) To: Lorenz Bauer Cc: John Fastabend, Daniel Borkmann, David S. Miller, Jakub Kicinski, Alexei Starovoitov, kernel-team, netdev, bpf, linux-kernel On Tue, Mar 10, 2020 at 06:47 PM CET, Lorenz Bauer wrote: > Migrate sockmap and sockhash to use map_copy_value instead of > map_lookup_elem_sys_only. > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > --- > net/core/sock_map.c | 48 ++++++++++++++++++++++++++++++--------------- > 1 file changed, 32 insertions(+), 16 deletions(-) > > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > index a7075b3b4489..03e04426cd21 100644 > --- a/net/core/sock_map.c > +++ b/net/core/sock_map.c > @@ -344,19 +344,34 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) > return __sock_map_lookup_elem(map, *(u32 *)key); > } > > -static void *sock_map_lookup_sys(struct bpf_map *map, void *key) > +static int __sock_map_copy_value(struct bpf_map *map, struct sock *sk, > + void *value) > +{ > + switch (map->value_size) { > + case sizeof(u64): > + sock_gen_cookie(sk); > + *(u64 *)value = atomic64_read(&sk->sk_cookie); You could use sock_gen_cookie return value to merge the two above statements into one. sock_gen_cookie also reads out the value. 
> + return 0; > + > + default: > + return -ENOSPC; > + } > +} > + > +static int sock_map_copy_value(struct bpf_map *map, void *key, void *value) > { > struct sock *sk; > + int ret = -ENOENT; > > - if (map->value_size != sizeof(u64)) > - return ERR_PTR(-ENOSPC); > - > + rcu_read_lock(); > sk = __sock_map_lookup_elem(map, *(u32 *)key); > if (!sk) > - return ERR_PTR(-ENOENT); > + goto out; > > - sock_gen_cookie(sk); > - return &sk->sk_cookie; > + ret = __sock_map_copy_value(map, sk, value); > +out: > + rcu_read_unlock(); > + return ret; > } > > static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, > @@ -636,7 +651,7 @@ const struct bpf_map_ops sock_map_ops = { > .map_alloc = sock_map_alloc, > .map_free = sock_map_free, > .map_get_next_key = sock_map_get_next_key, > - .map_lookup_elem_sys_only = sock_map_lookup_sys, > + .map_copy_value = sock_map_copy_value, > .map_update_elem = sock_map_update_elem, > .map_delete_elem = sock_map_delete_elem, > .map_lookup_elem = sock_map_lookup, > @@ -1030,19 +1045,20 @@ static void sock_hash_free(struct bpf_map *map) > kfree(htab); > } > > -static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) > +static int sock_hash_copy_value(struct bpf_map *map, void *key, void *value) > { > struct sock *sk; > + int ret = -ENOENT; > > - if (map->value_size != sizeof(u64)) > - return ERR_PTR(-ENOSPC); > - > + rcu_read_lock(); > sk = __sock_hash_lookup_elem(map, key); > if (!sk) > - return ERR_PTR(-ENOENT); > + goto out; > > - sock_gen_cookie(sk); > - return &sk->sk_cookie; > + ret = __sock_map_copy_value(map, sk, value); > +out: > + rcu_read_unlock(); > + return ret; > } > > static void *sock_hash_lookup(struct bpf_map *map, void *key) > @@ -1139,7 +1155,7 @@ const struct bpf_map_ops sock_hash_ops = { > .map_update_elem = sock_hash_update_elem, > .map_delete_elem = sock_hash_delete_elem, > .map_lookup_elem = sock_hash_lookup, > - .map_lookup_elem_sys_only = sock_hash_lookup_sys, > + .map_copy_value = 
sock_hash_copy_value, > .map_release_uref = sock_hash_release_progs, > .map_check_btf = map_check_no_btf, > }; ^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup [not found] <20200310174711.7490-1-lmb@cloudflare.com> ` (2 preceding siblings ...) 2020-03-10 17:47 ` [PATCH 3/5] bpf: convert sock map and hash " Lorenz Bauer @ 2020-03-10 17:47 ` Lorenz Bauer 2020-03-11 23:27 ` John Fastabend 2020-03-17 15:18 ` Jakub Sitnicki 2020-03-10 17:47 ` [PATCH 5/5] bpf: sockmap, sockhash: test looking up fds Lorenz Bauer 4 siblings, 2 replies; 14+ messages in thread From: Lorenz Bauer @ 2020-03-10 17:47 UTC (permalink / raw) To: John Fastabend, Daniel Borkmann, Jakub Sitnicki, Lorenz Bauer, David S. Miller, Jakub Kicinski, Alexei Starovoitov Cc: kernel-team, netdev, bpf, linux-kernel Allow callers with CAP_NET_ADMIN to retrieve file descriptors from a sockmap and sockhash. O_CLOEXEC is enforced on all fds. Without this, it's difficult to resize or otherwise rebuild existing sockmap or sockhashes. Suggested-by: Jakub Sitnicki <jakub@cloudflare.com> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> --- net/core/sock_map.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 03e04426cd21..3228936aa31e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -347,12 +347,31 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) static int __sock_map_copy_value(struct bpf_map *map, struct sock *sk, void *value) { + struct file *file; + int fd; + switch (map->value_size) { case sizeof(u64): sock_gen_cookie(sk); *(u64 *)value = atomic64_read(&sk->sk_cookie); return 0; + case sizeof(u32): + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (unlikely(fd < 0)) + return fd; + + read_lock_bh(&sk->sk_callback_lock); + file = get_file(sk->sk_socket->file); + read_unlock_bh(&sk->sk_callback_lock); + + fd_install(fd, file); + *(u32 *)value = fd; + return 0; + default: return -ENOSPC; } -- 2.20.1 ^ permalink raw reply related [flat|nested] 14+ messages 
in thread
* RE: [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup 2020-03-10 17:47 ` [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup Lorenz Bauer @ 2020-03-11 23:27 ` John Fastabend 2020-03-17 10:17 ` Lorenz Bauer 2020-03-17 15:18 ` Jakub Sitnicki 1 sibling, 1 reply; 14+ messages in thread From: John Fastabend @ 2020-03-11 23:27 UTC (permalink / raw) To: Lorenz Bauer, John Fastabend, Daniel Borkmann, Jakub Sitnicki, Lorenz Bauer, David S. Miller, Jakub Kicinski, Alexei Starovoitov Cc: kernel-team, netdev, bpf, linux-kernel Lorenz Bauer wrote: > Allow callers with CAP_NET_ADMIN to retrieve file descriptors from a > sockmap and sockhash. O_CLOEXEC is enforced on all fds. > > Without this, it's difficult to resize or otherwise rebuild existing > sockmap or sockhashes. > > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com> > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > --- > net/core/sock_map.c | 19 +++++++++++++++++++ > 1 file changed, 19 insertions(+) > > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > index 03e04426cd21..3228936aa31e 100644 > --- a/net/core/sock_map.c > +++ b/net/core/sock_map.c > @@ -347,12 +347,31 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) > static int __sock_map_copy_value(struct bpf_map *map, struct sock *sk, > void *value) > { > + struct file *file; > + int fd; > + > switch (map->value_size) { > case sizeof(u64): > sock_gen_cookie(sk); > *(u64 *)value = atomic64_read(&sk->sk_cookie); > return 0; > > + case sizeof(u32): > + if (!capable(CAP_NET_ADMIN)) > + return -EPERM; > + > + fd = get_unused_fd_flags(O_CLOEXEC); > + if (unlikely(fd < 0)) > + return fd; > + > + read_lock_bh(&sk->sk_callback_lock); > + file = get_file(sk->sk_socket->file); > + read_unlock_bh(&sk->sk_callback_lock); > + > + fd_install(fd, file); > + *(u32 *)value = fd; > + return 0; > + Hi Lorenz, Can you say something about what happens if the sk is deleted from the map or the sock is 
closed/unhashed — ideally in the commit message, so we have it for later reference. I guess because we are in an rcu block here the sk will be OK and the psock reference will exist until after the rcu block, at least because of call_rcu(). If the psock is destroyed from another path then the fd will still point at the sock. Correct? Thanks.
* Re: [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup 2020-03-11 23:27 ` John Fastabend @ 2020-03-17 10:17 ` Lorenz Bauer 0 siblings, 0 replies; 14+ messages in thread From: Lorenz Bauer @ 2020-03-17 10:17 UTC (permalink / raw) To: John Fastabend Cc: Daniel Borkmann, Jakub Sitnicki, David S. Miller, Jakub Kicinski, Alexei Starovoitov, kernel-team, Networking, bpf, linux-kernel On Wed, 11 Mar 2020 at 23:27, John Fastabend <john.fastabend@gmail.com> wrote: > > Lorenz Bauer wrote: > > Allow callers with CAP_NET_ADMIN to retrieve file descriptors from a > > sockmap and sockhash. O_CLOEXEC is enforced on all fds. > > > > Without this, it's difficult to resize or otherwise rebuild existing > > sockmap or sockhashes. > > > > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com> > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > --- > > net/core/sock_map.c | 19 +++++++++++++++++++ > > 1 file changed, 19 insertions(+) > > > > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > > index 03e04426cd21..3228936aa31e 100644 > > --- a/net/core/sock_map.c > > +++ b/net/core/sock_map.c > > @@ -347,12 +347,31 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) > > static int __sock_map_copy_value(struct bpf_map *map, struct sock *sk, > > void *value) > > { > > + struct file *file; > > + int fd; > > + > > switch (map->value_size) { > > case sizeof(u64): > > sock_gen_cookie(sk); > > *(u64 *)value = atomic64_read(&sk->sk_cookie); > > return 0; > > > > + case sizeof(u32): > > + if (!capable(CAP_NET_ADMIN)) > > + return -EPERM; > > + > > + fd = get_unused_fd_flags(O_CLOEXEC); > > + if (unlikely(fd < 0)) > > + return fd; > > + > > + read_lock_bh(&sk->sk_callback_lock); > > + file = get_file(sk->sk_socket->file); > > + read_unlock_bh(&sk->sk_callback_lock); > > + > > + fd_install(fd, file); > > + *(u32 *)value = fd; > > + return 0; > > + > > Hi Lorenz, Can you say something about what happens if the sk > is deleted from the map or the 
sock is closed/unhashed ideally > in the commit message so we have it for later reference. I guess > because we are in an rcu block here the sk will be OK and psock > reference will exist until after the rcu block at least because > of call_rcu(). If the psock is destroyed from another path then > the fd will still point at the sock. correct? This is how I understand it: * sk is protected by rcu_read_lock (as you point out) * sk->sk_callback_lock protects against sk->sk_socket being modified by sock_orphan, sock_graft, etc. via sk_set_socket * get_file increments the refcount on the file I'm not sure how the psock figures into this, maybe you can elaborate a little? -- Lorenz Bauer | Systems Engineer 6th Floor, County Hall/The Riverside Building, SE1 7PB, UK www.cloudflare.com ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup 2020-03-10 17:47 ` [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup Lorenz Bauer 2020-03-11 23:27 ` John Fastabend @ 2020-03-17 15:18 ` Jakub Sitnicki 2020-03-17 18:16 ` John Fastabend 1 sibling, 1 reply; 14+ messages in thread From: Jakub Sitnicki @ 2020-03-17 15:18 UTC (permalink / raw) To: Lorenz Bauer Cc: John Fastabend, Daniel Borkmann, David S. Miller, Jakub Kicinski, Alexei Starovoitov, kernel-team, netdev, bpf, linux-kernel On Tue, Mar 10, 2020 at 06:47 PM CET, Lorenz Bauer wrote: > Allow callers with CAP_NET_ADMIN to retrieve file descriptors from a > sockmap and sockhash. O_CLOEXEC is enforced on all fds. > > Without this, it's difficult to resize or otherwise rebuild existing > sockmap or sockhashes. > > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com> > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > --- > net/core/sock_map.c | 19 +++++++++++++++++++ > 1 file changed, 19 insertions(+) > > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > index 03e04426cd21..3228936aa31e 100644 > --- a/net/core/sock_map.c > +++ b/net/core/sock_map.c > @@ -347,12 +347,31 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) > static int __sock_map_copy_value(struct bpf_map *map, struct sock *sk, > void *value) > { > + struct file *file; > + int fd; > + > switch (map->value_size) { > case sizeof(u64): > sock_gen_cookie(sk); > *(u64 *)value = atomic64_read(&sk->sk_cookie); > return 0; > > + case sizeof(u32): > + if (!capable(CAP_NET_ADMIN)) > + return -EPERM; > + > + fd = get_unused_fd_flags(O_CLOEXEC); > + if (unlikely(fd < 0)) > + return fd; > + > + read_lock_bh(&sk->sk_callback_lock); > + file = get_file(sk->sk_socket->file); I think this deserves a second look. We don't lock the sock, so what if tcp_close orphans it before we enter this critical section? Looks like sk->sk_socket might be NULL. 
I'd find a test that tries to trigger the race helpful, like: thread A: loop in lookup FD from map thread B: loop in insert FD into map, close FD > + read_unlock_bh(&sk->sk_callback_lock); > + > + fd_install(fd, file); > + *(u32 *)value = fd; > + return 0; > + > default: > return -ENOSPC; > } ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup 2020-03-17 15:18 ` Jakub Sitnicki @ 2020-03-17 18:16 ` John Fastabend 0 siblings, 0 replies; 14+ messages in thread From: John Fastabend @ 2020-03-17 18:16 UTC (permalink / raw) To: Jakub Sitnicki, Lorenz Bauer Cc: John Fastabend, Daniel Borkmann, David S. Miller, Jakub Kicinski, Alexei Starovoitov, kernel-team, netdev, bpf, linux-kernel Jakub Sitnicki wrote: > On Tue, Mar 10, 2020 at 06:47 PM CET, Lorenz Bauer wrote: > > Allow callers with CAP_NET_ADMIN to retrieve file descriptors from a > > sockmap and sockhash. O_CLOEXEC is enforced on all fds. > > > > Without this, it's difficult to resize or otherwise rebuild existing > > sockmap or sockhashes. > > > > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com> > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > --- > > net/core/sock_map.c | 19 +++++++++++++++++++ > > 1 file changed, 19 insertions(+) > > > > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > > index 03e04426cd21..3228936aa31e 100644 > > --- a/net/core/sock_map.c > > +++ b/net/core/sock_map.c > > @@ -347,12 +347,31 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) > > static int __sock_map_copy_value(struct bpf_map *map, struct sock *sk, > > void *value) > > { > > + struct file *file; > > + int fd; > > + > > switch (map->value_size) { > > case sizeof(u64): > > sock_gen_cookie(sk); > > *(u64 *)value = atomic64_read(&sk->sk_cookie); > > return 0; > > > > + case sizeof(u32): > > + if (!capable(CAP_NET_ADMIN)) > > + return -EPERM; > > + > > + fd = get_unused_fd_flags(O_CLOEXEC); > > + if (unlikely(fd < 0)) > > + return fd; > > + > > + read_lock_bh(&sk->sk_callback_lock); > > + file = get_file(sk->sk_socket->file); > > I think this deserves a second look. > > We don't lock the sock, so what if tcp_close orphans it before we enter > this critical section? Looks like sk->sk_socket might be NULL. 
> > I'd find a test that tries to trigger the race helpful, like: > > thread A: loop in lookup FD from map > thread B: loop in insert FD into map, close FD Agreed, this was essentially my question above as well. When the psock is created we call sock_hold() and will only do a sock_put() after an rcu grace period when its removed. So at least if you have the sock here it should have a sk_refcnt. (Note the user data is set to NULL so if you do reference psock you need to check its non-null.) Is that enough to ensure sk_socket? Seems not to me, tcp_close for example will still happen and call sock_orphan(sk) based on my admittddly quick look. Further, even if you do check sk->sk_socket is non-null what does it mean to return a file with a socket that is closed, deleted from the sock_map and psock removed? At this point is it just a dangling reference? Still a bit confused as well what would or should happen when the sock is closed after you have the file reference? I could probably dig up what exactly would happen but I think we need it in the commiit message so we understand it. I also didn't dig up the details here but if the receiver of the fd crashes or otherwise disappears this hopefully all get cleaned up? > > > + read_unlock_bh(&sk->sk_callback_lock); > > + > > + fd_install(fd, file); > > + *(u32 *)value = fd; > > + return 0; > > + > > default: > > return -ENOSPC; > > } ^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 5/5] bpf: sockmap, sockhash: test looking up fds [not found] <20200310174711.7490-1-lmb@cloudflare.com> ` (3 preceding siblings ...) 2020-03-10 17:47 ` [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup Lorenz Bauer @ 2020-03-10 17:47 ` Lorenz Bauer 2020-03-11 13:52 ` Jakub Sitnicki 4 siblings, 1 reply; 14+ messages in thread From: Lorenz Bauer @ 2020-03-10 17:47 UTC (permalink / raw) To: Shuah Khan, Alexei Starovoitov, Daniel Borkmann Cc: kernel-team, Lorenz Bauer, linux-kselftest, netdev, bpf, linux-kernel Make sure that looking up an element from the map succeeds, and that the fd is valid by using it an fcntl call. Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> --- .../selftests/bpf/prog_tests/sockmap_listen.c | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 52aa468bdccd..929e1e77ecc6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -453,7 +453,7 @@ static void test_lookup_after_delete(int family, int sotype, int mapfd) xclose(s); } -static void test_lookup_32_bit_value(int family, int sotype, int mapfd) +static void test_lookup_fd(int family, int sotype, int mapfd) { u32 key, value32; int err, s; @@ -466,7 +466,7 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd) sizeof(value32), 1, 0); if (mapfd < 0) { FAIL_ERRNO("map_create"); - goto close; + goto close_sock; } key = 0; @@ -475,11 +475,25 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd) errno = 0; err = bpf_map_lookup_elem(mapfd, &key, &value32); - if (!err || errno != ENOSPC) - FAIL_ERRNO("map_lookup: expected ENOSPC"); + if (err) { + FAIL_ERRNO("map_lookup"); + goto close_map; + } + if ((int)value32 == s) { + FAIL("return value is identical"); + goto close; + } + + err = 
fcntl(value32, F_GETFD); + if (err == -1) + FAIL_ERRNO("fcntl"); + +close: + xclose(value32); +close_map: xclose(mapfd); -close: +close_sock: xclose(s); } @@ -1456,7 +1470,7 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, /* lookup */ TEST(test_lookup_after_insert), TEST(test_lookup_after_delete), - TEST(test_lookup_32_bit_value), + TEST(test_lookup_fd), /* update */ TEST(test_update_existing), /* races with insert/delete */ -- 2.20.1 ^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 5/5] bpf: sockmap, sockhash: test looking up fds 2020-03-10 17:47 ` [PATCH 5/5] bpf: sockmap, sockhash: test looking up fds Lorenz Bauer @ 2020-03-11 13:52 ` Jakub Sitnicki 2020-03-11 17:24 ` Lorenz Bauer 0 siblings, 1 reply; 14+ messages in thread From: Jakub Sitnicki @ 2020-03-11 13:52 UTC (permalink / raw) To: Lorenz Bauer Cc: Shuah Khan, Alexei Starovoitov, Daniel Borkmann, kernel-team, linux-kselftest, netdev, bpf, linux-kernel On Tue, Mar 10, 2020 at 06:47 PM CET, Lorenz Bauer wrote: > Make sure that looking up an element from the map succeeds, > and that the fd is valid by using it an fcntl call. > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > --- > .../selftests/bpf/prog_tests/sockmap_listen.c | 26 ++++++++++++++----- > 1 file changed, 20 insertions(+), 6 deletions(-) > > diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > index 52aa468bdccd..929e1e77ecc6 100644 > --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > @@ -453,7 +453,7 @@ static void test_lookup_after_delete(int family, int sotype, int mapfd) > xclose(s); > } > > -static void test_lookup_32_bit_value(int family, int sotype, int mapfd) > +static void test_lookup_fd(int family, int sotype, int mapfd) > { > u32 key, value32; > int err, s; > @@ -466,7 +466,7 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd) > sizeof(value32), 1, 0); > if (mapfd < 0) { > FAIL_ERRNO("map_create"); > - goto close; > + goto close_sock; > } > > key = 0; > @@ -475,11 +475,25 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd) > > errno = 0; > err = bpf_map_lookup_elem(mapfd, &key, &value32); > - if (!err || errno != ENOSPC) > - FAIL_ERRNO("map_lookup: expected ENOSPC"); > + if (err) { > + FAIL_ERRNO("map_lookup"); > + goto close_map; > + } > > + if ((int)value32 == s) { > + FAIL("return value is 
identical"); > + goto close; > + } > + > + err = fcntl(value32, F_GETFD); > + if (err == -1) > + FAIL_ERRNO("fcntl"); I would call getsockopt()/getsockname() to assert that the FD lookup succeeded. We want to know not only that it's an FD (-EBADFD case), but also that it's associated with a socket (-ENOTSOCK). We can go even further, and compare socket cookies to ensure we got an FD for the expected socket. Also, I'm wondering if we could keep the -ENOSPC case test-covered by temporarily dropping NET_ADMIN capability. > + > +close: > + xclose(value32); > +close_map: > xclose(mapfd); > -close: > +close_sock: > xclose(s); > } > > @@ -1456,7 +1470,7 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, > /* lookup */ > TEST(test_lookup_after_insert), > TEST(test_lookup_after_delete), > - TEST(test_lookup_32_bit_value), > + TEST(test_lookup_fd), > /* update */ > TEST(test_update_existing), > /* races with insert/delete */ ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 5/5] bpf: sockmap, sockhash: test looking up fds 2020-03-11 13:52 ` Jakub Sitnicki @ 2020-03-11 17:24 ` Lorenz Bauer 0 siblings, 0 replies; 14+ messages in thread From: Lorenz Bauer @ 2020-03-11 17:24 UTC (permalink / raw) To: Jakub Sitnicki Cc: Shuah Khan, Alexei Starovoitov, Daniel Borkmann, kernel-team, linux-kselftest, Networking, bpf, linux-kernel On Wed, 11 Mar 2020 at 13:52, Jakub Sitnicki <jakub@cloudflare.com> wrote: > > On Tue, Mar 10, 2020 at 06:47 PM CET, Lorenz Bauer wrote: > > Make sure that looking up an element from the map succeeds, > > and that the fd is valid by using it an fcntl call. > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > --- > > .../selftests/bpf/prog_tests/sockmap_listen.c | 26 ++++++++++++++----- > > 1 file changed, 20 insertions(+), 6 deletions(-) > > > > diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > > index 52aa468bdccd..929e1e77ecc6 100644 > > --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > > +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > > @@ -453,7 +453,7 @@ static void test_lookup_after_delete(int family, int sotype, int mapfd) > > xclose(s); > > } > > > > -static void test_lookup_32_bit_value(int family, int sotype, int mapfd) > > +static void test_lookup_fd(int family, int sotype, int mapfd) > > { > > u32 key, value32; > > int err, s; > > @@ -466,7 +466,7 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd) > > sizeof(value32), 1, 0); > > if (mapfd < 0) { > > FAIL_ERRNO("map_create"); > > - goto close; > > + goto close_sock; > > } > > > > key = 0; > > @@ -475,11 +475,25 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd) > > > > errno = 0; > > err = bpf_map_lookup_elem(mapfd, &key, &value32); > > - if (!err || errno != ENOSPC) > > - FAIL_ERRNO("map_lookup: expected ENOSPC"); > > + if (err) { > > + FAIL_ERRNO("map_lookup"); > > + goto 
close_map; > > + } > > > > + if ((int)value32 == s) { > > + FAIL("return value is identical"); > > + goto close; > > + } > > + > > + err = fcntl(value32, F_GETFD); > > + if (err == -1) > > + FAIL_ERRNO("fcntl"); > > I would call getsockopt()/getsockname() to assert that the FD lookup > succeeded. We want to know not only that it's an FD (-EBADFD case), but > also that it's associated with a socket (-ENOTSOCK). > > We can go even further, and compare socket cookies to ensure we got an > FD for the expected socket. Good idea, thanks! > Also, I'm wondering if we could keep the -ENOSPC case test-covered by > temporarily dropping NET_ADMIN capability. You mean EPERM? ENOSPC isn't reachable, since the map can only be created with a map_value of 4 or 8. > > > + > > +close: > > + xclose(value32); > > +close_map: > > xclose(mapfd); > > -close: > > +close_sock: > > xclose(s); > > } > > > > @@ -1456,7 +1470,7 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, > > /* lookup */ > > TEST(test_lookup_after_insert), > > TEST(test_lookup_after_delete), > > - TEST(test_lookup_32_bit_value), > > + TEST(test_lookup_fd), > > /* update */ > > TEST(test_update_existing), > > /* races with insert/delete */ -- Lorenz Bauer | Systems Engineer 6th Floor, County Hall/The Riverside Building, SE1 7PB, UK www.cloudflare.com ^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2020-03-17 18:16 UTC | newest] Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <20200310174711.7490-1-lmb@cloudflare.com> 2020-03-10 17:47 ` [PATCH 1/5] bpf: add map_copy_value hook Lorenz Bauer 2020-03-10 17:47 ` [PATCH 2/5] bpf: convert queue and stack map to map_copy_value Lorenz Bauer 2020-03-11 14:00 ` Jakub Sitnicki 2020-03-11 22:31 ` John Fastabend 2020-03-10 17:47 ` [PATCH 3/5] bpf: convert sock map and hash " Lorenz Bauer 2020-03-11 13:55 ` Jakub Sitnicki 2020-03-10 17:47 ` [PATCH 4/5] bpf: sockmap, sockhash: return file descriptors from privileged lookup Lorenz Bauer 2020-03-11 23:27 ` John Fastabend 2020-03-17 10:17 ` Lorenz Bauer 2020-03-17 15:18 ` Jakub Sitnicki 2020-03-17 18:16 ` John Fastabend 2020-03-10 17:47 ` [PATCH 5/5] bpf: sockmap, sockhash: test looking up fds Lorenz Bauer 2020-03-11 13:52 ` Jakub Sitnicki 2020-03-11 17:24 ` Lorenz Bauer
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).