From: Mauricio Vasquez
Subject: Re: [PATCH bpf-next 1/3] bpf: add bpf queue map
Date: Wed, 8 Aug 2018 21:50:05 -0500
Message-ID: <04feef42-9183-1f95-cdd6-b53623c968f2@polito.it>
References: <153356387977.6981.12236150594041620482.stgit@kernel> <153356390770.6981.4228793745105954649.stgit@kernel>
To: Daniel Borkmann, Alexei Starovoitov
Cc: netdev@vger.kernel.org

On 08/07/2018 08:40 AM, Daniel Borkmann wrote:
> On 08/06/2018 03:58 PM, Mauricio Vasquez B wrote:
>> Bpf queue implements a LIFO/FIFO data container for ebpf programs.
>>
>> It allows pushing an element to the queue by using the update operation
>> and popping an element from the queue by using the lookup operation.
>>
>> A use case for this is to keep track of a pool of elements, like
>> network ports in a SNAT.
>>
>> Signed-off-by: Mauricio Vasquez B
>> ---
>> include/linux/bpf_types.h | 1
>> include/uapi/linux/bpf.h | 5 +
>> kernel/bpf/Makefile | 2
>> kernel/bpf/queuemap.c | 287 +++++++++++++++++++++++++++++++++++++++++++++
>> kernel/bpf/syscall.c | 61 +++++++---
>> kernel/bpf/verifier.c | 16 ++-
>> 6 files changed, 353 insertions(+), 19 deletions(-)
>> create mode 100644 kernel/bpf/queuemap.c
>>
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index c5700c2d5549..6c7a62f3fe43 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
>> BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
>> #endif
>> #endif
>> +BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 0ebaaf7f3568..2c171c40eb45 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -120,6 +120,7 @@ enum bpf_map_type {
>> BPF_MAP_TYPE_CPUMAP,
>> BPF_MAP_TYPE_XSKMAP,
>> BPF_MAP_TYPE_SOCKHASH,
>> + BPF_MAP_TYPE_QUEUE,
>> };
>>
>> enum bpf_prog_type {
>> @@ -255,6 +256,10 @@ enum bpf_attach_type {
>> /* Flag for stack_map, store build_id+offset instead of pointer */
>> #define BPF_F_STACK_BUILD_ID (1U << 5)
>>
>> +/* Flags for queue_map, type of queue */
>> +#define BPF_F_QUEUE_FIFO (1U << 16)
>> +#define BPF_F_QUEUE_LIFO (2U << 16)
>> +
>> enum bpf_stack_build_id_status {
>> /* user space need an empty entry to identify end of a trace */
>> BPF_STACK_BUILD_ID_EMPTY = 0,
>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>> index f27f5496d6fe..30f02ef66635 100644
>> --- a/kernel/bpf/Makefile
>> +++ b/kernel/bpf/Makefile
>> @@ -2,7 +2,7 @@
>> obj-y := core.o
>>
>> obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
>> -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>> +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
>> obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>> obj-$(CONFIG_BPF_SYSCALL) += btf.o
>> ifeq ($(CONFIG_NET),y)
>> diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
>> new file mode 100644
>> index 000000000000..ab30af43b4cc
>> --- /dev/null
>> +++ b/kernel/bpf/queuemap.c
>> @@ -0,0 +1,287 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * queuemap.c: BPF queue map
>> + *
>> + * Copyright (c) 2018 Politecnico di Torino
>> + */
>> +#include
>> +#include
>> +#include
>> +#include "percpu_freelist.h"
>> +
>> +#define QUEUE_CREATE_FLAG_MASK \
>> + (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
>> + BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
>> +
>> +enum queue_type {
>> + QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
>> + QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
>> +};
>> +
>> +struct bpf_queue {
>> + struct bpf_map map;
>> + struct list_head head;
>> + struct pcpu_freelist freelist;
>> + void *nodes;
>> + enum queue_type type;
>> + raw_spinlock_t lock;
>> + atomic_t count;
>> + u32 node_size;
>> +};
>> +
>> +struct queue_node {
>> + struct pcpu_freelist_node fnode;
>> + struct bpf_queue *queue;
>> + struct list_head list;
>> + struct rcu_head rcu;
>> + char element[0] __aligned(8);
>> +};
>> +
>> +static bool queue_map_is_prealloc(struct bpf_queue *queue)
>> +{
>> + return !(queue->map.map_flags & BPF_F_NO_PREALLOC);
>> +}
>> +
>> +/* Called from syscall */
>> +static int queue_map_alloc_check(union bpf_attr *attr)
>> +{
>> + /* check sanity of attributes */
>> + if (attr->max_entries == 0 || attr->key_size != 0 ||
>> + attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
>> + return -EINVAL;
>> +
>> + if ((attr->map_flags >> 16) != QUEUE_FIFO &&
>> + (attr->map_flags >> 16) != QUEUE_LIFO) {
>> + return -EINVAL;
>> + }
>> +
>> + if (attr->value_size > KMALLOC_MAX_SIZE)
>> + /* if value_size is bigger, the user space won't be able to
>> + * access the elements.
>> + */
>> + return -E2BIG;
>> +
>> + return 0;
>> +}
>> +
>> +static int prealloc_init(struct bpf_queue *queue)
>> +{
>> + u32 node_size = sizeof(struct queue_node) +
>> + round_up(queue->map.value_size, 8);
>> + u32 num_entries = queue->map.max_entries;
>> + int err;
>> +
>> + queue->nodes = bpf_map_area_alloc(node_size * num_entries,
>> + queue->map.numa_node);
>> + if (!queue->nodes)
>> + return -ENOMEM;
>> +
>> + err = pcpu_freelist_init(&queue->freelist);
>> + if (err)
>> + goto free_nodes;
>> +
>> + pcpu_freelist_populate(&queue->freelist,
>> + queue->nodes +
>> + offsetof(struct queue_node, fnode),
>> + node_size, num_entries);
>> +
>> + return 0;
>> +
>> +free_nodes:
>> + bpf_map_area_free(queue->nodes);
>> + return err;
>> +}
>> +
>> +static void prealloc_destroy(struct bpf_queue *queue)
>> +{
>> + bpf_map_area_free(queue->nodes);
>> + pcpu_freelist_destroy(&queue->freelist);
>> +}
>> +
>> +static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
>> +{
>> + struct bpf_queue *queue;
>> + u64 cost = sizeof(*queue);
>> + int ret;
>> +
>> + queue = kzalloc(sizeof(*queue), GFP_USER);
>> + if (!queue)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + bpf_map_init_from_attr(&queue->map, attr);
>> +
>> + queue->node_size = sizeof(struct queue_node) +
>> + round_up(attr->value_size, 8);
>> + cost += (u64) attr->max_entries * queue->node_size;
>> + if (cost >= U32_MAX - PAGE_SIZE) {
>> + ret = -E2BIG;
>> + goto free_queue;
>> + }
>> +
>> + queue->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>> +
>> + ret = bpf_map_precharge_memlock(queue->map.pages);
>> + if (ret)
>> + goto free_queue;
>> +
>> + INIT_LIST_HEAD(&queue->head);
>> +
>> + raw_spin_lock_init(&queue->lock);
>> +
>> + queue->type = attr->map_flags >> 16;
>> +
>> + if (queue_map_is_prealloc(queue))
>> + ret = prealloc_init(queue);
>> + if (ret)
>> + goto free_queue;
>> +
>> + return &queue->map;
>> +
>> +free_queue:
>> + kfree(queue);
>> + return ERR_PTR(ret);
>> +}
>> +
>> +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
>> +static void queue_map_free(struct bpf_map *map)
>> +{
>> + struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
>> + struct queue_node *l;
>> +
>> + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
>> + * so the programs (can be more than one that used this map) were
>> + * disconnected from events. Wait for outstanding critical sections in
>> + * these programs to complete
>> + */
>> + synchronize_rcu();
>> +
>> + /* some of queue_elem_free_rcu() callbacks for elements of this map may
>> + * not have executed. Wait for them.
>> + */
>> + rcu_barrier();
>> + if (!queue_map_is_prealloc(queue))
>> + list_for_each_entry_rcu(l, &queue->head, list) {
>> + list_del_rcu(&l->list);
>> + kfree(l);
>> + }
>> + else
>> + prealloc_destroy(queue);
>> + kfree(queue);
>> +}
>> +
>> +static void queue_elem_free_rcu(struct rcu_head *head)
>> +{
>> + struct queue_node *l = container_of(head, struct queue_node, rcu);
>> + struct bpf_queue *queue = l->queue;
>> +
>> + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
>> + * we're calling kfree, otherwise deadlock is possible if kprobes
>> + * are placed somewhere inside of slub
>> + */
>> + preempt_disable();
>> + __this_cpu_inc(bpf_prog_active);
>> + if (queue_map_is_prealloc(queue))
>> + pcpu_freelist_push(&queue->freelist, &l->fnode);
>> + else
>> + kfree(l);
>> + __this_cpu_dec(bpf_prog_active);
>> + preempt_enable();
>> +}
>> +
>> +/* Called from syscall or from eBPF program */
>> +static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
>> +{
>> + struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
>> + unsigned long flags;
>> + struct queue_node *node;
>> +
>> + raw_spin_lock_irqsave(&queue->lock, flags);
> I think a per-cpu flavor would be very useful here as well in this map
> type such that we wouldn't need a lock here but only guarantee that
> preemption is disabled, and it could be used as temp store for the program
> for example.

I agree. I'd focus on the basic implementation for now; once it is ready we
can go ahead with a per-cpu version.

>
>> + node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
>> + if (!node) {
>> + raw_spin_unlock_irqrestore(&queue->lock, flags);
>> + return NULL;
>> + }
>> +
>> + if (!queue_map_is_prealloc(queue))
>> + atomic_dec(&queue->count);
>> +
>> + list_del_rcu(&node->list);
>> + call_rcu(&node->rcu, queue_elem_free_rcu);
>> +
>> + raw_spin_unlock_irqrestore(&queue->lock, flags);
>> +
>> + return &node->element;
>> +}
>> +
>> +/* Called from syscall or from eBPF program */
>> +static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
>> + u64 map_flags)
>> +{
>> + struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
>> + unsigned long flags;
>> + struct queue_node *new;
> Should reject invalid map update flags here.

Will do in the new push helper proposed by Alexei.
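To make it concrete, this is roughly the shape I have in mind for the push
side; the helper name, the proto and the accepted flags below are only
tentative placeholders (nothing of this is part of this patch), it is just
meant to show where the flag check would live:

/* Tentative sketch only: a dedicated push helper that validates its flags
 * argument before handing the value to the map. Name, proto and the
 * accepted flags (here only BPF_EXIST, as an example) are placeholders.
 */
BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value,
           u64, flags)
{
        if (flags & ~BPF_EXIST)
                return -EINVAL;

        return map->ops->map_update_elem(map, NULL, value, flags);
}

const struct bpf_func_proto bpf_map_push_elem_proto = {
        .func           = bpf_map_push_elem,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_PTR_TO_MAP_VALUE,
        .arg3_type      = ARG_ANYTHING,
};

A matching pop helper would take only the map pointer and copy the popped
value into a buffer provided by the program, so the update/lookup
overloading used in this patch would eventually go away.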
>
>> + if (!queue_map_is_prealloc(queue)) {
>> + if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
>> + atomic_dec(&queue->count);
>> + return -E2BIG;
>> + }
>> +
>> + new = kmalloc_node(queue->node_size, GFP_ATOMIC | __GFP_NOWARN,
>> + queue->map.numa_node);
>> + if (!new) {
>> + atomic_dec(&queue->count);
>> + return -ENOMEM;
>> + }
>> + } else {
>> + struct pcpu_freelist_node *l;
>> +
>> + l = pcpu_freelist_pop(&queue->freelist);
>> + if (!l)
>> + return -E2BIG;
>> + new = container_of(l, struct queue_node, fnode);
>> + }
>> +
>> + memcpy(new->element, value, queue->map.value_size);
>> + new->queue = queue;
>> +
>> + raw_spin_lock_irqsave(&queue->lock, flags);
>> + switch (queue->type) {
>> + case QUEUE_FIFO:
>> + list_add_tail_rcu(&new->list, &queue->head);
>> + break;
>> +
>> + case QUEUE_LIFO:
>> + list_add_rcu(&new->list, &queue->head);
>> + break;
>> + }
>> +
>> + raw_spin_unlock_irqrestore(&queue->lock, flags);
>> +
>> + return 0;
>> +}
>> +
>> +/* Called from syscall or from eBPF program */
>> +static int queue_map_delete_elem(struct bpf_map *map, void *key)
>> +{
>> + return -EINVAL;
>> +}
>> +
>> +/* Called from syscall */
>> +static int queue_map_get_next_key(struct bpf_map *map, void *key,
>> + void *next_key)
>> +{
>> + return -EINVAL;
>> +}
>> +
>> +const struct bpf_map_ops queue_map_ops = {
>> + .map_alloc_check = queue_map_alloc_check,
>> + .map_alloc = queue_map_alloc,
>> + .map_free = queue_map_free,
>> + .map_lookup_elem = queue_map_lookup_elem,
>> + .map_update_elem = queue_map_update_elem,
>> + .map_delete_elem = queue_map_delete_elem,
>> + .map_get_next_key = queue_map_get_next_key,
>> +};
>> +
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index a31a1ba0f8ea..7e9a11d69eef 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
>> err = -EPERM;
>> goto err_put;
>> }
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + if (ukey) {
>> + err = -EINVAL;
>> + goto err_put;
>> + }
> Given you have a fixed key_size of 0, this could probably be made more
> generic and refactored a bit into a helper to check for 0 map->key_size
> instead of map type.

With a new set of helpers, the following changes are not needed anymore.

>
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> - goto err_put;
>> + key = NULL;
>> }
>>
>> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
>> @@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
>> goto err_put;
>> }
>>
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> - goto err_put;
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + if (ukey) {
>> + err = -EINVAL;
>> + goto err_put;
>> + }
>> +
>> + key = NULL;
>> }
> Ditto, and also further below as well for the other syscall cases.
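Agreed that a helper reads better. In case we end up keeping the
syscall-side changes, I understand the suggestion as something like the
sketch below (untested, and the helper name is made up), shared by the
lookup/update/delete paths:

/* Untested sketch of the refactoring suggested above: copy the key from
 * user space only when the map actually has a key (key_size != 0), and
 * reject a non-NULL user pointer for keyless maps. Helper name is made up.
 */
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
        if (key_size)
                return memdup_user(ukey, key_size);

        if (ukey)
                return ERR_PTR(-EINVAL);

        return NULL;
}

Then map_lookup_elem(), map_update_elem() and map_delete_elem() would just
do key = __bpf_copy_key(ukey, map->key_size) and check IS_ERR(key), instead
of testing the map type at every call site.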
>
>>
>> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
>> @@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
>> goto err_put;
>> }
>>
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> - goto err_put;
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + if (ukey) {
>> + err = -EINVAL;
>> + goto err_put;
>> + }
>> +
>> + key = NULL;
>> }
>>
>> if (bpf_map_is_dev_bound(map)) {
>> @@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
>> }
>>
>> if (ukey) {
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + err = -EINVAL;
>> goto err_put;
>> }
>> } else {
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 25e47c195874..83099a9a21d9 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -1976,8 +1976,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>> return -EACCES;
>> }
>>
>> - if (arg_type == ARG_PTR_TO_MAP_KEY ||
>> - arg_type == ARG_PTR_TO_MAP_VALUE) {
>> + if (arg_type == ARG_PTR_TO_MAP_KEY) {
>> + expected_type = PTR_TO_STACK;
>> + if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
>> + type != expected_type && type != SCALAR_VALUE)
>> + goto err_type;
>> + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>> expected_type = PTR_TO_STACK;
>> if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
>> type != expected_type)
>> @@ -2021,6 +2025,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>> /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
>> meta->map_ptr = reg->map_ptr;
>> } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
>> + bool zero_size_allowed = false;
>> /* bpf_map_xxx(..., map_ptr, ..., key) call:
>> * check that [key, key + map->key_size) are within
>> * stack limits and initialized
>> @@ -2034,8 +2039,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>> verbose(env, "invalid map_ptr to access map->key\n");
>> return -EACCES;
>> }
>> +
>> + if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)
> Here as well.
>
>> + zero_size_allowed = true;
> Also, verifier should rather enforce a const NULL key here than allowing one to
> be set (but not as the only 'valid' input option), meaning, I don't think it would
> be good to allow a 'zero' sized pointer to stack in this case.
>
>> err = check_helper_mem_access(env, regno,
>> - meta->map_ptr->key_size, false,
>> + meta->map_ptr->key_size,
>> + zero_size_allowed,
>> + NULL);
>> } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>> /* bpf_map_xxx(..., map_ptr, ..., value) call:
>>
>
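Regarding the NULL key, to make the intended usage concrete: below is more
or less how I expect a program to drive the map once the helper/verifier
question is settled. It is only a simplified sketch written against the
selftests conventions (bpf_helpers.h, struct bpf_map_def); the map name,
sizes and the SNAT scenario are just an example, it is not code from this
series:

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Sketch of the intended usage with this patch: key_size is 0, update()
 * pushes, lookup() pops, and the key argument is always NULL.
 */
struct bpf_map_def SEC("maps") port_pool = {
        .type           = BPF_MAP_TYPE_QUEUE,
        .key_size       = 0,
        .value_size     = sizeof(__u16),
        .max_entries    = 1024,
        .map_flags      = BPF_F_QUEUE_FIFO,
};

SEC("xdp")
int snat_example(struct xdp_md *ctx)
{
        __u16 *port;

        /* pop a free port from the pool (lookup == pop) */
        port = bpf_map_lookup_elem(&port_pool, NULL);
        if (!port)
                return XDP_DROP; /* pool exhausted */

        /* ... use *port for the translation, then give it back ... */
        bpf_map_update_elem(&port_pool, NULL, port, 0); /* update == push */

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";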