From: Mauricio Vasquez
Subject: Re: [PATCH bpf-next 1/3] bpf: add bpf queue map
Date: Wed, 8 Aug 2018 21:50:05 -0500
Message-ID: <04feef42-9183-1f95-cdd6-b53623c968f2@polito.it>
References: <153356387977.6981.12236150594041620482.stgit@kernel> <153356390770.6981.4228793745105954649.stgit@kernel>
To: Daniel Borkmann, Alexei Starovoitov
Cc: netdev@vger.kernel.org

On 08/07/2018 08:40 AM, Daniel Borkmann wrote:
> On 08/06/2018 03:58 PM, Mauricio Vasquez B wrote:
>> Bpf queue implements a LIFO/FIFO data container for ebpf programs.
>>
>> It allows pushing an element to the queue by using the update operation
>> and popping an element from the queue by using the lookup operation.
>>
>> A use case for this is to keep track of a pool of elements, like
>> network ports in a SNAT.
>>
>> Signed-off-by: Mauricio Vasquez B
>> ---
>> include/linux/bpf_types.h | 1
>> include/uapi/linux/bpf.h | 5 +
>> kernel/bpf/Makefile | 2
>> kernel/bpf/queuemap.c | 287 +++++++++++++++++++++++++++++++++++++++++++++
>> kernel/bpf/syscall.c | 61 +++++++---
>> kernel/bpf/verifier.c | 16 ++-
>> 6 files changed, 353 insertions(+), 19 deletions(-)
>> create mode 100644 kernel/bpf/queuemap.c
>>
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index c5700c2d5549..6c7a62f3fe43 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
>> BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
>> #endif
>> #endif
>> +BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 0ebaaf7f3568..2c171c40eb45 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -120,6 +120,7 @@ enum bpf_map_type {
>> BPF_MAP_TYPE_CPUMAP,
>> BPF_MAP_TYPE_XSKMAP,
>> BPF_MAP_TYPE_SOCKHASH,
>> + BPF_MAP_TYPE_QUEUE,
>> };
>>
>> enum bpf_prog_type {
>> @@ -255,6 +256,10 @@ enum bpf_attach_type {
>> /* Flag for stack_map, store build_id+offset instead of pointer */
>> #define BPF_F_STACK_BUILD_ID (1U << 5)
>>
>> +/* Flags for queue_map, type of queue */
>> +#define BPF_F_QUEUE_FIFO (1U << 16)
>> +#define BPF_F_QUEUE_LIFO (2U << 16)
>> +
>> enum bpf_stack_build_id_status {
>> /* user space need an empty entry to identify end of a trace */
>> BPF_STACK_BUILD_ID_EMPTY = 0,
>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>> index f27f5496d6fe..30f02ef66635 100644
>> --- a/kernel/bpf/Makefile
>> +++ b/kernel/bpf/Makefile
>> @@ -2,7 +2,7 @@
>> obj-y := core.o
>>
>> obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
>> -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>> +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
>> obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>> obj-$(CONFIG_BPF_SYSCALL) += btf.o
>> ifeq ($(CONFIG_NET),y)
>> diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
>> new file mode 100644
>> index 000000000000..ab30af43b4cc
>> --- /dev/null
>> +++ b/kernel/bpf/queuemap.c
>> @@ -0,0 +1,287 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * queuemap.c: BPF queue map
>> + *
>> + * Copyright (c) 2018 Politecnico di Torino
>> + */
>> +#include
>> +#include
>> +#include
>> +#include "percpu_freelist.h"
>> +
>> +#define QUEUE_CREATE_FLAG_MASK \
>> + (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
>> + BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
>> +
>> +enum queue_type {
>> + QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
>> + QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
>> +};
>> +
>> +struct bpf_queue {
>> + struct bpf_map map;
>> + struct list_head head;
>> + struct pcpu_freelist freelist;
>> + void *nodes;
>> + enum queue_type type;
>> + raw_spinlock_t lock;
>> + atomic_t count;
>> + u32 node_size;
>> +};
>> +
>> +struct queue_node {
>> + struct pcpu_freelist_node fnode;
>> + struct bpf_queue *queue;
>> + struct list_head list;
>> + struct rcu_head rcu;
>> + char element[0] __aligned(8);
>> +};
>> +
>> +static bool queue_map_is_prealloc(struct bpf_queue *queue)
>> +{
>> + return !(queue->map.map_flags & BPF_F_NO_PREALLOC);
>> +}
>> +
>> +/* Called from syscall */
>> +static int queue_map_alloc_check(union bpf_attr *attr)
>> +{
>> + /* check sanity of attributes */
>> + if (attr->max_entries == 0 || attr->key_size != 0 ||
>> + attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
>> + return -EINVAL;
>> +
>> + if ((attr->map_flags >> 16) != QUEUE_FIFO &&
>> + (attr->map_flags >> 16) != QUEUE_LIFO) {
>> + return -EINVAL;
>> + }
>> +
>> + if (attr->value_size > KMALLOC_MAX_SIZE)
>> + /* if value_size is bigger, the user space won't be able to
>> + * access the elements.
>> + */
>> + return -E2BIG;
>> +
>> + return 0;
>> +}
>> +
>> +static int prealloc_init(struct bpf_queue *queue)
>> +{
>> + u32 node_size = sizeof(struct queue_node) +
>> + round_up(queue->map.value_size, 8);
>> + u32 num_entries = queue->map.max_entries;
>> + int err;
>> +
>> + queue->nodes = bpf_map_area_alloc(node_size * num_entries,
>> + queue->map.numa_node);
>> + if (!queue->nodes)
>> + return -ENOMEM;
>> +
>> + err = pcpu_freelist_init(&queue->freelist);
>> + if (err)
>> + goto free_nodes;
>> +
>> + pcpu_freelist_populate(&queue->freelist,
>> + queue->nodes +
>> + offsetof(struct queue_node, fnode),
>> + node_size, num_entries);
>> +
>> + return 0;
>> +
>> +free_nodes:
>> + bpf_map_area_free(queue->nodes);
>> + return err;
>> +}
>> +
>> +static void prealloc_destroy(struct bpf_queue *queue)
>> +{
>> + bpf_map_area_free(queue->nodes);
>> + pcpu_freelist_destroy(&queue->freelist);
>> +}
>> +
>> +static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
>> +{
>> + struct bpf_queue *queue;
>> + u64 cost = sizeof(*queue);
>> + int ret;
>> +
>> + queue = kzalloc(sizeof(*queue), GFP_USER);
>> + if (!queue)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + bpf_map_init_from_attr(&queue->map, attr);
>> +
>> + queue->node_size = sizeof(struct queue_node) +
>> + round_up(attr->value_size, 8);
>> + cost += (u64) attr->max_entries * queue->node_size;
>> + if (cost >= U32_MAX - PAGE_SIZE) {
>> + ret = -E2BIG;
>> + goto free_queue;
>> + }
>> +
>> + queue->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>> +
>> + ret = bpf_map_precharge_memlock(queue->map.pages);
>> + if (ret)
>> + goto free_queue;
>> +
>> + INIT_LIST_HEAD(&queue->head);
>> +
>> + raw_spin_lock_init(&queue->lock);
>> +
>> + queue->type = attr->map_flags >> 16;
>> +
>> + if (queue_map_is_prealloc(queue))
>> + ret = prealloc_init(queue);
>> + if (ret)
>> + goto free_queue;
>> +
>> + return &queue->map;
>> +
>> +free_queue:
>> + kfree(queue);
>> + return ERR_PTR(ret);
>> +}
>> +
>> +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
>> +static void queue_map_free(struct bpf_map *map)
>> +{
>> + struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
>> + struct queue_node *l;
>> +
>> + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
>> + * so the programs (can be more than one that used this map) were
>> + * disconnected from events. Wait for outstanding critical sections in
>> + * these programs to complete
>> + */
>> + synchronize_rcu();
>> +
>> + /* some of queue_elem_free_rcu() callbacks for elements of this map may
>> + * not have executed. Wait for them.
>> + */
>> + rcu_barrier();
>> + if (!queue_map_is_prealloc(queue))
>> + list_for_each_entry_rcu(l, &queue->head, list) {
>> + list_del_rcu(&l->list);
>> + kfree(l);
>> + }
>> + else
>> + prealloc_destroy(queue);
>> + kfree(queue);
>> +}
>> +
>> +static void queue_elem_free_rcu(struct rcu_head *head)
>> +{
>> + struct queue_node *l = container_of(head, struct queue_node, rcu);
>> + struct bpf_queue *queue = l->queue;
>> +
>> + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
>> + * we're calling kfree, otherwise deadlock is possible if kprobes
>> + * are placed somewhere inside of slub
>> + */
>> + preempt_disable();
>> + __this_cpu_inc(bpf_prog_active);
>> + if (queue_map_is_prealloc(queue))
>> + pcpu_freelist_push(&queue->freelist, &l->fnode);
>> + else
>> + kfree(l);
>> + __this_cpu_dec(bpf_prog_active);
>> + preempt_enable();
>> +}
>> +
>> +/* Called from syscall or from eBPF program */
>> +static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
>> +{
>> + struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
>> + unsigned long flags;
>> + struct queue_node *node;
>> +
>> + raw_spin_lock_irqsave(&queue->lock, flags);
> I think a per-cpu flavor would be very useful here as well in this map
> type such that we wouldn't need a lock here but only guarantee that
> preemption is disabled, and it could be used as temp store for the program
> for example.

I agree. I'd focus on the basic implementation for now; once it is ready we
can go ahead with a per-cpu version.

>
>> + node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
>> + if (!node) {
>> + raw_spin_unlock_irqrestore(&queue->lock, flags);
>> + return NULL;
>> + }
>> +
>> + if (!queue_map_is_prealloc(queue))
>> + atomic_dec(&queue->count);
>> +
>> + list_del_rcu(&node->list);
>> + call_rcu(&node->rcu, queue_elem_free_rcu);
>> +
>> + raw_spin_unlock_irqrestore(&queue->lock, flags);
>> +
>> + return &node->element;
>> +}
>> +
>> +/* Called from syscall or from eBPF program */
>> +static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
>> + u64 map_flags)
>> +{
>> + struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
>> + unsigned long flags;
>> + struct queue_node *new;
> Should reject invalid map update flags here.

Will do in the new push helper proposed by Alexei.
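To make it concrete, this is roughly the shape I have in mind for the push
side; the helper name, the proto and the accepted flags below are only
tentative placeholders (nothing of this is part of this patch), it is just
meant to show where the flag check would live:

/* Tentative sketch only: a dedicated push helper that validates its flags
 * argument before handing the value to the map. Name, proto and the
 * accepted flags (here only BPF_EXIST, as an example) are placeholders.
 */
BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value,
           u64, flags)
{
        if (flags & ~BPF_EXIST)
                return -EINVAL;

        return map->ops->map_update_elem(map, NULL, value, flags);
}

const struct bpf_func_proto bpf_map_push_elem_proto = {
        .func           = bpf_map_push_elem,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_PTR_TO_MAP_VALUE,
        .arg3_type      = ARG_ANYTHING,
};

A matching pop helper would take only the map pointer and copy the popped
value into a buffer provided by the program, so the update/lookup
overloading used in this patch would eventually go away.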
>
>> + if (!queue_map_is_prealloc(queue)) {
>> + if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
>> + atomic_dec(&queue->count);
>> + return -E2BIG;
>> + }
>> +
>> + new = kmalloc_node(queue->node_size, GFP_ATOMIC | __GFP_NOWARN,
>> + queue->map.numa_node);
>> + if (!new) {
>> + atomic_dec(&queue->count);
>> + return -ENOMEM;
>> + }
>> + } else {
>> + struct pcpu_freelist_node *l;
>> +
>> + l = pcpu_freelist_pop(&queue->freelist);
>> + if (!l)
>> + return -E2BIG;
>> + new = container_of(l, struct queue_node, fnode);
>> + }
>> +
>> + memcpy(new->element, value, queue->map.value_size);
>> + new->queue = queue;
>> +
>> + raw_spin_lock_irqsave(&queue->lock, flags);
>> + switch (queue->type) {
>> + case QUEUE_FIFO:
>> + list_add_tail_rcu(&new->list, &queue->head);
>> + break;
>> +
>> + case QUEUE_LIFO:
>> + list_add_rcu(&new->list, &queue->head);
>> + break;
>> + }
>> +
>> + raw_spin_unlock_irqrestore(&queue->lock, flags);
>> +
>> + return 0;
>> +}
>> +
>> +/* Called from syscall or from eBPF program */
>> +static int queue_map_delete_elem(struct bpf_map *map, void *key)
>> +{
>> + return -EINVAL;
>> +}
>> +
>> +/* Called from syscall */
>> +static int queue_map_get_next_key(struct bpf_map *map, void *key,
>> + void *next_key)
>> +{
>> + return -EINVAL;
>> +}
>> +
>> +const struct bpf_map_ops queue_map_ops = {
>> + .map_alloc_check = queue_map_alloc_check,
>> + .map_alloc = queue_map_alloc,
>> + .map_free = queue_map_free,
>> + .map_lookup_elem = queue_map_lookup_elem,
>> + .map_update_elem = queue_map_update_elem,
>> + .map_delete_elem = queue_map_delete_elem,
>> + .map_get_next_key = queue_map_get_next_key,
>> +};
>> +
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index a31a1ba0f8ea..7e9a11d69eef 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
>> err = -EPERM;
>> goto err_put;
>> }
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + if (ukey) {
>> + err = -EINVAL;
>> + goto err_put;
>> + }
> Given you have a fixed key_size of 0, this could probably be made more
> generic and refactored a bit into a helper to check for 0 map->key_size
> instead of map type.

With a new set of helpers, the following changes are not needed anymore.

>
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> - goto err_put;
>> + key = NULL;
>> }
>>
>> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
>> @@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
>> goto err_put;
>> }
>>
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> - goto err_put;
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + if (ukey) {
>> + err = -EINVAL;
>> + goto err_put;
>> + }
>> +
>> + key = NULL;
>> }
> Ditto, and also further below as well for the other syscall cases.
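Agreed that a helper reads better. In case we end up keeping the
syscall-side changes, I understand the suggestion as something like the
sketch below (untested, and the helper name is made up), shared by the
lookup/update/delete paths:

/* Untested sketch of the refactoring suggested above: copy the key from
 * user space only when the map actually has a key (key_size != 0), and
 * reject a non-NULL user pointer for keyless maps. Helper name is made up.
 */
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
        if (key_size)
                return memdup_user(ukey, key_size);

        if (ukey)
                return ERR_PTR(-EINVAL);

        return NULL;
}

Then map_lookup_elem(), map_update_elem() and map_delete_elem() would just
do key = __bpf_copy_key(ukey, map->key_size) and check IS_ERR(key), instead
of testing the map type at every call site.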
>
>>
>> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
>> @@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
>> goto err_put;
>> }
>>
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> - goto err_put;
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + if (ukey) {
>> + err = -EINVAL;
>> + goto err_put;
>> + }
>> +
>> + key = NULL;
>> }
>>
>> if (bpf_map_is_dev_bound(map)) {
>> @@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
>> }
>>
>> if (ukey) {
>> - key = memdup_user(ukey, map->key_size);
>> - if (IS_ERR(key)) {
>> - err = PTR_ERR(key);
>> + if (map->map_type != BPF_MAP_TYPE_QUEUE) {
>> + key = memdup_user(ukey, map->key_size);
>> + if (IS_ERR(key)) {
>> + err = PTR_ERR(key);
>> + goto err_put;
>> + }
>> + } else {
>> + err = -EINVAL;
>> goto err_put;
>> }
>> } else {
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 25e47c195874..83099a9a21d9 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -1976,8 +1976,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>> return -EACCES;
>> }
>>
>> - if (arg_type == ARG_PTR_TO_MAP_KEY ||
>> - arg_type == ARG_PTR_TO_MAP_VALUE) {
>> + if (arg_type == ARG_PTR_TO_MAP_KEY) {
>> + expected_type = PTR_TO_STACK;
>> + if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
>> + type != expected_type && type != SCALAR_VALUE)
>> + goto err_type;
>> + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>> expected_type = PTR_TO_STACK;
>> if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
>> type != expected_type)
>> @@ -2021,6 +2025,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>> /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
>> meta->map_ptr = reg->map_ptr;
>> } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
>> + bool zero_size_allowed = false;
>> /* bpf_map_xxx(..., map_ptr, ..., key) call:
>> * check that [key, key + map->key_size) are within
>> * stack limits and initialized
>> @@ -2034,8 +2039,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>> verbose(env, "invalid map_ptr to access map->key\n");
>> return -EACCES;
>> }
>> +
>> + if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)
> Here as well.
>
>> + zero_size_allowed = true;
> Also, verifier should rather enforce a const NULL key here than allowing one to
> be set (but not as the only 'valid' input option), meaning, I don't think it would
> be good to allow a 'zero' sized pointer to stack in this case.
>
>> err = check_helper_mem_access(env, regno,
>> - meta->map_ptr->key_size, false,
>> + meta->map_ptr->key_size,
>> + zero_size_allowed,
>> + NULL);
>> } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
>> /* bpf_map_xxx(..., map_ptr, ..., value) call:
>>
>
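Regarding the NULL key, to make the intended usage concrete: below is more
or less how I expect a program to drive the map once the helper/verifier
question is settled. It is only a simplified sketch written against the
selftests conventions (bpf_helpers.h, struct bpf_map_def); the map name,
sizes and the SNAT scenario are just an example, it is not code from this
series:

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Sketch of the intended usage with this patch: key_size is 0, update()
 * pushes, lookup() pops, and the key argument is always NULL.
 */
struct bpf_map_def SEC("maps") port_pool = {
        .type           = BPF_MAP_TYPE_QUEUE,
        .key_size       = 0,
        .value_size     = sizeof(__u16),
        .max_entries    = 1024,
        .map_flags      = BPF_F_QUEUE_FIFO,
};

SEC("xdp")
int snat_example(struct xdp_md *ctx)
{
        __u16 *port;

        /* pop a free port from the pool (lookup == pop) */
        port = bpf_map_lookup_elem(&port_pool, NULL);
        if (!port)
                return XDP_DROP; /* pool exhausted */

        /* ... use *port for the translation, then give it back ... */
        bpf_map_update_elem(&port_pool, NULL, port, 0); /* update == push */

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";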