From: Daniel Borkmann <daniel@iogearbox.net>
To: Jesper Dangaard Brouer <brouer@redhat.com>,
	netdev@vger.kernel.org, Daniel Borkmann <borkmann@iogearbox.net>,
	Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: "Christoph Hellwig" <hch@infradead.org>,
	"Björn Töpel" <bjorn.topel@intel.com>,
	"Magnus Karlsson" <magnus.karlsson@intel.com>,
	makita.toshiaki@lab.ntt.co.jp
Subject: Re: [bpf-next V4 PATCH 2/8] bpf: devmap prepare xdp frames for bulking
Date: Wed, 23 May 2018 11:54:38 +0200	[thread overview]
Message-ID: <1db29c53-1568-a3b5-f2d3-1c830aefe33b@iogearbox.net>
In-Reply-To: <152665048175.21055.15345477060144555643.stgit@firesoul>

On 05/18/2018 03:34 PM, Jesper Dangaard Brouer wrote:
> Like cpumap, create a queue for xdp frames that will be bulked.  For now,
> this patch simply invokes ndo_xdp_xmit for each frame.  This happens
> either when the map flush operation is invoked, or when the limit
> DEV_MAP_BULK_SIZE is reached.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  kernel/bpf/devmap.c |   77 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 73 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 808808bf2bf2..cab72c100bb5 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -54,11 +54,18 @@
>  #define DEV_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>  
> +#define DEV_MAP_BULK_SIZE 16
> +struct xdp_bulk_queue {
> +	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
> +	unsigned int count;
> +};
> +
>  /* objects in the map */
>  struct bpf_dtab_netdev {
>  	struct net_device *dev; /* must be first member, due to tracepoint */
>  	struct bpf_dtab *dtab;
>  	unsigned int bit;
> +	struct xdp_bulk_queue __percpu *bulkq;
>  	struct rcu_head rcu;
>  };
>  
> @@ -209,6 +216,38 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
>  	__set_bit(bit, bitmap);
>  }
>  
> +static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> +			 struct xdp_bulk_queue *bq)
> +{
> +	unsigned int processed = 0, drops = 0;
> +	struct net_device *dev = obj->dev;
> +	int i;
> +
> +	if (unlikely(!bq->count))
> +		return 0;
> +
> +	for (i = 0; i < bq->count; i++) {
> +		struct xdp_frame *xdpf = bq->q[i];
> +
> +		prefetch(xdpf);
> +	}
> +
> +	for (i = 0; i < bq->count; i++) {
> +		struct xdp_frame *xdpf = bq->q[i];
> +		int err;
> +
> +		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> +		if (err) {
> +			drops++;
> +			xdp_return_frame(xdpf);
> +		}
> +		processed++;

This sort of thing makes the patch really hard to review: 'processed' and
'drops' are not read anywhere in this function, so I need to go and check
all the other patches to see whether there's further logic involved here
or not. I had to review your series after applying all patches in a local
branch; please never do this. Add the logic in a patch where it's
self-contained and obvious to review.
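
E.g., one way to make it self-contained would be to consume the counters
right in this patch. Rough sketch only; the trace_xdp_devmap_xmit() call
and its arguments below are made up for illustration (the real devmap
tracepoint only shows up later in this series, in patch 3/8):

static int bq_xmit_all(struct bpf_dtab_netdev *obj,
		       struct xdp_bulk_queue *bq)
{
	unsigned int processed = 0, drops = 0;
	struct net_device *dev = obj->dev;
	int i;

	if (unlikely(!bq->count))
		return 0;

	/* Prefetch all frames before touching them in the xmit loop. */
	for (i = 0; i < bq->count; i++)
		prefetch(bq->q[i]);

	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];
		int err;

		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
		if (err) {
			drops++;
			xdp_return_frame(xdpf);
		}
		processed++;
	}
	bq->count = 0;

	/* Hypothetical consumer, so both counters are actually read in
	 * the same patch that introduces them:
	 */
	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, processed, drops);
	return 0;
}

That way a reviewer sees producer and consumer in one hunk.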

> +	}
> +	bq->count = 0;
> +
> +	return 0;
> +}
> +
>  /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
>   * from the driver before returning from its napi->poll() routine. The poll()
>   * routine is called either from busy_poll context or net_rx_action signaled
> @@ -224,6 +263,7 @@ void __dev_map_flush(struct bpf_map *map)
>  
>  	for_each_set_bit(bit, bitmap, map->max_entries) {
>  		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
> +		struct xdp_bulk_queue *bq;
>  		struct net_device *netdev;
>  
>  		/* This is possible if the dev entry is removed by user space
> @@ -233,6 +273,9 @@ void __dev_map_flush(struct bpf_map *map)
>  			continue;
>  
>  		__clear_bit(bit, bitmap);
> +
> +		bq = this_cpu_ptr(dev->bulkq);
> +		bq_xmit_all(dev, bq);
>  		netdev = dev->dev;
>  		if (likely(netdev->netdev_ops->ndo_xdp_flush))
>  			netdev->netdev_ops->ndo_xdp_flush(netdev);
> @@ -255,6 +298,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
>  	return obj;
>  }
>  
> +/* Runs under RCU-read-side, plus in softirq under NAPI protection.
> + * Thus, safe percpu variable access.
> + */
> +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
> +{
> +	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
> +
> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> +		bq_xmit_all(obj, bq);
> +
> +	bq->q[bq->count++] = xdpf;
> +	return 0;
> +}
> +
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
>  {
>  	struct net_device *dev = dst->dev;
> @@ -268,8 +325,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
>  	if (unlikely(!xdpf))
>  		return -EOVERFLOW;
>  
> -	/* TODO: implement a bulking/enqueue step later */
> -	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> +	err = bq_enqueue(dst, xdpf);
>  	if (err)
>  		return err;
>  
> @@ -288,13 +344,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>  {
>  	if (dev->dev->netdev_ops->ndo_xdp_flush) {
>  		struct net_device *fl = dev->dev;
> +		struct xdp_bulk_queue *bq;
>  		unsigned long *bitmap;
> +
>  		int cpu;

Please remove the stray blank line above.
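I.e., the declaration block should simply read:

		struct net_device *fl = dev->dev;
		struct xdp_bulk_queue *bq;
		unsigned long *bitmap;
		int cpu;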

>  		for_each_online_cpu(cpu) {
>  			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
>  			__clear_bit(dev->bit, bitmap);
>  
> +			bq = per_cpu_ptr(dev->bulkq, cpu);
> +			bq_xmit_all(dev, bq);
> +
>  			fl->netdev_ops->ndo_xdp_flush(dev->dev);
>  		}
>  	}
> @@ -306,6 +367,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
>  
>  	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
>  	dev_map_flush_old(dev);
> +	free_percpu(dev->bulkq);
>  	dev_put(dev->dev);
>  	kfree(dev);
>  }
> @@ -338,6 +400,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>  	struct net *net = current->nsproxy->net_ns;
> +	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
>  	struct bpf_dtab_netdev *dev, *old_dev;
>  	u32 i = *(u32 *)key;
>  	u32 ifindex = *(u32 *)value;
> @@ -352,11 +415,17 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	if (!ifindex) {
>  		dev = NULL;
>  	} else {
> -		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
> -				   map->numa_node);
> +		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
>  		if (!dev)
>  			return -ENOMEM;
>  
> +		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
> +						sizeof(void *), gfp);
> +		if (!dev->bulkq) {
> +			kfree(dev);
> +			return -ENOMEM;
> +		}
> +
>  		dev->dev = dev_get_by_index(net, ifindex);
>  		if (!dev->dev) {
>  			kfree(dev);

Ahh well, and I just realized that you are leaking memory in the error path here. :(

A free_percpu(dev->bulkq) is missing.
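
I.e., roughly (the return code is unchanged from the existing error path,
which the hunk above cuts off):

		dev->dev = dev_get_by_index(net, ifindex);
		if (!dev->dev) {
			free_percpu(dev->bulkq);	/* this is the missing cleanup */
			kfree(dev);
			return -EINVAL;	/* as in the existing code, elided above */
		}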

Please fix this bug up and send a fresh series, so we can get this in right
away without needing a follow-up in bpf-next.

Thanks,
Daniel

