All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jakub Kicinski <jakub.kicinski@netronome.com>
To: "Toke Høiland-Jørgensen" <toke@redhat.com>
Cc: David Miller <davem@davemloft.net>,
	netdev@vger.kernel.org,
	Jesper Dangaard Brouer <brouer@redhat.com>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Alexei Starovoitov <ast@kernel.org>
Subject: Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions
Date: Fri, 1 Mar 2019 17:08:31 -0800	[thread overview]
Message-ID: <20190301170831.6ae29baa@cakuba.netronome.com> (raw)
In-Reply-To: <155144955040.28287.1075106871059918653.stgit@alrua-x1>

On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> The subsequent commits introducing default maps and a hash-based ifindex
> devmap require a bit of refactoring of the devmap code. Perform this first
> so the subsequent commits become easier to read.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>

> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 191b79948424..1037fc08c504 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -75,6 +75,7 @@ struct bpf_dtab {
>  	struct bpf_dtab_netdev **netdev_map;
>  	unsigned long __percpu *flush_needed;
>  	struct list_head list;
> +	struct rcu_head rcu;

I think the RCU change may warrant a separate commit or at least a
mention in the commit message ;)

>  };
>  
>  static DEFINE_SPINLOCK(dev_map_lock);
> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>  	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>  }
>  
> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> +			    bool check_memlock)
>  {
> -	struct bpf_dtab *dtab;
> -	int err = -EINVAL;
>  	u64 cost;
> -
> -	if (!capable(CAP_NET_ADMIN))
> -		return ERR_PTR(-EPERM);
> -
> -	/* check sanity of attributes */
> -	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> -		return ERR_PTR(-EINVAL);

perhaps consider moving these to a map_alloc_check callback?

> -	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> -	if (!dtab)
> -		return ERR_PTR(-ENOMEM);
> +	int err;
>  
>  	bpf_map_init_from_attr(&dtab->map, attr);
>  
> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>  	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>  	if (cost >= U32_MAX - PAGE_SIZE)
> -		goto free_dtab;
> +		return -EINVAL;
>  
>  	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>  
> -	/* if map size is larger than memlock limit, reject it early */
> -	err = bpf_map_precharge_memlock(dtab->map.pages);
> -	if (err)
> -		goto free_dtab;
> -
> -	err = -ENOMEM;
> +	if (check_memlock) {
> +		/* if map size is larger than memlock limit, reject it early */
> +		err = bpf_map_precharge_memlock(dtab->map.pages);
> +		if (err)
> +			return -EINVAL;
> +	}
>  
>  	/* A per cpu bitfield with a bit per possible net device */
>  	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>  						__alignof__(unsigned long),
>  						GFP_KERNEL | __GFP_NOWARN);
>  	if (!dtab->flush_needed)
> -		goto free_dtab;
> +		goto err_alloc;
>  
>  	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>  					      sizeof(struct bpf_dtab_netdev *),
>  					      dtab->map.numa_node);
>  	if (!dtab->netdev_map)
> -		goto free_dtab;
> +		goto err_map;
>  
> -	spin_lock(&dev_map_lock);
> -	list_add_tail_rcu(&dtab->list, &dev_map_list);
> -	spin_unlock(&dev_map_lock);
> +	return 0;
>  
> -	return &dtab->map;
> -free_dtab:
> + err_map:

The label should name the first action.  So it was correct, you're
making it less good :)  Also why the space?

>  	free_percpu(dtab->flush_needed);
> -	kfree(dtab);
> -	return ERR_PTR(err);
> + err_alloc:

and no need for this one

> +	return -ENOMEM;
>  }
>  
> -static void dev_map_free(struct bpf_map *map)
> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  {
> -	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> -	int i, cpu;
> +	struct bpf_dtab *dtab;
> +	int err;
>  
> -	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> -	 * so the programs (can be more than one that used this map) were
> -	 * disconnected from events. Wait for outstanding critical sections in
> -	 * these programs to complete. The rcu critical section only guarantees
> -	 * no further reads against netdev_map. It does __not__ ensure pending
> -	 * flush operations (if any) are complete.
> -	 */
> +	if (!capable(CAP_NET_ADMIN))
> +		return ERR_PTR(-EPERM);
> +
> +	/* check sanity of attributes */
> +	if (attr->max_entries == 0 || attr->key_size != 4 ||
> +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> +		return ERR_PTR(-EINVAL);
> +
> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> +	if (!dtab)
> +		return ERR_PTR(-ENOMEM);
> +
> +	err = dev_map_init_map(dtab, attr, true);
> +	if (err) {
> +		kfree(dtab);
> +		return ERR_PTR(err);
> +	}
>  
>  	spin_lock(&dev_map_lock);
> -	list_del_rcu(&dtab->list);
> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>  	spin_unlock(&dev_map_lock);
>  
> -	bpf_clear_redirect_map(map);
> -	synchronize_rcu();
> +	return &dtab->map;
> +}
> +
> +static void __dev_map_free(struct rcu_head *rcu)
> +{
> +	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
> +	int i, cpu;
>  
>  	/* To ensure all pending flush operations have completed wait for flush
>  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)

There is a cond_resched() here, I don't think you can call
cond_resched() from an RCU callback.

>  	kfree(dtab);
>  }
>  
> +static void dev_map_free(struct bpf_map *map)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +
> +	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> +	 * so the programs (can be more than one that used this map) were
> +	 * disconnected from events. Wait for outstanding critical sections in
> +	 * these programs to complete. The rcu critical section only guarantees
> +	 * no further reads against netdev_map. It does __not__ ensure pending
> +	 * flush operations (if any) are complete.
> +	 */
> +
> +	spin_lock(&dev_map_lock);
> +	list_del_rcu(&dtab->list);
> +	spin_unlock(&dev_map_lock);
> +
> +	bpf_clear_redirect_map(map);
> +	call_rcu(&dtab->rcu, __dev_map_free);
> +}
> +
>  static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);

  reply	other threads:[~2019-03-02  1:08 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-03-01 14:12 [PATCH net-next v3 0/3] xdp: Use a default map for xdp_redirect helper Toke Høiland-Jørgensen
2019-03-01 14:12 ` [PATCH net-next v3 3/3] xdp: Add devmap_idx map type for looking up devices by ifindex Toke Høiland-Jørgensen
2019-03-01 14:12 ` [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions Toke Høiland-Jørgensen
2019-03-02  1:08   ` Jakub Kicinski [this message]
2019-03-04 12:47     ` Toke Høiland-Jørgensen
2019-03-04 17:08       ` Jakub Kicinski
2019-03-04 17:37         ` Toke Høiland-Jørgensen
2019-03-01 14:12 ` [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device Toke Høiland-Jørgensen
2019-03-02  2:09   ` Jakub Kicinski
2019-03-04 11:58     ` Toke Høiland-Jørgensen
2019-03-04 17:44       ` Jakub Kicinski
2019-03-04 19:05         ` Toke Høiland-Jørgensen
2019-03-04 22:15           ` Jakub Kicinski
2019-03-04 22:28             ` Toke Høiland-Jørgensen
2019-03-04 22:49               ` Jakub Kicinski
2019-03-05  9:53                 ` Toke Høiland-Jørgensen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190301170831.6ae29baa@cakuba.netronome.com \
    --to=jakub.kicinski@netronome.com \
    --cc=ast@kernel.org \
    --cc=brouer@redhat.com \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=toke@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.