From mboxrd@z Thu Jan  1 00:00:00 1970
From: Daniel Borkmann <daniel@iogearbox.net>
Subject: Re: [PATCH bpf-next v6 2/4] bpf: sockmap, add hash map support
Date: Tue, 15 May 2018 21:01:38 +0200
Message-ID: <cc2e3011-0309-842c-4dbc-d3e82d5fc2d0@iogearbox.net>
References: <1526317219-7752-1-git-send-email-john.fastabend@gmail.com>
 <1526317219-7752-3-git-send-email-john.fastabend@gmail.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
Cc: netdev@vger.kernel.org, davem@davemloft.net
To: John Fastabend <john.fastabend@gmail.com>, ast@kernel.org
Return-path: <netdev-owner@vger.kernel.org>
Received: from www62.your-server.de ([213.133.104.62]:47768 "EHLO
        www62.your-server.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1752272AbeEOTBn (ORCPT
        <rfc822;netdev@vger.kernel.org>); Tue, 15 May 2018 15:01:43 -0400
In-Reply-To: <1526317219-7752-3-git-send-email-john.fastabend@gmail.com>
Content-Language: en-US
Sender: netdev-owner@vger.kernel.org
List-ID: <netdev.vger.kernel.org>

On 05/14/2018 07:00 PM, John Fastabend wrote:
> Sockmap is currently backed by an array and enforces keys to be
> four bytes. This works well for many use cases and was originally
> modeled after devmap which also uses four bytes keys. However,
> this has become limiting in larger use cases where a hash would
> be more appropriate. For example users may want to use the 5-tuple
> of the socket as the lookup key.
> 
> To support this add hash support.
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> Acked-by: David S. Miller <davem@davemloft.net>
> ---
>  include/linux/bpf.h       |   8 +
>  include/linux/bpf_types.h |   1 +
>  include/uapi/linux/bpf.h  |  52 ++++-
>  kernel/bpf/core.c         |   1 +
>  kernel/bpf/sockmap.c      | 494 ++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/bpf/verifier.c     |  14 +-
>  net/core/filter.c         |  58 ++++++
>  7 files changed, 610 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index a38e474..ed0122b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
>  
>  #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
>  struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
>  int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
>  #else
>  static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
> @@ -675,6 +676,12 @@ static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
>  	return NULL;
>  }
>  
> +static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
> +						    void *key)
> +{
> +	return NULL;
> +}
> +
>  static inline int sock_map_prog(struct bpf_map *map,
>  				struct bpf_prog *prog,
>  				u32 type)
> @@ -724,6 +731,7 @@ static inline void __xsk_map_flush(struct bpf_map *map)
>  extern const struct bpf_func_proto bpf_get_stackid_proto;
>  extern const struct bpf_func_proto bpf_get_stack_proto;
>  extern const struct bpf_func_proto bpf_sock_map_update_proto;
> +extern const struct bpf_func_proto bpf_sock_hash_update_proto;
>  
>  /* Shared helpers among cBPF and eBPF. */
>  void bpf_user_rnd_init_once(void);
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index d7df1b32..b67f879 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -47,6 +47,7 @@
>  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
>  #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
>  #endif
>  BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
>  #if defined(CONFIG_XDP_SOCKETS)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 02e4112..1205d86 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -118,6 +118,7 @@ enum bpf_map_type {
>  	BPF_MAP_TYPE_SOCKMAP,
>  	BPF_MAP_TYPE_CPUMAP,
>  	BPF_MAP_TYPE_XSKMAP,
> +	BPF_MAP_TYPE_SOCKHASH,
>  };
>  
>  enum bpf_prog_type {
> @@ -1855,6 +1856,52 @@ struct bpf_stack_build_id {
>   *             Egress device index on success, 0 if packet needs to continue
>   *             up the stack for further processing or a negative error in case
>   *             of failure.
> + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)

When you rebase please fix this up properly next time and add a newline in between
the helpers. I fixed this up while applying.

> + *	Description
> + *		Add an entry to, or update a sockhash *map* referencing sockets.
> + *		The *skops* is used as a new value for the entry associated to
> + *		*key*. *flags* is one of:
> + *
> + *		**BPF_NOEXIST**
> + *			The entry for *key* must not exist in the map.
> + *		**BPF_EXIST**
> + *			The entry for *key* must already exist in the map.
> + *		**BPF_ANY**
> + *			No condition on the existence of the entry for *key*.
> + *
> + *		If the *map* has eBPF programs (parser and verdict), those will
> + *		be inherited by the socket being added. If the socket is
> + *		already attached to eBPF programs, this results in an error.
> + *	Return
> + *		0 on success, or a negative error in case of failure.
[...]
> +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
> +{
> +	struct bpf_htab *htab;
> +	int i, err;
> +	u64 cost;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return ERR_PTR(-EPERM);
> +
> +	/* check sanity of attributes */
> +	if (attr->max_entries == 0 || attr->value_size != 4 ||
> +	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
> +		return ERR_PTR(-EINVAL);
> +
> +	err = bpf_tcp_ulp_register();
> +	if (err && err != -EEXIST)
> +		return ERR_PTR(err);
> +
> +	htab = kzalloc(sizeof(*htab), GFP_USER);
> +	if (!htab)
> +		return ERR_PTR(-ENOMEM);
> +
> +	bpf_map_init_from_attr(&htab->map, attr);
> +
> +	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
> +	htab->elem_size = sizeof(struct htab_elem) +
> +			  round_up(htab->map.key_size, 8);
> +
> +	if (htab->n_buckets == 0 ||
> +	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
> +		goto free_htab;
> +
> +	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
> +	       (u64) htab->elem_size * htab->map.max_entries;
> +
> +	if (cost >= U32_MAX - PAGE_SIZE)
> +		goto free_htab;

Also above two goto free_htab were buggy! Error after bpf_tcp_ulp_register()
is either 0 or -EEXIST, if it's 0, then when you bail out here this will cause
a NULL pointer dereference in subsequent map setup paths. Adapting the same
logic from sock_map_alloc() would have been enough. I've added a err = -EINVAL
to the first occasion above this time.

> +	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
> +	err = bpf_map_precharge_memlock(htab->map.pages);
> +	if (err)
> +		goto free_htab;
> +
> +	err = -ENOMEM;
> +	htab->buckets = bpf_map_area_alloc(
> +				htab->n_buckets * sizeof(struct bucket),
> +				htab->map.numa_node);
> +	if (!htab->buckets)
> +		goto free_htab;
> +
> +	for (i = 0; i < htab->n_buckets; i++) {
> +		INIT_HLIST_HEAD(&htab->buckets[i].head);
> +		raw_spin_lock_init(&htab->buckets[i].lock);
> +	}
> +
> +	return &htab->map;
> +free_htab:
> +	kfree(htab);
> +	return ERR_PTR(err);
> +}