Re: [PATCH bpf-next v2 1/4] bpf: Generalize bpf_sk_storage

From: Martin KaFai Lau <kafai@fb.com>
To: KP Singh <kpsingh@chromium.org>
Cc: <bpf@vger.kernel.org>, <linux-security-module@vger.kernel.org>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Paul Turner <pjt@google.com>, Jann Horn <jannh@google.com>
Subject: Re: [PATCH bpf-next v2 1/4] bpf: Generalize bpf_sk_storage
Date: Thu, 18 Jun 2020 23:43:32 -0700	[thread overview]
Message-ID: <20200619064332.fycpxuegmmkbfe54@kafai-mbp.dhcp.thefacebook.com> (raw)
In-Reply-To: <20200617202941.3034-2-kpsingh@chromium.org>

On Wed, Jun 17, 2020 at 10:29:38PM +0200, KP Singh wrote:
> From: KP Singh <kpsingh@google.com>
> 
> Refactor the functionality in bpf_sk_storage.c so that concept of
> storage linked to kernel objects can be extended to other objects like
> inode, task_struct etc.
> 
> bpf_sk_storage is updated to be bpf_local_storage with a union that
> contains a pointer to the owner object. The type of the
> bpf_local_storage can be determined using the newly added
> bpf_local_storage_type enum.
> 
> Each new local storage will still be a separate map and provide its own
> set of helpers. This allows for future object specific extensions and
> still share a lot of the underlying implementation.
Thanks for taking up this effort to refactor sk_local_storage.

I took a quick look.  I have some comments and would like to explore
some thoughts.

> --- a/net/core/bpf_sk_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -1,19 +1,22 @@
>  // SPDX-License-Identifier: GPL-2.0
>  /* Copyright (c) 2019 Facebook  */
> +#include "linux/bpf.h"
> +#include "asm-generic/bug.h"
> +#include "linux/err.h"
"<" ">"

>  #include <linux/rculist.h>
>  #include <linux/list.h>
>  #include <linux/hash.h>
>  #include <linux/types.h>
>  #include <linux/spinlock.h>
>  #include <linux/bpf.h>
> -#include <net/bpf_sk_storage.h>
> +#include <linux/bpf_local_storage.h>
>  #include <net/sock.h>
>  #include <uapi/linux/sock_diag.h>
>  #include <uapi/linux/btf.h>
>  
>  static atomic_t cache_idx;
inode local storage and sk local storage probably need a separate
cache_idx.  An improvement on picking cache_idx has just been
landed also.

[ ... ]

> +struct bpf_local_storage {
> +	struct bpf_local_storage_data __rcu *cache[BPF_STORAGE_CACHE_SIZE];
> +	struct hlist_head list;		/* List of bpf_local_storage_elem */
> +	/* The object that owns the the above "list" of
> +	 * bpf_local_storage_elem
> +	 */
> +	union {
> +		struct sock *sk;
> +	};
>  	struct rcu_head rcu;
>  	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
> +	enum bpf_local_storage_type stype;
>  };

[ ... ]

> +static struct bpf_local_storage_elem *sk_selem_alloc(
> +	struct bpf_local_storage_map *smap, struct sock *sk, void *value,
> +	bool charge_omem)
> +{
> +	struct bpf_local_storage_elem *selem;
> +
> +	if (charge_omem && omem_charge(sk, smap->elem_size))
> +		return NULL;
> +
> +	selem = selem_alloc(smap, value);
> +	if (selem)
> +		return selem;
> +
>  	if (charge_omem)
>  		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
>  
>  	return NULL;
>  }
>  
> -/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
> +static void __unlink_local_storage(struct bpf_local_storage *local_storage,
> +				   bool uncharge_omem)
Nit. indent is off.  There are a few more cases like this.

> +{
> +	struct sock *sk;
> +
> +	switch (local_storage->stype) {
Does it need a new bpf_local_storage_type?  Is map_type as good?

Instead of adding any new member (e.g. stype) to
"struct bpf_local_storage",  can the smap pointer be directly used
here instead?

For example in __unlink_local_storage() here, it should
have a hold to the selem which then has a hold to smap.

> +	case BPF_LOCAL_STORAGE_SK:
> +		sk = local_storage->sk;
> +		if (uncharge_omem)
> +			atomic_sub(sizeof(struct bpf_local_storage),
> +				   &sk->sk_omem_alloc);
> +
> +		/* After this RCU_INIT, sk may be freed and cannot be used */
> +		RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
> +		local_storage->sk = NULL;
> +		break;
> +	}
Another thought on the stype switch cases.

Instead of having multiple switches on stype in bpf_local_storage.c which may
not be scalable soon if we are planning to support a few more kernel objects,
have you considered putting them into its own "ops".  May be a few new
ops can be added to bpf_map_ops to do local storage unlink/update/alloc...etc.

> +}
> +
> +/* local_storage->lock must be held and selem->local_storage == local_storage.
>   * The caller must ensure selem->smap is still valid to be
>   * dereferenced for its smap->elem_size and smap->cache_idx.
> + *
> + * uncharge_omem is only relevant when:
> + *
> + *	local_storage->stype == BPF_LOCAL_STORAGE_SK
>   */

[ ... ]

> @@ -845,7 +947,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  	   void *, value, u64, flags)
>  {
> -	struct bpf_sk_storage_data *sdata;
> +	struct bpf_local_storage_data *sdata;
>  
>  	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
>  		return (unsigned long)NULL;
> @@ -854,14 +956,14 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  	if (sdata)
>  		return (unsigned long)sdata->data;
>  
> -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> +	if (flags == BPF_LOCAL_STORAGE_GET_F_CREATE &&
>  	    /* Cannot add new elem to a going away sk.
>  	     * Otherwise, the new elem may become a leak
>  	     * (and also other memory issues during map
>  	     *  destruction).
>  	     */
>  	    refcount_inc_not_zero(&sk->sk_refcnt)) {
> -		sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
> +		sdata = local_storage_update(sk, map, value, BPF_NOEXIST);
>  		/* sk must be a fullsock (guaranteed by verifier),
>  		 * so sock_gen_put() is unnecessary.
>  		 */
> @@ -887,14 +989,14 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
>  }
>  
>  const struct bpf_map_ops sk_storage_map_ops = {
> -	.map_alloc_check = bpf_sk_storage_map_alloc_check,
> -	.map_alloc = bpf_sk_storage_map_alloc,
> -	.map_free = bpf_sk_storage_map_free,
> +	.map_alloc_check = bpf_local_storage_map_alloc_check,
> +	.map_alloc = bpf_local_storage_map_alloc,
> +	.map_free = bpf_local_storage_map_free,
>  	.map_get_next_key = notsupp_get_next_key,
> -	.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
> -	.map_update_elem = bpf_fd_sk_storage_update_elem,
> -	.map_delete_elem = bpf_fd_sk_storage_delete_elem,
> -	.map_check_btf = bpf_sk_storage_map_check_btf,
> +	.map_lookup_elem = bpf_sk_storage_lookup_elem,
> +	.map_update_elem = bpf_sk_storage_update_elem,
> +	.map_delete_elem = bpf_sk_storage_delete_elem,
> +	.map_check_btf = bpf_local_storage_map_check_btf,
>  };
>  
>  const struct bpf_func_proto bpf_sk_storage_get_proto = {
> @@ -975,7 +1077,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
>  	u32 nr_maps = 0;
>  	int rem, err;
>  
> -	/* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
> +	/* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as
>  	 * the map_alloc_check() side also does.
>  	 */
>  	if (!bpf_capable())
> @@ -1025,10 +1127,10 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
>  }
>  EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
Would it be cleaner to leave bpf_sk specific function, map_ops, and func_proto
in net/core/bpf_sk_storage.c?

There is a test in map_tests/sk_storage_map.c, in case you may not notice.