From: Martin KaFai Lau <kafai@fb.com>
To: KP Singh <kpsingh@kernel.org>
Cc: <bpf@vger.kernel.org>, Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Andrii Nakryiko <andrii@kernel.org>,
	"Paul E. McKenney" <paulmck@kernel.org>,
	Jann Horn <jannh@google.com>,
	Florent Revest <revest@chromium.org>,
	Brendan Jackman <jackmanb@chromium.org>
Subject: Re: [PATCH bpf-next 1/2] bpf: Allow bpf_local_storage to be used by sleepable programs
Date: Fri, 27 Aug 2021 13:55:30 -0700
Message-ID: <20210827205530.zzqawd6wz52n65qh@kafai-mbp>
In-Reply-To: <20210826235127.303505-2-kpsingh@kernel.org>

On Thu, Aug 26, 2021 at 11:51:26PM +0000, KP Singh wrote:
> Other maps like hashmaps are already available to sleepable programs.
> Sleepable BPF programs run under trace RCU. Allow task, sk and inode
> storage to be used from sleepable programs.
> 
> The local storage code mostly runs under the program's RCU read-side
> critical section (entered in __bpf_prog_enter{_sleepable} and exited in
> __bpf_prog_exit{_sleepable}, via rcu_read_lock or rcu_read_lock_trace),
> with the exception of the logic where the map is freed.
> 
> After some discussions and help from Jann Horn, the following changes
> were made:
> 
> bpf_local_storage{_elem} are now freed via a call_rcu_tasks_trace
> callback instead of a direct kfree_rcu, which would not respect the
> trace RCU grace periods. The callback then frees the storage/selem
> with kfree_rcu, which handles the normal RCU grace period, similar to
> BPF trampolines.
> 
> Update synchronize_rcu to synchronize_rcu_mult in the map freeing
> code to wait for both trace RCU and normal RCU grace periods.
> While this is an expensive operation, bpf_local_storage_map_free
> is not called from within a BPF program; it is only called when the
> owning object is being freed.
> 
> Update the dereferencing of the pointers to use rcu_dereference_protected
> (with either the trace or normal RCU locks held) and add warnings at the
> beginning of the get and delete helpers.
> 
> Signed-off-by: KP Singh <kpsingh@kernel.org>
> ---
>  include/linux/bpf_local_storage.h |  5 ++++
>  kernel/bpf/bpf_inode_storage.c    |  9 +++++--
>  kernel/bpf/bpf_local_storage.c    | 43 ++++++++++++++++++++++++-------
>  kernel/bpf/bpf_task_storage.c     |  6 ++++-
>  kernel/bpf/verifier.c             |  3 +++
>  net/core/bpf_sk_storage.c         |  8 +++++-
>  6 files changed, 61 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 24496bc28e7b..8453488b334d 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -16,6 +16,9 @@
>  
>  #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
>  
> +#define bpf_local_storage_rcu_lock_held()			\
> +	(rcu_read_lock_held() || rcu_read_lock_trace_held() ||	\
> +		rcu_read_lock_bh_held())
There is a similar test in hashtab.  Maybe rename it to a more
generic name so that it can be reused there?
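
Something like this, perhaps (the generic name and where it lives are
just a strawman on my side):

#define bpf_rcu_lock_held()					\
	(rcu_read_lock_held() || rcu_read_lock_trace_held() ||	\
	 rcu_read_lock_bh_held())

Then both the local storage code and the WARN_ON_ONCE() checks in
hashtab.c could share it.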

[ ... ]

> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index b305270b7a4b..7760bc4e9565 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -11,6 +11,8 @@
>  #include <net/sock.h>
>  #include <uapi/linux/sock_diag.h>
>  #include <uapi/linux/btf.h>
> +#include <linux/rcupdate_trace.h>
> +#include <linux/rcupdate_wait.h>
>  
>  #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
>  
> @@ -81,6 +83,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>  	return NULL;
>  }
>  
> +void bpf_local_storage_free_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_local_storage *local_storage;
> +
> +	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
> +	kfree_rcu(local_storage, rcu);
> +}
> +
> +static void bpf_selem_free_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_local_storage_elem *selem;
> +
> +	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
> +	kfree_rcu(selem, rcu);
> +}
> +
>  /* local_storage->lock must be held and selem->local_storage == local_storage.
>   * The caller must ensure selem->smap is still valid to be
>   * dereferenced for its smap->elem_size and smap->cache_idx.
> @@ -118,12 +136,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
>  		 *
>  		 * Although the unlock will be done under
>  		 * rcu_read_lock(), it is more intuitive to
> -		 * read if kfree_rcu(local_storage, rcu) is done
> +		 * read if the freeing of the storage is done
>  		 * after the raw_spin_unlock_bh(&local_storage->lock).
>  		 *
>  		 * Hence, a "bool free_local_storage" is returned
> -		 * to the caller which then calls the kfree_rcu()
> -		 * after unlock.
> +		 * to the caller, which then frees the storage after
> +		 * all the RCU grace periods have expired.
>  		 */
>  	}
>  	hlist_del_init_rcu(&selem->snode);
> @@ -131,7 +149,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
>  	    SDATA(selem))
>  		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
>  
> -	kfree_rcu(selem, rcu);
> +	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
Although the common use case is usually storage_get() much more often
than storage_delete(), are you aware of any performance impact for
a bpf prog that does a lot of storage_delete()?
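
(If it helps to quantify it, something like the following sleepable
LSM prog could hammer the delete path before/after this series.
Untested sketch, the prog/map/struct names here are made up:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct bench_val {
	__u64 cnt;
};

struct {
	__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct bench_val);
} bench_map SEC(".maps");

/* "lsm.s/" makes the prog sleepable; every file_open does a
 * create + delete, so each call exercises the
 * call_rcu_tasks_trace() path in the selem unlink.
 */
SEC("lsm.s/file_open")
int BPF_PROG(bench_delete, struct file *file)
{
	struct bench_val *v;

	v = bpf_inode_storage_get(&bench_map, file->f_inode, 0,
				  BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (v)
		bpf_inode_storage_delete(&bench_map, file->f_inode);
	return 0;
}

char _license[] SEC("license") = "GPL";
)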

>  
>  	return free_local_storage;
>  }
> @@ -154,7 +172,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>  	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
>  
>  	if (free_local_storage)
> -		kfree_rcu(local_storage, rcu);
> +		call_rcu_tasks_trace(&local_storage->rcu,
> +				     bpf_local_storage_free_rcu);
>  }
>  
>  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> @@ -213,7 +232,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
>  	struct bpf_local_storage_elem *selem;
>  
>  	/* Fast path (cache hit) */
> -	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
> +	sdata = rcu_dereference_protected(local_storage->cache[smap->cache_idx],
> +					  bpf_local_storage_rcu_lock_held());
There are other places that also use rcu_dereference(),
e.g. in bpf_local_storage_update().
Should they be changed as well?
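
e.g. something like this there (untested, from memory, just to
illustrate the idea):

-	local_storage = rcu_dereference(*owner_storage(smap, owner));
+	local_storage =
+		rcu_dereference_protected(*owner_storage(smap, owner),
+					  bpf_local_storage_rcu_lock_held());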

[ ... ]

> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -13,6 +13,7 @@
>  #include <net/sock.h>
>  #include <uapi/linux/sock_diag.h>
>  #include <uapi/linux/btf.h>
> +#include <linux/rcupdate_trace.h>
>  
>  DEFINE_BPF_STORAGE_CACHE(sk_cache);
>  
> @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
>  	struct bpf_local_storage *sk_storage;
>  	struct bpf_local_storage_map *smap;
>  
> -	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> +	sk_storage = rcu_dereference_protected(sk->sk_bpf_storage,
> +					       bpf_local_storage_rcu_lock_held());
>  	if (!sk_storage)
>  		return NULL;
>  
> @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  {
>  	struct bpf_local_storage_data *sdata;
>  
> +	WARN_ON_ONCE(!bpf_local_storage_rcu_lock_held());
>  	if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
sk is protected by rcu_read_lock() here.
Is it always safe to access it with rcu_read_lock_trace() alone?

>  		return (unsigned long)NULL;
>  
