From: Roman Gushchin <guro@fb.com>
To: Song Liu <songliubraving@fb.com>
Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Kernel Team <Kernel-team@fb.com>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"Alexei Starovoitov" <ast@kernel.org>
Subject: Re: [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage
Date: Wed, 26 Sep 2018 08:42:23 +0000
Message-ID: <20180926084208.GA25056@castle.DHCP.thefacebook.com>
In-Reply-To: <27B81458-17C5-4840-9679-D1F5FBF6E805@fb.com>

On Tue, Sep 25, 2018 at 11:54:33AM -0700, Song Liu wrote:
> 
> 
> > On Sep 25, 2018, at 8:21 AM, Roman Gushchin <guro@fb.com> wrote:
> > 
> > This commit introduces per-cpu cgroup local storage.
> > 
> > Per-cpu cgroup local storage is very similar to simple cgroup storage
> > (let's call it shared), except that all the data is per-cpu.
> > 
> > The main goal of the per-cpu variant is to implement super-fast
> > counters (e.g. packet counters), which require neither lookups nor
> > atomic operations.
> > 
> > From userspace's point of view, accessing a per-cpu cgroup storage
> > is similar to accessing other per-cpu map types (e.g. per-cpu
> > hashmaps and arrays).
> > 
> > Writing to a per-cpu cgroup storage is not atomic, but it is
> > performed by copying longs, so some minimal atomicity is guaranteed,
> > exactly as with other per-cpu maps.
> > 
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Alexei Starovoitov <ast@kernel.org>
> > ---
> > include/linux/bpf-cgroup.h |  20 ++++-
> > include/linux/bpf.h        |   1 +
> > include/linux/bpf_types.h  |   1 +
> > include/uapi/linux/bpf.h   |   1 +
> > kernel/bpf/helpers.c       |   8 +-
> > kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
> > kernel/bpf/syscall.c       |  11 ++-
> > kernel/bpf/verifier.c      |  15 +++-
> > 8 files changed, 177 insertions(+), 28 deletions(-)
> > 
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index 7e0c9a1d48b7..9bd907657f9b 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -37,7 +37,10 @@ struct bpf_storage_buffer {
> > };
> > 
> > struct bpf_cgroup_storage {
> > -	struct bpf_storage_buffer *buf;
> > +	union {
> > +		struct bpf_storage_buffer *buf;
> > +		char __percpu *percpu_buf;
> 
> "char *" here looks weird. Did you mean to use "void *"?

Fair enough. It's probably a leftover from the previously used char[0].
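
In v3 I'll switch it to a void __percpu pointer, i.e. something like:

	union {
		struct bpf_storage_buffer *buf;
		void __percpu *percpu_buf;
	};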

> 
> > +	};
> > 	struct bpf_cgroup_storage_map *map;
> > 	struct bpf_cgroup_storage_key key;
> > 	struct list_head list;
> > @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> > static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> > 	struct bpf_map *map)
> > {
> > +	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> > +		return BPF_CGROUP_STORAGE_PERCPU;
> > +
> > 	return BPF_CGROUP_STORAGE_SHARED;
> > }
> > 
> > @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
> > int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
> > void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
> > 
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > +				     void *value, u64 flags);
> > +
> > /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> > #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
> > ({									      \
> > @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
> > 	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
> > static inline void bpf_cgroup_storage_free(
> > 	struct bpf_cgroup_storage *storage) {}
> > +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
> > +						 void *value) {
> > +	return 0;
> > +}
> > +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> > +					void *key, void *value, u64 flags) {
> > +	return 0;
> > +}
> > 
> > #define cgroup_bpf_enabled (0)
> > #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index b457fbe7b70b..018299a595c8 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -274,6 +274,7 @@ struct bpf_prog_offload {
> > 
> > enum bpf_cgroup_storage_type {
> > 	BPF_CGROUP_STORAGE_SHARED,
> > +	BPF_CGROUP_STORAGE_PERCPU,
> > 	__BPF_CGROUP_STORAGE_MAX
> > };
> > 
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index c9bd6fb765b0..5432f4c9f50e 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
> > #endif
> > #ifdef CONFIG_CGROUP_BPF
> > BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> > +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> > #endif
> > BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
> > BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index aa5ccd2385ed..e2070d819e04 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -127,6 +127,7 @@ enum bpf_map_type {
> > 	BPF_MAP_TYPE_SOCKHASH,
> > 	BPF_MAP_TYPE_CGROUP_STORAGE,
> > 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
> > +	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
> > };
> > 
> > enum bpf_prog_type {
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index e42f8789b7ea..1f21ef1c4ad3 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
> > 	 */
> > 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
> > 	struct bpf_cgroup_storage *storage;
> > +	void *ptr = NULL;
> 
> Not necessary to initialize to NULL.

Fixed.
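
The helper then looks like this (a sketch of what's queued for v3):

	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
	struct bpf_cgroup_storage *storage;
	void *ptr;

	storage = this_cpu_read(bpf_cgroup_storage[stype]);

	if (stype == BPF_CGROUP_STORAGE_SHARED)
		ptr = &READ_ONCE(storage->buf)->data[0];
	else
		ptr = this_cpu_ptr(storage->percpu_buf);

	return (unsigned long)ptr;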

> 
> > 
> > 	storage = this_cpu_read(bpf_cgroup_storage[stype]);
> > 
> > -	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
> > +	if (stype == BPF_CGROUP_STORAGE_SHARED)
> > +		ptr = &READ_ONCE(storage->buf)->data[0];
> > +	else
> > +		ptr = this_cpu_ptr(storage->percpu_buf);
> > +
> > +	return (unsigned long)ptr;
> > }
> > 
> > const struct bpf_func_proto bpf_get_local_storage_proto = {
> > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> > index 6742292fb39e..d991355b5b46 100644
> > --- a/kernel/bpf/local_storage.c
> > +++ b/kernel/bpf/local_storage.c
> > @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> > 	return 0;
> > }
> > 
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
> > +				   void *value)
> > +{
> > +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > +	struct bpf_cgroup_storage_key *key = _key;
> > +	struct bpf_cgroup_storage *storage;
> > +	int cpu, off = 0;
> > +	u32 size;
> > +
> > +	rcu_read_lock();
> > +	storage = cgroup_storage_lookup(map, key, false);
> > +	if (!storage) {
> > +		rcu_read_unlock();
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* per_cpu areas are zero-filled and bpf programs can only
> > +	 * access 'value_size' of them, so copying rounded areas
> > +	 * will not leak any kernel data
> > +	 */
> > +	size = round_up(_map->value_size, 8);
> > +	for_each_possible_cpu(cpu) {
> > +		bpf_long_memcpy(value + off,
> > +				per_cpu_ptr(storage->percpu_buf, cpu), size);
> > +		off += size;
> > +	}
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
> > +				     void *value, u64 map_flags)
> > +{
> > +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > +	struct bpf_cgroup_storage_key *key = _key;
> > +	struct bpf_cgroup_storage *storage;
> > +	int cpu, off = 0;
> > +	u32 size;
> > +
> > +	if (unlikely(map_flags & BPF_EXIST))
> > +		return -EINVAL;
> > +
> > +	rcu_read_lock();
> > +	storage = cgroup_storage_lookup(map, key, false);
> > +	if (!storage) {
> > +		rcu_read_unlock();
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* the user space will provide round_up(value_size, 8) bytes that
> > +	 * will be copied into per-cpu area. bpf programs can only access
> > +	 * value_size of it. During lookup the same extra bytes will be
> > +	 * returned or zeros which were zero-filled by percpu_alloc,
> > +	 * so no kernel data leaks possible
> > +	 */
> > +	size = round_up(_map->value_size, 8);
> > +	for_each_possible_cpu(cpu) {
> > +		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
> > +				value + off, size);
> > +		off += size;
> > +	}
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> > static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
> > 				       void *_next_key)
> > {
> > @@ -292,55 +357,98 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
> > {
> > 	struct bpf_cgroup_storage *storage;
> > 	struct bpf_map *map;
> > +	gfp_t flags;
> > +	size_t size;
> > 	u32 pages;
> > 
> > 	map = prog->aux->cgroup_storage[stype];
> > 	if (!map)
> > 		return NULL;
> > 
> > -	pages = round_up(sizeof(struct bpf_cgroup_storage) +
> > -			 sizeof(struct bpf_storage_buffer) +
> > -			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> > +	if (stype == BPF_CGROUP_STORAGE_SHARED) {
> > +		size = sizeof(struct bpf_storage_buffer) + map->value_size;
> > +		pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
> > +				 PAGE_SIZE) >> PAGE_SHIFT;
> > +	} else {
> > +		size = map->value_size;
> > +		pages = round_up(round_up(size, 8) * num_possible_cpus(),
> > +				 PAGE_SIZE) >> PAGE_SHIFT;
> > +	}
> > +
> > 	if (bpf_map_charge_memlock(map, pages))
> > 		return ERR_PTR(-EPERM);
> > 
> > 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
> > 			       __GFP_ZERO | GFP_USER, map->numa_node);
> > -	if (!storage) {
> > -		bpf_map_uncharge_memlock(map, pages);
> > -		return ERR_PTR(-ENOMEM);
> > -	}
> > +	if (!storage)
> > +		goto enomem;
> > 
> > -	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
> > -				    map->value_size, __GFP_ZERO | GFP_USER,
> > -				    map->numa_node);
> > -	if (!storage->buf) {
> > -		bpf_map_uncharge_memlock(map, pages);
> > -		kfree(storage);
> > -		return ERR_PTR(-ENOMEM);
> > +	flags = __GFP_ZERO | GFP_USER;
> > +
> > +	if (stype == BPF_CGROUP_STORAGE_SHARED) {
> > +		storage->buf = kmalloc_node(size, flags, map->numa_node);
> > +		if (!storage->buf)
> > +			goto enomem;
> > +	} else {
> > +		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
> > +		if (!storage->percpu_buf)
> > +			goto enomem;
> > 	}
> > 
> > 	storage->map = (struct bpf_cgroup_storage_map *)map;
> > 
> > 	return storage;
> > +
> > +enomem:
> > +	bpf_map_uncharge_memlock(map, pages);
> > +	kfree(storage);
> > +	return ERR_PTR(-ENOMEM);
> > +}
> > +
> > +static void free_cgroup_storage_rcu(struct rcu_head *rcu)
> 
> Maybe rename as free_shared_cgroup_storage_rcu()?

Yeah, that might be clearer.
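
Something along these lines then (a sketch; the per-cpu counterpart is
shown for symmetry):

	static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
	{
		struct bpf_cgroup_storage *storage =
			container_of(rcu, struct bpf_cgroup_storage, rcu);

		kfree(storage->buf);
		kfree(storage);
	}

	static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
	{
		struct bpf_cgroup_storage *storage =
			container_of(rcu, struct bpf_cgroup_storage, rcu);

		free_percpu(storage->percpu_buf);
		kfree(storage);
	}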

Thank you for the review!
Will send v3 with these changes and your acks soon.

Thanks!
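
P.S. To illustrate the userspace side mentioned in the commit message:
a lookup on such a map returns one value per possible CPU, each slot
padded to 8 bytes, exactly as with per-cpu hashmaps and arrays. A rough
sketch (map_fd and cgrp_id are placeholders; bpf_num_possible_cpus() is
the helper from tools/testing/selftests/bpf/bpf_util.h):

	unsigned int ncpus = bpf_num_possible_cpus();
	__u64 values[ncpus];	/* assumes value_size == 8 */
	__u64 total = 0;
	unsigned int i;

	struct bpf_cgroup_storage_key key = {
		.cgroup_inode_id = cgrp_id,
		.attach_type = BPF_CGROUP_INET_EGRESS,
	};

	/* sum the per-cpu counters into a single total */
	if (bpf_map_lookup_elem(map_fd, &key, values) == 0) {
		for (i = 0; i < ncpus; i++)
			total += values[i];
	}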

Thread overview: 15+ messages
2018-09-25 15:21 [PATCH v2 bpf-next 00/10] bpf: per-cpu cgroup local storage Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 01/10] bpf: extend cgroup bpf core to allow multiple cgroup storage types Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 02/10] bpf: rework cgroup storage pointer passing Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage Roman Gushchin
2018-09-25 15:21   ` Roman Gushchin
2018-09-25 18:54   ` Song Liu
2018-09-26  8:42     ` Roman Gushchin [this message]
2018-09-25 15:21 ` [PATCH v2 bpf-next 04/10] bpf: don't allow create maps of per-cpu cgroup local storages Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 05/10] bpf: sync include/uapi/linux/bpf.h to tools/include/uapi/linux/bpf.h Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 06/10] bpftool: add support for PERCPU_CGROUP_STORAGE maps Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 07/10] selftests/bpf: add verifier per-cpu cgroup storage tests Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 08/10] selftests/bpf: extend the storage test to test per-cpu cgroup storage Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 09/10] samples/bpf: extend test_cgrp2_attach2 test to use " Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 10/10] selftests/bpf: cgroup local storage-based network counters Roman Gushchin
2018-09-25 19:05 ` [PATCH v2 bpf-next 00/10] bpf: per-cpu cgroup local storage Song Liu
