From: Roman Gushchin <guro@fb.com>
To: Song Liu <songliubraving@fb.com>
Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Kernel Team <Kernel-team@fb.com>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"Alexei Starovoitov" <ast@kernel.org>
Subject: Re: [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage
Date: Wed, 26 Sep 2018 08:42:23 +0000
Message-ID: <20180926084208.GA25056@castle.DHCP.thefacebook.com>
In-Reply-To: <27B81458-17C5-4840-9679-D1F5FBF6E805@fb.com>

On Tue, Sep 25, 2018 at 11:54:33AM -0700, Song Liu wrote:
> 
> 
> > On Sep 25, 2018, at 8:21 AM, Roman Gushchin <guro@fb.com> wrote:
> > 
> > This commit introduces per-cpu cgroup local storage.
> > 
> > Per-cpu cgroup local storage is very similar to simple cgroup storage
> > (let's call it shared), except that all the data is per-cpu.
> > 
> > The main goal of the per-cpu variant is to implement super-fast
> > counters (e.g. packet counters), which require neither lookups nor
> > atomic operations.
> > 
> > From userspace's point of view, accessing a per-cpu cgroup storage
> > is similar to accessing other per-cpu map types (e.g. per-cpu
> > hashmaps and arrays).
> > 
> > Writing to a per-cpu cgroup storage is not atomic, but it is
> > performed by copying longs, so some minimal atomicity is guaranteed,
> > exactly as with other per-cpu maps.
> > 
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Alexei Starovoitov <ast@kernel.org>
> > ---
> > include/linux/bpf-cgroup.h |  20 ++++-
> > include/linux/bpf.h        |   1 +
> > include/linux/bpf_types.h  |   1 +
> > include/uapi/linux/bpf.h   |   1 +
> > kernel/bpf/helpers.c       |   8 +-
> > kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
> > kernel/bpf/syscall.c       |  11 ++-
> > kernel/bpf/verifier.c      |  15 +++-
> > 8 files changed, 177 insertions(+), 28 deletions(-)
> > 
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index 7e0c9a1d48b7..9bd907657f9b 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -37,7 +37,10 @@ struct bpf_storage_buffer {
> > };
> > 
> > struct bpf_cgroup_storage {
> > -	struct bpf_storage_buffer *buf;
> > +	union {
> > +		struct bpf_storage_buffer *buf;
> > +		char __percpu *percpu_buf;
> 
> "char *" here looks weird. Did you mean to use "void *"?

Fair enough. It's probably a leftover from the previously used char[0].
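
In v3 I'll switch it to a void __percpu pointer, i.e. something like:

	union {
		struct bpf_storage_buffer *buf;
		void __percpu *percpu_buf;
	};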

> 
> > +	};
> > 	struct bpf_cgroup_storage_map *map;
> > 	struct bpf_cgroup_storage_key key;
> > 	struct list_head list;
> > @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> > static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> > 	struct bpf_map *map)
> > {
> > +	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> > +		return BPF_CGROUP_STORAGE_PERCPU;
> > +
> > 	return BPF_CGROUP_STORAGE_SHARED;
> > }
> > 
> > @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
> > int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
> > void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
> > 
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > +				     void *value, u64 flags);
> > +
> > /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> > #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
> > ({									      \
> > @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
> > 	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
> > static inline void bpf_cgroup_storage_free(
> > 	struct bpf_cgroup_storage *storage) {}
> > +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
> > +						 void *value) {
> > +	return 0;
> > +}
> > +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> > +					void *key, void *value, u64 flags) {
> > +	return 0;
> > +}
> > 
> > #define cgroup_bpf_enabled (0)
> > #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index b457fbe7b70b..018299a595c8 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -274,6 +274,7 @@ struct bpf_prog_offload {
> > 
> > enum bpf_cgroup_storage_type {
> > 	BPF_CGROUP_STORAGE_SHARED,
> > +	BPF_CGROUP_STORAGE_PERCPU,
> > 	__BPF_CGROUP_STORAGE_MAX
> > };
> > 
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index c9bd6fb765b0..5432f4c9f50e 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
> > #endif
> > #ifdef CONFIG_CGROUP_BPF
> > BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> > +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> > #endif
> > BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
> > BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index aa5ccd2385ed..e2070d819e04 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -127,6 +127,7 @@ enum bpf_map_type {
> > 	BPF_MAP_TYPE_SOCKHASH,
> > 	BPF_MAP_TYPE_CGROUP_STORAGE,
> > 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
> > +	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
> > };
> > 
> > enum bpf_prog_type {
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index e42f8789b7ea..1f21ef1c4ad3 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
> > 	 */
> > 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
> > 	struct bpf_cgroup_storage *storage;
> > +	void *ptr = NULL;
> 
> Not necessary to initialize to NULL.

Fixed.
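
The helper then looks like this (a sketch of what's queued for v3):

	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
	struct bpf_cgroup_storage *storage;
	void *ptr;

	storage = this_cpu_read(bpf_cgroup_storage[stype]);

	if (stype == BPF_CGROUP_STORAGE_SHARED)
		ptr = &READ_ONCE(storage->buf)->data[0];
	else
		ptr = this_cpu_ptr(storage->percpu_buf);

	return (unsigned long)ptr;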

> 
> > 
> > 	storage = this_cpu_read(bpf_cgroup_storage[stype]);
> > 
> > -	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
> > +	if (stype == BPF_CGROUP_STORAGE_SHARED)
> > +		ptr = &READ_ONCE(storage->buf)->data[0];
> > +	else
> > +		ptr = this_cpu_ptr(storage->percpu_buf);
> > +
> > +	return (unsigned long)ptr;
> > }
> > 
> > const struct bpf_func_proto bpf_get_local_storage_proto = {
> > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> > index 6742292fb39e..d991355b5b46 100644
> > --- a/kernel/bpf/local_storage.c
> > +++ b/kernel/bpf/local_storage.c
> > @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> > 	return 0;
> > }
> > 
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
> > +				   void *value)
> > +{
> > +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > +	struct bpf_cgroup_storage_key *key = _key;
> > +	struct bpf_cgroup_storage *storage;
> > +	int cpu, off = 0;
> > +	u32 size;
> > +
> > +	rcu_read_lock();
> > +	storage = cgroup_storage_lookup(map, key, false);
> > +	if (!storage) {
> > +		rcu_read_unlock();
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* per_cpu areas are zero-filled and bpf programs can only
> > +	 * access 'value_size' of them, so copying rounded areas
> > +	 * will not leak any kernel data
> > +	 */
> > +	size = round_up(_map->value_size, 8);
> > +	for_each_possible_cpu(cpu) {
> > +		bpf_long_memcpy(value + off,
> > +				per_cpu_ptr(storage->percpu_buf, cpu), size);
> > +		off += size;
> > +	}
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
> > +				     void *value, u64 map_flags)
> > +{
> > +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > +	struct bpf_cgroup_storage_key *key = _key;
> > +	struct bpf_cgroup_storage *storage;
> > +	int cpu, off = 0;
> > +	u32 size;
> > +
> > +	if (unlikely(map_flags & BPF_EXIST))
> > +		return -EINVAL;
> > +
> > +	rcu_read_lock();
> > +	storage = cgroup_storage_lookup(map, key, false);
> > +	if (!storage) {
> > +		rcu_read_unlock();
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* the user space will provide round_up(value_size, 8) bytes that
> > +	 * will be copied into per-cpu area. bpf programs can only access
> > +	 * value_size of it. During lookup the same extra bytes will be
> > +	 * returned or zeros which were zero-filled by percpu_alloc,
> > +	 * so no kernel data leaks possible
> > +	 */
> > +	size = round_up(_map->value_size, 8);
> > +	for_each_possible_cpu(cpu) {
> > +		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
> > +				value + off, size);
> > +		off += size;
> > +	}
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> > static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
> > 				       void *_next_key)
> > {
> > @@ -292,55 +357,98 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
> > {
> > 	struct bpf_cgroup_storage *storage;
> > 	struct bpf_map *map;
> > +	gfp_t flags;
> > +	size_t size;
> > 	u32 pages;
> > 
> > 	map = prog->aux->cgroup_storage[stype];
> > 	if (!map)
> > 		return NULL;
> > 
> > -	pages = round_up(sizeof(struct bpf_cgroup_storage) +
> > -			 sizeof(struct bpf_storage_buffer) +
> > -			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> > +	if (stype == BPF_CGROUP_STORAGE_SHARED) {
> > +		size = sizeof(struct bpf_storage_buffer) + map->value_size;
> > +		pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
> > +				 PAGE_SIZE) >> PAGE_SHIFT;
> > +	} else {
> > +		size = map->value_size;
> > +		pages = round_up(round_up(size, 8) * num_possible_cpus(),
> > +				 PAGE_SIZE) >> PAGE_SHIFT;
> > +	}
> > +
> > 	if (bpf_map_charge_memlock(map, pages))
> > 		return ERR_PTR(-EPERM);
> > 
> > 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
> > 			       __GFP_ZERO | GFP_USER, map->numa_node);
> > -	if (!storage) {
> > -		bpf_map_uncharge_memlock(map, pages);
> > -		return ERR_PTR(-ENOMEM);
> > -	}
> > +	if (!storage)
> > +		goto enomem;
> > 
> > -	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
> > -				    map->value_size, __GFP_ZERO | GFP_USER,
> > -				    map->numa_node);
> > -	if (!storage->buf) {
> > -		bpf_map_uncharge_memlock(map, pages);
> > -		kfree(storage);
> > -		return ERR_PTR(-ENOMEM);
> > +	flags = __GFP_ZERO | GFP_USER;
> > +
> > +	if (stype == BPF_CGROUP_STORAGE_SHARED) {
> > +		storage->buf = kmalloc_node(size, flags, map->numa_node);
> > +		if (!storage->buf)
> > +			goto enomem;
> > +	} else {
> > +		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
> > +		if (!storage->percpu_buf)
> > +			goto enomem;
> > 	}
> > 
> > 	storage->map = (struct bpf_cgroup_storage_map *)map;
> > 
> > 	return storage;
> > +
> > +enomem:
> > +	bpf_map_uncharge_memlock(map, pages);
> > +	kfree(storage);
> > +	return ERR_PTR(-ENOMEM);
> > +}
> > +
> > +static void free_cgroup_storage_rcu(struct rcu_head *rcu)
> 
> Maybe rename as free_shared_cgroup_storage_rcu()?

Yeah, that might be clearer.
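
Something along these lines then (a sketch; the per-cpu counterpart is
shown for symmetry):

	static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
	{
		struct bpf_cgroup_storage *storage =
			container_of(rcu, struct bpf_cgroup_storage, rcu);

		kfree(storage->buf);
		kfree(storage);
	}

	static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
	{
		struct bpf_cgroup_storage *storage =
			container_of(rcu, struct bpf_cgroup_storage, rcu);

		free_percpu(storage->percpu_buf);
		kfree(storage);
	}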

Thank you for the review!
Will send v3 with these changes and your acks soon.

Thanks!
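
P.S. To illustrate the userspace side mentioned in the commit message:
a lookup on such a map returns one value per possible CPU, each slot
padded to 8 bytes, exactly as with per-cpu hashmaps and arrays. A rough
sketch (map_fd and cgrp_id are placeholders; bpf_num_possible_cpus() is
the helper from tools/testing/selftests/bpf/bpf_util.h):

	unsigned int ncpus = bpf_num_possible_cpus();
	__u64 values[ncpus];	/* assumes value_size == 8 */
	__u64 total = 0;
	unsigned int i;

	struct bpf_cgroup_storage_key key = {
		.cgroup_inode_id = cgrp_id,
		.attach_type = BPF_CGROUP_INET_EGRESS,
	};

	/* sum the per-cpu counters into a single total */
	if (bpf_map_lookup_elem(map_fd, &key, values) == 0) {
		for (i = 0; i < ncpus; i++)
			total += values[i];
	}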

Thread overview: 15+ messages
2018-09-25 15:21 [PATCH v2 bpf-next 00/10] bpf: per-cpu cgroup local storage Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 01/10] bpf: extend cgroup bpf core to allow multiple cgroup storage types Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 02/10] bpf: rework cgroup storage pointer passing Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage Roman Gushchin
2018-09-25 15:21   ` Roman Gushchin
2018-09-25 18:54   ` Song Liu
2018-09-26  8:42     ` Roman Gushchin [this message]
2018-09-25 15:21 ` [PATCH v2 bpf-next 04/10] bpf: don't allow create maps of per-cpu cgroup local storages Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 05/10] bpf: sync include/uapi/linux/bpf.h to tools/include/uapi/linux/bpf.h Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 06/10] bpftool: add support for PERCPU_CGROUP_STORAGE maps Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 07/10] selftests/bpf: add verifier per-cpu cgroup storage tests Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 08/10] selftests/bpf: extend the storage test to test per-cpu cgroup storage Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 09/10] samples/bpf: extend test_cgrp2_attach2 test to use " Roman Gushchin
2018-09-25 15:21 ` [PATCH v2 bpf-next 10/10] selftests/bpf: cgroup local storage-based network counters Roman Gushchin
2018-09-25 19:05 ` [PATCH v2 bpf-next 00/10] bpf: per-cpu cgroup local storage Song Liu
