linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Roman Gushchin <guro@fb.com>
To: <bpf@vger.kernel.org>
Cc: <ast@kernel.org>, <daniel@iogearbox.net>,
	<netdev@vger.kernel.org>, <andrii@kernel.org>,
	<akpm@linux-foundation.org>, <linux-mm@kvack.org>,
	<linux-kernel@vger.kernel.org>, <kernel-team@fb.com>
Subject: [PATCH bpf-next v9 06/34] bpf: prepare for memcg-based memory accounting for bpf maps
Date: Tue, 1 Dec 2020 13:58:32 -0800	[thread overview]
Message-ID: <20201201215900.3569844-7-guro@fb.com> (raw)
In-Reply-To: <20201201215900.3569844-1-guro@fb.com>

Bpf maps can be updated from an interrupt context, and in such a
case there is no process which can be charged. This makes the memory
accounting of bpf maps non-trivial.

Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
memcg accounting from interrupt contexts") and commit b87d8cefe43c
("mm, memcg: rework remote charging API to support nesting")
it's finally possible.

To make the ownership model simple and consistent, when the map
is created, the memory cgroup of the current process is recorded.
All subsequent allocations related to the bpf map are charged to
the same memory cgroup. This includes allocations made by any process
(even one that belongs to a different cgroup) and from interrupts.

This commit introduces 3 new helpers, which will be used by following
commits to enable the accounting of bpf maps memory:
  - bpf_map_kmalloc_node()
  - bpf_map_kzalloc()
  - bpf_map_alloc_percpu()

They wrap popular memory allocation functions. They set
the active memory cgroup to the map's memory cgroup and add
__GFP_ACCOUNT to the passed gfp flags. Then they call into
the corresponding memory allocation function and restore
the original active memory cgroup.

These helpers are supposed to be used everywhere except the map creation
path. During map creation, while the map structure itself is still being
allocated, it cannot be passed to those helpers. In those cases the default
memory allocation functions will be used with the __GFP_ACCOUNT flag.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/bpf.h  | 34 ++++++++++++++++++++++++
 kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1bcb6d7345c..e1f2c95c15ec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -20,6 +20,8 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/capability.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -37,6 +39,7 @@ struct bpf_iter_aux_info;
 struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
+struct mem_cgroup;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -161,6 +164,9 @@ struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	struct bpf_map_memory memory;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg;
+#endif
 	char name[BPF_OBJ_NAME_LEN];
 	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
@@ -1240,6 +1246,34 @@ int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
+#ifdef CONFIG_MEMCG_KMEM
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node);
+void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t flags);
+#else
+static inline void *
+bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+		     int node)
+{
+	return kmalloc_node(size, flags, node);
+}
+
+static inline void *
+bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
+{
+	return kzalloc(size, flags);
+}
+
+static inline void __percpu *
+bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
+		     gfp_t flags)
+{
+	return __alloc_percpu_gfp(size, align, flags);
+}
+#endif
+
 extern int sysctl_unprivileged_bpf_disabled;
 
 static inline bool bpf_allow_ptr_leaks(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f3fe9f53f93c..dedbf6d4cd84 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -31,6 +31,7 @@
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -456,6 +457,65 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 		__release(&map_idr_lock);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+	map->memcg = get_mem_cgroup_from_mm(current->mm);
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+	mem_cgroup_put(map->memcg);
+}
+
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node)
+{
+	struct mem_cgroup *old_memcg;
+	void *ptr;
+
+	old_memcg = set_active_memcg(map->memcg);
+	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
+	set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
+{
+	struct mem_cgroup *old_memcg;
+	void *ptr;
+
+	old_memcg = set_active_memcg(map->memcg);
+	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
+	set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t flags)
+{
+	struct mem_cgroup *old_memcg;
+	void __percpu *ptr;
+
+	old_memcg = set_active_memcg(map->memcg);
+	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
+	set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+}
+#endif
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -464,6 +524,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
+	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 	bpf_map_charge_finish(&mem);
@@ -875,6 +936,8 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_sec;
 
+	bpf_map_save_memcg(map);
+
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.
-- 
2.26.2


  parent reply	other threads:[~2020-12-01 22:00 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-01 21:58 [PATCH bpf-next v9 00/34] bpf: switch to memcg-based memory accounting Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 01/34] mm: memcontrol: use helpers to read page's memcg data Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 02/34] mm: memcontrol/slab: use helpers to access slab page's memcg_data Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 03/34] mm: introduce page memcg flags Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 04/34] mm: convert page kmemcg type to a page memcg flag Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 05/34] bpf: memcg-based memory accounting for bpf progs Roman Gushchin
2020-12-01 21:58 ` Roman Gushchin [this message]
2020-12-01 21:58 ` [PATCH bpf-next v9 07/34] bpf: memcg-based memory accounting for bpf maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 08/34] bpf: refine memcg-based memory accounting for arraymap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 09/34] bpf: refine memcg-based memory accounting for cpumap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 10/34] bpf: memcg-based memory accounting for cgroup storage maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 11/34] bpf: refine memcg-based memory accounting for devmap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 12/34] bpf: refine memcg-based memory accounting for hashtab maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 13/34] bpf: memcg-based memory accounting for lpm_trie maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 14/34] bpf: memcg-based memory accounting for bpf ringbuffer Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 15/34] bpf: memcg-based memory accounting for bpf local storage maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 16/34] bpf: refine memcg-based memory accounting for sockmap and sockhash maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 17/34] bpf: refine memcg-based memory accounting for xskmap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 18/34] bpf: eliminate rlimit-based memory accounting for arraymap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 19/34] bpf: eliminate rlimit-based memory accounting for bpf_struct_ops maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 20/34] bpf: eliminate rlimit-based memory accounting for cpumap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 21/34] bpf: eliminate rlimit-based memory accounting for cgroup storage maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 22/34] bpf: eliminate rlimit-based memory accounting for devmap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 23/34] bpf: eliminate rlimit-based memory accounting for hashtab maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 24/34] bpf: eliminate rlimit-based memory accounting for lpm_trie maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 25/34] bpf: eliminate rlimit-based memory accounting for queue_stack_maps maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 26/34] bpf: eliminate rlimit-based memory accounting for reuseport_array maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 27/34] bpf: eliminate rlimit-based memory accounting for bpf ringbuffer Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 28/34] bpf: eliminate rlimit-based memory accounting for sockmap and sockhash maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 29/34] bpf: eliminate rlimit-based memory accounting for stackmap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 30/34] bpf: eliminate rlimit-based memory accounting for xskmap maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 31/34] bpf: eliminate rlimit-based memory accounting for bpf local storage maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 32/34] bpf: eliminate rlimit-based memory accounting infra for bpf maps Roman Gushchin
2020-12-01 21:58 ` [PATCH bpf-next v9 33/34] bpf: eliminate rlimit-based memory accounting for bpf progs Roman Gushchin
2020-12-01 21:59 ` [PATCH bpf-next v9 34/34] bpf: samples: do not touch RLIMIT_MEMLOCK Roman Gushchin
2020-12-03  2:50 ` [PATCH bpf-next v9 00/34] bpf: switch to memcg-based memory accounting patchwork-bot+netdevbpf
2020-12-03  2:54 ` Alexei Starovoitov
2020-12-03  3:26   ` Roman Gushchin
2020-12-05  0:37     ` Daniel Borkmann
2020-12-08  2:53       ` Andrii Nakryiko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201201215900.3569844-7-guro@fb.com \
    --to=guro@fb.com \
    --cc=akpm@linux-foundation.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).