All of lore.kernel.org
 help / color / mirror / Atom feed
From: Roman Gushchin <guro@fb.com>
To: <bpf@vger.kernel.org>
Cc: <ast@kernel.org>, <daniel@iogearbox.net>,
	<netdev@vger.kernel.org>, <andrii@kernel.org>,
	<akpm@linux-foundation.org>, <linux-mm@kvack.org>,
	<linux-kernel@vger.kernel.org>, <kernel-team@fb.com>
Subject: [PATCH bpf-next v7 06/34] bpf: prepare for memcg-based memory accounting for bpf maps
Date: Thu, 19 Nov 2020 09:37:26 -0800	[thread overview]
Message-ID: <20201119173754.4125257-7-guro@fb.com> (raw)
In-Reply-To: <20201119173754.4125257-1-guro@fb.com>

In the vast majority of cases, if a process makes a kernel
allocation, its memory cgroup gets charged.

Bpf maps can be updated from an interrupt context, and in such a
case there is no process which can be charged. This makes the memory
accounting of bpf maps non-trivial.

Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel
memcg accounting from interrupt contexts") and b87d8cefe43c
("mm, memcg: rework remote charging API to support nesting")
it's finally possible.

To do it, a pointer to the memory cgroup of the process that created
the map is saved, and this cgroup can be charged for all allocations
made from an interrupt context. This commit introduces 2 helpers:
bpf_map_kmalloc_node() and bpf_map_alloc_percpu(). They can be used in
the bpf code for accounted memory allocations, both in the process and
interrupt contexts. In the interrupt context they use the saved
memory cgroup; otherwise the current cgroup gets charged.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/bpf.h  | 26 +++++++++++++++
 kernel/bpf/syscall.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1bcb6d7345c..b11436cb9e3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/capability.h>
+#include <linux/slab.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -37,6 +38,7 @@ struct bpf_iter_aux_info;
 struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
+struct mem_cgroup;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -161,6 +163,9 @@ struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	struct bpf_map_memory memory;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg;
+#endif
 	char name[BPF_OBJ_NAME_LEN];
 	u32 btf_vmlinux_value_type_id;
 	bool bypass_spec_v1;
@@ -1240,6 +1245,27 @@ int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
+#ifdef CONFIG_MEMCG_KMEM
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node);
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t gfp);
+#else
+static inline void *
+bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+		     int node)
+{
+	return kmalloc_node(size, flags, node);
+}
+
+static inline void __percpu *
+bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
+		     gfp_t gfp)
+{
+	return __alloc_percpu_gfp(size, align, gfp);
+}
+#endif
+
 extern int sysctl_unprivileged_bpf_disabled;
 
 static inline bool bpf_allow_ptr_leaks(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f3fe9f53f93c..4154c616788c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -31,6 +31,8 @@
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/mm.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -456,6 +458,77 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 		__release(&map_idr_lock);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+	map->memcg = get_mem_cgroup_from_mm(current->mm);
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+	mem_cgroup_put(map->memcg);
+}
+
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+			   int node)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	void *ptr;
+
+	/*
+	 * In interrupt context the memory cgroup to charge can't be derived
+	 * from the current task, so charge the memory cgroup saved in
+	 * map->memcg (that of the process which created the map) and
+	 * restore the previously active memcg once the allocation is done.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ptr = kmalloc_node(size, flags, node);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+				    size_t align, gfp_t gfp)
+{
+	struct mem_cgroup *old_memcg;
+	bool in_interrupt;
+	void __percpu *ptr;
+
+	/*
+	 * In interrupt context the memory cgroup to charge can't be derived
+	 * from the current task, so charge the memory cgroup saved in
+	 * map->memcg (that of the process which created the map) and
+	 * restore the previously active memcg once the allocation is done.
+	 */
+	in_interrupt = in_interrupt();
+	if (in_interrupt)
+		old_memcg = set_active_memcg(map->memcg);
+
+	ptr = __alloc_percpu_gfp(size, align, gfp);
+
+	if (in_interrupt)
+		set_active_memcg(old_memcg);
+
+	return ptr;
+}
+
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+}
+#endif
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -464,6 +537,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
+	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 	bpf_map_charge_finish(&mem);
@@ -875,6 +949,8 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_sec;
 
+	bpf_map_save_memcg(map);
+
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.
-- 
2.26.2


  parent reply	other threads:[~2020-11-19 17:41 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-19 17:37 [PATCH bpf-next v7 00/34] bpf: switch to memcg-based memory accounting Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 01/34] mm: memcontrol: use helpers to read page's memcg data Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 02/34] mm: memcontrol/slab: use helpers to access slab page's memcg_data Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 03/34] mm: introduce page memcg flags Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 04/34] mm: convert page kmemcg type to a page memcg flag Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 05/34] bpf: memcg-based memory accounting for bpf progs Roman Gushchin
2020-11-19 17:37 ` Roman Gushchin [this message]
2020-11-20  1:05   ` [PATCH bpf-next v7 06/34] bpf: prepare for memcg-based memory accounting for bpf maps Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 07/34] bpf: " Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 08/34] bpf: refine memcg-based memory accounting for arraymap maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 09/34] bpf: refine memcg-based memory accounting for cpumap maps Roman Gushchin
2020-11-20  1:33   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 10/34] bpf: memcg-based memory accounting for cgroup storage maps Roman Gushchin
2020-11-20  1:37   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 11/34] bpf: refine memcg-based memory accounting for devmap maps Roman Gushchin
2020-11-20  1:38   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 12/34] bpf: refine memcg-based memory accounting for hashtab maps Roman Gushchin
2020-11-20  1:39   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 13/34] bpf: memcg-based memory accounting for lpm_trie maps Roman Gushchin
2020-11-20  1:39   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 14/34] bpf: memcg-based memory accounting for bpf ringbuffer Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 15/34] bpf: memcg-based memory accounting for bpf local storage maps Roman Gushchin
2020-11-20  1:44   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 16/34] bpf: refine memcg-based memory accounting for sockmap and sockhash maps Roman Gushchin
2020-11-20  1:45   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 17/34] bpf: refine memcg-based memory accounting for xskmap maps Roman Gushchin
2020-11-20  1:45   ` Song Liu
2020-11-19 17:37 ` [PATCH bpf-next v7 18/34] bpf: eliminate rlimit-based memory accounting for arraymap maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 19/34] bpf: eliminate rlimit-based memory accounting for bpf_struct_ops maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 20/34] bpf: eliminate rlimit-based memory accounting for cpumap maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 21/34] bpf: eliminate rlimit-based memory accounting for cgroup storage maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 22/34] bpf: eliminate rlimit-based memory accounting for devmap maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 23/34] bpf: eliminate rlimit-based memory accounting for hashtab maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 24/34] bpf: eliminate rlimit-based memory accounting for lpm_trie maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 25/34] bpf: eliminate rlimit-based memory accounting for queue_stack_maps maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 26/34] bpf: eliminate rlimit-based memory accounting for reuseport_array maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 27/34] bpf: eliminate rlimit-based memory accounting for bpf ringbuffer Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 28/34] bpf: eliminate rlimit-based memory accounting for sockmap and sockhash maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 29/34] bpf: eliminate rlimit-based memory accounting for stackmap maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 30/34] bpf: eliminate rlimit-based memory accounting for xskmap maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 31/34] bpf: eliminate rlimit-based memory accounting for bpf local storage maps Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 32/34] bpf: eliminate rlimit-based memory accounting infra for bpf maps Roman Gushchin
2020-11-21  2:52   ` Alexei Starovoitov
2020-11-21  2:59     ` Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 33/34] bpf: eliminate rlimit-based memory accounting for bpf progs Roman Gushchin
2020-11-19 17:37 ` [PATCH bpf-next v7 34/34] bpf: samples: do not touch RLIMIT_MEMLOCK Roman Gushchin
2020-11-23 13:30 ` [PATCH bpf-next v7 00/34] bpf: switch to memcg-based memory accounting Daniel Borkmann
2020-11-24  0:05   ` Roman Gushchin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201119173754.4125257-7-guro@fb.com \
    --to=guro@fb.com \
    --cc=akpm@linux-foundation.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.