bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Hou Tao <houtao@huaweicloud.com>
To: bpf@vger.kernel.org
Cc: Martin KaFai Lau <martin.lau@linux.dev>,
	Andrii Nakryiko <andrii@kernel.org>, Song Liu <song@kernel.org>,
	Hao Luo <haoluo@google.com>, Yonghong Song <yhs@fb.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	KP Singh <kpsingh@kernel.org>,
	Stanislav Fomichev <sdf@google.com>, Jiri Olsa <jolsa@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>,
	"Paul E . McKenney" <paulmck@kernel.org>,
	rcu@vger.kernel.org, houtao1@huawei.com
Subject: [RFC PATCH bpf-next 1/6] bpf: Support ctor in bpf memory allocator
Date: Fri, 30 Dec 2022 12:11:46 +0800	[thread overview]
Message-ID: <20221230041151.1231169-2-houtao@huaweicloud.com> (raw)
In-Reply-To: <20221230041151.1231169-1-houtao@huaweicloud.com>

From: Hou Tao <houtao1@huawei.com>

Currently the freed element in bpf memory allocator may be immediately
reused, for htab map the reuse will reinitialize special fields in map
value (e.g., bpf_spin_lock), but lookup procedure may still access
these special fields, and it may lead to hard-lockup as shown below:

 NMI backtrace for cpu 16
 CPU: 16 PID: 2574 Comm: htab.bin Tainted: G             L     6.1.0+ #1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
 RIP: 0010:queued_spin_lock_slowpath+0x283/0x2c0
 ......
 Call Trace:
  <TASK>
  copy_map_value_locked+0xb7/0x170
  bpf_map_copy_value+0x113/0x3c0
  __sys_bpf+0x1c67/0x2780
  __x64_sys_bpf+0x1c/0x20
  do_syscall_64+0x30/0x60
  entry_SYSCALL_64_after_hwframe+0x46/0xb0
 ......
  </TASK>

For htab map, just like the preallocated case, these is no need to
initialize these special fields in map value again once these fields
have been initialized, but now only bpf memory allocator knows whether
or not an allocated object is reused or not. So introducing ctor support
in bpf memory allocator and calling ctor for the allocated object only
when it is newly allocated.

Fixes: 0fd7c5d43339 ("bpf: Optimize call_rcu in non-preallocated hash map.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf_mem_alloc.h |  4 +++-
 kernel/bpf/core.c             |  2 +-
 kernel/bpf/hashtab.c          | 16 ++++++++++++----
 kernel/bpf/memalloc.c         | 10 +++++++++-
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3e164b8efaa9..3c287db087e7 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -12,9 +12,11 @@ struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
 	struct work_struct work;
+	void (*ctor)(struct bpf_mem_alloc *ma, void *obj);
 };
 
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu,
+		       void (*ctor)(struct bpf_mem_alloc *, void *));
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
 /* kmalloc/kfree equivalent: */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f98dec6e90f..6da2f9a6b085 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2755,7 +2755,7 @@ static int __init bpf_global_ma_init(void)
 {
 	int ret;
 
-	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false, NULL);
 	bpf_global_ma_set = !ret;
 	return ret;
 }
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5aa2b5525f79..3d6557ec4b92 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -453,6 +453,15 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }
 
+static void htab_elem_ctor(struct bpf_mem_alloc *ma, void *obj)
+{
+	struct bpf_htab *htab = container_of(ma, struct bpf_htab, ma);
+	struct htab_elem *elem = obj;
+
+	check_and_init_map_value(&htab->map,
+				 elem->key + round_up(htab->map.key_size, 8));
+}
+
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -565,12 +574,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 				goto free_prealloc;
 		}
 	} else {
-		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false,
+					 htab_elem_ctor);
 		if (err)
 			goto free_map_locked;
 		if (percpu) {
 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
-						 round_up(htab->map.value_size, 8), true);
+						 round_up(htab->map.value_size, 8), true, NULL);
 			if (err)
 				goto free_map_locked;
 		}
@@ -1004,8 +1014,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
-		check_and_init_map_value(&htab->map,
-					 l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index ebcc3dd0fa19..ac5b92fece14 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -98,6 +98,7 @@ struct bpf_mem_cache {
 	int free_cnt;
 	int low_watermark, high_watermark, batch;
 	int percpu_size;
+	struct bpf_mem_alloc *ma;
 
 	struct rcu_head rcu;
 	struct llist_head free_by_rcu;
@@ -188,6 +189,9 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 			obj = __alloc(c, node);
 			if (!obj)
 				break;
+			/* Only do initialize for newly allocated object */
+			if (c->ma->ctor)
+				c->ma->ctor(c->ma, obj);
 		}
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			/* In RT irq_work runs in per-cpu kthread, so disable
@@ -374,7 +378,8 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
  * kmalloc/kfree. Max allocation size is 4096 in this case.
  * This is bpf_dynptr and bpf_kptr use case.
  */
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu,
+		       void (*ctor)(struct bpf_mem_alloc *, void *))
 {
 	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
 	struct bpf_mem_caches *cc, __percpu *pcc;
@@ -382,6 +387,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	struct obj_cgroup *objcg = NULL;
 	int cpu, i, unit_size, percpu_size = 0;
 
+	ma->ctor = ctor;
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
 		if (!pc)
@@ -402,6 +408,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c->unit_size = unit_size;
 			c->objcg = objcg;
 			c->percpu_size = percpu_size;
+			c->ma = ma;
 			prefill_mem_cache(c, cpu);
 		}
 		ma->cache = pc;
@@ -424,6 +431,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
+			c->ma = ma;
 			prefill_mem_cache(c, cpu);
 		}
 	}
-- 
2.29.2


  reply	other threads:[~2022-12-30  4:12 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-12-30  4:11 [RFC PATCH bpf-next 0/6] bpf: Handle reuse in bpf memory alloc Hou Tao
2022-12-30  4:11 ` Hou Tao [this message]
2022-12-30  4:11 ` [RFC PATCH bpf-next 2/6] bpf: Factor out a common helper free_llist() Hou Tao
2022-12-30  4:11 ` [RFC PATCH bpf-next 3/6] bpf: Pass bitwise flags to bpf_mem_alloc_init() Hou Tao
2022-12-30  4:11 ` [RFC PATCH bpf-next 4/6] bpf: Introduce BPF_MA_NO_REUSE for bpf memory allocator Hou Tao
2022-12-30  4:11 ` [RFC PATCH bpf-next 5/6] bpf: Use BPF_MA_NO_REUSE in htab map Hou Tao
2022-12-30  4:11 ` [RFC PATCH bpf-next 6/6] selftests/bpf: Add test case for element reuse " Hou Tao
2023-01-01  1:26 ` [RFC PATCH bpf-next 0/6] bpf: Handle reuse in bpf memory alloc Alexei Starovoitov
2023-01-01 18:48   ` Yonghong Song
2023-01-03 13:47     ` Hou Tao
2023-01-04  6:10       ` Yonghong Song
2023-01-04  6:30         ` Hou Tao
2023-01-04  7:14           ` Yonghong Song
2023-01-04 18:26             ` Alexei Starovoitov
2023-02-10 16:32               ` Kumar Kartikeya Dwivedi
2023-02-10 21:06                 ` Alexei Starovoitov
2023-02-11  1:09                   ` Hou Tao
2023-02-11 16:33                     ` Alexei Starovoitov
2023-02-11 16:34                       ` Alexei Starovoitov
2023-02-15  1:54                         ` Martin KaFai Lau
2023-02-15  4:02                           ` Hou Tao
2023-02-15  7:22                             ` Martin KaFai Lau
2023-02-16  2:11                               ` Hou Tao
2023-02-16  7:47                                 ` Martin KaFai Lau
2023-02-16  8:18                                   ` Hou Tao
2023-02-16 13:55                         ` Hou Tao
2023-02-16 16:35                           ` Alexei Starovoitov
2023-02-17  1:19                             ` Hou Tao
2023-02-22 19:30                               ` Alexei Starovoitov
2023-02-15  2:35                       ` Hou Tao
2023-02-15  2:42                         ` Alexei Starovoitov
2023-02-15  3:00                           ` Hou Tao
2023-01-03 13:40   ` Hou Tao
2023-01-03 19:38     ` Alexei Starovoitov
2023-01-10  6:26       ` Martin KaFai Lau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221230041151.1231169-2-houtao@huaweicloud.com \
    --to=houtao@huaweicloud.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=haoluo@google.com \
    --cc=houtao1@huawei.com \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kpsingh@kernel.org \
    --cc=martin.lau@linux.dev \
    --cc=paulmck@kernel.org \
    --cc=rcu@vger.kernel.org \
    --cc=sdf@google.com \
    --cc=song@kernel.org \
    --cc=yhs@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).