From: Hou Tao <houtao@huaweicloud.com>
To: bpf@vger.kernel.org, Alexei Starovoitov <ast@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>,
	Andrii Nakryiko <andrii@kernel.org>, Song Liu <song@kernel.org>,
	Hao Luo <haoluo@google.com>, Yonghong Song <yhs@fb.com>,
	Daniel Borkmann <daniel@iogearbox.net>,
	KP Singh <kpsingh@kernel.org>,
	Stanislav Fomichev <sdf@google.com>, Jiri Olsa <jolsa@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>,
	houtao1@huawei.com
Subject: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
Date: Thu, 20 Oct 2022 22:22:47 +0800
Message-ID: <20221020142247.1682009-1-houtao@huaweicloud.com>

From: Hou Tao <houtao1@huawei.com>

Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
the numa node setting for non-preallocated hash tables has been ignored.
The reason is that the bpf memory allocator only supports NUMA_NO_NODE,
although it is trivial to add numa node support to it.

So add support for setting the numa node in the bpf memory allocator and
update the hash map accordingly. A specific numa node is only accepted
for non-percpu allocations; percpu allocations still require NUMA_NO_NODE.
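
For example, with this change the numa_node attribute of a
non-preallocated hash map takes effect again. A minimal userspace
sketch using libbpf's bpf_map_create() (the map name, key/value sizes
and node id below are illustrative, not part of this patch):

  #include <bpf/bpf.h>

  /* Create a non-preallocated hash map whose elements should be
   * allocated from NUMA node 1. Before this fix, numa_node was
   * silently ignored for the non-preallocated case.
   */
  int create_numa_hash(void)
  {
          LIBBPF_OPTS(bpf_map_create_opts, opts,
                      .map_flags = BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE,
                      .numa_node = 1);

          /* Returns the map fd on success, a negative error otherwise */
          return bpf_map_create(BPF_MAP_TYPE_HASH, "numa_hash",
                                sizeof(__u32), sizeof(__u64), 1024, &opts);
  }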

Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf_mem_alloc.h |  3 ++-
 kernel/bpf/hashtab.c          |  6 +++--
 kernel/bpf/memalloc.c         | 50 ++++++++++++++++++++++++++++-------
 3 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3e164b8efaa9..5b1e34d6f133 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -14,7 +14,8 @@ struct bpf_mem_alloc {
 	struct work_struct work;
 };
 
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
+		       bool percpu);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
 /* kmalloc/kfree equivalent: */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ed3f8a53603b..34954195841d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -568,12 +568,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 				goto free_prealloc;
 		}
 	} else {
-		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size,
+					 htab->map.numa_node, false);
 		if (err)
 			goto free_map_locked;
 		if (percpu) {
 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
-						 round_up(htab->map.value_size, 8), true);
+						 round_up(htab->map.value_size, 8),
+						 htab->map.numa_node, true);
 			if (err)
 				goto free_map_locked;
 		}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index fc116cf47d24..44c531ba9534 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -6,6 +6,7 @@
 #include <linux/irq_work.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
+#include <linux/nodemask.h>
 #include <asm/local.h>
 
 /* Any context (including NMI) BPF specific memory allocator.
@@ -98,6 +99,7 @@ struct bpf_mem_cache {
 	int free_cnt;
 	int low_watermark, high_watermark, batch;
 	int percpu_size;
+	int numa_node;
 
 	struct rcu_head rcu;
 	struct llist_head free_by_rcu;
@@ -125,8 +127,8 @@ static void *__alloc(struct bpf_mem_cache *c, int node)
 {
 	/* Allocate, but don't deplete atomic reserves that typical
 	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
-	 * will allocate from the current numa node which is what we
-	 * want here.
+	 * will allocate from the current numa node if numa_node is
+	 * NUMA_NO_NODE, else will allocate from specific numa_node.
 	 */
 	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
 
@@ -301,9 +303,10 @@ static void bpf_mem_refill(struct irq_work *work)
 	cnt = c->free_cnt;
 	if (cnt < c->low_watermark)
 		/* irq_work runs on this cpu and kmalloc will allocate
-		 * from the current numa node which is what we want here.
+		 * from the current numa node if numa_node is NUMA_NO_NODE,
+		 * else allocate from specific numa_node.
 		 */
-		alloc_bulk(c, c->batch, NUMA_NO_NODE);
+		alloc_bulk(c, c->batch, c->numa_node);
 	else if (cnt > c->high_watermark)
 		free_bulk(c);
 }
@@ -328,7 +331,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
  * bpf progs can and should share bpf_mem_cache when possible.
  */
 
-static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+static void prefill_mem_cache(struct bpf_mem_cache *c, int node)
 {
 	init_irq_work(&c->refill_work, bpf_mem_refill);
 	if (c->unit_size <= 256) {
@@ -349,7 +352,28 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 	 * prog won't be doing more than 4 map_update_elem from
 	 * irq disabled region
 	 */
-	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, node);
+}
+
+static inline bool is_valid_numa_node(int numa_node, bool percpu)
+{
+	return numa_node == NUMA_NO_NODE ||
+	       (!percpu && (unsigned int)numa_node < nr_node_ids);
+}
+
+/* The initial prefill runs in the context of the map creation process, so
+ * if the preferred numa node is NUMA_NO_NODE, it needs to use the numa node
+ * of the specific cpu instead.
+ */
+static inline int get_prefill_numa_node(int numa_node, int cpu)
+{
+	int prefill_numa_node;
+
+	if (numa_node == NUMA_NO_NODE)
+		prefill_numa_node = cpu_to_node(cpu);
+	else
+		prefill_numa_node = numa_node;
+	return prefill_numa_node;
 }
 
 /* When size != 0 bpf_mem_cache for each cpu.
@@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
  * kmalloc/kfree. Max allocation size is 4096 in this case.
  * This is bpf_dynptr and bpf_kptr use case.
  */
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
+		       bool percpu)
 {
 	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
 	struct bpf_mem_caches *cc, __percpu *pcc;
+	int cpu, i, unit_size, percpu_size = 0;
 	struct bpf_mem_cache *c, __percpu *pc;
 	struct obj_cgroup *objcg = NULL;
-	int cpu, i, unit_size, percpu_size = 0;
+
+	if (!is_valid_numa_node(numa_node, percpu))
+		return -EINVAL;
 
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -387,7 +415,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c->unit_size = unit_size;
 			c->objcg = objcg;
 			c->percpu_size = percpu_size;
-			prefill_mem_cache(c, cpu);
+			c->numa_node = numa_node;
+			prefill_mem_cache(c, get_prefill_numa_node(numa_node, cpu));
 		}
 		ma->cache = pc;
 		return 0;
@@ -409,7 +438,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
-			prefill_mem_cache(c, cpu);
+			c->numa_node = numa_node;
+			prefill_mem_cache(c, get_prefill_numa_node(numa_node, cpu));
 		}
 	}
 	ma->caches = pcc;
-- 
2.29.2


Thread overview: 10+ messages
2022-10-20 14:22 Hou Tao [this message]
2022-10-20 18:01 ` [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator Hao Luo
2022-10-21  1:43   ` Hou Tao
2022-10-21  1:48     ` Alexei Starovoitov
2022-10-21  2:06       ` Hou Tao
2022-10-21  2:09         ` Alexei Starovoitov
2022-10-21  2:26           ` Hou Tao
2022-10-21  4:22             ` Alexei Starovoitov
2022-10-21 11:01               ` Hou Tao
2022-11-08  2:22               ` Hou Tao
