All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
@ 2022-10-20 14:22 Hou Tao
  2022-10-20 18:01 ` Hao Luo
  0 siblings, 1 reply; 10+ messages in thread
From: Hou Tao @ 2022-10-20 14:22 UTC (permalink / raw)
  To: bpf, Alexei Starovoitov
  Cc: Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
	Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
	Jiri Olsa, John Fastabend, houtao1

From: Hou Tao <houtao1@huawei.com>

Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
numa node setting for non-preallocated hash table is ignored. The reason
is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
is trivial to support numa node setting for bpf memory allocator.

So add support for setting the numa node in the bpf memory allocator and
update the hash map accordingly.

Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf_mem_alloc.h |  3 ++-
 kernel/bpf/hashtab.c          |  6 +++--
 kernel/bpf/memalloc.c         | 50 ++++++++++++++++++++++++++++-------
 3 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3e164b8efaa9..5b1e34d6f133 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -14,7 +14,8 @@ struct bpf_mem_alloc {
 	struct work_struct work;
 };
 
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
+		       bool percpu);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
 /* kmalloc/kfree equivalent: */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ed3f8a53603b..34954195841d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -568,12 +568,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 				goto free_prealloc;
 		}
 	} else {
-		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size,
+					 htab->map.numa_node, false);
 		if (err)
 			goto free_map_locked;
 		if (percpu) {
 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
-						 round_up(htab->map.value_size, 8), true);
+						 round_up(htab->map.value_size, 8),
+						 htab->map.numa_node, true);
 			if (err)
 				goto free_map_locked;
 		}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index fc116cf47d24..44c531ba9534 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -6,6 +6,7 @@
 #include <linux/irq_work.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
+#include <linux/nodemask.h>
 #include <asm/local.h>
 
 /* Any context (including NMI) BPF specific memory allocator.
@@ -98,6 +99,7 @@ struct bpf_mem_cache {
 	int free_cnt;
 	int low_watermark, high_watermark, batch;
 	int percpu_size;
+	int numa_node;
 
 	struct rcu_head rcu;
 	struct llist_head free_by_rcu;
@@ -125,8 +127,8 @@ static void *__alloc(struct bpf_mem_cache *c, int node)
 {
 	/* Allocate, but don't deplete atomic reserves that typical
 	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
-	 * will allocate from the current numa node which is what we
-	 * want here.
+	 * will allocate from the current numa node if numa_node is
+	 * NUMA_NO_NODE, else will allocate from specific numa_node.
 	 */
 	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
 
@@ -301,9 +303,10 @@ static void bpf_mem_refill(struct irq_work *work)
 	cnt = c->free_cnt;
 	if (cnt < c->low_watermark)
 		/* irq_work runs on this cpu and kmalloc will allocate
-		 * from the current numa node which is what we want here.
+		 * from the current numa node if numa_node is NUMA_NO_NODE,
+		 * else allocate from specific numa_node.
 		 */
-		alloc_bulk(c, c->batch, NUMA_NO_NODE);
+		alloc_bulk(c, c->batch, c->numa_node);
 	else if (cnt > c->high_watermark)
 		free_bulk(c);
 }
@@ -328,7 +331,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
  * bpf progs can and should share bpf_mem_cache when possible.
  */
 
-static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+static void prefill_mem_cache(struct bpf_mem_cache *c, int node)
 {
 	init_irq_work(&c->refill_work, bpf_mem_refill);
 	if (c->unit_size <= 256) {
@@ -349,7 +352,28 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 	 * prog won't be doing more than 4 map_update_elem from
 	 * irq disabled region
 	 */
-	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, node);
+}
+
+static inline bool is_valid_numa_node(int numa_node, bool percpu)
+{
+	return numa_node == NUMA_NO_NODE ||
+	       (!percpu && (unsigned int)numa_node < nr_node_ids);
+}
+
+/* The initial prefill is running in the context of map creation process, so
+ * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
+ * specific cpu instead.
+ */
+static inline int get_prefill_numa_node(int numa_node, int cpu)
+{
+	int prefill_numa_node;
+
+	if (numa_node == NUMA_NO_NODE)
+		prefill_numa_node = cpu_to_node(cpu);
+	else
+		prefill_numa_node = numa_node;
+	return prefill_numa_node;
 }
 
 /* When size != 0 bpf_mem_cache for each cpu.
@@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
  * kmalloc/kfree. Max allocation size is 4096 in this case.
  * This is bpf_dynptr and bpf_kptr use case.
  */
-int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
+		       bool percpu)
 {
 	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
 	struct bpf_mem_caches *cc, __percpu *pcc;
+	int cpu, i, unit_size, percpu_size = 0;
 	struct bpf_mem_cache *c, __percpu *pc;
 	struct obj_cgroup *objcg = NULL;
-	int cpu, i, unit_size, percpu_size = 0;
+
+	if (!is_valid_numa_node(numa_node, percpu))
+		return -EINVAL;
 
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -387,7 +415,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c->unit_size = unit_size;
 			c->objcg = objcg;
 			c->percpu_size = percpu_size;
-			prefill_mem_cache(c, cpu);
+			c->numa_node = numa_node;
+			prefill_mem_cache(c, get_prefill_numa_node(numa_node, cpu));
 		}
 		ma->cache = pc;
 		return 0;
@@ -409,7 +438,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
-			prefill_mem_cache(c, cpu);
+			c->numa_node = numa_node;
+			prefill_mem_cache(c, get_prefill_numa_node(numa_node, cpu));
 		}
 	}
 	ma->caches = pcc;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-20 14:22 [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator Hou Tao
@ 2022-10-20 18:01 ` Hao Luo
  2022-10-21  1:43   ` Hou Tao
  0 siblings, 1 reply; 10+ messages in thread
From: Hao Luo @ 2022-10-20 18:01 UTC (permalink / raw)
  To: Hou Tao
  Cc: bpf, Alexei Starovoitov, Martin KaFai Lau, Andrii Nakryiko,
	Song Liu, Yonghong Song, Daniel Borkmann, KP Singh,
	Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1

On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> From: Hou Tao <houtao1@huawei.com>
>
> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
> numa node setting for non-preallocated hash table is ignored. The reason
> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> is trivial to support numa node setting for bpf memory allocator.
>
> So adding support for setting numa node in bpf memory allocator and
> updating hash map accordingly.
>
> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---

Looks good to me with a few nits.

>
<...>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index fc116cf47d24..44c531ba9534 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
<...>
> +static inline bool is_valid_numa_node(int numa_node, bool percpu)
> +{
> +       return numa_node == NUMA_NO_NODE ||
> +              (!percpu && (unsigned int)numa_node < nr_node_ids);

Maybe also check node_online? There is a similar helper function in
kernel/bpf/syscall.c.

It may help debugging if we could log the reason here, for example,
PERCPU map but with numa_node specified.

> +}
> +
> +/* The initial prefill is running in the context of map creation process, so
> + * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
> + * specific cpu instead.
> + */
> +static inline int get_prefill_numa_node(int numa_node, int cpu)
> +{
> +       int prefill_numa_node;
> +
> +       if (numa_node == NUMA_NO_NODE)
> +               prefill_numa_node = cpu_to_node(cpu);
> +       else
> +               prefill_numa_node = numa_node;
> +       return prefill_numa_node;
>  }

nit: an alternative implementation is

 return numa_node == NUMA_NO_NODE ? cpu_to_node(cpu) : numa_node;

>
>  /* When size != 0 bpf_mem_cache for each cpu.
> @@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
>   * kmalloc/kfree. Max allocation size is 4096 in this case.
>   * This is bpf_dynptr and bpf_kptr use case.
>   */

We added a parameter to this function, I think it is worth mentioning
the 'numa_node' argument's behavior under different values of
'percpu'.

> -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
> +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
> +                      bool percpu)
>  {
<...>
> --
> 2.29.2
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-20 18:01 ` Hao Luo
@ 2022-10-21  1:43   ` Hou Tao
  2022-10-21  1:48     ` Alexei Starovoitov
  0 siblings, 1 reply; 10+ messages in thread
From: Hou Tao @ 2022-10-21  1:43 UTC (permalink / raw)
  To: Hao Luo
  Cc: bpf, Alexei Starovoitov, Martin KaFai Lau, Andrii Nakryiko,
	Song Liu, Yonghong Song, Daniel Borkmann, KP Singh,
	Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1

Hi,

On 10/21/2022 2:01 AM, Hao Luo wrote:
> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> From: Hou Tao <houtao1@huawei.com>
>>
>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
>> numa node setting for non-preallocated hash table is ignored. The reason
>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>> is trivial to support numa node setting for bpf memory allocator.
>>
>> So adding support for setting numa node in bpf memory allocator and
>> updating hash map accordingly.
>>
>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
> Looks good to me with a few nits.
>
> <...>
>> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
>> index fc116cf47d24..44c531ba9534 100644
>> --- a/kernel/bpf/memalloc.c
>> +++ b/kernel/bpf/memalloc.c
> <...>
>> +static inline bool is_valid_numa_node(int numa_node, bool percpu)
>> +{
>> +       return numa_node == NUMA_NO_NODE ||
>> +              (!percpu && (unsigned int)numa_node < nr_node_ids);
> Maybe also check node_online? There is a similar helper function in
> kernel/bpf/syscall.c.
Will factor out as a helper function and use it in bpf memory allocator in v2.
>
> It may help debugging if we could log the reason here, for example,
> PERCPU map but with numa_node specified.
Not sure about it, because the validity check must have been done in map related
code.

>
>> +}
>> +
>> +/* The initial prefill is running in the context of map creation process, so
>> + * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
>> + * specific cpu instead.
>> + */
>> +static inline int get_prefill_numa_node(int numa_node, int cpu)
>> +{
>> +       int prefill_numa_node;
>> +
>> +       if (numa_node == NUMA_NO_NODE)
>> +               prefill_numa_node = cpu_to_node(cpu);
>> +       else
>> +               prefill_numa_node = numa_node;
>> +       return prefill_numa_node;
>>  }
> nit: an alternative implementation is
>
>  return numa_node == NUMA_NO_NODE ? cpu_to_node(cpu) : numa_node;
It is shorter and better. Will do it in v2.
>
>>  /* When size != 0 bpf_mem_cache for each cpu.
>> @@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
>>   * kmalloc/kfree. Max allocation size is 4096 in this case.
>>   * This is bpf_dynptr and bpf_kptr use case.
>>   */
> We added a parameter to this function, I think it is worth mentioning
> the 'numa_node' argument's behavior under different values of
> 'percpu'.
How about the following comments ?

 * For per-cpu allocator (percpu=true), the only valid value of numa_node is
 * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
 * preferred memory allocation node is the numa node where the allocating CPU
 * is located, else the preferred node is the specified numa_node.

>
>> -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>> +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, int numa_node,
>> +                      bool percpu)
>>  {
> <...>
>> --
>> 2.29.2
>>


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-21  1:43   ` Hou Tao
@ 2022-10-21  1:48     ` Alexei Starovoitov
  2022-10-21  2:06       ` Hou Tao
  0 siblings, 1 reply; 10+ messages in thread
From: Alexei Starovoitov @ 2022-10-21  1:48 UTC (permalink / raw)
  To: Hou Tao
  Cc: Hao Luo, bpf, Alexei Starovoitov, Martin KaFai Lau,
	Andrii Nakryiko, Song Liu, Yonghong Song, Daniel Borkmann,
	KP Singh, Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1

On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
> Hi,
> 
> On 10/21/2022 2:01 AM, Hao Luo wrote:
> > On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >> From: Hou Tao <houtao1@huawei.com>
> >>
> >> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
> >> numa node setting for non-preallocated hash table is ignored. The reason
> >> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> >> is trivial to support numa node setting for bpf memory allocator.
> >>
> >> So adding support for setting numa node in bpf memory allocator and
> >> updating hash map accordingly.
> >>
> >> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
> >> Signed-off-by: Hou Tao <houtao1@huawei.com>
> >> ---
> > Looks good to me with a few nits.
> >
> > <...>
> >> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> >> index fc116cf47d24..44c531ba9534 100644
> >> --- a/kernel/bpf/memalloc.c
> >> +++ b/kernel/bpf/memalloc.c
> > <...>
> >> +static inline bool is_valid_numa_node(int numa_node, bool percpu)
> >> +{
> >> +       return numa_node == NUMA_NO_NODE ||
> >> +              (!percpu && (unsigned int)numa_node < nr_node_ids);
> > Maybe also check node_online? There is a similar helper function in
> > kernel/bpf/syscall.c.
> Will factor out as a helper function and use it in bpf memory allocator in v2.
> >
> > It may help debugging if we could log the reason here, for example,
> > PERCPU map but with numa_node specified.
> Not sure about it, because the validity check must have been done in map related
> code.
> 
> >
> >> +}
> >> +
> >> +/* The initial prefill is running in the context of map creation process, so
> >> + * if the preferred numa node is NUMA_NO_NODE, needs to use numa node of the
> >> + * specific cpu instead.
> >> + */
> >> +static inline int get_prefill_numa_node(int numa_node, int cpu)
> >> +{
> >> +       int prefill_numa_node;
> >> +
> >> +       if (numa_node == NUMA_NO_NODE)
> >> +               prefill_numa_node = cpu_to_node(cpu);
> >> +       else
> >> +               prefill_numa_node = numa_node;
> >> +       return prefill_numa_node;
> >>  }
> > nit: an alternative implementation is
> >
> >  return numa_node == NUMA_NO_NODE ? cpu_to_node(cpu) : numa_node;
> It is shorter and better. Will do it in v2.
> >
> >>  /* When size != 0 bpf_mem_cache for each cpu.
> >> @@ -359,13 +383,17 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
> >>   * kmalloc/kfree. Max allocation size is 4096 in this case.
> >>   * This is bpf_dynptr and bpf_kptr use case.
> >>   */
> > We added a parameter to this function, I think it is worth mentioning
> > the 'numa_node' argument's behavior under different values of
> > 'percpu'.
> How about the following comments ?
> 
>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
>  * preferred memory allocation node is the numa node where the allocating CPU
>  * is located, else the preferred node is the specified numa_node.

No. This patch doesn't make sense to me.
As far as I can see it can only make things worse.
Why would you want a cpu to use non local memory?

The commit log:
" is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
  is trivial to support numa node setting for bpf memory allocator."
got it wrong.

See the existing comment:
                /* irq_work runs on this cpu and kmalloc will allocate
                 * from the current numa node which is what we want here.
                 */
                alloc_bulk(c, c->batch, NUMA_NO_NODE);


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-21  1:48     ` Alexei Starovoitov
@ 2022-10-21  2:06       ` Hou Tao
  2022-10-21  2:09         ` Alexei Starovoitov
  0 siblings, 1 reply; 10+ messages in thread
From: Hou Tao @ 2022-10-21  2:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Hao Luo, bpf, Alexei Starovoitov, Martin KaFai Lau,
	Andrii Nakryiko, Song Liu, Yonghong Song, Daniel Borkmann,
	KP Singh, Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1

Hi,

On 10/21/2022 9:48 AM, Alexei Starovoitov wrote:
> On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 10/21/2022 2:01 AM, Hao Luo wrote:
>>> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>> From: Hou Tao <houtao1@huawei.com>
>>>>
>>>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
>>>> numa node setting for non-preallocated hash table is ignored. The reason
>>>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>>>> is trivial to support numa node setting for bpf memory allocator.
>>>>
>>>> So adding support for setting numa node in bpf memory allocator and
>>>> updating hash map accordingly.
>>>>
>>>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
>>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>>>> ---
SNIP
>> How about the following comments ?
>>
>>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
>>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
>>  * preferred memory allocation node is the numa node where the allocating CPU
>>  * is located, else the preferred node is the specified numa_node.
> No. This patch doesn't make sense to me.
> As far as I can see it can only make things worse.
> Why would you want a cpu to use non local memory?
For pre-allocated hash table, the numa node setting is honored. And I think the
reason is that there are bpf progs which are pinned on specific CPUs or numa
nodes and accessing local memory will be good for performance. And in my
understanding, the bpf memory allocator is trying to replace pre-allocated hash
table to save memory, if the numa node setting is ignored, the above use cases
may work badly. Also I am trying to test whether or not there is visible
performance improvement for the above assumed use case.

>
> The commit log:
> " is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>   is trivial to support numa node setting for bpf memory allocator."
> got it wrong.
>
> See the existing comment:
>                 /* irq_work runs on this cpu and kmalloc will allocate
>                  * from the current numa node which is what we want here.
>                  */
>                 alloc_bulk(c, c->batch, NUMA_NO_NODE);


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-21  2:06       ` Hou Tao
@ 2022-10-21  2:09         ` Alexei Starovoitov
  2022-10-21  2:26           ` Hou Tao
  0 siblings, 1 reply; 10+ messages in thread
From: Alexei Starovoitov @ 2022-10-21  2:09 UTC (permalink / raw)
  To: Hou Tao
  Cc: Hao Luo, bpf, Alexei Starovoitov, Martin KaFai Lau,
	Andrii Nakryiko, Song Liu, Yonghong Song, Daniel Borkmann,
	KP Singh, Stanislav Fomichev, Jiri Olsa, John Fastabend, Hou Tao

On Thu, Oct 20, 2022 at 7:06 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 10/21/2022 9:48 AM, Alexei Starovoitov wrote:
> > On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
> >> Hi,
> >>
> >> On 10/21/2022 2:01 AM, Hao Luo wrote:
> >>> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >>>> From: Hou Tao <houtao1@huawei.com>
> >>>>
> >>>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
> >>>> numa node setting for non-preallocated hash table is ignored. The reason
> >>>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> >>>> is trivial to support numa node setting for bpf memory allocator.
> >>>>
> >>>> So adding support for setting numa node in bpf memory allocator and
> >>>> updating hash map accordingly.
> >>>>
> >>>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
> >>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
> >>>> ---
> SNIP
> >> How about the following comments ?
> >>
> >>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
> >>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
> >>  * preferred memory allocation node is the numa node where the allocating CPU
> >>  * is located, else the preferred node is the specified numa_node.
> > No. This patch doesn't make sense to me.
> > As far as I can see it can only make things worse.
> > Why would you want a cpu to use non local memory?
> For pre-allocated hash table, the numa node setting is honored. And I think the
> reason is that there are bpf progs which are pinned on specific CPUs or numa
> nodes and accessing local memory will be good for performance.

prealloc happens at map creation time while
bpf prog might be running on completely different cpu,
so numa is necessary for prealloc.

> And in my
> understanding, the bpf memory allocator is trying to replace pre-allocated hash
> table to save memory, if the numa node setting is ignored, the above use cases
> may be work badly. Also I am trying to test whether or not there is visible
> performance improvement for the above assumed use case.

numa should be ignored, because we don't want users to accidently
pick wrong numa id.

> >
> > The commit log:
> > " is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
> >   is trivial to support numa node setting for bpf memory allocator."
> > got it wrong.
> >
> > See the existing comment:
> >                 /* irq_work runs on this cpu and kmalloc will allocate
> >                  * from the current numa node which is what we want here.
> >                  */
> >                 alloc_bulk(c, c->batch, NUMA_NO_NODE);
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-21  2:09         ` Alexei Starovoitov
@ 2022-10-21  2:26           ` Hou Tao
  2022-10-21  4:22             ` Alexei Starovoitov
  0 siblings, 1 reply; 10+ messages in thread
From: Hou Tao @ 2022-10-21  2:26 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Hao Luo, bpf, Alexei Starovoitov, Martin KaFai Lau,
	Andrii Nakryiko, Song Liu, Yonghong Song, Daniel Borkmann,
	KP Singh, Stanislav Fomichev, Jiri Olsa, John Fastabend, Hou Tao

Hi,

On 10/21/2022 10:09 AM, Alexei Starovoitov wrote:
> On Thu, Oct 20, 2022 at 7:06 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 10/21/2022 9:48 AM, Alexei Starovoitov wrote:
>>> On Fri, Oct 21, 2022 at 09:43:08AM +0800, Hou Tao wrote:
>>>> Hi,
>>>>
>>>> On 10/21/2022 2:01 AM, Hao Luo wrote:
>>>>> On Thu, Oct 20, 2022 at 6:57 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>>>> From: Hou Tao <houtao1@huawei.com>
>>>>>>
>>>>>> Since commit fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc."),
>>>>>> numa node setting for non-preallocated hash table is ignored. The reason
>>>>>> is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>>>>>> is trivial to support numa node setting for bpf memory allocator.
>>>>>>
>>>>>> So adding support for setting numa node in bpf memory allocator and
>>>>>> updating hash map accordingly.
>>>>>>
>>>>>> Fixes: fba1a1c6c912 ("bpf: Convert hash map to bpf_mem_alloc.")
>>>>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>>>>>> ---
>> SNIP
>>>> How about the following comments ?
>>>>
>>>>  * For per-cpu allocator (percpu=true), the only valid value of numa_node is
>>>>  * NUMA_NO_NODE. For non-per-cpu allocator, if numa_node is NUMA_NO_NODE, the
>>>>  * preferred memory allocation node is the numa node where the allocating CPU
>>>>  * is located, else the preferred node is the specified numa_node.
>>> No. This patch doesn't make sense to me.
>>> As far as I can see it can only make things worse.
>>> Why would you want a cpu to use non local memory?
>> For pre-allocated hash table, the numa node setting is honored. And I think the
>> reason is that there are bpf progs which are pinned on specific CPUs or numa
>> nodes and accessing local memory will be good for performance.
> prealloc happens at map creation time while
> bpf prog might be running on completely different cpu,
> so numa is necessary for prealloc.
I see. So for non-preallocated hash map, the memory will be allocated from the
current NUMA node if possible and there will be no memory affinity problems if
these programs are on the same NUMA node.
>
>> And in my
>> understanding, the bpf memory allocator is trying to replace pre-allocated hash
>> table to save memory, if the numa node setting is ignored, the above use cases
>> may be work badly. Also I am trying to test whether or not there is visible
>> performance improvement for the above assumed use case.
> numa should be ignored, because we don't want users to accidently
> pick wrong numa id.
How about reject the NUMA node setting for non-preallocated hash table in
hashtab.c ?

>
>>> The commit log:
>>> " is that bpf memory allocator only supports NUMA_NO_NODE, but it seems it
>>>   is trivial to support numa node setting for bpf memory allocator."
>>> got it wrong.
>>>
>>> See the existing comment:
>>>                 /* irq_work runs on this cpu and kmalloc will allocate
>>>                  * from the current numa node which is what we want here.
>>>                  */
>>>                 alloc_bulk(c, c->batch, NUMA_NO_NODE);


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-21  2:26           ` Hou Tao
@ 2022-10-21  4:22             ` Alexei Starovoitov
  2022-10-21 11:01               ` Hou Tao
  2022-11-08  2:22               ` Hou Tao
  0 siblings, 2 replies; 10+ messages in thread
From: Alexei Starovoitov @ 2022-10-21  4:22 UTC (permalink / raw)
  To: Hou Tao
  Cc: Hao Luo, bpf, Alexei Starovoitov, Martin KaFai Lau,
	Andrii Nakryiko, Song Liu, Yonghong Song, Daniel Borkmann,
	KP Singh, Stanislav Fomichev, Jiri Olsa, John Fastabend, Hou Tao

On Thu, Oct 20, 2022 at 7:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> How about reject the NUMA node setting for non-preallocated hash table in
> hashtab.c ?

It's easy to ask the question, but please answer it yourself.
Analyze the code and describe what you think is happening now
and what should or should not be the behavior.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-21  4:22             ` Alexei Starovoitov
@ 2022-10-21 11:01               ` Hou Tao
  2022-11-08  2:22               ` Hou Tao
  1 sibling, 0 replies; 10+ messages in thread
From: Hou Tao @ 2022-10-21 11:01 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Hao Luo, bpf, Alexei Starovoitov, Martin KaFai Lau,
	Andrii Nakryiko, Song Liu, Yonghong Song, Daniel Borkmann,
	KP Singh, Stanislav Fomichev, Jiri Olsa, John Fastabend, Hou Tao

Hi,

On 10/21/2022 12:22 PM, Alexei Starovoitov wrote:
> On Thu, Oct 20, 2022 at 7:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> How about reject the NUMA node setting for non-preallocated hash table in
>> hashtab.c ?
> It's easy to ask the question, but please answer it yourself.
> Analyze the code and describe what you think is happening now
> and what should or should not be the behavior.
Will do.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator
  2022-10-21  4:22             ` Alexei Starovoitov
  2022-10-21 11:01               ` Hou Tao
@ 2022-11-08  2:22               ` Hou Tao
  1 sibling, 0 replies; 10+ messages in thread
From: Hou Tao @ 2022-11-08  2:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Hao Luo, bpf, Alexei Starovoitov, Martin KaFai Lau,
	Andrii Nakryiko, Song Liu, Yonghong Song, Daniel Borkmann,
	KP Singh, Stanislav Fomichev, Jiri Olsa, John Fastabend, Hou Tao

Hi,

On 10/21/2022 12:22 PM, Alexei Starovoitov wrote:
> On Thu, Oct 20, 2022 at 7:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> How about reject the NUMA node setting for non-preallocated hash table in
>> hashtab.c ?
> It's easy to ask the question, but please answer it yourself.
> Analyze the code and describe what you think is happening now
> and what should or should not be the behavior.
> .
I found it is a bad idea to reject the numa node setting for non-preallocated
hash table. The reason is that now hash buckets are still allocated according to
the numa node setting. If the numa node setting is rejected, the use case below
won't work normally, so I will keep it as-is:

1. a non-preallocated hash table is created on numa node 2 (buckets are
allocated from node 2)
2. bpf program is running on numa node 2 (elements are also allocated from node 2)
3. all used memories are allocated from node 2.



^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-11-08  2:22 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-20 14:22 [PATCH bpf] bpf: Support for setting numa node in bpf memory allocator Hou Tao
2022-10-20 18:01 ` Hao Luo
2022-10-21  1:43   ` Hou Tao
2022-10-21  1:48     ` Alexei Starovoitov
2022-10-21  2:06       ` Hou Tao
2022-10-21  2:09         ` Alexei Starovoitov
2022-10-21  2:26           ` Hou Tao
2022-10-21  4:22             ` Alexei Starovoitov
2022-10-21 11:01               ` Hou Tao
2022-11-08  2:22               ` Hou Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.