From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
To: davem@davemloft.net
Cc: daniel@iogearbox.net, andrii@kernel.org, tj@kernel.org,
	memxor@gmail.com, delyank@fb.com, linux-mm@kvack.org,
	bpf@vger.kernel.org, kernel-team@fb.com
Subject: [PATCH v4 bpf-next 08/15] bpf: Adjust low/high watermarks in bpf_mem_cache
Date: Thu, 25 Aug 2022 19:44:23 -0700
Message-ID: <20220826024430.84565-9-alexei.starovoitov@gmail.com>
In-Reply-To: <20220826024430.84565-1-alexei.starovoitov@gmail.com>

From: Alexei Starovoitov <ast@kernel.org>

Using the same low/high watermarks for every bucket in bpf_mem_cache
consumes a significant amount of memory: preallocating 64 elements of 4096
bytes each in the free list is not efficient. Make the low/high watermarks
and the batching value depend on the element size instead. This change
brings significant memory savings.
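
As a concrete worked example, consider the 4096-byte bucket per cpu,
assuming the average fill of (low + high) / 2 elements that both the old
and the new comments use (the numbers follow directly from the heuristic
added below):

  before: low = 32, high = 96                -> 64 * 4096 = 256 Kbyte
  after:  low  = max(32 * 256 / 4096, 1) = 2,
          high = max(96 * 256 / 4096, 3) = 6 ->  4 * 4096 =  16 Kbyte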

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 50 +++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index c1817f14c25a..775c38132c4d 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -100,6 +100,7 @@ struct bpf_mem_cache {
 	int unit_size;
 	/* count of objects in free_llist */
 	int free_cnt;
+	int low_watermark, high_watermark, batch;
 };
 
 struct bpf_mem_caches {
@@ -118,14 +119,6 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 	return entry;
 }
 
-#define BATCH 48
-#define LOW_WATERMARK 32
-#define HIGH_WATERMARK 96
-/* Assuming the average number of elements per bucket is 64, when all buckets
- * are used the total memory will be: 64*16*32 + 64*32*32 + 64*64*32 + ... +
- * 64*4096*32 ~ 20Mbyte
- */
-
 static void *__alloc(struct bpf_mem_cache *c, int node)
 {
 	/* Allocate, but don't deplete atomic reserves that typical
@@ -216,7 +209,7 @@ static void free_bulk(struct bpf_mem_cache *c)
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			local_irq_restore(flags);
 		free_one(c, llnode);
-	} while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2);
+	} while (cnt > (c->high_watermark + c->low_watermark) / 2);
 
 	/* and drain free_llist_extra */
 	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
@@ -230,12 +223,12 @@ static void bpf_mem_refill(struct irq_work *work)
 
 	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
 	cnt = c->free_cnt;
-	if (cnt < LOW_WATERMARK)
+	if (cnt < c->low_watermark)
 		/* irq_work runs on this cpu and kmalloc will allocate
 		 * from the current numa node which is what we want here.
 		 */
-		alloc_bulk(c, BATCH, NUMA_NO_NODE);
-	else if (cnt > HIGH_WATERMARK)
+		alloc_bulk(c, c->batch, NUMA_NO_NODE);
+	else if (cnt > c->high_watermark)
 		free_bulk(c);
 }
 
@@ -244,9 +237,38 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
 	irq_work_queue(&c->refill_work);
 }
 
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ */
+
 static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 {
 	init_irq_work(&c->refill_work, bpf_mem_refill);
+	if (c->unit_size <= 256) {
+		c->low_watermark = 32;
+		c->high_watermark = 96;
+	} else {
+		/* When page_size == 4k, order-0 cache will have low_mark == 2
+		 * and high_mark == 6 with batch alloc of 3 individual pages at
+		 * a time.
+		 * 8k allocs and above low == 1, high == 3, batch == 1.
+		 */
+		c->low_watermark = max(32 * 256 / c->unit_size, 1);
+		c->high_watermark = max(96 * 256 / c->unit_size, 3);
+	}
+	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+
 	/* To avoid consuming memory assume that 1st run of bpf
 	 * prog won't be doing more than 4 map_update_elem from
 	 * irq disabled region
@@ -388,7 +410,7 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
 
 	WARN_ON(cnt < 0);
 
-	if (cnt < LOW_WATERMARK)
+	if (cnt < c->low_watermark)
 		irq_work_raise(c);
 	return llnode;
 }
@@ -421,7 +443,7 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 	local_dec(&c->active);
 	local_irq_restore(flags);
 
-	if (cnt > HIGH_WATERMARK)
+	if (cnt > c->high_watermark)
 		/* free few objects from current cpu into global kmalloc pool */
 		irq_work_raise(c);
 }
-- 
2.30.2
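
For reference, below is a minimal userspace sketch (not part of the patch;
max_int() is just a local helper) that mirrors the watermark heuristic
added to prefill_mem_cache() above and reproduces the ~116 Kbyte per-cpu
estimate from the new comment:

    #include <stdio.h>

    /* Bucket sizes as listed in the memory estimate in the patch comment. */
    static const int sizes[] = {16, 32, 64, 96, 128, 196, 256,
                                512, 1024, 2048, 4096};

    static int max_int(int a, int b)
    {
        return a > b ? a : b;
    }

    int main(void)
    {
        long total = 0;
        size_t i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
            int unit_size = sizes[i];
            int low, high, batch;

            if (unit_size <= 256) {
                /* Small objects keep the old fixed watermarks. */
                low = 32;
                high = 96;
            } else {
                /* Scale the watermarks down as unit_size grows. */
                low = max_int(32 * 256 / unit_size, 1);
                high = max_int(96 * 256 / unit_size, 3);
            }
            batch = max_int((high - low) / 4 * 3, 1);

            /* Average fill level assumed by the patch comment. */
            total += (long)((low + high) / 2) * unit_size;
            printf("size %4d: low %2d high %2d batch %2d\n",
                   unit_size, low, high, batch);
        }
        printf("per-cpu freelist estimate: %ld bytes (~%ld Kbyte)\n",
               total, total / 1000);
        return 0;
    }

For the buckets up to 256 bytes this prints batch == 48, matching the old
BATCH constant; for the 4096-byte bucket it prints low == 2, high == 6,
batch == 3, matching the comment in the patch.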

