bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Hou Tao <houtao@huaweicloud.com>
To: bpf@vger.kernel.org, Martin KaFai Lau <martin.lau@linux.dev>,
	Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrii Nakryiko <andrii@kernel.org>, Song Liu <song@kernel.org>,
	Hao Luo <haoluo@google.com>, Yonghong Song <yhs@fb.com>,
	Daniel Borkmann <daniel@iogearbox.net>,
	KP Singh <kpsingh@kernel.org>,
	Stanislav Fomichev <sdf@google.com>, Jiri Olsa <jolsa@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>,
	"Paul E . McKenney" <paulmck@kernel.org>,
	rcu@vger.kernel.org, houtao1@huawei.com
Subject: [RFC PATCH bpf-next v5 2/2] bpf: Call rcu_momentary_dyntick_idle() in task work periodically
Date: Mon, 19 Jun 2023 22:32:31 +0800	[thread overview]
Message-ID: <20230619143231.222536-3-houtao@huaweicloud.com> (raw)
In-Reply-To: <20230619143231.222536-1-houtao@huaweicloud.com>

From: Hou Tao <houtao1@huawei.com>

After doing reuse-after-RCU-GP in the bpf memory allocator, if there are
intensive memory allocations and frees in the bpf memory allocator and
the RCU GP is slow, the allocator's peak memory usage will be high.

To reduce memory usage for bpf memory allocator, call
rcu_momentary_dyntick_idle() in task work periodically to accelerate the
expiration of RCU grace period.

The following benchmark results show that the memory usage is reduced
considerably after applying the patch:

Before:
overwrite           per-prod-op 49.11 ± 1.30k/s, avg mem 313.09 ± 80.36MiB, peak mem 509.09MiB
batch_add_batch_del per-prod-op 76.06 ± 2.38k/s, avg mem 287.97 ± 63.59MiB, peak mem 496.81MiB
add_del_on_diff_cpu per-prod-op 18.75 ± 0.09k/s, avg mem  27.71 ±  4.92MiB, peak mem  44.54MiB

After:
overwrite           per-prod-op 51.17 ± 0.30k/s, avg mem 105.09 ±  7.74MiB, peak mem 143.60MiB
batch_add_batch_del per-prod-op 86.43 ± 0.90k/s, avg mem  85.82 ± 11.81MiB, peak mem 118.93MiB
add_del_on_diff_cpu per-prod-op 18.71 ± 0.08k/s, avg mem  26.92 ±  5.50MiB, peak mem  43.18MiB

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/memalloc.c | 46 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 9b31c53fd285..c4b4cae04400 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -6,6 +6,7 @@
 #include <linux/irq_work.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
+#include <linux/task_work.h>
 #include <asm/local.h>
 
 /* Any context (including NMI) BPF specific memory allocator.
@@ -123,6 +124,16 @@ struct bpf_reuse_batch {
 	struct rcu_head rcu;
 };
 
+/* Bit position in bpf_rcu_gp_acc_ctx::flags: set while the acceleration
+ * task work is queued or running on this CPU (see bpf_mem_rcu_gp_acc()).
+ */
+#define BPF_GP_ACC_RUNNING 0
+
+/* Per-CPU state used to rate-limit RCU grace-period acceleration.
+ * @flags:    holds the BPF_GP_ACC_RUNNING bit
+ * @next_run: earliest jiffies value at which the work may run again
+ * @work:     task work that calls rcu_momentary_dyntick_idle()
+ */
+struct bpf_rcu_gp_acc_ctx {
+	unsigned long flags;
+	unsigned long next_run;
+	struct callback_head work;
+};
+
+static DEFINE_PER_CPU(struct bpf_rcu_gp_acc_ctx, bpf_acc_ctx);
+
 static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 {
 	struct llist_node *entry, *next;
@@ -347,12 +358,47 @@ static void dyn_reuse_rcu(struct rcu_head *rcu)
 	kfree(batch);
 }
 
+/* Task work callback: briefly report an RCU quiescent state for this CPU
+ * to help pending grace periods expire sooner, then re-arm the rate limit
+ * and allow the work to be queued again.
+ */
+static void bpf_rcu_gp_acc_work(struct callback_head *head)
+{
+	struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
+
+	/* NOTE(review): presumably rcu_momentary_dyntick_idle() requires
+	 * interrupts to be disabled across the call — confirm against RCU docs.
+	 */
+	local_irq_disable();
+	rcu_momentary_dyntick_idle();
+	local_irq_enable();
+
+	/* The interval between rcu_momentary_dyntick_idle() calls is
+	 * at least 10ms.
+	 */
+	WRITE_ONCE(ctx->next_run, jiffies + msecs_to_jiffies(10));
+	/* Let bpf_mem_rcu_gp_acc() queue the work again */
+	clear_bit(BPF_GP_ACC_RUNNING, &ctx->flags);
+}
+
+/* Queue task work on the current task to accelerate RCU grace-period
+ * expiration for @c. Rate-limited to once per 10ms per CPU and only
+ * triggered when enough freed elements are still waiting for an RCU GP.
+ */
+static void bpf_mem_rcu_gp_acc(struct bpf_mem_cache *c)
+{
+	struct bpf_rcu_gp_acc_ctx *ctx = this_cpu_ptr(&bpf_acc_ctx);
+
+	/* Bail out when few elements await an RCU GP (128 looks like a
+	 * heuristic threshold — TODO confirm how it was chosen) or when
+	 * the work ran within the last 10ms (see bpf_rcu_gp_acc_work()).
+	 */
+	if (atomic_read(&c->dyn_reuse_rcu_cnt) < 128 ||
+	    time_before(jiffies, READ_ONCE(ctx->next_run)))
+		return;
+
+	/* Skip kernel threads (presumably because task work added with
+	 * TWA_RESUME may never run for them — confirm), and skip if the
+	 * work is already pending on this CPU.
+	 */
+	if ((current->flags & PF_KTHREAD) ||
+	    test_and_set_bit(BPF_GP_ACC_RUNNING, &ctx->flags))
+		return;
+
+	init_task_work(&ctx->work, bpf_rcu_gp_acc_work);
+	/* Task is exiting ? */
+	if (task_work_add(current, &ctx->work, TWA_RESUME))
+		clear_bit(BPF_GP_ACC_RUNNING, &ctx->flags);
+}
+
 static void reuse_bulk(struct bpf_mem_cache *c)
 {
 	struct llist_node *head, *tail;
 	struct bpf_reuse_batch *batch;
 	unsigned long flags;
 
+	bpf_mem_rcu_gp_acc(c);
+
 	head = llist_del_all(&c->free_llist_extra);
 	tail = head;
 	while (tail && tail->next)
-- 
2.29.2


  parent reply	other threads:[~2023-06-19 14:00 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-19 14:32 [RFC PATCH bpf-next v5 0/2] Handle immediate reuse in bpf memory allocator Hou Tao
2023-06-19 14:32 ` [RFC PATCH bpf-next v5 1/2] bpf: Only reuse after one RCU GP " Hou Tao
2023-06-19 14:32 ` Hou Tao [this message]
2023-06-20 16:15   ` [RFC PATCH bpf-next v5 2/2] bpf: Call rcu_momentary_dyntick_idle() in task work periodically Alexei Starovoitov
2023-06-20 17:21     ` Paul E. McKenney
2023-06-21  1:07     ` Hou Tao
2023-06-21  1:20       ` Alexei Starovoitov
2023-06-21  1:38         ` Hou Tao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230619143231.222536-3-houtao@huaweicloud.com \
    --to=houtao@huaweicloud.com \
    --cc=alexei.starovoitov@gmail.com \
    --cc=andrii@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=haoluo@google.com \
    --cc=houtao1@huawei.com \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kpsingh@kernel.org \
    --cc=martin.lau@linux.dev \
    --cc=paulmck@kernel.org \
    --cc=rcu@vger.kernel.org \
    --cc=sdf@google.com \
    --cc=song@kernel.org \
    --cc=yhs@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).