From: Delyan Kratunov <delyank@fb.com>
To: "daniel@iogearbox.net" <daniel@iogearbox.net>,
	"ast@kernel.org" <ast@kernel.org>,
	"andrii@kernel.org" <andrii@kernel.org>,
	"bpf@vger.kernel.org" <bpf@vger.kernel.org>
Cc: "paulmck@kernel.org" <paulmck@kernel.org>
Subject: [PATCH bpf-next 2/5] bpf: implement sleepable uprobes by chaining tasks and normal rcu
Date: Thu, 28 Apr 2022 16:54:00 +0000
Message-ID: <972caeb1e9338721bb719b118e0e40705f860f50.1651103126.git.delyank@fb.com>
In-Reply-To: <cover.1651103126.git.delyank@fb.com>

uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
user context. The primary obstacle to sleepable bpf uprobe programs is
therefore on the bpf side.
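
For illustration, a sleepable uprobe program can then use sleepable helpers
such as bpf_copy_from_user() to read user memory that may not be resident.
A minimal sketch of such a program (the "uprobe.s" section name is assumed
from the libbpf patch later in this series, and the function/argument names
are made up; attachment would happen at runtime, e.g. via
bpf_program__attach_uprobe()):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  /* Illustrative only: the section name assumes the sleepable uprobe
   * support added by the libbpf patch later in this series.
   */
  SEC("uprobe.s")
  int BPF_KPROBE(handle_parse_request, const char *req)
  {
  	char buf[64] = {};

  	/* bpf_copy_from_user() may fault and sleep, which is only
  	 * allowed in sleepable programs.
  	 */
  	bpf_copy_from_user(buf, sizeof(buf), req);
  	bpf_printk("request: %s", buf);
  	return 0;
  }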

Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu, and its programs run with preemption disabled. For uprobe bpf programs
to actually be sleepable, the array needs to be protected by the tasks_trace
rcu flavor instead (with kfree() called only after the corresponding grace
period).

One way to achieve this is by tracking an array-has-contained-sleepable-prog
flag in bpf_prog_array and switching rcu flavors based on it. However, this
is deemed somewhat unwieldy, and the rcu flavor transition would be hard
to reason about.

Instead, based on Alexei's proposal, we change the free path for
bpf_prog_array to chain a tasks_trace grace period and a normal grace period
one after the other. Users who iterate under a tasks_trace read section are
safe, as are users who iterate under normal read sections (from non-sleepable
locations). The downside is that we pay the tasks_trace latency
unconditionally, but that is deemed acceptable under expected workloads.
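
Concretely, the free path then proceeds as follows (this is a sketch of the
kernel/bpf/core.c change below):

  bpf_prog_array_free(progs)
    -> call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_rcu)
    ... tasks_trace grace period elapses: all rcu_read_lock_trace()
        readers (the sleepable uprobe path) are done with the array ...
    -> __bpf_prog_array_free_rcu() runs and calls kfree_rcu(progs, rcu)
    ... normal grace period elapses: all rcu_read_lock() readers
        (existing non-sleepable paths) are done with the array ...
    -> progs is finally kfree()d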

The other interesting implication concerns non-sleepable uprobe programs.
Because they need access to dynamically sized rcu-protected maps, we
conditionally disable preemption and take an rcu read section around each
such program, in addition to the overarching tasks_trace section.
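
The resulting nesting around a non-sleepable program in the array is then
(see bpf_prog_run_array_sleepable() in the bpf.h change below):

  migrate_disable()
  rcu_read_lock_trace()          /* protects the whole array iteration */
    preempt_disable()
    rcu_read_lock()              /* only around non-sleepable programs */
      run_prog()
    rcu_read_unlock()
    preempt_enable()
  rcu_read_unlock_trace()
  migrate_enable()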

Signed-off-by: Delyan Kratunov <delyank@fb.com>
---
 include/linux/bpf.h          | 60 ++++++++++++++++++++++++++++++++++++
 include/linux/trace_events.h |  1 +
 kernel/bpf/core.c            | 10 +++++-
 kernel/trace/bpf_trace.c     | 23 ++++++++++++++
 kernel/trace/trace_uprobe.c  |  4 +--
 5 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7d7f4806f5fb..d8692d7176ce 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -25,6 +25,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/stddef.h>
 #include <linux/bpfptr.h>
+#include <linux/rcupdate_trace.h>

 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1379,6 +1380,65 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
 	return ret;
 }

+/**
+ * Notes on RCU design for bpf_prog_arrays containing sleepable programs:
+ *
+ * We use the trace RCU flavor read section to protect the bpf_prog_array overall.
+ * Because there are users of bpf_prog_array that only use the normal
+ * rcu flavor, to avoid tracking a used-for-sleepable-programs bit in the
+ * array, we chain call_rcu_tasks_trace and kfree_rcu on the free path.
+ * This is suboptimal if the array is never used for sleepable programs
+ * but not a cause of concern under expected workloads.
+ *
+ * In the case where a non-sleepable program is inside the array,
+ * we take the rcu read section and disable preemption for that program
+ * alone, so it can access rcu-protected dynamically sized maps.
+ */
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
+			     const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
+	u32 ret = 1;
+
+	might_fault();
+
+	RCU_LOCKDEP_WARN(rcu_read_lock_held(), "shouldn't be holding rcu lock");
+
+	migrate_disable();
+	rcu_read_lock_trace();
+
+	array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
+	if (unlikely(!array))
+		goto out;
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		if (!prog->aux->sleepable) {
+			preempt_disable();
+			rcu_read_lock();
+		}
+
+		run_ctx.bpf_cookie = item->bpf_cookie;
+		ret &= run_prog(prog, ctx);
+		item++;
+
+		if (!prog->aux->sleepable) {
+			rcu_read_unlock();
+			preempt_enable();
+		}
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+out:
+	rcu_read_unlock_trace();
+	migrate_enable();
+	return ret;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index e6e95a9f07a5..d45889f1210d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -736,6 +736,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)

 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 13e9dbeeedf3..a4c301ef2ba1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2261,11 +2261,19 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 	return &bpf_empty_prog_array.hdr;
 }

+static void __bpf_prog_array_free_rcu(struct rcu_head *rcu)
+{
+	struct bpf_prog_array *progs;
+
+	progs = container_of(rcu, struct bpf_prog_array, rcu);
+	kfree_rcu(progs, rcu);
+}
+
 void bpf_prog_array_free(struct bpf_prog_array *progs)
 {
 	if (!progs || progs == &bpf_empty_prog_array.hdr)
 		return;
-	kfree_rcu(progs, rcu);
+	call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_rcu);
 }

 int bpf_prog_array_length(struct bpf_prog_array *array)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f15b826f9899..6a2291d4b5a0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -140,6 +140,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	return ret;
 }

+unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx)
+{
+	unsigned int ret;
+
+	/*
+	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+	 * to all call sites, we check bpf_prog_array_valid() there to see
+	 * whether call->prog_array is empty or not, which is a heuristic
+	 * to speed up execution.
+	 *
+	 * If the prog_array fetched by bpf_prog_array_valid() was non-NULL,
+	 * we enter uprobe_call_bpf() and do the proper rcu_dereference()
+	 * under the RCU trace lock. If prog_array turns out to be NULL by
+	 * then, we simply bail out.
+	 * Conversely, if the fetched pointer was NULL, we skip the
+	 * prog_array entirely, at the accepted risk of missing events if
+	 * it was updated between that check and the rcu_dereference()
+	 * done here.
+	 */
+	ret = bpf_prog_run_array_sleepable(call->prog_array, ctx, bpf_prog_run);
+	return ret;
+}
+
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9711589273cd..3eb48897d15b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1346,9 +1346,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (bpf_prog_array_valid(call)) {
 		u32 ret;

-		preempt_disable();
-		ret = trace_call_bpf(call, regs);
-		preempt_enable();
+		ret = uprobe_call_bpf(call, regs);
 		if (!ret)
 			return;
 	}
--
2.35.1

