All of lore.kernel.org
 help / color / mirror / Atom feed
From: Benjamin Tissoires <bentiss@kernel.org>
To: Alexei Starovoitov <ast@kernel.org>,
	 Daniel Borkmann <daniel@iogearbox.net>,
	Andrii Nakryiko <andrii@kernel.org>,
	 Martin KaFai Lau <martin.lau@linux.dev>,
	 Eduard Zingerman <eddyz87@gmail.com>, Song Liu <song@kernel.org>,
	 Yonghong Song <yonghong.song@linux.dev>,
	 John Fastabend <john.fastabend@gmail.com>,
	KP Singh <kpsingh@kernel.org>,
	 Stanislav Fomichev <sdf@google.com>, Hao Luo <haoluo@google.com>,
	 Jiri Olsa <jolsa@kernel.org>, Mykola Lysenko <mykolal@fb.com>,
	 Shuah Khan <shuah@kernel.org>
Cc: Benjamin Tissoires <bentiss@kernel.org>,
	bpf@vger.kernel.org,  linux-kernel@vger.kernel.org,
	linux-kselftest@vger.kernel.org
Subject: [PATCH bpf-next v4 1/6] bpf/helpers: introduce sleepable bpf_timers
Date: Fri, 15 Mar 2024 15:29:25 +0100	[thread overview]
Message-ID: <20240315-hid-bpf-sleepable-v4-1-5658f2540564@kernel.org> (raw)
In-Reply-To: <20240315-hid-bpf-sleepable-v4-0-5658f2540564@kernel.org>

They are implemented as a workqueue, which means that there are no
guarantees of timing nor ordering.

Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>

---
changes in v4:
- dropped __bpf_timer_compute_key()
- use a spin_lock instead of a semaphore
- ensure bpf_timer_cancel_and_free is not complaining about
  non sleepable context and use cancel_work() instead of
  cancel_work_sync()
- return -EINVAL if a delay is given to bpf_timer_start() with
  BPF_F_TIMER_SLEEPABLE

changes in v3:
- extracted the implementation in bpf_timer only, without
  bpf_timer_set_sleepable_cb()
- rely on schedule_work() only, from bpf_timer_start()
- add semaphore to ensure bpf_timer_work_cb() is accessing
  consistent data

changes in v2 (compared to the one attaches to v1 0/9):
- make use of a kfunc
- add a (non-used) BPF_F_TIMER_SLEEPABLE
- the callback is *not* called, it makes the kernel crashes
---
 include/uapi/linux/bpf.h |  4 +++
 kernel/bpf/helpers.c     | 86 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3c42b9f1bada..b90def29d796 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7461,10 +7461,14 @@ struct bpf_core_relo {
  *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
  *       relative to current time.
  *     - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller.
+ *     - BPF_F_TIMER_SLEEPABLE: Timer will run in a sleepable context, with
+ *       no guarantees of ordering nor timing (consider this as being just
+ *       offloaded immediately).
  */
 enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 	BPF_F_TIMER_CPU_PIN = (1ULL << 1),
+	BPF_F_TIMER_SLEEPABLE = (1ULL << 2),
 };
 
 /* BPF numbers iterator state */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a89587859571..38de73a9df83 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1094,14 +1094,20 @@ const struct bpf_func_proto bpf_snprintf_proto = {
  * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
  * Inner maps can contain bpf timers as well. ops->map_release_uref is
  * freeing the timers when inner map is replaced or deleted by user space.
+ *
+ * sleepable_lock protects only the setup of the workqueue, not the callback
+ * itself. This is done to ensure we don't run concurrently a free of the
+ * callback or the associated program.
  */
 struct bpf_hrtimer {
 	struct hrtimer timer;
+	struct work_struct work;
 	struct bpf_map *map;
 	struct bpf_prog *prog;
 	void __rcu *callback_fn;
 	void *value;
 	struct rcu_head rcu;
+	spinlock_t sleepable_lock;
 };
 
 /* the actual struct hidden inside uapi struct bpf_timer */
@@ -1114,6 +1120,49 @@ struct bpf_timer_kern {
 	struct bpf_spin_lock lock;
 } __attribute__((aligned(8)));
 
+static void bpf_timer_work_cb(struct work_struct *work)
+{
+	struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, work);
+	struct bpf_map *map = t->map;
+	bpf_callback_t callback_fn;
+	void *value = t->value;
+	unsigned long flags;
+	void *key;
+	u32 idx;
+
+	BTF_TYPE_EMIT(struct bpf_timer);
+
+	spin_lock_irqsave(&t->sleepable_lock, flags);
+
+	callback_fn = READ_ONCE(t->callback_fn);
+	if (!callback_fn) {
+		spin_unlock_irqrestore(&t->sleepable_lock, flags);
+		return;
+	}
+
+	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+		/* compute the key */
+		idx = ((char *)value - array->value) / array->elem_size;
+		key = &idx;
+	} else { /* hash or lru */
+		key = value - round_up(map->key_size, 8);
+	}
+
+	/* prevent the callback to be freed by bpf_timer_cancel() while running
+	 * so we can release the sleepable lock
+	 */
+	bpf_prog_inc(t->prog);
+
+	spin_unlock_irqrestore(&t->sleepable_lock, flags);
+
+	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+	/* The verifier checked that return value is zero. */
+
+	bpf_prog_put(t->prog);
+}
+
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
@@ -1192,6 +1241,8 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
 	t->prog = NULL;
 	rcu_assign_pointer(t->callback_fn, NULL);
 	hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+	INIT_WORK(&t->work, bpf_timer_work_cb);
+	spin_lock_init(&t->sleepable_lock);
 	t->timer.function = bpf_timer_cb;
 	WRITE_ONCE(timer->timer, t);
 	/* Guarantee the order between timer->timer and map->usercnt. So
@@ -1237,6 +1288,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
 		ret = -EINVAL;
 		goto out;
 	}
+	spin_lock(&t->sleepable_lock);
 	if (!atomic64_read(&t->map->usercnt)) {
 		/* maps with timers must be either held by user space
 		 * or pinned in bpffs. Otherwise timer might still be
@@ -1263,6 +1315,8 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
 	}
 	rcu_assign_pointer(t->callback_fn, callback_fn);
 out:
+	if (t)
+		spin_unlock(&t->sleepable_lock);
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	return ret;
 }
@@ -1283,8 +1337,12 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
-	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
+	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN | BPF_F_TIMER_SLEEPABLE))
 		return -EINVAL;
+
+	if ((flags & BPF_F_TIMER_SLEEPABLE) && nsecs)
+		return -EINVAL;
+
 	__bpf_spin_lock_irqsave(&timer->lock);
 	t = timer->timer;
 	if (!t || !t->prog) {
@@ -1300,7 +1358,10 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 	if (flags & BPF_F_TIMER_CPU_PIN)
 		mode |= HRTIMER_MODE_PINNED;
 
-	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
+	if (flags & BPF_F_TIMER_SLEEPABLE)
+		schedule_work(&t->work);
+	else
+		hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	return ret;
@@ -1348,13 +1409,22 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
 		ret = -EDEADLK;
 		goto out;
 	}
+	spin_lock(&t->sleepable_lock);
 	drop_prog_refcnt(t);
+	spin_unlock(&t->sleepable_lock);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	/* Cancel the timer and wait for associated callback to finish
 	 * if it was running.
 	 */
 	ret = ret ?: hrtimer_cancel(&t->timer);
+
+	/* also cancel the sleepable work, but *do not* wait for
+	 * it to finish if it was running as we might not be in a
+	 * sleepable context
+	 */
+	ret = ret ?: cancel_work(&t->work);
+
 	rcu_read_unlock();
 	return ret;
 }
@@ -1383,11 +1453,13 @@ void bpf_timer_cancel_and_free(void *val)
 	t = timer->timer;
 	if (!t)
 		goto out;
+	spin_lock(&t->sleepable_lock);
 	drop_prog_refcnt(t);
 	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
 	 * this timer, since it won't be initialized.
 	 */
 	WRITE_ONCE(timer->timer, NULL);
+	spin_unlock(&t->sleepable_lock);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	if (!t)
@@ -1410,6 +1482,16 @@ void bpf_timer_cancel_and_free(void *val)
 	 */
 	if (this_cpu_read(hrtimer_running) != t)
 		hrtimer_cancel(&t->timer);
+
+	/* also cancel the sleepable work, but *do not* wait for
+	 * it to finish if it was running as we might not be in a
+	 * sleepable context. Same reason as above, it's fine to
+	 * free 't': the subprog callback will never access it anymore
+	 * and can not reschedule itself since timer->timer = NULL was
+	 * already done.
+	 */
+	cancel_work(&t->work);
+
 	kfree_rcu(t, rcu);
 }
 

-- 
2.44.0


  reply	other threads:[~2024-03-15 14:29 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-15 14:29 [PATCH bpf-next v4 0/6] sleepable bpf_timer (was: allow HID-BPF to do device IOs) Benjamin Tissoires
2024-03-15 14:29 ` Benjamin Tissoires [this message]
2024-03-15 14:29 ` [PATCH bpf-next v4 2/6] bpf/verifier: add bpf_timer as a kfunc capable type Benjamin Tissoires
2024-03-18 21:53   ` Eduard Zingerman
2024-03-21 15:58     ` Benjamin Tissoires
2024-03-15 14:29 ` [PATCH bpf-next v4 3/6] bpf/helpers: introduce bpf_timer_set_sleepable_cb() kfunc Benjamin Tissoires
2024-03-18 22:52   ` Eduard Zingerman
2024-03-21 15:44     ` Benjamin Tissoires
2024-03-15 14:29 ` [PATCH bpf-next v4 4/6] bpf/helpers: mark the callback of bpf_timer_set_sleepable_cb() as sleepable Benjamin Tissoires
2024-03-18 23:54   ` Eduard Zingerman
2024-03-21 16:09     ` Benjamin Tissoires
2024-03-15 14:29 ` [PATCH bpf-next v4 5/6] tools: sync include/uapi/linux/bpf.h Benjamin Tissoires
2024-03-15 14:29 ` [PATCH bpf-next v4 6/6] selftests/bpf: add sleepable timer tests Benjamin Tissoires
2024-03-19  0:14   ` Eduard Zingerman
2024-03-21 15:45     ` Benjamin Tissoires

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240315-hid-bpf-sleepable-v4-1-5658f2540564@kernel.org \
    --to=bentiss@kernel.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=eddyz87@gmail.com \
    --cc=haoluo@google.com \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kpsingh@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=martin.lau@linux.dev \
    --cc=mykolal@fb.com \
    --cc=sdf@google.com \
    --cc=shuah@kernel.org \
    --cc=song@kernel.org \
    --cc=yonghong.song@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.