bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH bpf-next] bpf: Introduce bpf_timer
@ 2021-05-20 18:55 Alexei Starovoitov
  2021-05-21 14:38 ` Alexei Starovoitov
                   ` (3 more replies)
  0 siblings, 4 replies; 77+ messages in thread
From: Alexei Starovoitov @ 2021-05-20 18:55 UTC (permalink / raw)
  To: davem; +Cc: daniel, andrii, john.fastabend, lmb, netdev, bpf, kernel-team

From: Alexei Starovoitov <ast@kernel.org>

Introduce 'struct bpf_timer' that can be embedded in most BPF map types
and helpers to operate on it:
long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
long bpf_timer_del(struct bpf_timer *timer)

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
This is a work in progress, but it gives an idea of how the API will look.
---
 include/linux/bpf.h                           |   1 +
 include/uapi/linux/bpf.h                      |  25 ++++
 kernel/bpf/helpers.c                          | 106 +++++++++++++++++
 kernel/bpf/verifier.c                         | 110 ++++++++++++++++++
 kernel/trace/bpf_trace.c                      |   2 +-
 scripts/bpf_doc.py                            |   2 +
 tools/include/uapi/linux/bpf.h                |  25 ++++
 .../testing/selftests/bpf/prog_tests/timer.c  |  42 +++++++
 tools/testing/selftests/bpf/progs/timer.c     |  53 +++++++++
 9 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/timer.c
 create mode 100644 tools/testing/selftests/bpf/progs/timer.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dc44ba97584..18e09cc0c410 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -312,6 +312,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
 	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
 	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
+	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	__BPF_ARG_TYPE_MAX,
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 418b9b813d65..c95d7854d9fb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4761,6 +4761,24 @@ union bpf_attr {
  * 		Execute close syscall for given FD.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
+ *	Description
+ *		Initialize the timer to call given static function.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
+ *	Description
+ *		Set the timer expiration N msecs from the current time.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_del(struct bpf_timer *timer)
+ *	Description
+ *		Deactivate the timer.
+ *	Return
+ *		zero
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4932,6 +4950,9 @@ union bpf_attr {
 	FN(sys_bpf),			\
 	FN(btf_find_by_name_kind),	\
 	FN(sys_close),			\
+	FN(timer_init),			\
+	FN(timer_mod),			\
+	FN(timer_del),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6038,6 +6059,10 @@ struct bpf_spin_lock {
 	__u32	val;
 };
 
+struct bpf_timer {
+	__u64 opaque;
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 544773970dbc..8ef0ad23c991 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -985,6 +985,106 @@ const struct bpf_func_proto bpf_snprintf_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+struct bpf_timer_list {
+	struct timer_list tl;
+	struct bpf_map *map;
+	struct bpf_prog *prog;
+	void *callback_fn;
+	void *key;
+	void *value;
+};
+
+static void timer_cb(struct timer_list *timer)
+{
+	struct bpf_timer_list *tl = from_timer(tl, timer, tl);
+	struct bpf_map *map;
+	int ret;
+
+	ret = BPF_CAST_CALL(tl->callback_fn)((u64)(long)tl->map,
+					     (u64)(long)tl->key,
+					     (u64)(long)tl->value, 0, 0);
+	WARN_ON(ret != 0); /* todo: define 0 vs 1 or disallow 1 in the verifier */
+	bpf_prog_put(tl->prog);
+}
+
+BPF_CALL_5(bpf_timer_init, struct bpf_timer *, timer, void *, cb, int, flags,
+	   struct bpf_map *, map, struct bpf_prog *, prog)
+{
+	struct bpf_timer_list *tl;
+
+	if (timer->opaque)
+		return -EBUSY;
+	tl = kcalloc(1, sizeof(*tl), GFP_ATOMIC);
+	if (!tl)
+		return -ENOMEM;
+	tl->callback_fn = cb;
+	tl->value = (void *)timer /* - offset of bpf_timer inside elem */;
+	tl->key = tl->value - round_up(map->key_size, 8);
+	tl->map = map;
+	tl->prog = prog;
+	timer_setup(&tl->tl, timer_cb, 0);
+	timer->opaque = (long)tl;
+	return 0;
+}
+
+const struct bpf_func_proto bpf_timer_init_proto = {
+	.func		= bpf_timer_init,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_timer_mod, struct bpf_timer *, timer, u64, msecs)
+{
+	struct bpf_timer_list *tl;
+
+	tl = (struct bpf_timer_list *)timer->opaque;
+	if (!tl)
+		return -EINVAL;
+	/* (re)arm the timer to expire msecs milliseconds from now */
+	if (!mod_timer(&tl->tl, jiffies + msecs_to_jiffies(msecs))) {
+		/* The timer was inactive.
+		 * Keep the prog alive until callback is invoked
+		 */
+		bpf_prog_inc(tl->prog);
+	}
+	return 0;
+}
+
+const struct bpf_func_proto bpf_timer_mod_proto = {
+	.func		= bpf_timer_mod,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_timer_del, struct bpf_timer *, timer)
+{
+	struct bpf_timer_list *tl;
+
+	tl = (struct bpf_timer_list *)timer->opaque;
+	if (!tl)
+		return -EINVAL;
+	if (del_timer(&tl->tl)) {
+		/* The timer was active,
+		 * drop the prog refcnt, since callback
+		 * will not be invoked.
+		 */
+		bpf_prog_put(tl->prog);
+	}
+	return 0;
+}
+
+const struct bpf_func_proto bpf_timer_del_proto = {
+	.func		= bpf_timer_del,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1033,6 +1133,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_query_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
+	case BPF_FUNC_timer_init:
+		return &bpf_timer_init_proto;
+	case BPF_FUNC_timer_mod:
+		return &bpf_timer_mod_proto;
+	case BPF_FUNC_timer_del:
+		return &bpf_timer_del_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9189eecb26dd..606c713be60a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4656,6 +4656,35 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+			      struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (val) {
+		/* todo: relax this requirement */
+		verbose(env, "bpf_timer field can only be first in the map value element\n");
+		return -EINVAL;
+	}
+	WARN_ON(meta->map_ptr);
+	meta->map_ptr = map;
+	return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
 	return type == ARG_PTR_TO_MEM ||
@@ -4788,6 +4817,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
 static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -4819,6 +4849,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_FUNC]		= &func_ptr_types,
 	[ARG_PTR_TO_STACK_OR_NULL]	= &stack_ptr_types,
 	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
+	[ARG_PTR_TO_TIMER]		= &timer_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5000,6 +5031,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			verbose(env, "verifier internal error\n");
 			return -EFAULT;
 		}
+	} else if (arg_type == ARG_PTR_TO_TIMER) {
+		if (process_timer_func(env, regno, meta))
+			return -EACCES;
 	} else if (arg_type == ARG_PTR_TO_FUNC) {
 		meta->subprogno = reg->subprogno;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
@@ -5742,6 +5776,43 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_timer_init_callback_state(struct bpf_verifier_env *env,
+					 struct bpf_func_state *caller,
+					 struct bpf_func_state *callee,
+					 int insn_idx)
+{
+	struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+	struct bpf_map *map_ptr;
+
+	if (bpf_map_ptr_poisoned(insn_aux)) {
+		verbose(env, "bpf_timer_init abusing map_ptr\n");
+		return -EINVAL;
+	}
+
+	map_ptr = BPF_MAP_PTR(insn_aux->map_ptr_state);
+
+	/* bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags);
+	 * callback_fn(struct bpf_map *map, void *key, void *value);
+	 */
+	callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+	callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+	callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+	callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+	/* callback_fn takes only three arguments: R4 and R5 are unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -5837,6 +5908,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	    func_id != BPF_FUNC_map_pop_elem &&
 	    func_id != BPF_FUNC_map_peek_elem &&
 	    func_id != BPF_FUNC_for_each_map_elem &&
+	    func_id != BPF_FUNC_timer_init &&
 	    func_id != BPF_FUNC_redirect_map)
 		return 0;
 
@@ -6069,6 +6141,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_timer_init) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_timer_init_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	if (func_id == BPF_FUNC_snprintf) {
 		err = check_bpf_snprintf_call(env, regs);
 		if (err < 0)
@@ -12526,6 +12605,37 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			insn      = new_prog->insnsi + i + delta;
 			continue;
 		}
+		if (insn->imm == BPF_FUNC_timer_init) {
+
+			aux = &env->insn_aux_data[i + delta];
+			if (bpf_map_ptr_poisoned(aux)) {
+				verbose(env, "bpf_timer_init abusing map_ptr\n");
+				return -EINVAL;
+			}
+			map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+			{
+				struct bpf_insn ld_addrs[4] = {
+					BPF_LD_IMM64(BPF_REG_4, (long)map_ptr),
+					BPF_LD_IMM64(BPF_REG_5, (long)prog),
+				};
+
+				insn_buf[0] = ld_addrs[0];
+				insn_buf[1] = ld_addrs[1];
+				insn_buf[2] = ld_addrs[2];
+				insn_buf[3] = ld_addrs[3];
+			}
+			insn_buf[4] = *insn;
+			cnt = 5;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
 
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d2d7cf6cfe83..453a46c2d732 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1065,7 +1065,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_snprintf:
 		return &bpf_snprintf_proto;
 	default:
-		return NULL;
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 2d94025b38e9..00ac7b79cddb 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -547,6 +547,7 @@ COMMANDS
             'struct inode',
             'struct socket',
             'struct file',
+            'struct bpf_timer',
     ]
     known_types = {
             '...',
@@ -594,6 +595,7 @@ COMMANDS
             'struct inode',
             'struct socket',
             'struct file',
+            'struct bpf_timer',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 418b9b813d65..c95d7854d9fb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4761,6 +4761,24 @@ union bpf_attr {
  * 		Execute close syscall for given FD.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, void *callback, int flags)
+ *	Description
+ *		Initialize the timer to call given static function.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_mod(struct bpf_timer *timer, u64 msecs)
+ *	Description
+ *		Set the timer expiration N msecs from the current time.
+ *	Return
+ *		zero
+ *
+ * long bpf_timer_del(struct bpf_timer *timer)
+ *	Description
+ *		Deactivate the timer.
+ *	Return
+ *		zero
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4932,6 +4950,9 @@ union bpf_attr {
 	FN(sys_bpf),			\
 	FN(btf_find_by_name_kind),	\
 	FN(sys_close),			\
+	FN(timer_init),			\
+	FN(timer_mod),			\
+	FN(timer_del),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6038,6 +6059,10 @@ struct bpf_spin_lock {
 	__u32	val;
 };
 
+struct bpf_timer {
+	__u64 opaque;
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
new file mode 100644
index 000000000000..6b7a16a54e70
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include "timer.skel.h"
+
+static int timer(struct timer *timer_skel)
+{
+	int err, prog_fd;
+	__u32 duration = 0, retval;
+
+	err = timer__attach(timer_skel);
+	if (!ASSERT_OK(err, "timer_attach"))
+		return err;
+
+	prog_fd = bpf_program__fd(timer_skel->progs.test1);
+	err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+				NULL, NULL, &retval, &duration);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(retval, 0, "test_run");
+
+	ASSERT_EQ(timer_skel->data->callback_check, 52, "callback_check1");
+	usleep(50 * 1000); /* 10 msecs should be enough, but give it extra */
+	ASSERT_EQ(timer_skel->data->callback_check, 42, "callback_check2");
+
+	timer__detach(timer_skel);
+	return 0;
+}
+
+void test_timer(void)
+{
+	struct timer *timer_skel = NULL;
+	int err;
+
+	timer_skel = timer__open_and_load();
+	if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
+		goto cleanup;
+
+	err = timer(timer_skel);
+	ASSERT_OK(err, "timer");
+cleanup:
+	timer__destroy(timer_skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
new file mode 100644
index 000000000000..2cf0634f10c9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+struct map_elem {
+	struct bpf_timer timer;
+	int counter;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1000);
+	__type(key, int);
+	__type(value, struct map_elem);
+} hmap SEC(".maps");
+
+__u64 callback_check = 52;
+
+static int timer_cb(struct bpf_map *map, int *key, struct map_elem *val)
+{
+	callback_check--;
+	if (--val->counter)
+		/* re-arm the timer again to execute after 1 msec */
+		bpf_timer_mod(&val->timer, 1);
+	return 0;
+}
+
+int bpf_timer_test(void)
+{
+	struct map_elem *val;
+	int key = 0;
+
+	val = bpf_map_lookup_elem(&hmap, &key);
+	if (val) {
+		bpf_timer_init(&val->timer, timer_cb, 0);
+		bpf_timer_mod(&val->timer, 1);
+	}
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test1, int a)
+{
+	struct map_elem val = {};
+	int key = 0;
+
+	val.counter = 10; /* number of times to trigger timer_cb */
+	bpf_map_update_elem(&hmap, &key, &val, 0);
+	return bpf_timer_test();
+}
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 77+ messages in thread
* [RFC Patch bpf-next] bpf: introduce bpf timer
@ 2021-04-01  4:26 Cong Wang
  2021-04-01  6:38 ` Song Liu
  2021-04-02 19:28 ` Alexei Starovoitov
  0 siblings, 2 replies; 77+ messages in thread
From: Cong Wang @ 2021-04-01  4:26 UTC (permalink / raw)
  To: netdev
  Cc: bpf, duanxiongchun, wangdongdong.6, songmuchun, Cong Wang,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song

From: Cong Wang <cong.wang@bytedance.com>

(This patch is still at an early stage and obviously incomplete. I am sending
it out to get some high-level feedback. Please kindly ignore any coding
details for now and focus on the design.)

This patch introduces a bpf timer map and a syscall to create bpf timer
from user-space.

We have to use a map because of the lifetime of a timer: without a map, we
would have to delete the timer before exiting the eBPF program, which would
significantly limit its use cases. With a map, the timer can live as long as
the map itself and can also be updated via the map update APIs, where the
key is the timer ID and the value is the timer expiration time.

Timer creation is not easy either. In order to prevent users from creating a
timer but not adding it to a map, we have to enforce this in the API, which
takes a map parameter and adds the new timer into the map in one shot.

And because a timer is asynchronous, we cannot just use its callback like
bpf_for_each_map_elem(). More importantly, we have to properly reference
count its struct bpf_prog too. It seems impossible to do this either in the
verifier or in the JIT, so we have to make its callback code a separate eBPF
program and pass a program fd from user-space. Fortunately, the timer callback
can still live in the same object file as the rest of the eBPF code and share
data with it too.

Here is a quick demo of the timer callback code:

/* bpf_for_each_map_elem() callback: delete the element when the expiry
 * stored in *val (compared against bpf_jiffies64()) has passed, and count
 * each deletion in *data.
 */
static __u64
check_expired_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                  int *data)
{
  u64 expires = *val;

  if (expires < bpf_jiffies64()) {
    bpf_map_delete_elem(map, key);
    (*data)++; /* bump the caller's counter, not the local pointer */
  }
  return 0;
}

/* Timer program: walk the map with check_expired_elem(), which deletes
 * expired elements and counts them. The return value is the re-arm delay
 * in jiffies; 0 means the timer is not re-armed.
 */
SEC("timer")
u32 timer_callback(void)
{
  int count = 0;

  bpf_for_each_map_elem(&map, check_expired_elem, &count, 0);
  if (count)
     return 0; // not re-arm this timer
  else
     return 10; // reschedule this timer after 10 jiffies
}

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/bpf.h       |   2 +
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |  15 +++
 kernel/bpf/Makefile       |   2 +-
 kernel/bpf/syscall.c      |  16 +++
 kernel/bpf/timer.c        | 238 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c     |   6 +
 7 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/timer.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9fdd839b418c..196e8f2f8c12 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2078,4 +2078,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
+int bpf_timer_create(union bpf_attr *attr);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index f883f01a5061..9e3afd2dbfc6 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -133,3 +133,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 #ifdef CONFIG_NET
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_TIMER, timer_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 598716742593..627c0fbf9dac 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -841,6 +841,7 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TIMER_CREATE,
 };
 
 enum bpf_map_type {
@@ -874,6 +875,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_RINGBUF,
 	BPF_MAP_TYPE_INODE_STORAGE,
 	BPF_MAP_TYPE_TASK_STORAGE,
+	BPF_MAP_TYPE_TIMER,
 };
 
 /* Note that tracing related programs such as
@@ -916,6 +918,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
+	BPF_PROG_TYPE_TIMER,
 };
 
 enum bpf_attach_type {
@@ -1436,6 +1439,12 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TIMER_CREATE command */
+		__u32		map_fd;
+		__u32		prog_fd;
+		__u32		flags;		/* timer flags */
+	} timer_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -6013,4 +6022,10 @@ enum {
 	BTF_F_ZERO	=	(1ULL << 3),
 };
 
+/* bpf timer flags */
+enum {
+	BTF_TIMER_F_DEFERRABLE	= (1ULL << 0),
+	BTF_TIMER_F_PINNED	= (1ULL << 1),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7f33098ca63f..0215bfd1bcea 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o timer.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9603de81811a..f423f0688bd5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4350,6 +4350,19 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_TIMER_CREATE_LAST_FIELD timer_create.flags
+
+static int bpf_create_timer(union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_TIMER_CREATE))
+		return -EINVAL;
+
+	if (!bpf_capable())
+		return -EPERM;
+
+	return bpf_timer_create(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -4486,6 +4499,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
+	case BPF_TIMER_CREATE:
+		err = bpf_create_timer(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/timer.c b/kernel/bpf/timer.c
new file mode 100644
index 000000000000..0d7b5655e60a
--- /dev/null
+++ b/kernel/bpf/timer.c
@@ -0,0 +1,238 @@
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/filter.h>
+#include <uapi/linux/btf.h>
+
+struct bpf_timer_list {
+	struct timer_list timer;
+	struct bpf_prog *prog;
+	u64 expires;
+	s32 id;
+	struct rcu_head rcu;
+};
+
+struct bpf_timer_map {
+	struct bpf_map map;
+	struct idr timer_idr;
+	spinlock_t idr_lock;
+};
+
+static int timer_map_alloc_check(union bpf_attr *attr)
+{
+	if (attr->max_entries == 0 || attr->max_entries > INT_MAX ||
+	    attr->key_size != 4 || attr->value_size != 8)
+		return -EINVAL;
+
+	if (attr->map_flags & BPF_F_MMAPABLE)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct bpf_map *timer_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_timer_map *tmap;
+
+	tmap = kzalloc(sizeof(*tmap), GFP_USER | __GFP_ACCOUNT);
+	if (!tmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&tmap->map, attr);
+	spin_lock_init(&tmap->idr_lock);
+	idr_init(&tmap->timer_idr);
+	return &tmap->map;
+}
+
+static int bpf_timer_delete(int id, void *ptr, void *data)
+{
+	struct bpf_timer_list *t = ptr;
+
+	del_timer_sync(&t->timer);
+	kfree_rcu(t, rcu);
+	return 0;
+}
+
+static void timer_map_free(struct bpf_map *map)
+{
+	struct bpf_timer_map *tmap;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	idr_for_each(&tmap->timer_idr, bpf_timer_delete, NULL);
+
+	rcu_barrier();
+	idr_destroy(&tmap->timer_idr);
+}
+
+static void *timer_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_timer_map *tmap;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_list *t;
+	void *ret = NULL;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+
+	rcu_read_lock();
+	t = idr_find(&tmap->timer_idr, timer_id);
+	if (t) {
+		t->expires = t->timer.expires;
+		ret = &t->expires;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static int timer_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 flags)
+{
+	u64 expires = *(u64 *)value;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_map *tmap;
+	struct bpf_timer_list *t;
+	int ret = 0;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+
+	rcu_read_lock();
+	t = idr_find(&tmap->timer_idr, timer_id);
+	if (!t)
+		ret = -ENOENT;
+	else
+		mod_timer(&t->timer, (unsigned long)expires);
+	rcu_read_unlock();
+	return ret;
+}
+
+static int timer_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_timer_map *tmap;
+	s32 timer_id = *(s32 *)key;
+	struct bpf_timer_list *t;
+	unsigned long flags;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	spin_lock_irqsave(&tmap->idr_lock, flags);
+	t = idr_remove(&tmap->timer_idr, timer_id);
+	spin_unlock_irqrestore(&tmap->idr_lock, flags);
+	if (!t)
+		return -ENOENT;
+	del_timer_sync(&t->timer);
+	bpf_prog_put(t->prog);
+	kfree_rcu(t, rcu);
+	return 0;
+}
+
+static int timer_map_get_next_key(struct bpf_map *map, void *key,
+				    void *next_key)
+{
+	struct bpf_timer_map *tmap;
+	s32 next_id = key ? (s32)(*(u32 *)key + 1U) : 0; /* NULL key: first id */
+	int ret = 0;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	rcu_read_lock();
+	if (next_id < 0 || !idr_get_next(&tmap->timer_idr, &next_id))
+		ret = -ENOENT;
+	rcu_read_unlock();
+	*(s32 *)next_key = next_id;
+	return ret;
+}
+
+static int timer_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	return -ENOTSUPP;
+}
+
+static int timer_map_btf_id;
+const struct bpf_map_ops timer_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc_check = timer_map_alloc_check,
+	.map_alloc = timer_map_alloc,
+	.map_free = timer_map_free,
+	.map_mmap = timer_map_mmap,
+	.map_lookup_elem = timer_map_lookup_elem,
+	.map_update_elem = timer_map_update_elem,
+	.map_delete_elem = timer_map_delete_elem,
+	.map_get_next_key = timer_map_get_next_key,
+	.map_btf_name = "bpf_timer_map",
+	.map_btf_id = &timer_map_btf_id,
+};
+
+static void bpf_timer_callback(struct timer_list *t)
+{
+	struct bpf_timer_list *bt = from_timer(bt, t, timer);
+	u32 ret;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(bt->prog, NULL);
+	rcu_read_unlock();
+
+	if (ret)
+		mod_timer(&bt->timer, bt->timer.expires + ret);
+}
+
+int bpf_timer_create(union bpf_attr *attr)
+{
+	unsigned int flags, timer_flags = 0;
+	struct bpf_timer_map *tmap;
+	struct bpf_timer_list *t;
+	unsigned long irq_flags;
+	struct bpf_prog *prog;
+	struct bpf_map *map;
+	int ret = 0;
+
+	flags = attr->timer_create.flags;
+	if (flags & ~(BTF_TIMER_F_DEFERRABLE | BTF_TIMER_F_PINNED))
+		return -EINVAL;
+
+	prog = bpf_prog_get(attr->timer_create.prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+	if (prog->type != BPF_PROG_TYPE_TIMER) {
+		ret = -EINVAL;
+		goto out_prog_put;
+	}
+
+	map = bpf_map_get(attr->timer_create.map_fd);
+	if (IS_ERR(map)) {
+		ret = PTR_ERR(map);
+		goto out_prog_put;
+	}
+	if (map->map_type != BPF_MAP_TYPE_TIMER) {
+		ret = -EINVAL;
+		goto out_map_put;
+	}
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto out_map_put;
+	}
+
+	if (flags & BTF_TIMER_F_DEFERRABLE)
+		timer_flags |= TIMER_DEFERRABLE;
+	if (flags & BTF_TIMER_F_PINNED)
+		timer_flags |= TIMER_PINNED;
+	timer_setup(&t->timer, bpf_timer_callback, timer_flags);
+	t->prog = prog;
+
+	tmap = container_of(map, struct bpf_timer_map, map);
+	spin_lock_irqsave(&tmap->idr_lock, irq_flags);
+	ret = idr_alloc_cyclic(&tmap->timer_idr, t, 0, INT_MAX, GFP_ATOMIC);
+	spin_unlock_irqrestore(&tmap->idr_lock, irq_flags);
+	if (ret < 0)
+		kfree(t);
+	else
+		t->id = ret;
+
+out_map_put:
+	bpf_map_put(map);
+out_prog_put:
+	if (ret < 0)	/* ret >= 0 is the new timer id; t->prog keeps the ref */
+		bpf_prog_put(prog);
+	return ret;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 852541a435ef..ed0cbce8dc4f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5991,6 +5991,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_map_delete_elem &&
+	    env->prog->type == BPF_PROG_TYPE_TIMER) {
+		verbose(env, "bpf_map_delete_elem() can't be called in a timer program\n");
+		return -EINVAL;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 77+ messages in thread

end of thread, other threads:[~2021-08-11 21:03 UTC | newest]

Thread overview: 77+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-20 18:55 [RFC PATCH bpf-next] bpf: Introduce bpf_timer Alexei Starovoitov
2021-05-21 14:38 ` Alexei Starovoitov
2021-05-21 21:37 ` Cong Wang
2021-05-23 16:01   ` Alexei Starovoitov
2021-05-24  8:45     ` Lorenz Bauer
2021-05-25  3:16     ` Cong Wang
2021-05-25  4:59       ` Cong Wang
2021-05-25 18:21         ` Alexei Starovoitov
2021-05-25 19:35           ` Jamal Hadi Salim
2021-05-25 19:57             ` Alexei Starovoitov
2021-05-25 21:09               ` Jamal Hadi Salim
2021-05-25 22:08                 ` Alexei Starovoitov
2021-05-26 15:34                   ` Jamal Hadi Salim
2021-05-26 16:58                     ` Alexei Starovoitov
2021-05-26 18:25                       ` Jamal Hadi Salim
2021-05-30  6:36           ` Cong Wang
2021-06-02  2:00             ` Alexei Starovoitov
2021-06-02  8:48               ` Toke Høiland-Jørgensen
2021-06-02 17:54                 ` Martin KaFai Lau
2021-06-02 18:13                   ` Kumar Kartikeya Dwivedi
2021-06-02 18:26                     ` Alexei Starovoitov
2021-06-02 18:30                       ` Kumar Kartikeya Dwivedi
2021-06-02 18:46                     ` John Fastabend
2021-05-23 11:48 ` Toke Høiland-Jørgensen
2021-05-23 15:58   ` Alexei Starovoitov
2021-05-24  8:42     ` Lorenz Bauer
2021-05-24 14:48       ` Alexei Starovoitov
2021-05-24 17:33     ` Alexei Starovoitov
2021-05-24 18:39       ` Toke Høiland-Jørgensen
2021-05-24 18:38     ` Toke Høiland-Jørgensen
2021-05-24 11:49 ` Lorenz Bauer
2021-05-24 14:56   ` Alexei Starovoitov
2021-05-24 19:13     ` Andrii Nakryiko
2021-05-25  5:22       ` Cong Wang
2021-05-25 19:47         ` Andrii Nakryiko
  -- strict thread matches above, loose matches on Subject: below --
2021-04-01  4:26 [RFC Patch bpf-next] bpf: introduce bpf timer Cong Wang
2021-04-01  6:38 ` Song Liu
2021-04-01 17:28   ` Cong Wang
2021-04-01 20:17     ` Song Liu
2021-04-02 17:34       ` Cong Wang
2021-04-02 17:57         ` Song Liu
2021-04-02 19:08           ` Cong Wang
2021-04-02 19:43             ` Song Liu
2021-04-02 20:57               ` Cong Wang
2021-04-02 23:31                 ` Song Liu
2021-04-05 23:49                   ` Cong Wang
2021-04-06  1:07                     ` Song Liu
2021-04-06  1:24                       ` Cong Wang
2021-04-06  6:17                         ` Song Liu
2021-04-06 16:48                           ` Cong Wang
2021-04-06 23:36                             ` Song Liu
2021-04-08 22:45                               ` Cong Wang
2021-04-02 19:28 ` Alexei Starovoitov
2021-04-02 21:24   ` Cong Wang
2021-04-02 23:45     ` Alexei Starovoitov
2021-04-06  0:36       ` Cong Wang
2021-04-12 23:01         ` Alexei Starovoitov
2021-04-15  4:02           ` Cong Wang
2021-04-15  4:25             ` Alexei Starovoitov
2021-04-15 15:51               ` Cong Wang
2021-04-26 23:00               ` Cong Wang
2021-04-26 23:05                 ` Alexei Starovoitov
2021-04-26 23:37                   ` Cong Wang
2021-04-27  2:01                     ` Alexei Starovoitov
2021-04-27 11:52                       ` Jamal Hadi Salim
2021-04-27 16:36                       ` Cong Wang
2021-04-27 18:33                         ` Alexei Starovoitov
2021-05-09  5:37                           ` Cong Wang
2021-05-10 20:55                             ` Jamal Hadi Salim
2021-05-11 21:29                               ` Cong Wang
2021-05-12 22:56                                 ` Jamal Hadi Salim
2021-05-11  5:05                             ` Joe Stringer
2021-05-11 21:08                               ` Cong Wang
2021-05-12 22:43                               ` Jamal Hadi Salim
2021-05-13 18:45                                 ` Jamal Hadi Salim
2021-05-14  2:53                                   ` Cong Wang
2021-08-11 21:03                                     ` Joe Stringer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).