* [PATCH RFC bpf-next 2/3] bpf: add delayed_work mechanism
  2022-07-11 21:48 [PATCH RFC bpf-next 0/3] Execution context callbacks Delyan Kratunov
@ 2022-07-11 21:48 ` Delyan Kratunov
  2022-07-11 21:48 ` [PATCH RFC bpf-next 1/3] bpf: allow maps to hold bpf_delayed_work fields Delyan Kratunov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Delyan Kratunov @ 2022-07-11 21:48 UTC (permalink / raw)
  To: daniel, ast, andrii, bpf

Add a new helper function that can schedule a callback to execute in a
different context. Initially, only irq_work (i.e. hardirq) is supported.

A key consideration is that this needs to work from NMI context. To
that end, the llist node is pre-allocated inside bpf_delayed_work (i.e.
inside the map value) and submitted items are queued onto a per-program
llist that the program's irq_work callback drains. To avoid races on
the bpf_delayed_work items themselves, we implement a simple lock
scheme based on cmpxchg ordering.
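
The submit/execute handshake, condensed from bpf_delayed_work_submit()
and bpf_delayed_work_irq_work_cb() below (constant names abbreviated):

  /* submit path (any context, including NMI) */
  if (cmpxchg(&work->flags, FREE, CLAIMED) != FREE)
          return -EINVAL;                 /* item already in flight */
  work->data = data;
  work->cb = callback_fn;
  cmpxchg(&work->flags, CLAIMED, READY);  /* publish the filled-in item */
  llist_add(&work->item, &aux->irq_work->items);
  irq_work_queue(&aux->irq_work->work);

  /* irq_work callback (hardirq) */
  work_list = llist_del_all(&bpf_irq_work->items);
  llist_reverse_order(work_list);         /* preserve submission order */
  llist_for_each_entry_safe(work_item, next, work_list, item) {
          work_item->cb(work_item->data);
          xchg(&work_item->flags, FREE);  /* item is reusable again */
  }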

Signed-off-by: Delyan Kratunov <delyank@fb.com>
---
 include/linux/bpf.h            |  13 ++++
 include/uapi/linux/bpf.h       |  28 ++++++++
 kernel/bpf/core.c              |   8 +++
 kernel/bpf/helpers.c           |  92 ++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 123 ++++++++++++++++++++++++++++++++-
 scripts/bpf_doc.py             |   2 +
 tools/include/uapi/linux/bpf.h |  27 ++++++++
 7 files changed, 292 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ad9d2cfb0411..7325a9a2d10b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -27,6 +27,8 @@
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -460,6 +462,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	ARG_PTR_TO_KPTR,	/* pointer to referenced kptr */
 	ARG_PTR_TO_DYNPTR,      /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
+	ARG_PTR_TO_DELAYED_WORK,/* pointer to bpf_delayed_work */
 	__BPF_ARG_TYPE_MAX,
 
 	/* Extended arg_types. */
@@ -1101,6 +1104,9 @@ struct bpf_prog_aux {
 	u32 linfo_idx;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
+
+	/* initialized at load time if program uses delayed work helpers */
+	struct bpf_delayed_irq_work *irq_work;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -2526,4 +2532,11 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 int bpf_dynptr_check_size(u32 size);
 
+struct bpf_delayed_irq_work {
+	struct llist_head items;
+	struct irq_work work;
+	struct bpf_prog *prog;
+};
+void bpf_delayed_work_irq_work_cb(struct irq_work *work);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d68fc4f472f1..dc0587bbbe7c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5325,6 +5325,29 @@ union bpf_attr {
  *		**-EACCES** if the SYN cookie is not valid.
  *
  *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ *
+ * long bpf_delayed_work_submit(struct bpf_delayed_work *work, void *cb, void *data, int flags)
+ *     Description
+ *             Submit *cb* for execution in a different execution context.
+ *
+ *             *work* must be a member of a map value.
+ *
+ *             *cb* is the function to call.
+ *
+ *             *data* is the context passed as the sole argument to *cb*.
+ *             It must point into a map value or be NULL.
+ *
+ *             *flags* must be **BPF_DELAYED_WORK_IRQWORK**.
+ *     Return
+ *             0 when the work is successfully submitted.
+ *
+ *             **-EINVAL** if *cb* is NULL or if *work* is already in use.
+ *
+ *             **-EOPNOTSUPP** if *flags* is not **BPF_DELAYED_WORK_IRQWORK**,
+ *             or if called from an NMI handler on an architecture
+ *             without NMI-safe cmpxchg.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5535,6 +5558,7 @@ union bpf_attr {
 	FN(tcp_raw_gen_syncookie_ipv6),	\
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
+	FN(delayed_work_submit),        \
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6699,6 +6723,10 @@ struct bpf_delayed_work {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+enum {
+	BPF_DELAYED_WORK_IRQWORK = (1UL << 0),
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b5ffebcce6cc..1f5093f9442b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2567,6 +2567,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	int i;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+
+	/* We have already waited for a grace period of the appropriate RCU
+	 * variety, so we can expect no further submissions of work. Just wait for
+	 * the currently scheduled work to finish before releasing anything.
+	 */
+	if (aux->irq_work)
+		irq_work_sync(&aux->irq_work->work);
+
 #ifdef CONFIG_BPF_SYSCALL
 	bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
 #endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a1c84d256f83..731547d34c35 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,8 @@
 #include <linux/proc_ns.h>
 #include <linux/security.h>
 #include <linux/btf_ids.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -1575,6 +1577,94 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
 	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
 };
 
+struct bpf_delayed_work_kern {
+	struct llist_node item;
+	u64 flags; /* used as a lock field */
+	void (*cb)(void *);
+	void *data;
+} __aligned(8);
+
+#define BPF_DELAYED_WORK_FREE (0)
+#define BPF_DELAYED_WORK_CLAIMED (1)
+#define BPF_DELAYED_WORK_READY (2)
+
+void bpf_delayed_work_irq_work_cb(struct irq_work *work)
+{
+	struct bpf_delayed_irq_work *bpf_irq_work = container_of(work, struct bpf_delayed_irq_work, work);
+	struct bpf_delayed_work_kern *work_item, *next;
+	struct llist_node *work_list = llist_del_all(&bpf_irq_work->items);
+
+	/* Traverse in submission order to preserve ordering semantics */
+	llist_reverse_order(work_list);
+
+	llist_for_each_entry_safe(work_item, next, work_list, item) {
+		WARN_ONCE(work_item->flags != BPF_DELAYED_WORK_READY, "incomplete bpf_delayed_work found");
+
+		work_item->cb(work_item->data);
+
+		work_item->cb = work_item->data = NULL;
+		bpf_prog_put(bpf_irq_work->prog);
+		xchg(&work_item->flags, BPF_DELAYED_WORK_FREE);
+	}
+}
+
+BPF_CALL_5(bpf_delayed_work_submit, struct bpf_delayed_work_kern *, work,
+	   void *, callback_fn, void *, data, int, flags, struct bpf_prog_aux *, aux)
+{
+	u64 ret;
+	struct bpf_prog *prog;
+
+	BUILD_BUG_ON(sizeof(struct bpf_delayed_work_kern) > sizeof(struct bpf_delayed_work));
+	BUILD_BUG_ON(__alignof__(struct bpf_delayed_work_kern) != __alignof__(struct bpf_delayed_work));
+	BTF_TYPE_EMIT(struct bpf_delayed_work);
+
+	if (callback_fn == NULL)
+		return -EINVAL;
+
+	if (flags != BPF_DELAYED_WORK_IRQWORK)
+		return -EOPNOTSUPP;
+
+	if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi())
+		return -EOPNOTSUPP;
+
+	ret = cmpxchg(&work->flags, BPF_DELAYED_WORK_FREE, BPF_DELAYED_WORK_CLAIMED);
+	if (ret != 0)
+		return -EINVAL;
+
+	work->data = data;
+	work->cb = callback_fn;
+
+	ret = cmpxchg(&work->flags, BPF_DELAYED_WORK_CLAIMED, BPF_DELAYED_WORK_READY);
+	if (ret != BPF_DELAYED_WORK_CLAIMED) {
+		WARN_ONCE(ret != BPF_DELAYED_WORK_CLAIMED, "bpf_delayed_work item altered while claimed");
+		return -EINVAL;
+	}
+
+	/* Bump the ref count for every work item submitted by the program. */
+	prog = bpf_prog_inc_not_zero(aux->prog);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	llist_add(&work->item, &aux->irq_work->items);
+
+	/* It's okay if this prog's irq_work is already submitted,
+	 * it will walk the same list of callbacks anyway.
+	 */
+	(void) irq_work_queue(&aux->irq_work->work);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_delayed_work_submit_proto = {
+	.func		= bpf_delayed_work_submit,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_DELAYED_WORK,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_PTR_TO_MAP_VALUE, /* TODO: need ptr_to_map_value_mem */
+	.arg4_type	= ARG_ANYTHING,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1643,6 +1733,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_dynptr_write_proto;
 	case BPF_FUNC_dynptr_data:
 		return &bpf_dynptr_data_proto;
+	case BPF_FUNC_delayed_work_submit:
+		return &bpf_delayed_work_submit_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9fd311b7a1ff..212cbea5a382 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5490,6 +5490,55 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int process_delayed_work_func(struct bpf_verifier_env *env, int regno,
+			      struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_delayed_work has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env, "map '%s' has to have BTF in order to use bpf_delayed_work\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (!map_value_has_delayed_work(map)) {
+		if (map->delayed_work_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_delayed_work'\n",
+				map->name);
+		else if (map->delayed_work_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_delayed_work'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_delayed_work is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->delayed_work_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_delayed_work' that is at %d\n",
+			val + reg->off, map->delayed_work_off);
+		return -EINVAL;
+	}
+	if (meta->map_ptr) {
+		verbose(env, "verifier bug. Two map pointers in a delayed work helper\n");
+		return -EFAULT;
+	}
+
+	meta->map_uid = reg->map_uid;
+	meta->map_ptr = map;
+	return 0;
+}
+
 static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 			     struct bpf_call_arg_meta *meta)
 {
@@ -5677,6 +5726,7 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK }
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types delayed_work_types = { .types = { PTR_TO_MAP_VALUE } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -5704,6 +5754,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_TIMER]		= &timer_types,
 	[ARG_PTR_TO_KPTR]		= &kptr_types,
 	[ARG_PTR_TO_DYNPTR]		= &stack_ptr_types,
+	[ARG_PTR_TO_DELAYED_WORK]	= &delayed_work_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -6018,6 +6069,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	} else if (arg_type == ARG_PTR_TO_TIMER) {
 		if (process_timer_func(env, regno, meta))
 			return -EACCES;
+	} else if (arg_type == ARG_PTR_TO_DELAYED_WORK) {
+		if (process_delayed_work_func(env, regno, meta))
+			return -EACCES;
 	} else if (arg_type == ARG_PTR_TO_FUNC) {
 		meta->subprogno = reg->subprogno;
 	} else if (base_type(arg_type) == ARG_PTR_TO_MEM) {
@@ -6670,7 +6724,8 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	if (insn->code == (BPF_JMP | BPF_CALL) &&
 	    insn->src_reg == 0 &&
-	    insn->imm == BPF_FUNC_timer_set_callback) {
+	    (insn->imm == BPF_FUNC_timer_set_callback ||
+	     insn->imm == BPF_FUNC_delayed_work_submit)) {
 		struct bpf_verifier_state *async_cb;
 
 		/* there is no real recursion here. timer callbacks are async */
@@ -6898,6 +6953,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_delayed_work_callback_state(struct bpf_verifier_env *env,
+					   struct bpf_func_state *caller,
+					   struct bpf_func_state *callee,
+					   int insn_idx)
+{
+	/* bpf_delayed_work_submit(struct bpf_delayed_work *work,
+	 *  void *callback_fn, void *data, int flags);
+	 *
+	 * callback_fn(void *callback_ctx);
+	 */
+	callee->regs[BPF_REG_1].type = PTR_TO_MAP_VALUE;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+	callee->regs[BPF_REG_1].map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_2]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -7294,6 +7373,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				reg_type_str(env, regs[BPF_REG_1].type));
 			return -EACCES;
 		}
+		break;
+	case BPF_FUNC_delayed_work_submit:
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_delayed_work_callback_state);
+		break;
 	}
 
 	if (err)
@@ -7468,6 +7552,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
 		env->prog->call_get_stack = true;
 
+	if (func_id == BPF_FUNC_delayed_work_submit) {
+		struct bpf_delayed_irq_work *irq_work = kmalloc(
+			sizeof(struct bpf_delayed_irq_work), GFP_KERNEL);
+		if (!irq_work) {
+			verbose(env, "could not allocate irq_work\n");
+			return -ENOMEM;
+		}
+
+		init_llist_head(&irq_work->items);
+		irq_work->work = IRQ_WORK_INIT_HARD(&bpf_delayed_work_irq_work_cb);
+		irq_work->prog = env->prog;
+		env->prog->aux->irq_work = irq_work;
+	}
+
+
 	if (func_id == BPF_FUNC_get_func_ip) {
 		if (check_get_func_ip(env))
 			return -ENOTSUPP;
@@ -14061,6 +14160,28 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		if (insn->imm == BPF_FUNC_delayed_work_submit) {
+			/* add prog->aux as the hidden 5th arg to delayed_work_submit */
+			struct bpf_insn ld_addrs[2] = {
+				BPF_LD_IMM64(BPF_REG_5, (long)prog->aux),
+			};
+
+			insn_buf[0] = ld_addrs[0];
+			insn_buf[1] = ld_addrs[1];
+			insn_buf[2] = *insn;
+			cnt = 3;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
+
 		if (insn->imm == BPF_FUNC_task_storage_get ||
 		    insn->imm == BPF_FUNC_sk_storage_get ||
 		    insn->imm == BPF_FUNC_inode_storage_get) {
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index a0ec321469bd..0dd43dc9f388 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -637,6 +637,7 @@ class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct bpf_delayed_work',
     ]
     known_types = {
             '...',
@@ -690,6 +691,7 @@ class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct bpf_delayed_work',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d68fc4f472f1..461417159106 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5325,6 +5325,28 @@ union bpf_attr {
  *		**-EACCES** if the SYN cookie is not valid.
  *
  *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_delayed_work_submit(struct bpf_delayed_work *work, void *cb, void *data, int flags)
+ *     Description
+ *             Submit *cb* for execution in a different execution context.
+ *
+ *             *work* must be a member of a map value.
+ *
+ *             *cb* is the function to call.
+ *
+ *             *data* is the context passed as the sole argument to *cb*.
+ *             It must point into a map value or be NULL.
+ *
+ *             *flags* must be **BPF_DELAYED_WORK_IRQWORK**.
+ *     Return
+ *             0 when the work is successfully submitted.
+ *
+ *             **-EINVAL** if *cb* is NULL or if *work* is already in use.
+ *
+ *             **-EOPNOTSUPP** if *flags* is not **BPF_DELAYED_WORK_IRQWORK**,
+ *             or if called from an NMI handler on an architecture
+ *             without NMI-safe cmpxchg.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5535,6 +5557,7 @@ union bpf_attr {
 	FN(tcp_raw_gen_syncookie_ipv6),	\
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
+	FN(delayed_work_submit),        \
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6699,6 +6722,10 @@ struct bpf_delayed_work {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+enum {
+	BPF_DELAYED_WORK_IRQWORK = (1UL << 0),
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
-- 
2.36.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH RFC bpf-next 1/3] bpf: allow maps to hold bpf_delayed_work fields
  2022-07-11 21:48 [PATCH RFC bpf-next 0/3] Execution context callbacks Delyan Kratunov
  2022-07-11 21:48 ` [PATCH RFC bpf-next 2/3] bpf: add delayed_work mechanism Delyan Kratunov
@ 2022-07-11 21:48 ` Delyan Kratunov
  2022-07-14  4:23   ` Andrii Nakryiko
  2022-07-11 21:48 ` [PATCH RFC bpf-next 3/3] selftests: delayed_work tests Delyan Kratunov
  2022-07-12 18:07 ` [PATCH RFC bpf-next 0/3] Execution context callbacks sdf
  3 siblings, 1 reply; 13+ messages in thread
From: Delyan Kratunov @ 2022-07-11 21:48 UTC (permalink / raw)
  To: daniel, ast, andrii, bpf

Similarly to bpf_timer, bpf_delayed_work represents a callback that will
be executed at a later time, in a different execution context.

Its treatment in maps is practically the same as timers (to a degree
that perhaps calls for refactoring), except releasing the work does not
need to release any resources - we will wait for pending executions in
the program destruction path.
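
For illustration, a BPF-side map value carrying such a field looks
roughly like this (the layout is only an example; the field is located
via BTF, just like bpf_timer, and at most one is allowed per value):

  struct map_value {
          struct bpf_delayed_work work;
          __u64 payload;                    /* arbitrary program data */
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY); /* hash, LRU hash and array maps allowed */
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, struct map_value);
  } work_map SEC(".maps");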

Signed-off-by: Delyan Kratunov <delyank@fb.com>
---
 include/linux/bpf.h            |  9 ++++++++-
 include/linux/btf.h            |  1 +
 include/uapi/linux/bpf.h       |  8 ++++++++
 kernel/bpf/btf.c               | 21 +++++++++++++++++++++
 kernel/bpf/syscall.c           | 24 ++++++++++++++++++++++--
 kernel/bpf/verifier.c          |  9 +++++++++
 tools/include/uapi/linux/bpf.h |  8 ++++++++
 7 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0edd7d2c0064..ad9d2cfb0411 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -164,7 +164,8 @@ enum {
 	BPF_MAP_VALUE_OFF_MAX = 8,
 	BPF_MAP_OFF_ARR_MAX   = BPF_MAP_VALUE_OFF_MAX +
 				1 + /* for bpf_spin_lock */
-				1,  /* for bpf_timer */
+				1 + /* for bpf_timer */
+				1,  /* for bpf_delayed_work */
 };
 
 enum bpf_kptr_type {
@@ -212,6 +213,7 @@ struct bpf_map {
 	int spin_lock_off; /* >=0 valid offset, <0 error */
 	struct bpf_map_value_off *kptr_off_tab;
 	int timer_off; /* >=0 valid offset, <0 error */
+	int delayed_work_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
@@ -256,6 +258,11 @@ static inline bool map_value_has_timer(const struct bpf_map *map)
 	return map->timer_off >= 0;
 }
 
+static inline bool map_value_has_delayed_work(const struct bpf_map *map)
+{
+	return map->delayed_work_off >= 0;
+}
+
 static inline bool map_value_has_kptrs(const struct bpf_map *map)
 {
 	return !IS_ERR_OR_NULL(map->kptr_off_tab);
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1bfed7fa0428..2b8f473a6aa0 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -132,6 +132,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   u32 expected_offset, u32 expected_size);
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
 int btf_find_timer(const struct btf *btf, const struct btf_type *t);
+int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t);
 struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
 					  const struct btf_type *t);
 bool btf_type_is_void(const struct btf_type *t);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e81362891596..d68fc4f472f1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6691,6 +6691,14 @@ struct bpf_dynptr {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_delayed_work {
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f08037c31dd7..e4ab52cc25fe 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3196,6 +3196,7 @@ enum btf_field_type {
 	BTF_FIELD_SPIN_LOCK,
 	BTF_FIELD_TIMER,
 	BTF_FIELD_KPTR,
+	BTF_FIELD_DELAYED_WORK,
 };
 
 enum {
@@ -3283,6 +3284,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
 		switch (field_type) {
 		case BTF_FIELD_SPIN_LOCK:
 		case BTF_FIELD_TIMER:
+		case BTF_FIELD_DELAYED_WORK:
 			ret = btf_find_struct(btf, member_type, off, sz,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3333,6 +3335,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 		switch (field_type) {
 		case BTF_FIELD_SPIN_LOCK:
 		case BTF_FIELD_TIMER:
+		case BTF_FIELD_DELAYED_WORK:
 			ret = btf_find_struct(btf, var_type, off, sz,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3375,6 +3378,11 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 		sz = sizeof(struct bpf_timer);
 		align = __alignof__(struct bpf_timer);
 		break;
+	case BTF_FIELD_DELAYED_WORK:
+		name = "bpf_delayed_work";
+		sz = sizeof(struct bpf_delayed_work);
+		align = __alignof__(struct bpf_delayed_work);
+		break;
 	case BTF_FIELD_KPTR:
 		name = NULL;
 		sz = sizeof(u64);
@@ -3421,6 +3429,19 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t)
 	return info.off;
 }
 
+int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t)
+{
+	struct btf_field_info info;
+	int ret;
+
+	ret = btf_find_field(btf, t, BTF_FIELD_DELAYED_WORK, &info, 1);
+	if (ret < 0)
+		return ret;
+	if (!ret)
+		return -ENOENT;
+	return info.off;
+}
+
 struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
 					  const struct btf_type *t)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7d5af5b99f0d..041972305344 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -914,10 +914,11 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map)
 	bool has_spin_lock = map_value_has_spin_lock(map);
 	bool has_timer = map_value_has_timer(map);
 	bool has_kptrs = map_value_has_kptrs(map);
+	bool has_delayed_work = map_value_has_delayed_work(map);
 	struct bpf_map_off_arr *off_arr;
 	u32 i;
 
-	if (!has_spin_lock && !has_timer && !has_kptrs) {
+	if (!has_spin_lock && !has_timer && !has_kptrs && !has_delayed_work) {
 		map->off_arr = NULL;
 		return 0;
 	}
@@ -953,6 +954,13 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map)
 		}
 		off_arr->cnt += tab->nr_off;
 	}
+	if (has_delayed_work) {
+		i = off_arr->cnt;
+
+		off_arr->field_off[i] = map->delayed_work_off;
+		off_arr->field_sz[i] = sizeof(struct bpf_delayed_work);
+		off_arr->cnt++;
+	}
 
 	if (off_arr->cnt == 1)
 		return 0;
@@ -1014,6 +1022,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 			return -EOPNOTSUPP;
 	}
 
+	map->delayed_work_off = btf_find_delayed_work(btf, value_type);
+	if (map_value_has_delayed_work(map)) {
+		if (map->map_flags & BPF_F_RDONLY_PROG)
+			return -EACCES;
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -EOPNOTSUPP;
+	}
+
 	map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
 	if (map_value_has_kptrs(map)) {
 		if (!bpf_capable()) {
@@ -1095,6 +1113,7 @@ static int map_create(union bpf_attr *attr)
 
 	map->spin_lock_off = -EINVAL;
 	map->timer_off = -EINVAL;
+	map->delayed_work_off = -EINVAL;
 	if (attr->btf_key_type_id || attr->btf_value_type_id ||
 	    /* Even the map's value is a kernel's struct,
 	     * the bpf_prog.o must have BTF to begin with
@@ -1863,7 +1882,8 @@ static int map_freeze(const union bpf_attr *attr)
 		return PTR_ERR(map);
 
 	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
-	    map_value_has_timer(map) || map_value_has_kptrs(map)) {
+	    map_value_has_timer(map) || map_value_has_kptrs(map) ||
+	    map_value_has_delayed_work(map)) {
 		fdput(f);
 		return -ENOTSUPP;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2859901ffbe3..9fd311b7a1ff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3817,6 +3817,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 	}
+	if (map_value_has_delayed_work(map) && src == ACCESS_DIRECT) {
+		u32 t = map->delayed_work_off;
+
+		if (reg->smin_value + off < t + sizeof(struct bpf_delayed_work) &&
+		     t < reg->umax_value + off + size) {
+			verbose(env, "bpf_delayed_work cannot be accessed directly by load/store regno=%d off=%d\n", regno, off);
+			return -EACCES;
+		}
+	}
 	if (map_value_has_kptrs(map)) {
 		struct bpf_map_value_off *tab = map->kptr_off_tab;
 		int i;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e81362891596..d68fc4f472f1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6691,6 +6691,14 @@ struct bpf_dynptr {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_delayed_work {
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
-- 
2.36.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH RFC bpf-next 3/3] selftests: delayed_work tests
  2022-07-11 21:48 [PATCH RFC bpf-next 0/3] Execution context callbacks Delyan Kratunov
  2022-07-11 21:48 ` [PATCH RFC bpf-next 2/3] bpf: add delayed_work mechanism Delyan Kratunov
  2022-07-11 21:48 ` [PATCH RFC bpf-next 1/3] bpf: allow maps to hold bpf_delayed_work fields Delyan Kratunov
@ 2022-07-11 21:48 ` Delyan Kratunov
  2022-07-12 18:07 ` [PATCH RFC bpf-next 0/3] Execution context callbacks sdf
  3 siblings, 0 replies; 13+ messages in thread
From: Delyan Kratunov @ 2022-07-11 21:48 UTC (permalink / raw)
  To: daniel, ast, andrii, bpf

Basic tests; these will be developed further in later iterations.

Signed-off-by: Delyan Kratunov <delyank@fb.com>
---
 .../selftests/bpf/prog_tests/delayed_work.c   | 29 +++++++++
 .../selftests/bpf/progs/delayed_irqwork.c     | 59 +++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/delayed_work.c
 create mode 100644 tools/testing/selftests/bpf/progs/delayed_irqwork.c

diff --git a/tools/testing/selftests/bpf/prog_tests/delayed_work.c b/tools/testing/selftests/bpf/prog_tests/delayed_work.c
new file mode 100644
index 000000000000..80ed52c8f34c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/delayed_work.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "delayed_irqwork.skel.h"
+
+void test_delayed_work(void)
+{
+	int err;
+	struct delayed_irqwork *skel;
+
+	skel = delayed_irqwork__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	err = delayed_irqwork__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	err = delayed_irqwork__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	/* trigger tracepoint */
+	usleep(1000000);
+
+cleanup:
+	delayed_irqwork__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/delayed_irqwork.c b/tools/testing/selftests/bpf/progs/delayed_irqwork.c
new file mode 100644
index 000000000000..9fde66616681
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/delayed_irqwork.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include "vmlinux.h"
+
+#include "bpf_misc.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct map_value_type {
+	struct bpf_delayed_work work;
+	struct {
+		__u32 arg1;
+	} data;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 2);
+	__type(key, __u32);
+	__type(value, struct map_value_type);
+} map1 SEC(".maps");
+
+static __noinline int callback(void *args)
+{
+	struct map_value_type *val = args;
+
+	bpf_printk("callback: %p, %x", val, val->data.arg1);
+	return 0;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
+int delayed_irqwork(void *ctx)
+{
+	int ret;
+	__u32 key = 0;
+	struct map_value_type *value = (struct map_value_type *) bpf_map_lookup_elem(&map1, &key);
+
+	if (!value) {
+		bpf_printk("Could not get map value");
+		return 0;
+	}
+
+	value->data.arg1 = 0xcafe;
+	ret = bpf_delayed_work_submit(&value->work, callback, value, BPF_DELAYED_WORK_IRQWORK);
+
+	key = 1;
+	value = (struct map_value_type *) bpf_map_lookup_elem(&map1, &key);
+	if (!value) {
+		bpf_printk("Could not get map value");
+		return 0;
+	}
+	value->data.arg1 = 0xbeef;
+	ret = bpf_delayed_work_submit(&value->work, callback, value, BPF_DELAYED_WORK_IRQWORK);
+
+	return 0;
+}
-- 
2.36.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH RFC bpf-next 0/3] Execution context callbacks
@ 2022-07-11 21:48 Delyan Kratunov
  2022-07-11 21:48 ` [PATCH RFC bpf-next 2/3] bpf: add delayed_work mechanism Delyan Kratunov
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Delyan Kratunov @ 2022-07-11 21:48 UTC (permalink / raw)
  To: daniel, ast, andrii, bpf

BPF developers are sometimes faced with surprising limitations of the execution context
their code runs in. NMI is particularly problematic though userspace data access as a whole
has come up as well (e.g. build id not being available).

This series adds bpf_delayed_work_submit which takes a callback function and a context pointer
and is able to execute the callback from (initially) a hardirq context.

This is an RFC to answer a few questions on direction:

1. Naming is intentionally bad and something I'd like to bikeshed a bit.
"bpf_(defer|submit)_work" was my first instinct but that has workqueue connotations in the kernel.

2. The callback arguments need to be in a map. We can currently express helper arguments taking a
pointer to a map value but not a pointer to _within_ a map value. Should we add a new argument
type or should we just pass the map value pointer to the callback?

3. A lot of the map handling code is verbatim from bpf_timer. This feels icky but I'm not sure if it
justifies a refactor quite yet. Opinions welcome.

4. This functionality is implemented as a single helper call (no matching bpf_delayed_work_init). In practice,
this means that we can't implement the map->usercnt check that bpf_timer_start performs to ensure the
map is referenced from userspace. However, given that a) we wait for pending work before releasing the
bpf_prog, b) the map will be in the bpf_prog's used_maps, and c) the map free path does not need to release
any external resources, and d) the bpf_delayed_work items bump the prog refcnt, I think we can keep this mechanism
a single call.

I'd like to get this right from the start, so do let me know if I'm missing potential execution
contexts that we can't really wait to drain from the bpf_prog free path.

5. This mechanism generalizes to other contexts (e.g., sleepable context on the way back to userspace
a-la set_thread_flag(TIF_UPROBE)), by means of adding the bpf_delayed_work items to other llist_heads.
E.g., we can keep the llist_heads in task_local_storage or in per-cpu structures. I can't think of
anything that requires a more complicated approach (or reserved space in the structs) but do let me
know if I'm wrong.

6. Lastly, the llist approach was dictated by the NMI constraints. RCU lists are out because they need
to synchronize_rcu when splicing from one head to another.

Thanks,
Delyan

Delyan Kratunov (3):
  bpf: allow maps to hold bpf_delayed_work fields
  bpf: add delayed_work mechanism
  selftests: delayed_work tests

 include/linux/bpf.h                           |  22 ++-
 include/linux/btf.h                           |   1 +
 include/uapi/linux/bpf.h                      |  36 +++++
 kernel/bpf/btf.c                              |  21 +++
 kernel/bpf/core.c                             |   8 ++
 kernel/bpf/helpers.c                          |  92 ++++++++++++
 kernel/bpf/syscall.c                          |  24 +++-
 kernel/bpf/verifier.c                         | 132 +++++++++++++++++-
 scripts/bpf_doc.py                            |   2 +
 tools/include/uapi/linux/bpf.h                |  35 +++++
 .../selftests/bpf/prog_tests/delayed_work.c   |  29 ++++
 .../selftests/bpf/progs/delayed_irqwork.c     |  59 ++++++++
 12 files changed, 457 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/delayed_work.c
 create mode 100644 tools/testing/selftests/bpf/progs/delayed_irqwork.c

--
2.36.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-11 21:48 [PATCH RFC bpf-next 0/3] Execution context callbacks Delyan Kratunov
                   ` (2 preceding siblings ...)
  2022-07-11 21:48 ` [PATCH RFC bpf-next 3/3] selftests: delayed_work tests Delyan Kratunov
@ 2022-07-12 18:07 ` sdf
  2022-07-12 18:42   ` Delyan Kratunov
  3 siblings, 1 reply; 13+ messages in thread
From: sdf @ 2022-07-12 18:07 UTC (permalink / raw)
  To: Delyan Kratunov; +Cc: daniel, ast, andrii, bpf

On 07/11, Delyan Kratunov wrote:
> BPF developers are sometimes faced with surprising limitations of the  
> execution context
> their code runs in. NMI is particularly problematic though userspace data  
> access as a whole
> has come up as well (e.g. build id not being available).

> This series adds bpf_delayed_work_submit which takes a callback function  
> and a context pointer
> and is able to execute the callback from (initially) a hardirq context.

> This is an RFC to answer a few questions on direction:

> 1. Naming is intentionally bad and something I'd like to bikeshed a bit.
> "bpf_(defer|submit)_work" was my first instinct but that has workqueue  
> connotations in the kernel.

> 2. The callback arguments need to be in a map. We can currently express  
> helper arguments taking a
> pointer to a map value but not a pointer to _within_ a map value. Should  
> we add a new argument
> type or should we just pass the map value pointer to the callback?

Passing map value pointer (as you do in the selftest) seems fine; do
you think we need more flexibility here?

> 3. A lot of the map handling code is verbatim from bpf_timer. This feels  
> icky but I'm not sure if it
> justifies a refactor quite yet. Opinions welcome.

+1, it does seem very close to a timer with expiry time == 0.

I don't know what's the exact usecase you're trying to solve exactly, but
have you though of maybe initially supporting something like:

bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
bpf_timer_set_callback(&timer, cg);
bpf_timer_start(&timer, 0, 0);

If you init a timer with that special flag, I'm assuming you can have
special cases in the existing helpers to simulate the delayed work?
Then, the verifier changes should be minimal it seems.

OTOH, having a separate set of helpers seems more clear API-wise :-/

> 4. This functionality is implemented as a single helper call (no matching  
> bpf_delayed_work_init). In practice,
> this means that we can't implement the map->usercnt check that  
> bpf_timer_start performs to ensure the
> map is referenced from userspace. However, given that a) we wait for  
> pending work before releasing the
> bpf_prog, b) the map will be in the bpf_prog's used_maps, and c) the map  
> free path does not need to release
> any external resources, and d) the bpf_delayed_work items bump the prog  
> refcnt, I think we can keep this mechanism
> a single call.

> I'd like to get this right from the start, so do let me know if I'm  
> missing potential execution
> contexts that we can't really wait to drain from the bpf_prog free path.

> 5. This mechanism generalizes to other contexts (e.g., sleepable context  
> on the way back to userspace
> a-la set_thread_flag(TIF_UPROBE)), by means of adding the  
> bpf_delayed_work items to other llist_heads.
> E.g., we can keep the llist_heads in task_local_storage or in per-cpu  
> structures. I can't think of
> anything that requires a more complicated approach (or reserved space in  
> the structs) but do let me
> know if I'm wrong.

> 6. Lastly, the llist approach was dictated by the NMI constraints. RCU  
> lists are out because they need
> to synchronize_rcu when splicing from one head to another.

> Thanks,
> Delyan

> Delyan Kratunov (3):
>    bpf: allow maps to hold bpf_delayed_work fields
>    bpf: add delayed_work mechanism
>    selftests: delayed_work tests

>   include/linux/bpf.h                           |  22 ++-
>   include/linux/btf.h                           |   1 +
>   include/uapi/linux/bpf.h                      |  36 +++++
>   kernel/bpf/btf.c                              |  21 +++
>   kernel/bpf/core.c                             |   8 ++
>   kernel/bpf/helpers.c                          |  92 ++++++++++++
>   kernel/bpf/syscall.c                          |  24 +++-
>   kernel/bpf/verifier.c                         | 132 +++++++++++++++++-
>   scripts/bpf_doc.py                            |   2 +
>   tools/include/uapi/linux/bpf.h                |  35 +++++
>   .../selftests/bpf/prog_tests/delayed_work.c   |  29 ++++
>   .../selftests/bpf/progs/delayed_irqwork.c     |  59 ++++++++
>   12 files changed, 457 insertions(+), 4 deletions(-)
>   create mode 100644 tools/testing/selftests/bpf/prog_tests/delayed_work.c
>   create mode 100644 tools/testing/selftests/bpf/progs/delayed_irqwork.c

> --
> 2.36.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-12 18:07 ` [PATCH RFC bpf-next 0/3] Execution context callbacks sdf
@ 2022-07-12 18:42   ` Delyan Kratunov
  2022-07-12 22:51     ` sdf
  2022-07-15  1:51     ` Alexei Starovoitov
  0 siblings, 2 replies; 13+ messages in thread
From: Delyan Kratunov @ 2022-07-12 18:42 UTC (permalink / raw)
  To: sdf; +Cc: daniel, ast, andrii, bpf

Thanks for taking a look, Stanislav!

On Tue, 2022-07-12 at 11:07 -0700, sdf@google.com wrote:
> *snip*
> > 2. The callback arguments need to be in a map. We can currently express  
> > helper arguments taking a
> > pointer to a map value but not a pointer to _within_ a map value. Should  
> > we add a new argument
> > type or should we just pass the map value pointer to the callback?
> 
> Passing map value pointer (as you do in the selftest) seems fine; do
> you think we need more flexibility here?

I think it makes a cleaner and more familiar API - the pointer to my data that I give
to the submission function is the one I get in the callback. Requiring it to be a map
value is a little bit quirky (it's not really my data it's pointing to!). I don't
know if it's a lot of work in the verifier to iron out this quirk but if it's
reasonable, I'd be happy to make the developer experience a little more predictable.

> > 3. A lot of the map handling code is verbatim from bpf_timer. This feels  
> > icky but I'm not sure if it
> > justifies a refactor quite yet. Opinions welcome.
> 
> +1, it does seem very close to a timer with expiry time == 0.
> 
> I don't know what's the exact usecase you're trying to solve exactly,

The primary motivating examples are 1) GFP_ATOMIC usage is not safe in NMI aiui, so
switching allocations to hardirq helps and 2) copy_from_user in tracing programs (nmi
or softirq when using software clocks). The latter shows up in insidious ways like
build id not being reliable when retrieving stack traces ([1] is a thread from a
while ago about it).

> but have you though of maybe initially supporting something like:
> 
> bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
> bpf_timer_set_callback(&timer, cg);
> bpf_timer_start(&timer, 0, 0);
> 
> If you init a timer with that special flag, I'm assuming you can have
> special cases in the existing helpers to simulate the delayed work?

Potentially but I have some reservations about drawing this equivalence.

> Then, the verifier changes should be minimal it seems.
> 
> OTOH, having a separate set of helpers seems more clear API-wise :-/

The primary way this differs from timers is that timers already specify an execution
context - the callback will be called from a softirq. 

It doesn't make sense to me to have some "timers" (but only 0-delay, super-special
timers) run in hardirq or, more confusingly, user context. At that point, there's
little in the API to express these differences, (e.g., bpf_copy_from_user_task is
accessible in *this* callback) and the verifier work will be far more challenging (if
at all possible since the init and the set_callback would be split).

I think it's worth thinking about how to unify the handling of timer-like map value
members but I don't think it's worth it trying to shoehorn this functionality into
existing infra.

> *snip*

  [1]: https://lore.kernel.org/bpf/CA+khW7gh=vO8m-_SVnwWwj7kv+EDeUPcuWFqebf2Zmi9T_oEAQ@mail.gmail.com/


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-12 18:42   ` Delyan Kratunov
@ 2022-07-12 22:51     ` sdf
  2022-07-15  1:51     ` Alexei Starovoitov
  1 sibling, 0 replies; 13+ messages in thread
From: sdf @ 2022-07-12 22:51 UTC (permalink / raw)
  To: Delyan Kratunov; +Cc: daniel, ast, andrii, bpf

On 07/12, Delyan Kratunov wrote:
> Thanks for taking a look, Stanislav!

> On Tue, 2022-07-12 at 11:07 -0700, sdf@google.com wrote:
> > *snip*
> > > 2. The callback arguments need to be in a map. We can currently  
> express
> > > helper arguments taking a
> > > pointer to a map value but not a pointer to _within_ a map value.  
> Should
> > > we add a new argument
> > > type or should we just pass the map value pointer to the callback?
> >
> > Passing map value pointer (as you do in the selftest) seems fine; do
> > you think we need more flexibility here?

> I think it makes a cleaner and more familiar API - the pointer to my data  
> that I give
> to the submission function is the one I get in the callback. Requiring it  
> to be a map
> value is a little bit quirky (it's not really my data it's pointing to!).  
> I don't
> know if it's a lot of work in the verifier to iron out this quirk but if  
> it's
> reasonable, I'd be happy to make the developer experience a little more  
> predictable.

> > > 3. A lot of the map handling code is verbatim from bpf_timer. This  
> feels
> > > icky but I'm not sure if it
> > > justifies a refactor quite yet. Opinions welcome.
> >
> > +1, it does seem very close to a timer with expiry time == 0.
> >
> > I don't know what's the exact usecase you're trying to solve exactly,

> The primary motivating examples are 1) GFP_ATOMIC usage is not safe in  
> NMI aiui, so
> switching allocations to hardirq helps and 2) copy_from_user in tracing  
> programs (nmi
> or softirq when using software clocks). The latter shows up in insidious  
> ways like
> build id not being reliable when retrieving stack traces ([1] is a thread  
> from a
> while ago about it).

> > but have you though of maybe initially supporting something like:
> >
> > bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
> > bpf_timer_set_callback(&timer, cg);
> > bpf_timer_start(&timer, 0, 0);
> >
> > If you init a timer with that special flag, I'm assuming you can have
> > special cases in the existing helpers to simulate the delayed work?

> Potentially but I have some reservations about drawing this equivalence.

> > Then, the verifier changes should be minimal it seems.
> >
> > OTOH, having a separate set of helpers seems more clear API-wise :-/

> The primary way this differs from timers is that timers already specify  
> an execution
> context - the callback will be called from a softirq.�

> It doesn't make sense to me to have some "timers" (but only 0-delay,  
> super-special
> timers) run in hardirq or, more confusingly, user context. At that point,  
> there's
> little in the API to express these differences, (e.g.,  
> bpf_copy_from_user_task is
> accessible in *this* callback) and the verifier work will be far more  
> challenging (if
> at all possible since the init and the set_callback would be split).

> I think it's worth thinking about how to unify the handling of timer-like  
> map value
> members but I don't think it's worth it trying to shoehorn this  
> functionality into
> existing infra.

> > *snip*

>    [1]:  
> https://lore.kernel.org/bpf/CA+khW7gh=vO8m-_SVnwWwj7kv+EDeUPcuWFqebf2Zmi9T_oEAQ@mail.gmail.com/


All valid points. I'm assuming Alexei will take a closer look at this
eventually since I don't have a ton of context about timers :-(

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 1/3] bpf: allow maps to hold bpf_delayed_work fields
  2022-07-11 21:48 ` [PATCH RFC bpf-next 1/3] bpf: allow maps to hold bpf_delayed_work fields Delyan Kratunov
@ 2022-07-14  4:23   ` Andrii Nakryiko
  0 siblings, 0 replies; 13+ messages in thread
From: Andrii Nakryiko @ 2022-07-14  4:23 UTC (permalink / raw)
  To: Delyan Kratunov; +Cc: daniel, ast, andrii, bpf

On Mon, Jul 11, 2022 at 2:48 PM Delyan Kratunov <delyank@fb.com> wrote:
>
> Similarly to bpf_timer, bpf_delayed_work represents a callback that will
> be executed at a later time, in a different execution context.
>
> Its treatment in maps is practically the same as timers (to a degree
> that perhaps calls for refactoring), except releasing the work does not
> need to release any resources - we will wait for pending executions in
> the program destruction path.
>
> Signed-off-by: Delyan Kratunov <delyank@fb.com>
> ---
>  include/linux/bpf.h            |  9 ++++++++-
>  include/linux/btf.h            |  1 +
>  include/uapi/linux/bpf.h       |  8 ++++++++
>  kernel/bpf/btf.c               | 21 +++++++++++++++++++++
>  kernel/bpf/syscall.c           | 24 ++++++++++++++++++++++--
>  kernel/bpf/verifier.c          |  9 +++++++++
>  tools/include/uapi/linux/bpf.h |  8 ++++++++
>  7 files changed, 77 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0edd7d2c0064..ad9d2cfb0411 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -164,7 +164,8 @@ enum {
>         BPF_MAP_VALUE_OFF_MAX = 8,
>         BPF_MAP_OFF_ARR_MAX   = BPF_MAP_VALUE_OFF_MAX +
>                                 1 + /* for bpf_spin_lock */
> -                               1,  /* for bpf_timer */
> +                               1 + /* for bpf_timer */
> +                               1,  /* for bpf_delayed_work */
>  };
>
>  enum bpf_kptr_type {
> @@ -212,6 +213,7 @@ struct bpf_map {
>         int spin_lock_off; /* >=0 valid offset, <0 error */
>         struct bpf_map_value_off *kptr_off_tab;
>         int timer_off; /* >=0 valid offset, <0 error */
> +       int delayed_work_off; /* >=0 valid offset, <0 error */
>         u32 id;
>         int numa_node;
>         u32 btf_key_type_id;
> @@ -256,6 +258,11 @@ static inline bool map_value_has_timer(const struct bpf_map *map)
>         return map->timer_off >= 0;
>  }
>
> +static inline bool map_value_has_delayed_work(const struct bpf_map *map)
> +{
> +       return map->delayed_work_off >= 0;
> +}
> +
>  static inline bool map_value_has_kptrs(const struct bpf_map *map)
>  {
>         return !IS_ERR_OR_NULL(map->kptr_off_tab);
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index 1bfed7fa0428..2b8f473a6aa0 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -132,6 +132,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
>                            u32 expected_offset, u32 expected_size);
>  int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
>  int btf_find_timer(const struct btf *btf, const struct btf_type *t);
> +int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t);
>  struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
>                                           const struct btf_type *t);
>  bool btf_type_is_void(const struct btf_type *t);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e81362891596..d68fc4f472f1 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6691,6 +6691,14 @@ struct bpf_dynptr {
>         __u64 :64;
>  } __attribute__((aligned(8)));
>
> +struct bpf_delayed_work {
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +} __attribute__((aligned(8)));
> +
>  struct bpf_sysctl {
>         __u32   write;          /* Sysctl is being read (= 0) or written (= 1).
>                                  * Allows 1,2,4-byte read, but no write.
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index f08037c31dd7..e4ab52cc25fe 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3196,6 +3196,7 @@ enum btf_field_type {
>         BTF_FIELD_SPIN_LOCK,
>         BTF_FIELD_TIMER,
>         BTF_FIELD_KPTR,
> +       BTF_FIELD_DELAYED_WORK,
>  };
>
>  enum {
> @@ -3283,6 +3284,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
>                 switch (field_type) {
>                 case BTF_FIELD_SPIN_LOCK:
>                 case BTF_FIELD_TIMER:
> +               case BTF_FIELD_DELAYED_WORK:
>                         ret = btf_find_struct(btf, member_type, off, sz,
>                                               idx < info_cnt ? &info[idx] : &tmp);
>                         if (ret < 0)
> @@ -3333,6 +3335,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
>                 switch (field_type) {
>                 case BTF_FIELD_SPIN_LOCK:
>                 case BTF_FIELD_TIMER:
> +               case BTF_FIELD_DELAYED_WORK:
>                         ret = btf_find_struct(btf, var_type, off, sz,
>                                               idx < info_cnt ? &info[idx] : &tmp);
>                         if (ret < 0)
> @@ -3375,6 +3378,11 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
>                 sz = sizeof(struct bpf_timer);
>                 align = __alignof__(struct bpf_timer);
>                 break;
> +       case BTF_FIELD_DELAYED_WORK:
> +               name = "bpf_delayed_work";
> +               sz = sizeof(struct bpf_delayed_work);
> +               align = __alignof__(struct bpf_delayed_work);
> +               break;
>         case BTF_FIELD_KPTR:
>                 name = NULL;
>                 sz = sizeof(u64);
> @@ -3421,6 +3429,19 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t)
>         return info.off;
>  }
>
> +int btf_find_delayed_work(const struct btf *btf, const struct btf_type *t)
> +{
> +       struct btf_field_info info;
> +       int ret;
> +
> +       ret = btf_find_field(btf, t, BTF_FIELD_DELAYED_WORK, &info, 1);
> +       if (ret < 0)
> +               return ret;
> +       if (!ret)
> +               return -ENOENT;
> +       return info.off;
> +}
> +
>  struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
>                                           const struct btf_type *t)
>  {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 7d5af5b99f0d..041972305344 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -914,10 +914,11 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map)
>         bool has_spin_lock = map_value_has_spin_lock(map);
>         bool has_timer = map_value_has_timer(map);
>         bool has_kptrs = map_value_has_kptrs(map);
> +       bool has_delayed_work = map_value_has_delayed_work(map);
>         struct bpf_map_off_arr *off_arr;
>         u32 i;
>
> -       if (!has_spin_lock && !has_timer && !has_kptrs) {
> +       if (!has_spin_lock && !has_timer && !has_kptrs && !has_delayed_work) {
>                 map->off_arr = NULL;
>                 return 0;
>         }
> @@ -953,6 +954,13 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map)
>                 }
>                 off_arr->cnt += tab->nr_off;
>         }
> +       if (has_delayed_work) {
> +               i = off_arr->cnt;
> +
> +               off_arr->field_off[i] = map->delayed_work_off;
> +               off_arr->field_sz[i] = sizeof(struct bpf_delayed_work);
> +               off_arr->cnt++;
> +       }
>
>         if (off_arr->cnt == 1)
>                 return 0;
> @@ -1014,6 +1022,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
>                         return -EOPNOTSUPP;
>         }
>
> +       map->delayed_work_off = btf_find_delayed_work(btf, value_type);
> +       if (map_value_has_delayed_work(map)) {
> +               if (map->map_flags & BPF_F_RDONLY_PROG)
> +                       return -EACCES;
> +               if (map->map_type != BPF_MAP_TYPE_HASH &&
> +                   map->map_type != BPF_MAP_TYPE_LRU_HASH &&
> +                   map->map_type != BPF_MAP_TYPE_ARRAY)
> +                       return -EOPNOTSUPP;
> +       }
> +
>         map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
>         if (map_value_has_kptrs(map)) {
>                 if (!bpf_capable()) {
> @@ -1095,6 +1113,7 @@ static int map_create(union bpf_attr *attr)
>
>         map->spin_lock_off = -EINVAL;
>         map->timer_off = -EINVAL;
> +       map->delayed_work_off = -EINVAL;
>         if (attr->btf_key_type_id || attr->btf_value_type_id ||
>             /* Even the map's value is a kernel's struct,
>              * the bpf_prog.o must have BTF to begin with
> @@ -1863,7 +1882,8 @@ static int map_freeze(const union bpf_attr *attr)
>                 return PTR_ERR(map);
>
>         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
> -           map_value_has_timer(map) || map_value_has_kptrs(map)) {
> +           map_value_has_timer(map) || map_value_has_kptrs(map) ||
> +           map_value_has_delayed_work(map)) {

not introduced by you, but shouldn't this check also check
map_value_has_spinlock()?

>                 fdput(f);
>                 return -ENOTSUPP;
>         }

Also check if you need to modify bpf_map_mmap?


> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2859901ffbe3..9fd311b7a1ff 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3817,6 +3817,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
>                         return -EACCES;
>                 }
>         }
> +       if (map_value_has_delayed_work(map) && src == ACCESS_DIRECT) {
> +               u32 t = map->delayed_work_off;
> +
> +               if (reg->smin_value + off < t + sizeof(struct bpf_delayed_work) &&
> +                    t < reg->umax_value + off + size) {
> +                       verbose(env, "bpf_delayed_work cannot be accessed directly by load/store regno=%d off=%d\n", regno, off);
> +                       return -EACCES;
> +               }
> +       }
>         if (map_value_has_kptrs(map)) {
>                 struct bpf_map_value_off *tab = map->kptr_off_tab;
>                 int i;
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index e81362891596..d68fc4f472f1 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -6691,6 +6691,14 @@ struct bpf_dynptr {
>         __u64 :64;
>  } __attribute__((aligned(8)));
>
> +struct bpf_delayed_work {
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +} __attribute__((aligned(8)));
> +
>  struct bpf_sysctl {
>         __u32   write;          /* Sysctl is being read (= 0) or written (= 1).
>                                  * Allows 1,2,4-byte read, but no write.
> --
> 2.36.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-12 18:42   ` Delyan Kratunov
  2022-07-12 22:51     ` sdf
@ 2022-07-15  1:51     ` Alexei Starovoitov
  2022-07-15 18:28       ` Delyan Kratunov
  1 sibling, 1 reply; 13+ messages in thread
From: Alexei Starovoitov @ 2022-07-15  1:51 UTC (permalink / raw)
  To: Delyan Kratunov; +Cc: sdf, daniel, ast, andrii, bpf

On Tue, Jul 12, 2022 at 06:42:52PM +0000, Delyan Kratunov wrote:
> 
> > but have you though of maybe initially supporting something like:
> > 
> > bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
> > bpf_timer_set_callback(&timer, cg);
> > bpf_timer_start(&timer, 0, 0);
> > 
> > If you init a timer with that special flag, I'm assuming you can have
> > special cases in the existing helpers to simulate the delayed work?
> 
> Potentially but I have some reservations about drawing this equivalence.

hrtimer api has various flags: soft vs hard irq, pinned and not.
So the suggestion to treat irq_work callback as special timer flag
actually fits well.

bpf_timer_init + set_callback + start can be a static inline function
named bpf_work_submit() in bpf_helpers.h
(or some new file that will mark the beginning libc-bpf library).
Reusing struct bpf_timer and adding zero-delay callback could probably be
easier for users to learn and consume.
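
Something along these lines as a sketch (BPF_F_TIMER_IRQ_WORK is a made-up
flag name just to illustrate; only the three existing timer helpers are
assumed, and re-init handling is ignored):

static inline int bpf_work_submit(struct bpf_timer *timer, void *map, void *cb)
{
	int err;

	/* hypothetical flag selecting the irq_work execution context */
	err = bpf_timer_init(timer, map, BPF_F_TIMER_IRQ_WORK);
	if (err)
		return err;
	err = bpf_timer_set_callback(timer, cb);
	if (err)
		return err;
	/* zero delay: run the callback as soon as the context allows */
	return bpf_timer_start(timer, 0, 0);
}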

Separately:
+struct bpf_delayed_work {
+       __u64 :64;
+       __u64 :64;
+       __u64 :64;
+       __u64 :64;
+       __u64 :64;
+} __attribute__((aligned(8)));
is not extensible.
It would be better to add indirection to allow kernel side to grow
independently from amount of space consumed in a map value.
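
I.e. keep only an opaque, pointer-sized handle in the map value and let the
kernel allocate the real state behind it on first use, roughly (the
kernel-side struct name and fields below are made up for illustration):

/* what the map value carries: fixed size, never has to change */
struct bpf_delayed_work {
	__u64 :64;	/* internally holds a kernel pointer */
} __attribute__((aligned(8)));

/* kernel-side state can then grow without touching the uapi struct */
struct bpf_delayed_work_kern {
	struct llist_node node;
	void *callback;
	void *data;
	/* future fields get appended here */
};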

Can you think of a way to make irq_work/sleepable callback independent of maps?
Assume bpf_mem_alloc is already available and NMI prog can allocate a typed object.
The usage could be:
struct my_work {
  int a;
  struct task_struct __kptr_ref *t;
};
void my_cb(struct my_work *w);

struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
w->t = ..;
bpf_submit_work(w, my_cb, SLEEPABLE | IRQ_WORK);

Am I day dreaming? :)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-15  1:51     ` Alexei Starovoitov
@ 2022-07-15 18:28       ` Delyan Kratunov
  2022-07-19 19:02         ` Alexei Starovoitov
  0 siblings, 1 reply; 13+ messages in thread
From: Delyan Kratunov @ 2022-07-15 18:28 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: daniel, sdf, ast, andrii, bpf

On Thu, 2022-07-14 at 18:51 -0700, Alexei Starovoitov wrote:
> On Tue, Jul 12, 2022 at 06:42:52PM +0000, Delyan Kratunov wrote:
> > 
> > > but have you though of maybe initially supporting something like:
> > > 
> > > bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
> > > bpf_timer_set_callback(&timer, cg);
> > > bpf_timer_start(&timer, 0, 0);
> > > 
> > > If you init a timer with that special flag, I'm assuming you can have
> > > special cases in the existing helpers to simulate the delayed work?
> > 
> > Potentially but I have some reservations about drawing this equivalence.
> 
> hrtimer api has various flags: soft vs hard irq, pinned and not.
> So the suggestion to treat irq_work callback as special timer flag
> actually fits well.
> 
> bpf_timer_init + set_callback + start can be a static inline function
> named bpf_work_submit() in bpf_helpers.h
> (or some new file that will mark the beginning libc-bpf library).
> Reusing struct bpf_timer and adding zero-delay callback could probably be
> easier for users to learn and consume.

To clarify, we're talking about 1) making bpf_timer nmi-safe for _some_ but not all
combinations of parameters and 2) adding new flags to specify an execution context?
It's achievable but it's hard to see how it's the superior solution here.

> 
> Separately:
> +struct bpf_delayed_work {
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +       __u64 :64;
> +} __attribute__((aligned(8)));
> is not extensible.
> It would be better to add indirection to allow kernel side to grow
> independently from amount of space consumed in a map value.

Fair point, I was wondering what to do with it - storing just a pointer sounds
reasonable.

> Can you think of a way to make irq_work/sleepable callback independent of maps?
> Assume bpf_mem_alloc is already available and NMI prog can allocate a typed object.
> The usage could be:
> struct my_work {
>   int a;
>   struct task_struct __kptr_ref *t;
> };
> void my_cb(struct my_work *w);
> 
> struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
> w->t = ..;
> bpf_submit_work(w, my_cb, SLEEPABLE | IRQ_WORK);
> 
> Am I day dreaming? :)

Nothing wrong with dreaming of a better future :) 

(I'm assuming you're thinking of bpf_mem_alloc being fronted by the allocator you
recently sent to the list.)

On a first pass, here are my concerns:

A program and its maps can guarantee a certain amount of storage for work items.
Sizing that storage is difficult but it is yours alone to use. The freelist allocator
can be transiently drained by other programs and starve you of this utility. This is
a new failure mode, so it's worth talking about.

With a generic allocator mechanism, we'll have a hard time enforcing the can't-load-
or-store-into-special-fields logic. I like that guardrail and I'm not sure how we'd
achieve the same guarantees. (In your snippet, we don't have the llist_node on the
work item - do we wrap my_work into something else internally? That would hide the
fields that need protecting at the expense of an extra bpf_mem_alloc allocation.)

Managing the storage returned from bpf_mem_alloc is of course also a concern. We'd
need to treat bpf_submit_work as "releasing" it (really, taking ownership). This path
means more lifecycle analysis in the verifier and explicit and implicit free()s.

I'm not opposed to it overall - the developer experience is very familiar - but I am
primarily worried that allocator failures will be in the same category of issues as
the hash map collisions for stacks. If you want reliability, you just don't use that
type of map - what's the alternative in this hypothetical bpf_mem_alloc future?

-- Delyan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-15 18:28       ` Delyan Kratunov
@ 2022-07-19 19:02         ` Alexei Starovoitov
  2022-07-19 22:12           ` Delyan Kratunov
  0 siblings, 1 reply; 13+ messages in thread
From: Alexei Starovoitov @ 2022-07-19 19:02 UTC (permalink / raw)
  To: Delyan Kratunov; +Cc: daniel, sdf, ast, andrii, bpf

On Fri, Jul 15, 2022 at 06:28:20PM +0000, Delyan Kratunov wrote:
> On Thu, 2022-07-14 at 18:51 -0700, Alexei Starovoitov wrote:
> > On Tue, Jul 12, 2022 at 06:42:52PM +0000, Delyan Kratunov wrote:
> > > 
> > > > but have you though of maybe initially supporting something like:
> > > > 
> > > > bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
> > > > bpf_timer_set_callback(&timer, cg);
> > > > bpf_timer_start(&timer, 0, 0);
> > > > 
> > > > If you init a timer with that special flag, I'm assuming you can have
> > > > special cases in the existing helpers to simulate the delayed work?
> > > 
> > > Potentially but I have some reservations about drawing this equivalence.
> > 
> > hrtimer api has various flags: soft vs hard irq, pinned and not.
> > So the suggestion to treat irq_work callback as special timer flag
> > actually fits well.
> > 
> > bpf_timer_init + set_callback + start can be a static inline function
> > named bpf_work_submit() in bpf_helpers.h
> > (or some new file that will mark the beginning libc-bpf library).
> > Reusing struct bpf_timer and adding zero-delay callback could probably be
> > easier for users to learn and consume.
> 
> To clarify, we're talking about 1) making bpf_timer nmi-safe for _some_ but not all
> combinations of parameters and 2) adding new flags to specify an execution context?
> It's achievable but it's hard to see how it's the superior solution here.
> 
> > 
> > Separately:
> > +struct bpf_delayed_work {
> > +       __u64 :64;
> > +       __u64 :64;
> > +       __u64 :64;
> > +       __u64 :64;
> > +       __u64 :64;
> > +} __attribute__((aligned(8)));
> > is not extensible.
> > It would be better to add indirection to allow kernel side to grow
> > independently from amount of space consumed in a map value.
> 
> Fair point, I was wondering what to do with it - storing just a pointer sounds
> reasonable.
> 
> > Can you think of a way to make irq_work/sleepable callback independent of maps?
> > Assume bpf_mem_alloc is already available and NMI prog can allocate a typed object.
> > The usage could be:
> > struct my_work {
> >   int a;
> >   struct task_struct __kptr_ref *t;
> > };
> > void my_cb(struct my_work *w);
> > 
> > struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
> > w->t = ..;
> > bpf_submit_work(w, my_cb, SLEEPABLE | IRQ_WORK);
> > 
> > Am I day dreaming? :)
> 
> Nothing wrong with dreaming of a better future :) 
> 
> (I'm assuming you're thinking of bpf_mem_alloc being fronted by the allocator you
> recently sent to the list.)
> 
> On a first pass, here are my concerns:
> 
> A program and its maps can guarantee a certain amount of storage for work items.
> Sizing that storage is difficult but it is yours alone to use. The freelist allocator
> can be transiently drained by other programs and starve you of this utility. This is
> a new failure mode, so it's worth talking about.

That would be the issue only when progs deliberately share the allocator.
In this stmt:
struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
The 'allocator' can be unique for each prog or shared across few progs in the same .c file.
I wasn't planning to support one global allocator.
Just like one global hash map doesn't quite make sense.
The user has to create an allocator first, get it connected with memcg,
and use the explicit one in their bpf progs/maps.

> With a generic allocator mechanism, we'll have a hard time enforcing the can't-load-
> or-store-into-special-fields logic. I like that guardrail and I'm not sure how we'd
> achieve the same guarantees. (In your snippet, we don't have the llist_node on the
> work item - do we wrap my_work into something else internally? That would hide the
> fields that need protecting at the expense of an extra bpf_mem_alloc allocation.)

bpf_mem_alloc will return referenced PTR_TO_BTF_ID.
Every field in this structure is typed. So it's trivial for the verifier to make
some of them read only or not accessible at all.
'struct my_work' can have an explicit struct bpf_delayed_work field. Example:
struct my_work {
  struct bpf_delayed_work work; // not accessible by prog
  int a; // scalar read/write
  struct task_struct __kptr_ref *t;  // kptr semantics
};

> Managing the storage returned from bpf_mem_alloc is of course also a concern. We'd
> need to treat bpf_submit_work as "releasing" it (really, taking ownership). This path
> means more lifecycle analysis in the verifier and explicit and implicit free()s.

What is the actual concern?
bpf_submit_work will have clear "release" semantics. The verifier already supports it.
The 'my_cb' callback will receive a referenced PTR_TO_BTF_ID as well and would
have to release it with bpf_mem_free(ma, w).
Here is more complete proposal:

struct {
        __uint(type, BPF_MEM_ALLOC);
} allocator SEC(".maps");

struct my_work {
  struct bpf_delayed_work work;
  int a;
  struct task_struct __kptr_ref *t;
};

void my_cb(struct my_work *w)
{
  // access w
  bpf_mem_free(&allocator, w);
}

void bpf_prog(...)
{
  struct my_work *w = bpf_mem_alloc(&allocator, bpf_core_type_id_local(*w));
  w->t = ..;
  bpf_submit_work(w, my_cb, USE_IRQ_WORK);
}

> I'm not opposed to it overall - the developer experience is very familiar - but I am
> primarily worried that allocator failures will be in the same category of issues as
> the hash map collisions for stacks. If you want reliability, you just don't use that
> type of map - what's the alternative in this hypothetical bpf_mem_alloc future?

Reliability of allocation is certainly necessary.
bpf_mem_alloc will have an ability to _synchronously_ preallocate into freelist
from sleepable context, so bpf prog will have full control of that free list.
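
A rough sketch of how a prog could drive that (bpf_mem_prealloc is a
hypothetical helper name and its signature is invented; 'allocator' is the
BPF_MEM_ALLOC map from the snippet above):

SEC("fentry.s/some_sleepable_hook")
int keep_freelist_full(void *ctx)
{
	/* keep at least 64 objects of this size in the freelist so that
	 * later NMI-context allocations don't have to hit the page allocator */
	bpf_mem_prealloc(&allocator, sizeof(struct my_work), 64);
	return 0;
}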

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-19 19:02         ` Alexei Starovoitov
@ 2022-07-19 22:12           ` Delyan Kratunov
  2022-07-20  0:54             ` Alexei Starovoitov
  0 siblings, 1 reply; 13+ messages in thread
From: Delyan Kratunov @ 2022-07-19 22:12 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: daniel, sdf, ast, andrii, bpf

On Tue, 2022-07-19 at 12:02 -0700, Alexei Starovoitov wrote:
> On Fri, Jul 15, 2022 at 06:28:20PM +0000, Delyan Kratunov wrote:
> > On Thu, 2022-07-14 at 18:51 -0700, Alexei Starovoitov wrote:
> > > On Tue, Jul 12, 2022 at 06:42:52PM +0000, Delyan Kratunov wrote:
> > > > 
> > > > > but have you though of maybe initially supporting something like:
> > > > > 
> > > > > bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
> > > > > bpf_timer_set_callback(&timer, cg);
> > > > > bpf_timer_start(&timer, 0, 0);
> > > > > 
> > > > > If you init a timer with that special flag, I'm assuming you can have
> > > > > special cases in the existing helpers to simulate the delayed work?
> > > > 
> > > > Potentially but I have some reservations about drawing this equivalence.
> > > 
> > > hrtimer api has various flags: soft vs hard irq, pinned and not.
> > > So the suggestion to treat irq_work callback as special timer flag
> > > actually fits well.
> > > 
> > > bpf_timer_init + set_callback + start can be a static inline function
> > > named bpf_work_submit() in bpf_helpers.h
> > > (or some new file that will mark the beginning libc-bpf library).
> > > Reusing struct bpf_timer and adding zero-delay callback could probably be
> > > easier for users to learn and consume.
> > 
> > To clarify, we're talking about 1) making bpf_timer nmi-safe for _some_ but not all
> > combinations of parameters and 2) adding new flags to specify an execution context?
> > It's achievable but it's hard to see how it's the superior solution here.
> > 
> > > 
> > > Separately:
> > > +struct bpf_delayed_work {
> > > +       __u64 :64;
> > > +       __u64 :64;
> > > +       __u64 :64;
> > > +       __u64 :64;
> > > +       __u64 :64;
> > > +} __attribute__((aligned(8)));
> > > is not extensible.
> > > It would be better to add indirection to allow kernel side to grow
> > > independently from amount of space consumed in a map value.
> > 
> > Fair point, I was wondering what to do with it - storing just a pointer sounds
> > reasonable.
> > 
> > > Can you think of a way to make irq_work/sleepable callback independent of maps?
> > > Assume bpf_mem_alloc is already available and NMI prog can allocate a typed object.
> > > The usage could be:
> > > struct my_work {
> > >   int a;
> > >   struct task_struct __kptr_ref *t;
> > > };
> > > void my_cb(struct my_work *w);
> > > 
> > > struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
> > > w->t = ..;
> > > bpf_submit_work(w, my_cb, SLEEPABLE | IRQ_WORK);
> > > 
> > > Am I day dreaming? :)
> > 
> > Nothing wrong with dreaming of a better future :) 
> > 
> > (I'm assuming you're thinking of bpf_mem_alloc being fronted by the allocator you
> > recently sent to the list.)
> > 
> > On a first pass, here are my concerns:
> > 
> > A program and its maps can guarantee a certain amount of storage for work items.
> > Sizing that storage is difficult but it is yours alone to use. The freelist allocator
> > can be transiently drained by other programs and starve you of this utility. This is
> > a new failure mode, so it's worth talking about.
> 
> That would be the issue only when progs deliberately share the allocator.
> In this stmt:
> struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
> The 'allocator' can be unique for each prog or shared across few progs in the same .c file.
> I wasn't planning to support one global allocator.
> Just like one global hash map doesn't quite make sense.
> The user has to create an allocator first, get it connected with memcg,
> and use the explicit one in their bpf progs/maps.
> 
> > With a generic allocator mechanism, we'll have a hard time enforcing the can't-load-
> > or-store-into-special-fields logic. I like that guardrail and I'm not sure how we'd
> > achieve the same guarantees. (In your snippet, we don't have the llist_node on the
> > work item - do we wrap my_work into something else internally? That would hide the
> > fields that need protecting at the expense of an extra bpf_mem_alloc allocation.)
> 
> bpf_mem_alloc will return referenced PTR_TO_BTF_ID.
> Every field in this structure is typed. So it's trivial for the verifier to make
> some of them read only or not accessible at all.
> 'struct my_work' can have an explicit struct bpf_delayed_work field. Example:
> struct my_work {
>   struct bpf_delayed_work work; // not accessible by prog
>   int a; // scalar read/write
>   struct task_struct __kptr_ref *t;  // kptr semantics
> };

Sure, anything is possible, it's just more complexity and these checks are not
exactly easy to follow right now. 

Alternatively, we could do the classic allocator thing and allocate accounting space
before the pointer we return. Some magic flag could then expand the space enough to
use for submit_work. Some allocations would be bumped to a higher bucket but that's
okay because it would be consistent overhead for those allocation sites.
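
Roughly this layout, as a sketch (the header struct and macro below are just
for illustration, not a concrete proposal):

/*   [ struct work_hdr ][ user object, e.g. struct my_work ... ]
 *                       ^ pointer returned to the program
 *
 * The program only ever sees the user object; bpf_submit_work() steps back
 * to the hidden header to find its llist node and callback.
 */
struct work_hdr {
	struct llist_node node;
	void *cb;
};

#define work_hdr_of(p)	((struct work_hdr *)(p) - 1)	/* kernel-internal */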

> 
> > Managing the storage returned from bpf_mem_alloc is of course also a concern. We'd
> > need to treat bpf_submit_work as "releasing" it (really, taking ownership). This path
> > means more lifecycle analysis in the verifier and explicit and implicit free()s.
> 
> What is the actual concern?
> bpf_submit_work will have clear "release" semantics. The verifier already supports it.
> The 'my_cb' callback will receive reference PTR_TO_BTF_ID as well and would
> have to release it with bpf_mem_free(ma, w).
> Here is more complete proposal:
> 
> struct {
>         __uint(type, BPF_MEM_ALLOC);
> } allocator SEC(".maps");

I like this, so long as we pre-allocate enough to submit more sleepable work
immediately - the first work item the program submits could then prefill more items.

For an even better experience, it would be great if we could specify in the map
definition the number of items of size X we'll need. If we give that lever to the
developer, they can then use it so they never have to orchestrate sleepable work to
call bpf_mem_prealloc explicitly.
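
Something like this in the map definition (both extra __uint fields are
invented here to show the shape of the idea):

struct {
	__uint(type, BPF_MEM_ALLOC);
	__uint(prefill_size, sizeof(struct my_work));
	__uint(prefill_count, 128);
} allocator SEC(".maps");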

> 
> struct my_work {
>   struct bpf_delayed_work work;
>   int a;
>   struct task_struct __kptr_ref *t;
> };
> 
> void my_cb(struct my_work *w)
> {
>   // access w
>   bpf_mem_free(&allocator, w);
> }
> 
> void bpf_prog(...)
> {
>   struct my_work *w = bpf_mem_alloc(&allocator, bpf_core_type_id_local(*w));
>   w->t = ..;
>   bpf_submit_work(w, my_cb, USE_IRQ_WORK);
> }
> 
> > I'm not opposed to it overall - the developer experience is very familiar - but I am
> > primarily worried that allocator failures will be in the same category of issues as
> > the hash map collisions for stacks. If you want reliability, you just don't use that
> > type of map - what's the alternative in this hypothetical bpf_mem_alloc future?
> 
> Reliability of allocation is certianly necessary.
> bpf_mem_alloc will have an ability to _synchronously_ preallocate into freelist
> from sleepable context, so bpf prog will have full control of that free list.

I think having the map initialized and prefilled on load and having sleepable work
from the first version of this mechanism becomes a requirement of this design. Having
the prefill requirements (number of items and size) on the map definition removes the
requirement to have sleepable work from day one.

How do you want to sequence this? Do you plan to do the work to expose bpf_mem_alloc
to programs as part of the initial series or as a later followup? 

-- Delyan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC bpf-next 0/3] Execution context callbacks
  2022-07-19 22:12           ` Delyan Kratunov
@ 2022-07-20  0:54             ` Alexei Starovoitov
  0 siblings, 0 replies; 13+ messages in thread
From: Alexei Starovoitov @ 2022-07-20  0:54 UTC (permalink / raw)
  To: Delyan Kratunov; +Cc: daniel, sdf, ast, andrii, bpf

On Tue, Jul 19, 2022 at 10:12:57PM +0000, Delyan Kratunov wrote:
> On Tue, 2022-07-19 at 12:02 -0700, Alexei Starovoitov wrote:
> > On Fri, Jul 15, 2022 at 06:28:20PM +0000, Delyan Kratunov wrote:
> > > On Thu, 2022-07-14 at 18:51 -0700, Alexei Starovoitov wrote:
> > > > On Tue, Jul 12, 2022 at 06:42:52PM +0000, Delyan Kratunov wrote:
> > > > > 
> > > > > > but have you though of maybe initially supporting something like:
> > > > > > 
> > > > > > bpf_timer_init(&timer, map, SOME_NEW_DEFERRED_NMI_ONLY_FLAG);
> > > > > > bpf_timer_set_callback(&timer, cg);
> > > > > > bpf_timer_start(&timer, 0, 0);
> > > > > > 
> > > > > > If you init a timer with that special flag, I'm assuming you can have
> > > > > > special cases in the existing helpers to simulate the delayed work?
> > > > > 
> > > > > Potentially but I have some reservations about drawing this equivalence.
> > > > 
> > > > hrtimer api has various flags: soft vs hard irq, pinned and not.
> > > > So the suggestion to treat irq_work callback as special timer flag
> > > > actually fits well.
> > > > 
> > > > bpf_timer_init + set_callback + start can be a static inline function
> > > > named bpf_work_submit() in bpf_helpers.h
> > > > (or some new file that will mark the beginning libc-bpf library).
> > > > Reusing struct bpf_timer and adding zero-delay callback could probably be
> > > > easier for users to learn and consume.
> > > 
> > > To clarify, we're talking about 1) making bpf_timer nmi-safe for _some_ but not all
> > > combinations of parameters and 2) adding new flags to specify an execution context?
> > > It's achievable but it's hard to see how it's the superior solution here.
> > > 
> > > > 
> > > > Separately:
> > > > +struct bpf_delayed_work {
> > > > +       __u64 :64;
> > > > +       __u64 :64;
> > > > +       __u64 :64;
> > > > +       __u64 :64;
> > > > +       __u64 :64;
> > > > +} __attribute__((aligned(8)));
> > > > is not extensible.
> > > > It would be better to add indirection to allow kernel side to grow
> > > > independently from amount of space consumed in a map value.
> > > 
> > > Fair point, I was wondering what to do with it - storing just a pointer sounds
> > > reasonable.
> > > 
> > > > Can you think of a way to make irq_work/sleepable callback independent of maps?
> > > > Assume bpf_mem_alloc is already available and NMI prog can allocate a typed object.
> > > > The usage could be:
> > > > struct my_work {
> > > >   int a;
> > > >   struct task_struct __kptr_ref *t;
> > > > };
> > > > void my_cb(struct my_work *w);
> > > > 
> > > > struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
> > > > w->t = ..;
> > > > bpf_submit_work(w, my_cb, SLEEPABLE | IRQ_WORK);
> > > > 
> > > > Am I day dreaming? :)
> > > 
> > > Nothing wrong with dreaming of a better future :) 
> > > 
> > > (I'm assuming you're thinking of bpf_mem_alloc being fronted by the allocator you
> > > recently sent to the list.)
> > > 
> > > On a first pass, here are my concerns:
> > > 
> > > A program and its maps can guarantee a certain amount of storage for work items.
> > > Sizing that storage is difficult but it is yours alone to use. The freelist allocator
> > > can be transiently drained by other programs and starve you of this utility. This is
> > > a new failure mode, so it's worth talking about.
> > 
> > That would be the issue only when progs deliberately share the allocator.
> > In this stmt:
> > struct my_work *w = bpf_mem_alloc(allocator, bpf_core_type_id_local(*w));
> > The 'allocator' can be unique for each prog or shared across few progs in the same .c file.
> > I wasn't planning to support one global allocator.
> > Just like one global hash map doesn't quite make sense.
> > The user has to create an allocator first, get it connected with memcg,
> > and use the explicit one in their bpf progs/maps.
> > 
> > > With a generic allocator mechanism, we'll have a hard time enforcing the can't-load-
> > > or-store-into-special-fields logic. I like that guardrail and I'm not sure how we'd
> > > achieve the same guarantees. (In your snippet, we don't have the llist_node on the
> > > work item - do we wrap my_work into something else internally? That would hide the
> > > fields that need protecting at the expense of an extra bpf_mem_alloc allocation.)
> > 
> > bpf_mem_alloc will return referenced PTR_TO_BTF_ID.
> > Every field in this structure is typed. So it's trivial for the verifier to make
> > some of them read only or not accessible at all.
> > 'struct my_work' can have an explicit struct bpf_delayed_work field. Example:
> > struct my_work {
> >   struct bpf_delayed_work work; // not accessible by prog
> >   int a; // scalar read/write
> >   struct task_struct __kptr_ref *t;  // kptr semantics
> > };
> 
> Sure, anything is possible, it's just more complexity and these checks are not
> exactly easy to follow right now. 
> 
> Alternatively, we could do the classic allocator thing and allocate accounting space
> before the pointer we return. Some magic flag could then expand the space enough to
> use for submit_work. Some allocations would be bumped to a higher bucket but that's
> okay because it would be consistent overhead for those allocation sites.

Technically we can, but that would be a departure from what we already do.
bpf_spin_lock, bpf_timer, __kptr are normal part of struct-s with different access
restrictions. 'struct bpf_delayed_work' shouldn't be any different.

Another approach would be to let bpf prog allocate 'struct my_work' without
any special fields. Then use nmi-safe allocator inside bpf_submit_work, hide
it completely from bpf side and auto-free after callback is done.
But extra alloc is a performance hit and overall it will be an unusual hack.

Maybe we can allow bpf_submit_work() to work with referenced ptr_to_btf_id
like above and with normal map value similar to what you've implemented?
We would need to somehow make sure that container_of() operation to cast from
&work either to allocated ptr_to_btf_id or to map value works in both cases.
That would be the most flexible solution and will resemble kernel programming
style the best.
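
From the prog's point of view that could look roughly like this
(bpf_submit_work and the USE_IRQ_WORK flag are still hypothetical; the
callback takes &work and uses the usual container_of() to get back to the
enclosing object in both cases; my_map/key are whatever ordinary map and key
the prog already uses):

struct my_work {
	struct bpf_delayed_work work;
	int a;
};

void my_cb(struct bpf_delayed_work *work)
{
	struct my_work *w = container_of(work, struct my_work, work);

	/* use w->a; free w only if it came from the allocator */
}

void bpf_prog(...)
{
	int key = 0;

	/* case 1: allocated object (referenced ptr_to_btf_id) */
	struct my_work *w = bpf_mem_alloc(&allocator, bpf_core_type_id_local(*w));

	if (w)
		bpf_submit_work(&w->work, my_cb, USE_IRQ_WORK);

	/* case 2: the same field embedded in a map value, as in the RFC */
	struct my_work *v = bpf_map_lookup_elem(&my_map, &key);

	if (v)
		bpf_submit_work(&v->work, my_cb, USE_IRQ_WORK);
}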

> > 
> > > Managing the storage returned from bpf_mem_alloc is of course also a concern. We'd
> > > need to treat bpf_submit_work as "releasing" it (really, taking ownership). This path
> > > means more lifecycle analysis in the verifier and explicit and implicit free()s.
> > 
> > What is the actual concern?
> > bpf_submit_work will have clear "release" semantics. The verifier already supports it.
> > The 'my_cb' callback will receive a referenced PTR_TO_BTF_ID as well and would
> > have to release it with bpf_mem_free(ma, w).
> > Here is more complete proposal:
> > 
> > struct {
> >         __uint(type, BPF_MEM_ALLOC);
> > } allocator SEC(".maps");
> 
> I like this, so long as we pre-allocate enough to submit more sleepable work
> immediately - the first work item the program submits could then prefill more items.
> 
> For an even better experience, it would be great if we could specify in the map
> definition the number of items of size X we'll need. If we give that lever to the
> developer, they can then use it so they never have to orchestrate sleepable work to
> call bpf_mem_prealloc explicitly.

Agree. That's the idea. Will work on it.

> 
> > 
> > struct my_work {
> >   struct bpf_delayed_work work;
> >   int a;
> >   struct task_struct __kptr_ref *t;
> > };
> > 
> > void my_cb(struct my_work *w)
> > {
> >   // access w
> >   bpf_mem_free(&allocator, w);
> > }
> > 
> > void bpf_prog(...)
> > {
> >   struct my_work *w = bpf_mem_alloc(&allocator, bpf_core_type_id_local(*w));
> >   w->t = ..;
> >   bpf_submit_work(w, my_cb, USE_IRQ_WORK);
> > }
> > 
> > > I'm not opposed to it overall - the developer experience is very familiar - but I am
> > > primarily worried that allocator failures will be in the same category of issues as
> > > the hash map collisions for stacks. If you want reliability, you just don't use that
> > > type of map - what's the alternative in this hypothetical bpf_mem_alloc future?
> > 
> > Reliability of allocation is certainly necessary.
> > bpf_mem_alloc will have an ability to _synchronously_ preallocate into freelist
> > from sleepable context, so bpf prog will have full control of that free list.
> 
> I think having the map initialized and prefilled on load and having sleepable work
> from the first version of this mechanism becomes a requirement of this design. Having
> the prefill requirements (number of items and size) on the map definition removes the
> requirement to have sleepable work from day one.

I'm not sure why 'sleepable' is a requirement. irq_work will be able to do
synchronous prefill with GFP_NOWAIT. A sleepable callback will be able to do
synchronous prefill with GFP_KERNEL. There is a difference, of course,
but it's not a blocker.
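
Kernel-side the prefill path just picks gfp flags for the calling context,
roughly (the function below and the ma->unit_size / ma->free_llist fields
are assumptions for illustration, not the actual bpf_mem_alloc code):

static int bpf_mem_prefill(struct bpf_mem_alloc *ma, unsigned int cnt, gfp_t gfp)
{
	/* callers pass GFP_NOWAIT from the irq_work callback and
	 * GFP_KERNEL from a sleepable callback */
	unsigned int i;

	for (i = 0; i < cnt; i++) {
		struct llist_node *n = kmalloc(ma->unit_size, gfp);

		if (!n)
			return -ENOMEM;
		llist_add(n, &ma->free_llist);
	}
	return 0;
}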

> How do you want to sequence this? Do you plan to do the work to expose bpf_mem_alloc
> to programs as part of the initial series or as a later followup? 

Currently thinking of it as a follow-up.
If you have cycles, maybe you can help?
bpf_mem_alloc/free internals are tested and usable already. Prefill is not implemented yet.
But the work to add the bpf_mem_alloc helper and to expose the allocator as a special kind of map
can start already.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-07-20  0:54 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-11 21:48 [PATCH RFC bpf-next 0/3] Execution context callbacks Delyan Kratunov
2022-07-11 21:48 ` [PATCH RFC bpf-next 2/3] bpf: add delayed_work mechanism Delyan Kratunov
2022-07-11 21:48 ` [PATCH RFC bpf-next 1/3] bpf: allow maps to hold bpf_delayed_work fields Delyan Kratunov
2022-07-14  4:23   ` Andrii Nakryiko
2022-07-11 21:48 ` [PATCH RFC bpf-next 3/3] selftests: delayed_work tests Delyan Kratunov
2022-07-12 18:07 ` [PATCH RFC bpf-next 0/3] Execution context callbacks sdf
2022-07-12 18:42   ` Delyan Kratunov
2022-07-12 22:51     ` sdf
2022-07-15  1:51     ` Alexei Starovoitov
2022-07-15 18:28       ` Delyan Kratunov
2022-07-19 19:02         ` Alexei Starovoitov
2022-07-19 22:12           ` Delyan Kratunov
2022-07-20  0:54             ` Alexei Starovoitov
