[PATCH bpf-next 14/17] bpf: implement number iterator

From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
Cc: <andrii@kernel.org>, <kernel-team@fb.com>, Tejun Heo <tj@kernel.org>
Subject: [PATCH bpf-next 14/17] bpf: implement number iterator
Date: Thu, 2 Mar 2023 15:50:12 -0800	[thread overview]
Message-ID: <20230302235015.2044271-15-andrii@kernel.org> (raw)
In-Reply-To: <20230302235015.2044271-1-andrii@kernel.org>

Implement first open-coded iterator type over a range of integers.

It's public API consists of:
  - bpf_iter_num_new() constructor, which accepts [start, end) range
    (that is, start is inclusive, while end is exclusive).
  - bpf_iter_num_next() which will keep returning 4-byte read-only
    pointer to int until the range is exhausted, at which point NULL will
    be returned. If bpf_iter_num_next() is kept calling after this, NULL
    will be persistently returned.
  - bpf_iter_num_destroy() destructor, that needs to be called at some
    point to clean up iterator state. BPF verifier enforces that iterator
    destructor is called at some point before BPF program exits.

Note that `start = end = X` is a valid combination to setup empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects invalid combination of input arguments, it
returns error, resets iterator state to, effectively, empty iterator, so
any subsequent call to bpf_iter_num_next() will keep returning NULL.

BPF verifier has no knowledge that returned integers are in the [start,
end) value range, as both `start` and `end` are not statically
known/enforced, they are runtime values only.

While implementation is pretty trivial, some care needs to be taken to
avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded
BPF loops and bpf_loop() helper and is the basis for implementing
ergonomic BPF loops with no statically known and verified bounds.
Subsequent patches implement bpf_for() macro, demonstrating how this can
be wrapped into something that works and feels like a normal for() loop
in C language.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h   | 14 +++++++--
 kernel/bpf/bpf_iter.c | 71 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c  |  3 ++
 kernel/bpf/verifier.c | 24 +++++++++++++--
 4 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a968282ba324..2a730759a471 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -613,6 +613,9 @@ enum bpf_type_flag {
 	/* DYNPTR points to xdp_buff */
 	DYNPTR_TYPE_XDP		= BIT(16 + BPF_BASE_TYPE_BITS),
 
+	/* ITER of integers */
+	ITER_TYPE_NUM		= BIT(17 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
@@ -620,7 +623,7 @@ enum bpf_type_flag {
 #define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
 				 | DYNPTR_TYPE_XDP)
 
-#define ITER_TYPE_FLAG_MASK	(0)
+#define ITER_TYPE_FLAG_MASK	(ITER_TYPE_NUM)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -1167,6 +1170,7 @@ u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
 
 enum bpf_iter_type {
 	BPF_ITER_TYPE_INVALID,
+	BPF_ITER_TYPE_NUM,
 };
 
 #ifdef CONFIG_BPF_JIT
@@ -1622,8 +1626,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..504189a3b474 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2020 Facebook */
 
+#include "linux/build_bug.h"
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
 #include <linux/filter.h>
@@ -776,3 +777,73 @@ const struct bpf_func_proto bpf_loop_proto = {
 	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+struct bpf_iter_num_kern {
+	int cur; /* current value, inclusive */
+	int end; /* final value, exclusive */
+	__u64 :64;
+	__u64 :64;
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter *it__uninit, int start, int end)
+{
+	struct bpf_iter_num_kern *s = (void *)it__uninit;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter));
+
+	/* start == end is legit, it's an empty range and we'll just get NULL
+	 * on first (and any subsequent) bpf_iter_num_next() call
+	 */
+	if (start > end) {
+		s->cur = s->end = 0;
+		return -EINVAL;
+	}
+
+	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+		s->cur = s->end = 0;
+		return -E2BIG;
+	}
+
+	/* user will call bpf_iter_num_next() first,
+	 * which will set s->cur to exactly start value;
+	 * underflow shouldn't matter
+	 */
+	s->cur = start - 1;
+	s->end = end;
+
+	return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter* it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	/* check failed initialization or if we are done (same behavior);
+	 * need to be careful about overflow, so convert to s64 for checks,
+	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
+	 * s->cur + 1 >= s->end
+	 */
+	if ((s64)(s->cur + 1) >= s->end) {
+		s->cur = s->end = 0;
+		return NULL;
+	}
+
+	s->cur++;
+
+	return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter *it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index de9ef8476e29..23c8f2313d5a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2398,6 +2398,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 58754929ee33..9671b4f354e9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -779,6 +779,8 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
 static const char *iter_type_str(enum bpf_iter_type type)
 {
 	switch (type) {
+	case BPF_ITER_TYPE_NUM:
+		return "num";
 	case BPF_ITER_TYPE_INVALID:
 		return "<invalid>";
 	default:
@@ -1157,6 +1159,8 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 static enum bpf_dynptr_type arg_to_iter_type(enum bpf_arg_type arg_type)
 {
 	switch (arg_type & ITER_TYPE_FLAG_MASK) {
+	case ITER_TYPE_NUM:
+		return BPF_ITER_TYPE_NUM;
 	default:
 		return BPF_ITER_TYPE_INVALID;
 	}
@@ -9445,6 +9449,9 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_from_xdp,
 	KF_bpf_dynptr_slice,
 	KF_bpf_dynptr_slice_rdwr,
+	KF_bpf_iter_num_new,
+	KF_bpf_iter_num_next,
+	KF_bpf_iter_num_destroy,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -9483,6 +9490,9 @@ BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_iter_num_new)
+BTF_ID(func, bpf_iter_num_next)
+BTF_ID(func, bpf_iter_num_destroy)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9496,7 +9506,7 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
 
 static bool is_iter_next_kfunc(int btf_id)
 {
-	return false;
+	return btf_id == special_kfunc_list[KF_bpf_iter_num_next];
 }
 
 static enum kfunc_ptr_arg_type
@@ -10278,7 +10288,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (is_kfunc_arg_uninit(btf, &args[i]))
 				iter_arg_type |= MEM_UNINIT;
 
-			ret = process_iter_arg(env, regno, insn_idx, iter_arg_type,  meta);
+			if (meta->func_id == special_kfunc_list[KF_bpf_iter_num_new] ||
+			    meta->func_id == special_kfunc_list[KF_bpf_iter_num_next]) {
+				iter_arg_type |= ITER_TYPE_NUM;
+			} else if (meta->func_id == special_kfunc_list[KF_bpf_iter_num_destroy]) {
+				iter_arg_type |= ITER_TYPE_NUM | OBJ_RELEASE;
+			} else {
+				verbose(env, "verifier internal error: unrecognized iterator kfunc\n");
+				return -EFAULT;
+			}
+
+			ret = process_iter_arg(env, regno, insn_idx, iter_arg_type, meta);
 			if (ret < 0)
 				return ret;
 			break;
-- 
2.30.2