* [PATCH bpf-next v2 01/20] bpf: implement an interface to register bpf_iter targets
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 21:19   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 02/20] bpf: allow loading of a bpf_iter program Yonghong Song
                   ` (18 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The target can call bpf_iter_reg_target() to register itself.
The needed information:
  target:           target name
  seq_ops:          the seq_file operations for the target
  init_seq_private: target callback to initialize seq_priv during file open
  fini_seq_private: target callback to clean up seq_priv during file release
  seq_priv_size:    the private_data size needed by the seq_file
                    operations

The target name represents a target which provides a seq_ops
for iterating objects.

The target can provide two callback functions, init_seq_private
and fini_seq_private, called at file open and release time
respectively. For example, for /proc/net/{tcp6, ipv6_route,
netlink, ...}, the net namespace needs to be set up properly
at file open time and released properly at file release time.
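
For example, a target could register itself as below (a sketch only;
the "foo" target, its seq_ops callbacks and its seq_priv struct are
illustrative and not part of this patch):

  static const struct seq_operations foo_seq_ops = {
          .start  = foo_seq_start,
          .next   = foo_seq_next,
          .stop   = foo_seq_stop,
          .show   = foo_seq_show,
  };

  static int __init foo_iter_init(void)
  {
          struct bpf_iter_reg reg_info = {
                  .target           = "foo",
                  .seq_ops          = &foo_seq_ops,
                  .init_seq_private = foo_init_seq_private,
                  .fini_seq_private = foo_fini_seq_private,
                  .seq_priv_size    = sizeof(struct foo_seq_info),
          };

          return bpf_iter_reg_target(&reg_info);
  }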

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   | 14 ++++++++++++++
 kernel/bpf/Makefile   |  2 +-
 kernel/bpf/bpf_iter.c | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/bpf_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1262ec460ab3..597b37c4e1c6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -31,6 +31,7 @@ struct seq_file;
 struct btf;
 struct btf_type;
 struct exception_table_entry;
+struct seq_operations;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1126,6 +1127,19 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
+typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
+
+struct bpf_iter_reg {
+	const char *target;
+	const struct seq_operations *seq_ops;
+	bpf_iter_init_seq_priv_t init_seq_private;
+	bpf_iter_fini_seq_priv_t fini_seq_private;
+	u32 seq_priv_size;
+};
+
+int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
+
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f2d7be596966..6a8b0febd3f6 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
new file mode 100644
index 000000000000..ed930a0470e9
--- /dev/null
+++ b/kernel/bpf/bpf_iter.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+struct bpf_iter_target_info {
+	struct list_head list;
+	const char *target;
+	const struct seq_operations *seq_ops;
+	bpf_iter_init_seq_priv_t init_seq_private;
+	bpf_iter_fini_seq_priv_t fini_seq_private;
+	u32 seq_priv_size;
+};
+
+static struct list_head targets = LIST_HEAD_INIT(targets);
+static DEFINE_MUTEX(targets_mutex);
+
+int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
+{
+	struct bpf_iter_target_info *tinfo;
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		return -ENOMEM;
+
+	tinfo->target = reg_info->target;
+	tinfo->seq_ops = reg_info->seq_ops;
+	tinfo->init_seq_private = reg_info->init_seq_private;
+	tinfo->fini_seq_private = reg_info->fini_seq_private;
+	tinfo->seq_priv_size = reg_info->seq_priv_size;
+	INIT_LIST_HEAD(&tinfo->list);
+
+	mutex_lock(&targets_mutex);
+	list_add(&tinfo->list, &targets);
+	mutex_unlock(&targets_mutex);
+
+	return 0;
+}
-- 
2.24.1



* [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data
@ 2020-05-04  6:25 Yonghong Song
  2020-05-04  6:25 ` [PATCH bpf-next v2 01/20] bpf: implement an interface to register bpf_iter targets Yonghong Song
                   ` (19 more replies)
  0 siblings, 20 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Motivation:
  The current ways to dump kernel data structures are mostly:
    1. the /proc file system
    2. various specific tools like "ss" which require kernel support
    3. drgn
  The drawback of the first two is that whenever you want to dump more,
  you need to change the kernel. For example, Martin wants to dump
  socket local storage with "ss"; a kernel change is needed for that
  to work ([1]). This is also the direct motivation for this work.

  drgn ([2]) solves this problem nicely; no kernel change is needed.
  But since drgn is not able to verify the validity of a particular
  pointer value, it might present wrong results in rare cases.
  
  In this patch set, we introduce the bpf iterator. Initial kernel
  changes are still needed for the kernel data of interest, but a
  later data structure change will not require kernel changes any
  more; the bpf program itself can adapt to new data structure
  changes. This gives certain flexibility with guaranteed correctness.

  In this patch set, kernel seq_ops are used to facilitate iterating
  through kernel data, similar to the current /proc and many other
  lossless kernel dumping facilities. In the future, different
  iterators can be implemented to trade off losslessness for other
  criteria, e.g., no repeated object visits.

User Interface:
  1. Similar to prog/map/link, the iterator can be pinned into a
     path within a bpffs mount point.
  2. The bpftool command can pin an iterator to a file
         bpftool iter pin <bpf_prog.o> <path>
  3. Use `cat <path>` to dump the contents.
     Use `rm -f <path>` to remove the pinned iterator.
     (See the example after this list.)
  4. An anonymous iterator can be created as well.
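
  For example (the object file name and pin path below are
  illustrative):

      bpftool iter pin bpf_iter_ipv6_route.o /sys/fs/bpf/my_route_dump
      cat /sys/fs/bpf/my_route_dump
      rm -f /sys/fs/bpf/my_route_dump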

  Please see patch #18 and #19 for bpf programs and bpf iterator
  output examples.

  Note that certain iterators are namespace aware. For example,
  the task and task_file targets only iterate through the current pid
  namespace; ipv6_route and netlink iterate through the current net
  namespace.

  Please see individual patches for implementation details.

Performance:
  The bpf iterator provides in-kernel aggregation abilities
  for kernel data. This can greatly improve performance
  compared to, e.g., iterating over all process directories under
  /proc. For example, I did an experiment on my VM with an application
  forking different numbers of tasks and each forked process opening
  various numbers of files. The following are the results, with
  latency in microseconds:

    # of forked tasks   # of open files    # of bpf_prog calls  latency (us)
    100                 100                11503                7586
    1000                1000               1013203              709513
    10000               100                1130203              764519

  The number of bpf_prog calls may be more than forked tasks multiplied
  by open files since there are other tasks running on the system.
  The bpf program is a do-nothing program. One million bpf calls take
  less than one second.

Future Work:
  Although the initial motivation came from Martin's sk_local_storage,
  this patch set does not implement tcp6 sockets and sk_local_storage.
  /proc/net/tcp6 involves three types of sockets: timewait, request
  and tcp6 sockets. Some kind of type casting or other mechanism is
  needed to handle all these socket types in one bpf program. This
  will be addressed in future work.

  Currently, we do not support kernel data generated by modules.
  This requires some BTF work.

  More work is needed for more iterators, e.g., bpf_progs, cgroups,
  bpf_map elements, etc.

Changelog:
  v1 -> v2:
    - removed target_feature, using callback functions instead
    - checking target to ensure the program-specified btf_id is supported (Martin)
    - link_create change with new changes from Andrii
    - better handling of btf_iter vs. seq_file private data (Martin, Andrii)
    - implemented bpf_seq_read() (Andrii, Alexei)
    - percpu buffer for bpf_seq_printf() (Andrii)
    - better syntax for BPF_SEQ_PRINTF macro (Andrii)
    - bpftool fixes (Quentin)
    - a lot of other fixes
  RFC v2 -> v1:
    - rename bpfdump to bpf_iter
    - use bpffs instead of a new file system
    - use bpf_link to streamline and simplify iterator creation.

References:
  [1]: https://lore.kernel.org/bpf/20200225230427.1976129-1-kafai@fb.com
  [2]: https://github.com/osandov/drgn

Yonghong Song (20):
  bpf: implement an interface to register bpf_iter targets
  bpf: allow loading of a bpf_iter program
  bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE
  bpf: implement bpf_seq_read() for bpf iterator
  bpf: create anonymous bpf iterator
  bpf: create file bpf iterator
  bpf: implement common macros/helpers for target iterators
  bpf: add bpf_map iterator
  net: bpf: add netlink and ipv6_route bpf_iter targets
  bpf: add task and task/file iterator targets
  bpf: add PTR_TO_BTF_ID_OR_NULL support
  bpf: add bpf_seq_printf and bpf_seq_write helpers
  bpf: handle spilled PTR_TO_BTF_ID properly when checking
    stack_boundary
  bpf: support variable length array in tracing programs
  tools/libbpf: add bpf_iter support
  tools/bpftool: add bpf_iter support for bpftool
  tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  tools/bpf: selftests: add iter progs for bpf_map/task/task_file
  tools/bpf: selftests: add bpf_iter selftests

 fs/proc/proc_net.c                            |  19 +
 include/linux/bpf.h                           |  35 ++
 include/linux/bpf_types.h                     |   1 +
 include/linux/proc_fs.h                       |   3 +
 include/uapi/linux/bpf.h                      |  40 +-
 kernel/bpf/Makefile                           |   2 +-
 kernel/bpf/bpf_iter.c                         | 518 ++++++++++++++++++
 kernel/bpf/btf.c                              |  42 +-
 kernel/bpf/inode.c                            |   5 +-
 kernel/bpf/map_iter.c                         | 107 ++++
 kernel/bpf/syscall.c                          |  59 ++
 kernel/bpf/task_iter.c                        | 336 ++++++++++++
 kernel/bpf/verifier.c                         |  45 +-
 kernel/trace/bpf_trace.c                      | 195 +++++++
 net/ipv6/ip6_fib.c                            |  65 ++-
 net/ipv6/route.c                              |  27 +
 net/netlink/af_netlink.c                      |  87 ++-
 scripts/bpf_helpers_doc.py                    |   2 +
 .../bpftool/Documentation/bpftool-iter.rst    |  83 +++
 tools/bpf/bpftool/bash-completion/bpftool     |  13 +
 tools/bpf/bpftool/iter.c                      |  84 +++
 tools/bpf/bpftool/link.c                      |   1 +
 tools/bpf/bpftool/main.c                      |   3 +-
 tools/bpf/bpftool/main.h                      |   1 +
 tools/include/uapi/linux/bpf.h                |  40 +-
 tools/lib/bpf/bpf.c                           |  11 +
 tools/lib/bpf/bpf.h                           |   2 +
 tools/lib/bpf/bpf_tracing.h                   |  16 +
 tools/lib/bpf/libbpf.c                        |  45 ++
 tools/lib/bpf/libbpf.h                        |   9 +
 tools/lib/bpf/libbpf.map                      |   2 +
 .../selftests/bpf/prog_tests/bpf_iter.c       | 390 +++++++++++++
 .../selftests/bpf/progs/bpf_iter_bpf_map.c    |  29 +
 .../selftests/bpf/progs/bpf_iter_ipv6_route.c |  63 +++
 .../selftests/bpf/progs/bpf_iter_netlink.c    |  74 +++
 .../selftests/bpf/progs/bpf_iter_task.c       |  26 +
 .../selftests/bpf/progs/bpf_iter_task_file.c  |  27 +
 .../selftests/bpf/progs/bpf_iter_test_kern1.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern2.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern3.c |  18 +
 .../selftests/bpf/progs/bpf_iter_test_kern4.c |  48 ++
 .../bpf/progs/bpf_iter_test_kern_common.h     |  22 +
 42 files changed, 2589 insertions(+), 14 deletions(-)
 create mode 100644 kernel/bpf/bpf_iter.c
 create mode 100644 kernel/bpf/map_iter.c
 create mode 100644 kernel/bpf/task_iter.c
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-iter.rst
 create mode 100644 tools/bpf/bpftool/iter.c
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h

-- 
2.24.1



* [PATCH bpf-next v2 02/20] bpf: allow loading of a bpf_iter program
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
  2020-05-04  6:25 ` [PATCH bpf-next v2 01/20] bpf: implement an interface to register bpf_iter targets Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 21:29   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
                   ` (17 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

A bpf_iter program is a tracing program with attach type
BPF_TRACE_ITER. The load attribute
  attach_btf_id
is checked by the verifier against a particular kernel function
which represents a target, e.g., __bpf_iter__bpf_map for the
bpf_map target implemented later in this series.

The program return value must be 0 or 1 for now.
  0 : successful, except for a potential seq_file buffer overflow,
      which is handled by the seq_file reader.
  1 : request to restart processing the same object

In the future, other return values may be used for filtering or
terminating the iterator.
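
A do-nothing bpf_iter program for the bpf_map target could look like
below (a sketch; the "iter/" section name convention is introduced by
the libbpf patch later in this series, and struct bpf_iter__bpf_map is
assumed to be available, e.g., via vmlinux.h):

  SEC("iter/bpf_map")
  int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
  {
          /* return 0: success; 1: retry processing the same object */
          return 0;
  }

  char _license[] SEC("license") = "GPL";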

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  3 +++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/bpf_iter.c          | 30 ++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 21 +++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 5 files changed, 56 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 597b37c4e1c6..cd385c36a172 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1127,6 +1127,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+#define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
+
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
 
@@ -1139,6 +1141,7 @@ struct bpf_iter_reg {
 };
 
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
+bool bpf_iter_prog_supported(struct bpf_prog *prog);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b3643e27e264..047b19fe716e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index ed930a0470e9..c1fae67a1452 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -12,6 +12,7 @@ struct bpf_iter_target_info {
 	bpf_iter_init_seq_priv_t init_seq_private;
 	bpf_iter_fini_seq_priv_t fini_seq_private;
 	u32 seq_priv_size;
+	u32 btf_id;
 };
 
 static struct list_head targets = LIST_HEAD_INIT(targets);
@@ -38,3 +39,32 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 
 	return 0;
 }
+
+bool bpf_iter_prog_supported(struct bpf_prog *prog)
+{
+	const char *attach_fname = prog->aux->attach_func_name;
+	u32 prog_btf_id = prog->aux->attach_btf_id;
+	const char *prefix = BPF_ITER_FUNC_PREFIX;
+	struct bpf_iter_target_info *tinfo;
+	int prefix_len = strlen(prefix);
+	bool supported = false;
+
+	if (strncmp(attach_fname, prefix, prefix_len))
+		return false;
+
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
+			supported = true;
+			break;
+		}
+		if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
+			tinfo->btf_id = prog->aux->attach_btf_id;
+			supported = true;
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+
+	return supported;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 70ad009577f8..d725ff7d11db 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env)
 			return 0;
 		range = tnum_const(0);
 		break;
+	case BPF_PROG_TYPE_TRACING:
+		if (env->prog->expected_attach_type != BPF_TRACE_ITER)
+			return 0;
+		break;
 	default:
 		return 0;
 	}
@@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	struct bpf_prog *tgt_prog = prog->aux->linked_prog;
 	u32 btf_id = prog->aux->attach_btf_id;
 	const char prefix[] = "btf_trace_";
+	struct btf_func_model fmodel;
 	int ret = 0, subprog = -1, i;
 	struct bpf_trampoline *tr;
 	const struct btf_type *t;
@@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		prog->aux->attach_func_proto = t;
 		prog->aux->attach_btf_trace = true;
 		return 0;
+	case BPF_TRACE_ITER:
+		if (!btf_type_is_func(t)) {
+			verbose(env, "attach_btf_id %u is not a function\n",
+				btf_id);
+			return -EINVAL;
+		}
+		t = btf_type_by_id(btf, t->type);
+		if (!btf_type_is_func_proto(t))
+			return -EINVAL;
+		prog->aux->attach_func_name = tname;
+		prog->aux->attach_func_proto = t;
+		if (!bpf_iter_prog_supported(prog))
+			return -EINVAL;
+		ret = btf_distill_func_proto(&env->log, btf, t,
+					     tname, &fmodel);
+		return ret;
 	default:
 		if (!prog_extension)
 			return -EINVAL;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b3643e27e264..047b19fe716e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -218,6 +218,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
-- 
2.24.1



* [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
  2020-05-04  6:25 ` [PATCH bpf-next v2 01/20] bpf: implement an interface to register bpf_iter targets Yonghong Song
  2020-05-04  6:25 ` [PATCH bpf-next v2 02/20] bpf: allow loading of a bpf_iter program Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 21:30   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 04/20] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE Yonghong Song
                   ` (16 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Given a bpf program, the steps to create an anonymous bpf iterator are:
  - create a bpf_iter_link, which combines the bpf program and the
    target. In the future, more information could be recorded in the
    link. A link_fd will be returned to user space.
  - create an anonymous bpf iterator with the given link_fd.

The bpf_iter_link can be pinned to a bpffs mount point to
create a file based bpf iterator as well.

The benefits of using bpf_iter_link:
  - using a bpf link simplifies the design and implementation, as
    bpf links are already used for other tracing bpf programs.
  - for a file based bpf iterator, bpf_iter_link provides a standard
    way to replace the underlying bpf program.
  - for both anonymous and file based iterators, the bpf link query
    capability can be leveraged.

This patch adds support of tracing/iter programs to BPF_LINK_CREATE.
A new link type BPF_LINK_TYPE_ITER is added to facilitate link
querying. Currently, only prog_id is needed, so no additional
in-kernel show_fdinfo() or fill_link_info() hook is needed for the
BPF_LINK_TYPE_ITER link.
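
From user space, creating such a link could look like below (a
sketch; prog_fd is assumed to be the fd of a loaded tracing/iter
program):

  /* userspace; needs <unistd.h>, <sys/syscall.h>, <linux/bpf.h> */
  union bpf_attr attr = {};
  int link_fd;

  attr.link_create.prog_fd = prog_fd;
  attr.link_create.attach_type = BPF_TRACE_ITER;
  /* link_create.target_fd and link_create.flags must be 0 */
  link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));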

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  1 +
 include/linux/bpf_types.h      |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           | 14 ++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 80 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd385c36a172..8621ad080b24 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1142,6 +1142,7 @@ struct bpf_iter_reg {
 
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 8345cdf553b8..29d22752fc87 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
 #ifdef CONFIG_CGROUP_BPF
 BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
 #endif
+BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 047b19fe716e..2bf33979f9ae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -229,6 +229,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index c1fae67a1452..cc0e205fb8c5 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -15,6 +15,11 @@ struct bpf_iter_target_info {
 	u32 btf_id;
 };
 
+struct bpf_iter_link {
+	struct bpf_link link;
+	struct bpf_iter_target_info *tinfo;
+};
+
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
@@ -68,3 +73,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
 
 	return supported;
 }
+
+static void bpf_iter_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_iter_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_iter_link *iter_link =
+		container_of(link, struct bpf_iter_link, link);
+
+	kfree(iter_link);
+}
+
+static const struct bpf_link_ops bpf_iter_link_lops = {
+	.release = bpf_iter_link_release,
+	.dealloc = bpf_iter_link_dealloc,
+};
+
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_iter_link *link;
+	bool existed = false;
+	u32 prog_btf_id;
+	int err;
+
+	if (attr->link_create.target_fd || attr->link_create.flags)
+		return -EINVAL;
+
+	prog_btf_id = prog->aux->attach_btf_id;
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (tinfo->btf_id == prog_btf_id) {
+			existed = true;
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+	if (!existed)
+		return -ENOENT;
+
+	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+	link->tinfo = tinfo;
+
+	err  = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		return err;
+	}
+
+	return bpf_link_settle(&link_primer);
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bb1ab7da6103..6ffe2d8fb6c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+	case BPF_TRACE_ITER:
+		return BPF_PROG_TYPE_TRACING;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3729,6 +3731,15 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	return err;
 }
 
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	if (attr->link_create.attach_type == BPF_TRACE_ITER &&
+	    prog->expected_attach_type == BPF_TRACE_ITER)
+		return bpf_iter_link_attach(attr, prog);
+
+	return -EINVAL;
+}
+
 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags
 static int link_create(union bpf_attr *attr)
 {
@@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_TRACING:
+		ret = tracing_bpf_link_attach(attr, prog);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 047b19fe716e..2bf33979f9ae 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -229,6 +229,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };
-- 
2.24.1



* [PATCH bpf-next v2 04/20] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (2 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 21:32   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
                   ` (15 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Added BPF_LINK_UPDATE support for tracing/iter programs.
This way, a file based bpf iterator, which holds a reference
to the link, can have its bpf program updated without
creating new files.
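
From user space, replacing the program behind a link could look like
below (a sketch; new_prog_fd is assumed to be a tracing/iter program
with the same attach_btf_id as the old one):

  /* userspace; needs <unistd.h>, <sys/syscall.h>, <linux/bpf.h> */
  union bpf_attr attr = {};

  attr.link_update.link_fd = link_fd;
  attr.link_update.new_prog_fd = new_prog_fd;
  syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));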

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/bpf_iter.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index cc0e205fb8c5..05ae04ac1eca 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -23,6 +23,9 @@ struct bpf_iter_link {
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
+/* protect bpf_iter_link changes */
+static DEFINE_MUTEX(link_mutex);
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
@@ -86,9 +89,37 @@ static void bpf_iter_link_dealloc(struct bpf_link *link)
 	kfree(iter_link);
 }
 
+static int bpf_iter_link_replace(struct bpf_link *link,
+				 struct bpf_prog *new_prog,
+				 struct bpf_prog *old_prog)
+{
+	int ret = 0;
+
+	mutex_lock(&link_mutex);
+	if (old_prog && link->prog != old_prog) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	if (link->prog->type != new_prog->type ||
+	    link->prog->expected_attach_type != new_prog->expected_attach_type ||
+	    link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	old_prog = xchg(&link->prog, new_prog);
+	bpf_prog_put(old_prog);
+
+out_unlock:
+	mutex_unlock(&link_mutex);
+	return ret;
+}
+
 static const struct bpf_link_ops bpf_iter_link_lops = {
 	.release = bpf_iter_link_release,
 	.dealloc = bpf_iter_link_dealloc,
+	.update_prog = bpf_iter_link_replace,
 };
 
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
-- 
2.24.1



* [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (3 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 04/20] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 19:56   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 06/20] bpf: create anonymous " Yonghong Song
                   ` (14 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The bpf iterator uses a seq_file to provide a lossless
way to transfer data to user space. But we want to call
the bpf program after all objects have been traversed, and
the bpf program may write additional data to the
seq_file buffer. The current seq_read() does not work
for this use case.

Besides allowing the stop() function to write to the buffer,
bpf_seq_read() also fixes the buffer size to one page.
If any single call of show() or stop() emits more than one
page of data and causes an overflow, the -E2BIG error code
will be returned to user space.
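
From user space, the iterator is then simply driven by reading its fd
until EOF (a sketch; iter_fd is assumed to come from an anonymous or
pinned bpf iterator created later in this series):

  /* userspace; needs <unistd.h> */
  char buf[4096];
  ssize_t n;

  while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
          write(STDOUT_FILENO, buf, n);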

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/bpf_iter.c | 128 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 05ae04ac1eca..2674c9cbc3dc 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -26,6 +26,134 @@ static DEFINE_MUTEX(targets_mutex);
 /* protect bpf_iter_link changes */
 static DEFINE_MUTEX(link_mutex);
 
+/* bpf_seq_read, a customized and simpler version for bpf iterator.
+ * no_llseek is assumed for this file.
+ * The following are differences from seq_read():
+ *  . fixed buffer size (PAGE_SIZE)
+ *  . assuming no_llseek
+ *  . stop() may call bpf program, handling potential overflow there
+ */
+static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
+			    loff_t *ppos)
+{
+	struct seq_file *seq = file->private_data;
+	size_t n, offs, copied = 0;
+	int err = 0;
+	void *p;
+
+	mutex_lock(&seq->lock);
+
+	if (!seq->buf) {
+		seq->size = PAGE_SIZE;
+		seq->buf = kmalloc(seq->size, GFP_KERNEL);
+		if (!seq->buf)
+			goto Enomem;
+	}
+
+	if (seq->count) {
+		n = min(seq->count, size);
+		err = copy_to_user(buf, seq->buf + seq->from, n);
+		if (err)
+			goto Efault;
+		seq->count -= n;
+		seq->from += n;
+		copied = n;
+		goto Done;
+	}
+
+	seq->from = 0;
+	p = seq->op->start(seq, &seq->index);
+	if (!p || IS_ERR(p))
+		goto Stop;
+
+	err = seq->op->show(seq, p);
+	if (seq_has_overflowed(seq)) {
+		err = -E2BIG;
+		goto Error_show;
+	} else if (err) {
+		/* < 0: go out, > 0: skip */
+		if (likely(err < 0))
+			goto Error_show;
+		seq->count = 0;
+	}
+
+	while (1) {
+		loff_t pos = seq->index;
+
+		offs = seq->count;
+		p = seq->op->next(seq, p, &seq->index);
+		if (pos == seq->index) {
+			pr_info_ratelimited("buggy seq_file .next function %ps "
+				"did not updated position index\n",
+				seq->op->next);
+			seq->index++;
+		}
+
+		if (!p || IS_ERR(p)) {
+			err = PTR_ERR(p);
+			break;
+		}
+		if (seq->count >= size)
+			break;
+
+		err = seq->op->show(seq, p);
+		if (seq_has_overflowed(seq)) {
+			if (offs == 0) {
+				err = -E2BIG;
+				goto Error_show;
+			}
+			seq->count = offs;
+			break;
+		} else if (err) {
+			/* < 0: go out, > 0: skip */
+			seq->count = offs;
+			if (likely(err < 0)) {
+				if (offs == 0)
+					goto Error_show;
+				break;
+			}
+		}
+	}
+Stop:
+	offs = seq->count;
+	/* may call bpf program */
+	seq->op->stop(seq, p);
+	if (seq_has_overflowed(seq)) {
+		if (offs == 0)
+			goto Error_stop;
+		seq->count = offs;
+	}
+
+	n = min(seq->count, size);
+	err = copy_to_user(buf, seq->buf, n);
+	if (err)
+		goto Efault;
+	copied = n;
+	seq->count -= n;
+	seq->from = n;
+Done:
+	if (!copied)
+		copied = err;
+	else
+		*ppos += copied;
+	mutex_unlock(&seq->lock);
+	return copied;
+
+Error_show:
+	seq->op->stop(seq, p);
+Error_stop:
+	seq->count = 0;
+	goto Done;
+
+Enomem:
+	err = -ENOMEM;
+	goto Done;
+
+Efault:
+	err = -EFAULT;
+	goto Done;
+}
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
-- 
2.24.1



* [PATCH bpf-next v2 06/20] bpf: create anonymous bpf iterator
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (4 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 20:11   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 07/20] bpf: create file " Yonghong Song
                   ` (13 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

A new bpf command BPF_ITER_CREATE is added.

The anonymous bpf iterator is seq_file based.
The seq_file private data are referenced by targets.
The bpf_iter infrastructure allocates additional space at
seq_file->private, before the space used by targets, to store
some meta data, e.g.,
  prog:       prog to run
  session_id: a unique id for each opened seq_file
  seq_num:    how many times the bpf program has been queried
              in this session
  do_stop:    an internal state to decide whether the bpf program
              should be called in seq_ops->stop() or not
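
From user space, creating the anonymous iterator could look like below
(a sketch; link_fd is assumed to refer to a tracing/iter bpf link):

  /* userspace; needs <unistd.h>, <sys/syscall.h>, <linux/bpf.h> */
  union bpf_attr attr = {};
  int iter_fd;

  attr.iter_create.link_fd = link_fd;
  iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
  /* read(iter_fd, ...) then drives the iterator until EOF */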

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |   1 +
 include/uapi/linux/bpf.h       |   6 ++
 kernel/bpf/bpf_iter.c          | 128 +++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  26 +++++++
 tools/include/uapi/linux/bpf.h |   6 ++
 5 files changed, 167 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8621ad080b24..9108d1a9b934 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1143,6 +1143,7 @@ struct bpf_iter_reg {
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int bpf_iter_new_fd(struct bpf_link *link);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2bf33979f9ae..97ceb0f2e539 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -614,6 +615,11 @@ union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2674c9cbc3dc..2a9f939be6e6 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2020 Facebook */
 
 #include <linux/fs.h>
+#include <linux/anon_inodes.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
 
@@ -20,12 +21,26 @@ struct bpf_iter_link {
 	struct bpf_iter_target_info *tinfo;
 };
 
+struct bpf_iter_priv_data {
+	struct {
+		struct bpf_iter_target_info *tinfo;
+		struct bpf_prog *prog;
+		u64 session_id;
+		u64 seq_num;
+		u64 do_stop;
+	};
+	u8 target_private[] __aligned(8);
+};
+
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
 /* protect bpf_iter_link changes */
 static DEFINE_MUTEX(link_mutex);
 
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -154,6 +169,31 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	goto Done;
 }
 
+static int iter_release(struct inode *inode, struct file *file)
+{
+	struct bpf_iter_priv_data *iter_priv;
+	void *file_priv = file->private_data;
+	struct seq_file *seq;
+
+	seq = file_priv;
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+
+	if (iter_priv->tinfo->fini_seq_private)
+		iter_priv->tinfo->fini_seq_private(seq->private);
+
+	bpf_prog_put(iter_priv->prog);
+	seq->private = iter_priv;
+
+	return seq_release_private(inode, file);
+}
+
+static const struct file_operations bpf_iter_fops = {
+	.llseek		= no_llseek,
+	.read		= bpf_seq_read,
+	.release	= iter_release,
+};
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
@@ -289,3 +329,91 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	return bpf_link_settle(&link_primer);
 }
+
+static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
+			  struct bpf_iter_target_info *tinfo,
+			  struct bpf_prog *prog)
+{
+	priv_data->tinfo = tinfo;
+	priv_data->prog = prog;
+	priv_data->session_id = atomic64_add_return(1, &session_id);
+	priv_data->seq_num = 0;
+	priv_data->do_stop = 0;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+{
+	struct bpf_iter_priv_data *priv_data;
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_prog *prog;
+	u32 total_priv_dsize;
+	struct seq_file *seq;
+	int err = 0;
+
+	mutex_lock(&link_mutex);
+	prog = link->link.prog;
+	bpf_prog_inc(prog);
+	mutex_unlock(&link_mutex);
+
+	tinfo = link->tinfo;
+	total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
+			   tinfo->seq_priv_size;
+	priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize);
+	if (!priv_data) {
+		err = -ENOMEM;
+		goto release_prog;
+	}
+
+	if (tinfo->init_seq_private) {
+		err = tinfo->init_seq_private(priv_data->target_private);
+		if (err)
+			goto release_seq_file;
+	}
+
+	init_seq_meta(priv_data, tinfo, prog);
+	seq = file->private_data;
+	seq->private = priv_data->target_private;
+
+	return 0;
+
+release_seq_file:
+	seq_release_private(file->f_inode, file);
+release_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+	struct file *file;
+	unsigned int flags;
+	int err, fd;
+
+	if (link->ops != &bpf_iter_link_lops)
+		return -EINVAL;
+
+	flags = O_RDONLY | O_CLOEXEC;
+	fd = get_unused_fd_flags(flags);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto free_fd;
+	}
+
+	err = prepare_seq_file(file,
+			       container_of(link, struct bpf_iter_link, link));
+	if (err)
+		goto free_file;
+
+	fd_install(fd, file);
+	return fd;
+
+free_file:
+	fput(file);
+free_fd:
+	put_unused_fd(fd);
+	return err;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6ffe2d8fb6c7..a293e88ee01a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3941,6 +3941,29 @@ static int bpf_enable_stats(union bpf_attr *attr)
 	return -EINVAL;
 }
 
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+	struct bpf_link *link;
+	int err;
+
+	if (CHECK_ATTR(BPF_ITER_CREATE))
+		return -EINVAL;
+
+	if (attr->iter_create.flags)
+		return -EINVAL;
+
+	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+
+	err = bpf_iter_new_fd(link);
+	bpf_link_put(link);
+
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_ENABLE_STATS:
 		err = bpf_enable_stats(&attr);
 		break;
+	case BPF_ITER_CREATE:
+		err = bpf_iter_create(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2bf33979f9ae..97ceb0f2e539 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -614,6 +615,11 @@ union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
2.24.1



* [PATCH bpf-next v2 07/20] bpf: create file bpf iterator
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (5 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 06/20] bpf: create anonymous " Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 20:15   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators Yonghong Song
                   ` (12 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

To produce a file bpf iterator, the fd must
correspond to a link_fd associated with a
tracing/iter program. When the pinned file is
opened, a seq_file will be generated.
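
From user space, pinning the link and reading the resulting file
could look like below (a sketch; the pin path is illustrative and
ptr_to_u64() is a hypothetical helper casting a pointer to __u64):

  /* userspace; needs <unistd.h>, <sys/syscall.h>, <linux/bpf.h> */
  union bpf_attr attr = {};

  attr.pathname = ptr_to_u64("/sys/fs/bpf/my_iter");
  attr.bpf_fd = link_fd;
  syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
  /* open("/sys/fs/bpf/my_iter", O_RDONLY) now creates a seq_file */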

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/bpf_iter.c | 17 ++++++++++++++++-
 kernel/bpf/inode.c    |  5 ++++-
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9108d1a9b934..26daf85cba10 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1022,6 +1022,7 @@ static inline void bpf_enable_instrumentation(void)
 
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
+extern const struct file_operations bpf_iter_fops;
 
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
@@ -1144,6 +1145,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_iter_new_fd(struct bpf_link *link);
+bool bpf_link_is_iter(struct bpf_link *link);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2a9f939be6e6..8bd787f3db6f 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -41,6 +41,8 @@ static DEFINE_MUTEX(link_mutex);
 /* incremented on every opened seq_file */
 static atomic64_t session_id;
 
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -169,6 +171,13 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	goto Done;
 }
 
+static int iter_open(struct inode *inode, struct file *file)
+{
+	struct bpf_iter_link *link = inode->i_private;
+
+	return prepare_seq_file(file, link);
+}
+
 static int iter_release(struct inode *inode, struct file *file)
 {
 	struct bpf_iter_priv_data *iter_priv;
@@ -188,7 +197,8 @@ static int iter_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static const struct file_operations bpf_iter_fops = {
+const struct file_operations bpf_iter_fops = {
+	.open		= iter_open,
 	.llseek		= no_llseek,
 	.read		= bpf_seq_read,
 	.release	= iter_release,
@@ -290,6 +300,11 @@ static const struct bpf_link_ops bpf_iter_link_lops = {
 	.update_prog = bpf_iter_link_replace,
 };
 
+bool bpf_link_is_iter(struct bpf_link *link)
+{
+	return link->ops == &bpf_iter_link_lops;
+}
+
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_link_primer link_primer;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 95087d9f4ed3..fb878ba3f22f 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 
 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
 {
+	struct bpf_link *link = arg;
+
 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
-			     &bpffs_obj_fops);
+			     bpf_link_is_iter(link) ?
+			     &bpf_iter_fops : &bpffs_obj_fops);
 }
 
 static struct dentry *
-- 
2.24.1



* [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (6 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 07/20] bpf: create file " Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-05 20:25   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 09/20] bpf: add bpf_map iterator Yonghong Song
                   ` (11 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Macro DEFINE_BPF_ITER_FUNC is implemented so a target
can define an init function to capture the BTF type
which represents the target.

The bpf_iter_meta is a structure holding meta data
common to all targets in the bpf program.

Additional marker functions are called before/after the
show() and stop() callbacks inside bpf_seq_read()
to help calculate a precise seq_num and to decide whether
to call the bpf program inside stop().

Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
are implemented so a target can get the needed information
from the bpf_iter infrastructure and run the bpf program.
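
A target's show() callback is expected to use these two helpers
roughly like below (a sketch mirroring the bpf_map iterator added
later in this series; ctx is the target-specific context struct):

  struct bpf_iter_meta meta;
  struct bpf_prog *prog;
  int ret = 0;

  meta.seq = seq;
  prog = bpf_iter_get_info(&meta, in_stop);
  if (prog)
          ret = bpf_iter_run_prog(prog, &ctx);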

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   | 11 +++++
 kernel/bpf/bpf_iter.c | 94 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 26daf85cba10..70c71c3cd9e8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1129,6 +1129,9 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
 #define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
+#define DEFINE_BPF_ITER_FUNC(target, args...)			\
+	extern int __bpf_iter__ ## target(args);		\
+	int __init __bpf_iter__ ## target(args) { return 0; }
 
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
@@ -1141,11 +1144,19 @@ struct bpf_iter_reg {
 	u32 seq_priv_size;
 };
 
+struct bpf_iter_meta {
+	__bpf_md_ptr(struct seq_file *, seq);
+	u64 session_id;
+	u64 seq_num;
+};
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_iter_new_fd(struct bpf_link *link);
 bool bpf_link_is_iter(struct bpf_link *link);
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 8bd787f3db6f..90d58c589816 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -43,6 +43,42 @@ static atomic64_t session_id;
 
 static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
 
+static void bpf_iter_inc_seq_num(struct seq_file *seq)
+{
+	struct bpf_iter_priv_data *iter_priv;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+	iter_priv->seq_num++;
+}
+
+static void bpf_iter_dec_seq_num(struct seq_file *seq)
+{
+	struct bpf_iter_priv_data *iter_priv;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+	iter_priv->seq_num--;
+}
+
+static void bpf_iter_set_stop(struct seq_file *seq)
+{
+	struct bpf_iter_priv_data *iter_priv;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+	iter_priv->do_stop++;
+}
+
+static void bpf_iter_unset_stop(struct seq_file *seq)
+{
+	struct bpf_iter_priv_data *iter_priv;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+	iter_priv->do_stop--;
+}
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -83,12 +119,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	if (!p || IS_ERR(p))
 		goto Stop;
 
+	bpf_iter_inc_seq_num(seq);
 	err = seq->op->show(seq, p);
 	if (seq_has_overflowed(seq)) {
+		bpf_iter_dec_seq_num(seq);
 		err = -E2BIG;
 		goto Error_show;
 	} else if (err) {
 		/* < 0: go out, > 0: skip */
+		bpf_iter_dec_seq_num(seq);
 		if (likely(err < 0))
 			goto Error_show;
 		seq->count = 0;
@@ -113,8 +152,10 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 		if (seq->count >= size)
 			break;
 
+		bpf_iter_inc_seq_num(seq);
 		err = seq->op->show(seq, p);
 		if (seq_has_overflowed(seq)) {
+			bpf_iter_dec_seq_num(seq);
 			if (offs == 0) {
 				err = -E2BIG;
 				goto Error_show;
@@ -122,6 +163,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 			seq->count = offs;
 			break;
 		} else if (err) {
+			bpf_iter_dec_seq_num(seq);
 			/* < 0: go out, > 0: skip */
 			seq->count = offs;
 			if (likely(err < 0)) {
@@ -134,11 +176,17 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 Stop:
 	offs = seq->count;
 	/* may call bpf program */
-	seq->op->stop(seq, p);
-	if (seq_has_overflowed(seq)) {
-		if (offs == 0)
-			goto Error_stop;
-		seq->count = offs;
+	if (!p) {
+		bpf_iter_set_stop(seq);
+		seq->op->stop(seq, p);
+		if (seq_has_overflowed(seq)) {
+			bpf_iter_unset_stop(seq);
+			if (offs == 0)
+				goto Error_stop;
+			seq->count = offs;
+		}
+	} else {
+		seq->op->stop(seq, p);
 	}
 
 	n = min(seq->count, size);
@@ -432,3 +480,39 @@ int bpf_iter_new_fd(struct bpf_link *link)
 	put_unused_fd(fd);
 	return err;
 }
+
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
+{
+	struct bpf_iter_priv_data *iter_priv;
+	struct seq_file *seq;
+	void *seq_priv;
+
+	seq = meta->seq;
+	if (seq->file->f_op != &bpf_iter_fops)
+		return NULL;
+
+	seq_priv = seq->private;
+	iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
+				 target_private);
+
+	if (in_stop && iter_priv->do_stop != 1)
+		return NULL;
+
+	meta->session_id = iter_priv->session_id;
+	meta->seq_num = iter_priv->seq_num;
+
+	return iter_priv->prog;
+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+	int ret;
+
+	rcu_read_lock();
+	migrate_disable();
+	ret = BPF_PROG_RUN(prog, ctx);
+	migrate_enable();
+	rcu_read_unlock();
+
+	return ret == 0 ? 0 : -EAGAIN;
+}
-- 
2.24.1



* [PATCH bpf-next v2 09/20] bpf: add bpf_map iterator
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (7 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-06  5:11   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 10/20] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
                   ` (10 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Implement seq_file operations to traverse all maps.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |   1 +
 kernel/bpf/Makefile   |   2 +-
 kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c  |  19 ++++++++
 4 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/map_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 70c71c3cd9e8..56b2ded9c2a6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1082,6 +1082,7 @@ int  generic_map_update_batch(struct bpf_map *map,
 int  generic_map_delete_batch(struct bpf_map *map,
 			      const union bpf_attr *attr,
 			      union bpf_attr __user *uattr);
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a8b0febd3f6..b2b5eefc5254 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
new file mode 100644
index 000000000000..fa16a4984326
--- /dev/null
+++ b/kernel/bpf/map_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+
+struct bpf_iter_seq_map_info {
+	struct bpf_map *map;
+	u32 id;
+};
+
+static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+	u32 id = info->id;
+
+	map = bpf_map_get_curr_or_next(&id);
+	if (!map)
+		return NULL;
+
+	++*pos;
+	info->map = map;
+	info->id = id;
+	return map;
+}
+
+static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+
+	++*pos;
+	++info->id;
+	map = bpf_map_get_curr_or_next(&info->id);
+	if (!map)
+		return NULL;
+
+	bpf_map_put(info->map);
+	info->map = map;
+	return map;
+}
+
+struct bpf_iter__bpf_map {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_map *, map);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map)
+
+static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter__bpf_map ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ctx.meta = &meta;
+	ctx.map = v;
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret;
+}
+
+static int bpf_map_seq_show(struct seq_file *seq, void *v)
+{
+	return __bpf_map_seq_show(seq, v, false);
+}
+
+static void bpf_map_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+
+	if (!v)
+		__bpf_map_seq_show(seq, v, true);
+
+	if (info->map) {
+		bpf_map_put(info->map);
+		info->map = NULL;
+	}
+}
+
+static const struct seq_operations bpf_map_seq_ops = {
+	.start	= bpf_map_seq_start,
+	.next	= bpf_map_seq_next,
+	.stop	= bpf_map_seq_stop,
+	.show	= bpf_map_seq_show,
+};
+
+static int __init bpf_map_iter_init(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "bpf_map",
+		.seq_ops		= &bpf_map_seq_ops,
+		.init_seq_private	= NULL,
+		.fini_seq_private	= NULL,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_map_info),
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+
+late_initcall(bpf_map_iter_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a293e88ee01a..de2a75500233 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2934,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	return err;
 }
 
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+	struct bpf_map *map;
+
+	spin_lock_bh(&map_idr_lock);
+again:
+	map = idr_get_next(&map_idr, id);
+	if (map) {
+		map = __bpf_map_inc_not_zero(map, false);
+		if (IS_ERR(map)) {
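+			/* refcount already dropped to zero; the map is
+			 * being freed, so skip to the next id
+			 */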
+			(*id)++;
+			goto again;
+		}
+	}
+	spin_unlock_bh(&map_idr_lock);
+
+	return map;
+}
+
 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
 
 struct bpf_prog *bpf_prog_by_id(u32 id)
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 10/20] net: bpf: add netlink and ipv6_route bpf_iter targets
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (8 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 09/20] bpf: add bpf_map iterator Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-06  5:21   ` Andrii Nakryiko
  2020-05-04  6:25 ` [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets Yonghong Song
                   ` (9 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

This patch adds netlink and ipv6_route targets, reusing
the same seq_ops as /proc/net/{netlink,ipv6_route}
(except show(), plus minor changes to stop()).
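
For example, a bpf program attached to the netlink target is
invoked once per netlink_sock. A minimal sketch (it assumes the
BPF_SEQ_PRINTF macro added by the libbpf patch later in this
series):

  SEC("iter/netlink")
  int dump_netlink(struct bpf_iter__netlink *ctx)
  {
  	struct seq_file *seq = ctx->meta->seq;
  	struct netlink_sock *nlk = ctx->sk;

  	if (!nlk)
  		return 0;

  	BPF_SEQ_PRINTF(seq, "%-10u %-8d\n", nlk->portid,
  		       nlk->sk.sk_protocol);
  	return 0;
  }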

The net namespace for these targets is the current net
namespace at file open time, similar to
/proc/net/{netlink,ipv6_route}, which take a reference
on the net namespace at seq_file open time.

Since modules are not supported for now, ipv6_route is
supported only if IPV6 is built-in, i.e., not compiled
as a module. The restriction can be lifted once module
support is properly implemented for bpf_iter.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 fs/proc/proc_net.c       | 19 +++++++++
 include/linux/proc_fs.h  |  3 ++
 net/ipv6/ip6_fib.c       | 65 +++++++++++++++++++++++++++++-
 net/ipv6/route.c         | 27 +++++++++++++
 net/netlink/af_netlink.c | 87 +++++++++++++++++++++++++++++++++++++++-
 5 files changed, 197 insertions(+), 4 deletions(-)

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4888c5224442..dba63b2429f0 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = {
 	.proc_release	= seq_release_net,
 };
 
+int bpf_iter_init_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+	struct seq_net_private *p = priv_data;
+
+	p->net = get_net(current->nsproxy->net_ns);
+#endif
+	return 0;
+}
+
+void bpf_iter_fini_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+	struct seq_net_private *p = priv_data;
+
+	put_net(p->net);
+#endif
+}
+
 struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
 		struct proc_dir_entry *parent, const struct seq_operations *ops,
 		unsigned int state_size, void *data)
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 45c05fd9c99d..03953c59807d 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    void *data);
 extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
+extern int bpf_iter_init_seq_net(void *priv_data);
+extern void bpf_iter_fini_seq_net(void *priv_data);
+
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 /*
  * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 46ed56719476..0ba2dc46a44c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void)
 }
 
 #ifdef CONFIG_PROC_FS
-static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
 {
 	struct fib6_info *rt = v;
 	struct ipv6_route_iter *iter = seq->private;
@@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
 	return w->node && !(w->state == FWS_U && w->node == w->root);
 }
 
-static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
 	__releases(RCU_BH)
 {
 	struct net *net = seq_file_net(seq);
@@ -2637,6 +2637,67 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
 	rcu_read_unlock_bh();
 }
 
+#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_iter__ipv6_route {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct fib6_info *, rt);
+};
+
+static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
+				    struct bpf_iter_meta *meta,
+				    void *v)
+{
+	struct bpf_iter__ipv6_route ctx;
+
+	ctx.meta = meta;
+	ctx.rt = v;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+	struct ipv6_route_iter *iter = seq->private;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	if (!prog)
+		return ipv6_route_native_seq_show(seq, v);
+
+	ret = ipv6_route_prog_seq_show(prog, &meta, v);
+	iter->w.leaf = NULL;
+
+	return ret;
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			ipv6_route_prog_seq_show(prog, &meta, v);
+	}
+
+	ipv6_route_native_seq_stop(seq, v);
+}
+#else
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+	return ipv6_route_native_seq_show(seq, v);
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+	ipv6_route_native_seq_stop(seq, v);
+}
+#endif
+
 const struct seq_operations ipv6_route_seq_ops = {
 	.start	= ipv6_route_seq_start,
 	.next	= ipv6_route_seq_next,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3912aac7854d..aa2d3afc8d8b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6393,6 +6393,25 @@ void __init ip6_route_init_special_entries(void)
   #endif
 }
 
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
+
+static int __init bpf_iter_register(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "ipv6_route",
+		.seq_ops		= &ipv6_route_seq_ops,
+		.init_seq_private	= bpf_iter_init_seq_net,
+		.fini_seq_private	= bpf_iter_fini_seq_net,
+		.seq_priv_size		= sizeof(struct ipv6_route_iter),
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+#endif
+#endif
+
 int __init ip6_route_init(void)
 {
 	int ret;
@@ -6455,6 +6474,14 @@ int __init ip6_route_init(void)
 	if (ret)
 		goto out_register_late_subsys;
 
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	ret = bpf_iter_register();
+	if (ret)
+		goto out_register_late_subsys;
+#endif
+#endif
+
 	for_each_possible_cpu(cpu) {
 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5ded01ca8b20..f0e4599c613c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	return __netlink_seq_next(seq);
 }
 
-static void netlink_seq_stop(struct seq_file *seq, void *v)
+static void netlink_native_seq_stop(struct seq_file *seq, void *v)
 {
 	struct nl_seq_iter *iter = seq->private;
 
@@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v)
 }
 
 
-static int netlink_seq_show(struct seq_file *seq, void *v)
+static int netlink_native_seq_show(struct seq_file *seq, void *v)
 {
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__netlink {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct netlink_sock *, sk);
+};
+
+DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)
+
+static int netlink_prog_seq_show(struct bpf_prog *prog,
+				  struct bpf_iter_meta *meta,
+				  void *v)
+{
+	struct bpf_iter__netlink ctx;
+
+	meta->seq_num--;  /* skip SEQ_START_TOKEN */
+	ctx.meta = meta;
+	ctx.sk = nlk_sk((struct sock *)v);
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int netlink_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	if (!prog)
+		return netlink_native_seq_show(seq, v);
+
+	if (v != SEQ_START_TOKEN)
+		return netlink_prog_seq_show(prog, &meta, v);
+
+	return 0;
+}
+
+static void netlink_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			netlink_prog_seq_show(prog, &meta, v);
+	}
+
+	netlink_native_seq_stop(seq, v);
+}
+#else
+static int netlink_seq_show(struct seq_file *seq, void *v)
+{
+	return netlink_native_seq_show(seq, v);
+}
+
+static void netlink_seq_stop(struct seq_file *seq, void *v)
+{
+	netlink_native_seq_stop(seq, v);
+}
+#endif
+
 static const struct seq_operations netlink_seq_ops = {
 	.start  = netlink_seq_start,
 	.next   = netlink_seq_next,
@@ -2740,6 +2802,21 @@ static const struct rhashtable_params netlink_rhashtable_params = {
 	.automatic_shrinking = true,
 };
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+static int __init bpf_iter_register(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "netlink",
+		.seq_ops		= &netlink_seq_ops,
+		.init_seq_private	= bpf_iter_init_seq_net,
+		.fini_seq_private	= bpf_iter_fini_seq_net,
+		.seq_priv_size		= sizeof(struct nl_seq_iter),
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+#endif
+
 static int __init netlink_proto_init(void)
 {
 	int i;
@@ -2748,6 +2825,12 @@ static int __init netlink_proto_init(void)
 	if (err != 0)
 		goto out;
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	err = bpf_iter_register();
+	if (err)
+		goto out;
+#endif
+
 	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));
 
 	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (9 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 10/20] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
@ 2020-05-04  6:25 ` Yonghong Song
  2020-05-06  7:30   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 12/20] bpf: add PTR_TO_BTF_ID_OR_NULL support Yonghong Song
                   ` (8 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:25 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Only the tasks belonging to the "current" pid namespace
are enumerated.

For the task/file target, the bpf program will have access to
  struct task_struct *task
  u32 fd
  struct file *file
where fd/file is an open file for the task.
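
A minimal sketch of a program for the task/file target (it assumes
the BPF_SEQ_PRINTF macro added by the libbpf patch later in this
series):

  SEC("iter/task_file")
  int dump_task_file(struct bpf_iter__task_file *ctx)
  {
  	struct seq_file *seq = ctx->meta->seq;
  	struct task_struct *task = ctx->task;
  	struct file *file = ctx->file;

  	if (!task || !file)
  		return 0;

  	BPF_SEQ_PRINTF(seq, "%8d %8d %lx\n", task->tgid, ctx->fd,
  		       (long)file->f_op);
  	return 0;
  }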

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/Makefile    |   2 +-
 kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 337 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/task_iter.c

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index b2b5eefc5254..37b2d8620153 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..1ca258f6e9f4
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+struct bpf_iter_seq_task_common {
+	struct pid_namespace *ns;
+};
+
+struct bpf_iter_seq_task_info {
+	struct bpf_iter_seq_task_common common;
+	struct task_struct *task;
+	u32 id;
+};
+
+static struct task_struct *task_seq_get_next(struct pid_namespace *ns, u32 *id)
+{
+	struct task_struct *task = NULL;
+	struct pid *pid;
+
+	rcu_read_lock();
+	pid = idr_get_next(&ns->idr, id);
+	if (pid)
+		task = get_pid_task(pid, PIDTYPE_PID);
+	rcu_read_unlock();
+
+	return task;
+}
+
+static void *task_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *task;
+	u32 id = info->id;
+
+	task = task_seq_get_next(info->common.ns, &id);
+	if (!task)
+		return NULL;
+
+	++*pos;
+	info->task = task;
+	info->id = id;
+
+	return task;
+}
+
+static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *task;
+
+	++*pos;
+	++info->id;
+	task = task_seq_get_next(info->common.ns, &info->id);
+	if (!task)
+		return NULL;
+
+	put_task_struct(info->task);
+	info->task = task;
+	return task;
+}
+
+struct bpf_iter__task {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+};
+
+DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
+
+static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__task ctx;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog) {
+		ctx.meta = &meta;
+		ctx.task = v;
+		ret = bpf_iter_run_prog(prog, &ctx);
+	}
+
+	return ret;
+}
+
+static int task_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_seq_show(seq, v, false);
+}
+
+static void task_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+
+	if (!v)
+		__task_seq_show(seq, v, true);
+
+	if (info->task) {
+		put_task_struct(info->task);
+		info->task = NULL;
+	}
+}
+
+static const struct seq_operations task_seq_ops = {
+	.start	= task_seq_start,
+	.next	= task_seq_next,
+	.stop	= task_seq_stop,
+	.show	= task_seq_show,
+};
+
+struct bpf_iter_seq_task_file_info {
+	struct bpf_iter_seq_task_common common;
+	struct task_struct *task;
+	struct files_struct *files;
+	u32 id;
+	u32 fd;
+};
+
+static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
+					   int *fd, struct task_struct **task,
+					   struct files_struct **fstruct)
+{
+	struct files_struct *files;
+	struct task_struct *tk;
+	u32 sid = *id;
+	int sfd;
+
+	/* If this function returns a non-NULL file object,
+	 * it holds a reference to the files_struct and file.
+	 * Otherwise, it does not hold any reference.
+	 */
+again:
+	if (*fstruct) {
+		files = *fstruct;
+		sfd = *fd;
+	} else {
+		tk = task_seq_get_next(ns, &sid);
+		if (!tk)
+			return NULL;
+
+		files = get_files_struct(tk);
+		put_task_struct(tk);
+		if (!files) {
+			sid = ++(*id);
+			*fd = 0;
+			goto again;
+		}
+		*fstruct = files;
+		*task = tk;
+		if (sid == *id) {
+			sfd = *fd;
+		} else {
+			*id = sid;
+			sfd = 0;
+		}
+	}
+
+	rcu_read_lock();
+	for (; sfd < files_fdtable(files)->max_fds; sfd++) {
+		struct file *f;
+
+		f = fcheck_files(files, sfd);
+		if (!f)
+			continue;
+		*fd = sfd;
+		get_file(f);
+		rcu_read_unlock();
+		return f;
+	}
+
+	/* the current task is done, go to the next task */
+	rcu_read_unlock();
+	put_files_struct(files);
+	*fstruct = NULL;
+	sid = ++(*id);
+	*fd = 0;
+	goto again;
+}
+
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct files_struct *files = NULL;
+	struct task_struct *task = NULL;
+	struct file *file;
+	u32 id = info->id;
+	int fd = info->fd;
+
+	file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
+	if (!file) {
+		info->files = NULL;
+		return NULL;
+	}
+
+	++*pos;
+	info->id = id;
+	info->fd = fd;
+	info->task = task;
+	info->files = files;
+
+	return file;
+}
+
+static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct files_struct *files = info->files;
+	struct task_struct *task = info->task;
+	struct file *file;
+
+	++*pos;
+	++info->fd;
+	fput((struct file *)v);
+	file = task_file_seq_get_next(info->common.ns, &info->id, &info->fd,
+				      &task, &files);
+	if (!file) {
+		info->files = NULL;
+		return NULL;
+	}
+
+	info->task = task;
+	info->files = files;
+
+	return file;
+}
+
+struct bpf_iter__task_file {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+	u32 fd;
+	__bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
+		     struct task_struct *task, u32 fd,
+		     struct file *file)
+
+static int __task_file_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct bpf_iter__task_file ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog) {
+		ctx.meta = &meta;
+		ctx.task = info->task;
+		ctx.fd = info->fd;
+		ctx.file = v;
+		ret = bpf_iter_run_prog(prog, &ctx);
+	}
+
+	return ret;
+}
+
+static int task_file_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_file_seq_show(seq, v, false);
+}
+
+static void task_file_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+
+	if (!v)
+		__task_file_seq_show(seq, v, true);
+	else if (!IS_ERR(v))
+		fput((struct file *)v);
+
+	if (info->files) {
+		put_files_struct(info->files);
+		info->files = NULL;
+	}
+}
+
+/* The first field of task/task_file private data is
+ * struct bpf_iter_seq_task_common.
+ */
+static int init_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	common->ns = get_pid_ns(task_active_pid_ns(current));
+	return 0;
+}
+
+static void fini_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	put_pid_ns(common->ns);
+}
+
+static const struct seq_operations task_file_seq_ops = {
+	.start	= task_file_seq_start,
+	.next	= task_file_seq_next,
+	.stop	= task_file_seq_stop,
+	.show	= task_file_seq_show,
+};
+
+static int __init task_iter_init(void)
+{
+	struct bpf_iter_reg task_file_reg_info = {
+		.target			= "task_file",
+		.seq_ops		= &task_file_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
+	};
+	struct bpf_iter_reg task_reg_info = {
+		.target			= "task",
+		.seq_ops		= &task_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
+	};
+	int ret;
+
+	ret = bpf_iter_reg_target(&task_reg_info);
+	if (ret)
+		return ret;
+
+	return bpf_iter_reg_target(&task_file_reg_info);
+}
+late_initcall(task_iter_init);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 12/20] bpf: add PTR_TO_BTF_ID_OR_NULL support
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (10 preceding siblings ...)
  2020-05-04  6:25 ` [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-05 20:27   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
                   ` (7 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support.
For a tracing/iter program, the bpf program context
definition, e.g., for the previous bpf_map target, looks like
  struct bpf_iter__bpf_map {
    struct bpf_iter_meta *meta;
    struct bpf_map *map;
  };

The kernel guarantees that meta is not NULL, but the
map pointer may be NULL. A NULL map indicates that all
objects have been traversed, so the bpf program can take
proper action, e.g., do final aggregation and/or send a
final report to user space.
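
From the bpf program side, the contract looks as below (a minimal
sketch; the verifier rejects any dereference of ctx->map before
the NULL check):

  SEC("iter/bpf_map")
  int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
  {
  	struct bpf_map *map = ctx->map;

  	if (!map) {
  		/* all objects traversed: do final aggregation and/or
  		 * send the final report here
  		 */
  		return 0;
  	}

  	/* map is now PTR_TO_BTF_ID and can be dereferenced */
  	return 0;
  }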

Add btf_id_or_null_non0_off to the prog->aux structure to
indicate that, if the context access offset is not 0, the
register type should be set to PTR_TO_BTF_ID_OR_NULL instead
of PTR_TO_BTF_ID. This bit is set for tracing/iter programs.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/btf.c      |  5 ++++-
 kernel/bpf/verifier.c | 16 ++++++++++++----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 56b2ded9c2a6..e8906199755a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -320,6 +320,7 @@ enum bpf_reg_type {
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 	PTR_TO_BTF_ID,		 /* reg points to kernel struct */
+	PTR_TO_BTF_ID_OR_NULL,	 /* reg points to kernel struct or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -658,6 +659,7 @@ struct bpf_prog_aux {
 	bool offload_requested;
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
 	bool func_proto_unreliable;
+	bool btf_id_or_null_non0_off;
 	enum bpf_tramp_prog_type trampoline_prog_type;
 	struct bpf_trampoline *trampoline;
 	struct hlist_node tramp_hlist;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a2cfba89a8e1..c490fbde22d4 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3790,7 +3790,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		return true;
 
 	/* this is a pointer to another type */
-	info->reg_type = PTR_TO_BTF_ID;
+	if (off != 0 && prog->aux->btf_id_or_null_non0_off)
+		info->reg_type = PTR_TO_BTF_ID_OR_NULL;
+	else
+		info->reg_type = PTR_TO_BTF_ID;
 
 	if (tgt_prog) {
 		ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d725ff7d11db..36b2a38a06fe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	return type == PTR_TO_MAP_VALUE_OR_NULL ||
 	       type == PTR_TO_SOCKET_OR_NULL ||
 	       type == PTR_TO_SOCK_COMMON_OR_NULL ||
-	       type == PTR_TO_TCP_SOCK_OR_NULL;
+	       type == PTR_TO_TCP_SOCK_OR_NULL ||
+	       type == PTR_TO_BTF_ID_OR_NULL;
 }
 
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -483,6 +484,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
 	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 	[PTR_TO_BTF_ID]		= "ptr_",
+	[PTR_TO_BTF_ID_OR_NULL]	= "ptr_or_null_",
 };
 
 static char slot_type_char[] = {
@@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			/* reg->off should be 0 for SCALAR_VALUE */
 			verbose(env, "%lld", reg->var_off.value + reg->off);
 		} else {
-			if (t == PTR_TO_BTF_ID)
+			if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
 				verbose(env, "%s", kernel_type_name(reg->btf_id));
 			verbose(env, "(id=%d", reg->id);
 			if (reg_type_may_be_refcounted_or_null(t))
@@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_BTF_ID_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 */
 		*reg_type = info.reg_type;
 
-		if (*reg_type == PTR_TO_BTF_ID)
+		if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
 			*btf_id = info.btf_id;
 		else
 			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
@@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				 * a sub-register.
 				 */
 				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
-				if (reg_type == PTR_TO_BTF_ID)
+				if (reg_type == PTR_TO_BTF_ID ||
+				    reg_type == PTR_TO_BTF_ID_OR_NULL)
 					regs[value_regno].btf_id = btf_id;
 			}
 			regs[value_regno].type = reg_type;
@@ -6572,6 +6576,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			reg->type = PTR_TO_SOCK_COMMON;
 		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
 			reg->type = PTR_TO_TCP_SOCK;
+		} else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
+			reg->type = PTR_TO_BTF_ID;
 		}
 		if (is_null) {
 			/* We don't need id and ref_obj_id from this point
@@ -8429,6 +8435,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_BTF_ID_OR_NULL:
 		return false;
 	default:
 		return true;
@@ -10640,6 +10647,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		prog->aux->attach_func_proto = t;
 		if (!bpf_iter_prog_supported(prog))
 			return -EINVAL;
+		prog->aux->btf_id_or_null_non0_off = true;
 		ret = btf_distill_func_proto(&env->log, btf, t,
 					     tname, &fmodel);
 		return ret;
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (11 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 12/20] bpf: add PTR_TO_BTF_ID_OR_NULL support Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-06 17:37   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 14/20] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary Yonghong Song
                   ` (6 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Two helpers, bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.

bpf_seq_printf supports common format string flag/width/type
fields so that identical results can be produced at least for
the netlink and ipv6_route targets.

For bpf_seq_printf and bpf_seq_write, the return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if the object collection stays the same. Note that if the object
collection changes, then depending on how collection traversal is
done, an object may not be visited even if it is still in the
collection.

bpf_seq_printf may return -EBUSY, meaning that the internal percpu
buffer for memory copies of strings or other pointees is
not available. The bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on non-RT kernels since migrate_disable(), which guards
the bpf prog call, maps to preempt_disable().
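
A minimal usage sketch from a bpf program, assuming seq and task
are valid pointers already in scope:

  static const char fmt[] = "pid %d comm %s\n";
  __u64 args[2];
  int err;

  args[0] = task->pid;
  args[1] = (__u64)(long)task->comm;
  err = bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));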

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h       |  32 +++++-
 kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |   2 +
 tools/include/uapi/linux/bpf.h |  32 +++++-
 4 files changed, 259 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97ceb0f2e539..e440a9d5cca2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3076,6 +3076,34 @@ union bpf_attr {
  * 		See: clock_gettime(CLOCK_BOOTTIME)
  * 	Return
  * 		Current *ktime*.
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * 	Description
+ * 		seq_printf uses seq_file seq_printf() to print out the format string.
+ * 		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 		the format string itself. The *data* and *data_len* are format string
+ * 		arguments. The *data* is a u64 array and corresponding format string
+ * 		values are stored in the array. For strings and pointers where pointees
+ * 		are accessed, only the pointer values are stored in the *data* array.
+ * 		The *data_len* is the *data* size in terms of bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EBUSY**		Percpu memory copy buffer is busy, can try again
+ *					by returning 1 from bpf program.
+ *		* **-EINVAL**		Invalid arguments, or invalid/unsupported formats.
+ *		* **-E2BIG**		Too many format specifiers.
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write uses seq_file seq_write() to write the data.
+ * 		The *m* represents the seq_file. The *data* and *len* represent the
+ *		data to write in bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3203,7 +3231,9 @@ union bpf_attr {
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
 	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e875c95d3ced..7c06eb39bacc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -457,6 +457,193 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
+#define MAX_SEQ_PRINTF_VARARGS		12
+#define MAX_SEQ_PRINTF_MAX_MEMCPY	6
+#define MAX_SEQ_PRINTF_STR_LEN		128
+
+struct bpf_seq_printf_buf {
+	char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
+static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
+
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+	   const void *, data, u32, data_len)
+{
+	int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
+	int i, buf_used, copy_size, num_args;
+	u64 params[MAX_SEQ_PRINTF_VARARGS];
+	struct bpf_seq_printf_buf *bufs;
+	const u64 *args = data;
+
+	buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
+	if (WARN_ON_ONCE(buf_used > 1)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	bufs = this_cpu_ptr(&bpf_seq_printf_buf);
+
+	/*
+	 * bpf_check()->check_func_arg()->check_stack_boundary()
+	 * guarantees that fmt points to bpf program stack,
+	 * fmt_size bytes of it were initialized and fmt_size > 0
+	 */
+	if (fmt[--fmt_size] != 0)
+		goto out;
+
+	if (data_len & 7)
+		goto out;
+
+	for (i = 0; i < fmt_size; i++) {
+		if (fmt[i] == '%' && (!data || !data_len))
+			goto out;
+	}
+
+	num_args = data_len / 8;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++) {
+		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+			goto out;
+
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
+			err = -E2BIG;
+			goto out;
+		}
+
+		if (fmt_cnt >= num_args)
+			goto out;
+
+		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+		i++;
+
+		/* skip optional "[0+-][num]" width formatting field */
+		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')
+			i++;
+		if (fmt[i] >= '1' && fmt[i] <= '9') {
+			i++;
+			while (fmt[i] >= '0' && fmt[i] <= '9')
+				i++;
+		}
+
+		if (fmt[i] == 's') {
+			/* disallow any further format extensions */
+			if (fmt[i + 1] != 0 &&
+			    !isspace(fmt[i + 1]) &&
+			    !ispunct(fmt[i + 1]))
+				goto out;
+
+			/* try our best to copy */
+			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+				err = -E2BIG;
+				goto out;
+			}
+
+			bufs->buf[memcpy_cnt][0] = 0;
+			strncpy_from_unsafe(bufs->buf[memcpy_cnt],
+					    (void *) (long) args[fmt_cnt],
+					    MAX_SEQ_PRINTF_STR_LEN);
+			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+			fmt_cnt++;
+			memcpy_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'p') {
+			if (fmt[i + 1] == 0 ||
+			    fmt[i + 1] == 'K' ||
+			    fmt[i + 1] == 'x') {
+				/* just kernel pointers */
+				params[fmt_cnt] = args[fmt_cnt];
+				fmt_cnt++;
+				continue;
+			}
+
+			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+			if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I')
+				goto out;
+			if (fmt[i + 2] != '4' && fmt[i + 2] != '6')
+				goto out;
+
+			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+				err = -E2BIG;
+				goto out;
+			}
+
+			copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+
+			probe_kernel_read(bufs->buf[memcpy_cnt],
+					  (void *) (long) args[fmt_cnt], copy_size);
+			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+			i += 2;
+			fmt_cnt++;
+			memcpy_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'l') {
+			i++;
+			if (fmt[i] == 'l')
+				i++;
+		}
+
+		if (fmt[i] != 'i' && fmt[i] != 'd' &&
+		    fmt[i] != 'u' && fmt[i] != 'x')
+			goto out;
+
+		params[fmt_cnt] = args[fmt_cnt];
+		fmt_cnt++;
+	}
+
+	/* We can have at most MAX_SEQ_PRINTF_VARARGS parameters, just give
+	 * all of them to seq_printf().
+	 */
+	seq_printf(m, fmt, params[0], params[1], params[2], params[3],
+		   params[4], params[5], params[6], params[7], params[8],
+		   params[9], params[10], params[11]);
+
+	err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
+out:
+	this_cpu_dec(bpf_seq_printf_buf_used);
+	return err;
+}
+
+static int bpf_seq_printf_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+	.func		= bpf_seq_printf,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
+	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+	.btf_id		= bpf_seq_printf_btf_ids,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
+{
+	return seq_write(m, data, len) ? -EOVERFLOW : 0;
+}
+
+static int bpf_seq_write_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_write_proto = {
+	.func		= bpf_seq_write,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.btf_id		= bpf_seq_write_btf_ids,
+};
+
 static __always_inline int
 get_map_perf_counter(struct bpf_map *map, u64 flags,
 		     u64 *value, u64 *enabled, u64 *running)
@@ -1226,6 +1413,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_xdp_output:
 		return &bpf_xdp_output_proto;
 #endif
+	case BPF_FUNC_seq_printf:
+		return prog->expected_attach_type == BPF_TRACE_ITER ?
+		       &bpf_seq_printf_proto :
+		       NULL;
+	case BPF_FUNC_seq_write:
+		return prog->expected_attach_type == BPF_TRACE_ITER ?
+		       &bpf_seq_write_proto :
+		       NULL;
 	default:
 		return raw_tp_prog_func_proto(func_id, prog);
 	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index f43d193aff3a..ded304c96a05 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -414,6 +414,7 @@ class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -450,6 +451,7 @@ class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97ceb0f2e539..e440a9d5cca2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3076,6 +3076,34 @@ union bpf_attr {
  * 		See: clock_gettime(CLOCK_BOOTTIME)
  * 	Return
  * 		Current *ktime*.
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * 	Description
+ * 		seq_printf uses seq_file seq_printf() to print out the format string.
+ * 		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 		the format string itself. The *data* and *data_len* are format string
+ * 		arguments. The *data* is a u64 array and corresponding format string
+ * 		values are stored in the array. For strings and pointers where pointees
+ * 		are accessed, only the pointer values are stored in the *data* array.
+ * 		The *data_len* is the *data* size in terms of bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EBUSY**		Percpu memory copy buffer is busy, can try again
+ *					by returning 1 from bpf program.
+ *		* **-EINVAL**		Invalid arguments, or invalid/unsupported formats.
+ *		* **-E2BIG**		Too many format specifiers.
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write uses seq_file seq_write() to write the data.
+ * 		The *m* represents the seq_file. The *data* and *len* represent the
+ *		data to write in bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3203,7 +3231,9 @@ union bpf_attr {
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
 	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 14/20] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (12 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-06 17:38   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 15/20] bpf: support variable length array in tracing programs Yonghong Song
                   ` (5 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

This is specifically to handle cases like the one below:
   // ptr below is a socket ptr identified by PTR_TO_BTF_ID
   u64 param[2] = { ptr, val };
   bpf_seq_printf(seq, fmt, sizeof(fmt), param, sizeof(param));

In this case, the 16 bytes stack for "param" contains:
   8 bytes for ptr with spilled PTR_TO_BTF_ID
   8 bytes for val as STACK_MISC

The current verifier will complain the ptr should not be visible
to the helper.
   ...
   16: (7b) *(u64 *)(r10 -64) = r2
   18: (7b) *(u64 *)(r10 -56) = r1
   19: (bf) r4 = r10
   ;
   20: (07) r4 += -64
   ; BPF_SEQ_PRINTF(seq, fmt1, (long)s, s->sk_protocol);
   21: (bf) r1 = r6
   22: (18) r2 = 0xffffa8d00018605a
   24: (b4) w3 = 10
   25: (b4) w5 = 16
   26: (85) call bpf_seq_printf#125
    R0=inv(id=0) R1_w=ptr_seq_file(id=0,off=0,imm=0)
    R2_w=map_value(id=0,off=90,ks=4,vs=144,imm=0) R3_w=inv10
    R4_w=fp-64 R5_w=inv16 R6=ptr_seq_file(id=0,off=0,imm=0)
    R7=ptr_netlink_sock(id=0,off=0,imm=0) R10=fp0 fp-56_w=mmmmmmmm
    fp-64_w=ptr_
   last_idx 26 first_idx 13
   regs=8 stack=0 before 25: (b4) w5 = 16
   regs=8 stack=0 before 24: (b4) w3 = 10
   invalid indirect read from stack off -64+0 size 16

Let us permit this if the program is a tracing/iter program.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 36b2a38a06fe..4884b6fd7bad 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3494,6 +3494,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 			*stype = STACK_MISC;
 			goto mark;
 		}
+
+		/* pointer value can be visible to tracing/iter program */
+		if (env->prog->type == BPF_PROG_TYPE_TRACING &&
+		    env->prog->expected_attach_type == BPF_TRACE_ITER &&
+		    state->stack[spi].slot_type[0] == STACK_SPILL &&
+		    state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+			goto mark;
+
 		if (state->stack[spi].slot_type[0] == STACK_SPILL &&
 		    state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
 			__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 15/20] bpf: support variable length array in tracing programs
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (13 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 14/20] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-06 17:40   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 16/20] tools/libbpf: add bpf_iter support Yonghong Song
                   ` (4 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

In /proc/net/ipv6_route, we have
  struct fib6_info {
    struct fib6_table *fib6_table;
    ...
    struct fib6_nh fib6_nh[0];
  }
  struct fib6_nh {
    struct fib_nh_common nh_common;
    struct rt6_info **rt6i_pcpu;
    struct rt6_exception_bucket *rt6i_exception_bucket;
  };
  struct fib_nh_common {
    ...
    u8 nhc_gw_family;
    ...
  }

The access:
  struct fib6_nh *fib6_nh = &rt->fib6_nh;
  ... fib6_nh->nh_common.nhc_gw_family ...

This patch ensures such an access is handled properly.
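
A minimal sketch of a bpf-side access that exercises this path,
assuming the ipv6_route target from the earlier patch in this
series:

  SEC("iter/ipv6_route")
  int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
  {
  	struct fib6_info *rt = ctx->rt;

  	if (!rt)
  		return 0;

  	/* nhc_gw_family lives past sizeof(struct fib6_info), inside
  	 * the zero-length fib6_nh[] tail array
  	 */
  	if (rt->fib6_nh->nh_common.nhc_gw_family == 0)
  		return 0;

  	return 0;
  }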

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/btf.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c490fbde22d4..dcd233139294 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3833,6 +3833,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
 	const struct btf_type *mtype, *elem_type = NULL;
 	const struct btf_member *member;
 	const char *tname, *mname;
+	u32 vlen;
 
 again:
 	tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
@@ -3841,7 +3842,43 @@ int btf_struct_access(struct bpf_verifier_log *log,
 		return -EINVAL;
 	}
 
+	vlen = btf_type_vlen(t);
 	if (off + size > t->size) {
+		/* If the last element is a variable size array, we may
+		 * need to relax the rule.
+		 */
+		struct btf_array *array_elem;
+
+		if (vlen == 0)
+			goto error;
+
+		member = btf_type_member(t) + vlen - 1;
+		mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
+						NULL);
+		if (!btf_type_is_array(mtype))
+			goto error;
+
+		array_elem = (struct btf_array *)(mtype + 1);
+		if (array_elem->nelems != 0)
+			goto error;
+
+		moff = btf_member_bit_offset(t, member) / 8;
+		if (off < moff)
+			goto error;
+
+		/* Only allow structure for now, can be relaxed for
+		 * other types later.
+		 */
+		elem_type = btf_type_skip_modifiers(btf_vmlinux,
+						    array_elem->type, NULL);
+		if (!btf_type_is_struct(elem_type))
+			goto error;
+
+		off = (off - moff) % elem_type->size;
+		return btf_struct_access(log, elem_type, off, size, atype,
+					 next_btf_id);
+
+error:
 		bpf_log(log, "access beyond struct %s at off %u size %u\n",
 			tname, off, size);
 		return -EACCES;
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 16/20] tools/libbpf: add bpf_iter support
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (14 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 15/20] bpf: support variable length array in tracing programs Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-06  5:44   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 17/20] tools/bpftool: add bpf_iter support for bpftool Yonghong Song
                   ` (3 subsequent siblings)
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Two new libbpf APIs are added to support bpf_iter:
  - bpf_program__attach_iter
    Given a bpf program and additional parameters (none
    for now), returns a bpf_link.
  - bpf_iter_create
    syscall-level API to create a bpf iterator.

The macro BPF_SEQ_PRINTF is also introduced. Its usage
looks like:
  BPF_SEQ_PRINTF(seq, "task id %d\n", pid);

This macro gives bpf program writers a nicer
bpf_seq_printf syntax, similar to the kernel seq_printf().
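
A minimal user space usage sketch for the two new APIs (error
handling shortened; prog is assumed to be an already-loaded
bpf_program):

  struct bpf_link *link;
  char buf[4096];
  int iter_fd, len;

  link = bpf_program__attach_iter(prog, NULL);
  if (libbpf_get_error(link))
  	return -1;

  iter_fd = bpf_iter_create(bpf_link__fd(link), 0);
  if (iter_fd < 0)
  	return -1;

  /* each read() triggers the kernel iteration and runs the
   * attached bpf program for every object
   */
  while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
  	fwrite(buf, 1, len, stdout);

  close(iter_fd);
  bpf_link__destroy(link);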

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/bpf.c         | 11 +++++++++
 tools/lib/bpf/bpf.h         |  2 ++
 tools/lib/bpf/bpf_tracing.h | 16 +++++++++++++
 tools/lib/bpf/libbpf.c      | 45 +++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.h      |  9 ++++++++
 tools/lib/bpf/libbpf.map    |  2 ++
 6 files changed, 85 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 43322f0d6c7f..1756ae47ddf2 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -619,6 +619,17 @@ int bpf_link_update(int link_fd, int new_prog_fd,
 	return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
 }
 
+int bpf_iter_create(int link_fd, unsigned int flags)
+{
+	union bpf_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.iter_create.link_fd = link_fd;
+	attr.iter_create.flags = flags;
+
+	return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
+}
+
 int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt)
 {
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 1901b2777854..d2748b9da86f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -187,6 +187,8 @@ struct bpf_link_update_opts {
 LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd,
 			       const struct bpf_link_update_opts *opts);
 
+LIBBPF_API int bpf_iter_create(int link_fd, unsigned int flags);
+
 struct bpf_prog_test_run_attr {
 	int prog_fd;
 	int repeat;
diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index f3f3c3fb98cb..cf97d07692b4 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -413,4 +413,20 @@ typeof(name(0)) name(struct pt_regs *ctx)				    \
 }									    \
 static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
 
+/*
+ * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values
+ * in a structure.
+ */
+#define BPF_SEQ_PRINTF(seq, fmt, args...)				    \
+	({								    \
+		_Pragma("GCC diagnostic push")				    \
+		_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")	    \
+		static const char ___fmt[] = fmt;			    \
+		unsigned long long ___param[] = { args };		    \
+		_Pragma("GCC diagnostic pop")				    \
+		int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt),    \
+					    ___param, sizeof(___param));    \
+		___ret;							    \
+	})
+
 #endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 977add1b73e2..93355a257405 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6629,6 +6629,9 @@ static const struct bpf_sec_def section_defs[] = {
 		.is_attach_btf = true,
 		.expected_attach_type = BPF_LSM_MAC,
 		.attach_fn = attach_lsm),
+	SEC_DEF("iter/", TRACING,
+		.expected_attach_type = BPF_TRACE_ITER,
+		.is_attach_btf = true),
 	BPF_PROG_SEC("xdp",			BPF_PROG_TYPE_XDP),
 	BPF_PROG_SEC("perf_event",		BPF_PROG_TYPE_PERF_EVENT),
 	BPF_PROG_SEC("lwt_in",			BPF_PROG_TYPE_LWT_IN),
@@ -6891,6 +6894,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
 
 #define BTF_TRACE_PREFIX "btf_trace_"
 #define BTF_LSM_PREFIX "bpf_lsm_"
+#define BTF_ITER_PREFIX "__bpf_iter__"
 #define BTF_MAX_NAME_SIZE 128
 
 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
@@ -6921,6 +6925,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name,
 	else if (attach_type == BPF_LSM_MAC)
 		err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name,
 					      BTF_KIND_FUNC);
+	else if (attach_type == BPF_TRACE_ITER)
+		err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name,
+					      BTF_KIND_FUNC);
 	else
 		err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
 
@@ -7882,6 +7889,44 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
 	return link;
 }
 
+struct bpf_link *
+bpf_program__attach_iter(struct bpf_program *prog,
+			 const struct bpf_iter_attach_opts *opts)
+{
+	enum bpf_attach_type attach_type;
+	char errmsg[STRERR_BUFSIZE];
+	struct bpf_link *link;
+	int prog_fd, link_fd;
+
+	if (!OPTS_VALID(opts, bpf_iter_attach_opts))
+		return ERR_PTR(-EINVAL);
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		pr_warn("program '%s': can't attach before loaded\n",
+			bpf_program__title(prog, false));
+		return ERR_PTR(-EINVAL);
+	}
+
+	link = calloc(1, sizeof(*link));
+	if (!link)
+		return ERR_PTR(-ENOMEM);
+	link->detach = &bpf_link__detach_fd;
+
+	attach_type = BPF_TRACE_ITER;
+	link_fd = bpf_link_create(prog_fd, 0, attach_type, NULL);
+	if (link_fd < 0) {
+		link_fd = -errno;
+		free(link);
+		pr_warn("program '%s': failed to attach to iterator: %s\n",
+			bpf_program__title(prog, false),
+			libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
+		return ERR_PTR(link_fd);
+	}
+	link->fd = link_fd;
+	return link;
+}
+
 struct bpf_link *bpf_program__attach(struct bpf_program *prog)
 {
 	const struct bpf_sec_def *sec_def;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index f1dacecb1619..8ea69558f0a8 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -258,6 +258,15 @@ struct bpf_map;
 
 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map);
 
+struct bpf_iter_attach_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+};
+#define bpf_iter_attach_opts__last_field sz
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_iter(struct bpf_program *prog,
+			 const struct bpf_iter_attach_opts *opts);
+
 struct bpf_insn;
 
 /*
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index e03bd4db827e..0133d469d30b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -258,6 +258,8 @@ LIBBPF_0.0.8 {
 LIBBPF_0.0.9 {
 	global:
 		bpf_enable_stats;
+		bpf_iter_create;
 		bpf_link_get_fd_by_id;
 		bpf_link_get_next_id;
+		bpf_program__attach_iter;
 } LIBBPF_0.0.8;
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 17/20] tools/bpftool: add bpf_iter support for bpftool
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (15 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 16/20] tools/libbpf: add bpf_iter support Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-04  6:26 ` [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
                   ` (2 subsequent siblings)
  19 siblings, 0 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Currently, only one command is supported:
  bpftool iter pin <bpf_prog.o> <path>

It pins the trace/iter bpf program found in
the object file <bpf_prog.o> to <path>,
where <path> must be on a bpffs mount.

For example,
  $ bpftool iter pin ./bpf_iter_ipv6_route.o \
    /sys/fs/bpf/my_route
Users can then do a `cat` to print out the results:
  $ cat /sys/fs/bpf/my_route
    fe800000000000000000000000000000 40 00000000000000000000000000000000 ...
    00000000000000000000000000000000 00 00000000000000000000000000000000 ...
    00000000000000000000000000000001 80 00000000000000000000000000000000 ...
    fe800000000000008c0162fffebdfd57 80 00000000000000000000000000000000 ...
    ff000000000000000000000000000000 08 00000000000000000000000000000000 ...
    00000000000000000000000000000000 00 00000000000000000000000000000000 ...

The implementation of the ipv6_route iterator is in one of the
subsequent patches.

This patch also added BPF_LINK_TYPE_ITER to link query.

In the future, we may add additional parameters to the pin command
by parameterizing the bpf iterator. For example, a map_id or pid
may be added to let the bpf program traverse only a single map or
task, similar to the kernel seq_file single_open().

We may also add an introspection command for targets/iterators by
leveraging bpf_iter itself.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../bpftool/Documentation/bpftool-iter.rst    | 83 ++++++++++++++++++
 tools/bpf/bpftool/bash-completion/bpftool     | 13 +++
 tools/bpf/bpftool/iter.c                      | 84 +++++++++++++++++++
 tools/bpf/bpftool/link.c                      |  1 +
 tools/bpf/bpftool/main.c                      |  3 +-
 tools/bpf/bpftool/main.h                      |  1 +
 6 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-iter.rst
 create mode 100644 tools/bpf/bpftool/iter.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
new file mode 100644
index 000000000000..13b173d93890
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
@@ -0,0 +1,83 @@
+============
+bpftool-iter
+============
+-------------------------------------------------------------------------------
+tool to create BPF iterators
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **iter** *COMMAND*
+
+	*COMMANDS* := { **pin** | **help** }
+
+ITER COMMANDS
+=============
+
+|	**bpftool** **iter pin** *OBJ* *PATH*
+|	**bpftool** **iter help**
+|
+|	*OBJ* := /a/file/of/bpf_iter_target.o
+
+
+DESCRIPTION
+===========
+	**bpftool iter pin** *OBJ* *PATH*
+		  A bpf iterator combines a kernel iterator over
+		  particular kernel data (e.g., tasks, bpf_maps, etc.)
+		  with a bpf program called for each kernel data object
+		  (e.g., one task, one bpf_map, etc.). User space can
+		  *read* the kernel iterator output via the *read()* syscall.
+
+		  The *pin* command creates a bpf iterator from *OBJ*
+		  and pins it to *PATH*. *PATH* must be located on a
+		  *bpffs* mount and must not contain a dot
+		  character ('.'), which is reserved for future extensions
+		  of *bpffs*.
+
+		  The user can then *cat PATH* to see the bpf iterator output.
+
+	**bpftool iter help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-V, --version
+		  Print version number (similar to **bpftool version**).
+
+	-d, --debug
+		  Print all logs available, even debug-level information. This
+		  includes logs from libbpf as well as from the verifier, when
+		  attempting to load programs.
+
+EXAMPLES
+========
+**# bpftool iter pin bpf_iter_netlink.o /sys/fs/bpf/my_netlink**
+
+::
+
+   Create a file-based bpf iterator from bpf_iter_netlink.o and pin it
+   to /sys/fs/bpf/my_netlink
+
+
+SEE ALSO
+========
+	**bpf**\ (2),
+	**bpf-helpers**\ (7),
+	**bpftool**\ (8),
+	**bpftool-prog**\ (8),
+	**bpftool-map**\ (8),
+	**bpftool-link**\ (8),
+	**bpftool-cgroup**\ (8),
+	**bpftool-feature**\ (8),
+	**bpftool-net**\ (8),
+	**bpftool-perf**\ (8),
+	**bpftool-btf**\ (8),
+	**bpftool-gen**\ (8),
+	**bpftool-struct_ops**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index fc989ead7313..9f0f20e73b87 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -610,6 +610,19 @@ _bpftool()
                     ;;
             esac
             ;;
+        iter)
+            case $command in
+                pin)
+                    _filedir
+                    return 0
+                    ;;
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'pin help' \
+                            -- "$cur" ) )
+                    ;;
+            esac
+            ;;
         map)
             local MAP_TYPE='id pinned name'
             case $command in
diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c
new file mode 100644
index 000000000000..a8fb1349c103
--- /dev/null
+++ b/tools/bpf/bpftool/iter.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (C) 2020 Facebook
+
+#define _GNU_SOURCE
+#include <linux/err.h>
+#include <bpf/libbpf.h>
+
+#include "main.h"
+
+static int do_pin(int argc, char **argv)
+{
+	const char *objfile, *path;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	struct bpf_link *link;
+	int err;
+
+	if (!REQ_ARGS(2))
+		usage();
+
+	objfile = GET_ARG();
+	path = GET_ARG();
+
+	obj = bpf_object__open(objfile);
+	if (IS_ERR_OR_NULL(obj)) {
+		p_err("can't open objfile %s", objfile);
+		return -1;
+	}
+
+	err = bpf_object__load(obj);
+	if (err) {
+		p_err("can't load objfile %s", objfile);
+		goto close_obj;
+	}
+
+	prog = bpf_program__next(NULL, obj);
+	link = bpf_program__attach_iter(prog, NULL);
+	if (IS_ERR(link)) {
+		err = PTR_ERR(link);
+		p_err("attach_iter failed for program %s",
+		      bpf_program__name(prog));
+		goto close_obj;
+	}
+
+	err = mount_bpffs_for_pin(path);
+	if (err)
+		goto close_link;
+
+	err = bpf_link__pin(link, path);
+	if (err) {
+		p_err("pin_iter failed for program %s to path %s",
+		      bpf_program__name(prog), path);
+		goto close_link;
+	}
+
+close_link:
+	bpf_link__disconnect(link);
+	bpf_link__destroy(link);
+close_obj:
+	bpf_object__close(obj);
+	return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s pin OBJ PATH\n"
+		"       %s %s help\n"
+		"\n",
+		bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "help",	do_help },
+	{ "pin",	do_pin },
+	{ 0 }
+};
+
+int do_iter(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index adc7dc431ed8..b6a0b35c78ae 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -16,6 +16,7 @@ static const char * const link_type_name[] = {
 	[BPF_LINK_TYPE_RAW_TRACEPOINT]		= "raw_tracepoint",
 	[BPF_LINK_TYPE_TRACING]			= "tracing",
 	[BPF_LINK_TYPE_CGROUP]			= "cgroup",
+	[BPF_LINK_TYPE_ITER]			= "iter",
 };
 
 static int link_parse_fd(int *argc, char ***argv)
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1413a154806e..46bd716a9d86 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -59,7 +59,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n"
+		"       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -224,6 +224,7 @@ static const struct cmd cmds[] = {
 	{ "btf",	do_btf },
 	{ "gen",	do_gen },
 	{ "struct_ops",	do_struct_ops },
+	{ "iter",	do_iter },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 9b1fb81a8331..a41cefabccaf 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -199,6 +199,7 @@ int do_feature(int argc, char **argv);
 int do_btf(int argc, char **argv);
 int do_gen(int argc, char **argv);
 int do_struct_ops(int argc, char **argv);
+int do_iter(int argc, char **argv);
 
 int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
 int prog_parse_fd(int *argc, char ***argv);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (16 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 17/20] tools/bpftool: add bpf_iter support for bpftool Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-06  6:01   ` Andrii Nakryiko
  2020-05-06  6:04   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 19/20] tools/bpf: selftests: add iter progs for bpf_map/task/task_file Yonghong Song
  2020-05-04  6:26 ` [PATCH bpf-next v2 20/20] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
  19 siblings, 2 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Two bpf programs are added in this patch, for the netlink and
ipv6_route targets. On my VM, their output is identical to that of
/proc/net/netlink and /proc/net/ipv6_route.

  $ cat /proc/net/netlink
  sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
  000000002c42d58b 0   0          00000000 0        0        0     2        0        7
  00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
  00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
  000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
  ....
  00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
  000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
  00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
  000000008398fb08 16  0          00000000 0        0        0     2        0        27
  $ cat /sys/fs/bpf/my_netlink
  sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
  000000002c42d58b 0   0          00000000 0        0        0     2        0        7
  00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
  00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
  000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
  ....
  00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
  000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
  00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
  000000008398fb08 16  0          00000000 0        0        0     2        0        27

  $ cat /proc/net/ipv6_route
  fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
  fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
  ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  $ cat /sys/fs/bpf/my_ipv6_route
  fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
  fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
  ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 63 ++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_netlink.c    | 74 +++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c

diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
new file mode 100644
index 000000000000..0dee4629298f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
+
+#define	RTF_GATEWAY		0x0002
+#define IFNAMSIZ		16
+#define fib_nh_gw_family        nh_common.nhc_gw_family
+#define fib_nh_gw6              nh_common.nhc_gw.ipv6
+#define fib_nh_dev              nh_common.nhc_dev
+
+SEC("iter/ipv6_route")
+int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct fib6_info *rt = ctx->rt;
+	const struct net_device *dev;
+	struct fib6_nh *fib6_nh;
+	unsigned int flags;
+	struct nexthop *nh;
+
+	if (rt == (void *)0)
+		return 0;
+
+	fib6_nh = &rt->fib6_nh[0];
+	flags = rt->fib6_flags;
+
+	/* FIXME: nexthop_is_multipath is not handled here. */
+	nh = rt->nh;
+	if (rt->nh)
+		fib6_nh = &nh->nh_info->fib6_nh;
+
+	BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
+
+	if (CONFIG_IPV6_SUBTREES)
+		BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_src.addr,
+			       rt->fib6_src.plen);
+	else
+		BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 00 ");
+
+	if (fib6_nh->fib_nh_gw_family) {
+		flags |= RTF_GATEWAY;
+		BPF_SEQ_PRINTF(seq, "%pi6 ", &fib6_nh->fib_nh_gw6);
+	} else {
+		BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 ");
+	}
+
+	dev = fib6_nh->fib_nh_dev;
+	if (dev)
+		BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
+			       rt->fib6_ref.refs.counter, 0, flags, dev->name);
+	else
+		BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
+			       rt->fib6_ref.refs.counter, 0, flags);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
new file mode 100644
index 000000000000..0a85a621a36d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define sk_rmem_alloc	sk_backlog.rmem_alloc
+#define sk_refcnt	__sk_common.skc_refcnt
+
+#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
+#define container_of(ptr, type, member)				\
+	({							\
+		void *__mptr = (void *)(ptr);			\
+		((type *)(__mptr - offsetof(type, member)));	\
+	})
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
+SEC("iter/netlink")
+int dump_netlink(struct bpf_iter__netlink *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct netlink_sock *nlk = ctx->sk;
+	unsigned long group, ino;
+	struct inode *inode;
+	struct socket *sk;
+	struct sock *s;
+
+	if (nlk == (void *)0)
+		return 0;
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "sk               Eth Pid        Groups   "
+				    "Rmem     Wmem     Dump  Locks    Drops    "
+				    "Inode\n");
+
+	s = &nlk->sk;
+	BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol);
+
+	if (!nlk->groups)  {
+		group = 0;
+	} else {
+		/* FIXME: temporary use bpf_probe_read here, needs
+		 * verifier support to do direct access.
+		 */
+		bpf_probe_read(&group, sizeof(group), &nlk->groups[0]);
+	}
+	BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ",
+		       nlk->portid, (u32)group,
+		       s->sk_rmem_alloc.counter,
+		       s->sk_wmem_alloc.refs.counter - 1,
+		       nlk->cb_running, s->sk_refcnt.refs.counter);
+
+	sk = s->sk_socket;
+	if (!sk) {
+		ino = 0;
+	} else {
+		/* FIXME: container_of inside SOCK_INODE has a forced
+		 * type conversion, and direct access cannot be used
+		 * with current verifier.
+		 */
+		inode = SOCK_INODE(sk);
+		bpf_probe_read(&ino, sizeof(ino), &inode->i_ino);
+	}
+	BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino);
+
+	return 0;
+}
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 19/20] tools/bpf: selftests: add iter progs for bpf_map/task/task_file
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (17 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-06  6:14   ` Andrii Nakryiko
  2020-05-04  6:26 ` [PATCH bpf-next v2 20/20] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The implementation is arbitrary, just to show how bpf programs
can be written for bpf_map/task/task_file. They can be customized
for specific needs.

For example, for bpf_map, the iterator prints out:
  $ cat /sys/fs/bpf/my_bpf_map
      id   refcnt  usercnt  locked_vm
       3        2        0         20
       6        2        0         20
       9        2        0         20
      12        2        0         20
      13        2        0         20
      16        2        0         20
      19        2        0         20
      === END ===

For task, the iterator prints out:
  $ cat /sys/fs/bpf/my_task
    tgid      gid
       1        1
       2        2
    ....
    1944     1944
    1948     1948
    1949     1949
    1953     1953
    === END ===

For task/file, the iterator prints out:
  $ cat /sys/fs/bpf/my_task_file
    tgid      gid       fd      file
       1        1        0 ffffffff95c97600
       1        1        1 ffffffff95c97600
       1        1        2 ffffffff95c97600
    ....
    1895     1895      255 ffffffff95c8fe00
    1932     1932        0 ffffffff95c8fe00
    1932     1932        1 ffffffff95c8fe00
    1932     1932        2 ffffffff95c8fe00
    1932     1932        3 ffffffff95c185c0

This is able to print out all open files (fd and file->f_op), so the user
can compare f_op against a particular kernel file_operations structure to
find out what it is.
For example, from /proc/kallsyms, we can find
  ffffffff95c185c0 r eventfd_fops
so we will know tgid 1932 fd 3 is an eventfd file descriptor.
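
One way to do that lookup (the address is taken from the sample
output above):
  $ grep ffffffff95c185c0 /proc/kallsyms
  ffffffff95c185c0 r eventfd_fops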

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/progs/bpf_iter_bpf_map.c    | 29 +++++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_task.c       | 26 +++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_task_file.c  | 27 +++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_file.c

diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
new file mode 100644
index 000000000000..d0af0e82b74c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	__u64 seq_num = ctx->meta->seq_num;
+	struct bpf_map *map = ctx->map;
+
+	if (map == (void *)0) {
+		BPF_SEQ_PRINTF(seq, "      === END ===\n");
+		return 0;
+	}
+
+	if (seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "      id   refcnt  usercnt  locked_vm\n");
+
+	BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter,
+		       map->usercnt.counter,
+		       map->memory.user->locked_vm.counter);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
new file mode 100644
index 000000000000..ea3c28e46aa5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+
+	if (task == (void *)0) {
+		BPF_SEQ_PRINTF(seq, "    === END ===\n");
+		return 0;
+	}
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "    tgid      gid\n");
+
+	BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
new file mode 100644
index 000000000000..7c6d3cfe733b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/task_file")
+int dump_task_file(struct bpf_iter__task_file *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	__u32 fd = ctx->fd;
+	struct file *file = ctx->file;
+
+	if (task == (void *)0 || file == (void *)0)
+		return 0;
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "    tgid      gid       fd      file\n");
+
+	BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+		       (long)file->f_op);
+	return 0;
+}
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH bpf-next v2 20/20] tools/bpf: selftests: add bpf_iter selftests
  2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (18 preceding siblings ...)
  2020-05-04  6:26 ` [PATCH bpf-next v2 19/20] tools/bpf: selftests: add iter progs for bpf_map/task/task_file Yonghong Song
@ 2020-05-04  6:26 ` Yonghong Song
  2020-05-06  6:39   ` Andrii Nakryiko
  19 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-04  6:26 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The added test includes the following subtests:
  - test verifier change for btf_id_or_null
  - test load/create_iter/read for
    ipv6_route/netlink/bpf_map/task/task_file
  - test anon bpf iterator
  - test anon bpf iterator reading one char at a time
  - test file bpf iterator
  - test overflow (single bpf program output does not overflow)
  - test overflow (single bpf program output overflows)

The ipv6_route program tests the following verifier change:
  - access fields in the variable length array of the structure.

Loading the netlink program tests the following verifier change:
  - put a btf_id ptr value on the stack and keep it accessible to
    tracing/iter programs (sketched below).
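
A minimal sketch of the pattern this change enables, modeled on the
netlink iterator program from patch 18 earlier in this series (the
function name here is just for illustration, and the same includes
as bpf_iter_netlink.c are assumed):

  SEC("iter/netlink")
  int dump_netlink_sketch(struct bpf_iter__netlink *ctx)
  {
  	struct netlink_sock *nlk = ctx->sk;
  	struct sock *s;

  	if (nlk == (void *)0)
  		return 0;

  	/* a btf_id ptr value lives on the stack and is used later */
  	s = &nlk->sk;
  	BPF_SEQ_PRINTF(ctx->meta->seq, "%pK\n", s);
  	return 0;
  }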

  $ test_progs -n 2
  #2/1 btf_id_or_null:OK
  #2/2 ipv6_route:OK
  #2/3 netlink:OK
  #2/4 bpf_map:OK
  #2/5 task:OK
  #2/6 task_file:OK
  #2/7 anon:OK
  #2/8 anon-read-one-char:OK
  #2/9 file:OK
  #2/10 overflow:OK
  #2/11 overflow-e2big:OK
  #2 bpf_iter:OK
  Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/prog_tests/bpf_iter.c       | 390 ++++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_test_kern1.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern2.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern3.c |  18 +
 .../selftests/bpf/progs/bpf_iter_test_kern4.c |  48 +++
 .../bpf/progs/bpf_iter_test_kern_common.h     |  22 +
 6 files changed, 486 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
new file mode 100644
index 000000000000..3df2a400083a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+#include "bpf_iter_ipv6_route.skel.h"
+#include "bpf_iter_netlink.skel.h"
+#include "bpf_iter_bpf_map.skel.h"
+#include "bpf_iter_task.skel.h"
+#include "bpf_iter_task_file.skel.h"
+#include "bpf_iter_test_kern1.skel.h"
+#include "bpf_iter_test_kern2.skel.h"
+#include "bpf_iter_test_kern3.skel.h"
+#include "bpf_iter_test_kern4.skel.h"
+
+static int duration;
+
+static void test_btf_id_or_null(void)
+{
+	struct bpf_iter_test_kern3 *skel;
+
+	skel = bpf_iter_test_kern3__open_and_load();
+	if (CHECK(skel, "bpf_iter_test_kern3__open_and_load",
+		  "skeleton open_and_load unexpectedly succeeded\n")) {
+		bpf_iter_test_kern3__destroy(skel);
+		return;
+	}
+}
+
+static void do_dummy_read(struct bpf_program *prog)
+{
+	struct bpf_link *link;
+	char buf[16] = {};
+	int iter_fd, len;
+
+	link = bpf_program__attach_iter(prog, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		return;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link), 0);
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* do not check contents, just ensure read() ends without error */
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+		;
+	CHECK(len < 0, "read", "read failed: %s\n", strerror(errno));
+
+	close(iter_fd);
+
+free_link:
+	bpf_link__disconnect(link);
+	bpf_link__destroy(link);
+}
+
+static void test_ipv6_route(void)
+{
+	struct bpf_iter_ipv6_route *skel;
+
+	skel = bpf_iter_ipv6_route__open_and_load();
+	if (CHECK(!skel, "bpf_iter_ipv6_route__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_ipv6_route);
+
+	bpf_iter_ipv6_route__destroy(skel);
+}
+
+static void test_netlink(void)
+{
+	struct bpf_iter_netlink *skel;
+
+	skel = bpf_iter_netlink__open_and_load();
+	if (CHECK(!skel, "bpf_iter_netlink__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_netlink);
+
+	bpf_iter_netlink__destroy(skel);
+}
+
+static void test_bpf_map(void)
+{
+	struct bpf_iter_bpf_map *skel;
+
+	skel = bpf_iter_bpf_map__open_and_load();
+	if (CHECK(!skel, "bpf_iter_bpf_map__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_bpf_map);
+
+	bpf_iter_bpf_map__destroy(skel);
+}
+
+static void test_task(void)
+{
+	struct bpf_iter_task *skel;
+
+	skel = bpf_iter_task__open_and_load();
+	if (CHECK(!skel, "bpf_iter_task__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_task);
+
+	bpf_iter_task__destroy(skel);
+}
+
+static void test_task_file(void)
+{
+	struct bpf_iter_task_file *skel;
+
+	skel = bpf_iter_task_file__open_and_load();
+	if (CHECK(!skel, "bpf_iter_task_file__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_task_file);
+
+	bpf_iter_task_file__destroy(skel);
+}
+
+/* The expected string is less than 16 bytes */
+static int do_read_with_fd(int iter_fd, const char *expected,
+			   bool read_one_char)
+{
+	int err = -1, len, read_buf_len, start;
+	char buf[16] = {};
+
+	read_buf_len = read_one_char ? 1 : 16;
+	start = 0;
+	while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) {
+		start += len;
+		if (CHECK(start >= 16, "read", "read len %d\n", len))
+			return -1;
+		read_buf_len = read_one_char ? 1 : 16 - start;
+	}
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		return -1;
+
+	err = strcmp(buf, expected);
+	if (CHECK(err, "read", "incorrect read result: buf %s, expected %s\n",
+		  buf, expected))
+		return -1;
+
+	return 0;
+}
+
+static void test_anon_iter(bool read_one_char)
+{
+	struct bpf_iter_test_kern1 *skel;
+	struct bpf_link *link;
+	int iter_fd;
+
+	skel = bpf_iter_test_kern1__open_and_load();
+	if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	link = bpf_program__attach_iter(skel->progs.dump_task, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link), 0);
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	do_read_with_fd(iter_fd, "abcd", read_one_char);
+	close(iter_fd);
+
+free_link:
+	bpf_link__disconnect(link);
+	bpf_link__destroy(link);
+out:
+	bpf_iter_test_kern1__destroy(skel);
+}
+
+static int do_read(const char *path, const char *expected)
+{
+	int err, iter_fd;
+
+	iter_fd = open(path, O_RDONLY);
+	if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n",
+		  path, strerror(errno)))
+		return -1;
+
+	err = do_read_with_fd(iter_fd, expected, false);
+	close(iter_fd);
+	return err;
+}
+
+static void test_file_iter(void)
+{
+	const char *path = "/sys/fs/bpf/bpf_iter_test1";
+	struct bpf_iter_test_kern1 *skel1;
+	struct bpf_iter_test_kern2 *skel2;
+	struct bpf_link *link;
+	int err;
+
+	skel1 = bpf_iter_test_kern1__open_and_load();
+	if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	link = bpf_program__attach_iter(skel1->progs.dump_task, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	/* unlink this path if it exists. */
+	unlink(path);
+
+	err = bpf_link__pin(link, path);
+	if (CHECK(err, "pin_iter", "pin_iter to %s failed: %s\n", path,
+		  strerror(errno)))
+		goto free_link;
+
+	err = do_read(path, "abcd");
+	if (err)
+		goto free_link;
+
+	/* The file-based iterator seems to work fine. Let us do a link
+	 * update of the underlying link and `cat` the iterator again;
+	 * its content should change.
+	 */
+	skel2 = bpf_iter_test_kern2__open_and_load();
+	if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		goto free_link;
+
+	err = bpf_link__update_program(link, skel2->progs.dump_task);
+	if (CHECK(err, "update_prog", "update_prog failed\n"))
+		goto destroy_skel2;
+
+	do_read(path, "ABCD");
+
+destroy_skel2:
+	bpf_iter_test_kern2__destroy(skel2);
+free_link:
+	bpf_link__disconnect(link);
+	bpf_link__destroy(link);
+out:
+	bpf_iter_test_kern1__destroy(skel1);
+}
+
+static void test_overflow(bool test_e2big_overflow)
+{
+	__u32 map_info_len, total_read_len, expected_read_len;
+	int err, iter_fd, map1_fd, map2_fd, len;
+	struct bpf_map_info map_info = {};
+	struct bpf_iter_test_kern4 *skel;
+	struct bpf_link *link;
+	__u32 page_size;
+	char *buf;
+
+	skel = bpf_iter_test_kern4__open();
+	if (CHECK(!skel, "bpf_iter_test_kern4__open",
+		  "skeleton open failed\n"))
+		return;
+
+	/* create two maps: bpf program will only do bpf_seq_write
+	 * for these two maps. The goal is one map output almost
+	 * fills seq_file buffer and then the other will trigger
+	 * overflow and needs restart.
+	 */
+	map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (CHECK(map1_fd < 0, "bpf_create_map",
+		  "map_creation failed: %s\n", strerror(errno)))
+		goto out;
+	map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (CHECK(map2_fd < 0, "bpf_create_map",
+		  "map_creation failed: %s\n", strerror(errno)))
+		goto free_map1;
+
+	/* bpf_seq_printf kernel buffer is one page, so one map
+	 * bpf_seq_write will mostly fill it, and the other map
+	 * will partially fill and then trigger overflow and need
+	 * bpf_seq_read restart.
+	 */
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	if (test_e2big_overflow)
+		skel->rodata->print_len = (page_size + 8) / 8;
+	else
+		skel->rodata->print_len = (page_size - 8) / 8;
+
+	if (CHECK(bpf_iter_test_kern4__load(skel),
+		  "bpf_iter_test_kern4__load", "skeleton load failed\n"))
+		goto free_map2;
+
+	/* setup filtering map_id in bpf program */
+	map_info_len = sizeof(map_info);
+	err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len);
+	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
+		  strerror(errno)))
+		goto free_map2;
+	skel->bss->map1_id = map_info.id;
+
+	err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len);
+	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
+		  strerror(errno)))
+		goto free_map2;
+	skel->bss->map2_id = map_info.id;
+
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto free_map2;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link), 0);
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	expected_read_len = 2 * (page_size - 8);
+	buf = malloc(expected_read_len);
+	if (!buf)
+		goto close_iter;
+
+	/* do read */
+	total_read_len = 0;
+	while ((len = read(iter_fd, buf, expected_read_len)) > 0)
+		total_read_len += len;
+
+	if (test_e2big_overflow) {
+		CHECK(len != -1 || errno != E2BIG, "read",
+		      "expected ret -1, errno E2BIG, but get ret %d, error %s\n",
+			  len, strerror(errno));
+		goto free_buf;
+	} else {
+		if (CHECK(len < 0, "read", "read failed: %s\n",
+			  strerror(errno)))
+			goto free_buf;
+	}
+
+	if (CHECK(total_read_len != expected_read_len, "read",
+		  "total len %u, expected len %u\n", total_read_len,
+		  expected_read_len))
+		goto free_buf;
+
+	if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed",
+		  "expected 1 actual %d\n", skel->bss->map1_accessed))
+		goto free_buf;
+
+	if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed",
+		  "expected 2 actual %d\n", skel->bss->map2_accessed))
+		goto free_buf;
+
+	CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2,
+	      "map2_seqnum", "two different seqnum %lld %lld\n",
+	      skel->bss->map2_seqnum1, skel->bss->map2_seqnum2);
+
+free_buf:
+	free(buf);
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__disconnect(link);
+	bpf_link__destroy(link);
+free_map2:
+	close(map2_fd);
+free_map1:
+	close(map1_fd);
+out:
+	bpf_iter_test_kern4__destroy(skel);
+}
+
+void test_bpf_iter(void)
+{
+	if (test__start_subtest("btf_id_or_null"))
+		test_btf_id_or_null();
+	if (test__start_subtest("ipv6_route"))
+		test_ipv6_route();
+	if (test__start_subtest("netlink"))
+		test_netlink();
+	if (test__start_subtest("bpf_map"))
+		test_bpf_map();
+	if (test__start_subtest("task"))
+		test_task();
+	if (test__start_subtest("task_file"))
+		test_task_file();
+	if (test__start_subtest("anon"))
+		test_anon_iter(false);
+	if (test__start_subtest("anon-read-one-char"))
+		test_anon_iter(true);
+	if (test__start_subtest("file"))
+		test_file_iter();
+	if (test__start_subtest("overflow"))
+		test_overflow(false);
+	if (test__start_subtest("overflow-e2big"))
+		test_overflow(true);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
new file mode 100644
index 000000000000..c71a7c283108
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define START_CHAR 'a'
+#include "bpf_iter_test_kern_common.h"
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
new file mode 100644
index 000000000000..8bdc8dc07444
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define START_CHAR 'A'
+#include "bpf_iter_test_kern_common.h"
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
new file mode 100644
index 000000000000..636a00fa074d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	int tgid;
+
+	tgid = task->tgid;
+	bpf_seq_write(seq, &tgid, sizeof(tgid));
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
new file mode 100644
index 000000000000..91e0088d3e36
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u32 map1_id = 0, map2_id = 0;
+__u32 map1_accessed = 0, map2_accessed = 0;
+__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0;
+
+static volatile const __u32 print_len;
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct bpf_map *map = ctx->map;
+	__u64 seq_num;
+	int i;
+
+	if (map == (void *)0)
+		return 0;
+
+	/* only dump map1_id and map2_id */
+	if (map->id != map1_id && map->id != map2_id)
+		return 0;
+
+	seq_num = ctx->meta->seq_num;
+	if (map->id == map1_id) {
+		map1_seqnum = seq_num;
+		map1_accessed++;
+	}
+
+	if (map->id == map2_id) {
+		if (map2_accessed == 0)
+			map2_seqnum1 = seq_num;
+		else
+			map2_seqnum2 = seq_num;
+		map2_accessed++;
+	}
+
+	/* fill seq_file buffer */
+	for (i = 0; i < print_len; i++)
+		bpf_seq_write(seq, &seq_num, sizeof(seq_num));
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
new file mode 100644
index 000000000000..bdd51cf14b54
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+int count = 0;
+
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	char c;
+
+	if (count < 4) {
+		c = START_CHAR + count;
+		bpf_seq_write(seq, &c, sizeof(c));
+		count++;
+	}
+
+	return 0;
+}
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-04  6:25 ` [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
@ 2020-05-05 19:56   ` Andrii Nakryiko
  2020-05-05 19:57     ` Alexei Starovoitov
  2020-05-05 20:25     ` Yonghong Song
  0 siblings, 2 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 19:56 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> bpf iterator uses seq_file to provide a lossless
> way to transfer data to user space. But we want to call
> bpf program after all objects have been traversed, and
> bpf program may write additional data to the
> seq_file buffer. The current seq_read() does not work
> for this use case.
>
> Besides allowing stop() function to write to the buffer,
> the bpf_seq_read() also fixed the buffer size to one page.
> If any single call of show() or stop() will emit data
> more than one page to cause overflow, -E2BIG error code
> will be returned to user space.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  kernel/bpf/bpf_iter.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 128 insertions(+)
>
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 05ae04ac1eca..2674c9cbc3dc 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -26,6 +26,134 @@ static DEFINE_MUTEX(targets_mutex);
>  /* protect bpf_iter_link changes */
>  static DEFINE_MUTEX(link_mutex);
>
> +/* bpf_seq_read, a customized and simpler version for bpf iterator.
> + * no_llseek is assumed for this file.
> + * The following are differences from seq_read():
> + *  . fixed buffer size (PAGE_SIZE)
> + *  . assuming no_llseek
> + *  . stop() may call bpf program, handling potential overflow there
> + */
> +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
> +                           loff_t *ppos)
> +{
> +       struct seq_file *seq = file->private_data;
> +       size_t n, offs, copied = 0;
> +       int err = 0;
> +       void *p;
> +
> +       mutex_lock(&seq->lock);
> +
> +       if (!seq->buf) {
> +               seq->size = PAGE_SIZE;
> +               seq->buf = kmalloc(seq->size, GFP_KERNEL);
> +               if (!seq->buf)
> +                       goto Enomem;

Why not just mutex_unlock and exit with -ENOMEM? Less goto'ing, more
straightforward.

> +       }
> +
> +       if (seq->count) {
> +               n = min(seq->count, size);
> +               err = copy_to_user(buf, seq->buf + seq->from, n);
> +               if (err)
> +                       goto Efault;
> +               seq->count -= n;
> +               seq->from += n;
> +               copied = n;
> +               goto Done;
> +       }
> +
> +       seq->from = 0;
> +       p = seq->op->start(seq, &seq->index);
> +       if (!p || IS_ERR(p))

IS_ERR_OR_NULL?

> +               goto Stop;
> +
> +       err = seq->op->show(seq, p);
> +       if (seq_has_overflowed(seq)) {
> +               err = -E2BIG;
> +               goto Error_show;
> +       } else if (err) {
> +               /* < 0: go out, > 0: skip */
> +               if (likely(err < 0))
> +                       goto Error_show;
> +               seq->count = 0;
> +       }

This seems a bit more straightforward:

if (seq_has_overflowed(seq))
    err = -E2BIG;
if (err < 0)
    goto Error_show;
else if (err > 0)
    seq->count = 0;

Also, I wonder if err > 0 (so skip was requested), should we ignore
overflow? So something like:

if (err > 0) {
    seq->count = 0;
} else {
    if (seq_has_overflowed(seq))
        err = -E2BIG;
    if (err)
        goto Error_show;
}

> +
> +       while (1) {
> +               loff_t pos = seq->index;
> +
> +               offs = seq->count;
> +               p = seq->op->next(seq, p, &seq->index);
> +               if (pos == seq->index) {
> +                       pr_info_ratelimited("buggy seq_file .next function %ps "
> +                               "did not updated position index\n",
> +                               seq->op->next);
> +                       seq->index++;
> +               }
> +
> +               if (!p || IS_ERR(p)) {

Same, IS_ERR_OR_NULL.

> +                       err = PTR_ERR(p);
> +                       break;
> +               }
> +               if (seq->count >= size)
> +                       break;
> +
> +               err = seq->op->show(seq, p);
> +               if (seq_has_overflowed(seq)) {
> +                       if (offs == 0) {
> +                               err = -E2BIG;
> +                               goto Error_show;
> +                       }
> +                       seq->count = offs;
> +                       break;
> +               } else if (err) {
> +                       /* < 0: go out, > 0: skip */
> +                       seq->count = offs;
> +                       if (likely(err < 0)) {
> +                               if (offs == 0)
> +                                       goto Error_show;
> +                               break;
> +                       }
> +               }

Same question here about ignoring overflow if skip was requested.

> +       }
> +Stop:
> +       offs = seq->count;
> +       /* may call bpf program */
> +       seq->op->stop(seq, p);
> +       if (seq_has_overflowed(seq)) {
> +               if (offs == 0)
> +                       goto Error_stop;
> +               seq->count = offs;

just want to double-check, because it's not clear from the code. If
all the start()/show()/next() succeeded, but stop() overflown. Would
stop() be called again on subsequent read? Would start/show/next
handle this correctly as well?

> +       }
> +
> +       n = min(seq->count, size);
> +       err = copy_to_user(buf, seq->buf, n);
> +       if (err)
> +               goto Efault;
> +       copied = n;
> +       seq->count -= n;
> +       seq->from = n;
> +Done:
> +       if (!copied)
> +               copied = err;
> +       else
> +               *ppos += copied;
> +       mutex_unlock(&seq->lock);
> +       return copied;
> +
> +Error_show:
> +       seq->op->stop(seq, p);
> +Error_stop:
> +       seq->count = 0;
> +       goto Done;
> +
> +Enomem:
> +       err = -ENOMEM;
> +       goto Done;
> +
> +Efault:
> +       err = -EFAULT;
> +       goto Done;

Enomem and Efault seem completely redundant and just add goto
complexity to this algorithm. Let's just inline `err =
-E(NOMEM|FAULT); goto Done;` instead?

> +}
> +
>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  {
>         struct bpf_iter_target_info *tinfo;
> --
> 2.24.1
>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-05 19:56   ` Andrii Nakryiko
@ 2020-05-05 19:57     ` Alexei Starovoitov
  2020-05-05 20:25     ` Yonghong Song
  1 sibling, 0 replies; 62+ messages in thread
From: Alexei Starovoitov @ 2020-05-05 19:57 UTC (permalink / raw)
  To: Andrii Nakryiko, Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Daniel Borkmann, Kernel Team

On 5/5/20 12:56 PM, Andrii Nakryiko wrote:
>> +               seq->buf = kmalloc(seq->size, GFP_KERNEL);
>> +               if (!seq->buf)
>> +                       goto Enomem;
> Why not just mutex_unlock and exit with -ENOMEM? Less goto'ing, more
> straightforward.
> 

no. please keep kernel coding style. goto is appropriate here.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 06/20] bpf: create anonymous bpf iterator
  2020-05-04  6:25 ` [PATCH bpf-next v2 06/20] bpf: create anonymous " Yonghong Song
@ 2020-05-05 20:11   ` Andrii Nakryiko
  2020-05-05 20:28     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 20:11 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:29 PM Yonghong Song <yhs@fb.com> wrote:
>
> A new bpf command BPF_ITER_CREATE is added.
>
> The anonymous bpf iterator is seq_file based.
> The seq_file private data are referenced by targets.
> The bpf_iter infrastructure allocated additional space
> at seq_file->private before the space used by targets
> to store some meta data, e.g.,
>   prog:       prog to run
>   session_id: an unique id for each opened seq_file
>   seq_num:    how many times bpf programs are queried in this session
>   do_stop:    an internal state to decide whether bpf program
>               should be called in seq_ops->stop() or not
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |   1 +
>  include/uapi/linux/bpf.h       |   6 ++
>  kernel/bpf/bpf_iter.c          | 128 +++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c           |  26 +++++++
>  tools/include/uapi/linux/bpf.h |   6 ++
>  5 files changed, 167 insertions(+)
>

[...]

>  /* The description below is an attempt at providing documentation to eBPF
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 2674c9cbc3dc..2a9f939be6e6 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -2,6 +2,7 @@
>  /* Copyright (c) 2020 Facebook */
>
>  #include <linux/fs.h>
> +#include <linux/anon_inodes.h>
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>
> @@ -20,12 +21,26 @@ struct bpf_iter_link {
>         struct bpf_iter_target_info *tinfo;
>  };
>
> +struct bpf_iter_priv_data {
> +       struct {

nit: anon struct seems unnecessary here? is it just for visual grouping?

> +               struct bpf_iter_target_info *tinfo;
> +               struct bpf_prog *prog;
> +               u64 session_id;
> +               u64 seq_num;
> +               u64 do_stop;
> +       };
> +       u8 target_private[] __aligned(8);
> +};
> +
>  static struct list_head targets = LIST_HEAD_INIT(targets);
>  static DEFINE_MUTEX(targets_mutex);
>
>  /* protect bpf_iter_link changes */
>  static DEFINE_MUTEX(link_mutex);
>
> +/* incremented on every opened seq_file */
> +static atomic64_t session_id;
> +
>  /* bpf_seq_read, a customized and simpler version for bpf iterator.
>   * no_llseek is assumed for this file.
>   * The following are differences from seq_read():
> @@ -154,6 +169,31 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>         goto Done;
>  }
>
> +static int iter_release(struct inode *inode, struct file *file)
> +{
> +       struct bpf_iter_priv_data *iter_priv;
> +       void *file_priv = file->private_data;
> +       struct seq_file *seq;
> +
> +       seq = file_priv;


seq might be NULL, if anon_inode_getfile succeeded, but then
prepare_seq_file failed, so you need to handle that.

Also, file_priv is redundant, assign to seq directly from file->private_data?

> +       iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
> +                                target_private);
> +
> +       if (iter_priv->tinfo->fini_seq_private)
> +               iter_priv->tinfo->fini_seq_private(seq->private);
> +
> +       bpf_prog_put(iter_priv->prog);
> +       seq->private = iter_priv;
> +
> +       return seq_release_private(inode, file);
> +}
> +
> +static const struct file_operations bpf_iter_fops = {
> +       .llseek         = no_llseek,
> +       .read           = bpf_seq_read,
> +       .release        = iter_release,
> +};
> +
>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  {
>         struct bpf_iter_target_info *tinfo;
> @@ -289,3 +329,91 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>
>         return bpf_link_settle(&link_primer);
>  }
> +
> +static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
> +                         struct bpf_iter_target_info *tinfo,
> +                         struct bpf_prog *prog)
> +{
> +       priv_data->tinfo = tinfo;
> +       priv_data->prog = prog;
> +       priv_data->session_id = atomic64_add_return(1, &session_id);

nit: atomic64_inc_return?

> +       priv_data->seq_num = 0;
> +       priv_data->do_stop = 0;
> +}
> +

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 07/20] bpf: create file bpf iterator
  2020-05-04  6:25 ` [PATCH bpf-next v2 07/20] bpf: create file " Yonghong Song
@ 2020-05-05 20:15   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 20:15 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> To produce a file bpf iterator, the fd must be
> corresponding to a link_fd assocciated with a
> trace/iter program. When the pinned file is
> opened, a seq_file will be generated.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM.

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  include/linux/bpf.h   |  2 ++
>  kernel/bpf/bpf_iter.c | 17 ++++++++++++++++-
>  kernel/bpf/inode.c    |  5 ++++-
>  3 files changed, 22 insertions(+), 2 deletions(-)
>

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-05 19:56   ` Andrii Nakryiko
  2020-05-05 19:57     ` Alexei Starovoitov
@ 2020-05-05 20:25     ` Yonghong Song
  2020-05-05 21:08       ` Andrii Nakryiko
  1 sibling, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-05 20:25 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 12:56 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> bpf iterator uses seq_file to provide a lossless
>> way to transfer data to user space. But we want to call
>> bpf program after all objects have been traversed, and
>> bpf program may write additional data to the
>> seq_file buffer. The current seq_read() does not work
>> for this use case.
>>
>> Besides allowing stop() function to write to the buffer,
>> the bpf_seq_read() also fixed the buffer size to one page.
>> If any single call of show() or stop() will emit data
>> more than one page to cause overflow, -E2BIG error code
>> will be returned to user space.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   kernel/bpf/bpf_iter.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 128 insertions(+)
>>
>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
>> index 05ae04ac1eca..2674c9cbc3dc 100644
>> --- a/kernel/bpf/bpf_iter.c
>> +++ b/kernel/bpf/bpf_iter.c
>> @@ -26,6 +26,134 @@ static DEFINE_MUTEX(targets_mutex);
>>   /* protect bpf_iter_link changes */
>>   static DEFINE_MUTEX(link_mutex);
>>
>> +/* bpf_seq_read, a customized and simpler version for bpf iterator.
>> + * no_llseek is assumed for this file.
>> + * The following are differences from seq_read():
>> + *  . fixed buffer size (PAGE_SIZE)
>> + *  . assuming no_llseek
>> + *  . stop() may call bpf program, handling potential overflow there
>> + */
>> +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>> +                           loff_t *ppos)
>> +{
>> +       struct seq_file *seq = file->private_data;
>> +       size_t n, offs, copied = 0;
>> +       int err = 0;
>> +       void *p;
>> +
>> +       mutex_lock(&seq->lock);
>> +
>> +       if (!seq->buf) {
>> +               seq->size = PAGE_SIZE;
>> +               seq->buf = kmalloc(seq->size, GFP_KERNEL);
>> +               if (!seq->buf)
>> +                       goto Enomem;
> 
> Why not just mutex_unlock and exit with -ENOMEM? Less goto'ing, more
> straightforward.
> 
>> +       }
>> +
>> +       if (seq->count) {
>> +               n = min(seq->count, size);
>> +               err = copy_to_user(buf, seq->buf + seq->from, n);
>> +               if (err)
>> +                       goto Efault;
>> +               seq->count -= n;
>> +               seq->from += n;
>> +               copied = n;
>> +               goto Done;
>> +       }
>> +
>> +       seq->from = 0;
>> +       p = seq->op->start(seq, &seq->index);
>> +       if (!p || IS_ERR(p))
> 
> IS_ERR_OR_NULL?

Ack.

> 
>> +               goto Stop;
>> +
>> +       err = seq->op->show(seq, p);
>> +       if (seq_has_overflowed(seq)) {
>> +               err = -E2BIG;
>> +               goto Error_show;
>> +       } else if (err) {
>> +               /* < 0: go out, > 0: skip */
>> +               if (likely(err < 0))
>> +                       goto Error_show;
>> +               seq->count = 0;
>> +       }
> 
> This seems a bit more straightforward:
> 
> if (seq_has_overflowed(seq))
>      err = -E2BIG;
> if (err < 0)
>      goto Error_show;
> else if (err > 0)
>      seq->count = 0;
> 
> Also, I wonder if err > 0 (so skip was requested), should we ignore
> overflow? So something like:

Thinking about the overflow vs. err > 0 case, I double-checked the
seq_file() implementation again; yes, it is skipped. So your suggestion
below looks reasonable.

> 
> if (err > 0) {
>      seq->count = 0;
> } else {
>      if (seq_has_overflowed(seq))
>          err = -E2BIG;
>      if (err)
>          goto Error_show;
> }
> 
>> +
>> +       while (1) {
>> +               loff_t pos = seq->index;
>> +
>> +               offs = seq->count;
>> +               p = seq->op->next(seq, p, &seq->index);
>> +               if (pos == seq->index) {
>> +                       pr_info_ratelimited("buggy seq_file .next function %ps "
>> +                               "did not updated position index\n",
>> +                               seq->op->next);
>> +                       seq->index++;
>> +               }
>> +
>> +               if (!p || IS_ERR(p)) {
> 
> Same, IS_ERR_OR_NULL.

Ack.

> 
>> +                       err = PTR_ERR(p);
>> +                       break;
>> +               }
>> +               if (seq->count >= size)
>> +                       break;
>> +
>> +               err = seq->op->show(seq, p);
>> +               if (seq_has_overflowed(seq)) {
>> +                       if (offs == 0) {
>> +                               err = -E2BIG;
>> +                               goto Error_show;
>> +                       }
>> +                       seq->count = offs;
>> +                       break;
>> +               } else if (err) {
>> +                       /* < 0: go out, > 0: skip */
>> +                       seq->count = offs;
>> +                       if (likely(err < 0)) {
>> +                               if (offs == 0)
>> +                                       goto Error_show;
>> +                               break;
>> +                       }
>> +               }
> 
> Same question here about ignoring overflow if skip was requested.

Yes, we should prioritize err > 0 over overflow.

> 
>> +       }
>> +Stop:
>> +       offs = seq->count;
>> +       /* may call bpf program */
>> +       seq->op->stop(seq, p);
>> +       if (seq_has_overflowed(seq)) {
>> +               if (offs == 0)
>> +                       goto Error_stop;
>> +               seq->count = offs;
> 
> just want to double-check, because it's not clear from the code. If
> all the start()/show()/next() succeeded, but stop() overflown. Would
> stop() be called again on subsequent read? Would start/show/next
> handle this correctly as well?

I am supposed to handle this unless there is a bug...
The idea is:
    - if start()/show()/next() are fine and stop() overflows,
      we will skip stop()'s output and move on.
      (If the overflow happens at the very beginning of the
       buffer, i.e., offs == 0, we will return -E2BIG. Otherwise,
       we will return 0 here, and the user read() may just exit.)
    - next time, when read() is called again, start() will return
      NULL (since the previous next() returned NULL) and control
      will jump to stop(), which will try to do another dump().
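
As a sketch of that second read() (illustrative only, labels as in
the patch above):

	/* read() #2, after stop() overflowed during read() #1 */
	seq->from = 0;
	p = seq->op->start(seq, &seq->index);	/* iteration exhausted: NULL */
	if (IS_ERR_OR_NULL(p))
		goto Stop;	/* stop() now gets the whole empty
				 * PAGE_SIZE buffer to dump into */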

> 
>> +       }
>> +
>> +       n = min(seq->count, size);
>> +       err = copy_to_user(buf, seq->buf, n);
>> +       if (err)
>> +               goto Efault;
>> +       copied = n;
>> +       seq->count -= n;
>> +       seq->from = n;
>> +Done:
>> +       if (!copied)
>> +               copied = err;
>> +       else
>> +               *ppos += copied;
>> +       mutex_unlock(&seq->lock);
>> +       return copied;
>> +
>> +Error_show:
>> +       seq->op->stop(seq, p);
>> +Error_stop:
>> +       seq->count = 0;
>> +       goto Done;
>> +
>> +Enomem:
>> +       err = -ENOMEM;
>> +       goto Done;
>> +
>> +Efault:
>> +       err = -EFAULT;
>> +       goto Done;
> 
> Enomem and Efault seem completely redundant and just add goto
> complexity to this algorithm. Let's just inline `err =
> -E(NOMEM|FAULT); goto Done;` instead?

We can do this. This goto style comes from the original seq_read()
implementation. Agreed that we do not need to follow it.
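
Concretely, the inlined version would look something like this (a
sketch of the suggestion, not the final patch):

	if (!seq->buf) {
		seq->size = PAGE_SIZE;
		seq->buf = kmalloc(seq->size, GFP_KERNEL);
		if (!seq->buf) {
			err = -ENOMEM;	/* no Enomem label needed */
			goto Done;
		}
	}
	/* ... */
	if (copy_to_user(buf, seq->buf + seq->from, n)) {
		err = -EFAULT;		/* no Efault label needed */
		goto Done;
	}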

> 
>> +}
>> +
>>   int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>>   {
>>          struct bpf_iter_target_info *tinfo;
>> --
>> 2.24.1
>>


* Re: [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators
  2020-05-04  6:25 ` [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators Yonghong Song
@ 2020-05-05 20:25   ` Andrii Nakryiko
  2020-05-05 20:30     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 20:25 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>
> Macro DEFINE_BPF_ITER_FUNC is implemented so target
> can define an init function to capture the BTF type
> which represents the target.
>
> The bpf_iter_meta is a structure holding meta data, common
> to all targets in the bpf program.
>
> Additional marker functions are called before/after
> bpf_seq_read() show() and stop() callback functions
> to help calculate precise seq_num and whether to call bpf_prog
> inside stop().
>
> Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
> are implemented so target can get needed information from
> bpf_iter infrastructure and can run the program.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h   | 11 +++++
>  kernel/bpf/bpf_iter.c | 94 ++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 100 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 26daf85cba10..70c71c3cd9e8 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1129,6 +1129,9 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>  int bpf_obj_get_user(const char __user *pathname, int flags);
>
>  #define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
> +#define DEFINE_BPF_ITER_FUNC(target, args...)                  \
> +       extern int __bpf_iter__ ## target(args);                \
> +       int __init __bpf_iter__ ## target(args) { return 0; }

Why is extern declaration needed here? Doesn't the same macro define
global function itself? I'm probably missing some C semantics thingy,
sorry...

>
>  typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
>  typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
> @@ -1141,11 +1144,19 @@ struct bpf_iter_reg {
>         u32 seq_priv_size;
>  };
>
> +struct bpf_iter_meta {
> +       __bpf_md_ptr(struct seq_file *, seq);
> +       u64 session_id;
> +       u64 seq_num;
> +};
> +

[...]

>  /* bpf_seq_read, a customized and simpler version for bpf iterator.
>   * no_llseek is assumed for this file.
>   * The following are differences from seq_read():
> @@ -83,12 +119,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>         if (!p || IS_ERR(p))
>                 goto Stop;
>
> +       bpf_iter_inc_seq_num(seq);

so seq_num is one-based, not zero-based? So on first show() call it
will be set to 1, not 0, right?

>         err = seq->op->show(seq, p);
>         if (seq_has_overflowed(seq)) {
> +               bpf_iter_dec_seq_num(seq);
>                 err = -E2BIG;
>                 goto Error_show;
>         } else if (err) {
>                 /* < 0: go out, > 0: skip */
> +               bpf_iter_dec_seq_num(seq);
>                 if (likely(err < 0))
>                         goto Error_show;
>                 seq->count = 0;

[...]


* Re: [PATCH bpf-next v2 12/20] bpf: add PTR_TO_BTF_ID_OR_NULL support
  2020-05-04  6:26 ` [PATCH bpf-next v2 12/20] bpf: add PTR_TO_BTF_ID_OR_NULL support Yonghong Song
@ 2020-05-05 20:27   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 20:27 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support.
> For tracing/iter program, the bpf program context
> definition, e.g., for previous bpf_map target, looks like
>   struct bpf_iter__bpf_map {
>     struct bpf_iter_meta *meta;
>     struct bpf_map *map;
>   };
>
> The kernel guarantees that meta is not NULL, but
> the map pointer may be NULL. The NULL map indicates that all
> objects have been traversed, so bpf program can take
> proper action, e.g., do final aggregation and/or send
> final report to user space.
>
> Add btf_id_or_null_non0_off to prog->aux structure, to
> indicate that if the context access offset is not 0,
> set to PTR_TO_BTF_ID_OR_NULL instead of PTR_TO_BTF_ID.
> This bit is set for tracing/iter program.
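
For illustration, a bpf_iter program would then be able to branch on
the NULL map roughly like this (a hedged sketch; the section name and
context layout follow the commit message above):

SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
	struct bpf_map *map = ctx->map;	/* PTR_TO_BTF_ID_OR_NULL */

	if (!map)	/* all objects traversed: do final aggregation */
		return 0;

	/* past the check, the verifier treats map as PTR_TO_BTF_ID,
	 * so it may be dereferenced */
	return 0;
}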
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM.

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  include/linux/bpf.h   |  2 ++
>  kernel/bpf/btf.c      |  5 ++++-
>  kernel/bpf/verifier.c | 16 ++++++++++++----
>  3 files changed, 18 insertions(+), 5 deletions(-)
>

[...]


* Re: [PATCH bpf-next v2 06/20] bpf: create anonymous bpf iterator
  2020-05-05 20:11   ` Andrii Nakryiko
@ 2020-05-05 20:28     ` Yonghong Song
  0 siblings, 0 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-05 20:28 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 1:11 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:29 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> A new bpf command BPF_ITER_CREATE is added.
>>
>> The anonymous bpf iterator is seq_file based.
>> The seq_file private data are referenced by targets.
>> The bpf_iter infrastructure allocates additional space
>> at seq_file->private before the space used by targets
>> to store some meta data, e.g.,
>>    prog:       prog to run
>>    session_id: a unique id for each opened seq_file
>>    seq_num:    how many times bpf programs are queried in this session
>>    do_stop:    an internal state to decide whether bpf program
>>                should be called in seq_ops->stop() or not
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |   1 +
>>   include/uapi/linux/bpf.h       |   6 ++
>>   kernel/bpf/bpf_iter.c          | 128 +++++++++++++++++++++++++++++++++
>>   kernel/bpf/syscall.c           |  26 +++++++
>>   tools/include/uapi/linux/bpf.h |   6 ++
>>   5 files changed, 167 insertions(+)
>>
> 
> [...]
> 
>>   /* The description below is an attempt at providing documentation to eBPF
>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
>> index 2674c9cbc3dc..2a9f939be6e6 100644
>> --- a/kernel/bpf/bpf_iter.c
>> +++ b/kernel/bpf/bpf_iter.c
>> @@ -2,6 +2,7 @@
>>   /* Copyright (c) 2020 Facebook */
>>
>>   #include <linux/fs.h>
>> +#include <linux/anon_inodes.h>
>>   #include <linux/filter.h>
>>   #include <linux/bpf.h>
>>
>> @@ -20,12 +21,26 @@ struct bpf_iter_link {
>>          struct bpf_iter_target_info *tinfo;
>>   };
>>
>> +struct bpf_iter_priv_data {
>> +       struct {
> 
> nit: anon struct seems unnecessary here? is it just for visual grouping?

Yes, this is just for visual grouping. Not 100% sure whether this
is needed or not.

> 
>> +               struct bpf_iter_target_info *tinfo;
>> +               struct bpf_prog *prog;
>> +               u64 session_id;
>> +               u64 seq_num;
>> +               u64 do_stop;
>> +       };
>> +       u8 target_private[] __aligned(8);
>> +};
>> +
>>   static struct list_head targets = LIST_HEAD_INIT(targets);
>>   static DEFINE_MUTEX(targets_mutex);
>>
>>   /* protect bpf_iter_link changes */
>>   static DEFINE_MUTEX(link_mutex);
>>
>> +/* incremented on every opened seq_file */
>> +static atomic64_t session_id;
>> +
>>   /* bpf_seq_read, a customized and simpler version for bpf iterator.
>>    * no_llseek is assumed for this file.
>>    * The following are differences from seq_read():
>> @@ -154,6 +169,31 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>>          goto Done;
>>   }
>>
>> +static int iter_release(struct inode *inode, struct file *file)
>> +{
>> +       struct bpf_iter_priv_data *iter_priv;
>> +       void *file_priv = file->private_data;
>> +       struct seq_file *seq;
>> +
>> +       seq = file_priv;
> 
> 
> seq might be NULL, if anon_inode_getfile succeeded, but then
> prepare_seq_file failed, so you need to handle that.

Thanks for catching this. Missed this case.
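
A minimal sketch of the fix (assuming file->private_data is left NULL
when prepare_seq_file() fails):

static int iter_release(struct inode *inode, struct file *file)
{
	struct bpf_iter_priv_data *iter_priv;
	struct seq_file *seq = file->private_data;

	if (!seq)	/* anon_inode_getfile() succeeded, but
			 * prepare_seq_file() failed afterwards */
		return 0;

	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);
	/* ... fini_seq_private()/bpf_prog_put() as in the patch above ... */
	return seq_release_private(inode, file);
}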

> 
> Also, file_priv is redundant, assign to seq directly from file->private_data?

Ack.

> 
>> +       iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
>> +                                target_private);
>> +
>> +       if (iter_priv->tinfo->fini_seq_private)
>> +               iter_priv->tinfo->fini_seq_private(seq->private);
>> +
>> +       bpf_prog_put(iter_priv->prog);
>> +       seq->private = iter_priv;
>> +
>> +       return seq_release_private(inode, file);
>> +}
>> +
>> +static const struct file_operations bpf_iter_fops = {
>> +       .llseek         = no_llseek,
>> +       .read           = bpf_seq_read,
>> +       .release        = iter_release,
>> +};
>> +
>>   int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>>   {
>>          struct bpf_iter_target_info *tinfo;
>> @@ -289,3 +329,91 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>>
>>          return bpf_link_settle(&link_primer);
>>   }
>> +
>> +static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
>> +                         struct bpf_iter_target_info *tinfo,
>> +                         struct bpf_prog *prog)
>> +{
>> +       priv_data->tinfo = tinfo;
>> +       priv_data->prog = prog;
>> +       priv_data->session_id = atomic64_add_return(1, &session_id);
> 
> nit: atomic64_inc_return?

Ack.

> 
>> +       priv_data->seq_num = 0;
>> +       priv_data->do_stop = 0;
>> +}
>> +
> 
> [...]
> 


* Re: [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators
  2020-05-05 20:25   ` Andrii Nakryiko
@ 2020-05-05 20:30     ` Yonghong Song
  2020-05-05 21:10       ` Andrii Nakryiko
  0 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-05 20:30 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 1:25 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Macro DEFINE_BPF_ITER_FUNC is implemented so target
>> can define an init function to capture the BTF type
>> which represents the target.
>>
>> The bpf_iter_meta is a structure holding meta data, common
>> to all targets in the bpf program.
>>
>> Additional marker functions are called before/after
>> bpf_seq_read() show() and stop() callback functions
>> to help calculate precise seq_num and whether to call bpf_prog
>> inside stop().
>>
>> Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
>> are implemented so target can get needed information from
>> bpf_iter infrastructure and can run the program.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h   | 11 +++++
>>   kernel/bpf/bpf_iter.c | 94 ++++++++++++++++++++++++++++++++++++++++---
>>   2 files changed, 100 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 26daf85cba10..70c71c3cd9e8 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1129,6 +1129,9 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>>   int bpf_obj_get_user(const char __user *pathname, int flags);
>>
>>   #define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
>> +#define DEFINE_BPF_ITER_FUNC(target, args...)                  \
>> +       extern int __bpf_iter__ ## target(args);                \
>> +       int __init __bpf_iter__ ## target(args) { return 0; }
> 
> Why is extern declaration needed here? Doesn't the same macro define

It silences a sparse warning. Apparently, in the kernel, any global
function is expected to have a declaration.
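
For example, with the bpf_map target, DEFINE_BPF_ITER_FUNC(bpf_map,
struct bpf_iter_meta *meta, struct bpf_map *map) expands to roughly
(written out by hand):

/* the extern declaration keeps sparse quiet about a global
 * function with no previous prototype */
extern int __bpf_iter__bpf_map(struct bpf_iter_meta *meta,
			       struct bpf_map *map);
/* dummy definition; only its BTF type matters */
int __init __bpf_iter__bpf_map(struct bpf_iter_meta *meta,
			       struct bpf_map *map) { return 0; }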

> global function itself? I'm probably missing some C semantics thingy,
> sorry...
> 
>>
>>   typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
>>   typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
>> @@ -1141,11 +1144,19 @@ struct bpf_iter_reg {
>>          u32 seq_priv_size;
>>   };
>>
>> +struct bpf_iter_meta {
>> +       __bpf_md_ptr(struct seq_file *, seq);
>> +       u64 session_id;
>> +       u64 seq_num;
>> +};
>> +
> 
> [...]
> 
>>   /* bpf_seq_read, a customized and simpler version for bpf iterator.
>>    * no_llseek is assumed for this file.
>>    * The following are differences from seq_read():
>> @@ -83,12 +119,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>>          if (!p || IS_ERR(p))
>>                  goto Stop;
>>
>> +       bpf_iter_inc_seq_num(seq);
> 
> so seq_num is one-based, not zero-based? So on first show() call it
> will be set to 1, not 0, right?

It is 1-based; we need to document this clearly. I forgot to adjust my
bpf programs for this. Will adjust them properly in the next revision.
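
That is, the ordering in bpf_seq_read() is (a condensed sketch using
the marker helper from this patch):

	bpf_iter_inc_seq_num(seq);	/* seq_num: 0 -> 1 before first show() */
	err = seq->op->show(seq, p);	/* the bpf prog observes seq_num == 1 */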
> 
>>          err = seq->op->show(seq, p);
>>          if (seq_has_overflowed(seq)) {
>> +               bpf_iter_dec_seq_num(seq);
>>                  err = -E2BIG;
>>                  goto Error_show;
>>          } else if (err) {
>>                  /* < 0: go out, > 0: skip */
>> +               bpf_iter_dec_seq_num(seq);
>>                  if (likely(err < 0))
>>                          goto Error_show;
>>                  seq->count = 0;
> 
> [...]
> 


* Re: [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-05 20:25     ` Yonghong Song
@ 2020-05-05 21:08       ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 21:08 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Tue, May 5, 2020 at 1:25 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/5/20 12:56 PM, Andrii Nakryiko wrote:
> > On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> bpf iterator uses seq_file to provide a lossless
> >> way to transfer data to user space. But we want to call
> >> bpf program after all objects have been traversed, and
> >> bpf program may write additional data to the
> >> seq_file buffer. The current seq_read() does not work
> >> for this use case.
> >>
> >> Besides allowing stop() function to write to the buffer,
> >> the bpf_seq_read() also fixed the buffer size to one page.
> >> If any single call of show() or stop() will emit data
> >> more than one page to cause overflow, -E2BIG error code
> >> will be returned to user space.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   kernel/bpf/bpf_iter.c | 128 ++++++++++++++++++++++++++++++++++++++++++
> >>   1 file changed, 128 insertions(+)
> >>
> >> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> >> index 05ae04ac1eca..2674c9cbc3dc 100644
> >> --- a/kernel/bpf/bpf_iter.c
> >> +++ b/kernel/bpf/bpf_iter.c
> >> @@ -26,6 +26,134 @@ static DEFINE_MUTEX(targets_mutex);
> >>   /* protect bpf_iter_link changes */
> >>   static DEFINE_MUTEX(link_mutex);
> >>
> >> +/* bpf_seq_read, a customized and simpler version for bpf iterator.
> >> + * no_llseek is assumed for this file.
> >> + * The following are differences from seq_read():
> >> + *  . fixed buffer size (PAGE_SIZE)
> >> + *  . assuming no_llseek
> >> + *  . stop() may call bpf program, handling potential overflow there
> >> + */
> >> +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
> >> +                           loff_t *ppos)
> >> +{
> >> +       struct seq_file *seq = file->private_data;
> >> +       size_t n, offs, copied = 0;
> >> +       int err = 0;
> >> +       void *p;
> >> +
> >> +       mutex_lock(&seq->lock);
> >> +
> >> +       if (!seq->buf) {
> >> +               seq->size = PAGE_SIZE;
> >> +               seq->buf = kmalloc(seq->size, GFP_KERNEL);
> >> +               if (!seq->buf)
> >> +                       goto Enomem;
> >
> > Why not just mutex_unlock and exit with -ENOMEM? Less goto'ing, more
> > straightforward.
> >
> >> +       }
> >> +
> >> +       if (seq->count) {
> >> +               n = min(seq->count, size);
> >> +               err = copy_to_user(buf, seq->buf + seq->from, n);
> >> +               if (err)
> >> +                       goto Efault;
> >> +               seq->count -= n;
> >> +               seq->from += n;
> >> +               copied = n;
> >> +               goto Done;
> >> +       }
> >> +
> >> +       seq->from = 0;
> >> +       p = seq->op->start(seq, &seq->index);
> >> +       if (!p || IS_ERR(p))
> >
> > IS_ERR_OR_NULL?
>
> Ack.
>
> >
> >> +               goto Stop;
> >> +
> >> +       err = seq->op->show(seq, p);
> >> +       if (seq_has_overflowed(seq)) {
> >> +               err = -E2BIG;
> >> +               goto Error_show;
> >> +       } else if (err) {
> >> +               /* < 0: go out, > 0: skip */
> >> +               if (likely(err < 0))
> >> +                       goto Error_show;
> >> +               seq->count = 0;
> >> +       }
> >
> > This seems a bit more straightforward:
> >
> > if (seq_has_overflowed(seq))
> >      err = -E2BIG;
> > if (err < 0)
> >      goto Error_show;
> > else if (err > 0)
> >      seq->count = 0;
> >
> > Also, I wonder if err > 0 (so skip was requested), should we ignore
> > overflow? So something like:
>
> Thinking about the overflow vs. err > 0 case, I double-checked the
> seq_file() implementation again, and yes, the overflow is skipped in
> that case. So your suggestion below looks reasonable.
>
> >
> > if (err > 0) {
> >      seq->count = 0;
> > } else {
> >      if (seq_has_overflowed(seq))
> >          err = -E2BIG;
> >      if (err)
> >          goto Error_show;
> > }
> >
> >> +
> >> +       while (1) {
> >> +               loff_t pos = seq->index;
> >> +
> >> +               offs = seq->count;
> >> +               p = seq->op->next(seq, p, &seq->index);
> >> +               if (pos == seq->index) {
> >> +                       pr_info_ratelimited("buggy seq_file .next function %ps "
> >> +                               "did not updated position index\n",
> >> +                               seq->op->next);
> >> +                       seq->index++;
> >> +               }
> >> +
> >> +               if (!p || IS_ERR(p)) {
> >
> > Same, IS_ERR_OR_NULL.
>
> Ack.
>
> >
> >> +                       err = PTR_ERR(p);
> >> +                       break;
> >> +               }
> >> +               if (seq->count >= size)
> >> +                       break;
> >> +
> >> +               err = seq->op->show(seq, p);
> >> +               if (seq_has_overflowed(seq)) {
> >> +                       if (offs == 0) {
> >> +                               err = -E2BIG;
> >> +                               goto Error_show;
> >> +                       }
> >> +                       seq->count = offs;
> >> +                       break;
> >> +               } else if (err) {
> >> +                       /* < 0: go out, > 0: skip */
> >> +                       seq->count = offs;
> >> +                       if (likely(err < 0)) {
> >> +                               if (offs == 0)
> >> +                                       goto Error_show;
> >> +                               break;
> >> +                       }
> >> +               }
> >
> > Same question here about ignoring overflow if skip was requested.
>
> Yes, we should prioritize err > 0 over overflow.
>
> >
> >> +       }
> >> +Stop:
> >> +       offs = seq->count;
> >> +       /* may call bpf program */
> >> +       seq->op->stop(seq, p);
> >> +       if (seq_has_overflowed(seq)) {
> >> +               if (offs == 0)
> >> +                       goto Error_stop;
> >> +               seq->count = offs;
> >
> > just want to double-check, because it's not clear from the code. If
> > all the start()/show()/next() succeeded, but stop() overflown. Would
> > stop() be called again on subsequent read? Would start/show/next
> > handle this correctly as well?
>
> I am supposed to handle this unless there is a bug...
> The idea is:
>     - if start()/show()/next() are fine and stop() overflows,
>       we will skip stop()'s output and move on.
>       (If the overflow happens at the very beginning of the
>        buffer, i.e., offs == 0, we will return -E2BIG. Otherwise,
>        we will return 0 here, and the user read() may just exit.)
>     - next time, when read() is called again, start() will return
>       NULL (since the previous next() returned NULL) and control
>       will jump to stop(), which will try to do another dump().
>

Right, sounds reasonable :)

> >
> >> +       }
> >> +
> >> +       n = min(seq->count, size);
> >> +       err = copy_to_user(buf, seq->buf, n);
> >> +       if (err)
> >> +               goto Efault;
> >> +       copied = n;
> >> +       seq->count -= n;
> >> +       seq->from = n;
> >> +Done:
> >> +       if (!copied)
> >> +               copied = err;
> >> +       else
> >> +               *ppos += copied;
> >> +       mutex_unlock(&seq->lock);
> >> +       return copied;
> >> +
> >> +Error_show:
> >> +       seq->op->stop(seq, p);
> >> +Error_stop:
> >> +       seq->count = 0;
> >> +       goto Done;
> >> +
> >> +Enomem:
> >> +       err = -ENOMEM;
> >> +       goto Done;
> >> +
> >> +Efault:
> >> +       err = -EFAULT;
> >> +       goto Done;
> >
> > Enomem and Efault seem completely redundant and just add goto
> > complexity to this algorithm. Let's just inline `err =
> > -E(NOMEM|FAULT); goto Done;` instead?
>
> We can do this. This goto style comes from the original seq_read()
> implementation. Agreed that we do not need to follow it.
>
> >
> >> +}
> >> +
> >>   int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
> >>   {
> >>          struct bpf_iter_target_info *tinfo;
> >> --
> >> 2.24.1
> >>


* Re: [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators
  2020-05-05 20:30     ` Yonghong Song
@ 2020-05-05 21:10       ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 21:10 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Tue, May 5, 2020 at 1:30 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/5/20 1:25 PM, Andrii Nakryiko wrote:
> > On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Macro DEFINE_BPF_ITER_FUNC is implemented so target
> >> can define an init function to capture the BTF type
> >> which represents the target.
> >>
> >> The bpf_iter_meta is a structure holding meta data, common
> >> to all targets in the bpf program.
> >>
> >> Additional marker functions are called before/after
> >> bpf_seq_read() show() and stop() callback functions
> >> to help calculate precise seq_num and whether to call bpf_prog
> >> inside stop().
> >>
> >> Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
> >> are implemented so target can get needed information from
> >> bpf_iter infrastructure and can run the program.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/linux/bpf.h   | 11 +++++
> >>   kernel/bpf/bpf_iter.c | 94 ++++++++++++++++++++++++++++++++++++++++---
> >>   2 files changed, 100 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index 26daf85cba10..70c71c3cd9e8 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -1129,6 +1129,9 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
> >>   int bpf_obj_get_user(const char __user *pathname, int flags);
> >>
> >>   #define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
> >> +#define DEFINE_BPF_ITER_FUNC(target, args...)                  \
> >> +       extern int __bpf_iter__ ## target(args);                \
> >> +       int __init __bpf_iter__ ## target(args) { return 0; }
> >
> > Why is extern declaration needed here? Doesn't the same macro define
>
> It silences a sparse warning. Apparently, in the kernel, any global
> function is expected to have a declaration.

Ah.. alright :)

>
> > global function itself? I'm probably missing some C semantics thingy,
> > sorry...
> >
> >>
> >>   typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
> >>   typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
> >> @@ -1141,11 +1144,19 @@ struct bpf_iter_reg {
> >>          u32 seq_priv_size;
> >>   };
> >>
> >> +struct bpf_iter_meta {
> >> +       __bpf_md_ptr(struct seq_file *, seq);
> >> +       u64 session_id;
> >> +       u64 seq_num;
> >> +};
> >> +
> >
> > [...]
> >
> >>   /* bpf_seq_read, a customized and simpler version for bpf iterator.
> >>    * no_llseek is assumed for this file.
> >>    * The following are differences from seq_read():
> >> @@ -83,12 +119,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
> >>          if (!p || IS_ERR(p))
> >>                  goto Stop;
> >>
> >> +       bpf_iter_inc_seq_num(seq);
> >
> > so seq_num is one-based, not zero-based? So on first show() call it
> > will be set to 1, not 0, right?
>
> It is 1-based; we need to document this clearly. I forgot to adjust my
> bpf programs for this. Will adjust them properly in the next revision.

I see. IMO, seq_num starting at 0 is more natural, but whichever way
is fine with me.

> >
> >>          err = seq->op->show(seq, p);
> >>          if (seq_has_overflowed(seq)) {
> >> +               bpf_iter_dec_seq_num(seq);
> >>                  err = -E2BIG;
> >>                  goto Error_show;
> >>          } else if (err) {
> >>                  /* < 0: go out, > 0: skip */
> >> +               bpf_iter_dec_seq_num(seq);
> >>                  if (likely(err < 0))
> >>                          goto Error_show;
> >>                  seq->count = 0;
> >
> > [...]
> >


* Re: [PATCH bpf-next v2 01/20] bpf: implement an interface to register bpf_iter targets
  2020-05-04  6:25 ` [PATCH bpf-next v2 01/20] bpf: implement an interface to register bpf_iter targets Yonghong Song
@ 2020-05-05 21:19   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 21:19 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> The target can call bpf_iter_reg_target() to register itself.
> The needed information:
>   target:           target name
>   seq_ops:          the seq_file operations for the target
>   init_seq_private  target callback to initialize seq_priv during file open
>   fini_seq_private  target callback to clean up seq_priv during file release
>   seq_priv_size:    the private_data size needed by the seq_file
>                     operations
>
> The target name represents a target which provides a seq_ops
> for iterating objects.
>
> The target can provide two callback functions, init_seq_private
> and fini_seq_private, called during file open/release time.
> For example, /proc/net/{tcp6, ipv6_route, netlink, ...}, net
> name space needs to be setup properly during file open and
> released properly during file release.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM.

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  include/linux/bpf.h   | 14 ++++++++++++++
>  kernel/bpf/Makefile   |  2 +-
>  kernel/bpf/bpf_iter.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 55 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/bpf_iter.c
>

[...]


* Re: [PATCH bpf-next v2 02/20] bpf: allow loading of a bpf_iter program
  2020-05-04  6:25 ` [PATCH bpf-next v2 02/20] bpf: allow loading of a bpf_iter program Yonghong Song
@ 2020-05-05 21:29   ` Andrii Nakryiko
  2020-05-06  0:07     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 21:29 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> A bpf_iter program is a tracing program with attach type
> BPF_TRACE_ITER. The load attribute
>   attach_btf_id
> is used by the verifier against a particular kernel function,
> which represents a target, e.g., __bpf_iter__bpf_map
> for target bpf_map which is implemented later.
>
> The program return value must be 0 or 1 for now.
>   0 : successful, except potential seq_file buffer overflow
>       which is handled by seq_file reader.
>   1 : request to restart the same object

This bit is interesting. Is the idea that if a BPF program also wants to
send something over, say, perf_buffer, but fails, it can "request"
the same execution again? I wonder if a typical libc fread() implementation
would handle EAGAIN properly; it seems more geared towards
non-blocking I/O?

On the other hand, following start/show/next logic for seq_file
iteration, requesting skipping element seems useful. It would allow
(in some cases) to "speculatively" generate output and at some point
realize that this is not an element we actually want in the output and
request to ignore that output.

Don't know how useful the latter is going to be in practice, but just
something to keep in mind for the future, I guess...

>
> In the future, other return values may be used for filtering or
> terminating the iterator.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h            |  3 +++
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/bpf_iter.c          | 30 ++++++++++++++++++++++++++++++
>  kernel/bpf/verifier.c          | 21 +++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  5 files changed, 56 insertions(+)
>

[...]


> +
> +bool bpf_iter_prog_supported(struct bpf_prog *prog)
> +{
> +       const char *attach_fname = prog->aux->attach_func_name;
> +       u32 prog_btf_id = prog->aux->attach_btf_id;
> +       const char *prefix = BPF_ITER_FUNC_PREFIX;
> +       struct bpf_iter_target_info *tinfo;
> +       int prefix_len = strlen(prefix);
> +       bool supported = false;
> +
> +       if (strncmp(attach_fname, prefix, prefix_len))
> +               return false;
> +
> +       mutex_lock(&targets_mutex);
> +       list_for_each_entry(tinfo, &targets, list) {
> +               if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
> +                       supported = true;
> +                       break;
> +               }
> +               if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
> +                       tinfo->btf_id = prog->aux->attach_btf_id;

This target_info->btf_id caching here is a bit subtle and easy to
miss; it would be nice to have a comment calling this out explicitly.
Thanks!

> +                       supported = true;
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&targets_mutex);
> +
> +       return supported;
> +}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 70ad009577f8..d725ff7d11db 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env)
>                         return 0;
>                 range = tnum_const(0);
>                 break;
> +       case BPF_PROG_TYPE_TRACING:
> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
> +                       return 0;

Commit message mentions enforcing [0, 1], shouldn't it be done here?


> +               break;
>         default:
>                 return 0;
>         }

[...]


* Re: [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-04  6:25 ` [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
@ 2020-05-05 21:30   ` Andrii Nakryiko
  2020-05-06  0:14     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 21:30 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Given a bpf program, the steps to create an anonymous bpf iterator are:
>   - create a bpf_iter_link, which combines bpf program and the target.
>     In the future, there could be more information recorded in the link.
>     A link_fd will be returned to the user space.
>   - create an anonymous bpf iterator with the given link_fd.
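
Roughly, in user-space terms (a sketch; the BPF_ITER_CREATE command
itself arrives in a later patch of this series):

	union bpf_attr attr = {};

	attr.link_create.prog_fd = prog_fd;	/* tracing/iter program */
	attr.link_create.attach_type = BPF_TRACE_ITER;
	link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
	/* read(iter_fd, ...) then drives the iteration */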
>
> The bpf_iter_link can be pinned to bpffs mount file system to
> create a file based bpf iterator as well.
>
> The benefits of using bpf_iter_link:
>   - using bpf link simplifies design and implementation as bpf link
>     is used for other tracing bpf programs.
>   - for file based bpf iterators, bpf_iter_link provides a standard
>     way to replace underlying bpf programs.
>   - for both anonymous and file based iterators, bpf link query
>     capability can be leveraged.
>
> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
> querying. Currently, only prog_id is needed, so there is no
> additional in-kernel show_fdinfo() and fill_link_info() hook
> is needed for BPF_LINK_TYPE_ITER link.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM. See small nit about __GFP_NOWARN.

Acked-by: Andrii Nakryiko <andriin@fb.com>


>  include/linux/bpf.h            |  1 +
>  include/linux/bpf_types.h      |  1 +
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c           | 14 ++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  6 files changed, 80 insertions(+)
>

[...]

> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
> +{
> +       struct bpf_link_primer link_primer;
> +       struct bpf_iter_target_info *tinfo;
> +       struct bpf_iter_link *link;
> +       bool existed = false;
> +       u32 prog_btf_id;
> +       int err;
> +
> +       if (attr->link_create.target_fd || attr->link_create.flags)
> +               return -EINVAL;
> +
> +       prog_btf_id = prog->aux->attach_btf_id;
> +       mutex_lock(&targets_mutex);
> +       list_for_each_entry(tinfo, &targets, list) {
> +               if (tinfo->btf_id == prog_btf_id) {
> +                       existed = true;
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&targets_mutex);
> +       if (!existed)
> +               return -ENOENT;
> +
> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);

nit: all existing link implementations don't specify __GFP_NOWARN;
wonder if bpf_iter_link should be special?

> +       if (!link)
> +               return -ENOMEM;
> +
> +       bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
> +       link->tinfo = tinfo;
> +
> +       err  = bpf_link_prime(&link->link, &link_primer);
> +       if (err) {
> +               kfree(link);
> +               return err;
> +       }
> +
> +       return bpf_link_settle(&link_primer);
> +}

[...]


* Re: [PATCH bpf-next v2 04/20] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE
  2020-05-04  6:25 ` [PATCH bpf-next v2 04/20] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE Yonghong Song
@ 2020-05-05 21:32   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-05 21:32 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Added BPF_LINK_UPDATE support for tracing/iter programs.
> This way, a file based bpf iterator, which holds a reference
> to the link, can have its bpf program updated without
> creating new files.
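
From user space, such an update would look roughly like this (a
sketch against the bpf(2) syscall, per the link_update attribute):

	union bpf_attr attr = {};

	attr.link_update.link_fd = link_fd;		/* e.g. a bpffs-pinned
							 * iter link */
	attr.link_update.new_prog_fd = new_prog_fd;	/* replacement prog */
	err = syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));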
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Nice and simple!

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  kernel/bpf/bpf_iter.c | 31 +++++++++++++++++++++++++++++++
>  1 file changed, 31 insertions(+)
>

[...]


* Re: [PATCH bpf-next v2 02/20] bpf: allow loading of a bpf_iter program
  2020-05-05 21:29   ` Andrii Nakryiko
@ 2020-05-06  0:07     ` Yonghong Song
  0 siblings, 0 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-06  0:07 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 2:29 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> A bpf_iter program is a tracing program with attach type
>> BPF_TRACE_ITER. The load attribute
>>    attach_btf_id
>> is used by the verifier against a particular kernel function,
>> which represents a target, e.g., __bpf_iter__bpf_map
>> for target bpf_map which is implemented later.
>>
>> The program return value must be 0 or 1 for now.
>>    0 : successful, except potential seq_file buffer overflow
>>        which is handled by seq_file reader.
>>    1 : request to restart the same object
> 
> This bit is interesting. Is the idea that if a BPF program also wants to
> send something over, say, perf_buffer, but fails, it can "request"
> the same execution again? I wonder if a typical libc fread() implementation

Yes. bpf_seq_read() can handle this the same as any other
retry request. The following is the current mapping:
    bpf program return 0   ---> seq_ops->show() return 0
    bpf program return 1   ---> seq_ops->show() return -EAGAIN
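
A sketch of where that translation could live in a target's show()
callback (hedged: names and signatures as sketched here, with the
actual glue landing in the bpf_map target patch):

static int bpf_map_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta = { .seq = seq };
	struct bpf_prog *prog;
	int ret;

	/* fetch the prog from the iter infrastructure */
	prog = bpf_iter_get_info(&meta, false);
	ret = bpf_iter_run_prog(prog, v /* ctx built from meta + object */);

	return ret ? -EAGAIN : 0;	/* 1: retry the same object */
}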

> would handle EAGAIN properly; it seems more geared towards
> non-blocking I/O?

I do not have a test in the current patch set for a bpf program
returning 1. Will add one in the next version.

> 
> On the other hand, following start/show/next logic for seq_file
> iteration, requesting skipping element seems useful. It would allow
> (in some cases) to "speculatively" generate output and at some point
> realize that this is not an element we actually want in the output and
> request to ignore that output.
> 
> Don't know how useful the latter is going to be in practice, but just
> something to keep in mind for the future, I guess...
> 
>>
>> In the future, other return values may be used for filtering or
>> terminating the iterator.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h            |  3 +++
>>   include/uapi/linux/bpf.h       |  1 +
>>   kernel/bpf/bpf_iter.c          | 30 ++++++++++++++++++++++++++++++
>>   kernel/bpf/verifier.c          | 21 +++++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   5 files changed, 56 insertions(+)
>>
> 
> [...]
> 
> 
>> +
>> +bool bpf_iter_prog_supported(struct bpf_prog *prog)
>> +{
>> +       const char *attach_fname = prog->aux->attach_func_name;
>> +       u32 prog_btf_id = prog->aux->attach_btf_id;
>> +       const char *prefix = BPF_ITER_FUNC_PREFIX;
>> +       struct bpf_iter_target_info *tinfo;
>> +       int prefix_len = strlen(prefix);
>> +       bool supported = false;
>> +
>> +       if (strncmp(attach_fname, prefix, prefix_len))
>> +               return false;
>> +
>> +       mutex_lock(&targets_mutex);
>> +       list_for_each_entry(tinfo, &targets, list) {
>> +               if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
>> +                       supported = true;
>> +                       break;
>> +               }
>> +               if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
>> +                       tinfo->btf_id = prog->aux->attach_btf_id;
> 
> This target_info->btf_id caching here is a bit subtle and easy to
> miss; it would be nice to have a comment calling this out explicitly.

Will do.

> Thanks!
> 
>> +                       supported = true;
>> +                       break;
>> +               }
>> +       }
>> +       mutex_unlock(&targets_mutex);
>> +
>> +       return supported;
>> +}
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 70ad009577f8..d725ff7d11db 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env)
>>                          return 0;
>>                  range = tnum_const(0);
>>                  break;
>> +       case BPF_PROG_TYPE_TRACING:
>> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
>> +                       return 0;
> 
> Commit message mentions enforcing [0, 1], shouldn't it be done here?

The default range is [0, 1], hence no explicit assignment here.

static int check_return_code(struct bpf_verifier_env *env)
{
         struct tnum enforce_attach_type_range = tnum_unknown;
         const struct bpf_prog *prog = env->prog;
         struct bpf_reg_state *reg;
         struct tnum range = tnum_range(0, 1);
......

> 
> 
>> +               break;
>>          default:
>>                  return 0;
>>          }
> 
> [...]
> 


* Re: [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-05 21:30   ` Andrii Nakryiko
@ 2020-05-06  0:14     ` Yonghong Song
  2020-05-06  0:54       ` Alexei Starovoitov
  0 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-06  0:14 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Given a bpf program, the steps to create an anonymous bpf iterator are:
>>    - create a bpf_iter_link, which combines bpf program and the target.
>>      In the future, there could be more information recorded in the link.
>>      A link_fd will be returned to the user space.
>>    - create an anonymous bpf iterator with the given link_fd.
>>
>> The bpf_iter_link can be pinned to bpffs mount file system to
>> create a file based bpf iterator as well.
>>
>> The benefits of using bpf_iter_link:
>>    - using bpf link simplifies design and implementation as bpf link
>>      is used for other tracing bpf programs.
>>    - for file based bpf iterators, bpf_iter_link provides a standard
>>      way to replace underlying bpf programs.
>>    - for both anonymous and file based iterators, bpf link query
>>      capability can be leveraged.
>>
>> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
>> querying. Currently, only prog_id is needed, so there is no
>> additional in-kernel show_fdinfo() and fill_link_info() hook
>> is needed for BPF_LINK_TYPE_ITER link.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> LGTM. See small nit about __GFP_NOWARN.
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
> 
>>   include/linux/bpf.h            |  1 +
>>   include/linux/bpf_types.h      |  1 +
>>   include/uapi/linux/bpf.h       |  1 +
>>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>   kernel/bpf/syscall.c           | 14 ++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   6 files changed, 80 insertions(+)
>>
> 
> [...]
> 
>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>> +{
>> +       struct bpf_link_primer link_primer;
>> +       struct bpf_iter_target_info *tinfo;
>> +       struct bpf_iter_link *link;
>> +       bool existed = false;
>> +       u32 prog_btf_id;
>> +       int err;
>> +
>> +       if (attr->link_create.target_fd || attr->link_create.flags)
>> +               return -EINVAL;
>> +
>> +       prog_btf_id = prog->aux->attach_btf_id;
>> +       mutex_lock(&targets_mutex);
>> +       list_for_each_entry(tinfo, &targets, list) {
>> +               if (tinfo->btf_id == prog_btf_id) {
>> +                       existed = true;
>> +                       break;
>> +               }
>> +       }
>> +       mutex_unlock(&targets_mutex);
>> +       if (!existed)
>> +               return -ENOENT;
>> +
>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
> 
> nit: all existing link implementations don't specify __GFP_NOWARN;
> wonder if bpf_iter_link should be special?

Nothing special. I just feel __GFP_NOWARN is the right thing to do to
avoid polluting dmesg, since -ENOMEM is returned to user space. But in
reality, unlike some key/value allocations where the size could be huge
and __GFP_NOWARN is more useful, here sizeof(*link) is fixed
and small, so __GFP_NOWARN is probably not that useful.

Will drop it.
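
For contrast, the two situations being discussed (a sketch):

	/* user-controlled, potentially huge: suppress the warning */
	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);

	/* small fixed-size kernel object: let a failed allocation warn */
	link = kzalloc(sizeof(*link), GFP_USER);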

> 
>> +       if (!link)
>> +               return -ENOMEM;
>> +
>> +       bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
>> +       link->tinfo = tinfo;
>> +
>> +       err  = bpf_link_prime(&link->link, &link_primer);
>> +       if (err) {
>> +               kfree(link);
>> +               return err;
>> +       }
>> +
>> +       return bpf_link_settle(&link_primer);
>> +}
> 
> [...]
> 


* Re: [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-06  0:14     ` Yonghong Song
@ 2020-05-06  0:54       ` Alexei Starovoitov
  2020-05-06  3:09         ` Andrii Nakryiko
  0 siblings, 1 reply; 62+ messages in thread
From: Alexei Starovoitov @ 2020-05-06  0:54 UTC (permalink / raw)
  To: Yonghong Song, Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Daniel Borkmann, Kernel Team

On 5/5/20 5:14 PM, Yonghong Song wrote:
> 
> 
> On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
>> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>>
>>> Given a bpf program, the steps to create an anonymous bpf iterator are:
>>>    - create a bpf_iter_link, which combines bpf program and the target.
>>>      In the future, there could be more information recorded in the 
>>> link.
>>>      A link_fd will be returned to the user space.
>>>    - create an anonymous bpf iterator with the given link_fd.
>>>
>>> The bpf_iter_link can be pinned to bpffs mount file system to
>>> create a file based bpf iterator as well.
>>>
>>> The benefits of using bpf_iter_link:
>>>    - using bpf link simplifies design and implementation as bpf link
>>>      is used for other tracing bpf programs.
>>>    - for file based bpf iterators, bpf_iter_link provides a standard
>>>      way to replace underlying bpf programs.
>>>    - for both anonymous and file based iterators, bpf link query
>>>      capability can be leveraged.
>>>
>>> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
>>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
>>> querying. Currently, only prog_id is needed, so there is no
>>> additional in-kernel show_fdinfo() and fill_link_info() hook
>>> is needed for BPF_LINK_TYPE_ITER link.
>>>
>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>> ---
>>
>> LGTM. See small nit about __GFP_NOWARN.
>>
>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>>
>>
>>>   include/linux/bpf.h            |  1 +
>>>   include/linux/bpf_types.h      |  1 +
>>>   include/uapi/linux/bpf.h       |  1 +
>>>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>>   kernel/bpf/syscall.c           | 14 ++++++++
>>>   tools/include/uapi/linux/bpf.h |  1 +
>>>   6 files changed, 80 insertions(+)
>>>
>>
>> [...]
>>
>>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog 
>>> *prog)
>>> +{
>>> +       struct bpf_link_primer link_primer;
>>> +       struct bpf_iter_target_info *tinfo;
>>> +       struct bpf_iter_link *link;
>>> +       bool existed = false;
>>> +       u32 prog_btf_id;
>>> +       int err;
>>> +
>>> +       if (attr->link_create.target_fd || attr->link_create.flags)
>>> +               return -EINVAL;
>>> +
>>> +       prog_btf_id = prog->aux->attach_btf_id;
>>> +       mutex_lock(&targets_mutex);
>>> +       list_for_each_entry(tinfo, &targets, list) {
>>> +               if (tinfo->btf_id == prog_btf_id) {
>>> +                       existed = true;
>>> +                       break;
>>> +               }
>>> +       }
>>> +       mutex_unlock(&targets_mutex);
>>> +       if (!existed)
>>> +               return -ENOENT;
>>> +
>>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
>>
>> nit: all existing link implementations don't specify __GFP_NOWARN;
>> wonder if bpf_iter_link should be special?
> 
> Nothing special. I just feel __GFP_NOWARN is the right thing to do to
> avoid polluting dmesg, since -ENOMEM is returned to user space. But in
> reality, unlike some key/value allocations where the size could be huge
> and __GFP_NOWARN is more useful, here sizeof(*link) is fixed
> and small, so __GFP_NOWARN is probably not that useful.
> 
> Will drop it.

Actually, all existing user-space-driven allocations have nowarn.
If we missed it in other link allocs, we should probably add it.


* Re: [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-06  0:54       ` Alexei Starovoitov
@ 2020-05-06  3:09         ` Andrii Nakryiko
  2020-05-06 18:08           ` Alexei Starovoitov
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  3:09 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Yonghong Song, Andrii Nakryiko, bpf, Martin KaFai Lau,
	Networking, Daniel Borkmann, Kernel Team

On Tue, May 5, 2020 at 5:54 PM Alexei Starovoitov <ast@fb.com> wrote:
>
> On 5/5/20 5:14 PM, Yonghong Song wrote:
> >
> >
> > On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
> >> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>>
> >>> Given a bpf program, the steps to create an anonymous bpf iterator are:
> >>>    - create a bpf_iter_link, which combines bpf program and the target.
> >>>      In the future, there could be more information recorded in the
> >>> link.
> >>>      A link_fd will be returned to the user space.
> >>>    - create an anonymous bpf iterator with the given link_fd.
> >>>
> >>> The bpf_iter_link can be pinned to bpffs mount file system to
> >>> create a file based bpf iterator as well.
> >>>
> >>> The benefits of using bpf_iter_link:
> >>>    - using bpf link simplifies design and implementation as bpf link
> >>>      is used for other tracing bpf programs.
> >>>    - for file based bpf iterators, bpf_iter_link provides a standard
> >>>      way to replace underlying bpf programs.
> >>>    - for both anonymous and file based iterators, bpf link query
> >>>      capability can be leveraged.
> >>>
> >>> The patch added support of tracing/iter programs for BPF_LINK_CREATE.
> >>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
> >>> querying. Currently, only prog_id is needed, so there is no
> >>> additional in-kernel show_fdinfo() and fill_link_info() hook
> >>> is needed for BPF_LINK_TYPE_ITER link.
> >>>
> >>> Signed-off-by: Yonghong Song <yhs@fb.com>
> >>> ---
> >>
> >> LGTM. See small nit about __GFP_NOWARN.
> >>
> >> Acked-by: Andrii Nakryiko <andriin@fb.com>
> >>
> >>
> >>>   include/linux/bpf.h            |  1 +
> >>>   include/linux/bpf_types.h      |  1 +
> >>>   include/uapi/linux/bpf.h       |  1 +
> >>>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
> >>>   kernel/bpf/syscall.c           | 14 ++++++++
> >>>   tools/include/uapi/linux/bpf.h |  1 +
> >>>   6 files changed, 80 insertions(+)
> >>>
> >>
> >> [...]
> >>
> >>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog
> >>> *prog)
> >>> +{
> >>> +       struct bpf_link_primer link_primer;
> >>> +       struct bpf_iter_target_info *tinfo;
> >>> +       struct bpf_iter_link *link;
> >>> +       bool existed = false;
> >>> +       u32 prog_btf_id;
> >>> +       int err;
> >>> +
> >>> +       if (attr->link_create.target_fd || attr->link_create.flags)
> >>> +               return -EINVAL;
> >>> +
> >>> +       prog_btf_id = prog->aux->attach_btf_id;
> >>> +       mutex_lock(&targets_mutex);
> >>> +       list_for_each_entry(tinfo, &targets, list) {
> >>> +               if (tinfo->btf_id == prog_btf_id) {
> >>> +                       existed = true;
> >>> +                       break;
> >>> +               }
> >>> +       }
> >>> +       mutex_unlock(&targets_mutex);
> >>> +       if (!existed)
> >>> +               return -ENOENT;
> >>> +
> >>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
> >>
> >> nit: all existing link implementations don't specify __GFP_NOWARN;
> >> wonder if bpf_iter_link should be special?
> >
> > Nothing special. I just feel __GFP_NOWARN is the right thing to do to
> > avoid polluting dmesg, since -ENOMEM is returned to user space. But in
> > reality, unlike some key/value allocations where the size could be huge
> > and __GFP_NOWARN is more useful, here sizeof(*link) is fixed
> > and small, so __GFP_NOWARN is probably not that useful.
> >
> > Will drop it.
>
> actually all existing user space driven allocations have nowarn.

Can you define "user space driven"? I understand why for map, map key,
map value, and program we want to do that, because it's way too easy for
user-space to specify huge sizes and the allocation is proportional to
that size. But in this case links are fixed-size objects, same as
struct file and struct inode. From the BPF world, for instance, there is
struct bpf_prog_list, which is created when a user attaches a BPF
program to a cgroup, so it is user-space driven in a similar sense. Yet
we allocate it without __GFP_NOWARN.

> If we missed it in other link allocs we should probably add it.

Before bpf_link was formalized, raw_tracepoint_open was creating
struct bpf_raw_tracepoint, without NOWARN.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 09/20] bpf: add bpf_map iterator
  2020-05-04  6:25 ` [PATCH bpf-next v2 09/20] bpf: add bpf_map iterator Yonghong Song
@ 2020-05-06  5:11   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  5:11 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>
> Implement seq_file operations to traverse all maps.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Looks great!

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  include/linux/bpf.h   |   1 +
>  kernel/bpf/Makefile   |   2 +-
>  kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c  |  19 ++++++++
>  4 files changed, 128 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/map_iter.c
>

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 10/20] net: bpf: add netlink and ipv6_route bpf_iter targets
  2020-05-04  6:25 ` [PATCH bpf-next v2 10/20] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
@ 2020-05-06  5:21   ` Andrii Nakryiko
  2020-05-06 17:32     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  5:21 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:29 PM Yonghong Song <yhs@fb.com> wrote:
>
> This patch adds netlink and ipv6_route targets, using
> the same seq_ops (except show() and minor changes for stop())
> as /proc/net/{netlink,ipv6_route}.
>
> The net namespace for these targets is the current net
> namespace at file open time, similar to
> /proc/net/{netlink,ipv6_route} reference counting
> the net namespace at seq_file open time.
>
> Since modules are not supported for now, ipv6_route is
> supported only if IPV6 is built-in, i.e., not compiled
> as a module. The restriction can be lifted once modules
> are properly supported for bpf_iter.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  fs/proc/proc_net.c       | 19 +++++++++
>  include/linux/proc_fs.h  |  3 ++
>  net/ipv6/ip6_fib.c       | 65 +++++++++++++++++++++++++++++-
>  net/ipv6/route.c         | 27 +++++++++++++
>  net/netlink/af_netlink.c | 87 +++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 197 insertions(+), 4 deletions(-)
>

[...]

>  int __init ip6_route_init(void)
>  {
>         int ret;
> @@ -6455,6 +6474,14 @@ int __init ip6_route_init(void)
>         if (ret)
>                 goto out_register_late_subsys;
>
> +#if IS_BUILTIN(CONFIG_IPV6)
> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
> +       ret = bpf_iter_register();
> +       if (ret)
> +               goto out_register_late_subsys;

Seems like the bpf_iter infra is missing an unregistering API.
ip6_route_init(), if it fails, undoes all the registrations, so it
should probably unregister the ipv6_route target as well?
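
Something along these lines, purely as a sketch (an unregister API does
not exist in this series yet, so the function and field names below are
hypothetical):

/* sketch only: a possible unregister counterpart to bpf_iter_reg_target() */
void bpf_iter_unreg_target(const char *target)
{
	struct bpf_iter_target_info *tinfo;

	mutex_lock(&targets_mutex);
	list_for_each_entry(tinfo, &targets, list) {
		if (!strcmp(target, tinfo->target)) {
			list_del(&tinfo->list);
			kfree(tinfo);
			break;
		}
	}
	mutex_unlock(&targets_mutex);
}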

> +#endif
> +#endif
> +
>         for_each_possible_cpu(cpu) {
>                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
>

[...]

> +static void netlink_seq_stop(struct seq_file *seq, void *v)
> +{
> +       struct bpf_iter_meta meta;
> +       struct bpf_prog *prog;
> +
> +       if (!v) {
> +               meta.seq = seq;
> +               prog = bpf_iter_get_info(&meta, true);
> +               if (prog)
> +                       netlink_prog_seq_show(prog, &meta, v);

nit: netlink_prog_seq_show() can return a failure (from the BPF program),
but you are not returning it. Given seq_file's stop is not supposed to
fail, you can explicitly cast the result to (void)? I think it's done in
a few other places in BPF code, when the return result is explicitly
ignored.
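
I.e., something like this (sketch):

	if (prog)
		(void)netlink_prog_seq_show(prog, &meta, v);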


> +       }
> +
> +       netlink_native_seq_stop(seq, v);
> +}
> +#else

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 16/20] tools/libbpf: add bpf_iter support
  2020-05-04  6:26 ` [PATCH bpf-next v2 16/20] tools/libbpf: add bpf_iter support Yonghong Song
@ 2020-05-06  5:44   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  5:44 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:27 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two new libbpf APIs are added to support bpf_iter:
>   - bpf_program__attach_iter
>     Given a bpf program and additional parameters (none for now),
>     returns a bpf_link.
>   - bpf_iter_create
>     syscall level API to create a bpf iterator.
>
> The macro BPF_SEQ_PRINTF is also introduced. The format
> looks like:
>   BPF_SEQ_PRINTF(seq, "task id %d\n", pid);
>
> This macro gives bpf program writers a
> nicer bpf_seq_printf syntax similar to the kernel one.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Looks great! Just a few nits.

>  tools/lib/bpf/bpf.c         | 11 +++++++++
>  tools/lib/bpf/bpf.h         |  2 ++
>  tools/lib/bpf/bpf_tracing.h | 16 +++++++++++++
>  tools/lib/bpf/libbpf.c      | 45 +++++++++++++++++++++++++++++++++++++
>  tools/lib/bpf/libbpf.h      |  9 ++++++++
>  tools/lib/bpf/libbpf.map    |  2 ++
>  6 files changed, 85 insertions(+)
>
> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> index 43322f0d6c7f..1756ae47ddf2 100644
> --- a/tools/lib/bpf/bpf.c
> +++ b/tools/lib/bpf/bpf.c
> @@ -619,6 +619,17 @@ int bpf_link_update(int link_fd, int new_prog_fd,
>         return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
>  }
>
> +int bpf_iter_create(int link_fd, unsigned int flags)

As discussed in previous thread, given we don't anticipate needing
anything beyond link_fd, let's do bpf_iter_create(int link_fd), nice
and simple. Once we need to add any extensibility, we can add
bpf_iter_create_xattr() variant with opts.
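
I.e., a sketch of the simplified variant:

int bpf_iter_create(int link_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;

	return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
}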

> +{
> +       union bpf_attr attr;
> +
> +       memset(&attr, 0, sizeof(attr));
> +       attr.iter_create.link_fd = link_fd;
> +       attr.iter_create.flags = flags;
> +
> +       return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
> +}
> +

[...]

> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 977add1b73e2..93355a257405 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -6629,6 +6629,9 @@ static const struct bpf_sec_def section_defs[] = {
>                 .is_attach_btf = true,
>                 .expected_attach_type = BPF_LSM_MAC,
>                 .attach_fn = attach_lsm),
> +       SEC_DEF("iter/", TRACING,
> +               .expected_attach_type = BPF_TRACE_ITER,
> +               .is_attach_btf = true),

Another nit. As discussed, I think auto-attach is a nice feature,
which can be skipped if the user doesn't want/need it.

>         BPF_PROG_SEC("xdp",                     BPF_PROG_TYPE_XDP),
>         BPF_PROG_SEC("perf_event",              BPF_PROG_TYPE_PERF_EVENT),
>         BPF_PROG_SEC("lwt_in",                  BPF_PROG_TYPE_LWT_IN),
> @@ -6891,6 +6894,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
>

[...]

>
> +struct bpf_link *
> +bpf_program__attach_iter(struct bpf_program *prog,
> +                        const struct bpf_iter_attach_opts *opts)
> +{
> +       enum bpf_attach_type attach_type;
> +       char errmsg[STRERR_BUFSIZE];
> +       struct bpf_link *link;
> +       int prog_fd, link_fd;
> +
> +       if (!OPTS_VALID(opts, bpf_iter_attach_opts))
> +               return ERR_PTR(-EINVAL);
> +
> +       prog_fd = bpf_program__fd(prog);
> +       if (prog_fd < 0) {
> +               pr_warn("program '%s': can't attach before loaded\n",
> +                       bpf_program__title(prog, false));
> +               return ERR_PTR(-EINVAL);
> +       }
> +
> +       link = calloc(1, sizeof(*link));
> +       if (!link)
> +               return ERR_PTR(-ENOMEM);
> +       link->detach = &bpf_link__detach_fd;
> +
> +       attach_type = BPF_TRACE_ITER;
> +       link_fd = bpf_link_create(prog_fd, 0, attach_type, NULL);

nit: the attach_type variable doesn't seem to be necessary
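
I.e., just:

	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL);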

> +       if (link_fd < 0) {
> +               link_fd = -errno;
> +               free(link);
> +               pr_warn("program '%s': failed to attach to iterator: %s\n",
> +                       bpf_program__title(prog, false),
> +                       libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
> +               return ERR_PTR(link_fd);
> +       }
> +       link->fd = link_fd;
> +       return link;
> +}
> +

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  2020-05-04  6:26 ` [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
@ 2020-05-06  6:01   ` Andrii Nakryiko
  2020-05-07  1:09     ` Yonghong Song
  2020-05-06  6:04   ` Andrii Nakryiko
  1 sibling, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  6:01 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:30 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two bpf programs are added in this patch for netlink and ipv6_route
> target. On my VM, I am able to achieve identical
> results compared to /proc/net/netlink and /proc/net/ipv6_route.
>
>   $ cat /proc/net/netlink
>   sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>   000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>   00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>   00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>   000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>   ....
>   00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>   000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>   00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>   000000008398fb08 16  0          00000000 0        0        0     2        0        27
>   $ cat /sys/fs/bpf/my_netlink
>   sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>   000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>   00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>   00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>   000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>   ....
>   00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>   000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>   00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>   000000008398fb08 16  0          00000000 0        0        0     2        0        27
>
>   $ cat /proc/net/ipv6_route
>   fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>   00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>   fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>   ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>   $ cat /sys/fs/bpf/my_ipv6_route
>   fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>   00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>   fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>   ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Looks good, but something looks weird with the printf below...

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 63 ++++++++++++++++
>  .../selftests/bpf/progs/bpf_iter_netlink.c    | 74 +++++++++++++++++++
>  2 files changed, 137 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
>
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
> new file mode 100644
> index 000000000000..0dee4629298f
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_endian.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
> +
> +#define        RTF_GATEWAY             0x0002
> +#define IFNAMSIZ               16

nit: these look weirdly unaligned :)

> +#define fib_nh_gw_family        nh_common.nhc_gw_family
> +#define fib_nh_gw6              nh_common.nhc_gw.ipv6
> +#define fib_nh_dev              nh_common.nhc_dev
> +

[...]


> +       dev = fib6_nh->fib_nh_dev;
> +       if (dev)
> +               BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
> +                              rt->fib6_ref.refs.counter, 0, flags, dev->name);
> +       else
> +               BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
> +                              rt->fib6_ref.refs.counter, 0, flags);

hmm... how does it work? you specify 4 params, but the format string
expects 5. Shouldn't this fail?

> +
> +       return 0;
> +}
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
> new file mode 100644
> index 000000000000..0a85a621a36d
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
> @@ -0,0 +1,74 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_endian.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +#define sk_rmem_alloc  sk_backlog.rmem_alloc
> +#define sk_refcnt      __sk_common.skc_refcnt
> +
> +#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
> +#define container_of(ptr, type, member)                                \
> +       ({                                                      \
> +               void *__mptr = (void *)(ptr);                   \
> +               ((type *)(__mptr - offsetof(type, member)));    \
> +       })

we should probably put the offsetof(), offsetofend() and container_of()
macros into bpf_helpers.h; they seem like universal things for kernel
data structs :)

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  2020-05-04  6:26 ` [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
  2020-05-06  6:01   ` Andrii Nakryiko
@ 2020-05-06  6:04   ` Andrii Nakryiko
  2020-05-06 23:07     ` Yonghong Song
  1 sibling, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  6:04 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:30 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two bpf programs are added in this patch for netlink and ipv6_route
> target. On my VM, I am able to achieve identical
> results compared to /proc/net/netlink and /proc/net/ipv6_route.
>
>   $ cat /proc/net/netlink
>   sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>   000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>   00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>   00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>   000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>   ....
>   00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>   000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>   00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>   000000008398fb08 16  0          00000000 0        0        0     2        0        27
>   $ cat /sys/fs/bpf/my_netlink
>   sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>   000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>   00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>   00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>   000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>   ....
>   00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>   000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>   00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>   000000008398fb08 16  0          00000000 0        0        0     2        0        27
>
>   $ cat /proc/net/ipv6_route
>   fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>   00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>   fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>   ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>   $ cat /sys/fs/bpf/my_ipv6_route
>   fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>   00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>   fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>   ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>   00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Just realized, this is only the BPF programs, right? It would be good to
have at least a minimal user-space program that would verify and load
them. Otherwise we'll just be testing compilation and it might "bit rot"
a bit...
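
Even something minimal would do; a sketch, assuming a generated
skeleton (the skeleton and test names below are hypothetical):

#include <test_progs.h>
#include "bpf_iter_netlink.skel.h"

void test_bpf_iter_netlink(void)
{
	struct bpf_iter_netlink *skel;

	/* open_and_load alone already exercises the verifier */
	skel = bpf_iter_netlink__open_and_load();
	if (!skel) {
		test__fail();
		return;
	}
	bpf_iter_netlink__destroy(skel);
}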

>  .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 63 ++++++++++++++++
>  .../selftests/bpf/progs/bpf_iter_netlink.c    | 74 +++++++++++++++++++
>  2 files changed, 137 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
>

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 19/20] tools/bpf: selftests: add iter progs for bpf_map/task/task_file
  2020-05-04  6:26 ` [PATCH bpf-next v2 19/20] tools/bpf: selftests: add iter progs for bpf_map/task/task_file Yonghong Song
@ 2020-05-06  6:14   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  6:14 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:27 PM Yonghong Song <yhs@fb.com> wrote:
>
> The implementation is arbitrary, just to show how bpf programs
> can be written for bpf_map/task/task_file. They can be customized
> for specific needs.
>
> For example, for bpf_map, the iterator prints out:
>   $ cat /sys/fs/bpf/my_bpf_map
>       id   refcnt  usercnt  locked_vm
>        3        2        0         20
>        6        2        0         20
>        9        2        0         20
>       12        2        0         20
>       13        2        0         20
>       16        2        0         20
>       19        2        0         20
>       === END ===
>
> For task, the iterator prints out:
>   $ cat /sys/fs/bpf/my_task
>     tgid      gid
>        1        1
>        2        2
>     ....
>     1944     1944
>     1948     1948
>     1949     1949
>     1953     1953
>     === END ===
>
> For task/file, the iterator prints out:
>   $ cat /sys/fs/bpf/my_task_file
>     tgid      gid       fd      file
>        1        1        0 ffffffff95c97600
>        1        1        1 ffffffff95c97600
>        1        1        2 ffffffff95c97600
>     ....
>     1895     1895      255 ffffffff95c8fe00
>     1932     1932        0 ffffffff95c8fe00
>     1932     1932        1 ffffffff95c8fe00
>     1932     1932        2 ffffffff95c8fe00
>     1932     1932        3 ffffffff95c185c0
>
> This is able to print out all open files (fd and file->f_op), so the user can compare
> f_op against a particular kernel file_operations instance to find what it is.
> For example, from /proc/kallsyms, we can find
>   ffffffff95c185c0 r eventfd_fops
> so we will know tgid 1932 fd 3 is an eventfd file descriptor.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM.

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  .../selftests/bpf/progs/bpf_iter_bpf_map.c    | 29 +++++++++++++++++++
>  .../selftests/bpf/progs/bpf_iter_task.c       | 26 +++++++++++++++++
>  .../selftests/bpf/progs/bpf_iter_task_file.c  | 27 +++++++++++++++++
>  3 files changed, 82 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
>
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
> new file mode 100644
> index 000000000000..d0af0e82b74c
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
> @@ -0,0 +1,29 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_endian.h>

bpf_endian.h doesn't seem to be actually used, and it's
incompatible with vmlinux.h, so maybe let's drop it?
Same for the previous patch, I believe.
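
I.e., these progs presumably only need:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>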

> +
> +char _license[] SEC("license") = "GPL";
> +

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 20/20] tools/bpf: selftests: add bpf_iter selftests
  2020-05-04  6:26 ` [PATCH bpf-next v2 20/20] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
@ 2020-05-06  6:39   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  6:39 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> The added test includes the following subtests:
>   - test verifier change for btf_id_or_null
>   - test load/create_iter/read for
>     ipv6_route/netlink/bpf_map/task/task_file
>   - test anon bpf iterator
>   - test anon bpf iterator reading one char at a time
>   - test file bpf iterator
>   - test overflow (single bpf program output not overflow)
>   - test overflow (single bpf program output overflows)
>
> The ipv6_route program tests the following verifier change
>   - access fields in the variable length array of the structure.
>
> The netlink program load tests the following verifier change
>   - put a btf_id ptr value on the stack and make it accessible to
>     tracing/iter programs.
>
>   $ test_progs -n 2
>   #2/1 btf_id_or_null:OK
>   #2/2 ipv6_route:OK
>   #2/3 netlink:OK
>   #2/4 bpf_map:OK
>   #2/5 task:OK
>   #2/6 task_file:OK
>   #2/7 anon:OK
>   #2/8 anon-read-one-char:OK
>   #2/9 file:OK
>   #2/10 overflow:OK
>   #2/11 overflow-e2big:OK
>   #2 bpf_iter:OK
>   Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Looks good overall. bpf_link__disconnect() is wrong, though, please
remove it. With that:

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  .../selftests/bpf/prog_tests/bpf_iter.c       | 390 ++++++++++++++++++
>  .../selftests/bpf/progs/bpf_iter_test_kern1.c |   4 +
>  .../selftests/bpf/progs/bpf_iter_test_kern2.c |   4 +
>  .../selftests/bpf/progs/bpf_iter_test_kern3.c |  18 +
>  .../selftests/bpf/progs/bpf_iter_test_kern4.c |  48 +++
>  .../bpf/progs/bpf_iter_test_kern_common.h     |  22 +
>  6 files changed, 486 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
>

[...]

> +
> +free_link:
> +       bpf_link__disconnect(link);

bpf_link__disconnect() will actually make the destroy() below not close
the link, so there is no need for it. Same below in a few places.

> +       bpf_link__destroy(link);
> +}
> +

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets
  2020-05-04  6:25 ` [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets Yonghong Song
@ 2020-05-06  7:30   ` Andrii Nakryiko
  2020-05-06 18:24     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06  7:30 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>
> Only the tasks belonging to the "current" pid namespace
> are enumerated.
>
> For task/file target, the bpf program will have access to
>   struct task_struct *task
>   u32 fd
>   struct file *file
> where fd/file is an open file for the task.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

I might be missing some subtleties with task refcounting for the
task_file iterator; I asked a few questions below...

>  kernel/bpf/Makefile    |   2 +-
>  kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 337 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/task_iter.c
>
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index b2b5eefc5254..37b2d8620153 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -2,7 +2,7 @@
>  obj-y := core.o
>  CFLAGS_core.o += $(call cc-disable-warning, override-init)
>
> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> new file mode 100644
> index 000000000000..1ca258f6e9f4
> --- /dev/null
> +++ b/kernel/bpf/task_iter.c
> @@ -0,0 +1,336 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2020 Facebook */
> +
> +#include <linux/init.h>
> +#include <linux/namei.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/fs.h>
> +#include <linux/fdtable.h>
> +#include <linux/filter.h>
> +
> +struct bpf_iter_seq_task_common {
> +       struct pid_namespace *ns;
> +};
> +
> +struct bpf_iter_seq_task_info {
> +       struct bpf_iter_seq_task_common common;

you have a comment below in init_seq_pidns() that common is supposed to
be the very first field, but I think it's more important and
appropriate here, so that whoever adds anything here knows that the
order of fields is important.

> +       struct task_struct *task;
> +       u32 id;
> +};
> +

[...]

> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
> +{
> +       struct bpf_iter_meta meta;
> +       struct bpf_iter__task ctx;
> +       struct bpf_prog *prog;
> +       int ret = 0;
> +
> +       meta.seq = seq;
> +       prog = bpf_iter_get_info(&meta, in_stop);
> +       if (prog) {


nit: `if (!prog) return 0;` here would reduce nesting level below

> +               meta.seq = seq;
> +               ctx.meta = &meta;
> +               ctx.task = v;
> +               ret = bpf_iter_run_prog(prog, &ctx);
> +       }
> +
> +       return 0;

return **ret**; ?

> +}
> +

[...]

> +
> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
> +                                          int *fd, struct task_struct **task,
> +                                          struct files_struct **fstruct)
> +{
> +       struct files_struct *files;
> +       struct task_struct *tk;
> +       u32 sid = *id;
> +       int sfd;
> +
> +       /* If this function returns a non-NULL file object,
> +        * it held a reference to the files_struct and file.
> +        * Otherwise, it does not hold any reference.
> +        */
> +again:
> +       if (*fstruct) {
> +               files = *fstruct;
> +               sfd = *fd;
> +       } else {
> +               tk = task_seq_get_next(ns, &sid);
> +               if (!tk)
> +                       return NULL;
> +
> +               files = get_files_struct(tk);
> +               put_task_struct(tk);

task is put here, but is still used below... is there some additional
hidden refcounting involved?

> +               if (!files) {
> +                       sid = ++(*id);
> +                       *fd = 0;
> +                       goto again;
> +               }
> +               *fstruct = files;
> +               *task = tk;
> +               if (sid == *id) {
> +                       sfd = *fd;
> +               } else {
> +                       *id = sid;
> +                       sfd = 0;
> +               }
> +       }
> +
> +       rcu_read_lock();
> +       for (; sfd < files_fdtable(files)->max_fds; sfd++) {

files_fdtable() does an rcu_dereference() on each iteration; would it be
better to just cache files_fdtable(files)->max_fds in a local
variable? It's unlikely that there will be many iterations, but
still...
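
I.e., a sketch:

	unsigned int max_fds;

	rcu_read_lock();
	max_fds = files_fdtable(files)->max_fds;
	for (; sfd < max_fds; sfd++) {
		/* same fcheck_files() loop body as before */
	}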

> +               struct file *f;
> +
> +               f = fcheck_files(files, sfd);
> +               if (!f)
> +                       continue;
> +               *fd = sfd;
> +               get_file(f);
> +               rcu_read_unlock();
> +               return f;
> +       }
> +
> +       /* the current task is done, go to the next task */
> +       rcu_read_unlock();
> +       put_files_struct(files);
> +       *fstruct = NULL;

*task = NULL; for completeness?

> +       sid = ++(*id);
> +       *fd = 0;
> +       goto again;
> +}
> +
> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +       struct bpf_iter_seq_task_file_info *info = seq->private;
> +       struct files_struct *files = NULL;
> +       struct task_struct *task = NULL;
> +       struct file *file;
> +       u32 id = info->id;
> +       int fd = info->fd;
> +
> +       file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
> +       if (!file) {
> +               info->files = NULL;

what about info->task here?

> +               return NULL;
> +       }
> +
> +       ++*pos;
> +       info->id = id;
> +       info->fd = fd;
> +       info->task = task;
> +       info->files = files;
> +
> +       return file;
> +}
> +

[...]

> +
> +struct bpf_iter__task_file {
> +       __bpf_md_ptr(struct bpf_iter_meta *, meta);
> +       __bpf_md_ptr(struct task_struct *, task);
> +       u32 fd;

nit: this sort of works by accident (due to all the other fields being
8-byte aligned pointers); shall we add __attribute__((aligned(8)))?
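
I.e. (sketch):

struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __attribute__((aligned(8)));
	__bpf_md_ptr(struct file *, file);
};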

> +       __bpf_md_ptr(struct file *, file);
> +};
> +

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 10/20] net: bpf: add netlink and ipv6_route bpf_iter targets
  2020-05-06  5:21   ` Andrii Nakryiko
@ 2020-05-06 17:32     ` Yonghong Song
  0 siblings, 0 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-06 17:32 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 10:21 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:29 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> This patch adds netlink and ipv6_route targets, using
>> the same seq_ops (except show() and minor changes for stop())
>> as /proc/net/{netlink,ipv6_route}.
>>
>> The net namespace for these targets is the current net
>> namespace at file open time, similar to
>> /proc/net/{netlink,ipv6_route} reference counting
>> the net namespace at seq_file open time.
>>
>> Since modules are not supported for now, ipv6_route is
>> supported only if IPV6 is built-in, i.e., not compiled
>> as a module. The restriction can be lifted once modules
>> are properly supported for bpf_iter.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   fs/proc/proc_net.c       | 19 +++++++++
>>   include/linux/proc_fs.h  |  3 ++
>>   net/ipv6/ip6_fib.c       | 65 +++++++++++++++++++++++++++++-
>>   net/ipv6/route.c         | 27 +++++++++++++
>>   net/netlink/af_netlink.c | 87 +++++++++++++++++++++++++++++++++++++++-
>>   5 files changed, 197 insertions(+), 4 deletions(-)
>>
> 
> [...]
> 
>>   int __init ip6_route_init(void)
>>   {
>>          int ret;
>> @@ -6455,6 +6474,14 @@ int __init ip6_route_init(void)
>>          if (ret)
>>                  goto out_register_late_subsys;
>>
>> +#if IS_BUILTIN(CONFIG_IPV6)
>> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
>> +       ret = bpf_iter_register();
>> +       if (ret)
>> +               goto out_register_late_subsys;
> 
> Seems like the bpf_iter infra is missing an unregistering API.
> ip6_route_init(), if it fails, undoes all the registrations, so it
> should probably unregister the ipv6_route target as well?

Yes, it is. But not in this function. In this function,
bpf_iter_register() is the last call that could possibly cause an error,
so there is no need to unregister here.

But there is another cleanup function called outside of this
function; I need to do the proper unregister there.

Thanks for catching this.

> 
>> +#endif
>> +#endif
>> +
>>          for_each_possible_cpu(cpu) {
>>                  struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
>>
> 
> [...]
> 
>> +static void netlink_seq_stop(struct seq_file *seq, void *v)
>> +{
>> +       struct bpf_iter_meta meta;
>> +       struct bpf_prog *prog;
>> +
>> +       if (!v) {
>> +               meta.seq = seq;
>> +               prog = bpf_iter_get_info(&meta, true);
>> +               if (prog)
>> +                       netlink_prog_seq_show(prog, &meta, v);
> 
> nit: netlink_prog_seq_show() can return a failure (from the BPF program),
> but you are not returning it. Given seq_file's stop is not supposed to
> fail, you can explicitly cast the result to (void)? I think it's done in

Yes, we can do this. An explicit cast expresses the intention.

> a few other places in BPF code, when the return result is explicitly
> ignored.
> 
> 
>> +       }
>> +
>> +       netlink_native_seq_stop(seq, v);
>> +}
>> +#else
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-04  6:26 ` [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
@ 2020-05-06 17:37   ` Andrii Nakryiko
  2020-05-06 21:42     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06 17:37 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two helpers, bpf_seq_printf and bpf_seq_write, are added for
> writing data to the seq_file buffer.
>
> bpf_seq_printf supports the common format string flag/width/type
> fields, so that at least I can get identical results for the
> netlink and ipv6_route targets.
>

Does seq_printf() have its own format string specification? Is there
any documentation explaining it? I was confused by a few of the checks
below...

> For bpf_seq_printf and bpf_seq_write, the return value -EOVERFLOW
> specifically indicates a write failure due to overflow, which
> means the object will be repeated in the next bpf invocation
> if the object collection stays the same. Note that if the object
> collection is changed, depending on how collection traversal is
> done, even if the object is still in the collection, it may not
> be visited.
>
> bpf_seq_printf may return -EBUSY, meaning that the internal percpu
> buffer for memory copies of strings or other pointees is
> not available. A bpf program can return 1 to indicate that it
> wants the same object to be repeated. Right now, this should not
> happen on non-RT kernels since migrate_enable(), which guards
> the bpf prog call, calls preempt_enable().

You probably meant migrate_disable()/preempt_disable(), right? But
could it still happen, at least due to NMI? E.g., a perf_event BPF
program gets triggered during bpf_iter program execution? I think for
the perf_event_output function we have 3 levels, one for each possible
"context"? Should we do something like that here as well?

>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/uapi/linux/bpf.h       |  32 +++++-
>  kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
>  scripts/bpf_helpers_doc.py     |   2 +
>  tools/include/uapi/linux/bpf.h |  32 +++++-
>  4 files changed, 259 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 97ceb0f2e539..e440a9d5cca2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3076,6 +3076,34 @@ union bpf_attr {
>   *             See: clock_gettime(CLOCK_BOOTTIME)
>   *     Return
>   *             Current *ktime*.
> + *

[...]

> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
> +          const void *, data, u32, data_len)
> +{
> +       int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
> +       int i, buf_used, copy_size, num_args;
> +       u64 params[MAX_SEQ_PRINTF_VARARGS];
> +       struct bpf_seq_printf_buf *bufs;
> +       const u64 *args = data;
> +
> +       buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
> +       if (WARN_ON_ONCE(buf_used > 1)) {
> +               err = -EBUSY;
> +               goto out;
> +       }
> +
> +       bufs = this_cpu_ptr(&bpf_seq_printf_buf);
> +
> +       /*
> +        * bpf_check()->check_func_arg()->check_stack_boundary()
> +        * guarantees that fmt points to bpf program stack,
> +        * fmt_size bytes of it were initialized and fmt_size > 0
> +        */
> +       if (fmt[--fmt_size] != 0)

If we allow fmt_size == 0, this will need to be changed.

> +               goto out;
> +
> +       if (data_len & 7)
> +               goto out;
> +
> +       for (i = 0; i < fmt_size; i++) {
> +               if (fmt[i] == '%' && (!data || !data_len))

So %% escaping is not supported?

> +                       goto out;
> +       }
> +
> +       num_args = data_len / 8;
> +
> +       /* check format string for allowed specifiers */
> +       for (i = 0; i < fmt_size; i++) {
> +               if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))

why these restrictions? are they essential?

> +                       goto out;
> +
> +               if (fmt[i] != '%')
> +                       continue;
> +
> +               if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
> +                       err = -E2BIG;
> +                       goto out;
> +               }
> +
> +               if (fmt_cnt >= num_args)
> +                       goto out;
> +
> +               /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
> +               i++;
> +
> +               /* skip optional "[0+-][num]" width formating field */
> +               while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')

There could be a space as well, as an alternative to 0.
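
I.e.:

	/* skip optional "[0 +-][num]" width formatting field */
	while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
	       fmt[i] == ' ')
		i++;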

> +                       i++;
> +               if (fmt[i] >= '1' && fmt[i] <= '9') {
> +                       i++;
> +                       while (fmt[i] >= '0' && fmt[i] <= '9')
> +                               i++;
> +               }
> +
> +               if (fmt[i] == 's') {
> +                       /* disallow any further format extensions */
> +                       if (fmt[i + 1] != 0 &&
> +                           !isspace(fmt[i + 1]) &&
> +                           !ispunct(fmt[i + 1]))
> +                               goto out;

I'm not sure I follow this check either. printf("%sbla", "whatever")
is a perfectly fine format string. Unless seq_printf has some
additional restrictions?

> +
> +                       /* try our best to copy */
> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
> +                               err = -E2BIG;
> +                               goto out;
> +                       }
> +

[...]

> +
> +static int bpf_seq_printf_btf_ids[5];
> +static const struct bpf_func_proto bpf_seq_printf_proto = {
> +       .func           = bpf_seq_printf,
> +       .gpl_only       = true,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,

It feels like allowing zero shouldn't hurt too much?

> +       .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
> +       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
> +       .btf_id         = bpf_seq_printf_btf_ids,
> +};
> +
> +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
> +{
> +       return seq_write(m, data, len) ? -EOVERFLOW : 0;
> +}
> +
> +static int bpf_seq_write_btf_ids[5];
> +static const struct bpf_func_proto bpf_seq_write_proto = {
> +       .func           = bpf_seq_write,
> +       .gpl_only       = true,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,

Same, ARG_CONST_SIZE_OR_ZERO?

> +       .btf_id         = bpf_seq_write_btf_ids,
> +};
> +

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 14/20] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary
  2020-05-04  6:26 ` [PATCH bpf-next v2 14/20] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary Yonghong Song
@ 2020-05-06 17:38   ` Andrii Nakryiko
  2020-05-06 21:47     ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06 17:38 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>
> This specifically to handle the case like below:
>    // ptr below is a socket ptr identified by PTR_TO_BTF_ID
>    u64 param[2] = { ptr, val };
>    bpf_seq_printf(seq, fmt, sizeof(fmt), param, sizeof(param));
>
> In this case, the 16 bytes stack for "param" contains:
>    8 bytes for ptr with spilled PTR_TO_BTF_ID
>    8 bytes for val as STACK_MISC
>
> The current verifier will complain that the ptr should not be visible
> to the helper.
>    ...
>    16: (7b) *(u64 *)(r10 -64) = r2
>    18: (7b) *(u64 *)(r10 -56) = r1
>    19: (bf) r4 = r10
>    ;
>    20: (07) r4 += -64
>    ; BPF_SEQ_PRINTF(seq, fmt1, (long)s, s->sk_protocol);
>    21: (bf) r1 = r6
>    22: (18) r2 = 0xffffa8d00018605a
>    24: (b4) w3 = 10
>    25: (b4) w5 = 16
>    26: (85) call bpf_seq_printf#125
>     R0=inv(id=0) R1_w=ptr_seq_file(id=0,off=0,imm=0)
>     R2_w=map_value(id=0,off=90,ks=4,vs=144,imm=0) R3_w=inv10
>     R4_w=fp-64 R5_w=inv16 R6=ptr_seq_file(id=0,off=0,imm=0)
>     R7=ptr_netlink_sock(id=0,off=0,imm=0) R10=fp0 fp-56_w=mmmmmmmm
>     fp-64_w=ptr_
>    last_idx 26 first_idx 13
>    regs=8 stack=0 before 25: (b4) w5 = 16
>    regs=8 stack=0 before 24: (b4) w3 = 10
>    invalid indirect read from stack off -64+0 size 16
>
> Let us permit this if the program is a tracing/iter program.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM, but I wonder why this is enabled only for iterator programs?

Acked-by: Andrii Nakryiko <andriin@fb.com>


>  kernel/bpf/verifier.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 36b2a38a06fe..4884b6fd7bad 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3494,6 +3494,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
>                         *stype = STACK_MISC;
>                         goto mark;
>                 }
> +
> +               /* pointer value can be visible to tracing/iter program */
> +               if (env->prog->type == BPF_PROG_TYPE_TRACING &&
> +                   env->prog->expected_attach_type == BPF_TRACE_ITER &&

What's the problem with allowing this for all program types?

> +                   state->stack[spi].slot_type[0] == STACK_SPILL &&
> +                   state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
> +                       goto mark;
> +
>                 if (state->stack[spi].slot_type[0] == STACK_SPILL &&
>                     state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
>                         __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
> --
> 2.24.1
>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 15/20] bpf: support variable length array in tracing programs
  2020-05-04  6:26 ` [PATCH bpf-next v2 15/20] bpf: support variable length array in tracing programs Yonghong Song
@ 2020-05-06 17:40   ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06 17:40 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Sun, May 3, 2020 at 11:27 PM Yonghong Song <yhs@fb.com> wrote:
>
> In /proc/net/ipv6_route, we have
>   struct fib6_info {
>     struct fib6_table *fib6_table;
>     ...
>     struct fib6_nh fib6_nh[0];
>   }
>   struct fib6_nh {
>     struct fib_nh_common nh_common;
>     struct rt6_info **rt6i_pcpu;
>     struct rt6_exception_bucket *rt6i_exception_bucket;
>   };
>   struct fib_nh_common {
>     ...
>     u8 nhc_gw_family;
>     ...
>   }
>
> The access:
>   struct fib6_nh *fib6_nh = &rt->fib6_nh;
>   ... fib6_nh->nh_common.nhc_gw_family ...
>
> This patch ensures such an access is handled properly.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM.

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  kernel/bpf/btf.c | 37 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 37 insertions(+)
>

[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-06  3:09         ` Andrii Nakryiko
@ 2020-05-06 18:08           ` Alexei Starovoitov
  0 siblings, 0 replies; 62+ messages in thread
From: Alexei Starovoitov @ 2020-05-06 18:08 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Yonghong Song, Andrii Nakryiko, bpf, Martin KaFai Lau,
	Networking, Daniel Borkmann, Kernel Team

On 5/5/20 8:09 PM, Andrii Nakryiko wrote:
> On Tue, May 5, 2020 at 5:54 PM Alexei Starovoitov <ast@fb.com> wrote:
>>
>> On 5/5/20 5:14 PM, Yonghong Song wrote:
>>>
>>>
>>> On 5/5/20 2:30 PM, Andrii Nakryiko wrote:
>>>> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>>>>
> >>>>> Given a bpf program, the steps to create an anonymous bpf iterator are:
> >>>>>     - create a bpf_iter_link, which combines the bpf program and the target.
> >>>>>       In the future, there could be more information recorded in the link.
> >>>>>       A link_fd will be returned to the user space.
> >>>>>     - create an anonymous bpf iterator with the given link_fd.
> >>>>>
> >>>>> The bpf_iter_link can be pinned to bpffs mount file system to
> >>>>> create a file based bpf iterator as well.
> >>>>>
> >>>>> The benefits of using bpf_iter_link:
> >>>>>     - using a bpf link simplifies design and implementation, as bpf links
> >>>>>       are used for other tracing bpf programs.
> >>>>>     - for file based bpf iterators, bpf_iter_link provides a standard
> >>>>>       way to replace underlying bpf programs.
> >>>>>     - for both anonymous and file based iterators, bpf link query
> >>>>>       capability can be leveraged.
> >>>>>
> >>>>> The patch adds support for tracing/iter programs in BPF_LINK_CREATE.
> >>>>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
> >>>>> querying. Currently, only prog_id is needed, so no additional
> >>>>> in-kernel show_fdinfo() and fill_link_info() hook is needed for
> >>>>> the BPF_LINK_TYPE_ITER link.
>>>>>
>>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>>> ---
>>>>
>>>> LGTM. See small nit about __GFP_NOWARN.
>>>>
>>>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>>>>
>>>>
>>>>>    include/linux/bpf.h            |  1 +
>>>>>    include/linux/bpf_types.h      |  1 +
>>>>>    include/uapi/linux/bpf.h       |  1 +
>>>>>    kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>>>>    kernel/bpf/syscall.c           | 14 ++++++++
>>>>>    tools/include/uapi/linux/bpf.h |  1 +
>>>>>    6 files changed, 80 insertions(+)
>>>>>
>>>>
>>>> [...]
>>>>
>>>>> +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog
>>>>> *prog)
>>>>> +{
>>>>> +       struct bpf_link_primer link_primer;
>>>>> +       struct bpf_iter_target_info *tinfo;
>>>>> +       struct bpf_iter_link *link;
>>>>> +       bool existed = false;
>>>>> +       u32 prog_btf_id;
>>>>> +       int err;
>>>>> +
>>>>> +       if (attr->link_create.target_fd || attr->link_create.flags)
>>>>> +               return -EINVAL;
>>>>> +
>>>>> +       prog_btf_id = prog->aux->attach_btf_id;
>>>>> +       mutex_lock(&targets_mutex);
>>>>> +       list_for_each_entry(tinfo, &targets, list) {
>>>>> +               if (tinfo->btf_id == prog_btf_id) {
>>>>> +                       existed = true;
>>>>> +                       break;
>>>>> +               }
>>>>> +       }
>>>>> +       mutex_unlock(&targets_mutex);
>>>>> +       if (!existed)
>>>>> +               return -ENOENT;
>>>>> +
>>>>> +       link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
>>>>
>>>> nit: all existing link implementation don't specify __GFP_NOWARN,
>>>> wonder if bpf_iter_link should be special?
>>>
> >>> Nothing special. I just feel __GFP_NOWARN is the right thing to do to
> >>> avoid polluting dmesg, since -ENOMEM is returned to user space. But in
> >>> reality, unlike some key/value allocations where the size could be huge
> >>> and __GFP_NOWARN might be more useful, here sizeof(*link) is fixed
> >>> and small, so __GFP_NOWARN is probably not that useful.
>>>
>>> Will drop it.
>>
>> actually all existing user space driven allocations have nowarn.
> 
> Can you define "user space driven"? I understand why for map, map key,
> map value, and program we want to do that, because it's way too easy for
> user-space to specify huge sizes and the allocation is proportional to
> that size. But in this case links are fixed-size objects, same as
> struct file and struct inode. From the BPF world, for instance, there is
> struct bpf_prog_list, which is created when a user attaches a BPF
> program to a cgroup, so it is user-space driven in a similar sense. Yet
> we allocate it without __GFP_NOWARN.

For tiny objects it doesn't really matter. If slab cannot allocate
another single page, the system is in bad shape and the warn is good
to have in most cases, but when it's user driven like here, that
warn won't help kernel developers debug OOMs. Most likely a NIC driver
is spamming the page alloc warn at that point.
In this particular case the bpf_iter arguments will likely grow
and the struct will grow too, but probably not to the point of
kmalloc_large, so it's really fine whichever way.
Personally I would keep nowarn here.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets
  2020-05-06  7:30   ` Andrii Nakryiko
@ 2020-05-06 18:24     ` Yonghong Song
  2020-05-06 20:51       ` Andrii Nakryiko
  0 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-06 18:24 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Only the tasks belonging to the "current" pid namespace
>> are enumerated.
>>
>> For task/file target, the bpf program will have access to
>>    struct task_struct *task
>>    u32 fd
>>    struct file *file
>> where fd/file is an open file for the task.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> I might be missing some subtleties with task refcounting for the
> task_file iterator; I asked a few questions below...
> 
>>   kernel/bpf/Makefile    |   2 +-
>>   kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 337 insertions(+), 1 deletion(-)
>>   create mode 100644 kernel/bpf/task_iter.c
>>
>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>> index b2b5eefc5254..37b2d8620153 100644
>> --- a/kernel/bpf/Makefile
>> +++ b/kernel/bpf/Makefile
>> @@ -2,7 +2,7 @@
>>   obj-y := core.o
>>   CFLAGS_core.o += $(call cc-disable-warning, override-init)
>>
>> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
>> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
>>   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>>   obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>>   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> new file mode 100644
>> index 000000000000..1ca258f6e9f4
>> --- /dev/null
>> +++ b/kernel/bpf/task_iter.c
>> @@ -0,0 +1,336 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/* Copyright (c) 2020 Facebook */
>> +
>> +#include <linux/init.h>
>> +#include <linux/namei.h>
>> +#include <linux/pid_namespace.h>
>> +#include <linux/fs.h>
>> +#include <linux/fdtable.h>
>> +#include <linux/filter.h>
>> +
>> +struct bpf_iter_seq_task_common {
>> +       struct pid_namespace *ns;
>> +};
>> +
>> +struct bpf_iter_seq_task_info {
>> +       struct bpf_iter_seq_task_common common;
> 
> you have comment below in init_seq_pidns() that common is supposed to
> be the very first field, but I think it's more important and
> appropriate here, so that whoever adds anything here knows that order
> of field is important.

I can move the comments here.

> 
>> +       struct task_struct *task;
>> +       u32 id;
>> +};
>> +
> 
> [...]
> 
>> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
>> +{
>> +       struct bpf_iter_meta meta;
>> +       struct bpf_iter__task ctx;
>> +       struct bpf_prog *prog;
>> +       int ret = 0;
>> +
>> +       meta.seq = seq;
>> +       prog = bpf_iter_get_info(&meta, in_stop);
>> +       if (prog) {
> 
> 
> nit: `if (!prog) return 0;` here would reduce nesting level below
> 
>> +               meta.seq = seq;
>> +               ctx.meta = &meta;
>> +               ctx.task = v;
>> +               ret = bpf_iter_run_prog(prog, &ctx);
>> +       }
>> +
>> +       return 0;
> 
> return **ret**; ?

It should return "ret". In task_file show(), the code is similar but
correct. I can do an early return with !prog too although we do not
have a deep nesting level yet.

> 
>> +}
>> +
> 
> [...]
> 
>> +
>> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
>> +                                          int *fd, struct task_struct **task,
>> +                                          struct files_struct **fstruct)
>> +{
>> +       struct files_struct *files;
>> +       struct task_struct *tk;
>> +       u32 sid = *id;
>> +       int sfd;
>> +
>> +       /* If this function returns a non-NULL file object,
>> +        * it held a reference to the files_struct and file.
>> +        * Otherwise, it does not hold any reference.
>> +        */
>> +again:
>> +       if (*fstruct) {
>> +               files = *fstruct;
>> +               sfd = *fd;
>> +       } else {
>> +               tk = task_seq_get_next(ns, &sid);
>> +               if (!tk)
>> +                       return NULL;
>> +
>> +               files = get_files_struct(tk);
>> +               put_task_struct(tk);
> 
> task is put here, but is still used below.. is there some additional
> hidden refcounting involved?

Good question. I had the impression that we take a reference count
on task->files so the task should not go away. But reading the Linux
code again, I do not have sufficient evidence to back my claim.
So I will reference count the task as well, i.e., not call
put_task_struct() until all files are done here.
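Roughly (untested sketch):

        tk = task_seq_get_next(ns, &sid);
        if (!tk)
                return NULL;

        files = get_files_struct(tk);
        if (!files) {
                put_task_struct(tk);
                sid = ++(*id);
                *fd = 0;
                goto again;
        }
        /* hold the task reference together with the files reference */
        *fstruct = files;
        *task = tk;

and in the "current task is done" path:

        put_files_struct(files);
        put_task_struct(*task);
        *task = NULL;
        *fstruct = NULL;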

> 
>> +               if (!files) {
>> +                       sid = ++(*id);
>> +                       *fd = 0;
>> +                       goto again;
>> +               }
>> +               *fstruct = files;
>> +               *task = tk;
>> +               if (sid == *id) {
>> +                       sfd = *fd;
>> +               } else {
>> +                       *id = sid;
>> +                       sfd = 0;
>> +               }
>> +       }
>> +
>> +       rcu_read_lock();
>> +       for (; sfd < files_fdtable(files)->max_fds; sfd++) {
> 
> files_fdtable does rcu_dereference on each iteration, would it be
> better to just cache files_fdtable(files)->max_fds into local
> variable? It's unlikely that there will be many iterations, but
> still...

I borrowed the code from fs/proc/fd.c. But I can certainly avoid
repeatedly reading max_fds as suggested.
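Something like this (untested sketch):

        unsigned int max_fds;

        rcu_read_lock();
        /* read max_fds once instead of rcu_dereference() per iteration */
        max_fds = files_fdtable(files)->max_fds;
        for (; sfd < max_fds; sfd++) {
                ...
        }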

> 
>> +               struct file *f;
>> +
>> +               f = fcheck_files(files, sfd);
>> +               if (!f)
>> +                       continue;
>> +               *fd = sfd;
>> +               get_file(f);
>> +               rcu_read_unlock();
>> +               return f;
>> +       }
>> +
>> +       /* the current task is done, go to the next task */
>> +       rcu_read_unlock();
>> +       put_files_struct(files);
>> +       *fstruct = NULL;
> 
> *task = NULL; for completeness?

if *fstruct == NULL, we will try to get the next task, so *task = NULL
is unnecessary, but I can add it; it won't hurt and possibly makes
it easier to understand.

> 
>> +       sid = ++(*id);
>> +       *fd = 0;
>> +       goto again;
>> +}
>> +
>> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
>> +{
>> +       struct bpf_iter_seq_task_file_info *info = seq->private;
>> +       struct files_struct *files = NULL;
>> +       struct task_struct *task = NULL;
>> +       struct file *file;
>> +       u32 id = info->id;
>> +       int fd = info->fd;
>> +
>> +       file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
>> +       if (!file) {
>> +               info->files = NULL;
> 
> what about info->task here?

info->files == NULL indicates the end of the iteration; info->task will
not be checked anymore. But I guess I can assign NULL to task as well to
avoid confusion.

> 
>> +               return NULL;
>> +       }
>> +
>> +       ++*pos;
>> +       info->id = id;
>> +       info->fd = fd;
>> +       info->task = task;
>> +       info->files = files;
>> +
>> +       return file;
>> +}
>> +
> 
> [...]
> 
>> +
>> +struct bpf_iter__task_file {
>> +       __bpf_md_ptr(struct bpf_iter_meta *, meta);
>> +       __bpf_md_ptr(struct task_struct *, task);
>> +       u32 fd;
> 
> nit: sort of works by accident (due to all other field being 8-byte
> aligned pointers), shall we add __attribute__((aligned(8)))?

This is what I thought as well. It should work. But I think
adding aligned(8) won't hurt to express the intention. Will add it.
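I.e. the struct would become (sketch):

        struct bpf_iter__task_file {
                __bpf_md_ptr(struct bpf_iter_meta *, meta);
                __bpf_md_ptr(struct task_struct *, task);
                u32 fd __attribute__((aligned(8)));
                __bpf_md_ptr(struct file *, file);
        };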

> 
>> +       __bpf_md_ptr(struct file *, file);
>> +};
>> +
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets
  2020-05-06 18:24     ` Yonghong Song
@ 2020-05-06 20:51       ` Andrii Nakryiko
  2020-05-06 21:20         ` Yonghong Song
  0 siblings, 1 reply; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-06 20:51 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 11:24 AM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
> > On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Only the tasks belonging to "current" pid namespace
> >> are enumerated.
> >>
> >> For task/file target, the bpf program will have access to
> >>    struct task_struct *task
> >>    u32 fd
> >>    struct file *file
> >> where fd/file is an open file for the task.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >
> > I might be missing some subtleties with task refcounting for task_file
> > iterator, asked few questions below...
> >
> >>   kernel/bpf/Makefile    |   2 +-
> >>   kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
> >>   2 files changed, 337 insertions(+), 1 deletion(-)
> >>   create mode 100644 kernel/bpf/task_iter.c
> >>
> >> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> >> index b2b5eefc5254..37b2d8620153 100644
> >> --- a/kernel/bpf/Makefile
> >> +++ b/kernel/bpf/Makefile
> >> @@ -2,7 +2,7 @@
> >>   obj-y := core.o
> >>   CFLAGS_core.o += $(call cc-disable-warning, override-init)
> >>
> >> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
> >> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> new file mode 100644
> >> index 000000000000..1ca258f6e9f4
> >> --- /dev/null
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -0,0 +1,336 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +/* Copyright (c) 2020 Facebook */
> >> +
> >> +#include <linux/init.h>
> >> +#include <linux/namei.h>
> >> +#include <linux/pid_namespace.h>
> >> +#include <linux/fs.h>
> >> +#include <linux/fdtable.h>
> >> +#include <linux/filter.h>
> >> +
> >> +struct bpf_iter_seq_task_common {
> >> +       struct pid_namespace *ns;
> >> +};
> >> +
> >> +struct bpf_iter_seq_task_info {
> >> +       struct bpf_iter_seq_task_common common;
> >
> > you have comment below in init_seq_pidns() that common is supposed to
> > be the very first field, but I think it's more important and
> > appropriate here, so that whoever adds anything here knows that order
> > of field is important.
>
> I can move the comments here.
>
> >
> >> +       struct task_struct *task;
> >> +       u32 id;
> >> +};
> >> +
> >
> > [...]
> >
> >> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
> >> +{
> >> +       struct bpf_iter_meta meta;
> >> +       struct bpf_iter__task ctx;
> >> +       struct bpf_prog *prog;
> >> +       int ret = 0;
> >> +
> >> +       meta.seq = seq;
> >> +       prog = bpf_iter_get_info(&meta, in_stop);
> >> +       if (prog) {
> >
> >
> > nit: `if (!prog) return 0;` here would reduce nesting level below
> >
> >> +               meta.seq = seq;
> >> +               ctx.meta = &meta;
> >> +               ctx.task = v;
> >> +               ret = bpf_iter_run_prog(prog, &ctx);
> >> +       }
> >> +
> >> +       return 0;
> >
> > return **ret**; ?
>
> It should return "ret". In task_file show(), the code is similar but
> correct. I can do an early return with !prog too although we do not
> have a deep nesting level yet.
>
> >
> >> +}
> >> +
> >
> > [...]
> >
> >> +
> >> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
> >> +                                          int *fd, struct task_struct **task,
> >> +                                          struct files_struct **fstruct)
> >> +{
> >> +       struct files_struct *files;
> >> +       struct task_struct *tk;
> >> +       u32 sid = *id;
> >> +       int sfd;
> >> +
> >> +       /* If this function returns a non-NULL file object,
> >> +        * it held a reference to the files_struct and file.
> >> +        * Otherwise, it does not hold any reference.
> >> +        */
> >> +again:
> >> +       if (*fstruct) {
> >> +               files = *fstruct;
> >> +               sfd = *fd;
> >> +       } else {
> >> +               tk = task_seq_get_next(ns, &sid);
> >> +               if (!tk)
> >> +                       return NULL;
> >> +
> >> +               files = get_files_struct(tk);
> >> +               put_task_struct(tk);
> >
> > task is put here, but is still used below.. is there some additional
> > hidden refcounting involved?
>
> Good question. I had the impression that we take a reference count
> on task->files so the task should not go away. But reading the Linux
> code again, I do not have sufficient evidence to back my claim.
> So I will reference count the task as well, i.e., not call
> put_task_struct() until all files are done here.

All threads within the process share the files table. So some threads
might exit, but the files will stay, which is why task_struct and
files_struct have separate refcounting, and having a refcount on files
doesn't guarantee any particular task will stay alive for long enough.
So I think we need to refcount both files and task in this case.
Reading the source code of copy_files() in kernel/fork.c (the CLONE_FILES
flag just bumps the refcnt on the old process' files_struct) seems to
confirm this as well.
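The relevant part of copy_files() is roughly (paraphrased):

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }
        newf = dup_fd(oldf, &error);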

>
> >
> >> +               if (!files) {
> >> +                       sid = ++(*id);
> >> +                       *fd = 0;
> >> +                       goto again;
> >> +               }
> >> +               *fstruct = files;
> >> +               *task = tk;
> >> +               if (sid == *id) {
> >> +                       sfd = *fd;
> >> +               } else {
> >> +                       *id = sid;
> >> +                       sfd = 0;
> >> +               }
> >> +       }
> >> +
> >> +       rcu_read_lock();
> >> +       for (; sfd < files_fdtable(files)->max_fds; sfd++) {
> >
> > files_fdtable does rcu_dereference on each iteration, would it be
> > better to just cache files_fdtable(files)->max_fds into local
> > variable? It's unlikely that there will be many iterations, but
> > still...
>
> I borrowed the code from fs/proc/fd.c. But I can certainly avoid
> repeatedly reading max_fds as suggested.
>
> >
> >> +               struct file *f;
> >> +
> >> +               f = fcheck_files(files, sfd);
> >> +               if (!f)
> >> +                       continue;
> >> +               *fd = sfd;
> >> +               get_file(f);
> >> +               rcu_read_unlock();
> >> +               return f;
> >> +       }
> >> +
> >> +       /* the current task is done, go to the next task */
> >> +       rcu_read_unlock();
> >> +       put_files_struct(files);
> >> +       *fstruct = NULL;
> >
> > *task = NULL; for completeness?
>
> if *fstruct == NULL, we will try to get the next task, so *task = NULL
> is unnecessary, but I can add it; it won't hurt and possibly makes
> it easier to understand.
>
> >
> >> +       sid = ++(*id);
> >> +       *fd = 0;
> >> +       goto again;
> >> +}
> >> +
> >> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
> >> +{
> >> +       struct bpf_iter_seq_task_file_info *info = seq->private;
> >> +       struct files_struct *files = NULL;
> >> +       struct task_struct *task = NULL;
> >> +       struct file *file;
> >> +       u32 id = info->id;
> >> +       int fd = info->fd;
> >> +
> >> +       file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
> >> +       if (!file) {
> >> +               info->files = NULL;
> >
> > what about info->task here?
>
> info->files == NULL indicates the end of the iteration; info->task will
> not be checked anymore. But I guess I can assign NULL to task as well to
> avoid confusion.
>
> >
> >> +               return NULL;
> >> +       }
> >> +
> >> +       ++*pos;
> >> +       info->id = id;
> >> +       info->fd = fd;
> >> +       info->task = task;
> >> +       info->files = files;
> >> +
> >> +       return file;
> >> +}
> >> +
> >
> > [...]
> >
> >> +
> >> +struct bpf_iter__task_file {
> >> +       __bpf_md_ptr(struct bpf_iter_meta *, meta);
> >> +       __bpf_md_ptr(struct task_struct *, task);
> >> +       u32 fd;
> >
> > nit: sort of works by accident (due to all other field being 8-byte
> > aligned pointers), shall we add __attribute__((aligned(8)))?
>
> This is what I thought as well. It should work. But I think
> adding aligned(8) won't hurt to express the intention. Will add it.
>
> >
> >> +       __bpf_md_ptr(struct file *, file);
> >> +};
> >> +
> >
> > [...]
> >

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets
  2020-05-06 20:51       ` Andrii Nakryiko
@ 2020-05-06 21:20         ` Yonghong Song
  0 siblings, 0 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-06 21:20 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/6/20 1:51 PM, Andrii Nakryiko wrote:
> On Wed, May 6, 2020 at 11:24 AM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
>>> On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>>>>
>>>> Only the tasks belonging to "current" pid namespace
>>>> are enumerated.
>>>>
>>>> For task/file target, the bpf program will have access to
>>>>     struct task_struct *task
>>>>     u32 fd
>>>>     struct file *file
>>>> where fd/file is an open file for the task.
>>>>
>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>> ---
>>>
>>> I might be missing some subtleties with task refcounting for task_file
>>> iterator, asked few questions below...
>>>
>>>>    kernel/bpf/Makefile    |   2 +-
>>>>    kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
>>>>    2 files changed, 337 insertions(+), 1 deletion(-)
>>>>    create mode 100644 kernel/bpf/task_iter.c
>>>>
>>>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>>>> index b2b5eefc5254..37b2d8620153 100644
>>>> --- a/kernel/bpf/Makefile
>>>> +++ b/kernel/bpf/Makefile
>>>> @@ -2,7 +2,7 @@
>>>>    obj-y := core.o
>>>>    CFLAGS_core.o += $(call cc-disable-warning, override-init)
>>>>
>>>> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
>>>> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
>>>>    obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>>>>    obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
>>>>    obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>>>> new file mode 100644
>>>> index 000000000000..1ca258f6e9f4
>>>> --- /dev/null
>>>> +++ b/kernel/bpf/task_iter.c
>>>> @@ -0,0 +1,336 @@
>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>> +/* Copyright (c) 2020 Facebook */
>>>> +
>>>> +#include <linux/init.h>
>>>> +#include <linux/namei.h>
>>>> +#include <linux/pid_namespace.h>
>>>> +#include <linux/fs.h>
>>>> +#include <linux/fdtable.h>
>>>> +#include <linux/filter.h>
>>>> +
>>>> +struct bpf_iter_seq_task_common {
>>>> +       struct pid_namespace *ns;
>>>> +};
>>>> +
>>>> +struct bpf_iter_seq_task_info {
>>>> +       struct bpf_iter_seq_task_common common;
>>>
>>> you have comment below in init_seq_pidns() that common is supposed to
>>> be the very first field, but I think it's more important and
>>> appropriate here, so that whoever adds anything here knows that order
>>> of field is important.
>>
>> I can move the comments here.
>>
>>>
>>>> +       struct task_struct *task;
>>>> +       u32 id;
>>>> +};
>>>> +
>>>
>>> [...]
>>>
>>>> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
>>>> +{
>>>> +       struct bpf_iter_meta meta;
>>>> +       struct bpf_iter__task ctx;
>>>> +       struct bpf_prog *prog;
>>>> +       int ret = 0;
>>>> +
>>>> +       meta.seq = seq;
>>>> +       prog = bpf_iter_get_info(&meta, in_stop);
>>>> +       if (prog) {
>>>
>>>
>>> nit: `if (!prog) return 0;` here would reduce nesting level below
>>>
>>>> +               meta.seq = seq;
>>>> +               ctx.meta = &meta;
>>>> +               ctx.task = v;
>>>> +               ret = bpf_iter_run_prog(prog, &ctx);
>>>> +       }
>>>> +
>>>> +       return 0;
>>>
>>> return **ret**; ?
>>
>> It should return "ret". In task_file show(), the code is similar but
>> correct. I can do an early return with !prog too although we do not
>> have a deep nesting level yet.
>>
>>>
>>>> +}
>>>> +
>>>
>>> [...]
>>>
>>>> +
>>>> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
>>>> +                                          int *fd, struct task_struct **task,
>>>> +                                          struct files_struct **fstruct)
>>>> +{
>>>> +       struct files_struct *files;
>>>> +       struct task_struct *tk;
>>>> +       u32 sid = *id;
>>>> +       int sfd;
>>>> +
>>>> +       /* If this function returns a non-NULL file object,
>>>> +        * it held a reference to the files_struct and file.
>>>> +        * Otherwise, it does not hold any reference.
>>>> +        */
>>>> +again:
>>>> +       if (*fstruct) {
>>>> +               files = *fstruct;
>>>> +               sfd = *fd;
>>>> +       } else {
>>>> +               tk = task_seq_get_next(ns, &sid);
>>>> +               if (!tk)
>>>> +                       return NULL;
>>>> +
>>>> +               files = get_files_struct(tk);
>>>> +               put_task_struct(tk);
>>>
>>> task is put here, but is still used below.. is there some additional
>>> hidden refcounting involved?
>>
>> Good question. I had the impression that we take a reference count
>> on task->files so the task should not go away. But reading the Linux
>> code again, I do not have sufficient evidence to back my claim.
>> So I will reference count the task as well, i.e., not call
>> put_task_struct() until all files are done here.
> 
> All threads within the process share the files table. So some threads
> might exit, but the files will stay, which is why task_struct and
> files_struct have separate refcounting, and having a refcount on files
> doesn't guarantee any particular task will stay alive for long enough.
> So I think we need to refcount both files and task in this case.
> Reading the source code of copy_files() in kernel/fork.c (the CLONE_FILES
> flag just bumps the refcnt on the old process' files_struct) seems to
> confirm this as well.

Just checked the code. It does look like files are shared among
threads (tasks). So yes, in this case, reference counting of both
the task and the files_struct is needed.

> 
>>
>>>
>>>> +               if (!files) {
>>>> +                       sid = ++(*id);
>>>> +                       *fd = 0;
>>>> +                       goto again;
>>>> +               }
>>>> +               *fstruct = files;
>>>> +               *task = tk;
>>>> +               if (sid == *id) {
>>>> +                       sfd = *fd;
>>>> +               } else {
>>>> +                       *id = sid;
>>>> +                       sfd = 0;
>>>> +               }
>>>> +       }
>>>> +
>>>> +       rcu_read_lock();
[...]

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-06 17:37   ` Andrii Nakryiko
@ 2020-05-06 21:42     ` Yonghong Song
  2020-05-08 18:15       ` Andrii Nakryiko
  0 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-06 21:42 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/6/20 10:37 AM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Two helpers bpf_seq_printf and bpf_seq_write, are added for
>> writing data to the seq_file buffer.
>>
>> bpf_seq_printf supports common format string flag/width/type
>> fields so at least I can get identical results for
>> netlink and ipv6_route targets.
>>
> 
> Does seq_printf() have its own format string specification? Is there
> any documentation explaining it? I was confused by a few different
> checks below...

Not really. Similar to bpf_trace_printk(), since we need to
parse the format string, we may only support a subset of
what seq_printf() does. But we should not invent new
formats.

> 
>> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
>> specifically indicates a write failure due to overflow, which
>> means the object will be repeated in the next bpf invocation
>> if object collection stays the same. Note that if the object
>> collection is changed, depending how collection traversal is
>> done, even if the object still in the collection, it may not
>> be visited.
>>
>> bpf_seq_printf may return -EBUSY meaning that internal percpu
>> buffer for memory copy of strings or other pointees is
>> not available. Bpf program can return 1 to indicate it
>> wants the same object to be repeated. Right now, this should not
>> happen on no-RT kernels since migrate_enable(), which guards
>> bpf prog call, calls preempt_enable().
> 
> You probably meant migrate_disable()/preempt_disable(), right? But

Yes, sorry for typo.

> could it still happen, at least due to NMI? E.g., a perf_event BPF
> program gets triggered during bpf_iter program execution? I think for
> the perf_event_output function, we have 3 levels, one for each possible
> "context"? Should we do something like that here as well?

Currently the bpf_seq_printf() and bpf_seq_write() helpers can
only be called by iter bpf programs. The iter bpf program can only
be run in process context as it is triggered by a read() syscall.
So one level should be enough for a non-RT kernel.

For an RT kernel, migrate_disable does not prevent preemption,
so it is possible a task in the middle of bpf_seq_printf() might
be preempted, so I implemented the logic to return -EBUSY.
I think this case should be extremely rare so I only implemented
one level of nesting.

> 
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/uapi/linux/bpf.h       |  32 +++++-
>>   kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
>>   scripts/bpf_helpers_doc.py     |   2 +
>>   tools/include/uapi/linux/bpf.h |  32 +++++-
>>   4 files changed, 259 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 97ceb0f2e539..e440a9d5cca2 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3076,6 +3076,34 @@ union bpf_attr {
>>    *             See: clock_gettime(CLOCK_BOOTTIME)
>>    *     Return
>>    *             Current *ktime*.
>> + *
> 
> [...]
> 
>> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
>> +          const void *, data, u32, data_len)
>> +{
>> +       int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
>> +       int i, buf_used, copy_size, num_args;
>> +       u64 params[MAX_SEQ_PRINTF_VARARGS];
>> +       struct bpf_seq_printf_buf *bufs;
>> +       const u64 *args = data;
>> +
>> +       buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
>> +       if (WARN_ON_ONCE(buf_used > 1)) {
>> +               err = -EBUSY;
>> +               goto out;
>> +       }
>> +
>> +       bufs = this_cpu_ptr(&bpf_seq_printf_buf);
>> +
>> +       /*
>> +        * bpf_check()->check_func_arg()->check_stack_boundary()
>> +        * guarantees that fmt points to bpf program stack,
>> +        * fmt_size bytes of it were initialized and fmt_size > 0
>> +        */
>> +       if (fmt[--fmt_size] != 0)
> 
> If we allow fmt_size == 0, this will need to be changed.

Currently, we do not support fmt_size == 0. Yes, if we allow, this
needs change.

> 
>> +               goto out;
>> +
>> +       if (data_len & 7)
>> +               goto out;
>> +
>> +       for (i = 0; i < fmt_size; i++) {
>> +               if (fmt[i] == '%' && (!data || !data_len))
> 
> So %% escaping is not supported?

Yes, I have not seen a need yet in my ipv6_route/netlink examples.
Can certainly add it if there is a use case.
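If we do add it, it should be something like (hypothetical sketch):

        /* treat "%%" as a literal '%', not a conversion specifier */
        if (fmt[i] == '%' && fmt[i + 1] == '%') {
                i++;
                continue;
        }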

> 
>> +                       goto out;
>> +       }
>> +
>> +       num_args = data_len / 8;
>> +
>> +       /* check format string for allowed specifiers */
>> +       for (i = 0; i < fmt_size; i++) {
>> +               if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
> 
> why these restrictions? are they essential?

This is the same restriction as in bpf_trace_printk(). I guess the
purpose is to avoid weird printing. To promote bpf_iter to dump beyond
ASCII, I guess we can remove this restriction.

> 
>> +                       goto out;
>> +
>> +               if (fmt[i] != '%')
>> +                       continue;
>> +
>> +               if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
>> +                       err = -E2BIG;
>> +                       goto out;
>> +               }
>> +
>> +               if (fmt_cnt >= num_args)
>> +                       goto out;
>> +
>> +               /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
>> +               i++;
>> +
>> +               /* skip optional "[0+-][num]" width formating field */
>> +               while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')
> 
> There could be space as well, as an alternative to 0.

We can allow space. But '0' is used more commonly, right?
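Allowing it would just be (sketch):

        /* skip optional "[ 0+-][num]" width formatting field */
        while (fmt[i] == ' ' || fmt[i] == '0' || fmt[i] == '+' ||
               fmt[i] == '-')
                i++;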

> 
>> +                       i++;
>> +               if (fmt[i] >= '1' && fmt[i] <= '9') {
>> +                       i++;
>> +                       while (fmt[i] >= '0' && fmt[i] <= '9')
>> +                               i++;
>> +               }
>> +
>> +               if (fmt[i] == 's') {
>> +                       /* disallow any further format extensions */
>> +                       if (fmt[i + 1] != 0 &&
>> +                           !isspace(fmt[i + 1]) &&
>> +                           !ispunct(fmt[i + 1]))
>> +                               goto out;
> 
> I'm not sure I follow this check either. printf("%sbla", "whatever")
> is a perfectly fine format string. Unless seq_printf has some
> additional restrictions?

Yes, just some restriction inherited from bpf_trace_printk().
Will remove.

> 
>> +
>> +                       /* try our best to copy */
>> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
>> +                               err = -E2BIG;
>> +                               goto out;
>> +                       }
>> +
> 
> [...]
> 
>> +
>> +static int bpf_seq_printf_btf_ids[5];
>> +static const struct bpf_func_proto bpf_seq_printf_proto = {
>> +       .func           = bpf_seq_printf,
>> +       .gpl_only       = true,
>> +       .ret_type       = RET_INTEGER,
>> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
>> +       .arg2_type      = ARG_PTR_TO_MEM,
>> +       .arg3_type      = ARG_CONST_SIZE,
> 
> It feels like allowing zero shouldn't hurt too much?

This is the format string, I would prefer to keep it non-zero.

> 
>> +       .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
>> +       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
>> +       .btf_id         = bpf_seq_printf_btf_ids,
>> +};
>> +
>> +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
>> +{
>> +       return seq_write(m, data, len) ? -EOVERFLOW : 0;
>> +}
>> +
>> +static int bpf_seq_write_btf_ids[5];
>> +static const struct bpf_func_proto bpf_seq_write_proto = {
>> +       .func           = bpf_seq_write,
>> +       .gpl_only       = true,
>> +       .ret_type       = RET_INTEGER,
>> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
>> +       .arg2_type      = ARG_PTR_TO_MEM,
>> +       .arg3_type      = ARG_CONST_SIZE,
> 
> Same, ARG_CONST_SIZE_OR_ZERO?

This one, possible. Let me check.

> 
>> +       .btf_id         = bpf_seq_write_btf_ids,
>> +};
>> +
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 14/20] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary
  2020-05-06 17:38   ` Andrii Nakryiko
@ 2020-05-06 21:47     ` Yonghong Song
  0 siblings, 0 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-06 21:47 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/6/20 10:38 AM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> This specifically to handle the case like below:
>>     // ptr below is a socket ptr identified by PTR_TO_BTF_ID
>>     u64 param[2] = { ptr, val };
>>     bpf_seq_printf(seq, fmt, sizeof(fmt), param, sizeof(param));
>>
>> In this case, the 16 bytes stack for "param" contains:
>>     8 bytes for ptr with spilled PTR_TO_BTF_ID
>>     8 bytes for val as STACK_MISC
>>
>> The current verifier will complain the ptr should not be visible
>> to the helper.
>>     ...
>>     16: (7b) *(u64 *)(r10 -64) = r2
>>     18: (7b) *(u64 *)(r10 -56) = r1
>>     19: (bf) r4 = r10
>>     ;
>>     20: (07) r4 += -64
>>     ; BPF_SEQ_PRINTF(seq, fmt1, (long)s, s->sk_protocol);
>>     21: (bf) r1 = r6
>>     22: (18) r2 = 0xffffa8d00018605a
>>     24: (b4) w3 = 10
>>     25: (b4) w5 = 16
>>     26: (85) call bpf_seq_printf#125
>>      R0=inv(id=0) R1_w=ptr_seq_file(id=0,off=0,imm=0)
>>      R2_w=map_value(id=0,off=90,ks=4,vs=144,imm=0) R3_w=inv10
>>      R4_w=fp-64 R5_w=inv16 R6=ptr_seq_file(id=0,off=0,imm=0)
>>      R7=ptr_netlink_sock(id=0,off=0,imm=0) R10=fp0 fp-56_w=mmmmmmmm
>>      fp-64_w=ptr_
>>     last_idx 26 first_idx 13
>>     regs=8 stack=0 before 25: (b4) w5 = 16
>>     regs=8 stack=0 before 24: (b4) w3 = 10
>>     invalid indirect read from stack off -64+0 size 16
>>
>> Let us permit this if the program is a tracing/iter program.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> LGTM, but I wonder why enabling this only for iterator programs?
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
> 
>>   kernel/bpf/verifier.c | 8 ++++++++
>>   1 file changed, 8 insertions(+)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 36b2a38a06fe..4884b6fd7bad 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -3494,6 +3494,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
>>                          *stype = STACK_MISC;
>>                          goto mark;
>>                  }
>> +
>> +               /* pointer value can be visible to tracing/iter program */
>> +               if (env->prog->type == BPF_PROG_TYPE_TRACING &&
>> +                   env->prog->expected_attach_type == BPF_TRACE_ITER &&
> 
> What's the problem allowing this for all program types?

Just want to be conservative here since we may leak kernel pointers.
But probably we are fine since the spill type is PTR_TO_BTF_ID,
which means tracing/raw_tp related bpf programs, which should
be okay. Will remove the above additional check, which I added
in v2 of the patch.
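I.e. the check would become just (sketch):

        /* pointer value can be visible to tracing programs */
        if (state->stack[spi].slot_type[0] == STACK_SPILL &&
            state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
                goto mark;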

> 
>> +                   state->stack[spi].slot_type[0] == STACK_SPILL &&
>> +                   state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
>> +                       goto mark;
>> +
>>                  if (state->stack[spi].slot_type[0] == STACK_SPILL &&
>>                      state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
>>                          __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
>> --
>> 2.24.1
>>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  2020-05-06  6:04   ` Andrii Nakryiko
@ 2020-05-06 23:07     ` Yonghong Song
  0 siblings, 0 replies; 62+ messages in thread
From: Yonghong Song @ 2020-05-06 23:07 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 11:04 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:30 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Two bpf programs are added in this patch for netlink and ipv6_route
>> target. On my VM, I am able to achieve identical
>> results compared to /proc/net/netlink and /proc/net/ipv6_route.
>>
>>    $ cat /proc/net/netlink
>>    sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>>    000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>>    00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>>    00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>>    000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>>    ....
>>    00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>>    000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>>    00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>>    000000008398fb08 16  0          00000000 0        0        0     2        0        27
>>    $ cat /sys/fs/bpf/my_netlink
>>    sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>>    000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>>    00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>>    00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>>    000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>>    ....
>>    00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>>    000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>>    00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>>    000000008398fb08 16  0          00000000 0        0        0     2        0        27
>>
>>    $ cat /proc/net/ipv6_route
>>    fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>    00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>>    fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>>    ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>    $ cat /sys/fs/bpf/my_ipv6_route
>>    fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>    00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>>    fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>>    ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> Just realized, this is only BPF programs, right? It would be good to
> have at least minimal user-space program that would verify and load
> it. Otherwise we'll be just testing compilation and it might "bit rot"
> a bit...

Totally agree. My latest selftest in test_progs actually tests loading,
anon iter creation and reading. It does not verify contents though.
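The reading part is roughly (sketch; assumes the libbpf APIs added
later in this series, bpf_program__attach_iter() and bpf_iter_create(),
with error handling omitted):

        struct bpf_object *obj;
        struct bpf_program *prog;
        struct bpf_link *link;
        char buf[4096];
        int iter_fd;
        ssize_t n;

        obj = bpf_object__open_file("bpf_iter_netlink.o", NULL);
        bpf_object__load(obj);
        prog = bpf_object__find_program_by_title(obj, "iter/netlink");
        link = bpf_program__attach_iter(prog, NULL);
        iter_fd = bpf_iter_create(bpf_link__fd(link));
        while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                write(STDOUT_FILENO, buf, n); /* compare against /proc */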

> 
>>   .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 63 ++++++++++++++++
>>   .../selftests/bpf/progs/bpf_iter_netlink.c    | 74 +++++++++++++++++++
>>   2 files changed, 137 insertions(+)
>>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
>>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
>>
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  2020-05-06  6:01   ` Andrii Nakryiko
@ 2020-05-07  1:09     ` Yonghong Song
  2020-05-08 18:17       ` Andrii Nakryiko
  0 siblings, 1 reply; 62+ messages in thread
From: Yonghong Song @ 2020-05-07  1:09 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/5/20 11:01 PM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:30 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Two bpf programs are added in this patch for netlink and ipv6_route
>> target. On my VM, I am able to achieve identical
>> results compared to /proc/net/netlink and /proc/net/ipv6_route.
>>
>>    $ cat /proc/net/netlink
>>    sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>>    000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>>    00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>>    00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>>    000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>>    ....
>>    00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>>    000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>>    00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>>    000000008398fb08 16  0          00000000 0        0        0     2        0        27
>>    $ cat /sys/fs/bpf/my_netlink
>>    sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
>>    000000002c42d58b 0   0          00000000 0        0        0     2        0        7
>>    00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
>>    00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
>>    000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
>>    ....
>>    00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
>>    000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
>>    00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
>>    000000008398fb08 16  0          00000000 0        0        0     2        0        27
>>
>>    $ cat /proc/net/ipv6_route
>>    fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>    00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>>    fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>>    ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>    $ cat /sys/fs/bpf/my_ipv6_route
>>    fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>    00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
>>    fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
>>    ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
>>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> Looks good, but something weird with printf below...
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
>>   .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 63 ++++++++++++++++
>>   .../selftests/bpf/progs/bpf_iter_netlink.c    | 74 +++++++++++++++++++
>>   2 files changed, 137 insertions(+)
>>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
>>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
>>
>> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
>> new file mode 100644
>> index 000000000000..0dee4629298f
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
>> @@ -0,0 +1,63 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2020 Facebook */
>> +#include "vmlinux.h"
>> +#include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_tracing.h>
>> +#include <bpf/bpf_endian.h>
>> +
>> +char _license[] SEC("license") = "GPL";
>> +
>> +extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
>> +
>> +#define        RTF_GATEWAY             0x0002
>> +#define IFNAMSIZ               16
> 
> nit: these look weirdly unaligned :)
> 
>> +#define fib_nh_gw_family        nh_common.nhc_gw_family
>> +#define fib_nh_gw6              nh_common.nhc_gw.ipv6
>> +#define fib_nh_dev              nh_common.nhc_dev
>> +
> 
> [...]
> 
> 
>> +       dev = fib6_nh->fib_nh_dev;
>> +       if (dev)
>> +               BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
>> +                              rt->fib6_ref.refs.counter, 0, flags, dev->name);
>> +       else
>> +               BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
>> +                              rt->fib6_ref.refs.counter, 0, flags);
> 
> hmm... how does it work? you specify 4 params, but format string
> expects 5. Shouldn't this fail?

Thanks for catching this. Unfortunately, we can only detect this at
runtime when BPF_SEQ_PRINTF is executed, since only then do we do
format/argument checking.

In the above, if I flip the condition "if (dev)" to "if (!dev)",
BPF_SEQ_PRINTF will not print anything and returns -EINVAL.

I am wondering whether the verifier should do some verification at prog
load time to ensure
   # of args in packed u64 array >= # of format specifiers
This would catch this case. Or we just assume users should do
adequate testing to catch such cases.

Note that this won't affect the safety of the program, so it is totally
okay for the verifier to delay the checking to runtime.

> 
>> +
>> +       return 0;
>> +}
>> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
>> new file mode 100644
>> index 000000000000..0a85a621a36d
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
>> @@ -0,0 +1,74 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2020 Facebook */
>> +#include "vmlinux.h"
>> +#include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_tracing.h>
>> +#include <bpf/bpf_endian.h>
>> +
>> +char _license[] SEC("license") = "GPL";
>> +
>> +#define sk_rmem_alloc  sk_backlog.rmem_alloc
>> +#define sk_refcnt      __sk_common.skc_refcnt
>> +
>> +#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
>> +#define container_of(ptr, type, member)                                \
>> +       ({                                                      \
>> +               void *__mptr = (void *)(ptr);                   \
>> +               ((type *)(__mptr - offsetof(type, member)));    \
>> +       })
> 
> we should probably put offsetof(), offsetofend() and container_of()
> macro into bpf_helpers.h, seems like universal things for kernel
> datastructs :)
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-06 21:42     ` Yonghong Song
@ 2020-05-08 18:15       ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 18:15 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 2:42 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/6/20 10:37 AM, Andrii Nakryiko wrote:
> > On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Two helpers bpf_seq_printf and bpf_seq_write, are added for
> >> writing data to the seq_file buffer.
> >>
> >> bpf_seq_printf supports common format string flag/width/type
> >> fields so at least I can get identical results for
> >> netlink and ipv6_route targets.
> >>
> >
> > Does seq_printf() have its own format string specification? Is there
> > any documentation explaining it? I was confused by a few different
> > checks below...
>
> Not really. Similar to bpf_trace_printk(), since we need to
> parse the format string, we may only support a subset of
> what seq_printf() does. But we should not invent new
> formats.
>
> >
> >> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
> >> specifically indicates a write failure due to overflow, which
> >> means the object will be repeated in the next bpf invocation
> >> if object collection stays the same. Note that if the object
> >> collection is changed, depending how collection traversal is
> >> done, even if the object still in the collection, it may not
> >> be visited.
> >>
> >> bpf_seq_printf may return -EBUSY meaning that internal percpu
> >> buffer for memory copy of strings or other pointees is
> >> not available. Bpf program can return 1 to indicate it
> >> wants the same object to be repeated. Right now, this should not
> >> happen on no-RT kernels since migrate_enable(), which guards
> >> bpf prog call, calls preempt_enable().
> >
> > You probably meant migrate_disable()/preempt_disable(), right? But
>
> Yes, sorry for typo.
>
> > could it still happen, at least due to NMI? E.g., a perf_event BPF
> > program gets triggered during bpf_iter program execution? I think for
> > the perf_event_output function, we have 3 levels, one for each possible
> > "context"? Should we do something like that here as well?
>
> Currently the bpf_seq_printf() and bpf_seq_write() helpers can
> only be called by iter bpf programs. The iter bpf program can only
> be run in process context as it is triggered by a read() syscall.
> So one level should be enough for a non-RT kernel.
>
> For an RT kernel, migrate_disable does not prevent preemption,
> so it is possible a task in the middle of bpf_seq_printf() might
> be preempted, so I implemented the logic to return -EBUSY.
> I think this case should be extremely rare so I only implemented
> one level of nesting.

yeah, makes sense

>
> >
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/uapi/linux/bpf.h       |  32 +++++-
> >>   kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
> >>   scripts/bpf_helpers_doc.py     |   2 +
> >>   tools/include/uapi/linux/bpf.h |  32 +++++-
> >>   4 files changed, 259 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 97ceb0f2e539..e440a9d5cca2 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -3076,6 +3076,34 @@ union bpf_attr {
> >>    *             See: clock_gettime(CLOCK_BOOTTIME)
> >>    *     Return
> >>    *             Current *ktime*.
> >> + *
> >
> > [...]
> >
> >> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
> >> +          const void *, data, u32, data_len)
> >> +{
> >> +       int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
> >> +       int i, buf_used, copy_size, num_args;
> >> +       u64 params[MAX_SEQ_PRINTF_VARARGS];
> >> +       struct bpf_seq_printf_buf *bufs;
> >> +       const u64 *args = data;
> >> +
> >> +       buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
> >> +       if (WARN_ON_ONCE(buf_used > 1)) {
> >> +               err = -EBUSY;
> >> +               goto out;
> >> +       }
> >> +
> >> +       bufs = this_cpu_ptr(&bpf_seq_printf_buf);
> >> +
> >> +       /*
> >> +        * bpf_check()->check_func_arg()->check_stack_boundary()
> >> +        * guarantees that fmt points to bpf program stack,
> >> +        * fmt_size bytes of it were initialized and fmt_size > 0
> >> +        */
> >> +       if (fmt[--fmt_size] != 0)
> >
> > If we allow fmt_size == 0, this will need to be changed.
>
> Currently, we do not support fmt_size == 0. Yes, if we allow, this
> needs change.
>
> >
> >> +               goto out;
> >> +
> >> +       if (data_len & 7)
> >> +               goto out;
> >> +
> >> +       for (i = 0; i < fmt_size; i++) {
> >> +               if (fmt[i] == '%' && (!data || !data_len))
> >
> > So %% escaping is not supported?
>
> Yes, I have not seen a need yet in my ipv6_route/netlink examples.
> Can certainly add it if there is a use case.

I can imagine this being used quite often when trying to print out
percentages... Would just suck to have to upgrade kernel just to be
able to print % character :)
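e.g. a hypothetical usage (pct is a made-up variable):

        BPF_SEQ_PRINTF(seq, "cpu util: %d%%\n", pct);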

>
> >
> >> +                       goto out;
> >> +       }
> >> +
> >> +       num_args = data_len / 8;
> >> +
> >> +       /* check format string for allowed specifiers */
> >> +       for (i = 0; i < fmt_size; i++) {
> >> +               if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
> >
> > why these restrictions? are they essential?
>
> This is the same restriction as in bpf_trace_printk(). I guess the
> purpose is to avoid weird printing. To promote bpf_iter to dump beyond
> ASCII, I guess we can remove this restriction.

well, if the underlying seq_printf() will fail for those "more liberal"
strings, then that would be bad. Basically, we should try not to
impose additional restrictions compared to seq_printf, but there is
also no need to allow more than what will be rejected by it. I haven't
checked the seq_printf implementation though, so I don't know what
those restrictions are.

>
> >
> >> +                       goto out;
> >> +
> >> +               if (fmt[i] != '%')
> >> +                       continue;
> >> +
> >> +               if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
> >> +                       err = -E2BIG;
> >> +                       goto out;
> >> +               }
> >> +
> >> +               if (fmt_cnt >= num_args)
> >> +                       goto out;
> >> +
> >> +               /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
> >> +               i++;
> >> +
> >> +               /* skip optional "[0+-][num]" width formating field */
> >> +               while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')
> >
> > There could be space as well, as an alternative to 0.
>
> We can allow space. But '0' is used more commonly, right?

I use both space and 0 quite often, space especially with aligned strings.
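e.g. (illustrative, made-up variables):

        seq_printf(m, "%-16s %08x\n", name, flags); /* '-' and '0' flags */
        seq_printf(m, "% 5d\n", val);               /* ' ' flag */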

>
> >
> >> +                       i++;
> >> +               if (fmt[i] >= '1' && fmt[i] <= '9') {
> >> +                       i++;
> >> +                       while (fmt[i] >= '0' && fmt[i] <= '9')
> >> +                               i++;
> >> +               }
> >> +
> >> +               if (fmt[i] == 's') {
> >> +                       /* disallow any further format extensions */
> >> +                       if (fmt[i + 1] != 0 &&
> >> +                           !isspace(fmt[i + 1]) &&
> >> +                           !ispunct(fmt[i + 1]))
> >> +                               goto out;
> >
> > I'm not sure I follow this check either. printf("%sbla", "whatever")
> > is a perfectly fine format string. Unless seq_printf has some
> > additional restrictions?
>
> Yes, just some restriction inherited from bpf_trace_printk().
> Will remove.

see the comment above: if we allow it here but seq_printf() will reject
it, then there is no point

>
> >
> >> +
> >> +                       /* try our best to copy */
> >> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
> >> +                               err = -E2BIG;
> >> +                               goto out;
> >> +                       }
> >> +
> >
> > [...]
> >
> >> +
> >> +static int bpf_seq_printf_btf_ids[5];
> >> +static const struct bpf_func_proto bpf_seq_printf_proto = {
> >> +       .func           = bpf_seq_printf,
> >> +       .gpl_only       = true,
> >> +       .ret_type       = RET_INTEGER,
> >> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> >> +       .arg2_type      = ARG_PTR_TO_MEM,
> >> +       .arg3_type      = ARG_CONST_SIZE,
> >
> > It feels like allowing zero shouldn't hurt too much?
>
> This is the format string, so I would prefer to keep it non-zero.

yeah, makes sense, I suppose

>
> >
> >> +       .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
> >> +       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
> >> +       .btf_id         = bpf_seq_printf_btf_ids,
> >> +};
> >> +
> >> +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
> >> +{
> >> +       return seq_write(m, data, len) ? -EOVERFLOW : 0;
> >> +}
> >> +
> >> +static int bpf_seq_write_btf_ids[5];
> >> +static const struct bpf_func_proto bpf_seq_write_proto = {
> >> +       .func           = bpf_seq_write,
> >> +       .gpl_only       = true,
> >> +       .ret_type       = RET_INTEGER,
> >> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> >> +       .arg2_type      = ARG_PTR_TO_MEM,
> >> +       .arg3_type      = ARG_CONST_SIZE,
> >
> > Same, ARG_CONST_SIZE_OR_ZERO?
>
> This one is possible. Let me check.

I just remember how much trouble perf_event_output() was causing me
because it enforced >0 for the data length. Even though my variable-sized
output was always >0, proving that to the (especially older) verifier was
extremely hard. So the fewer unnecessary restrictions, the better.
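
The problematic pattern is roughly this (hypothetical BPF-side sketch;
'rec' and 'rec_len' are made-up names):

	__u32 len = rec_len;

	if (len > sizeof(rec))
		return 0;
	/* the verifier now knows len is in [0, sizeof(rec)]; with
	 * ARG_CONST_SIZE it must additionally prove len > 0, which
	 * often needs an extra, logically redundant check just to
	 * satisfy it; ARG_CONST_SIZE_OR_ZERO avoids that dance
	 */
	bpf_seq_write(seq, &rec, len);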

>
> >
> >> +       .btf_id         = bpf_seq_write_btf_ids,
> >> +};
> >> +
> >
> > [...]
> >

* Re: [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  2020-05-07  1:09     ` Yonghong Song
@ 2020-05-08 18:17       ` Andrii Nakryiko
  0 siblings, 0 replies; 62+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 18:17 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 6:09 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/5/20 11:01 PM, Andrii Nakryiko wrote:
> > On Sun, May 3, 2020 at 11:30 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Two bpf programs are added in this patch for the netlink and
> >> ipv6_route targets. On my VM, I am able to achieve results
> >> identical to /proc/net/netlink and /proc/net/ipv6_route.
> >>
> >>    $ cat /proc/net/netlink
> >>    sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
> >>    000000002c42d58b 0   0          00000000 0        0        0     2        0        7
> >>    00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
> >>    00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
> >>    000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
> >>    ....
> >>    00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
> >>    000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
> >>    00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
> >>    000000008398fb08 16  0          00000000 0        0        0     2        0        27
> >>    $ cat /sys/fs/bpf/my_netlink
> >>    sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
> >>    000000002c42d58b 0   0          00000000 0        0        0     2        0        7
> >>    00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
> >>    00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
> >>    000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
> >>    ....
> >>    00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
> >>    000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
> >>    00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
> >>    000000008398fb08 16  0          00000000 0        0        0     2        0        27
> >>
> >>    $ cat /proc/net/ipv6_route
> >>    fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
> >>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
> >>    00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
> >>    fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
> >>    ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
> >>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
> >>    $ cat /sys/fs/bpf/my_ipv6_route
> >>    fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
> >>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
> >>    00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
> >>    fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
> >>    ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
> >>    00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >
> > Looks good, but there is something weird with the printf below...
> >
> > Acked-by: Andrii Nakryiko <andriin@fb.com>
> >
> >>   .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 63 ++++++++++++++++
> >>   .../selftests/bpf/progs/bpf_iter_netlink.c    | 74 +++++++++++++++++++
> >>   2 files changed, 137 insertions(+)
> >>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
> >>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
> >>
> >> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
> >> new file mode 100644
> >> index 000000000000..0dee4629298f
> >> --- /dev/null
> >> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
> >> @@ -0,0 +1,63 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/* Copyright (c) 2020 Facebook */
> >> +#include "vmlinux.h"
> >> +#include <bpf/bpf_helpers.h>
> >> +#include <bpf/bpf_tracing.h>
> >> +#include <bpf/bpf_endian.h>
> >> +
> >> +char _license[] SEC("license") = "GPL";
> >> +
> >> +extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
> >> +
> >> +#define        RTF_GATEWAY             0x0002
> >> +#define IFNAMSIZ               16
> >
> > nit: these look weirdly unaligned :)
> >
> >> +#define fib_nh_gw_family        nh_common.nhc_gw_family
> >> +#define fib_nh_gw6              nh_common.nhc_gw.ipv6
> >> +#define fib_nh_dev              nh_common.nhc_dev
> >> +
> >
> > [...]
> >
> >
> >> +       dev = fib6_nh->fib_nh_dev;
> >> +       if (dev)
> >> +               BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
> >> +                              rt->fib6_ref.refs.counter, 0, flags, dev->name);
> >> +       else
> >> +               BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
> >> +                              rt->fib6_ref.refs.counter, 0, flags);
> >
> > hmm... how does it work? you specify 4 params, but format string
> > expects 5. Shouldn't this fail?
>
> Thanks for catching this. Unfortunately, we can only detect this at
> runtime when BPF_SEQ_PRINTF is executed, since only then do we do
> format/argument checking.
>
> In the above, if I flip the condition "if (dev)" to "if (!dev)", the
> BPF_SEQ_PRINTF will not print anything and returns -EINVAL.
>
> I am wondering whether the verifier should do some verification at prog
> load time to ensure
>    # of args in packed u64 array >= # of format specifiers
> This would catch this case. Or we just assume users will do
> adequate testing to catch such cases.
>

My initial thought is that it would be too specific a check for the
verifier, but maybe as we add more generic logging/printf
capabilities, it might come in handy. But I'd defer that for later.
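
The fix for the snippet above is presumably just a four-specifier
format string in the else branch, dropping the device column (sketch):

	else
		BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x\n", rt->fib6_metric,
			       rt->fib6_ref.refs.counter, 0, flags);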

> Note that this won't affect the safety of the program, so it is totally
> okay for the verifier to delay the checking to runtime.
>
> >
> >> +
> >> +       return 0;
> >> +}
> >> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
> >> new file mode 100644
> >> index 000000000000..0a85a621a36d
> >> --- /dev/null
> >> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
> >> @@ -0,0 +1,74 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/* Copyright (c) 2020 Facebook */
> >> +#include "vmlinux.h"
> >> +#include <bpf/bpf_helpers.h>
> >> +#include <bpf/bpf_tracing.h>
> >> +#include <bpf/bpf_endian.h>
> >> +
> >> +char _license[] SEC("license") = "GPL";
> >> +
> >> +#define sk_rmem_alloc  sk_backlog.rmem_alloc
> >> +#define sk_refcnt      __sk_common.skc_refcnt
> >> +
> >> +#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
> >> +#define container_of(ptr, type, member)                                \
> >> +       ({                                                      \
> >> +               void *__mptr = (void *)(ptr);                   \
> >> +               ((type *)(__mptr - offsetof(type, member)));    \
> >> +       })
> >
> > we should probably put offsetof(), offsetofend() and container_of()
> > macro into bpf_helpers.h, seems like universal things for kernel
> > datastructs :)
> >
> > [...]
> >
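
For reference, offsetofend() -- mentioned above but not quoted -- can be
written in terms of the two macros above; a self-contained sketch that
matches the kernel's definition in spirit:

	#define offsetofend(TYPE, MEMBER) \
		(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))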

Thread overview: 62+ messages
2020-05-04  6:25 [PATCH bpf-next v2 00/20] bpf: implement bpf iterator for kernel data Yonghong Song
2020-05-04  6:25 ` [PATCH bpf-next v2 01/20] bpf: implement an interface to register bpf_iter targets Yonghong Song
2020-05-05 21:19   ` Andrii Nakryiko
2020-05-04  6:25 ` [PATCH bpf-next v2 02/20] bpf: allow loading of a bpf_iter program Yonghong Song
2020-05-05 21:29   ` Andrii Nakryiko
2020-05-06  0:07     ` Yonghong Song
2020-05-04  6:25 ` [PATCH bpf-next v2 03/20] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
2020-05-05 21:30   ` Andrii Nakryiko
2020-05-06  0:14     ` Yonghong Song
2020-05-06  0:54       ` Alexei Starovoitov
2020-05-06  3:09         ` Andrii Nakryiko
2020-05-06 18:08           ` Alexei Starovoitov
2020-05-04  6:25 ` [PATCH bpf-next v2 04/20] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE Yonghong Song
2020-05-05 21:32   ` Andrii Nakryiko
2020-05-04  6:25 ` [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
2020-05-05 19:56   ` Andrii Nakryiko
2020-05-05 19:57     ` Alexei Starovoitov
2020-05-05 20:25     ` Yonghong Song
2020-05-05 21:08       ` Andrii Nakryiko
2020-05-04  6:25 ` [PATCH bpf-next v2 06/20] bpf: create anonymous " Yonghong Song
2020-05-05 20:11   ` Andrii Nakryiko
2020-05-05 20:28     ` Yonghong Song
2020-05-04  6:25 ` [PATCH bpf-next v2 07/20] bpf: create file " Yonghong Song
2020-05-05 20:15   ` Andrii Nakryiko
2020-05-04  6:25 ` [PATCH bpf-next v2 08/20] bpf: implement common macros/helpers for target iterators Yonghong Song
2020-05-05 20:25   ` Andrii Nakryiko
2020-05-05 20:30     ` Yonghong Song
2020-05-05 21:10       ` Andrii Nakryiko
2020-05-04  6:25 ` [PATCH bpf-next v2 09/20] bpf: add bpf_map iterator Yonghong Song
2020-05-06  5:11   ` Andrii Nakryiko
2020-05-04  6:25 ` [PATCH bpf-next v2 10/20] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
2020-05-06  5:21   ` Andrii Nakryiko
2020-05-06 17:32     ` Yonghong Song
2020-05-04  6:25 ` [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets Yonghong Song
2020-05-06  7:30   ` Andrii Nakryiko
2020-05-06 18:24     ` Yonghong Song
2020-05-06 20:51       ` Andrii Nakryiko
2020-05-06 21:20         ` Yonghong Song
2020-05-04  6:26 ` [PATCH bpf-next v2 12/20] bpf: add PTR_TO_BTF_ID_OR_NULL support Yonghong Song
2020-05-05 20:27   ` Andrii Nakryiko
2020-05-04  6:26 ` [PATCH bpf-next v2 13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
2020-05-06 17:37   ` Andrii Nakryiko
2020-05-06 21:42     ` Yonghong Song
2020-05-08 18:15       ` Andrii Nakryiko
2020-05-04  6:26 ` [PATCH bpf-next v2 14/20] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary Yonghong Song
2020-05-06 17:38   ` Andrii Nakryiko
2020-05-06 21:47     ` Yonghong Song
2020-05-04  6:26 ` [PATCH bpf-next v2 15/20] bpf: support variable length array in tracing programs Yonghong Song
2020-05-06 17:40   ` Andrii Nakryiko
2020-05-04  6:26 ` [PATCH bpf-next v2 16/20] tools/libbpf: add bpf_iter support Yonghong Song
2020-05-06  5:44   ` Andrii Nakryiko
2020-05-04  6:26 ` [PATCH bpf-next v2 17/20] tools/bpftool: add bpf_iter support for bptool Yonghong Song
2020-05-04  6:26 ` [PATCH bpf-next v2 18/20] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
2020-05-06  6:01   ` Andrii Nakryiko
2020-05-07  1:09     ` Yonghong Song
2020-05-08 18:17       ` Andrii Nakryiko
2020-05-06  6:04   ` Andrii Nakryiko
2020-05-06 23:07     ` Yonghong Song
2020-05-04  6:26 ` [PATCH bpf-next v2 19/20] tools/bpf: selftests: add iter progs for bpf_map/task/task_file Yonghong Song
2020-05-06  6:14   ` Andrii Nakryiko
2020-05-04  6:26 ` [PATCH bpf-next v2 20/20] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
2020-05-06  6:39   ` Andrii Nakryiko
