* [PATCH bpf-next v3 01/21] bpf: implement an interface to register bpf_iter targets
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 18:18   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 02/21] bpf: allow loading of a bpf_iter program Yonghong Song
                   ` (19 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The target can call bpf_iter_reg_target() to register itself.
The needed information:
  target:            target name
  seq_ops:           the seq_file operations for the target
  init_seq_private:  target callback to initialize seq_priv during file open
  fini_seq_private:  target callback to clean up seq_priv during file release
  seq_priv_size:     the private_data size needed by the seq_file
                     operations

The target name identifies a target which provides a seq_ops
for iterating its objects.

The target can provide two callback functions, init_seq_private
and fini_seq_private, called at file open/release time. For
example, for /proc/net/{tcp6, ipv6_route, netlink, ...}, the net
namespace needs to be set up properly at file open and released
properly at file release.

Function bpf_iter_unreg_target() is also implemented to unregister
a particular target.
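
As an illustration (not part of this patch), a hypothetical "foo"
target could register itself from its __init function roughly as
below; foo_seq_ops, foo_init_seq_priv, foo_fini_seq_priv and
struct foo_seq_info are assumed to be provided by the target:

  static int __init foo_iter_init(void)
  {
          struct bpf_iter_reg reg_info = {
                  .target           = "foo",
                  .seq_ops          = &foo_seq_ops,
                  .init_seq_private = foo_init_seq_priv,
                  .fini_seq_private = foo_fini_seq_priv,
                  .seq_priv_size    = sizeof(struct foo_seq_info),
          };

          /* registration typically happens at __init time */
          return bpf_iter_reg_target(&reg_info);
  }
  late_initcall(foo_iter_init);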

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   | 15 +++++++++++
 kernel/bpf/Makefile   |  2 +-
 kernel/bpf/bpf_iter.c | 59 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/bpf_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1262ec460ab3..40c78b86fe38 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -31,6 +31,7 @@ struct seq_file;
 struct btf;
 struct btf_type;
 struct exception_table_entry;
+struct seq_operations;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1126,6 +1127,20 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
+typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
+
+struct bpf_iter_reg {
+	const char *target;
+	const struct seq_operations *seq_ops;
+	bpf_iter_init_seq_priv_t init_seq_private;
+	bpf_iter_fini_seq_priv_t fini_seq_private;
+	u32 seq_priv_size;
+};
+
+int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
+void bpf_iter_unreg_target(const char *target);
+
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f2d7be596966..6a8b0febd3f6 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
new file mode 100644
index 000000000000..5a8119d17d14
--- /dev/null
+++ b/kernel/bpf/bpf_iter.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+struct bpf_iter_target_info {
+	struct list_head list;
+	const char *target;
+	const struct seq_operations *seq_ops;
+	bpf_iter_init_seq_priv_t init_seq_private;
+	bpf_iter_fini_seq_priv_t fini_seq_private;
+	u32 seq_priv_size;
+};
+
+static struct list_head targets = LIST_HEAD_INIT(targets);
+static DEFINE_MUTEX(targets_mutex);
+
+int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
+{
+	struct bpf_iter_target_info *tinfo;
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		return -ENOMEM;
+
+	tinfo->target = reg_info->target;
+	tinfo->seq_ops = reg_info->seq_ops;
+	tinfo->init_seq_private = reg_info->init_seq_private;
+	tinfo->fini_seq_private = reg_info->fini_seq_private;
+	tinfo->seq_priv_size = reg_info->seq_priv_size;
+	INIT_LIST_HEAD(&tinfo->list);
+
+	mutex_lock(&targets_mutex);
+	list_add(&tinfo->list, &targets);
+	mutex_unlock(&targets_mutex);
+
+	return 0;
+}
+
+void bpf_iter_unreg_target(const char *target)
+{
+	struct bpf_iter_target_info *tinfo;
+	bool found = false;
+
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (!strcmp(target, tinfo->target)) {
+			list_del(&tinfo->list);
+			kfree(tinfo);
+			found = true;
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+
+	WARN_ON(found == false);
+}
-- 
2.24.1



* [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data
@ 2020-05-07  5:39 Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 01/21] bpf: implement an interface to register bpf_iter targets Yonghong Song
                   ` (20 more replies)
  0 siblings, 21 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Motivation:
  The current ways to dump kernel data structures are mostly:
    1. /proc system
    2. various specific tools like "ss" which require kernel support.
    3. drgn
  The drawback of the first two is that whenever you want to dump more,
  you need to change the kernel. For example, Martin wants to dump
  socket local storage with "ss". A kernel change is needed for it to
  work ([1]). This is also the direct motivation for this work.

  drgn ([2]) solves this problem nicely and no kernel change is needed.
  But since drgn is not able to verify the validity of a particular
  pointer value, it might present wrong results in rare cases.

  In this patch set, we introduce the bpf iterator. Initial kernel
  changes are still needed for each kernel data structure of interest,
  but later data structure changes will not require kernel changes any
  more. The bpf program itself can adapt to new data structure changes.
  This gives certain flexibility with guaranteed correctness.

  In this patch set, kernel seq_ops is used to facilitate iterating through
  kernel data, similar to current /proc and many other lossless kernel
  dumping facilities. In the future, different iterators can be
  implemented to trade off losslessness for other criteria e.g. no
  repeated object visits, etc.

User Interface:
  1. Similar to prog/map/link, the iterator can be pinned into a
     path within a bpffs mount point.
  2. The bpftool command can pin an iterator to a file
         bpftool iter pin <bpf_prog.o> <path>
  3. Use `cat <path>` to dump the contents.
     Use `rm -f <path>` to remove the pinned iterator.
  4. An anonymous iterator can be created as well.

  Please see patches #19 and #20 for bpf programs and bpf iterator
  output examples.

  Note that certain iterators are namespace aware. For example, the
  task and task_file targets only iterate through the current pid
  namespace. ipv6_route and netlink will iterate through the current
  net namespace.

  Please see individual patches for implementation details.

Performance:
  The bpf iterator provides in-kernel aggregation abilities
  for kernel data. This can greatly improve performance
  compared to e.g., iterating all process directories under /proc.
  For example, I did an experiment on my VM with an application forking
  different numbers of tasks, with each forked process opening various
  numbers of files. The following results show the latency in
  microseconds:

    # of forked tasks   # of open files    # of bpf_prog calls   latency (us)
    100                 100                11503                7586
    1000                1000               1013203              709513
    10000               100                1130203              764519
  
  The number of bpf_prog calls may be more than forked tasks multiplied
  by open files since there are other tasks running on the system.
  The bpf program is a do-nothing program. One million bpf calls take
  less than one second.
    
Future Work:
  Although the initial motivation came from Martin's sk_local_storage,
  this patch set does not implement tcp6 sockets and sk_local_storage.
  The /proc/net/tcp6 iteration involves three types of sockets:
  timewait, request and tcp6 sockets. Some kind of type casting or
  other mechanism is needed to handle all these socket types in one
  bpf program. This will be addressed in future work.

  Currently, we do not support kernel data generated in kernel modules.
  This requires some BTF work.

  More work is needed for more iterators, e.g., tcp, udp, bpf_map
  elements, etc.

Changelog:
  v2 -> v3:
    - add bpf_iter_unreg_target() to unregister a target, used in the
      error path of the __init functions.
    - handle err != 0 before handling overflow (Andrii)
    - reference count "task" for task_file target (Andrii)
    - remove some redundancy for bpf_map/task/task_file targets
    - add bpf_iter_unreg_target() in ip6_route_cleanup()
    - Handling "%%" format in bpf_seq_printf() (Andrii)
    - implement auto-attach for bpf_iter in libbpf (Andrii)
    - add macros offsetof and container_of in bpf_helpers.h (Andrii)
    - add tests for auto-attach and program-return-1 cases
    - some other minor fixes
  v1 -> v2:
    - removed target_feature, using callback functions instead
    - checking target to ensure program specified btf_id supported (Martin)
    - link_create change with new changes from Andrii
    - better handling of btf_iter vs. seq_file private data (Martin, Andrii)
    - implemented bpf_seq_read() (Andrii, Alexei)
    - percpu buffer for bpf_seq_printf() (Andrii)
    - better syntax for BPF_SEQ_PRINTF macro (Andrii)
    - bpftool fixes (Quentin)
    - a lot of other fixes
  RFC v2 -> v1:
    - rename bpfdump to bpf_iter
    - use bpffs instead of a new file system
    - use bpf_link to streamline and simplify iterator creation.

References:
  [1]: https://lore.kernel.org/bpf/20200225230427.1976129-1-kafai@fb.com
  [2]: https://github.com/osandov/drgn

Yonghong Song (21):
  bpf: implement an interface to register bpf_iter targets
  bpf: allow loading of a bpf_iter program
  bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE
  bpf: implement bpf_seq_read() for bpf iterator
  bpf: create anonymous bpf iterator
  bpf: create file bpf iterator
  bpf: implement common macros/helpers for target iterators
  bpf: add bpf_map iterator
  net: bpf: add netlink and ipv6_route bpf_iter targets
  bpf: add task and task/file iterator targets
  bpf: add PTR_TO_BTF_ID_OR_NULL support
  bpf: add bpf_seq_printf and bpf_seq_write helpers
  bpf: handle spilled PTR_TO_BTF_ID properly when checking
    stack_boundary
  bpf: support variable length array in tracing programs
  tools/libbpf: add bpf_iter support
  tools/libbpf: add offsetof/container_of macro in bpf_helpers.h
  tools/bpftool: add bpf_iter support for bpftool
  tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  tools/bpf: selftests: add iter progs for bpf_map/task/task_file
  tools/bpf: selftests: add bpf_iter selftests

 fs/proc/proc_net.c                            |  19 +
 include/linux/bpf.h                           |  36 ++
 include/linux/bpf_types.h                     |   1 +
 include/linux/proc_fs.h                       |   3 +
 include/uapi/linux/bpf.h                      |  40 +-
 kernel/bpf/Makefile                           |   2 +-
 kernel/bpf/bpf_iter.c                         | 526 ++++++++++++++++++
 kernel/bpf/btf.c                              |  42 +-
 kernel/bpf/inode.c                            |   5 +-
 kernel/bpf/map_iter.c                         |  97 ++++
 kernel/bpf/syscall.c                          |  59 ++
 kernel/bpf/task_iter.c                        | 332 +++++++++++
 kernel/bpf/verifier.c                         |  42 +-
 kernel/trace/bpf_trace.c                      | 200 +++++++
 net/ipv6/ip6_fib.c                            |  65 ++-
 net/ipv6/route.c                              |  37 ++
 net/netlink/af_netlink.c                      |  87 ++-
 scripts/bpf_helpers_doc.py                    |   2 +
 .../bpftool/Documentation/bpftool-iter.rst    |  83 +++
 tools/bpf/bpftool/bash-completion/bpftool     |  13 +
 tools/bpf/bpftool/iter.c                      |  84 +++
 tools/bpf/bpftool/link.c                      |   1 +
 tools/bpf/bpftool/main.c                      |   3 +-
 tools/bpf/bpftool/main.h                      |   1 +
 tools/include/uapi/linux/bpf.h                |  40 +-
 tools/lib/bpf/bpf.c                           |  10 +
 tools/lib/bpf/bpf.h                           |   2 +
 tools/lib/bpf/bpf_helpers.h                   |  14 +
 tools/lib/bpf/bpf_tracing.h                   |  16 +
 tools/lib/bpf/libbpf.c                        |  52 ++
 tools/lib/bpf/libbpf.h                        |   9 +
 tools/lib/bpf/libbpf.map                      |   2 +
 .../selftests/bpf/prog_tests/bpf_iter.c       | 408 ++++++++++++++
 .../selftests/bpf/progs/bpf_iter_bpf_map.c    |  28 +
 .../selftests/bpf/progs/bpf_iter_ipv6_route.c |  62 +++
 .../selftests/bpf/progs/bpf_iter_netlink.c    |  66 +++
 .../selftests/bpf/progs/bpf_iter_task.c       |  25 +
 .../selftests/bpf/progs/bpf_iter_task_file.c  |  26 +
 .../selftests/bpf/progs/bpf_iter_test_kern1.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern2.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern3.c |  18 +
 .../selftests/bpf/progs/bpf_iter_test_kern4.c |  52 ++
 .../bpf/progs/bpf_iter_test_kern_common.h     |  22 +
 43 files changed, 2626 insertions(+), 14 deletions(-)
 create mode 100644 kernel/bpf/bpf_iter.c
 create mode 100644 kernel/bpf/map_iter.c
 create mode 100644 kernel/bpf/task_iter.c
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-iter.rst
 create mode 100644 tools/bpf/bpftool/iter.c
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h

-- 
2.24.1



* [PATCH bpf-next v3 02/21] bpf: allow loading of a bpf_iter program
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 01/21] bpf: implement an interface to register bpf_iter targets Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 18:20   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
                   ` (18 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

A bpf_iter program is a tracing program with attach type
BPF_TRACE_ITER. The load attribute
  attach_btf_id
is checked by the verifier against a particular kernel function,
which represents a target, e.g., __bpf_iter__bpf_map
for the bpf_map target, which is implemented later in this series.

The program return value must be 0 or 1 for now.
  0 : successful, except for a potential seq_file buffer overflow,
      which is handled by the seq_file reader.
  1 : request to restart the same object

In the future, other return values may be used for filtering or
terminating the iterator.
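
As an illustration (not part of this patch), a minimal bpf_iter
program could look as follows; it uses the bpf_map target and the
SEC("iter/...") and BPF_SEQ_PRINTF conventions added later in this
series (patches 9, 13 and 16):

  SEC("iter/bpf_map")
  int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
  {
          struct bpf_map *map = ctx->map;

          /* ctx->map is NULL when invoked from seq_ops->stop() */
          if (!map)
                  return 0;

          BPF_SEQ_PRINTF(ctx->meta->seq, "map id: %u\n", map->id);
          return 0;       /* returning 1 requests a retry of this map */
  }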

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  3 +++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/bpf_iter.c          | 36 ++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 21 ++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 5 files changed, 62 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 40c78b86fe38..f28bdd714754 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1127,6 +1127,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
+#define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
+
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
 
@@ -1140,6 +1142,7 @@ struct bpf_iter_reg {
 
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 void bpf_iter_unreg_target(const char *target);
+bool bpf_iter_prog_supported(struct bpf_prog *prog);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b3643e27e264..047b19fe716e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5a8119d17d14..dec182d8395a 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -12,6 +12,7 @@ struct bpf_iter_target_info {
 	bpf_iter_init_seq_priv_t init_seq_private;
 	bpf_iter_fini_seq_priv_t fini_seq_private;
 	u32 seq_priv_size;
+	u32 btf_id;	/* cached value */
 };
 
 static struct list_head targets = LIST_HEAD_INIT(targets);
@@ -57,3 +58,38 @@ void bpf_iter_unreg_target(const char *target)
 
 	WARN_ON(found == false);
 }
+
+static void cache_btf_id(struct bpf_iter_target_info *tinfo,
+			 struct bpf_prog *prog)
+{
+	tinfo->btf_id = prog->aux->attach_btf_id;
+}
+
+bool bpf_iter_prog_supported(struct bpf_prog *prog)
+{
+	const char *attach_fname = prog->aux->attach_func_name;
+	u32 prog_btf_id = prog->aux->attach_btf_id;
+	const char *prefix = BPF_ITER_FUNC_PREFIX;
+	struct bpf_iter_target_info *tinfo;
+	int prefix_len = strlen(prefix);
+	bool supported = false;
+
+	if (strncmp(attach_fname, prefix, prefix_len))
+		return false;
+
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
+			supported = true;
+			break;
+		}
+		if (!strcmp(attach_fname + prefix_len, tinfo->target)) {
+			cache_btf_id(tinfo, prog);
+			supported = true;
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+
+	return supported;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 70ad009577f8..d725ff7d11db 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env)
 			return 0;
 		range = tnum_const(0);
 		break;
+	case BPF_PROG_TYPE_TRACING:
+		if (env->prog->expected_attach_type != BPF_TRACE_ITER)
+			return 0;
+		break;
 	default:
 		return 0;
 	}
@@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	struct bpf_prog *tgt_prog = prog->aux->linked_prog;
 	u32 btf_id = prog->aux->attach_btf_id;
 	const char prefix[] = "btf_trace_";
+	struct btf_func_model fmodel;
 	int ret = 0, subprog = -1, i;
 	struct bpf_trampoline *tr;
 	const struct btf_type *t;
@@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		prog->aux->attach_func_proto = t;
 		prog->aux->attach_btf_trace = true;
 		return 0;
+	case BPF_TRACE_ITER:
+		if (!btf_type_is_func(t)) {
+			verbose(env, "attach_btf_id %u is not a function\n",
+				btf_id);
+			return -EINVAL;
+		}
+		t = btf_type_by_id(btf, t->type);
+		if (!btf_type_is_func_proto(t))
+			return -EINVAL;
+		prog->aux->attach_func_name = tname;
+		prog->aux->attach_func_proto = t;
+		if (!bpf_iter_prog_supported(prog))
+			return -EINVAL;
+		ret = btf_distill_func_proto(&env->log, btf, t,
+					     tname, &fmodel);
+		return ret;
 	default:
 		if (!prog_extension)
 			return -EINVAL;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b3643e27e264..047b19fe716e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -218,6 +218,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
-- 
2.24.1



* [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 01/21] bpf: implement an interface to register bpf_iter targets Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 02/21] bpf: allow loading of a bpf_iter program Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 18:24   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 04/21] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE Yonghong Song
                   ` (17 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Given a bpf program, the steps to create an anonymous bpf iterator are:
  - create a bpf_iter_link, which combines the bpf program and the
    target. In the future, more information could be recorded in the
    link. A link_fd will be returned to the user space.
  - create an anonymous bpf iterator with the given link_fd.

The bpf_iter_link can also be pinned into a bpffs mount point to
create a file based bpf iterator.

The benefits of using bpf_iter_link:
  - using a bpf link simplifies the design and implementation, as bpf
    links are already used for other tracing bpf programs.
  - for file based bpf iterators, bpf_iter_link provides a standard
    way to replace the underlying bpf program.
  - for both anonymous and file based iterators, the bpf link query
    capability can be leveraged.

This patch adds support for tracing/iter programs to BPF_LINK_CREATE.
A new link type BPF_LINK_TYPE_ITER is added to facilitate link
querying. Currently, only prog_id is needed, so no additional
in-kernel show_fdinfo() or fill_link_info() hook is needed for the
BPF_LINK_TYPE_ITER link.
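
As an illustration (not part of this patch), user space could create
such a link with the raw syscall roughly as below, where prog_fd
refers to a loaded tracing/iter program:

  union bpf_attr attr = {};
  int link_fd;

  attr.link_create.prog_fd     = prog_fd;
  attr.link_create.target_fd   = 0;               /* must be 0 */
  attr.link_create.attach_type = BPF_TRACE_ITER;
  attr.link_create.flags       = 0;               /* must be 0 */
  link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));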

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  1 +
 include/linux/bpf_types.h      |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           | 14 ++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 80 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f28bdd714754..e93d2d33c82c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1143,6 +1143,7 @@ struct bpf_iter_reg {
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 void bpf_iter_unreg_target(const char *target);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 8345cdf553b8..29d22752fc87 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
 #ifdef CONFIG_CGROUP_BPF
 BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
 #endif
+BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 047b19fe716e..2bf33979f9ae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -229,6 +229,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index dec182d8395a..03f5832909db 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -15,6 +15,11 @@ struct bpf_iter_target_info {
 	u32 btf_id;	/* cached value */
 };
 
+struct bpf_iter_link {
+	struct bpf_link link;
+	struct bpf_iter_target_info *tinfo;
+};
+
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
@@ -93,3 +98,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
 
 	return supported;
 }
+
+static void bpf_iter_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_iter_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_iter_link *iter_link =
+		container_of(link, struct bpf_iter_link, link);
+
+	kfree(iter_link);
+}
+
+static const struct bpf_link_ops bpf_iter_link_lops = {
+	.release = bpf_iter_link_release,
+	.dealloc = bpf_iter_link_dealloc,
+};
+
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_iter_link *link;
+	bool existed = false;
+	u32 prog_btf_id;
+	int err;
+
+	if (attr->link_create.target_fd || attr->link_create.flags)
+		return -EINVAL;
+
+	prog_btf_id = prog->aux->attach_btf_id;
+	mutex_lock(&targets_mutex);
+	list_for_each_entry(tinfo, &targets, list) {
+		if (tinfo->btf_id == prog_btf_id) {
+			existed = true;
+			break;
+		}
+	}
+	mutex_unlock(&targets_mutex);
+	if (!existed)
+		return -ENOENT;
+
+	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+	link->tinfo = tinfo;
+
+	err  = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		return err;
+	}
+
+	return bpf_link_settle(&link_primer);
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bb1ab7da6103..6ffe2d8fb6c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+	case BPF_TRACE_ITER:
+		return BPF_PROG_TYPE_TRACING;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3729,6 +3731,15 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	return err;
 }
 
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	if (attr->link_create.attach_type == BPF_TRACE_ITER &&
+	    prog->expected_attach_type == BPF_TRACE_ITER)
+		return bpf_iter_link_attach(attr, prog);
+
+	return -EINVAL;
+}
+
 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags
 static int link_create(union bpf_attr *attr)
 {
@@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_TRACING:
+		ret = tracing_bpf_link_attach(attr, prog);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 047b19fe716e..2bf33979f9ae 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -229,6 +229,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };
-- 
2.24.1



* [PATCH bpf-next v3 04/21] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (2 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 05/21] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
                   ` (16 subsequent siblings)
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Added BPF_LINK_UPDATE support for tracing/iter programs.
This way, a file based bpf iterator, which holds a reference
to the link, can have its bpf program updated without
creating new files.
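
As an illustration (not part of this patch), user space could swap
the program underlying an iterator link roughly as below:

  union bpf_attr attr = {};

  attr.link_update.link_fd     = link_fd;
  attr.link_update.new_prog_fd = new_prog_fd;
  /* optionally pass BPF_F_REPLACE in attr.link_update.flags together
   * with attr.link_update.old_prog_fd to fail with -EPERM if the
   * currently attached program is not the expected one
   */
  err = syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));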

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/bpf_iter.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 03f5832909db..0542a243b78c 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -23,6 +23,9 @@ struct bpf_iter_link {
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
+/* protect bpf_iter_link changes */
+static DEFINE_MUTEX(link_mutex);
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
@@ -111,9 +114,37 @@ static void bpf_iter_link_dealloc(struct bpf_link *link)
 	kfree(iter_link);
 }
 
+static int bpf_iter_link_replace(struct bpf_link *link,
+				 struct bpf_prog *new_prog,
+				 struct bpf_prog *old_prog)
+{
+	int ret = 0;
+
+	mutex_lock(&link_mutex);
+	if (old_prog && link->prog != old_prog) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	if (link->prog->type != new_prog->type ||
+	    link->prog->expected_attach_type != new_prog->expected_attach_type ||
+	    link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	old_prog = xchg(&link->prog, new_prog);
+	bpf_prog_put(old_prog);
+
+out_unlock:
+	mutex_unlock(&link_mutex);
+	return ret;
+}
+
 static const struct bpf_link_ops bpf_iter_link_lops = {
 	.release = bpf_iter_link_release,
 	.dealloc = bpf_iter_link_dealloc,
+	.update_prog = bpf_iter_link_replace,
 };
 
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
-- 
2.24.1



* [PATCH bpf-next v3 05/21] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (3 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 04/21] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 18:52   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 06/21] bpf: create anonymous " Yonghong Song
                   ` (15 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The bpf iterator uses seq_file to provide a lossless
way to transfer data to user space. But we want to call the
bpf program after all objects have been traversed, and the
bpf program may write additional data to the
seq_file buffer. The current seq_read() does not work
for this use case.

Besides allowing the stop() function to write to the buffer,
bpf_seq_read() also fixes the buffer size to one page.
If any single call of show() or stop() emits more than one
page of data and causes an overflow, the -E2BIG error code
will be returned to user space.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/bpf_iter.c | 118 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 0542a243b78c..f198597b0ea4 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -26,6 +26,124 @@ static DEFINE_MUTEX(targets_mutex);
 /* protect bpf_iter_link changes */
 static DEFINE_MUTEX(link_mutex);
 
+/* bpf_seq_read, a customized and simpler version for bpf iterator.
+ * no_llseek is assumed for this file.
+ * The following are differences from seq_read():
+ *  . fixed buffer size (PAGE_SIZE)
+ *  . assuming no_llseek
+ *  . stop() may call bpf program, handling potential overflow there
+ */
+static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
+			    loff_t *ppos)
+{
+	struct seq_file *seq = file->private_data;
+	size_t n, offs, copied = 0;
+	int err = 0;
+	void *p;
+
+	mutex_lock(&seq->lock);
+
+	if (!seq->buf) {
+		seq->size = PAGE_SIZE;
+		seq->buf = kmalloc(seq->size, GFP_KERNEL);
+		if (!seq->buf) {
+			err = -ENOMEM;
+			goto done;
+		}
+	}
+
+	if (seq->count) {
+		n = min(seq->count, size);
+		err = copy_to_user(buf, seq->buf + seq->from, n);
+		if (err) {
+			err = -EFAULT;
+			goto done;
+		}
+		seq->count -= n;
+		seq->from += n;
+		copied = n;
+		goto done;
+	}
+
+	seq->from = 0;
+	p = seq->op->start(seq, &seq->index);
+	if (IS_ERR_OR_NULL(p))
+		goto stop;
+
+	err = seq->op->show(seq, p);
+	if (err > 0) {
+		seq->count = 0;
+	} else if (err < 0 || seq_has_overflowed(seq)) {
+		if (!err)
+			err = -E2BIG;
+		seq->count = 0;
+		seq->op->stop(seq, p);
+		goto done;
+	}
+
+	while (1) {
+		loff_t pos = seq->index;
+
+		offs = seq->count;
+		p = seq->op->next(seq, p, &seq->index);
+		if (pos == seq->index) {
+			pr_info_ratelimited("buggy seq_file .next function %ps "
+				"did not updated position index\n",
+				seq->op->next);
+			seq->index++;
+		}
+
+		if (IS_ERR_OR_NULL(p)) {
+			err = PTR_ERR(p);
+			break;
+		}
+		if (seq->count >= size)
+			break;
+
+		err = seq->op->show(seq, p);
+		if (err > 0) {
+			seq->count = offs;
+		} else if (err < 0 || seq_has_overflowed(seq)) {
+			seq->count = offs;
+			if (!err)
+				err = -E2BIG;
+			if (offs == 0) {
+				seq->op->stop(seq, p);
+				goto done;
+			}
+			break;
+		}
+	}
+stop:
+	offs = seq->count;
+	/* bpf program called if !p */
+	seq->op->stop(seq, p);
+	if (!p && seq_has_overflowed(seq)) {
+		seq->count = offs;
+		if (offs == 0) {
+			err = -E2BIG;
+			goto done;
+		}
+	}
+
+	n = min(seq->count, size);
+	err = copy_to_user(buf, seq->buf, n);
+	if (err) {
+		err = -EFAULT;
+		goto done;
+	}
+	copied = n;
+	seq->count -= n;
+	seq->from = n;
+done:
+	if (!copied)
+		copied = err;
+	else
+		*ppos += copied;
+	mutex_unlock(&seq->lock);
+	return copied;
+}
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
-- 
2.24.1



* [PATCH bpf-next v3 06/21] bpf: create anonymous bpf iterator
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (4 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 05/21] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 18:57   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 07/21] bpf: create file " Yonghong Song
                   ` (14 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

A new bpf command BPF_ITER_CREATE is added.

The anonymous bpf iterator is seq_file based.
The seq_file private data is referenced by targets.
The bpf_iter infrastructure allocates additional space
at seq_file->private, before the space used by targets,
to store some metadata, e.g.,
  prog:       prog to run
  session_id: a unique id for each opened seq_file
  seq_num:    how many times bpf programs are queried in this session
  done_stop:  an internal state to decide whether the bpf program
              should be called in seq_ops->stop() or not

The seq_num will start from 0 for valid objects.
The bpf program may see the same seq_num more than once if
 - seq_file buffer overflow happens and the same object
   is retried by bpf_seq_read(), or
 - the bpf program explicitly requests a retry of the
   same object

Since modules are not supported for bpf_iter, all target
registration happens at __init time, so there is no
need to change bpf_iter_unreg_target(): it is used
mostly in the error path of __init functions, at which time
no bpf iterators have been created yet.
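
As an illustration (not part of this patch), user space could create
and read an anonymous iterator roughly as below, with link_fd coming
from an earlier BPF_LINK_CREATE:

  union bpf_attr attr = {};
  char buf[4096];
  int iter_fd;
  ssize_t n;

  attr.iter_create.link_fd = link_fd;
  attr.iter_create.flags   = 0;           /* must be 0 */
  iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));

  /* read() fails with E2BIG if a single object overflows the
   * page-sized seq_file buffer
   */
  while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
          write(STDOUT_FILENO, buf, n);
  close(iter_fd);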

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |   1 +
 include/uapi/linux/bpf.h       |   6 ++
 kernel/bpf/bpf_iter.c          | 129 +++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  26 +++++++
 tools/include/uapi/linux/bpf.h |   6 ++
 5 files changed, 168 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e93d2d33c82c..80b1b9d8a638 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1144,6 +1144,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 void bpf_iter_unreg_target(const char *target);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int bpf_iter_new_fd(struct bpf_link *link);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2bf33979f9ae..97ceb0f2e539 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -614,6 +615,11 @@ union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index f198597b0ea4..917df4c69966 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2020 Facebook */
 
 #include <linux/fs.h>
+#include <linux/anon_inodes.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
 
@@ -20,12 +21,24 @@ struct bpf_iter_link {
 	struct bpf_iter_target_info *tinfo;
 };
 
+struct bpf_iter_priv_data {
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_prog *prog;
+	u64 session_id;
+	u64 seq_num;
+	bool done_stop;
+	u8 target_private[] __aligned(8);
+};
+
 static struct list_head targets = LIST_HEAD_INIT(targets);
 static DEFINE_MUTEX(targets_mutex);
 
 /* protect bpf_iter_link changes */
 static DEFINE_MUTEX(link_mutex);
 
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -144,6 +157,33 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	return copied;
 }
 
+static int iter_release(struct inode *inode, struct file *file)
+{
+	struct bpf_iter_priv_data *iter_priv;
+	struct seq_file *seq;
+
+	seq = file->private_data;
+	if (!seq)
+		return 0;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+
+	if (iter_priv->tinfo->fini_seq_private)
+		iter_priv->tinfo->fini_seq_private(seq->private);
+
+	bpf_prog_put(iter_priv->prog);
+	seq->private = iter_priv;
+
+	return seq_release_private(inode, file);
+}
+
+static const struct file_operations bpf_iter_fops = {
+	.llseek		= no_llseek,
+	.read		= bpf_seq_read,
+	.release	= iter_release,
+};
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
 {
 	struct bpf_iter_target_info *tinfo;
@@ -304,3 +344,92 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	return bpf_link_settle(&link_primer);
 }
+
+static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
+			  struct bpf_iter_target_info *tinfo,
+			  struct bpf_prog *prog)
+{
+	priv_data->tinfo = tinfo;
+	priv_data->prog = prog;
+	priv_data->session_id = atomic64_inc_return(&session_id);
+	priv_data->seq_num = 0;
+	priv_data->done_stop = false;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+{
+	struct bpf_iter_priv_data *priv_data;
+	struct bpf_iter_target_info *tinfo;
+	struct bpf_prog *prog;
+	u32 total_priv_dsize;
+	struct seq_file *seq;
+	int err = 0;
+
+	mutex_lock(&link_mutex);
+	prog = link->link.prog;
+	bpf_prog_inc(prog);
+	mutex_unlock(&link_mutex);
+
+	tinfo = link->tinfo;
+	total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
+			   tinfo->seq_priv_size;
+	priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize);
+	if (!priv_data) {
+		err = -ENOMEM;
+		goto release_prog;
+	}
+
+	if (tinfo->init_seq_private) {
+		err = tinfo->init_seq_private(priv_data->target_private);
+		if (err)
+			goto release_seq_file;
+	}
+
+	init_seq_meta(priv_data, tinfo, prog);
+	seq = file->private_data;
+	seq->private = priv_data->target_private;
+
+	return 0;
+
+release_seq_file:
+	seq_release_private(file->f_inode, file);
+	file->private_data = NULL;
+release_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+	struct file *file;
+	unsigned int flags;
+	int err, fd;
+
+	if (link->ops != &bpf_iter_link_lops)
+		return -EINVAL;
+
+	flags = O_RDONLY | O_CLOEXEC;
+	fd = get_unused_fd_flags(flags);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto free_fd;
+	}
+
+	err = prepare_seq_file(file,
+			       container_of(link, struct bpf_iter_link, link));
+	if (err)
+		goto free_file;
+
+	fd_install(fd, file);
+	return fd;
+
+free_file:
+	fput(file);
+free_fd:
+	put_unused_fd(fd);
+	return err;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6ffe2d8fb6c7..a293e88ee01a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3941,6 +3941,29 @@ static int bpf_enable_stats(union bpf_attr *attr)
 	return -EINVAL;
 }
 
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+	struct bpf_link *link;
+	int err;
+
+	if (CHECK_ATTR(BPF_ITER_CREATE))
+		return -EINVAL;
+
+	if (attr->iter_create.flags)
+		return -EINVAL;
+
+	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+
+	err = bpf_iter_new_fd(link);
+	bpf_link_put(link);
+
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_ENABLE_STATS:
 		err = bpf_enable_stats(&attr);
 		break;
+	case BPF_ITER_CREATE:
+		err = bpf_iter_create(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2bf33979f9ae..97ceb0f2e539 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -614,6 +615,11 @@ union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
2.24.1



* [PATCH bpf-next v3 07/21] bpf: create file bpf iterator
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (5 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 06/21] bpf: create anonymous " Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators Yonghong Song
                   ` (13 subsequent siblings)
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

To produce a file based bpf iterator, the fd must
correspond to a link_fd associated with a
tracing/iter program. When the pinned file is
opened, a seq_file will be generated.
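
As an illustration (not part of this patch), pinning the link and
reading the resulting file could look roughly as below:

  union bpf_attr attr = {};

  attr.pathname = (__u64)(unsigned long)"/sys/fs/bpf/my_iter";
  attr.bpf_fd   = link_fd;
  err = syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
  /* afterwards: cat /sys/fs/bpf/my_iter */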

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/bpf_iter.c | 17 ++++++++++++++++-
 kernel/bpf/inode.c    |  5 ++++-
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 80b1b9d8a638..b06653ab3476 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1022,6 +1022,7 @@ static inline void bpf_enable_instrumentation(void)
 
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
+extern const struct file_operations bpf_iter_fops;
 
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
@@ -1145,6 +1146,7 @@ void bpf_iter_unreg_target(const char *target);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_iter_new_fd(struct bpf_link *link);
+bool bpf_link_is_iter(struct bpf_link *link);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 917df4c69966..b0b8726c08f8 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -39,6 +39,8 @@ static DEFINE_MUTEX(link_mutex);
 /* incremented on every opened seq_file */
 static atomic64_t session_id;
 
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -157,6 +159,13 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	return copied;
 }
 
+static int iter_open(struct inode *inode, struct file *file)
+{
+	struct bpf_iter_link *link = inode->i_private;
+
+	return prepare_seq_file(file, link);
+}
+
 static int iter_release(struct inode *inode, struct file *file)
 {
 	struct bpf_iter_priv_data *iter_priv;
@@ -178,7 +187,8 @@ static int iter_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static const struct file_operations bpf_iter_fops = {
+const struct file_operations bpf_iter_fops = {
+	.open		= iter_open,
 	.llseek		= no_llseek,
 	.read		= bpf_seq_read,
 	.release	= iter_release,
@@ -305,6 +315,11 @@ static const struct bpf_link_ops bpf_iter_link_lops = {
 	.update_prog = bpf_iter_link_replace,
 };
 
+bool bpf_link_is_iter(struct bpf_link *link)
+{
+	return link->ops == &bpf_iter_link_lops;
+}
+
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_link_primer link_primer;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 95087d9f4ed3..fb878ba3f22f 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 
 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
 {
+	struct bpf_link *link = arg;
+
 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
-			     &bpffs_obj_fops);
+			     bpf_link_is_iter(link) ?
+			     &bpf_iter_fops : &bpffs_obj_fops);
 }
 
 static struct dentry *
-- 
2.24.1



* [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (6 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 07/21] bpf: create file " Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:07   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 09/21] bpf: add bpf_map iterator Yonghong Song
                   ` (12 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Macro DEFINE_BPF_ITER_FUNC is implemented so a target
can define an init function to capture the BTF type
which represents the target.
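
For a hypothetical "foo" target, for example,

  DEFINE_BPF_ITER_FUNC(foo, struct bpf_iter_meta *meta, struct foo *f)

expands to the following no-op function, whose BTF type is what a
bpf_iter program's attach_btf_id refers to:

  extern int __bpf_iter__foo(struct bpf_iter_meta *meta, struct foo *f);
  int __init __bpf_iter__foo(struct bpf_iter_meta *meta, struct foo *f)
  {
          return 0;
  }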

bpf_iter_meta is a structure holding metadata common
to all targets; it is visible to the bpf program.

Additional marker functions are called before or after the
bpf_seq_read() show()/next()/stop() callback functions
to help calculate a precise seq_num and decide whether to call
the bpf program inside stop().

Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
are implemented so a target can get the needed information from
the bpf_iter infrastructure and run the bpf program.
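
As an illustration (not part of this patch), a target's
seq_ops->show() would typically use them roughly as below, mirroring
the bpf_map target added in the next patch; struct bpf_iter__foo is
the hypothetical context defined via DEFINE_BPF_ITER_FUNC:

  static int foo_seq_show(struct seq_file *seq, void *v)
  {
          struct bpf_iter_meta meta;
          struct bpf_iter__foo ctx;
          struct bpf_prog *prog;
          int ret = 0;

          meta.seq = seq;
          ctx.meta = &meta;
          ctx.foo  = v;
          /* pass in_stop == true when called from seq_ops->stop() */
          prog = bpf_iter_get_info(&meta, false);
          if (prog)
                  ret = bpf_iter_run_prog(prog, &ctx);

          return ret;
  }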

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   | 11 ++++++
 kernel/bpf/bpf_iter.c | 86 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b06653ab3476..ffe0b9b669bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1129,6 +1129,9 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
 int bpf_obj_get_user(const char __user *pathname, int flags);
 
 #define BPF_ITER_FUNC_PREFIX "__bpf_iter__"
+#define DEFINE_BPF_ITER_FUNC(target, args...)			\
+	extern int __bpf_iter__ ## target(args);		\
+	int __init __bpf_iter__ ## target(args) { return 0; }
 
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
@@ -1141,12 +1144,20 @@ struct bpf_iter_reg {
 	u32 seq_priv_size;
 };
 
+struct bpf_iter_meta {
+	__bpf_md_ptr(struct seq_file *, seq);
+	u64 session_id;
+	u64 seq_num;
+};
+
 int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
 void bpf_iter_unreg_target(const char *target);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_iter_new_fd(struct bpf_link *link);
 bool bpf_link_is_iter(struct bpf_link *link);
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b0b8726c08f8..bf4fb7d7267b 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -41,6 +41,33 @@ static atomic64_t session_id;
 
 static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
 
+static void bpf_iter_inc_seq_num(struct seq_file *seq)
+{
+	struct bpf_iter_priv_data *iter_priv;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+	iter_priv->seq_num++;
+}
+
+static void bpf_iter_dec_seq_num(struct seq_file *seq)
+{
+	struct bpf_iter_priv_data *iter_priv;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+	iter_priv->seq_num--;
+}
+
+static void bpf_iter_done_stop(struct seq_file *seq)
+{
+	struct bpf_iter_priv_data *iter_priv;
+
+	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+				 target_private);
+	iter_priv->done_stop = true;
+}
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -87,6 +114,10 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 
 	err = seq->op->show(seq, p);
 	if (err > 0) {
+		/* object is skipped, decrease seq_num, so next
+		 * valid object can reuse the same seq_num.
+		 */
+		bpf_iter_dec_seq_num(seq);
 		seq->count = 0;
 	} else if (err < 0 || seq_has_overflowed(seq)) {
 		if (!err)
@@ -112,11 +143,16 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 			err = PTR_ERR(p);
 			break;
 		}
+
+		/* get a valid next object, increase seq_num */
+		bpf_iter_inc_seq_num(seq);
+
 		if (seq->count >= size)
 			break;
 
 		err = seq->op->show(seq, p);
 		if (err > 0) {
+			bpf_iter_dec_seq_num(seq);
 			seq->count = offs;
 		} else if (err < 0 || seq_has_overflowed(seq)) {
 			seq->count = offs;
@@ -133,11 +169,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	offs = seq->count;
 	/* bpf program called if !p */
 	seq->op->stop(seq, p);
-	if (!p && seq_has_overflowed(seq)) {
-		seq->count = offs;
-		if (offs == 0) {
-			err = -E2BIG;
-			goto done;
+	if (!p) {
+		if (!seq_has_overflowed(seq)) {
+			bpf_iter_done_stop(seq);
+		} else {
+			seq->count = offs;
+			if (offs == 0) {
+				err = -E2BIG;
+				goto done;
+			}
 		}
 	}
 
@@ -448,3 +488,39 @@ int bpf_iter_new_fd(struct bpf_link *link)
 	put_unused_fd(fd);
 	return err;
 }
+
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
+{
+	struct bpf_iter_priv_data *iter_priv;
+	struct seq_file *seq;
+	void *seq_priv;
+
+	seq = meta->seq;
+	if (seq->file->f_op != &bpf_iter_fops)
+		return NULL;
+
+	seq_priv = seq->private;
+	iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
+				 target_private);
+
+	if (in_stop && iter_priv->done_stop)
+		return NULL;
+
+	meta->session_id = iter_priv->session_id;
+	meta->seq_num = iter_priv->seq_num;
+
+	return iter_priv->prog;
+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+	int ret;
+
+	rcu_read_lock();
+	migrate_disable();
+	ret = BPF_PROG_RUN(prog, ctx);
+	migrate_enable();
+	rcu_read_unlock();
+
+	return ret == 0 ? 0 : -EAGAIN;
+}
-- 
2.24.1



* [PATCH bpf-next v3 09/21] bpf: add bpf_map iterator
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (7 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 10/21] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
                   ` (11 subsequent siblings)
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Implement seq_file operations to traverse all maps.

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/Makefile   |  2 +-
 kernel/bpf/map_iter.c | 97 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c  | 19 +++++++++
 4 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/map_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ffe0b9b669bf..363ab0751967 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1082,6 +1082,7 @@ int  generic_map_update_batch(struct bpf_map *map,
 int  generic_map_delete_batch(struct bpf_map *map,
 			      const union bpf_attr *attr,
 			      union bpf_attr __user *uattr);
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a8b0febd3f6..b2b5eefc5254 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
new file mode 100644
index 000000000000..8162e0c00b9f
--- /dev/null
+++ b/kernel/bpf/map_iter.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+
+struct bpf_iter_seq_map_info {
+	u32 mid;
+};
+
+static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+
+	map = bpf_map_get_curr_or_next(&info->mid);
+	if (!map)
+		return NULL;
+
+	++*pos;
+	return map;
+}
+
+static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+
+	++*pos;
+	++info->mid;
+	bpf_map_put((struct bpf_map *)v);
+	map = bpf_map_get_curr_or_next(&info->mid);
+	if (!map)
+		return NULL;
+
+	return map;
+}
+
+struct bpf_iter__bpf_map {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_map *, map);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map)
+
+static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter__bpf_map ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ctx.meta = &meta;
+	ctx.map = v;
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret;
+}
+
+static int bpf_map_seq_show(struct seq_file *seq, void *v)
+{
+	return __bpf_map_seq_show(seq, v, false);
+}
+
+static void bpf_map_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)__bpf_map_seq_show(seq, v, true);
+	else
+		bpf_map_put((struct bpf_map *)v);
+}
+
+static const struct seq_operations bpf_map_seq_ops = {
+	.start	= bpf_map_seq_start,
+	.next	= bpf_map_seq_next,
+	.stop	= bpf_map_seq_stop,
+	.show	= bpf_map_seq_show,
+};
+
+static int __init bpf_map_iter_init(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "bpf_map",
+		.seq_ops		= &bpf_map_seq_ops,
+		.init_seq_private	= NULL,
+		.fini_seq_private	= NULL,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_map_info),
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+
+late_initcall(bpf_map_iter_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a293e88ee01a..de2a75500233 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2934,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	return err;
 }
 
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+	struct bpf_map *map;
+
+	spin_lock_bh(&map_idr_lock);
+again:
+	map = idr_get_next(&map_idr, id);
+	if (map) {
+		map = __bpf_map_inc_not_zero(map, false);
+		if (IS_ERR(map)) {
+			(*id)++;
+			goto again;
+		}
+	}
+	spin_unlock_bh(&map_idr_lock);
+
+	return map;
+}
+
 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
 
 struct bpf_prog *bpf_prog_by_id(u32 id)
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 10/21] net: bpf: add netlink and ipv6_route bpf_iter targets
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (8 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 09/21] bpf: add bpf_map iterator Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:17   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 11/21] bpf: add task and task/file iterator targets Yonghong Song
                   ` (10 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

This patch adds netlink and ipv6_route targets, reusing
the same seq_ops (except show() and minor changes for stop())
as /proc/net/{netlink,ipv6_route}.

The net namespace for these targets is the current net
namespace at file open time; similar to
/proc/net/{netlink,ipv6_route}, the net namespace is
reference counted at seq_file open time.

Since modules are not supported for now, ipv6_route is
supported only if IPV6 is built in, i.e., not compiled
as a module. The restriction can be lifted once modules
are properly supported for bpf_iter.

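As a rough sketch (assuming the bpf_iter__netlink context
defined in this patch and the BPF_SEQ_PRINTF macro added later
in this series), a program for the netlink target could look
like:

  SEC("iter/netlink")
  int dump_netlink(struct bpf_iter__netlink *ctx)
  {
  	struct netlink_sock *nlk = ctx->sk;

  	/* sk is NULL once all sockets have been traversed */
  	if (!nlk)
  		return 0;

  	BPF_SEQ_PRINTF(ctx->meta->seq, "portid %u\n", nlk->portid);
  	return 0;
  }
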
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 fs/proc/proc_net.c       | 19 +++++++++
 include/linux/proc_fs.h  |  3 ++
 net/ipv6/ip6_fib.c       | 65 +++++++++++++++++++++++++++++-
 net/ipv6/route.c         | 37 +++++++++++++++++
 net/netlink/af_netlink.c | 87 +++++++++++++++++++++++++++++++++++++++-
 5 files changed, 207 insertions(+), 4 deletions(-)

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4888c5224442..dba63b2429f0 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = {
 	.proc_release	= seq_release_net,
 };
 
+int bpf_iter_init_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+	struct seq_net_private *p = priv_data;
+
+	p->net = get_net(current->nsproxy->net_ns);
+#endif
+	return 0;
+}
+
+void bpf_iter_fini_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+	struct seq_net_private *p = priv_data;
+
+	put_net(p->net);
+#endif
+}
+
 struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
 		struct proc_dir_entry *parent, const struct seq_operations *ops,
 		unsigned int state_size, void *data)
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 45c05fd9c99d..03953c59807d 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    void *data);
 extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
+extern int bpf_iter_init_seq_net(void *priv_data);
+extern void bpf_iter_fini_seq_net(void *priv_data);
+
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 /*
  * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 46ed56719476..a1fcc0ca21af 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void)
 }
 
 #ifdef CONFIG_PROC_FS
-static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
 {
 	struct fib6_info *rt = v;
 	struct ipv6_route_iter *iter = seq->private;
@@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
 	return w->node && !(w->state == FWS_U && w->node == w->root);
 }
 
-static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
 	__releases(RCU_BH)
 {
 	struct net *net = seq_file_net(seq);
@@ -2637,6 +2637,67 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
 	rcu_read_unlock_bh();
 }
 
+#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_iter__ipv6_route {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct fib6_info *, rt);
+};
+
+static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
+				    struct bpf_iter_meta *meta,
+				    void *v)
+{
+	struct bpf_iter__ipv6_route ctx;
+
+	ctx.meta = meta;
+	ctx.rt = v;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+	struct ipv6_route_iter *iter = seq->private;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	if (!prog)
+		return ipv6_route_native_seq_show(seq, v);
+
+	ret = ipv6_route_prog_seq_show(prog, &meta, v);
+	iter->w.leaf = NULL;
+
+	return ret;
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			(void)ipv6_route_prog_seq_show(prog, &meta, v);
+	}
+
+	ipv6_route_native_seq_stop(seq, v);
+}
+#else
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+	return ipv6_route_native_seq_show(seq, v);
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+	ipv6_route_native_seq_stop(seq, v);
+}
+#endif
+
 const struct seq_operations ipv6_route_seq_ops = {
 	.start	= ipv6_route_seq_start,
 	.next	= ipv6_route_seq_next,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3912aac7854d..25f6d3e619d0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6393,6 +6393,30 @@ void __init ip6_route_init_special_entries(void)
   #endif
 }
 
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
+
+static int __init bpf_iter_register(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "ipv6_route",
+		.seq_ops		= &ipv6_route_seq_ops,
+		.init_seq_private	= bpf_iter_init_seq_net,
+		.fini_seq_private	= bpf_iter_fini_seq_net,
+		.seq_priv_size		= sizeof(struct ipv6_route_iter),
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+
+static void bpf_iter_unregister(void)
+{
+	bpf_iter_unreg_target("ipv6_route");
+}
+#endif
+#endif
+
 int __init ip6_route_init(void)
 {
 	int ret;
@@ -6455,6 +6479,14 @@ int __init ip6_route_init(void)
 	if (ret)
 		goto out_register_late_subsys;
 
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	ret = bpf_iter_register();
+	if (ret)
+		goto out_register_late_subsys;
+#endif
+#endif
+
 	for_each_possible_cpu(cpu) {
 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
 
@@ -6487,6 +6519,11 @@ int __init ip6_route_init(void)
 
 void ip6_route_cleanup(void)
 {
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	bpf_iter_unregister();
+#endif
+#endif
 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
 	unregister_pernet_subsys(&ip6_route_net_late_ops);
 	fib6_rules_cleanup();
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5ded01ca8b20..33cda9baa979 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	return __netlink_seq_next(seq);
 }
 
-static void netlink_seq_stop(struct seq_file *seq, void *v)
+static void netlink_native_seq_stop(struct seq_file *seq, void *v)
 {
 	struct nl_seq_iter *iter = seq->private;
 
@@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v)
 }
 
 
-static int netlink_seq_show(struct seq_file *seq, void *v)
+static int netlink_native_seq_show(struct seq_file *seq, void *v)
 {
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__netlink {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct netlink_sock *, sk);
+};
+
+DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)
+
+static int netlink_prog_seq_show(struct bpf_prog *prog,
+				  struct bpf_iter_meta *meta,
+				  void *v)
+{
+	struct bpf_iter__netlink ctx;
+
+	meta->seq_num--;  /* skip SEQ_START_TOKEN */
+	ctx.meta = meta;
+	ctx.sk = nlk_sk((struct sock *)v);
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int netlink_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	if (!prog)
+		return netlink_native_seq_show(seq, v);
+
+	if (v != SEQ_START_TOKEN)
+		return netlink_prog_seq_show(prog, &meta, v);
+
+	return 0;
+}
+
+static void netlink_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			(void)netlink_prog_seq_show(prog, &meta, v);
+	}
+
+	netlink_native_seq_stop(seq, v);
+}
+#else
+static int netlink_seq_show(struct seq_file *seq, void *v)
+{
+	return netlink_native_seq_show(seq, v);
+}
+
+static void netlink_seq_stop(struct seq_file *seq, void *v)
+{
+	netlink_native_seq_stop(seq, v);
+}
+#endif
+
 static const struct seq_operations netlink_seq_ops = {
 	.start  = netlink_seq_start,
 	.next   = netlink_seq_next,
@@ -2740,6 +2802,21 @@ static const struct rhashtable_params netlink_rhashtable_params = {
 	.automatic_shrinking = true,
 };
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+static int __init bpf_iter_register(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "netlink",
+		.seq_ops		= &netlink_seq_ops,
+		.init_seq_private	= bpf_iter_init_seq_net,
+		.fini_seq_private	= bpf_iter_fini_seq_net,
+		.seq_priv_size		= sizeof(struct nl_seq_iter),
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+#endif
+
 static int __init netlink_proto_init(void)
 {
 	int i;
@@ -2748,6 +2825,12 @@ static int __init netlink_proto_init(void)
 	if (err != 0)
 		goto out;
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	err = bpf_iter_register();
+	if (err)
+		goto out;
+#endif
+
 	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));
 
 	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 11/21] bpf: add task and task/file iterator targets
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (9 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 10/21] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:36   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 12/21] bpf: add PTR_TO_BTF_ID_OR_NULL support Yonghong Song
                   ` (9 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Only the tasks belonging to the "current" pid namespace
are enumerated.

For the task/file target, the bpf program will have access to
  struct task_struct *task
  u32 fd
  struct file *file
where fd/file is an open file for the task.

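For illustration (a minimal sketch; BPF_SEQ_PRINTF comes from a
later patch in this series), a task/file program could look
like:

  SEC("iter/task_file")
  int dump_task_file(struct bpf_iter__task_file *ctx)
  {
  	struct seq_file *seq = ctx->meta->seq;
  	struct task_struct *task = ctx->task;
  	struct file *file = ctx->file;

  	/* task and file are NULL once iteration is done */
  	if (!task || !file)
  		return 0;

  	BPF_SEQ_PRINTF(seq, "%8d %8d %lx\n", task->tgid, ctx->fd,
  		       (long)file->f_op);
  	return 0;
  }
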
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/Makefile    |   2 +-
 kernel/bpf/task_iter.c | 332 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 333 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/task_iter.c

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index b2b5eefc5254..37b2d8620153 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..21dd794bf5d9
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+struct bpf_iter_seq_task_common {
+	struct pid_namespace *ns;
+};
+
+struct bpf_iter_seq_task_info {
+	/* The first field must be struct bpf_iter_seq_task_common.
+	 * this is assumed by {init, fini}_seq_pidns() callback functions.
+	 */
+	struct bpf_iter_seq_task_common common;
+	u32 tid;
+};
+
+static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+					     u32 *tid)
+{
+	struct task_struct *task = NULL;
+	struct pid *pid;
+
+	rcu_read_lock();
+	pid = idr_get_next(&ns->idr, tid);
+	if (pid)
+		task = get_pid_task(pid, PIDTYPE_PID);
+	rcu_read_unlock();
+
+	return task;
+}
+
+static void *task_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *task;
+
+	task = task_seq_get_next(info->common.ns, &info->tid);
+	if (!task)
+		return NULL;
+
+	++*pos;
+	return task;
+}
+
+static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_info *info = seq->private;
+	struct task_struct *task;
+
+	++*pos;
+	++info->tid;
+	put_task_struct((struct task_struct *)v);
+	task = task_seq_get_next(info->common.ns, &info->tid);
+	if (!task)
+		return NULL;
+
+	return task;
+}
+
+struct bpf_iter__task {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+};
+
+DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
+
+static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__task ctx;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	meta.seq = seq;
+	ctx.meta = &meta;
+	ctx.task = v;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_seq_show(seq, v, false);
+}
+
+static void task_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)__task_seq_show(seq, v, true);
+	else
+		put_task_struct((struct task_struct *)v);
+}
+
+static const struct seq_operations task_seq_ops = {
+	.start	= task_seq_start,
+	.next	= task_seq_next,
+	.stop	= task_seq_stop,
+	.show	= task_seq_show,
+};
+
+struct bpf_iter_seq_task_file_info {
+	/* The first field must be struct bpf_iter_seq_task_common.
+	 * this is assumed by {init, fini}_seq_pidns() callback functions.
+	 */
+	struct bpf_iter_seq_task_common common;
+	struct task_struct *task;
+	struct files_struct *files;
+	u32 tid;
+	u32 fd;
+};
+
+static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *tid,
+					   int *fd, struct task_struct **task,
+					   struct files_struct **fstruct)
+{
+	struct files_struct *curr_files;
+	struct task_struct *curr_task;
+	u32 curr_tid = *tid, max_fds;
+	int curr_fd = *fd;
+
+	/* If this function returns a non-NULL file object,
+	 * it holds a reference to the task/files_struct/file.
+	 * Otherwise, it does not hold any reference.
+	 */
+again:
+	if (*task) {
+		curr_task = *task;
+		curr_files = *fstruct;
+		curr_fd = *fd;
+	} else {
+		curr_task = task_seq_get_next(ns, &curr_tid);
+		if (!curr_task)
+			return NULL;
+
+		curr_files = get_files_struct(curr_task);
+		if (!curr_files) {
+			put_task_struct(curr_task);
+			curr_tid = ++(*tid);
+			*fd = 0;
+			goto again;
+		}
+
+		/* set *fstruct, *task and *tid */
+		*fstruct = curr_files;
+		*task = curr_task;
+		if (curr_tid == *tid) {
+			curr_fd = *fd;
+		} else {
+			*tid = curr_tid;
+			curr_fd = 0;
+		}
+	}
+
+	rcu_read_lock();
+	max_fds = files_fdtable(curr_files)->max_fds;
+	for (; curr_fd < max_fds; curr_fd++) {
+		struct file *f;
+
+		f = fcheck_files(curr_files, curr_fd);
+		if (!f)
+			continue;
+
+		/* set *fd */
+		*fd = curr_fd;
+		get_file(f);
+		rcu_read_unlock();
+		return f;
+	}
+
+	/* the current task is done, go to the next task */
+	rcu_read_unlock();
+	put_files_struct(curr_files);
+	put_task_struct(curr_task);
+	*task = NULL;
+	*fstruct = NULL;
+	*fd = 0;
+	curr_tid = ++(*tid);
+	goto again;
+}
+
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct files_struct *files = NULL;
+	struct task_struct *task = NULL;
+	struct file *file;
+
+	file = task_file_seq_get_next(info->common.ns, &info->tid, &info->fd,
+				      &task, &files);
+	if (!file) {
+		info->files = NULL;
+		info->task = NULL;
+		return NULL;
+	}
+
+	++*pos;
+	info->task = task;
+	info->files = files;
+
+	return file;
+}
+
+static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct files_struct *files = info->files;
+	struct task_struct *task = info->task;
+	struct file *file;
+
+	++*pos;
+	++info->fd;
+	fput((struct file *)v);
+	file = task_file_seq_get_next(info->common.ns, &info->tid, &info->fd,
+				      &task, &files);
+	if (!file) {
+		info->files = NULL;
+		info->task = NULL;
+		return NULL;
+	}
+
+	info->task = task;
+	info->files = files;
+
+	return file;
+}
+
+struct bpf_iter__task_file {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+	u32 fd __aligned(8);
+	__bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
+		     struct task_struct *task, u32 fd,
+		     struct file *file)
+
+static int __task_file_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+	struct bpf_iter__task_file ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.task = info->task;
+	ctx.fd = info->fd;
+	ctx.file = v;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_file_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_file_seq_show(seq, v, false);
+}
+
+static void task_file_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_file_info *info = seq->private;
+
+	if (!v) {
+		(void)__task_file_seq_show(seq, v, true);
+	} else {
+		fput((struct file *)v);
+		put_files_struct(info->files);
+		put_task_struct(info->task);
+		info->files = NULL;
+		info->task = NULL;
+	}
+}
+
+static int init_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	common->ns = get_pid_ns(task_active_pid_ns(current));
+	return 0;
+}
+
+static void fini_seq_pidns(void *priv_data)
+{
+	struct bpf_iter_seq_task_common *common = priv_data;
+
+	put_pid_ns(common->ns);
+}
+
+static const struct seq_operations task_file_seq_ops = {
+	.start	= task_file_seq_start,
+	.next	= task_file_seq_next,
+	.stop	= task_file_seq_stop,
+	.show	= task_file_seq_show,
+};
+
+static int __init task_iter_init(void)
+{
+	struct bpf_iter_reg task_file_reg_info = {
+		.target			= "task_file",
+		.seq_ops		= &task_file_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
+	};
+	struct bpf_iter_reg task_reg_info = {
+		.target			= "task",
+		.seq_ops		= &task_seq_ops,
+		.init_seq_private	= init_seq_pidns,
+		.fini_seq_private	= fini_seq_pidns,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
+	};
+	int ret;
+
+	ret = bpf_iter_reg_target(&task_reg_info);
+	if (ret)
+		return ret;
+
+	return bpf_iter_reg_target(&task_file_reg_info);
+}
+late_initcall(task_iter_init);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 12/21] bpf: add PTR_TO_BTF_ID_OR_NULL support
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (10 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 11/21] bpf: add task and task/file iterator targets Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
                   ` (8 subsequent siblings)
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support.
For a tracing/iter program, the bpf program context
definition, e.g., for the previous bpf_map target, looks like
  struct bpf_iter__bpf_map {
    struct bpf_iter_meta *meta;
    struct bpf_map *map;
  };

The kernel guarantees that meta is not NULL, but the map
pointer may be NULL. A NULL map indicates that all
objects have been traversed, so the bpf program can take
proper action, e.g., do final aggregation and/or send
a final report to user space.

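For example (a sketch reusing the bpf_map context above; the
BPF_SEQ_PRINTF macro comes from a later patch and the global
counter relies on libbpf global-variable support), a program
can detect the NULL map and emit a final summary:

  static __u64 total;

  SEC("iter/bpf_map")
  int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
  {
  	struct bpf_map *map = ctx->map;

  	/* NULL map: all objects traversed, print the report */
  	if (!map) {
  		BPF_SEQ_PRINTF(ctx->meta->seq, "total: %llu maps\n",
  			       total);
  		return 0;
  	}

  	total++;
  	return 0;
  }
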
Add btf_id_or_null_non0_off to the prog->aux structure to
indicate that, if the context access offset is not 0, the
register type is set to PTR_TO_BTF_ID_OR_NULL instead of
PTR_TO_BTF_ID. This bit is set for tracing/iter programs.

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/btf.c      |  5 ++++-
 kernel/bpf/verifier.c | 16 ++++++++++++----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 363ab0751967..cf4b6e44f2bc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -320,6 +320,7 @@ enum bpf_reg_type {
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
 	PTR_TO_BTF_ID,		 /* reg points to kernel struct */
+	PTR_TO_BTF_ID_OR_NULL,	 /* reg points to kernel struct or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -658,6 +659,7 @@ struct bpf_prog_aux {
 	bool offload_requested;
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
 	bool func_proto_unreliable;
+	bool btf_id_or_null_non0_off;
 	enum bpf_tramp_prog_type trampoline_prog_type;
 	struct bpf_trampoline *trampoline;
 	struct hlist_node tramp_hlist;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a2cfba89a8e1..c490fbde22d4 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3790,7 +3790,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		return true;
 
 	/* this is a pointer to another type */
-	info->reg_type = PTR_TO_BTF_ID;
+	if (off != 0 && prog->aux->btf_id_or_null_non0_off)
+		info->reg_type = PTR_TO_BTF_ID_OR_NULL;
+	else
+		info->reg_type = PTR_TO_BTF_ID;
 
 	if (tgt_prog) {
 		ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d725ff7d11db..36b2a38a06fe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	return type == PTR_TO_MAP_VALUE_OR_NULL ||
 	       type == PTR_TO_SOCKET_OR_NULL ||
 	       type == PTR_TO_SOCK_COMMON_OR_NULL ||
-	       type == PTR_TO_TCP_SOCK_OR_NULL;
+	       type == PTR_TO_TCP_SOCK_OR_NULL ||
+	       type == PTR_TO_BTF_ID_OR_NULL;
 }
 
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -483,6 +484,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
 	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 	[PTR_TO_BTF_ID]		= "ptr_",
+	[PTR_TO_BTF_ID_OR_NULL]	= "ptr_or_null_",
 };
 
 static char slot_type_char[] = {
@@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			/* reg->off should be 0 for SCALAR_VALUE */
 			verbose(env, "%lld", reg->var_off.value + reg->off);
 		} else {
-			if (t == PTR_TO_BTF_ID)
+			if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
 				verbose(env, "%s", kernel_type_name(reg->btf_id));
 			verbose(env, "(id=%d", reg->id);
 			if (reg_type_may_be_refcounted_or_null(t))
@@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_BTF_ID_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 */
 		*reg_type = info.reg_type;
 
-		if (*reg_type == PTR_TO_BTF_ID)
+		if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
 			*btf_id = info.btf_id;
 		else
 			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
@@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				 * a sub-register.
 				 */
 				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
-				if (reg_type == PTR_TO_BTF_ID)
+				if (reg_type == PTR_TO_BTF_ID ||
+				    reg_type == PTR_TO_BTF_ID_OR_NULL)
 					regs[value_regno].btf_id = btf_id;
 			}
 			regs[value_regno].type = reg_type;
@@ -6572,6 +6576,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			reg->type = PTR_TO_SOCK_COMMON;
 		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
 			reg->type = PTR_TO_TCP_SOCK;
+		} else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
+			reg->type = PTR_TO_BTF_ID;
 		}
 		if (is_null) {
 			/* We don't need id and ref_obj_id from this point
@@ -8429,6 +8435,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_BTF_ID_OR_NULL:
 		return false;
 	default:
 		return true;
@@ -10640,6 +10647,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		prog->aux->attach_func_proto = t;
 		if (!bpf_iter_prog_supported(prog))
 			return -EINVAL;
+		prog->aux->btf_id_or_null_non0_off = true;
 		ret = btf_distill_func_proto(&env->log, btf, t,
 					     tname, &fmodel);
 		return ret;
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (11 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 12/21] bpf: add PTR_TO_BTF_ID_OR_NULL support Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:44   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 14/21] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary Yonghong Song
                   ` (7 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Two helpers, bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.

bpf_seq_printf supports common format string flag/width/type
fields so that at least I can get identical results for the
netlink and ipv6_route targets.

For bpf_seq_printf and bpf_seq_write, the return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if the object collection stays the same. Note that if the object
collection is changed, then depending on how collection traversal
is done, the object may not be visited even if it is still in
the collection.

bpf_seq_printf may return -EBUSY, meaning that the internal
percpu buffer for memory copies of strings or other pointees is
not available. The bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on non-RT kernels since migrate_disable(), which guards
the bpf prog call, calls preempt_disable().

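For example (a minimal sketch), a task iterator program can
pack its to-be-printed values into a u64 array and invoke the
helper directly, retrying on -EBUSY as described above:

  SEC("iter/task")
  int dump_task(struct bpf_iter__task *ctx)
  {
  	static const char fmt[] = "%8d %s\n";
  	struct task_struct *task = ctx->task;
  	__u64 args[2];
  	int ret;

  	if (!task)
  		return 0;

  	args[0] = task->tgid;
  	args[1] = (__u64)(long)task->comm;
  	ret = bpf_seq_printf(ctx->meta->seq, fmt, sizeof(fmt),
  			     args, sizeof(args));
  	/* ask for the same object again if the percpu buffer was busy */
  	return ret == -EBUSY ? 1 : 0;
  }
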
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h       |  32 +++++-
 kernel/trace/bpf_trace.c       | 200 +++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |   2 +
 tools/include/uapi/linux/bpf.h |  32 +++++-
 4 files changed, 264 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97ceb0f2e539..e440a9d5cca2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3076,6 +3076,34 @@ union bpf_attr {
  * 		See: clock_gettime(CLOCK_BOOTTIME)
  * 	Return
  * 		Current *ktime*.
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * 	Description
+ * 		seq_printf uses the seq_file's seq_printf() to print out the format string.
+ * 		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 		the format string itself. The *data* and *data_len* are format string
+ * 		arguments. The *data* is a u64 array and the corresponding format string
+ * 		values are stored in the array. For strings and pointers whose pointees
+ * 		are accessed, only the pointer values are stored in the *data* array.
+ * 		The *data_len* is the *data* size in terms of bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EBUSY**		Percpu memory copy buffer is busy, can try again
+ *					by returning 1 from bpf program.
+ *		* **-EINVAL**		Invalid arguments, or invalid/unsupported formats.
+ *		* **-E2BIG**		Too many format specifiers.
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write uses the seq_file's seq_write() to write the data.
+ * 		The *m* represents the seq_file. The *data* and *len* represent the
+ *		data to write in bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3203,7 +3231,9 @@ union bpf_attr {
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
 	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e875c95d3ced..02721cbaa2f8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -457,6 +457,198 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
+#define MAX_SEQ_PRINTF_VARARGS		12
+#define MAX_SEQ_PRINTF_MAX_MEMCPY	6
+#define MAX_SEQ_PRINTF_STR_LEN		128
+
+struct bpf_seq_printf_buf {
+	char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
+static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
+
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+	   const void *, data, u32, data_len)
+{
+	int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
+	int i, buf_used, copy_size, num_args;
+	u64 params[MAX_SEQ_PRINTF_VARARGS];
+	struct bpf_seq_printf_buf *bufs;
+	const u64 *args = data;
+
+	buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
+	if (WARN_ON_ONCE(buf_used > 1)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	bufs = this_cpu_ptr(&bpf_seq_printf_buf);
+
+	/*
+	 * bpf_check()->check_func_arg()->check_stack_boundary()
+	 * guarantees that fmt points to bpf program stack,
+	 * fmt_size bytes of it were initialized and fmt_size > 0
+	 */
+	if (fmt[--fmt_size] != 0)
+		goto out;
+
+	if (data_len & 7)
+		goto out;
+
+	for (i = 0; i < fmt_size; i++) {
+		if (fmt[i] == '%') {
+			if (fmt[i + 1] == '%')
+				i++;
+			else if (!data || !data_len)
+				goto out;
+		}
+	}
+
+	num_args = data_len / 8;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++) {
+		/* only printable ascii for now. */
+		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+			goto out;
+
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt[i + 1] == '%') {
+			i++;
+			continue;
+		}
+
+		if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
+			err = -E2BIG;
+			goto out;
+		}
+
+		if (fmt_cnt >= num_args)
+			goto out;
+
+		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+		i++;
+
+		/* skip optional "[0 +-][num]" width formatting field */
+		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
+		       fmt[i] == ' ')
+			i++;
+		if (fmt[i] >= '1' && fmt[i] <= '9') {
+			i++;
+			while (fmt[i] >= '0' && fmt[i] <= '9')
+				i++;
+		}
+
+		if (fmt[i] == 's') {
+			/* try our best to copy */
+			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+				err = -E2BIG;
+				goto out;
+			}
+
+			bufs->buf[memcpy_cnt][0] = 0;
+			strncpy_from_unsafe(bufs->buf[memcpy_cnt],
+					    (void *) (long) args[fmt_cnt],
+					    MAX_SEQ_PRINTF_STR_LEN);
+			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+			fmt_cnt++;
+			memcpy_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'p') {
+			if (fmt[i + 1] == 0 ||
+			    fmt[i + 1] == 'K' ||
+			    fmt[i + 1] == 'x') {
+				/* just kernel pointers */
+				params[fmt_cnt] = args[fmt_cnt];
+				fmt_cnt++;
+				continue;
+			}
+
+			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+			if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I')
+				goto out;
+			if (fmt[i + 2] != '4' && fmt[i + 2] != '6')
+				goto out;
+
+			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+				err = -E2BIG;
+				goto out;
+			}
+
+
+			copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+
+			probe_kernel_read(bufs->buf[memcpy_cnt],
+					  (void *) (long) args[fmt_cnt], copy_size);
+			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+			i += 2;
+			fmt_cnt++;
+			memcpy_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'l') {
+			i++;
+			if (fmt[i] == 'l')
+				i++;
+		}
+
+		if (fmt[i] != 'i' && fmt[i] != 'd' &&
+		    fmt[i] != 'u' && fmt[i] != 'x')
+			goto out;
+
+		params[fmt_cnt] = args[fmt_cnt];
+		fmt_cnt++;
+	}
+
+	/* At most we can have MAX_SEQ_PRINTF_VARARGS parameters; just give
+	 * all of them to seq_printf().
+	 */
+	seq_printf(m, fmt, params[0], params[1], params[2], params[3],
+		   params[4], params[5], params[6], params[7], params[8],
+		   params[9], params[10], params[11]);
+
+	err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
+out:
+	this_cpu_dec(bpf_seq_printf_buf_used);
+	return err;
+}
+
+static int bpf_seq_printf_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+	.func		= bpf_seq_printf,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
+	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+	.btf_id		= bpf_seq_printf_btf_ids,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
+{
+	return seq_write(m, data, len) ? -EOVERFLOW : 0;
+}
+
+static int bpf_seq_write_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_write_proto = {
+	.func		= bpf_seq_write,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.btf_id		= bpf_seq_write_btf_ids,
+};
+
 static __always_inline int
 get_map_perf_counter(struct bpf_map *map, u64 flags,
 		     u64 *value, u64 *enabled, u64 *running)
@@ -1226,6 +1418,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_xdp_output:
 		return &bpf_xdp_output_proto;
 #endif
+	case BPF_FUNC_seq_printf:
+		return prog->expected_attach_type == BPF_TRACE_ITER ?
+		       &bpf_seq_printf_proto :
+		       NULL;
+	case BPF_FUNC_seq_write:
+		return prog->expected_attach_type == BPF_TRACE_ITER ?
+		       &bpf_seq_write_proto :
+		       NULL;
 	default:
 		return raw_tp_prog_func_proto(func_id, prog);
 	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index f43d193aff3a..ded304c96a05 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -414,6 +414,7 @@ class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -450,6 +451,7 @@ class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97ceb0f2e539..e440a9d5cca2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3076,6 +3076,34 @@ union bpf_attr {
  * 		See: clock_gettime(CLOCK_BOOTTIME)
  * 	Return
  * 		Current *ktime*.
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * 	Description
+ * 		seq_printf uses the seq_file's seq_printf() to print out the format string.
+ * 		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 		the format string itself. The *data* and *data_len* are format string
+ * 		arguments. The *data* is a u64 array and the corresponding format string
+ * 		values are stored in the array. For strings and pointers whose pointees
+ * 		are accessed, only the pointer values are stored in the *data* array.
+ * 		The *data_len* is the *data* size in terms of bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EBUSY**		Percpu memory copy buffer is busy, can try again
+ *					by returning 1 from bpf program.
+ *		* **-EINVAL**		Invalid arguments, or invalid/unsupported formats.
+ *		* **-E2BIG**		Too many format specifiers.
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write uses the seq_file's seq_write() to write the data.
+ * 		The *m* represents the seq_file. The *data* and *len* represent the
+ *		data to write in bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3203,7 +3231,9 @@ union bpf_attr {
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
 	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 14/21] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (12 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 15/21] bpf: support variable length array in tracing programs Yonghong Song
                   ` (6 subsequent siblings)
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

This is specifically to handle a case like the one below:
   // ptr below is a socket ptr identified by PTR_TO_BTF_ID
   u64 param[2] = { ptr, val };
   bpf_seq_printf(seq, fmt, sizeof(fmt), param, sizeof(param));

In this case, the 16 bytes stack for "param" contains:
   8 bytes for ptr with spilled PTR_TO_BTF_ID
   8 bytes for val as STACK_MISC

The current verifier will complain the ptr should not be visible
to the helper.
   ...
   16: (7b) *(u64 *)(r10 -64) = r2
   18: (7b) *(u64 *)(r10 -56) = r1
   19: (bf) r4 = r10
   ;
   20: (07) r4 += -64
   ; BPF_SEQ_PRINTF(seq, fmt1, (long)s, s->sk_protocol);
   21: (bf) r1 = r6
   22: (18) r2 = 0xffffa8d00018605a
   24: (b4) w3 = 10
   25: (b4) w5 = 16
   26: (85) call bpf_seq_printf#125
    R0=inv(id=0) R1_w=ptr_seq_file(id=0,off=0,imm=0)
    R2_w=map_value(id=0,off=90,ks=4,vs=144,imm=0) R3_w=inv10
    R4_w=fp-64 R5_w=inv16 R6=ptr_seq_file(id=0,off=0,imm=0)
    R7=ptr_netlink_sock(id=0,off=0,imm=0) R10=fp0 fp-56_w=mmmmmmmm
    fp-64_w=ptr_
   last_idx 26 first_idx 13
   regs=8 stack=0 before 25: (b4) w5 = 16
   regs=8 stack=0 before 24: (b4) w3 = 10
   invalid indirect read from stack off -64+0 size 16

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 36b2a38a06fe..2a1826c76bb6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3494,6 +3494,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 			*stype = STACK_MISC;
 			goto mark;
 		}
+
+		if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+		    state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+			goto mark;
+
 		if (state->stack[spi].slot_type[0] == STACK_SPILL &&
 		    state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
 			__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 15/21] bpf: support variable length array in tracing programs
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (13 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 14/21] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 16/21] tools/libbpf: add bpf_iter support Yonghong Song
                   ` (5 subsequent siblings)
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

In /proc/net/ipv6_route, we have
  struct fib6_info {
    struct fib6_table *fib6_table;
    ...
    struct fib6_nh fib6_nh[0];
  }
  struct fib6_nh {
    struct fib_nh_common nh_common;
    struct rt6_info **rt6i_pcpu;
    struct rt6_exception_bucket *rt6i_exception_bucket;
  };
  struct fib_nh_common {
    ...
    u8 nhc_gw_family;
    ...
  }

The access:
  struct fib6_nh *fib6_nh = &rt->fib6_nh;
  ... fib6_nh->nh_common.nhc_gw_family ...

This patch ensures such an access is handled properly.

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/btf.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c490fbde22d4..dcd233139294 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3833,6 +3833,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
 	const struct btf_type *mtype, *elem_type = NULL;
 	const struct btf_member *member;
 	const char *tname, *mname;
+	u32 vlen;
 
 again:
 	tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
@@ -3841,7 +3842,43 @@ int btf_struct_access(struct bpf_verifier_log *log,
 		return -EINVAL;
 	}
 
+	vlen = btf_type_vlen(t);
 	if (off + size > t->size) {
+		/* If the last element is a variable size array, we may
+		 * need to relax the rule.
+		 */
+		struct btf_array *array_elem;
+
+		if (vlen == 0)
+			goto error;
+
+		member = btf_type_member(t) + vlen - 1;
+		mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
+						NULL);
+		if (!btf_type_is_array(mtype))
+			goto error;
+
+		array_elem = (struct btf_array *)(mtype + 1);
+		if (array_elem->nelems != 0)
+			goto error;
+
+		moff = btf_member_bit_offset(t, member) / 8;
+		if (off < moff)
+			goto error;
+
+		/* Only allow structure for now, can be relaxed for
+		 * other types later.
+		 */
+		elem_type = btf_type_skip_modifiers(btf_vmlinux,
+						    array_elem->type, NULL);
+		if (!btf_type_is_struct(elem_type))
+			goto error;
+
+		off = (off - moff) % elem_type->size;
+		return btf_struct_access(log, elem_type, off, size, atype,
+					 next_btf_id);
+
+error:
 		bpf_log(log, "access beyond struct %s at off %u size %u\n",
 			tname, off, size);
 		return -EACCES;
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 16/21] tools/libbpf: add bpf_iter support
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (14 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 15/21] bpf: support variable length array in tracing programs Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:46   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 17/21] tools/libbpf: add offsetof/container_of macro in bpf_helpers.h Yonghong Song
                   ` (4 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Two new libbpf APIs are added to support bpf_iter:
  - bpf_program__attach_iter
    Given a bpf program and additional parameters (none for
    now), returns a bpf_link.
  - bpf_iter_create
    syscall-level API to create a bpf iterator.

The macro BPF_SEQ_PRINTF is also introduced. The format
looks like:
  BPF_SEQ_PRINTF(seq, "task id %d\n", pid);

This macro gives bpf program writers a nicer bpf_seq_printf
syntax, similar to the kernel one.

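A rough usage sketch (error handling mostly elided; assumes a
struct bpf_program *prog that has already been loaded):

  struct bpf_link *link;
  char buf[4096];
  int iter_fd;
  ssize_t n;

  link = bpf_program__attach_iter(prog, NULL);
  if (libbpf_get_error(link))
  	return -1;

  /* instantiate an anonymous iterator from the link */
  iter_fd = bpf_iter_create(bpf_link__fd(link));

  /* each read() invokes the bpf program over kernel objects */
  while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
  	fwrite(buf, 1, n, stdout);

  close(iter_fd);
  bpf_link__destroy(link);
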
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/bpf.c         | 10 +++++++
 tools/lib/bpf/bpf.h         |  2 ++
 tools/lib/bpf/bpf_tracing.h | 16 ++++++++++++
 tools/lib/bpf/libbpf.c      | 52 +++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.h      |  9 +++++++
 tools/lib/bpf/libbpf.map    |  2 ++
 6 files changed, 91 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 43322f0d6c7f..a7329b671c41 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -619,6 +619,16 @@ int bpf_link_update(int link_fd, int new_prog_fd,
 	return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
 }
 
+int bpf_iter_create(int link_fd)
+{
+	union bpf_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.iter_create.link_fd = link_fd;
+
+	return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
+}
+
 int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt)
 {
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 1901b2777854..1b6015b21ba8 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -187,6 +187,8 @@ struct bpf_link_update_opts {
 LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd,
 			       const struct bpf_link_update_opts *opts);
 
+LIBBPF_API int bpf_iter_create(int link_fd);
+
 struct bpf_prog_test_run_attr {
 	int prog_fd;
 	int repeat;
diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index f3f3c3fb98cb..cf97d07692b4 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -413,4 +413,20 @@ typeof(name(0)) name(struct pt_regs *ctx)				    \
 }									    \
 static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
 
+/*
+ * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values
+ * in a structure.
+ */
+#define BPF_SEQ_PRINTF(seq, fmt, args...)				    \
+	({								    \
+		_Pragma("GCC diagnostic push")				    \
+		_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")	    \
+		static const char ___fmt[] = fmt;			    \
+		unsigned long long ___param[] = { args };		    \
+		_Pragma("GCC diagnostic pop")				    \
+		int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt),    \
+					    ___param, sizeof(___param));    \
+		___ret;							    \
+	})
+
 #endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 977add1b73e2..6c2f46908f4d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6586,6 +6586,8 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec,
 				     struct bpf_program *prog);
 static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec,
 				   struct bpf_program *prog);
+static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
+				    struct bpf_program *prog);
 
 static const struct bpf_sec_def section_defs[] = {
 	BPF_PROG_SEC("socket",			BPF_PROG_TYPE_SOCKET_FILTER),
@@ -6629,6 +6631,10 @@ static const struct bpf_sec_def section_defs[] = {
 		.is_attach_btf = true,
 		.expected_attach_type = BPF_LSM_MAC,
 		.attach_fn = attach_lsm),
+	SEC_DEF("iter/", TRACING,
+		.expected_attach_type = BPF_TRACE_ITER,
+		.is_attach_btf = true,
+		.attach_fn = attach_iter),
 	BPF_PROG_SEC("xdp",			BPF_PROG_TYPE_XDP),
 	BPF_PROG_SEC("perf_event",		BPF_PROG_TYPE_PERF_EVENT),
 	BPF_PROG_SEC("lwt_in",			BPF_PROG_TYPE_LWT_IN),
@@ -6891,6 +6897,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
 
 #define BTF_TRACE_PREFIX "btf_trace_"
 #define BTF_LSM_PREFIX "bpf_lsm_"
+#define BTF_ITER_PREFIX "__bpf_iter__"
 #define BTF_MAX_NAME_SIZE 128
 
 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
@@ -6921,6 +6928,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name,
 	else if (attach_type == BPF_LSM_MAC)
 		err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name,
 					      BTF_KIND_FUNC);
+	else if (attach_type == BPF_TRACE_ITER)
+		err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name,
+					      BTF_KIND_FUNC);
 	else
 		err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
 
@@ -7848,6 +7858,12 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec,
 	return bpf_program__attach_lsm(prog);
 }
 
+static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
+				    struct bpf_program *prog)
+{
+	return bpf_program__attach_iter(prog, NULL);
+}
+
 struct bpf_link *
 bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
 {
@@ -7882,6 +7898,42 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
 	return link;
 }
 
+struct bpf_link *
+bpf_program__attach_iter(struct bpf_program *prog,
+			 const struct bpf_iter_attach_opts *opts)
+{
+	char errmsg[STRERR_BUFSIZE];
+	struct bpf_link *link;
+	int prog_fd, link_fd;
+
+	if (!OPTS_VALID(opts, bpf_iter_attach_opts))
+		return ERR_PTR(-EINVAL);
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		pr_warn("program '%s': can't attach before loaded\n",
+			bpf_program__title(prog, false));
+		return ERR_PTR(-EINVAL);
+	}
+
+	link = calloc(1, sizeof(*link));
+	if (!link)
+		return ERR_PTR(-ENOMEM);
+	link->detach = &bpf_link__detach_fd;
+
+	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL);
+	if (link_fd < 0) {
+		link_fd = -errno;
+		free(link);
+		pr_warn("program '%s': failed to attach to iterator: %s\n",
+			bpf_program__title(prog, false),
+			libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
+		return ERR_PTR(link_fd);
+	}
+	link->fd = link_fd;
+	return link;
+}
+
 struct bpf_link *bpf_program__attach(struct bpf_program *prog)
 {
 	const struct bpf_sec_def *sec_def;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index f1dacecb1619..8ea69558f0a8 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -258,6 +258,15 @@ struct bpf_map;
 
 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map);
 
+struct bpf_iter_attach_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+};
+#define bpf_iter_attach_opts__last_field sz
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_iter(struct bpf_program *prog,
+			 const struct bpf_iter_attach_opts *opts);
+
 struct bpf_insn;
 
 /*
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index e03bd4db827e..0133d469d30b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -258,6 +258,8 @@ LIBBPF_0.0.8 {
 LIBBPF_0.0.9 {
 	global:
 		bpf_enable_stats;
+		bpf_iter_create;
 		bpf_link_get_fd_by_id;
 		bpf_link_get_next_id;
+		bpf_program__attach_iter;
 } LIBBPF_0.0.8;
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [PATCH bpf-next v3 17/21] tools/libbpf: add offsetof/container_of macro in bpf_helpers.h
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (15 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 16/21] tools/libbpf: add bpf_iter support Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:48   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 18/21] tools/bpftool: add bpf_iter support for bpftool Yonghong Song
                   ` (3 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

These two macros will be used later in the bpf_iter bpf program
bpf_iter_netlink.c. Put them in bpf_helpers.h since they could
be useful in other cases as well.
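
As an illustration, here is a minimal sketch of how the two macros
compose; the struct and names below are made up for the example and
assume the macros above are in scope:

  struct request {
  	int id;
  	char payload[16];
  };

  static void example(void)
  {
  	struct request req;

  	/* byte offset of 'payload' within struct request */
  	unsigned long off = offsetof(struct request, payload);

  	/* recover the enclosing struct from a pointer to its member */
  	struct request *r = container_of(req.payload, struct request, payload);

  	/* here r == &req, and off is 4 on common ABIs */
  }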

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/bpf_helpers.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index da00b87aa199..f67dce2af802 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -36,6 +36,20 @@
 #define __weak __attribute__((weak))
 #endif
 
+/*
+ * Helper macros to manipulate data structures
+ */
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
+#endif
+#ifndef container_of
+#define container_of(ptr, type, member)				\
+	({							\
+		void *__mptr = (void *)(ptr);			\
+		((type *)(__mptr - offsetof(type, member)));	\
+	})
+#endif
+
 /*
  * Helper structure used by eBPF C program
  * to describe BPF map attributes to libbpf loader
-- 
2.24.1



* [PATCH bpf-next v3 18/21] tools/bpftool: add bpf_iter support for bpftool
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (16 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 17/21] tools/libbpf: add offsetof/container_of macros in bpf_helpers.h Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:51   ` Andrii Nakryiko
  2020-05-07  5:39 ` [PATCH bpf-next v3 19/21] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
                   ` (2 subsequent siblings)
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Currently, only one command is supported
  bpftool iter pin <bpf_prog.o> <path>

It pins the trace/iter bpf program found in
the object file <bpf_prog.o> to <path>, where
<path> must be on a bpffs mount.

For example,
  $ bpftool iter pin ./bpf_iter_ipv6_route.o \
    /sys/fs/bpf/my_route
The user can then `cat` the file to print out the results:
  $ cat /sys/fs/bpf/my_route
    fe800000000000000000000000000000 40 00000000000000000000000000000000 ...
    00000000000000000000000000000000 00 00000000000000000000000000000000 ...
    00000000000000000000000000000001 80 00000000000000000000000000000000 ...
    fe800000000000008c0162fffebdfd57 80 00000000000000000000000000000000 ...
    ff000000000000000000000000000000 08 00000000000000000000000000000000 ...
    00000000000000000000000000000000 00 00000000000000000000000000000000 ...

The implementation for the ipv6_route iterator is in one of the
subsequent patches.

This patch also adds BPF_LINK_TYPE_ITER to link querying.

In the future, we may add additional parameters to the pin command
by parameterizing the bpf iterator. For example, a map_id or pid
may be added to let the bpf program traverse only a single map or
task, similar to the kernel seq_file single_open().

We may also add an introspection command for targets/iterators by
leveraging the bpf_iter itself.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../bpftool/Documentation/bpftool-iter.rst    | 83 ++++++++++++++++++
 tools/bpf/bpftool/bash-completion/bpftool     | 13 +++
 tools/bpf/bpftool/iter.c                      | 84 +++++++++++++++++++
 tools/bpf/bpftool/link.c                      |  1 +
 tools/bpf/bpftool/main.c                      |  3 +-
 tools/bpf/bpftool/main.h                      |  1 +
 6 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-iter.rst
 create mode 100644 tools/bpf/bpftool/iter.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
new file mode 100644
index 000000000000..13b173d93890
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
@@ -0,0 +1,83 @@
+============
+bpftool-iter
+============
+-------------------------------------------------------------------------------
+tool to create BPF iterators
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **iter** *COMMAND*
+
+	*COMMANDS* := { **pin** | **help** }
+
+ITER COMMANDS
+===================
+
+|	**bpftool** **iter pin** *OBJ* *PATH*
+|	**bpftool** **iter help**
+|
+|	*OBJ* := /a/file/of/bpf_iter_target.o
+
+
+DESCRIPTION
+===========
+	**bpftool iter pin** *OBJ* *PATH*
+		  A bpf iterator combines kernel iteration over particular
+		  kernel data (e.g., tasks, bpf_maps, etc.) with a bpf
+		  program called for each kernel data object (e.g., one
+		  task, one bpf_map, etc.). User space can *read* the
+		  kernel iterator output through the *read()* syscall.
+
+		  The *pin* command creates a bpf iterator from *OBJ*
+		  and pins it to *PATH*. The *PATH* should be located
+		  on a *bpffs* mount. It must not contain a dot
+		  character ('.'), which is reserved for future extensions
+		  of *bpffs*.
+
+		  The user can then *cat PATH* to see the bpf iterator output.
+
+	**bpftool iter help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-V, --version
+		  Print version number (similar to **bpftool version**).
+
+	-d, --debug
+		  Print all logs available, even debug-level information. This
+		  includes logs from libbpf as well as from the verifier, when
+		  attempting to load programs.
+
+EXAMPLES
+========
+**# bpftool iter pin bpf_iter_netlink.o /sys/fs/bpf/my_netlink**
+
+::
+
+   Create a file-based bpf iterator from bpf_iter_netlink.o and pin it
+   to /sys/fs/bpf/my_netlink
+
+
+SEE ALSO
+========
+	**bpf**\ (2),
+	**bpf-helpers**\ (7),
+	**bpftool**\ (8),
+	**bpftool-prog**\ (8),
+	**bpftool-map**\ (8),
+	**bpftool-link**\ (8),
+	**bpftool-cgroup**\ (8),
+	**bpftool-feature**\ (8),
+	**bpftool-net**\ (8),
+	**bpftool-perf**\ (8),
+	**bpftool-btf**\ (8),
+	**bpftool-gen**\ (8),
+	**bpftool-struct_ops**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index fc989ead7313..9f0f20e73b87 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -610,6 +610,19 @@ _bpftool()
                     ;;
             esac
             ;;
+        iter)
+            case $command in
+                pin)
+                    _filedir
+                    return 0
+                    ;;
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'pin help' \
+                            -- "$cur" ) )
+                    ;;
+            esac
+            ;;
         map)
             local MAP_TYPE='id pinned name'
             case $command in
diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c
new file mode 100644
index 000000000000..a8fb1349c103
--- /dev/null
+++ b/tools/bpf/bpftool/iter.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (C) 2020 Facebook
+
+#define _GNU_SOURCE
+#include <linux/err.h>
+#include <bpf/libbpf.h>
+
+#include "main.h"
+
+static int do_pin(int argc, char **argv)
+{
+	const char *objfile, *path;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	struct bpf_link *link;
+	int err;
+
+	if (!REQ_ARGS(2))
+		usage();
+
+	objfile = GET_ARG();
+	path = GET_ARG();
+
+	obj = bpf_object__open(objfile);
+	if (IS_ERR_OR_NULL(obj)) {
+		p_err("can't open objfile %s", objfile);
+		return -1;
+	}
+
+	err = bpf_object__load(obj);
+	if (err) {
+		p_err("can't load objfile %s", objfile);
+		goto close_obj;
+	}
+
+	prog = bpf_program__next(NULL, obj);
+	link = bpf_program__attach_iter(prog, NULL);
+	if (IS_ERR(link)) {
+		err = PTR_ERR(link);
+		p_err("attach_iter failed for program %s",
+		      bpf_program__name(prog));
+		goto close_obj;
+	}
+
+	err = mount_bpffs_for_pin(path);
+	if (err)
+		goto close_link;
+
+	err = bpf_link__pin(link, path);
+	if (err) {
+		p_err("pin_iter failed for program %s to path %s",
+		      bpf_program__name(prog), path);
+		goto close_link;
+	}
+
+close_link:
+	bpf_link__disconnect(link);
+	bpf_link__destroy(link);
+close_obj:
+	bpf_object__close(obj);
+	return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s pin OBJ PATH\n"
+		"       %s %s help\n"
+		"\n",
+		bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "help",	do_help },
+	{ "pin",	do_pin },
+	{ 0 }
+};
+
+int do_iter(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index adc7dc431ed8..b6a0b35c78ae 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -16,6 +16,7 @@ static const char * const link_type_name[] = {
 	[BPF_LINK_TYPE_RAW_TRACEPOINT]		= "raw_tracepoint",
 	[BPF_LINK_TYPE_TRACING]			= "tracing",
 	[BPF_LINK_TYPE_CGROUP]			= "cgroup",
+	[BPF_LINK_TYPE_ITER]			= "iter",
 };
 
 static int link_parse_fd(int *argc, char ***argv)
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1413a154806e..46bd716a9d86 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -59,7 +59,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n"
+		"       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -224,6 +224,7 @@ static const struct cmd cmds[] = {
 	{ "btf",	do_btf },
 	{ "gen",	do_gen },
 	{ "struct_ops",	do_struct_ops },
+	{ "iter",	do_iter },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 9b1fb81a8331..a41cefabccaf 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -199,6 +199,7 @@ int do_feature(int argc, char **argv);
 int do_btf(int argc, char **argv);
 int do_gen(int argc, char **argv);
 int do_struct_ops(int argc, char **argv);
+int do_iter(int argc, char **argv);
 
 int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
 int prog_parse_fd(int *argc, char ***argv);
-- 
2.24.1



* [PATCH bpf-next v3 19/21] tools/bpf: selftests: add iterator programs for ipv6_route and netlink
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (17 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 18/21] tools/bpftool: add bpf_iter support for bpftool Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 20/21] tools/bpf: selftests: add iter progs for bpf_map/task/task_file Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 21/21] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

Two bpf programs are added in this patch for the netlink and
ipv6_route targets. On my VM, they produce results identical to
/proc/net/netlink and /proc/net/ipv6_route.

  $ cat /proc/net/netlink
  sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
  000000002c42d58b 0   0          00000000 0        0        0     2        0        7
  00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
  00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
  000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
  ....
  00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
  000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
  00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
  000000008398fb08 16  0          00000000 0        0        0     2        0        27
  $ cat /sys/fs/bpf/my_netlink
  sk               Eth Pid        Groups   Rmem     Wmem     Dump  Locks    Drops    Inode
  000000002c42d58b 0   0          00000000 0        0        0     2        0        7
  00000000a4e8b5e1 0   1          00000551 0        0        0     2        0        18719
  00000000e1b1c195 4   0          00000000 0        0        0     2        0        16422
  000000007e6b29f9 6   0          00000000 0        0        0     2        0        16424
  ....
  00000000159a170d 15  1862       00000002 0        0        0     2        0        1886
  000000009aca4bc9 15  3918224839 00000002 0        0        0     2        0        19076
  00000000d0ab31d2 15  1          00000002 0        0        0     2        0        18683
  000000008398fb08 16  0          00000000 0        0        0     2        0        27

  $ cat /proc/net/ipv6_route
  fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
  fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
  ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  $ cat /sys/fs/bpf/my_ipv6_route
  fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
  fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
  ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 62 +++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_netlink.c    | 66 +++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c

diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
new file mode 100644
index 000000000000..ab9e2650e021
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
+
+#define RTF_GATEWAY		0x0002
+#define IFNAMSIZ		16
+#define fib_nh_gw_family	nh_common.nhc_gw_family
+#define fib_nh_gw6		nh_common.nhc_gw.ipv6
+#define fib_nh_dev		nh_common.nhc_dev
+
+SEC("iter/ipv6_route")
+int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct fib6_info *rt = ctx->rt;
+	const struct net_device *dev;
+	struct fib6_nh *fib6_nh;
+	unsigned int flags;
+	struct nexthop *nh;
+
+	if (rt == (void *)0)
+		return 0;
+
+	fib6_nh = &rt->fib6_nh[0];
+	flags = rt->fib6_flags;
+
+	/* FIXME: nexthop_is_multipath is not handled here. */
+	nh = rt->nh;
+	if (rt->nh)
+		fib6_nh = &nh->nh_info->fib6_nh;
+
+	BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
+
+	if (CONFIG_IPV6_SUBTREES)
+		BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_src.addr,
+			       rt->fib6_src.plen);
+	else
+		BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 00 ");
+
+	if (fib6_nh->fib_nh_gw_family) {
+		flags |= RTF_GATEWAY;
+		BPF_SEQ_PRINTF(seq, "%pi6 ", &fib6_nh->fib_nh_gw6);
+	} else {
+		BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 ");
+	}
+
+	dev = fib6_nh->fib_nh_dev;
+	if (dev)
+		BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
+			       rt->fib6_ref.refs.counter, 0, flags, dev->name);
+	else
+		BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x\n", rt->fib6_metric,
+			       rt->fib6_ref.refs.counter, 0, flags);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
new file mode 100644
index 000000000000..6b40a233d4e0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define sk_rmem_alloc	sk_backlog.rmem_alloc
+#define sk_refcnt	__sk_common.skc_refcnt
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
+SEC("iter/netlink")
+int dump_netlink(struct bpf_iter__netlink *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct netlink_sock *nlk = ctx->sk;
+	unsigned long group, ino;
+	struct inode *inode;
+	struct socket *sk;
+	struct sock *s;
+
+	if (nlk == (void *)0)
+		return 0;
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "sk               Eth Pid        Groups   "
+				    "Rmem     Wmem     Dump  Locks    Drops    "
+				    "Inode\n");
+
+	s = &nlk->sk;
+	BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol);
+
+	if (!nlk->groups)  {
+		group = 0;
+	} else {
+		/* FIXME: temporarily use bpf_probe_read here, needs
+		 * verifier support to do direct access.
+		 */
+		bpf_probe_read(&group, sizeof(group), &nlk->groups[0]);
+	}
+	BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ",
+		       nlk->portid, (u32)group,
+		       s->sk_rmem_alloc.counter,
+		       s->sk_wmem_alloc.refs.counter - 1,
+		       nlk->cb_running, s->sk_refcnt.refs.counter);
+
+	sk = s->sk_socket;
+	if (!sk) {
+		ino = 0;
+	} else {
+		/* FIXME: container_of inside SOCK_INODE has a forced
+		 * type conversion, and direct access cannot be used
+		 * with the current verifier.
+		 */
+		inode = SOCK_INODE(sk);
+		bpf_probe_read(&ino, sizeof(ino), &inode->i_ino);
+	}
+	BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino);
+
+	return 0;
+}
-- 
2.24.1



* [PATCH bpf-next v3 20/21] tools/bpf: selftests: add iter progs for bpf_map/task/task_file
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (18 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 19/21] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-07  5:39 ` [PATCH bpf-next v3 21/21] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
  20 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The implementation is arbitrary, just to show how bpf programs
can be written for bpf_map/task/task_file. They can be customized
for specific needs.

For example, for bpf_map, the iterator prints out:
  $ cat /sys/fs/bpf/my_bpf_map
      id   refcnt  usercnt  locked_vm
       3        2        0         20
       6        2        0         20
       9        2        0         20
      12        2        0         20
      13        2        0         20
      16        2        0         20
      19        2        0         20
      %%% END %%%

For task, the iterator prints out:
  $ cat /sys/fs/bpf/my_task
    tgid      gid
       1        1
       2        2
    ....
    1944     1944
    1948     1948
    1949     1949
    1953     1953
    === END ===

For task/file, the iterator prints out:
  $ cat /sys/fs/bpf/my_task_file
    tgid      gid       fd      file
       1        1        0 ffffffff95c97600
       1        1        1 ffffffff95c97600
       1        1        2 ffffffff95c97600
    ....
    1895     1895      255 ffffffff95c8fe00
    1932     1932        0 ffffffff95c8fe00
    1932     1932        1 ffffffff95c8fe00
    1932     1932        2 ffffffff95c8fe00
    1932     1932        3 ffffffff95c185c0

This is able to print out all open files (fd and file->f_op), so the
user can compare f_op against a particular kernel file_operations
structure to find out what it is.
For example, from /proc/kallsyms, we can find
  ffffffff95c185c0 r eventfd_fops
so we will know tgid 1932 fd 3 is an eventfd file descriptor.
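
A quick way to do that lookup from the shell, using the address from
the sample output above (the actual address will differ on every boot):

  $ grep ffffffff95c185c0 /proc/kallsyms
  ffffffff95c185c0 r eventfd_fops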

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/progs/bpf_iter_bpf_map.c    | 28 +++++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_task.c       | 25 +++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_task_file.c  | 26 +++++++++++++++++
 3 files changed, 79 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_file.c

diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
new file mode 100644
index 000000000000..4867cd3445c8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	__u64 seq_num = ctx->meta->seq_num;
+	struct bpf_map *map = ctx->map;
+
+	if (map == (void *)0) {
+		BPF_SEQ_PRINTF(seq, "      %%%%%% END %%%%%%\n");
+		return 0;
+	}
+
+	if (seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "      id   refcnt  usercnt  locked_vm\n");
+
+	BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter,
+		       map->usercnt.counter,
+		       map->memory.user->locked_vm.counter);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
new file mode 100644
index 000000000000..90f9011c57ca
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+
+	if (task == (void *)0) {
+		BPF_SEQ_PRINTF(seq, "    === END ===\n");
+		return 0;
+	}
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "    tgid      gid\n");
+
+	BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
new file mode 100644
index 000000000000..c6ced38f0880
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/task_file")
+int dump_task_file(struct bpf_iter__task_file *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	__u32 fd = ctx->fd;
+	struct file *file = ctx->file;
+
+	if (task == (void *)0 || file == (void *)0)
+		return 0;
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "    tgid      gid       fd      file\n");
+
+	BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+		       (long)file->f_op);
+	return 0;
+}
-- 
2.24.1



* [PATCH bpf-next v3 21/21] tools/bpf: selftests: add bpf_iter selftests
  2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
                   ` (19 preceding siblings ...)
  2020-05-07  5:39 ` [PATCH bpf-next v3 20/21] tools/bpf: selftests: add iter progs for bpf_map/task/task_file Yonghong Song
@ 2020-05-07  5:39 ` Yonghong Song
  2020-05-08 19:57   ` Andrii Nakryiko
  20 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-07  5:39 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

The added test includes the following subtests:
  - test verifier change for btf_id_or_null
  - test load/create_iter/read for
    ipv6_route/netlink/bpf_map/task/task_file
  - test anon bpf iterator
  - test anon bpf iterator reading one char at a time
  - test file bpf iterator
  - test overflow (single bpf program output not overflow)
  - test overflow (single bpf program output overflows)
  - test bpf prog returning 1

The ipv6_route program tests the following verifier change:
  - accessing fields in the variable length array of a structure.

The netlink program load tests the following verifier change:
  - putting a btf_id ptr value on the stack and making it
    accessible to tracing/iter programs.

The anon bpf iterator also tests link auto attach through skeleton.

  $ test_progs -n 2
  #2/1 btf_id_or_null:OK
  #2/2 ipv6_route:OK
  #2/3 netlink:OK
  #2/4 bpf_map:OK
  #2/5 task:OK
  #2/6 task_file:OK
  #2/7 anon:OK
  #2/8 anon-read-one-char:OK
  #2/9 file:OK
  #2/10 overflow:OK
  #2/11 overflow-e2big:OK
  #2/12 prog-ret-1:OK
  #2 bpf_iter:OK
  Summary: 1/12 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/prog_tests/bpf_iter.c       | 408 ++++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_test_kern1.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern2.c |   4 +
 .../selftests/bpf/progs/bpf_iter_test_kern3.c |  18 +
 .../selftests/bpf/progs/bpf_iter_test_kern4.c |  52 +++
 .../bpf/progs/bpf_iter_test_kern_common.h     |  22 +
 6 files changed, 508 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
new file mode 100644
index 000000000000..6d0b0c599417
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+#include "bpf_iter_ipv6_route.skel.h"
+#include "bpf_iter_netlink.skel.h"
+#include "bpf_iter_bpf_map.skel.h"
+#include "bpf_iter_task.skel.h"
+#include "bpf_iter_task_file.skel.h"
+#include "bpf_iter_test_kern1.skel.h"
+#include "bpf_iter_test_kern2.skel.h"
+#include "bpf_iter_test_kern3.skel.h"
+#include "bpf_iter_test_kern4.skel.h"
+
+static int duration;
+
+static void test_btf_id_or_null(void)
+{
+	struct bpf_iter_test_kern3 *skel;
+
+	skel = bpf_iter_test_kern3__open_and_load();
+	if (CHECK(skel, "bpf_iter_test_kern3__open_and_load",
+		  "skeleton open_and_load unexpectedly succeeded\n")) {
+		bpf_iter_test_kern3__destroy(skel);
+		return;
+	}
+}
+
+static void do_dummy_read(struct bpf_program *prog)
+{
+	struct bpf_link *link;
+	char buf[16] = {};
+	int iter_fd, len;
+
+	link = bpf_program__attach_iter(prog, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		return;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* do not check contents, but ensure read() ends without error */
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+		;
+	CHECK(len < 0, "read", "read failed: %s\n", strerror(errno));
+
+	close(iter_fd);
+
+free_link:
+	bpf_link__destroy(link);
+}
+
+static void test_ipv6_route(void)
+{
+	struct bpf_iter_ipv6_route *skel;
+
+	skel = bpf_iter_ipv6_route__open_and_load();
+	if (CHECK(!skel, "bpf_iter_ipv6_route__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_ipv6_route);
+
+	bpf_iter_ipv6_route__destroy(skel);
+}
+
+static void test_netlink(void)
+{
+	struct bpf_iter_netlink *skel;
+
+	skel = bpf_iter_netlink__open_and_load();
+	if (CHECK(!skel, "bpf_iter_netlink__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_netlink);
+
+	bpf_iter_netlink__destroy(skel);
+}
+
+static void test_bpf_map(void)
+{
+	struct bpf_iter_bpf_map *skel;
+
+	skel = bpf_iter_bpf_map__open_and_load();
+	if (CHECK(!skel, "bpf_iter_bpf_map__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_bpf_map);
+
+	bpf_iter_bpf_map__destroy(skel);
+}
+
+static void test_task(void)
+{
+	struct bpf_iter_task *skel;
+
+	skel = bpf_iter_task__open_and_load();
+	if (CHECK(!skel, "bpf_iter_task__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_task);
+
+	bpf_iter_task__destroy(skel);
+}
+
+static void test_task_file(void)
+{
+	struct bpf_iter_task_file *skel;
+
+	skel = bpf_iter_task_file__open_and_load();
+	if (CHECK(!skel, "bpf_iter_task_file__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	do_dummy_read(skel->progs.dump_task_file);
+
+	bpf_iter_task_file__destroy(skel);
+}
+
+/* The expected string is less than 16 bytes */
+static int do_read_with_fd(int iter_fd, const char *expected,
+			   bool read_one_char)
+{
+	int err = -1, len, read_buf_len, start;
+	char buf[16] = {};
+
+	read_buf_len = read_one_char ? 1 : 16;
+	start = 0;
+	while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) {
+		start += len;
+		if (CHECK(start >= 16, "read", "read len %d\n", len))
+			return -1;
+		read_buf_len = read_one_char ? 1 : 16 - start;
+	}
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		return -1;
+
+	err = strcmp(buf, expected);
+	if (CHECK(err, "read", "incorrect read result: buf %s, expected %s\n",
+		  buf, expected))
+		return -1;
+
+	return 0;
+}
+
+static void test_anon_iter(bool read_one_char)
+{
+	struct bpf_iter_test_kern1 *skel;
+	struct bpf_link *link;
+	int iter_fd, err;
+
+	skel = bpf_iter_test_kern1__open_and_load();
+	if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	err = bpf_iter_test_kern1__attach(skel);
+	if (CHECK(err, "bpf_iter_test_kern1__attach",
+		  "skeleton attach failed\n")) {
+		goto out;
+	}
+
+	link = skel->links.dump_task;
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto out;
+
+	do_read_with_fd(iter_fd, "abcd", read_one_char);
+	close(iter_fd);
+
+out:
+	bpf_iter_test_kern1__destroy(skel);
+}
+
+static int do_read(const char *path, const char *expected)
+{
+	int err, iter_fd;
+
+	iter_fd = open(path, O_RDONLY);
+	if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n",
+		  path, strerror(errno)))
+		return -1;
+
+	err = do_read_with_fd(iter_fd, expected, false);
+	close(iter_fd);
+	return err;
+}
+
+static void test_file_iter(void)
+{
+	const char *path = "/sys/fs/bpf/bpf_iter_test1";
+	struct bpf_iter_test_kern1 *skel1;
+	struct bpf_iter_test_kern2 *skel2;
+	struct bpf_link *link;
+	int err;
+
+	skel1 = bpf_iter_test_kern1__open_and_load();
+	if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	link = bpf_program__attach_iter(skel1->progs.dump_task, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	/* unlink this path if it exists. */
+	unlink(path);
+
+	err = bpf_link__pin(link, path);
+	if (CHECK(err, "pin_iter", "pin_iter to %s failed: %s\n", path,
+		  strerror(errno)))
+		goto free_link;
+
+	err = do_read(path, "abcd");
+	if (err)
+		goto free_link;
+
+	/* The file based iterator seems to be working fine. Let us do
+	 * a link update of the underlying link and `cat` the iterator
+	 * again; its content should change.
+	 */
+	skel2 = bpf_iter_test_kern2__open_and_load();
+	if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		goto free_link;
+
+	err = bpf_link__update_program(link, skel2->progs.dump_task);
+	if (CHECK(err, "update_prog", "update_prog failed\n"))
+		goto destroy_skel2;
+
+	do_read(path, "ABCD");
+
+destroy_skel2:
+	bpf_iter_test_kern2__destroy(skel2);
+free_link:
+	bpf_link__destroy(link);
+out:
+	bpf_iter_test_kern1__destroy(skel1);
+}
+
+static void test_overflow(bool test_e2big_overflow, bool ret1)
+{
+	__u32 map_info_len, total_read_len, expected_read_len;
+	int err, iter_fd, map1_fd, map2_fd, len;
+	struct bpf_map_info map_info = {};
+	struct bpf_iter_test_kern4 *skel;
+	struct bpf_link *link;
+	__u32 page_size;
+	char *buf;
+
+	skel = bpf_iter_test_kern4__open();
+	if (CHECK(!skel, "bpf_iter_test_kern4__open",
+		  "skeleton open failed\n"))
+		return;
+
+	/* create two maps: the bpf program will only do bpf_seq_write
+	 * for these two maps. The goal is that one map's output almost
+	 * fills the seq_file buffer and the other then triggers an
+	 * overflow and needs a restart.
+	 */
+	map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (CHECK(map1_fd < 0, "bpf_create_map",
+		  "map_creation failed: %s\n", strerror(errno)))
+		goto out;
+	map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (CHECK(map2_fd < 0, "bpf_create_map",
+		  "map_creation failed: %s\n", strerror(errno)))
+		goto free_map1;
+
+	/* the bpf_seq_printf kernel buffer is one page, so one map's
+	 * bpf_seq_write will mostly fill it, and the other map
+	 * will partially fill it and then trigger an overflow and
+	 * need a bpf_seq_read restart.
+	 */
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	if (test_e2big_overflow) {
+		skel->rodata->print_len = (page_size + 8) / 8;
+		expected_read_len = 2 * (page_size + 8);
+	} else if (!ret1) {
+		skel->rodata->print_len = (page_size - 8) / 8;
+		expected_read_len = 2 * (page_size - 8);
+	} else {
+		skel->rodata->print_len = 1;
+		expected_read_len = 2 * 8;
+	}
+	skel->rodata->ret1 = ret1;
+
+	if (CHECK(bpf_iter_test_kern4__load(skel),
+		  "bpf_iter_test_kern4__load", "skeleton load failed\n"))
+		goto free_map2;
+
+	/* setup filtering map_id in bpf program */
+	map_info_len = sizeof(map_info);
+	err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len);
+	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
+		  strerror(errno)))
+		goto free_map2;
+	skel->bss->map1_id = map_info.id;
+
+	err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len);
+	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
+		  strerror(errno)))
+		goto free_map2;
+	skel->bss->map2_id = map_info.id;
+
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto free_map2;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	buf = malloc(expected_read_len);
+	if (!buf)
+		goto close_iter;
+
+	/* do read */
+	total_read_len = 0;
+	if (test_e2big_overflow) {
+		while ((len = read(iter_fd, buf, expected_read_len)) > 0)
+			total_read_len += len;
+
+		CHECK(len != -1 || errno != E2BIG, "read",
+		      "expected ret -1, errno E2BIG, but got ret %d, error %s\n",
+		      len, strerror(errno));
+		goto free_buf;
+	} else if (!ret1) {
+		while ((len = read(iter_fd, buf, expected_read_len)) > 0)
+			total_read_len += len;
+
+		if (CHECK(len < 0, "read", "read failed: %s\n",
+			  strerror(errno)))
+			goto free_buf;
+	} else {
+		do {
+			len = read(iter_fd, buf, expected_read_len);
+			if (len > 0)
+				total_read_len += len;
+		} while (len > 0 || (len == -1 && errno == EAGAIN));
+
+		if (CHECK(len < 0, "read", "read failed: %s\n",
+			  strerror(errno)))
+			goto free_buf;
+	}
+
+	if (CHECK(total_read_len != expected_read_len, "read",
+		  "total len %u, expected len %u\n", total_read_len,
+		  expected_read_len))
+		goto free_buf;
+
+	if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed",
+		  "expected 1 actual %d\n", skel->bss->map1_accessed))
+		goto free_buf;
+
+	if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed",
+		  "expected 2 actual %d\n", skel->bss->map2_accessed))
+		goto free_buf;
+
+	CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2,
+	      "map2_seqnum", "two different seqnum %lld %lld\n",
+	      skel->bss->map2_seqnum1, skel->bss->map2_seqnum2);
+
+free_buf:
+	free(buf);
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__destroy(link);
+free_map2:
+	close(map2_fd);
+free_map1:
+	close(map1_fd);
+out:
+	bpf_iter_test_kern4__destroy(skel);
+}
+
+void test_bpf_iter(void)
+{
+	if (test__start_subtest("btf_id_or_null"))
+		test_btf_id_or_null();
+	if (test__start_subtest("ipv6_route"))
+		test_ipv6_route();
+	if (test__start_subtest("netlink"))
+		test_netlink();
+	if (test__start_subtest("bpf_map"))
+		test_bpf_map();
+	if (test__start_subtest("task"))
+		test_task();
+	if (test__start_subtest("task_file"))
+		test_task_file();
+	if (test__start_subtest("anon"))
+		test_anon_iter(false);
+	if (test__start_subtest("anon-read-one-char"))
+		test_anon_iter(true);
+	if (test__start_subtest("file"))
+		test_file_iter();
+	if (test__start_subtest("overflow"))
+		test_overflow(false, false);
+	if (test__start_subtest("overflow-e2big"))
+		test_overflow(true, false);
+	if (test__start_subtest("prog-ret-1"))
+		test_overflow(false, true);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
new file mode 100644
index 000000000000..c71a7c283108
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define START_CHAR 'a'
+#include "bpf_iter_test_kern_common.h"
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
new file mode 100644
index 000000000000..8bdc8dc07444
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define START_CHAR 'A'
+#include "bpf_iter_test_kern_common.h"
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
new file mode 100644
index 000000000000..636a00fa074d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	int tgid;
+
+	tgid = task->tgid;
+	bpf_seq_write(seq, &tgid, sizeof(tgid));
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
new file mode 100644
index 000000000000..b18dc0471d07
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u32 map1_id = 0, map2_id = 0;
+__u32 map1_accessed = 0, map2_accessed = 0;
+__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0;
+
+static volatile const __u32 print_len;
+static volatile const __u32 ret1;
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct bpf_map *map = ctx->map;
+	__u64 seq_num;
+	int i, ret = 0;
+
+	if (map == (void *)0)
+		return 0;
+
+	/* only dump map1_id and map2_id */
+	if (map->id != map1_id && map->id != map2_id)
+		return 0;
+
+	seq_num = ctx->meta->seq_num;
+	if (map->id == map1_id) {
+		map1_seqnum = seq_num;
+		map1_accessed++;
+	}
+
+	if (map->id == map2_id) {
+		if (map2_accessed == 0) {
+			map2_seqnum1 = seq_num;
+			if (ret1)
+				ret = 1;
+		} else {
+			map2_seqnum2 = seq_num;
+		}
+		map2_accessed++;
+	}
+
+	/* fill seq_file buffer */
+	for (i = 0; i < print_len; i++)
+		bpf_seq_write(seq, &seq_num, sizeof(seq_num));
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
new file mode 100644
index 000000000000..bdd51cf14b54
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+int count = 0;
+
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	char c;
+
+	if (count < 4) {
+		c = START_CHAR + count;
+		bpf_seq_write(seq, &c, sizeof(c));
+		count++;
+	}
+
+	return 0;
+}
-- 
2.24.1



* Re: [PATCH bpf-next v3 01/21] bpf: implement an interface to register bpf_iter targets
  2020-05-07  5:39 ` [PATCH bpf-next v3 01/21] bpf: implement an interface to register bpf_iter targets Yonghong Song
@ 2020-05-08 18:18   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 18:18 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:39 PM Yonghong Song <yhs@fb.com> wrote:
>
> The target can call bpf_iter_reg_target() to register itself.
> The needed information:
>   target:           target name
>   seq_ops:          the seq_file operations for the target
>   init_seq_private  target callback to initialize seq_priv during file open
>   fini_seq_private  target callback to clean up seq_priv during file release
>   seq_priv_size:    the private_data size needed by the seq_file
>                     operations
>
> The target name represents a target which provides a seq_ops
> for iterating objects.
>
> The target can provide two callback functions, init_seq_private
> and fini_seq_private, called during file open/release time.
> For example, /proc/net/{tcp6, ipv6_route, netlink, ...}, net
> name space needs to be setup properly during file open and
> released properly during file release.
>
> Function bpf_iter_unreg_target() is also implemented to unregister
> a particular target.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Lost my initial ack? :) Here we go again:

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  include/linux/bpf.h   | 15 +++++++++++
>  kernel/bpf/Makefile   |  2 +-
>  kernel/bpf/bpf_iter.c | 59 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 75 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/bpf_iter.c
>

[...]


* Re: [PATCH bpf-next v3 02/21] bpf: allow loading of a bpf_iter program
  2020-05-07  5:39 ` [PATCH bpf-next v3 02/21] bpf: allow loading of a bpf_iter program Yonghong Song
@ 2020-05-08 18:20   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 18:20 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
> A bpf_iter program is a tracing program with attach type
> BPF_TRACE_ITER. The load attribute
>   attach_btf_id
> is used by the verifier against a particular kernel function,
> which represents a target, e.g., __bpf_iter__bpf_map
> for target bpf_map which is implemented later.
>
> The program return value must be 0 or 1 for now.
>   0 : successful, except potential seq_file buffer overflow
>       which is handled by seq_file reader.
>   1 : request to restart the same object
>
> In the future, other return values may be used for filtering or
> terminating the iterator.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Looks good.

Acked-by: Andrii Nakryiko <andriin@fb.com>


>  include/linux/bpf.h            |  3 +++
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/bpf_iter.c          | 36 ++++++++++++++++++++++++++++++++++
>  kernel/bpf/verifier.c          | 21 ++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  5 files changed, 62 insertions(+)
>

[...]


* Re: [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-07  5:39 ` [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
@ 2020-05-08 18:24   ` Andrii Nakryiko
  2020-05-09  1:36     ` Yonghong Song
  0 siblings, 1 reply; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 18:24 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:41 PM Yonghong Song <yhs@fb.com> wrote:
>
> Given a bpf program, the steps to create an anonymous bpf iterator are:
>   - create a bpf_iter_link, which combines bpf program and the target.
>     In the future, there could be more information recorded in the link.
>     A link_fd will be returned to the user space.
>   - create an anonymous bpf iterator with the given link_fd.
>
> The bpf_iter_link can be pinned to bpffs mount file system to
> create a file based bpf iterator as well.
>
> The benefits of using bpf_iter_link:
>   - using bpf link simplifies design and implementation as bpf link
>     is used for other tracing bpf programs.
>   - for file based bpf iterator, bpf_iter_link provides a standard
>     way to replace underlying bpf programs.
>   - for both anonymous and file based iterators, bpf link query
>     capability can be leveraged.
>
> The patch adds support for tracing/iter programs to BPF_LINK_CREATE.
> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
> querying. Currently, only prog_id is needed, so no additional
> in-kernel show_fdinfo() and fill_link_info() hooks are needed
> for the BPF_LINK_TYPE_ITER link.
>
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

still looks good, but I realized show_fdinfo and fill_link_info are
missing, see the request for a follow-up below :)


>  include/linux/bpf.h            |  1 +
>  include/linux/bpf_types.h      |  1 +
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c           | 14 ++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  6 files changed, 80 insertions(+)
>

[...]

> +static const struct bpf_link_ops bpf_iter_link_lops = {
> +       .release = bpf_iter_link_release,
> +       .dealloc = bpf_iter_link_dealloc,
> +};

Link infra supports .show_fdinfo and .fill_link_info methods; there is
no need to block on this, but it would be great to implement them for
BPF_LINK_TYPE_ITER as well in the same release as a follow-up. Thanks!


[...]


* Re: [PATCH bpf-next v3 05/21] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-07  5:39 ` [PATCH bpf-next v3 05/21] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
@ 2020-05-08 18:52   ` Andrii Nakryiko
  2020-05-09  1:41     ` Yonghong Song
  0 siblings, 1 reply; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 18:52 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:39 PM Yonghong Song <yhs@fb.com> wrote:
>
> The bpf iterator uses seq_file to provide a lossless
> way to transfer data to user space. But we want to call
> the bpf program after all objects have been traversed, and
> the bpf program may write additional data to the
> seq_file buffer. The current seq_read() does not work
> for this use case.
>
> Besides allowing the stop() function to write to the buffer,
> bpf_seq_read() also fixes the buffer size to one page.
> If any single call of show() or stop() emits more than one
> page of data and causes an overflow, the -E2BIG error code
> will be returned to user space.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

This loop is much simpler and more streamlined now, thanks a lot! I
think it's correct; see below about one confusing (but apparently
correct) bit, though. Either way:

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  kernel/bpf/bpf_iter.c | 118 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 118 insertions(+)
>
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 0542a243b78c..f198597b0ea4 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -26,6 +26,124 @@ static DEFINE_MUTEX(targets_mutex);
>  /* protect bpf_iter_link changes */
>  static DEFINE_MUTEX(link_mutex);
>
> +/* bpf_seq_read, a customized and simpler version for bpf iterator.
> + * no_llseek is assumed for this file.
> + * The following are differences from seq_read():
> + *  . fixed buffer size (PAGE_SIZE)
> + *  . assuming no_llseek
> + *  . stop() may call bpf program, handling potential overflow there
> + */
> +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
> +                           loff_t *ppos)
> +{
> +       struct seq_file *seq = file->private_data;
> +       size_t n, offs, copied = 0;
> +       int err = 0;
> +       void *p;
> +
> +       mutex_lock(&seq->lock);
> +
> +       if (!seq->buf) {
> +               seq->size = PAGE_SIZE;
> +               seq->buf = kmalloc(seq->size, GFP_KERNEL);
> +               if (!seq->buf) {
> +                       err = -ENOMEM;
> +                       goto done;

oh, thank you for converting to all lower-case label names! :)

> +               }
> +       }
> +
> +       if (seq->count) {
> +               n = min(seq->count, size);
> +               err = copy_to_user(buf, seq->buf + seq->from, n);
> +               if (err) {
> +                       err = -EFAULT;
> +                       goto done;
> +               }
> +               seq->count -= n;
> +               seq->from += n;
> +               copied = n;
> +               goto done;
> +       }
> +
> +       seq->from = 0;
> +       p = seq->op->start(seq, &seq->index);
> +       if (IS_ERR_OR_NULL(p))
> +               goto stop;

if start() returns IS_ERR(p), stop(p) below won't produce any output
(because the BPF program is called only for p == NULL), so we'll just
return 0 with no error. Do I interpret the code correctly? I think
seq_file's read actually returns PTR_ERR(p) as the result in this case.

so I think you need err = PTR_ERR(p); before the goto stop here?
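
i.e., something along these lines (just a sketch; PTR_ERR(NULL) is 0,
so the NULL case still ends up returning 0):

p = seq->op->start(seq, &seq->index);
if (IS_ERR_OR_NULL(p)) {
    err = PTR_ERR(p);
    goto stop;
}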

> +
> +       err = seq->op->show(seq, p);
> +       if (err > 0) {
> +               seq->count = 0;
> +       } else if (err < 0 || seq_has_overflowed(seq)) {
> +               if (!err)
> +                       err = -E2BIG;
> +               seq->count = 0;
> +               seq->op->stop(seq, p);
> +               goto done;
> +       }
> +
> +       while (1) {
> +               loff_t pos = seq->index;
> +
> +               offs = seq->count;
> +               p = seq->op->next(seq, p, &seq->index);
> +               if (pos == seq->index) {
> +                       pr_info_ratelimited("buggy seq_file .next function %ps "
> +                               "did not update position index\n",
> +                               seq->op->next);
> +                       seq->index++;
> +               }
> +
> +               if (IS_ERR_OR_NULL(p)) {
> +                       err = PTR_ERR(p);
> +                       break;
> +               }
> +               if (seq->count >= size)
> +                       break;
> +
> +               err = seq->op->show(seq, p);
> +               if (err > 0) {
> +                       seq->count = offs;
> +               } else if (err < 0 || seq_has_overflowed(seq)) {
> +                       seq->count = offs;
> +                       if (!err)
> +                               err = -E2BIG;

nit: this -E2BIG is set unconditionally even for 2nd+ show(). This
will work, because it will get ignored on the next iteration, but I
think it will be much more obvious if written as:

if (!err && offs == 0)
    err = -E2BIG;

It took me a few re-readings of code I'm already pretty familiar
with to realize that this is ok.

I had to write the piece below to convince myself that this is fine :)
Just leaving it here in case you find it useful:

else if (err < 0 || seq_has_overflowed(seq)) {
    if (!err && offs == 0) /* overflow in first show() output */
        err = -E2BIG;
    if (err) {             /* overflow in first show() or real error happened */
        seq->count = 0; /* not strictly necessary, but shows
                         * that we are truncating output */
        seq->op->stop(seq, p);
        goto done; /* done will return err */
    }
    /* no error and overflow for 2nd+ show(), roll back output and stop */
    seq->count = offs;
    break;
}

> +                       if (offs == 0) {
> +                               seq->op->stop(seq, p);
> +                               goto done;
> +                       }
> +                       break;
> +               }
> +       }
> +stop:
> +       offs = seq->count;
> +       /* bpf program called if !p */
> +       seq->op->stop(seq, p);
> +       if (!p && seq_has_overflowed(seq)) {
> +               seq->count = offs;
> +               if (offs == 0) {
> +                       err = -E2BIG;
> +                       goto done;
> +               }
> +       }
> +
> +       n = min(seq->count, size);
> +       err = copy_to_user(buf, seq->buf, n);
> +       if (err) {
> +               err = -EFAULT;
> +               goto done;
> +       }
> +       copied = n;
> +       seq->count -= n;
> +       seq->from = n;
> +done:
> +       if (!copied)
> +               copied = err;
> +       else
> +               *ppos += copied;
> +       mutex_unlock(&seq->lock);
> +       return copied;
> +}
> +
>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  {
>         struct bpf_iter_target_info *tinfo;
> --
> 2.24.1
>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 06/21] bpf: create anonymous bpf iterator
  2020-05-07  5:39 ` [PATCH bpf-next v3 06/21] bpf: create anonymous " Yonghong Song
@ 2020-05-08 18:57   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 18:57 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:39 PM Yonghong Song <yhs@fb.com> wrote:
>
> A new bpf command BPF_ITER_CREATE is added.
>
> The anonymous bpf iterator is seq_file based.
> The seq_file private data are referenced by targets.
> The bpf_iter infrastructure allocates additional space
> at seq_file->private before the space used by targets
> to store some meta data, e.g.,
>   prog:       prog to run
>   session_id: a unique id for each opened seq_file
>   seq_num:    how many times bpf programs are queried in this session
>   done_stop:  an internal state to decide whether bpf program
>               should be called in seq_ops->stop() or not
>
> The seq_num will start from 0 for valid objects.
> The bpf program may see the same seq_num more than once if
>  - seq_file buffer overflow happens and the same object
>    is retried by bpf_seq_read(), or
>  - the bpf program explicitly requests a retry of the
>    same object
>
> Since module is not supported for bpf_iter, all target
> registration happens at __init time, so there is no
> need to change bpf_iter_unreg_target() as it is used
> mostly in the error path of the init function, at which time
> no bpf iterators have been created yet.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

LGTM.

Acked-by: Andrii Nakryiko <andriin@fb.com>
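
For anyone following along, the extra space described in the commit
message amounts to roughly this (sketch; field names taken from the
description above, the actual struct layout is in the patch):

    struct extra_priv_data {
        struct bpf_prog *prog;  /* prog to run */
        u64 session_id;         /* unique id per opened seq_file */
        u64 seq_num;            /* how many times the prog was queried */
        bool done_stop;         /* whether to call the prog in stop() */
    };
    /* the target's own private data follows this header within the
     * same seq_file->private allocation
     */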

>  include/linux/bpf.h            |   1 +
>  include/uapi/linux/bpf.h       |   6 ++
>  kernel/bpf/bpf_iter.c          | 129 +++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c           |  26 +++++++
>  tools/include/uapi/linux/bpf.h |   6 ++
>  5 files changed, 168 insertions(+)
>

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators
  2020-05-07  5:39 ` [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators Yonghong Song
@ 2020-05-08 19:07   ` Andrii Nakryiko
  2020-05-09  3:18     ` Yonghong Song
  0 siblings, 1 reply; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:07 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
> Macro DEFINE_BPF_ITER_FUNC is implemented so target
> can define an init function to capture the BTF type
> which represents the target.
>
> The bpf_iter_meta is a structure holding meta data, common
> to all targets in the bpf program.
>
> Additional marker functions are called before or after
> bpf_seq_read() show()/next()/stop() callback functions
> to help calculate precise seq_num and whether to call bpf_prog
> inside stop().
>
> Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
> are implemented so target can get needed information from
> bpf_iter infrastructure and can run the program.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h   | 11 ++++++
>  kernel/bpf/bpf_iter.c | 86 ++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 92 insertions(+), 5 deletions(-)
>

Looks good. I was worried about re-using seq_num when an element is
skipped, but it can already happen that the same seq_num is associated
with different objects: overflow + retry returns a different object
(because iteration is not a snapshot, so the element could be gone on
retry). Both cases will have to be handled in about the same fashion,
so it's fine.

Hm... Could this be a problem for the start() implementation? E.g., if
the object is still there, but the iterator wants to skip it permanently.
Re-using seq_num will mean that start() will keep trying to fetch the
same to-be-skipped element? Not sure, please think about it, but we can
fix it up later, if necessary.
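
For context, a program would observe the repeated seq_num roughly like
this (sketch; BPF_SEQ_PRINTF and the task target come from later patches
in this series, headers elided):

    SEC("iter/task")
    int dump_task(struct bpf_iter__task *ctx)
    {
        struct seq_file *seq = ctx->meta->seq;
        struct task_struct *task = ctx->task;

        if (task == NULL)
            return 0;
        /* seq_num repeats when a previous show() overflowed and
         * the same object is retried
         */
        BPF_SEQ_PRINTF(seq, "seq_num %llu pid %d\n",
                       ctx->meta->seq_num, task->pid);
        return 0;
    }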

Acked-by: Andrii Nakryiko <andriin@fb.com>

[...]

> @@ -112,11 +143,16 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>                         err = PTR_ERR(p);
>                         break;
>                 }
> +
> +               /* get a valid next object, increase seq_num */

typo: get -> got

> +               bpf_iter_inc_seq_num(seq);
> +
>                 if (seq->count >= size)
>                         break;
>
>                 err = seq->op->show(seq, p);
>                 if (err > 0) {
> +                       bpf_iter_dec_seq_num(seq);
>                         seq->count = offs;
>                 } else if (err < 0 || seq_has_overflowed(seq)) {
>                         seq->count = offs;

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 10/21] net: bpf: add netlink and ipv6_route bpf_iter targets
  2020-05-07  5:39 ` [PATCH bpf-next v3 10/21] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
@ 2020-05-08 19:17   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:17 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
> This patch added netlink and ipv6_route targets, using
> the same seq_ops (except show() and minor changes for stop())
> for /proc/net/{netlink,ipv6_route}.
>
> The net namespace for these targets is the current net
> namespace at file open stage, similar to
> /proc/net/{netlink,ipv6_route} reference counting
> the net namespace at seq_file open stage.
>
> Since module is not supported for now, ipv6_route is
> supported only if IPV6 is built-in, i.e., not compiled
> as a module. The restriction can be lifted once module
> is properly supported for bpf_iter.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Looks correct.

Acked-by: Andrii Nakryiko <andriin@fb.com>

>  fs/proc/proc_net.c       | 19 +++++++++
>  include/linux/proc_fs.h  |  3 ++
>  net/ipv6/ip6_fib.c       | 65 +++++++++++++++++++++++++++++-
>  net/ipv6/route.c         | 37 +++++++++++++++++
>  net/netlink/af_netlink.c | 87 +++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 207 insertions(+), 4 deletions(-)
>

[...]

> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 3912aac7854d..25f6d3e619d0 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -6393,6 +6393,30 @@ void __init ip6_route_init_special_entries(void)
>    #endif
>  }
>
> +#if IS_BUILTIN(CONFIG_IPV6)
> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
> +DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
> +
> +static int __init bpf_iter_register(void)
> +{
> +       struct bpf_iter_reg reg_info = {
> +               .target                 = "ipv6_route",
> +               .seq_ops                = &ipv6_route_seq_ops,
> +               .init_seq_private       = bpf_iter_init_seq_net,
> +               .fini_seq_private       = bpf_iter_fini_seq_net,
> +               .seq_priv_size          = sizeof(struct ipv6_route_iter),
> +       };
> +
> +       return bpf_iter_reg_target(&reg_info);
> +}
> +
> +static void bpf_iter_unregister(void)
> +{
> +       bpf_iter_unreg_target("ipv6_route");

Nit. This string duplication is unfortunate. If bpf_iter_unreg_target
took the same `struct bpf_iter_reg *` as bpf_iter_reg_target(), it would
be symmetrical and not dependent on magic strings anymore. That
reg_info struct would just be a static const global variable
passed to both register/unregister.
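
Something like this (sketch; assumes the prototypes are adjusted to
take a const pointer):

    static const struct bpf_iter_reg ipv6_route_reg_info = {
        .target            = "ipv6_route",
        .seq_ops           = &ipv6_route_seq_ops,
        .init_seq_private  = bpf_iter_init_seq_net,
        .fini_seq_private  = bpf_iter_fini_seq_net,
        .seq_priv_size     = sizeof(struct ipv6_route_iter),
    };

    /* register at init time ... */
    bpf_iter_reg_target(&ipv6_route_reg_info);
    /* ... and unregister with the very same struct, no magic string */
    bpf_iter_unreg_target(&ipv6_route_reg_info);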

> +}
> +#endif
> +#endif
> +

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 11/21] bpf: add task and task/file iterator targets
  2020-05-07  5:39 ` [PATCH bpf-next v3 11/21] bpf: add task and task/file iterator targets Yonghong Song
@ 2020-05-08 19:36   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:36 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
> Only the tasks belonging to the "current" pid namespace
> are enumerated.
>
> For task/file target, the bpf program will have access to
>   struct task_struct *task
>   u32 fd
>   struct file *file
> where fd/file is an open file for the task.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  kernel/bpf/Makefile    |   2 +-
>  kernel/bpf/task_iter.c | 332 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 333 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/task_iter.c
>

Looks good, though I'd just use bpf_iter_seq_task_file_info directly,
given it is the state of the iterator by design. It would probably
simplify the code a bit as well. But either way I can't find any bug
in the code, so:

Acked-by: Andrii Nakryiko <andriin@fb.com>


[...]

> +
> +struct bpf_iter_seq_task_common {
> +       struct pid_namespace *ns;
> +};
> +
> +struct bpf_iter_seq_task_info {
> +       /* The first field must be struct bpf_iter_seq_task_common.
> +        * this is assumed by {init, fini}_seq_pidns() callback functions.
> +        */

Awesome, thanks!

> +       struct bpf_iter_seq_task_common common;
> +       u32 tid;
> +};
> +

[...]

> +
> +static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)

nit: v is very non-descriptive name, prev_task?

> +{
> +       struct bpf_iter_seq_task_info *info = seq->private;
> +       struct task_struct *task;
> +
> +       ++*pos;
> +       ++info->tid;
> +       put_task_struct((struct task_struct *)v);
> +       task = task_seq_get_next(info->common.ns, &info->tid);
> +       if (!task)
> +               return NULL;
> +
> +       return task;
> +}
> +
> +struct bpf_iter__task {
> +       __bpf_md_ptr(struct bpf_iter_meta *, meta);
> +       __bpf_md_ptr(struct task_struct *, task);
> +};
> +
> +DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
> +
> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)

same nit: v -> task? Can also specify `struct task_struct *` type,
same for above.

[...]

> +
> +struct bpf_iter_seq_task_file_info {
> +       /* The first field must be struct bpf_iter_seq_task_common.
> +        * this is assumed by {init, fini}_seq_pidns() callback functions.
> +        */
> +       struct bpf_iter_seq_task_common common;
> +       struct task_struct *task;
> +       struct files_struct *files;
> +       u32 tid;
> +       u32 fd;
> +};
> +
> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *tid,
> +                                          int *fd, struct task_struct **task,
> +                                          struct files_struct **fstruct)

Why not just passing struct bpf_iter_seq_task_file_info* directly,
instead of these 5 individual pointers?
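
E.g. (sketch):

    static struct file *
    task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info);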

> +{
> +       struct files_struct *curr_files;
> +       struct task_struct *curr_task;
> +       u32 curr_tid = *tid, max_fds;
> +       int curr_fd = *fd;
> +
> +       /* If this function returns a non-NULL file object,
> +        * it held a reference to the task/files_struct/file.
> +        * Otherwise, it does not hold any reference.
> +        */

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-07  5:39 ` [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
@ 2020-05-08 19:44   ` Andrii Nakryiko
  2020-05-09  4:18     ` Yonghong Song
  0 siblings, 1 reply; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:44 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two helpers, bpf_seq_printf and bpf_seq_write, are added for
> writing data to the seq_file buffer.
>
> bpf_seq_printf supports common format string flag/width/type
> fields so at least I can get identical results for
> netlink and ipv6_route targets.
>
> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
> specifically indicates a write failure due to overflow, which
> means the object will be repeated in the next bpf invocation
> if the object collection stays the same. Note that if the object
> collection is changed, depending on how collection traversal is
> done, the object may not be visited even if it is still in the
> collection.
>
> bpf_seq_printf may return -EBUSY meaning that internal percpu
> buffer for memory copy of strings or other pointees is
> not available. Bpf program can return 1 to indicate it
> wants the same object to be repeated. Right now, this should not
> happen on no-RT kernels since migrate_disable(), which guards
> bpf prog call, calls preempt_disable().
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/uapi/linux/bpf.h       |  32 +++++-
>  kernel/trace/bpf_trace.c       | 200 +++++++++++++++++++++++++++++++++
>  scripts/bpf_helpers_doc.py     |   2 +
>  tools/include/uapi/linux/bpf.h |  32 +++++-
>  4 files changed, 264 insertions(+), 2 deletions(-)
>

Was a bit surprised by the behavior on a failed memory read; I think it's
important to emphasize and document this. But otherwise:

Acked-by: Andrii Nakryiko <andriin@fb.com>

[...]

> +               if (fmt[i] == 's') {
> +                       /* try our best to copy */
> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
> +                               err = -E2BIG;
> +                               goto out;
> +                       }
> +
> +                       bufs->buf[memcpy_cnt][0] = 0;
> +                       strncpy_from_unsafe(bufs->buf[memcpy_cnt],
> +                                           (void *) (long) args[fmt_cnt],
> +                                           MAX_SEQ_PRINTF_STR_LEN);

So the behavior is that we try to read the string, but if it fails, we
treat it as an empty string? That needs to be documented, IMHO. My
expectation was that the entire printf would fail.

Same for pointers below, right?

> +                       params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
> +
> +                       fmt_cnt++;
> +                       memcpy_cnt++;
> +                       continue;
> +               }
> +

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 16/21] tools/libbpf: add bpf_iter support
  2020-05-07  5:39 ` [PATCH bpf-next v3 16/21] tools/libbpf: add bpf_iter support Yonghong Song
@ 2020-05-08 19:46   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:46 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:42 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two new libbpf APIs are added to support bpf_iter:
>   - bpf_program__attach_iter
>     Given a bpf program and additional parameters, which is
>     none now, returns a bpf_link.
>   - bpf_iter_create
>     syscall level API to create a bpf iterator.
>
> The macro BPF_SEQ_PRINTF is also introduced. The format
> looks like:
>   BPF_SEQ_PRINTF(seq, "task id %d\n", pid);
>
> This macro can help bpf program writers with
> nicer bpf_seq_printf syntax similar to the kernel one.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Looks great!

Acked-by: Andrii Nakryiko <andriin@fb.com>
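
For reference, creating and reading an anonymous iterator with these two
APIs looks roughly like this (sketch; error handling elided, prog is a
loaded iter/ program):

    struct bpf_link *link;
    char buf[4096];
    int iter_fd, len;

    link = bpf_program__attach_iter(prog, NULL);
    iter_fd = bpf_iter_create(bpf_link__fd(link));
    while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, len, stdout); /* consume iterator output */
    close(iter_fd);
    bpf_link__destroy(link);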

>  tools/lib/bpf/bpf.c         | 10 +++++++
>  tools/lib/bpf/bpf.h         |  2 ++
>  tools/lib/bpf/bpf_tracing.h | 16 ++++++++++++
>  tools/lib/bpf/libbpf.c      | 52 +++++++++++++++++++++++++++++++++++++
>  tools/lib/bpf/libbpf.h      |  9 +++++++
>  tools/lib/bpf/libbpf.map    |  2 ++
>  6 files changed, 91 insertions(+)
>

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 17/21] tools/libpf: add offsetof/container_of macro in bpf_helpers.h
  2020-05-07  5:39 ` [PATCH bpf-next v3 17/21] tools/libpf: add offsetof/container_of macro in bpf_helpers.h Yonghong Song
@ 2020-05-08 19:48   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:48 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
> These two helpers will be used later in bpf_iter bpf program
> bpf_iter_netlink.c. Put them in bpf_helpers.h since they could
> be useful in other cases.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

Not sure #ifndef/#endif guards are needed, but won't hurt either.

Acked-by: Andrii Nakryiko <andriin@fb.com>
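
For reference, the standard definitions (with the guards) look roughly
like:

    #ifndef offsetof
    #define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER)
    #endif

    #ifndef container_of
    #define container_of(ptr, type, member)                 \
        ({                                                  \
            void *__mptr = (void *)(ptr);                   \
            ((type *)(__mptr - offsetof(type, member)));    \
        })
    #endif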

>  tools/lib/bpf/bpf_helpers.h | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
>

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 18/21] tools/bpftool: add bpf_iter support for bptool
  2020-05-07  5:39 ` [PATCH bpf-next v3 18/21] tools/bpftool: add bpf_iter support for bptool Yonghong Song
@ 2020-05-08 19:51   ` Andrii Nakryiko
  2020-05-09  5:26     ` Yonghong Song
  0 siblings, 1 reply; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:51 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
> Currently, only one command is supported
>   bpftool iter pin <bpf_prog.o> <path>
>
> It will pin the trace/iter bpf program in
> the object file <bpf_prog.o> to the <path>
> where <path> should be on a bpffs mount.
>
> For example,
>   $ bpftool iter pin ./bpf_iter_ipv6_route.o \
>     /sys/fs/bpf/my_route
> User can then do a `cat` to print out the results:
>   $ cat /sys/fs/bpf/my_route
>     fe800000000000000000000000000000 40 00000000000000000000000000000000 ...
>     00000000000000000000000000000000 00 00000000000000000000000000000000 ...
>     00000000000000000000000000000001 80 00000000000000000000000000000000 ...
>     fe800000000000008c0162fffebdfd57 80 00000000000000000000000000000000 ...
>     ff000000000000000000000000000000 08 00000000000000000000000000000000 ...
>     00000000000000000000000000000000 00 00000000000000000000000000000000 ...
>
> The implementation for ipv6_route iterator is in one of subsequent
> patches.
>
> This patch also added BPF_LINK_TYPE_ITER to link query.
>
> In the future, we may add additional parameters to the pin command
> by parameterizing the bpf iterator. For example, a map_id or pid
> may be added to let the bpf program traverse only a single map or task,
> similar to kernel seq_file single_open().
>
> We may also add introspection command for targets/iterators by
> leveraging the bpf_iter itself.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  .../bpftool/Documentation/bpftool-iter.rst    | 83 ++++++++++++++++++
>  tools/bpf/bpftool/bash-completion/bpftool     | 13 +++
>  tools/bpf/bpftool/iter.c                      | 84 +++++++++++++++++++
>  tools/bpf/bpftool/link.c                      |  1 +
>  tools/bpf/bpftool/main.c                      |  3 +-
>  tools/bpf/bpftool/main.h                      |  1 +
>  6 files changed, 184 insertions(+), 1 deletion(-)
>  create mode 100644 tools/bpf/bpftool/Documentation/bpftool-iter.rst
>  create mode 100644 tools/bpf/bpftool/iter.c
>

[...]

> diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c
> new file mode 100644
> index 000000000000..a8fb1349c103
> --- /dev/null
> +++ b/tools/bpf/bpftool/iter.c
> @@ -0,0 +1,84 @@
> +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +// Copyright (C) 2020 Facebook
> +
> +#define _GNU_SOURCE
> +#include <linux/err.h>
> +#include <bpf/libbpf.h>
> +
> +#include "main.h"
> +
> +static int do_pin(int argc, char **argv)
> +{
> +       const char *objfile, *path;
> +       struct bpf_program *prog;
> +       struct bpf_object *obj;
> +       struct bpf_link *link;
> +       int err;
> +
> +       if (!REQ_ARGS(2))
> +               usage();
> +
> +       objfile = GET_ARG();
> +       path = GET_ARG();
> +
> +       obj = bpf_object__open(objfile);
> +       if (IS_ERR_OR_NULL(obj)) {

nit: can't be NULL

> +               p_err("can't open objfile %s", objfile);
> +               return -1;
> +       }
> +
> +       err = bpf_object__load(obj);
> +       if (err) {
> +               p_err("can't load objfile %s", objfile);
> +               goto close_obj;
> +       }
> +
> +       prog = bpf_program__next(NULL, obj);

check for null and printf error? Crashing is not good.
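
E.g. (sketch):

    prog = bpf_program__next(NULL, obj);
    if (!prog) {
        p_err("can't find bpf program in objfile %s", objfile);
        err = -1;
        goto close_obj;
    }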

> +       link = bpf_program__attach_iter(prog, NULL);
> +       if (IS_ERR(link)) {
> +               err = PTR_ERR(link);
> +               p_err("attach_iter failed for program %s",
> +                     bpf_program__name(prog));
> +               goto close_obj;
> +       }
> +
> +       err = mount_bpffs_for_pin(path);
> +       if (err)
> +               goto close_link;
> +
> +       err = bpf_link__pin(link, path);
> +       if (err) {
> +               p_err("pin_iter failed for program %s to path %s",
> +                     bpf_program__name(prog), path);
> +               goto close_link;
> +       }
> +
> +close_link:
> +       bpf_link__disconnect(link);

this is wrong, just destroy()

> +       bpf_link__destroy(link);
> +close_obj:
> +       bpf_object__close(obj);
> +       return err;
> +}
> +

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 21/21] tools/bpf: selftests: add bpf_iter selftests
  2020-05-07  5:39 ` [PATCH bpf-next v3 21/21] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
@ 2020-05-08 19:57   ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-08 19:57 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Wed, May 6, 2020 at 10:41 PM Yonghong Song <yhs@fb.com> wrote:
>
> The added test includes the following subtests:
>   - test verifier change for btf_id_or_null
>   - test load/create_iter/read for
>     ipv6_route/netlink/bpf_map/task/task_file
>   - test anon bpf iterator
>   - test anon bpf iterator reading one char at a time
>   - test file bpf iterator
>   - test overflow (single bpf program output does not overflow)
>   - test overflow (single bpf program output overflows)
>   - test bpf prog returning 1
>
> The ipv6_route tests the following verifier change
>   - access fields in the variable length array of the structure.
>
> The netlink load tests the following verifier change
>   - put a btf_id ptr value on the stack, accessible to
>     tracing/iter programs.
>
> The anon bpf iterator also tests link auto attach through skeleton.
>
>   $ test_progs -n 2
>   #2/1 btf_id_or_null:OK
>   #2/2 ipv6_route:OK
>   #2/3 netlink:OK
>   #2/4 bpf_map:OK
>   #2/5 task:OK
>   #2/6 task_file:OK
>   #2/7 anon:OK
>   #2/8 anon-read-one-char:OK
>   #2/9 file:OK
>   #2/10 overflow:OK
>   #2/11 overflow-e2big:OK
>   #2/12 prog-ret-1:OK
>   #2 bpf_iter:OK
>   Summary: 1/12 PASSED, 0 SKIPPED, 0 FAILED
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---

I'm personally not a big fan of the bpf_iter_test_kern_common.h approach
(including it and parameterizing with #define); I'd rather just
copy/paste the BPF program code (it's just a few lines) and maybe even
put them in the same file/skeleton, fewer files to jump between. But
that's just personal preference, so:

Acked-by: Andrii Nakryiko <andriin@fb.com>
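
For reference, the parameterization in question boils down to files like
(sketch; the exact defines are in the patch):

    /* bpf_iter_test_kern1.c */
    #define START_CHAR 'a'
    #include "bpf_iter_test_kern_common.h"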

>  .../selftests/bpf/prog_tests/bpf_iter.c       | 408 ++++++++++++++++++
>  .../selftests/bpf/progs/bpf_iter_test_kern1.c |   4 +
>  .../selftests/bpf/progs/bpf_iter_test_kern2.c |   4 +
>  .../selftests/bpf/progs/bpf_iter_test_kern3.c |  18 +
>  .../selftests/bpf/progs/bpf_iter_test_kern4.c |  52 +++
>  .../bpf/progs/bpf_iter_test_kern_common.h     |  22 +
>  6 files changed, 508 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
>

[...]

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-08 18:24   ` Andrii Nakryiko
@ 2020-05-09  1:36     ` Yonghong Song
  2020-05-12  3:15       ` Andrii Nakryiko
  0 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-09  1:36 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/8/20 11:24 AM, Andrii Nakryiko wrote:
> On Wed, May 6, 2020 at 10:41 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Given a bpf program, the steps to create an anonymous bpf iterator are:
>>    - create a bpf_iter_link, which combines bpf program and the target.
>>      In the future, there could be more information recorded in the link.
>>      A link_fd will be returned to the user space.
>>    - create an anonymous bpf iterator with the given link_fd.
>>
>> The bpf_iter_link can be pinned to bpffs mount file system to
>> create a file based bpf iterator as well.
>>
>> The benefits of using bpf_iter_link:
>>    - using bpf link simplifies design and implementation as bpf link
>>      is used for other tracing bpf programs.
>>    - for file based bpf iterator, bpf_iter_link provides a standard
>>      way to replace underlying bpf programs.
>>    - for both anonymous and file based iterators, bpf link query
>>      capability can be leveraged.
>>
>> The patch added support for tracing/iter programs for BPF_LINK_CREATE.
>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
>> querying. Currently, only prog_id is needed, so no additional
>> in-kernel show_fdinfo() and fill_link_info() hooks are needed
>> for the BPF_LINK_TYPE_ITER link.
>>
>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> still looks good, but I realized show_fdinfo and fill_link_info are
> missing, see request for a follow-up below :)
> 
> 
>>   include/linux/bpf.h            |  1 +
>>   include/linux/bpf_types.h      |  1 +
>>   include/uapi/linux/bpf.h       |  1 +
>>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>   kernel/bpf/syscall.c           | 14 ++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   6 files changed, 80 insertions(+)
>>
> 
> [...]
> 
>> +static const struct bpf_link_ops bpf_iter_link_lops = {
>> +       .release = bpf_iter_link_release,
>> +       .dealloc = bpf_iter_link_dealloc,
>> +};
> 
> Link infra supports .show_fdinfo and .fill_link_info methods, there is
> no need to block on this, but it would be great to implement them for
> BPF_LINK_TYPE_ITER as well in the same release as a follow-up. Thanks!

The reason I did not implement them is that we do not have additional
information beyond prog_id to present. The prog_id itself gives all
information about this link. I looked at the tracing program
show_fdinfo/fill_link_info; the additional attach_type is printed there.
But attach_type is obvious for BPF_LINK_TYPE_ITER and does not
need printing.

In the future, when we add more stuff to parameterize the bpf_iter, we
will need to implement these two callbacks, as well as bpftool support.
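
When we do, it could start as small as (sketch; assumes the link keeps a
pointer to its target info, field names are hypothetical):

    static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
                                          struct seq_file *seq)
    {
        struct bpf_iter_link *iter_link =
            container_of(link, struct bpf_iter_link, link);

        seq_printf(seq, "target_name:\t%s\n", iter_link->tinfo->target);
    }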

> 
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 05/21] bpf: implement bpf_seq_read() for bpf iterator
  2020-05-08 18:52   ` Andrii Nakryiko
@ 2020-05-09  1:41     ` Yonghong Song
  0 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-09  1:41 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/8/20 11:52 AM, Andrii Nakryiko wrote:
> On Wed, May 6, 2020 at 10:39 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> bpf iterator uses seq_file to provide a lossless
>> way to transfer data to user space. But we want to call
>> bpf program after all objects have been traversed, and
>> bpf program may write additional data to the
>> seq_file buffer. The current seq_read() does not work
>> for this use case.
>>
>> Besides allowing the stop() function to write to the buffer,
>> bpf_seq_read() also fixes the buffer size to one page.
>> If any single call of show() or stop() emits more than one
>> page of data and causes an overflow, the -E2BIG error code
>> will be returned to user space.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
> 
> This loop is much simpler and more streamlined now, thanks a lot! I
think it's correct; see below about one confusing (but apparently
> correct) bit, though. Either way:
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
>>   kernel/bpf/bpf_iter.c | 118 ++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 118 insertions(+)
>>
>> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
>> index 0542a243b78c..f198597b0ea4 100644
>> --- a/kernel/bpf/bpf_iter.c
>> +++ b/kernel/bpf/bpf_iter.c
>> @@ -26,6 +26,124 @@ static DEFINE_MUTEX(targets_mutex);
>>   /* protect bpf_iter_link changes */
>>   static DEFINE_MUTEX(link_mutex);
>>
>> +/* bpf_seq_read, a customized and simpler version for bpf iterator.
>> + * no_llseek is assumed for this file.
>> + * The following are differences from seq_read():
>> + *  . fixed buffer size (PAGE_SIZE)
>> + *  . assuming no_llseek
>> + *  . stop() may call bpf program, handling potential overflow there
>> + */
>> +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>> +                           loff_t *ppos)
>> +{
>> +       struct seq_file *seq = file->private_data;
>> +       size_t n, offs, copied = 0;
>> +       int err = 0;
>> +       void *p;
>> +
>> +       mutex_lock(&seq->lock);
>> +
>> +       if (!seq->buf) {
>> +               seq->size = PAGE_SIZE;
>> +               seq->buf = kmalloc(seq->size, GFP_KERNEL);
>> +               if (!seq->buf) {
>> +                       err = -ENOMEM;
>> +                       goto done;
> 
> oh, thank you for converting to all lower-case label names! :)
> 
>> +               }
>> +       }
>> +
>> +       if (seq->count) {
>> +               n = min(seq->count, size);
>> +               err = copy_to_user(buf, seq->buf + seq->from, n);
>> +               if (err) {
>> +                       err = -EFAULT;
>> +                       goto done;
>> +               }
>> +               seq->count -= n;
>> +               seq->from += n;
>> +               copied = n;
>> +               goto done;
>> +       }
>> +
>> +       seq->from = 0;
>> +       p = seq->op->start(seq, &seq->index);
>> +       if (IS_ERR_OR_NULL(p))
>> +               goto stop;
> 
> if start() returns IS_ERR(p), stop(p) below won't produce any output
> (because BPF program is called only for p == NULL), so we'll just
> return 0 with no error; do I interpret the code correctly? I think
> seq_file's read actually returns PTR_ERR(p) as a result in this case.
> 
> so I think you need err = PTR_ERR(p); before goto stop here?

Thanks for catching this!
Yes, seq_file's read indeed returns PTR_ERR(p) to user space here.
Will make the change.

> 
>> +
>> +       err = seq->op->show(seq, p);
>> +       if (err > 0) {
>> +               seq->count = 0;
>> +       } else if (err < 0 || seq_has_overflowed(seq)) {
>> +               if (!err)
>> +                       err = -E2BIG;
>> +               seq->count = 0;
>> +               seq->op->stop(seq, p);
>> +               goto done;
>> +       }
>> +
>> +       while (1) {
>> +               loff_t pos = seq->index;
>> +
>> +               offs = seq->count;
>> +               p = seq->op->next(seq, p, &seq->index);
>> +               if (pos == seq->index) {
>> +                       pr_info_ratelimited("buggy seq_file .next function %ps "
>> +                               "did not update position index\n",
>> +                               seq->op->next);
>> +                       seq->index++;
>> +               }
>> +
>> +               if (IS_ERR_OR_NULL(p)) {
>> +                       err = PTR_ERR(p);
>> +                       break;
>> +               }
>> +               if (seq->count >= size)
>> +                       break;
>> +
>> +               err = seq->op->show(seq, p);
>> +               if (err > 0) {
>> +                       seq->count = offs;
>> +               } else if (err < 0 || seq_has_overflowed(seq)) {
>> +                       seq->count = offs;
>> +                       if (!err)
>> +                               err = -E2BIG;
> 
> nit: this -E2BIG is set unconditionally even for 2nd+ show(). This
> will work, because it will get ignored on next iteration, but I think
> it will be much more obvious if written as:
> 
> if (!err && offs == 0)
>      err = -E2BIG;

Yes, will make the change since it indeed makes code more readable.

> 
> It took me a few re-readings of the code I'm pretty familiar with
> already to realize that this is ok.
> 
> I had to write the piece below to realize that this is fine :) Just
> leaving it here in case you find it useful:
> 
> else if (err < 0 || seq_has_overflowed(seq)) {
>      if (!err && offs == 0) /* overflow in first show() output */
>          err = -E2BIG;
>      if (err) {             /* overflow in first show() or real error happened */
>          seq->count = 0; /* not strictly necessary, but shows that we are truncating output */
>          seq->op->stop(seq, p);
>          goto done; /* done will return err */
>      }
>      /* no error and overflow for 2nd+ show(), roll back output and stop */
>      seq->count = offs;
>      break;
> }
> 
>> +                       if (offs == 0) {
>> +                               seq->op->stop(seq, p);
>> +                               goto done;
>> +                       }
>> +                       break;
>> +               }
>> +       }
>> +stop:
>> +       offs = seq->count;
>> +       /* bpf program called if !p */
>> +       seq->op->stop(seq, p);
>> +       if (!p && seq_has_overflowed(seq)) {
>> +               seq->count = offs;
>> +               if (offs == 0) {
>> +                       err = -E2BIG;
>> +                       goto done;
>> +               }
>> +       }
>> +
>> +       n = min(seq->count, size);
>> +       err = copy_to_user(buf, seq->buf, n);
>> +       if (err) {
>> +               err = -EFAULT;
>> +               goto done;
>> +       }
>> +       copied = n;
>> +       seq->count -= n;
>> +       seq->from = n;
>> +done:
>> +       if (!copied)
>> +               copied = err;
>> +       else
>> +               *ppos += copied;
>> +       mutex_unlock(&seq->lock);
>> +       return copied;
>> +}
>> +
>>   int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>>   {
>>          struct bpf_iter_target_info *tinfo;
>> --
>> 2.24.1
>>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators
  2020-05-08 19:07   ` Andrii Nakryiko
@ 2020-05-09  3:18     ` Yonghong Song
  2020-05-12  3:16       ` Andrii Nakryiko
  0 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-09  3:18 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/8/20 12:07 PM, Andrii Nakryiko wrote:
> On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Macro DEFINE_BPF_ITER_FUNC is implemented so target
>> can define an init function to capture the BTF type
>> which represents the target.
>>
>> The bpf_iter_meta is a structure holding meta data, common
>> to all targets in the bpf program.
>>
>> Additional marker functions are called before or after
>> bpf_seq_read() show()/next()/stop() callback functions
>> to help calculate precise seq_num and whether to call bpf_prog
>> inside stop().
>>
>> Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
>> are implemented so target can get needed information from
>> bpf_iter infrastructure and can run the program.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h   | 11 ++++++
>>   kernel/bpf/bpf_iter.c | 86 ++++++++++++++++++++++++++++++++++++++++---
>>   2 files changed, 92 insertions(+), 5 deletions(-)
>>
> 
> Looks good. I was worried about re-using seq_num when an element is
> skipped, but it can already happen that the same seq_num is associated
> with different objects: overflow + retry returns a different object
> (because iteration is not a snapshot, so the element could be gone on
> retry). Both cases will have to be handled in about the same fashion,
> so it's fine.

This is what I thought as well.

> 
> Hm... Could this be a problem for the start() implementation? E.g., if
> the object is still there, but the iterator wants to skip it permanently.
> Re-using seq_num will mean that start() will keep trying to fetch the
> same to-be-skipped element? Not sure, please think about it, but we can
> fix it up later, if necessary.

The seq_num is for the bpf_program context. It does not affect how start()
behaves. The start() MAY retry the same element over and over again
if show() overflows or returns <0, in which case user space
should check the return error code to decide whether to retry or give up.
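
E.g., from user space (sketch; consume() is a placeholder):

    while ((n = read(iter_fd, buf, sizeof(buf))) != 0) {
        if (n < 0) {
            if (errno == EAGAIN)
                continue;   /* transient, retry */
            break;          /* e.g. E2BIG: give up on this iterator */
        }
        consume(buf, n);    /* got valid data */
    }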

> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
> [...]
> 
>> @@ -112,11 +143,16 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>>                          err = PTR_ERR(p);
>>                          break;
>>                  }
>> +
>> +               /* get a valid next object, increase seq_num */
> 
> typo: get -> got

Ack.

> 
>> +               bpf_iter_inc_seq_num(seq);
>> +
>>                  if (seq->count >= size)
>>                          break;
>>
>>                  err = seq->op->show(seq, p);
>>                  if (err > 0) {
>> +                       bpf_iter_dec_seq_num(seq);
>>                          seq->count = offs;
>>                  } else if (err < 0 || seq_has_overflowed(seq)) {
>>                          seq->count = offs;
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-08 19:44   ` Andrii Nakryiko
@ 2020-05-09  4:18     ` Yonghong Song
  2020-05-09  5:30       ` Alexei Starovoitov
  0 siblings, 1 reply; 45+ messages in thread
From: Yonghong Song @ 2020-05-09  4:18 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/8/20 12:44 PM, Andrii Nakryiko wrote:
> On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Two helpers, bpf_seq_printf and bpf_seq_write, are added for
>> writing data to the seq_file buffer.
>>
>> bpf_seq_printf supports common format string flag/width/type
>> fields so at least I can get identical results for
>> netlink and ipv6_route targets.
>>
>> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
>> specifically indicates a write failure due to overflow, which
>> means the object will be repeated in the next bpf invocation
>> if object collection stays the same. Note that if the object
>> collection is changed, depending how collection traversal is
>> done, even if the object still in the collection, it may not
>> be visited.
>>
>> bpf_seq_printf may return -EBUSY meaning that internal percpu
>> buffer for memory copy of strings or other pointees is
>> not available. Bpf program can return 1 to indicate it
>> wants the same object to be repeated. Right now, this should not
>> happen on no-RT kernels since migrate_disable(), which guards
>> bpf prog call, calls preempt_disable().
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/uapi/linux/bpf.h       |  32 +++++-
>>   kernel/trace/bpf_trace.c       | 200 +++++++++++++++++++++++++++++++++
>>   scripts/bpf_helpers_doc.py     |   2 +
>>   tools/include/uapi/linux/bpf.h |  32 +++++-
>>   4 files changed, 264 insertions(+), 2 deletions(-)
>>
> 
> Was a bit surprised by the behavior on a failed memory read; I think it's
> important to emphasize and document this. But otherwise:
> 
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> 
> [...]
> 
>> +               if (fmt[i] == 's') {
>> +                       /* try our best to copy */
>> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
>> +                               err = -E2BIG;
>> +                               goto out;
>> +                       }
>> +
>> +                       bufs->buf[memcpy_cnt][0] = 0;
>> +                       strncpy_from_unsafe(bufs->buf[memcpy_cnt],
>> +                                           (void *) (long) args[fmt_cnt],
>> +                                           MAX_SEQ_PRINTF_STR_LEN);
> 
> So the behavior is that we try to read the string, but if it fails, we
> treat it as an empty string? That needs to be documented, IMHO. My
> expectation was that the entire printf would fail.

Let me return a proper error. Currently, two possible errors may happen:
   - the user provides an invalid address; yes, an error should be returned
     and we should not do anything
   - the user provides a valid address, but a page fault is needed
     to read the content. With the current implementation,
     strncpy_from_unsafe will fail. A future sleepable
     bpf program will help for this case, so an error means a
     real address error.
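
Something like this on top of the quoted hunk (sketch):

    bufs->buf[memcpy_cnt][0] = 0;
    err = strncpy_from_unsafe(bufs->buf[memcpy_cnt],
                              (void *) (long) args[fmt_cnt],
                              MAX_SEQ_PRINTF_STR_LEN);
    if (err < 0)
        goto out;   /* propagate the error instead of printing "" */
    err = 0;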

> 
> Same for pointers below, right?
> 
>> +                       params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
>> +
>> +                       fmt_cnt++;
>> +                       memcpy_cnt++;
>> +                       continue;
>> +               }
>> +
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 18/21] tools/bpftool: add bpf_iter support for bptool
  2020-05-08 19:51   ` Andrii Nakryiko
@ 2020-05-09  5:26     ` Yonghong Song
  0 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-09  5:26 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/8/20 12:51 PM, Andrii Nakryiko wrote:
> On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Currently, only one command is supported
>>    bpftool iter pin <bpf_prog.o> <path>
>>
>> It will pin the trace/iter bpf program in
>> the object file <bpf_prog.o> to the <path>
>> where <path> should be on a bpffs mount.
>>
>> For example,
>>    $ bpftool iter pin ./bpf_iter_ipv6_route.o \
>>      /sys/fs/bpf/my_route
>> User can then do a `cat` to print out the results:
>>    $ cat /sys/fs/bpf/my_route
>>      fe800000000000000000000000000000 40 00000000000000000000000000000000 ...
>>      00000000000000000000000000000000 00 00000000000000000000000000000000 ...
>>      00000000000000000000000000000001 80 00000000000000000000000000000000 ...
>>      fe800000000000008c0162fffebdfd57 80 00000000000000000000000000000000 ...
>>      ff000000000000000000000000000000 08 00000000000000000000000000000000 ...
>>      00000000000000000000000000000000 00 00000000000000000000000000000000 ...
>>
>> The implementation for ipv6_route iterator is in one of subsequent
>> patches.
>>
>> This patch also added BPF_LINK_TYPE_ITER to link query.
>>
>> In the future, we may add additional parameters to the pin command
>> by parameterizing the bpf iterator. For example, a map_id or pid
>> may be added to let the bpf program traverse only a single map or task,
>> similar to kernel seq_file single_open().
>>
>> We may also add introspection command for targets/iterators by
>> leveraging the bpf_iter itself.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   .../bpftool/Documentation/bpftool-iter.rst    | 83 ++++++++++++++++++
>>   tools/bpf/bpftool/bash-completion/bpftool     | 13 +++
>>   tools/bpf/bpftool/iter.c                      | 84 +++++++++++++++++++
>>   tools/bpf/bpftool/link.c                      |  1 +
>>   tools/bpf/bpftool/main.c                      |  3 +-
>>   tools/bpf/bpftool/main.h                      |  1 +
>>   6 files changed, 184 insertions(+), 1 deletion(-)
>>   create mode 100644 tools/bpf/bpftool/Documentation/bpftool-iter.rst
>>   create mode 100644 tools/bpf/bpftool/iter.c
>>
> 
> [...]
> 
>> diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c
>> new file mode 100644
>> index 000000000000..a8fb1349c103
>> --- /dev/null
>> +++ b/tools/bpf/bpftool/iter.c
>> @@ -0,0 +1,84 @@
>> +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
>> +// Copyright (C) 2020 Facebook
>> +
>> +#define _GNU_SOURCE
>> +#include <linux/err.h>
>> +#include <bpf/libbpf.h>
>> +
>> +#include "main.h"
>> +
>> +static int do_pin(int argc, char **argv)
>> +{
>> +       const char *objfile, *path;
>> +       struct bpf_program *prog;
>> +       struct bpf_object *obj;
>> +       struct bpf_link *link;
>> +       int err;
>> +
>> +       if (!REQ_ARGS(2))
>> +               usage();
>> +
>> +       objfile = GET_ARG();
>> +       path = GET_ARG();
>> +
>> +       obj = bpf_object__open(objfile);
>> +       if (IS_ERR_OR_NULL(obj)) {
> 
> nit: can't be NULL

Ack. Will change.

> 
>> +               p_err("can't open objfile %s", objfile);
>> +               return -1;
>> +       }
>> +
>> +       err = bpf_object__load(obj);
>> +       if (err) {
>> +               p_err("can't load objfile %s", objfile);
>> +               goto close_obj;
>> +       }
>> +
>> +       prog = bpf_program__next(NULL, obj);
> 
> check for null and printf error? Crashing is not good.

Make sense. Will change.

> 
>> +       link = bpf_program__attach_iter(prog, NULL);
>> +       if (IS_ERR(link)) {
>> +               err = PTR_ERR(link);
>> +               p_err("attach_iter failed for program %s",
>> +                     bpf_program__name(prog));
>> +               goto close_obj;
>> +       }
>> +
>> +       err = mount_bpffs_for_pin(path);
>> +       if (err)
>> +               goto close_link;
>> +
>> +       err = bpf_link__pin(link, path);
>> +       if (err) {
>> +               p_err("pin_iter failed for program %s to path %s",
>> +                     bpf_program__name(prog), path);
>> +               goto close_link;
>> +       }
>> +
>> +close_link:
>> +       bpf_link__disconnect(link);
> 
> this is wrong, just destroy()

Will change.

> 
>> +       bpf_link__destroy(link);
>> +close_obj:
>> +       bpf_object__close(obj);
>> +       return err;
>> +}
>> +
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-09  4:18     ` Yonghong Song
@ 2020-05-09  5:30       ` Alexei Starovoitov
  2020-05-09  6:04         ` Yonghong Song
  0 siblings, 1 reply; 45+ messages in thread
From: Alexei Starovoitov @ 2020-05-09  5:30 UTC (permalink / raw)
  To: Yonghong Song, Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Daniel Borkmann, Kernel Team

On 5/8/20 9:18 PM, Yonghong Song wrote:
> 
> 
> On 5/8/20 12:44 PM, Andrii Nakryiko wrote:
>> On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>>>
>>> Two helpers, bpf_seq_printf and bpf_seq_write, are added for
>>> writing data to the seq_file buffer.
>>>
>>> bpf_seq_printf supports common format string flag/width/type
>>> fields so at least I can get identical results for
>>> netlink and ipv6_route targets.
>>>
>>> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
>>> specifically indicates a write failure due to overflow, which
>>> means the object will be repeated in the next bpf invocation
>>> if the object collection stays the same. Note that if the object
>>> collection is changed, depending on how collection traversal is
>>> done, the object may not be visited even if it is still in the
>>> collection.
>>>
>>> bpf_seq_printf may return -EBUSY meaning that internal percpu
>>> buffer for memory copy of strings or other pointees is
>>> not available. Bpf program can return 1 to indicate it
>>> wants the same object to be repeated. Right now, this should not
>>> happen on no-RT kernels since migrate_disable(), which guards
>>> bpf prog call, calls preempt_disable().
>>>
>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>> ---
>>>   include/uapi/linux/bpf.h       |  32 +++++-
>>>   kernel/trace/bpf_trace.c       | 200 +++++++++++++++++++++++++++++++++
>>>   scripts/bpf_helpers_doc.py     |   2 +
>>>   tools/include/uapi/linux/bpf.h |  32 +++++-
>>>   4 files changed, 264 insertions(+), 2 deletions(-)
>>>
>>
>> Was a bit surprised by the behavior on a failed memory read; I think it's
>> important to emphasize and document this. But otherwise:
>>
>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>>
>> [...]
>>
>>> +               if (fmt[i] == 's') {
>>> +                       /* try our best to copy */
>>> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
>>> +                               err = -E2BIG;
>>> +                               goto out;
>>> +                       }
>>> +
>>> +                       bufs->buf[memcpy_cnt][0] = 0;
>>> +                       strncpy_from_unsafe(bufs->buf[memcpy_cnt],
>>> +                                           (void *) (long) 
>>> args[fmt_cnt],
>>> +                                           MAX_SEQ_PRINTF_STR_LEN);
>>
>> So the behavior is that we try to read the string, but if it fails, we
>> treat it as an empty string? That needs to be documented, IMHO. My
>> expectation was that the entire printf would fail.
> 
> Let me return a proper error. Currently, two possible errors may happen:
>    - the user provides an invalid address; yes, an error should be returned
>      and we should not do anything
>    - the user provides a valid address, but a page fault is needed
>      to read the content. With the current implementation,
>      strncpy_from_unsafe will fail. A future sleepable
>      bpf program will help for this case, so an error means a
>      real address error.

It matches what bpf_trace_printk() is doing.
I suggest deferring any improvements to later patches.
Both should be consistent.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers
  2020-05-09  5:30       ` Alexei Starovoitov
@ 2020-05-09  6:04         ` Yonghong Song
  0 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-09  6:04 UTC (permalink / raw)
  To: Alexei Starovoitov, Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Daniel Borkmann, Kernel Team



On 5/8/20 10:30 PM, Alexei Starovoitov wrote:
> On 5/8/20 9:18 PM, Yonghong Song wrote:
>>
>>
>> On 5/8/20 12:44 PM, Andrii Nakryiko wrote:
>>> On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>>>>
>>>> Two helpers, bpf_seq_printf and bpf_seq_write, are added for
>>>> writing data to the seq_file buffer.
>>>>
>>>> bpf_seq_printf supports common format string flag/width/type
>>>> fields so at least I can get identical results for
>>>> netlink and ipv6_route targets.
>>>>
>>>> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
>>>> specifically indicates a write failure due to overflow, which
>>>> means the object will be repeated in the next bpf invocation
>>>> if the object collection stays the same. Note that if the object
>>>> collection is changed, depending on how collection traversal is
>>>> done, the object may not be visited even if it is still in the
>>>> collection.
>>>>
>>>> bpf_seq_printf may return -EBUSY meaning that internal percpu
>>>> buffer for memory copy of strings or other pointees is
>>>> not available. Bpf program can return 1 to indicate it
>>>> wants the same object to be repeated. Right now, this should not
>>>> happen on no-RT kernels since migrate_disable(), which guards
>>>> bpf prog call, calls preempt_disable().
>>>>
>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>> ---
>>>>   include/uapi/linux/bpf.h       |  32 +++++-
>>>>   kernel/trace/bpf_trace.c       | 200 
>>>> +++++++++++++++++++++++++++++++++
>>>>   scripts/bpf_helpers_doc.py     |   2 +
>>>>   tools/include/uapi/linux/bpf.h |  32 +++++-
>>>>   4 files changed, 264 insertions(+), 2 deletions(-)
>>>>
>>>
>>> Was a bit surprised by the behavior on a failed memory read; I think it's
>>> important to emphasize and document this. But otherwise:
>>>
>>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>>>
>>> [...]
>>>
>>>> +               if (fmt[i] == 's') {
>>>> +                       /* try our best to copy */
>>>> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
>>>> +                               err = -E2BIG;
>>>> +                               goto out;
>>>> +                       }
>>>> +
>>>> +                       bufs->buf[memcpy_cnt][0] = 0;
>>>> +                       strncpy_from_unsafe(bufs->buf[memcpy_cnt],
>>>> +                                           (void *) (long) 
>>>> args[fmt_cnt],
>>>> +                                           MAX_SEQ_PRINTF_STR_LEN);
>>>
>>> So the behavior is that we try to read the string, but if it fails, we
>>> treat it as an empty string? That needs to be documented, IMHO. My
>>> expectation was that the entire printf would fail.
>>
>> Let me return a proper error. Currently, two possible errors may happen:
>>    - the user provides an invalid address; yes, an error should be returned
>>      and we should not do anything
>>    - the user provides a valid address, but a page fault is needed
>>      to read the content. With the current implementation,
>>      strncpy_from_unsafe will fail. A future sleepable
>>      bpf program will help for this case, so an error means a
>>      real address error.
> 
> It matches what bpf_trace_printk() is doing.
> I suggest deferring any improvements to later patches.
> Both should be consistent.

Sure. We can do that.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-09  1:36     ` Yonghong Song
@ 2020-05-12  3:15       ` Andrii Nakryiko
  2020-05-13 16:57         ` Yonghong Song
  0 siblings, 1 reply; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-12  3:15 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Fri, May 8, 2020 at 6:36 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/8/20 11:24 AM, Andrii Nakryiko wrote:
> > On Wed, May 6, 2020 at 10:41 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Given a bpf program, the steps to create an anonymous bpf iterator are:
> >>    - create a bpf_iter_link, which combines bpf program and the target.
> >>      In the future, there could be more information recorded in the link.
> >>      A link_fd will be returned to the user space.
> >>    - create an anonymous bpf iterator with the given link_fd.
> >>
> >> The bpf_iter_link can be pinned to bpffs mount file system to
> >> create a file based bpf iterator as well.
> >>
> >> The benefits of using bpf_iter_link:
> >>    - using bpf link simplifies design and implementation as bpf link
> >>      is used for other tracing bpf programs.
> >>    - for file based bpf iterator, bpf_iter_link provides a standard
> >>      way to replace underlying bpf programs.
> >>    - for both anonymous and file based iterators, bpf link query
> >>      capability can be leveraged.
> >>
> >> The patch added support for tracing/iter programs for BPF_LINK_CREATE.
> >> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
> >> querying. Currently, only prog_id is needed, so no additional
> >> in-kernel show_fdinfo() and fill_link_info() hooks are needed
> >> for the BPF_LINK_TYPE_ITER link.
> >>
> >> Acked-by: Andrii Nakryiko <andriin@fb.com>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >
> > still looks good, but I realized show_fdinfo and fill_link_info are
> > missing, see the request for a follow-up below :)
> >
> >
> >>   include/linux/bpf.h            |  1 +
> >>   include/linux/bpf_types.h      |  1 +
> >>   include/uapi/linux/bpf.h       |  1 +
> >>   kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
> >>   kernel/bpf/syscall.c           | 14 ++++++++
> >>   tools/include/uapi/linux/bpf.h |  1 +
> >>   6 files changed, 80 insertions(+)
> >>
> >
> > [...]
> >
> >> +static const struct bpf_link_ops bpf_iter_link_lops = {
> >> +       .release = bpf_iter_link_release,
> >> +       .dealloc = bpf_iter_link_dealloc,
> >> +};
> >
> > Link infra supports .show_fdinfo and .fill_link_info methods. There is
> > no need to block on this, but it would be great to implement them for
> > BPF_LINK_TYPE_ITER as well in the same release as a follow-up. Thanks!
>
> The reason I did not implement them is that we do not have additional
> information beyond prog_id to present. The prog_id itself gives all the
> information about this link. I looked at the tracing program

Not all of it, e.g., the bpf_iter target is invisible right now. It's good
to have this added in a follow-up, but it is certainly not a blocker.
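
For concreteness, one shape such a follow-up could take is sketched below.
It assumes struct bpf_iter_link keeps a pointer (here called tinfo) to the
bpf_iter_reg it was created against; that field name and layout are guesses
for illustration, not the actual follow-up patch.

static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
				      struct seq_file *seq)
{
	struct bpf_iter_link *iter_link =
		container_of(link, struct bpf_iter_link, link);

	/* expose the otherwise-invisible iterator target */
	seq_printf(seq, "target_name:\t%s\n", iter_link->tinfo->target);
}

static const struct bpf_link_ops bpf_iter_link_lops = {
	.release = bpf_iter_link_release,
	.dealloc = bpf_iter_link_dealloc,
	.show_fdinfo = bpf_iter_link_show_fdinfo,
};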


> show_fdinfo/fill_link_info, where the additional attach_type is printed.
> But the attach_type is obvious for BPF_LINK_TYPE_ITER, which does not
> need to print it.
>
> In the future, when we add more stuff to parameterize the bpf_iter, we
> will need to implement these two callbacks as well as bpftool support.

yep

>
> >
> >
> > [...]
> >


* Re: [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators
  2020-05-09  3:18     ` Yonghong Song
@ 2020-05-12  3:16       ` Andrii Nakryiko
  0 siblings, 0 replies; 45+ messages in thread
From: Andrii Nakryiko @ 2020-05-12  3:16 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team

On Fri, May 8, 2020 at 8:18 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/8/20 12:07 PM, Andrii Nakryiko wrote:
> > On Wed, May 6, 2020 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> The macro DEFINE_BPF_ITER_FUNC is implemented so a target
> >> can define an init function to capture the BTF type
> >> which represents the target.
> >>
> >> bpf_iter_meta is a structure holding metadata common
> >> to all targets in the bpf program.
> >>
> >> Additional marker functions are called before or after the
> >> bpf_seq_read() show()/next()/stop() callback functions
> >> to help calculate a precise seq_num and to decide whether to
> >> call the bpf_prog inside stop().
> >>
> >> Two functions, bpf_iter_get_info() and bpf_iter_run_prog(),
> >> are implemented so a target can get the needed information from
> >> the bpf_iter infrastructure and run the program.
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/linux/bpf.h   | 11 ++++++
> >>   kernel/bpf/bpf_iter.c | 86 ++++++++++++++++++++++++++++++++++++++++---
> >>   2 files changed, 92 insertions(+), 5 deletions(-)
> >>
> >
> > Looks good. I was worried about re-using the seq_num when an element is
> > skipped, but it can already happen that the same seq_num is associated
> > with different objects: overflow + retry returns a different object
> > (because iteration is not a snapshot, so the element could be gone on
> > retry). Both cases will have to be handled in about the same fashion,
> > so it's fine.
>
> This is what I thought as well.
>
> >
> > Hm... Could this be a problem for the start() implementation? E.g., if
> > the object is still there, but the iterator wants to skip it permanently.
> > Re-using the seq_num will mean that start() keeps trying to fetch the
> > same to-be-skipped element? Not sure, please think about it, but we can
> > fix it up later, if necessary.
>
> The seq_num is for the bpf_program context. It does not affect how start()
> behaves. start() MAY retry the same element over and over again
> if show() overflows or returns <0, in which case user space
> should check the returned error code to decide whether to retry or give up.

ah, right seq_num vs internal id, makes sense, never mind :)

>
> >
> > Acked-by: Andrii Nakryiko <andriin@fb.com>
> >
> > [...]
> >
> >> @@ -112,11 +143,16 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
> >>                          err = PTR_ERR(p);
> >>                          break;
> >>                  }
> >> +
> >> +               /* get a valid next object, increase seq_num */
> >
> > typo: get -> got
>
> Ack.
>
> >
> >> +               bpf_iter_inc_seq_num(seq);
> >> +
> >>                  if (seq->count >= size)
> >>                          break;
> >>
> >>                  err = seq->op->show(seq, p);
> >>                  if (err > 0) {
> >> +                       bpf_iter_dec_seq_num(seq);
> >>                          seq->count = offs;
> >>                  } else if (err < 0 || seq_has_overflowed(seq)) {
> >>                          seq->count = offs;
> >
> > [...]
> >
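
To make the seq_num bookkeeping above concrete, here is a sketch of what the
inc/dec helpers might do. It assumes the iterator keeps its seq_num in a
private structure wrapped around seq->private; the exact layout below is a
guess for illustration, not a quote from the patch.

struct bpf_iter_priv_data {
	u64 seq_num;
	/* ... other infrastructure fields ... */
	u8 target_private[] __aligned(8);
};

static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
	struct bpf_iter_priv_data *iter_priv;

	/* seq->private points at target_private inside the wrapper */
	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);
	iter_priv->seq_num++;
}

static void bpf_iter_dec_seq_num(struct seq_file *seq)
{
	struct bpf_iter_priv_data *iter_priv;

	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);
	iter_priv->seq_num--;
}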


* Re: [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE
  2020-05-12  3:15       ` Andrii Nakryiko
@ 2020-05-13 16:57         ` Yonghong Song
  0 siblings, 0 replies; 45+ messages in thread
From: Yonghong Song @ 2020-05-13 16:57 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Martin KaFai Lau, Networking,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team



On 5/11/20 8:15 PM, Andrii Nakryiko wrote:
> On Fri, May 8, 2020 at 6:36 PM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 5/8/20 11:24 AM, Andrii Nakryiko wrote:
>>> On Wed, May 6, 2020 at 10:41 PM Yonghong Song <yhs@fb.com> wrote:
>>>>
>>>> Given a bpf program, the steps to create an anonymous bpf iterator are:
>>>>     - create a bpf_iter_link, which combines the bpf program and the target.
>>>>       In the future, more information could be recorded in the link.
>>>>       A link_fd will be returned to user space.
>>>>     - create an anonymous bpf iterator with the given link_fd.
>>>>
>>>> The bpf_iter_link can also be pinned to the bpffs mount file system to
>>>> create a file based bpf iterator.
>>>>
>>>> The benefits of using bpf_iter_link:
>>>>     - using a bpf link simplifies design and implementation, as bpf links
>>>>       are already used for other tracing bpf programs.
>>>>     - for file based bpf iterators, bpf_iter_link provides a standard
>>>>       way to replace the underlying bpf program.
>>>>     - for both anonymous and file based iterators, the bpf link query
>>>>       capability can be leveraged.
>>>>
>>>> This patch adds support for tracing/iter programs in BPF_LINK_CREATE.
>>>> A new link type BPF_LINK_TYPE_ITER is added to facilitate link
>>>> querying. Currently, only the prog_id is needed, so no additional
>>>> in-kernel show_fdinfo() or fill_link_info() hook is needed for the
>>>> BPF_LINK_TYPE_ITER link.
>>>>
>>>> Acked-by: Andrii Nakryiko <andriin@fb.com>
>>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>>> ---
>>>
>>> still looks good, but I realized show_fdinfo and fill_link_info are
>>> missing, see the request for a follow-up below :)
>>>
>>>
>>>>    include/linux/bpf.h            |  1 +
>>>>    include/linux/bpf_types.h      |  1 +
>>>>    include/uapi/linux/bpf.h       |  1 +
>>>>    kernel/bpf/bpf_iter.c          | 62 ++++++++++++++++++++++++++++++++++
>>>>    kernel/bpf/syscall.c           | 14 ++++++++
>>>>    tools/include/uapi/linux/bpf.h |  1 +
>>>>    6 files changed, 80 insertions(+)
>>>>
>>>
>>> [...]
>>>
>>>> +static const struct bpf_link_ops bpf_iter_link_lops = {
>>>> +       .release = bpf_iter_link_release,
>>>> +       .dealloc = bpf_iter_link_dealloc,
>>>> +};
>>>
>>> Link infra supports .show_fdinfo and .fill_link_info methods. There is
>>> no need to block on this, but it would be great to implement them for
>>> BPF_LINK_TYPE_ITER as well in the same release as a follow-up. Thanks!
>>
>> The reason I did not implement them is that we do not have additional
>> information beyond prog_id to present. The prog_id itself gives all the
>> information about this link. I looked at the tracing program
> 
> Not all of it, e.g., the bpf_iter target is invisible right now. It's good
> to have this added in a follow-up, but it is certainly not a blocker.

Indeed, the bpf_iter target is not visible now. We can add it to the
program or to the link. Adding it to the link is a reasonable idea, as
the link itself is the concept that merges program and target.

I will have a follow-up patch for this later.
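
As a reminder of the flow the link enables, here is a hedged user-space
sketch of the two steps from the quoted commit message, using the raw bpf(2)
syscall. BPF_ITER_CREATE is added by a later patch in this series; error
handling and fd lifetime management are elided.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* given a loaded tracing/iter program fd, return an anonymous iterator fd */
int create_anon_iter(int prog_fd)
{
	union bpf_attr attr;
	int link_fd, iter_fd;

	/* step 1: create the bpf_iter link (program + target) */
	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_ITER;
	link_fd = sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));
	if (link_fd < 0)
		return link_fd;

	/* step 2: create an anonymous iterator from the link;
	 * read(iter_fd, ...) then drives the iteration.
	 */
	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	iter_fd = sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
	return iter_fd;
}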

> 
> 
>> show_fdinfo/fill_link_info, where the additional attach_type is printed.
>> But the attach_type is obvious for BPF_LINK_TYPE_ITER, which does not
>> need to print it.
>>
>> In the future, when we add more stuff to parameterize the bpf_iter, we
>> will need to implement these two callbacks as well as bpftool support.
> 
> yep
> 
>>
>>>
>>>
>>> [...]
>>>


end of thread

Thread overview: 45+ messages
2020-05-07  5:39 [PATCH bpf-next v3 00/21] bpf: implement bpf iterator for kernel data Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 01/21] bpf: implement an interface to register bpf_iter targets Yonghong Song
2020-05-08 18:18   ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 02/21] bpf: allow loading of a bpf_iter program Yonghong Song
2020-05-08 18:20   ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 03/21] bpf: support bpf tracing/iter programs for BPF_LINK_CREATE Yonghong Song
2020-05-08 18:24   ` Andrii Nakryiko
2020-05-09  1:36     ` Yonghong Song
2020-05-12  3:15       ` Andrii Nakryiko
2020-05-13 16:57         ` Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 04/21] bpf: support bpf tracing/iter programs for BPF_LINK_UPDATE Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 05/21] bpf: implement bpf_seq_read() for bpf iterator Yonghong Song
2020-05-08 18:52   ` Andrii Nakryiko
2020-05-09  1:41     ` Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 06/21] bpf: create anonymous " Yonghong Song
2020-05-08 18:57   ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 07/21] bpf: create file " Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 08/21] bpf: implement common macros/helpers for target iterators Yonghong Song
2020-05-08 19:07   ` Andrii Nakryiko
2020-05-09  3:18     ` Yonghong Song
2020-05-12  3:16       ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 09/21] bpf: add bpf_map iterator Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 10/21] net: bpf: add netlink and ipv6_route bpf_iter targets Yonghong Song
2020-05-08 19:17   ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 11/21] bpf: add task and task/file iterator targets Yonghong Song
2020-05-08 19:36   ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 12/21] bpf: add PTR_TO_BTF_ID_OR_NULL support Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 13/21] bpf: add bpf_seq_printf and bpf_seq_write helpers Yonghong Song
2020-05-08 19:44   ` Andrii Nakryiko
2020-05-09  4:18     ` Yonghong Song
2020-05-09  5:30       ` Alexei Starovoitov
2020-05-09  6:04         ` Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 14/21] bpf: handle spilled PTR_TO_BTF_ID properly when checking stack_boundary Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 15/21] bpf: support variable length array in tracing programs Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 16/21] tools/libbpf: add bpf_iter support Yonghong Song
2020-05-08 19:46   ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 17/21] tools/libpf: add offsetof/container_of macro in bpf_helpers.h Yonghong Song
2020-05-08 19:48   ` Andrii Nakryiko
2020-05-07  5:39 ` [PATCH bpf-next v3 18/21] tools/bpftool: add bpf_iter support for bptool Yonghong Song
2020-05-08 19:51   ` Andrii Nakryiko
2020-05-09  5:26     ` Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 19/21] tools/bpf: selftests: add iterator programs for ipv6_route and netlink Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 20/21] tools/bpf: selftests: add iter progs for bpf_map/task/task_file Yonghong Song
2020-05-07  5:39 ` [PATCH bpf-next v3 21/21] tools/bpf: selftests: add bpf_iter selftests Yonghong Song
2020-05-08 19:57   ` Andrii Nakryiko
