bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma
@ 2020-12-15 23:36 Song Liu
  2020-12-15 23:36 ` [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter Song Liu
                   ` (4 more replies)
  0 siblings, 5 replies; 36+ messages in thread
From: Song Liu @ 2020-12-15 23:36 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team, Song Liu

This set introduces bpf_iter for task_vma, which can be used to generate
information similar to /proc/pid/maps or /proc/pid/smaps. Patch 4/4 adds
an example that mimics /proc/pid/maps.

Changes v1 => v2:
  1. Small fixes in task_iter.c and the selftests. (Yonghong)

Song Liu (4):
  bpf: introduce task_vma bpf_iter
  bpf: allow bpf_d_path in sleepable bpf_iter program
  libbpf: introduce section "iter.s/" for sleepable bpf_iter program
  selftests/bpf: add test for bpf_iter_task_vma

 include/linux/bpf.h                           |   2 +-
 kernel/bpf/task_iter.c                        | 205 +++++++++++++++++-
 kernel/trace/bpf_trace.c                      |   5 +
 tools/lib/bpf/libbpf.c                        |   5 +
 .../selftests/bpf/prog_tests/bpf_iter.c       | 106 ++++++++-
 tools/testing/selftests/bpf/progs/bpf_iter.h  |   9 +
 .../selftests/bpf/progs/bpf_iter_task_vma.c   |  55 +++++
 7 files changed, 375 insertions(+), 12 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c

--
2.24.1

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-15 23:36 [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Song Liu
@ 2020-12-15 23:36 ` Song Liu
  2020-12-16 17:36   ` Yonghong Song
                     ` (2 more replies)
  2020-12-15 23:37 ` [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program Song Liu
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 36+ messages in thread
From: Song Liu @ 2020-12-15 23:36 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team, Song Liu

Introduce task_vma bpf_iter to print memory information of a process. It
can be used to print customized information similar to /proc/<pid>/maps.

task_vma iterator releases mmap_lock before calling the BPF program.
Therefore, we cannot pass vm_area_struct directly to the BPF program. A
new __vm_area_struct is introduced to keep key information of a vma. On
each iteration, task_vma gathers information in __vm_area_struct and
passes it to the BPF program.

If the vma maps to a file, task_vma also holds a reference to the file
while calling the BPF program.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/bpf.h    |   2 +-
 kernel/bpf/task_iter.c | 205 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 205 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 07cb5d15e7439..49dd1e29c8118 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1325,7 +1325,7 @@ enum bpf_iter_feature {
 	BPF_ITER_RESCHED	= BIT(0),
 };
 
-#define BPF_ITER_CTX_ARG_MAX 2
+#define BPF_ITER_CTX_ARG_MAX 3
 struct bpf_iter_reg {
 	const char *target;
 	bpf_iter_attach_target_t attach_target;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 0458a40edf10a..15a066b442f75 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -304,9 +304,183 @@ static const struct seq_operations task_file_seq_ops = {
 	.show	= task_file_seq_show,
 };
 
+/*
+ * Key information from vm_area_struct. We need this because we cannot
+ * assume the vm_area_struct is still valid after each iteration.
+ */
+struct __vm_area_struct {
+	__u64 start;
+	__u64 end;
+	__u64 flags;
+	__u64 pgoff;
+};
+
+struct bpf_iter_seq_task_vma_info {
+	/* The first field must be struct bpf_iter_seq_task_common.
+	 * this is assumed by {init, fini}_seq_pidns() callback functions.
+	 */
+	struct bpf_iter_seq_task_common common;
+	struct task_struct *task;
+	struct __vm_area_struct vma;
+	struct file *file;
+	u32 tid;
+};
+
+static struct __vm_area_struct *
+task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
+{
+	struct pid_namespace *ns = info->common.ns;
+	struct task_struct *curr_task;
+	struct vm_area_struct *vma;
+	u32 curr_tid = info->tid;
+	bool new_task = false;
+
+	/* If this function returns a non-NULL __vm_area_struct, it held
+	 * a reference to the task_struct. If info->file is non-NULL, it
+	 * also holds a reference to the file. If this function returns
+	 * NULL, it does not hold any reference.
+	 */
+again:
+	if (info->task) {
+		curr_task = info->task;
+	} else {
+		curr_task = task_seq_get_next(ns, &curr_tid, true);
+		if (!curr_task) {
+			info->task = NULL;
+			info->tid++;
+			return NULL;
+		}
+
+		if (curr_tid != info->tid) {
+			info->tid = curr_tid;
+			new_task = true;
+		}
+
+		if (!curr_task->mm)
+			goto next_task;
+		info->task = curr_task;
+	}
+
+	mmap_read_lock(curr_task->mm);
+	if (new_task) {
+		vma = curr_task->mm->mmap;
+	} else {
+		/* We drop the lock between each iteration, so it is
+		 * necessary to use find_vma() to find the next vma. This
+		 * is similar to the mechanism in show_smaps_rollup().
+		 */
+		vma = find_vma(curr_task->mm, info->vma.end - 1);
+		/* same vma as previous iteration, use vma->next */
+		if (vma && (vma->vm_start == info->vma.start))
+			vma = vma->vm_next;
+	}
+	if (!vma) {
+		mmap_read_unlock(curr_task->mm);
+		goto next_task;
+	}
+	info->task = curr_task;
+	info->vma.start = vma->vm_start;
+	info->vma.end = vma->vm_end;
+	info->vma.pgoff = vma->vm_pgoff;
+	info->vma.flags = vma->vm_flags;
+	if (vma->vm_file)
+		info->file = get_file(vma->vm_file);
+	mmap_read_unlock(curr_task->mm);
+	return &info->vma;
+
+next_task:
+	put_task_struct(curr_task);
+	info->task = NULL;
+	curr_tid = ++(info->tid);
+	goto again;
+}
+
+static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+	struct __vm_area_struct *vma;
+
+	vma = task_vma_seq_get_next(info);
+	if (vma && *pos == 0)
+		++*pos;
+
+	return vma;
+}
+
+static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+	++*pos;
+	if (info->file) {
+		fput(info->file);
+		info->file = NULL;
+	}
+	return task_vma_seq_get_next(info);
+}
+
+struct bpf_iter__task_vma {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+	__bpf_md_ptr(struct __vm_area_struct *, vma);
+	__bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
+		     struct task_struct *task, struct __vm_area_struct *vma,
+		     struct file *file)
+
+static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+	struct bpf_iter__task_vma ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.task = info->task;
+	ctx.vma = &info->vma;
+	ctx.file = info->file;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_vma_seq_show(struct seq_file *seq, void *v)
+{
+	return __task_vma_seq_show(seq, false);
+}
+
+static void task_vma_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+	if (!v) {
+		(void)__task_vma_seq_show(seq, true);
+	} else {
+		put_task_struct(info->task);
+		if (info->file) {
+			fput(info->file);
+			info->file = NULL;
+		}
+		info->task = NULL;
+	}
+}
+
+static const struct seq_operations task_vma_seq_ops = {
+	.start	= task_vma_seq_start,
+	.next	= task_vma_seq_next,
+	.stop	= task_vma_seq_stop,
+	.show	= task_vma_seq_show,
+};
+
 BTF_ID_LIST(btf_task_file_ids)
 BTF_ID(struct, task_struct)
 BTF_ID(struct, file)
+BTF_ID(struct, __vm_area_struct)
 
 static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_ops		= &task_seq_ops,
@@ -346,6 +520,28 @@ static struct bpf_iter_reg task_file_reg_info = {
 	.seq_info		= &task_file_seq_info,
 };
 
+static const struct bpf_iter_seq_info task_vma_seq_info = {
+	.seq_ops		= &task_vma_seq_ops,
+	.init_seq_private	= init_seq_pidns,
+	.fini_seq_private	= fini_seq_pidns,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
+};
+
+static struct bpf_iter_reg task_vma_reg_info = {
+	.target			= "task_vma",
+	.feature		= BPF_ITER_RESCHED,
+	.ctx_arg_info_size	= 3,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__task_vma, task),
+		  PTR_TO_BTF_ID_OR_NULL },
+		{ offsetof(struct bpf_iter__task_vma, vma),
+		  PTR_TO_BTF_ID_OR_NULL },
+		{ offsetof(struct bpf_iter__task_vma, file),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info		= &task_vma_seq_info,
+};
+
 static int __init task_iter_init(void)
 {
 	int ret;
@@ -357,6 +553,13 @@ static int __init task_iter_init(void)
 
 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
 	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
-	return bpf_iter_reg_target(&task_file_reg_info);
+	ret =  bpf_iter_reg_target(&task_file_reg_info);
+	if (ret)
+		return ret;
+
+	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
+	task_vma_reg_info.ctx_arg_info[2].btf_id = btf_task_file_ids[1];
+	return bpf_iter_reg_target(&task_vma_reg_info);
 }
 late_initcall(task_iter_init);
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program
  2020-12-15 23:36 [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Song Liu
  2020-12-15 23:36 ` [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter Song Liu
@ 2020-12-15 23:37 ` Song Liu
  2020-12-16 17:41   ` Yonghong Song
                     ` (2 more replies)
  2020-12-15 23:37 ` [PATCH v2 bpf-next 3/4] libbpf: introduce section "iter.s/" for " Song Liu
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 36+ messages in thread
From: Song Liu @ 2020-12-15 23:37 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team, Song Liu

task_file and task_vma iter programs have access to file->f_path. Enable
bpf_d_path to print paths of these files.

bpf_iter programs are generally called in sleepable context. However, it
is still necessary to differentiate sleepable and non-sleepable bpf_iter
programs: sleepable programs have access to bpf_d_path; non-sleepable
programs have access to bpf_spin_lock.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 kernel/trace/bpf_trace.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4be771df5549a..9e5f9b968355f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1191,6 +1191,11 @@ BTF_SET_END(btf_allowlist_d_path)
 
 static bool bpf_d_path_allowed(const struct bpf_prog *prog)
 {
+	if (prog->type == BPF_PROG_TYPE_TRACING &&
+	    prog->expected_attach_type == BPF_TRACE_ITER &&
+	    prog->aux->sleepable)
+		return true;
+
 	if (prog->type == BPF_PROG_TYPE_LSM)
 		return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
 
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v2 bpf-next 3/4] libbpf: introduce section "iter.s/" for sleepable bpf_iter program
  2020-12-15 23:36 [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Song Liu
  2020-12-15 23:36 ` [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter Song Liu
  2020-12-15 23:37 ` [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program Song Liu
@ 2020-12-15 23:37 ` Song Liu
  2020-12-16 17:42   ` Yonghong Song
  2020-12-15 23:37 ` [PATCH v2 bpf-next 4/4] selftests/bpf: add test for bpf_iter_task_vma Song Liu
  2020-12-16 17:00 ` [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Yonghong Song
  4 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2020-12-15 23:37 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team, Song Liu

Sleepable iterator programs have access to helper functions like bpf_d_path.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/lib/bpf/libbpf.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6ae748f6ea118..12ad4593a91cb 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8541,6 +8541,11 @@ static const struct bpf_sec_def section_defs[] = {
 		.expected_attach_type = BPF_TRACE_ITER,
 		.is_attach_btf = true,
 		.attach_fn = attach_iter),
+	SEC_DEF("iter.s/", TRACING,
+		.expected_attach_type = BPF_TRACE_ITER,
+		.is_attach_btf = true,
+		.is_sleepable = true,
+		.attach_fn = attach_iter),
 	BPF_EAPROG_SEC("xdp_devmap/",		BPF_PROG_TYPE_XDP,
 						BPF_XDP_DEVMAP),
 	BPF_EAPROG_SEC("xdp_cpumap/",		BPF_PROG_TYPE_XDP,
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v2 bpf-next 4/4] selftests/bpf: add test for bpf_iter_task_vma
  2020-12-15 23:36 [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Song Liu
                   ` (2 preceding siblings ...)
  2020-12-15 23:37 ` [PATCH v2 bpf-next 3/4] libbpf: introduce section "iter.s/" for " Song Liu
@ 2020-12-15 23:37 ` Song Liu
  2020-12-16 18:18   ` Yonghong Song
  2020-12-16 17:00 ` [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Yonghong Song
  4 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2020-12-15 23:37 UTC (permalink / raw)
  To: bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team, Song Liu

The test dumps information similar to /proc/pid/maps. The first line of
the output is compared against the /proc file to make sure they match.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 .../selftests/bpf/prog_tests/bpf_iter.c       | 106 ++++++++++++++++--
 tools/testing/selftests/bpf/progs/bpf_iter.h  |   9 ++
 .../selftests/bpf/progs/bpf_iter_task_vma.c   |  55 +++++++++
 3 files changed, 160 insertions(+), 10 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 0e586368948dd..7afd3abae1899 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -7,6 +7,7 @@
 #include "bpf_iter_task.skel.h"
 #include "bpf_iter_task_stack.skel.h"
 #include "bpf_iter_task_file.skel.h"
+#include "bpf_iter_task_vma.skel.h"
 #include "bpf_iter_task_btf.skel.h"
 #include "bpf_iter_tcp4.skel.h"
 #include "bpf_iter_tcp6.skel.h"
@@ -64,6 +65,22 @@ static void do_dummy_read(struct bpf_program *prog)
 	bpf_link__destroy(link);
 }
 
+static int read_fd_into_buffer(int fd, char *buf, int size)
+{
+	int bufleft = size;
+	int len;
+
+	do {
+		len = read(fd, buf, bufleft);
+		if (len > 0) {
+			buf += len;
+			bufleft -= len;
+		}
+	} while (len > 0);
+
+	return len;
+}
+
 static void test_ipv6_route(void)
 {
 	struct bpf_iter_ipv6_route *skel;
@@ -177,7 +194,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
 {
 	struct bpf_program *prog = skel->progs.dump_task_struct;
 	struct bpf_iter_task_btf__bss *bss = skel->bss;
-	int iter_fd = -1, len = 0, bufleft = TASKBUFSZ;
+	int iter_fd = -1, err;
 	struct bpf_link *link;
 	char *buf = taskbuf;
 	int ret = 0;
@@ -190,14 +207,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
 	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
 		goto free_link;
 
-	do {
-		len = read(iter_fd, buf, bufleft);
-		if (len > 0) {
-			buf += len;
-			bufleft -= len;
-		}
-	} while (len > 0);
-
+	err = read_fd_into_buffer(iter_fd, buf, TASKBUFSZ);
 	if (bss->skip) {
 		printf("%s:SKIP:no __builtin_btf_type_id\n", __func__);
 		ret = 1;
@@ -205,7 +215,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
 		goto free_link;
 	}
 
-	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+	if (CHECK(err < 0, "read", "read failed: %s\n", strerror(errno)))
 		goto free_link;
 
 	CHECK(strstr(taskbuf, "(struct task_struct)") == NULL,
@@ -1133,6 +1143,80 @@ static void test_buf_neg_offset(void)
 		bpf_iter_test_kern6__destroy(skel);
 }
 
+#define CMP_BUFFER_SIZE 1024
+char task_vma_output[CMP_BUFFER_SIZE];
+char proc_maps_output[CMP_BUFFER_SIZE];
+
+/* remove ' ' and '\t' from str, and only keep the first line */
+static void str_strip_first_line(char *str)
+{
+	char *dst = str, *src = str;
+
+	do {
+		if (*src == ' ' || *src == '\t')
+			src++;
+		else
+			*(dst++) = *(src++);
+
+	} while (*src != '\0' && *src != '\n');
+
+	*dst = '\0';
+}
+
+static void test_task_vma(void)
+{
+	int err, iter_fd = -1, proc_maps_fd = -1;
+	struct bpf_iter_task_vma *skel;
+	char maps_path[64];
+
+	skel = bpf_iter_task_vma__open();
+	if (CHECK(!skel, "bpf_iter_task_vma__open", "skeleton open failed\n"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = bpf_iter_task_vma__load(skel);
+	if (CHECK(err, "bpf_iter_task_vma__load", "skeleton load failed\n"))
+		goto out;
+
+	do_dummy_read(skel->progs.proc_maps);
+
+	skel->links.proc_maps = bpf_program__attach_iter(
+		skel->progs.proc_maps, NULL);
+
+	if (CHECK(IS_ERR(skel->links.proc_maps), "bpf_program__attach_iter",
+		  "attach iterator failed\n"))
+		goto out;
+
+	/* read 1kB from bpf_iter */
+	iter_fd = bpf_iter_create(bpf_link__fd(skel->links.proc_maps));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto out;
+	err = read_fd_into_buffer(iter_fd, task_vma_output, CMP_BUFFER_SIZE);
+	if (CHECK(err < 0, "read_iter_fd", "read_iter_fd failed\n"))
+		goto out;
+
+	/* read 1kB from /proc/pid/maps */
+	snprintf(maps_path, 64, "/proc/%u/maps", skel->bss->pid);
+	proc_maps_fd = open(maps_path, O_RDONLY);
+	if (CHECK(proc_maps_fd < 0, "open_proc_maps", "open_proc_maps failed\n"))
+		goto out;
+	err = read_fd_into_buffer(proc_maps_fd, proc_maps_output, CMP_BUFFER_SIZE);
+	if (CHECK(err < 0, "read_prog_maps_fd", "read_prog_maps_fd failed\n"))
+		goto out;
+
+	/* strip and compare the first line of the two files */
+	str_strip_first_line(task_vma_output);
+	str_strip_first_line(proc_maps_output);
+
+	CHECK(strcmp(task_vma_output, proc_maps_output), "compare_output",
+	      "found mismatch\n");
+out:
+	close(proc_maps_fd);
+	close(iter_fd);
+	bpf_iter_task_vma__destroy(skel);
+}
+
 void test_bpf_iter(void)
 {
 	if (test__start_subtest("btf_id_or_null"))
@@ -1149,6 +1233,8 @@ void test_bpf_iter(void)
 		test_task_stack();
 	if (test__start_subtest("task_file"))
 		test_task_file();
+	if (test__start_subtest("task_vma"))
+		test_task_vma();
 	if (test__start_subtest("task_btf"))
 		test_task_btf();
 	if (test__start_subtest("tcp4"))
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h
index 6a1255465fd6d..4dab0869c4fcb 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter.h
+++ b/tools/testing/selftests/bpf/progs/bpf_iter.h
@@ -7,6 +7,7 @@
 #define bpf_iter__netlink bpf_iter__netlink___not_used
 #define bpf_iter__task bpf_iter__task___not_used
 #define bpf_iter__task_file bpf_iter__task_file___not_used
+#define bpf_iter__task_vma bpf_iter__task_vma___not_used
 #define bpf_iter__tcp bpf_iter__tcp___not_used
 #define tcp6_sock tcp6_sock___not_used
 #define bpf_iter__udp bpf_iter__udp___not_used
@@ -26,6 +27,7 @@
 #undef bpf_iter__netlink
 #undef bpf_iter__task
 #undef bpf_iter__task_file
+#undef bpf_iter__task_vma
 #undef bpf_iter__tcp
 #undef tcp6_sock
 #undef bpf_iter__udp
@@ -67,6 +69,13 @@ struct bpf_iter__task_file {
 	struct file *file;
 } __attribute__((preserve_access_index));
 
+struct bpf_iter__task_vma {
+	struct bpf_iter_meta *meta;
+	struct task_struct *task;
+	struct __vm_area_struct *vma;
+	struct file *file;
+} __attribute__((preserve_access_index));
+
 struct bpf_iter__bpf_map {
 	struct bpf_iter_meta *meta;
 	struct bpf_map *map;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
new file mode 100644
index 0000000000000..ba87afe01024c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Copied from mm.h */
+#define VM_READ		0x00000001
+#define VM_WRITE	0x00000002
+#define VM_EXEC		0x00000004
+#define VM_MAYSHARE	0x00000080
+
+/* Copied from kdev_t.h */
+#define MINORBITS	20
+#define MINORMASK	((1U << MINORBITS) - 1)
+#define MAJOR(dev)	((unsigned int) ((dev) >> MINORBITS))
+#define MINOR(dev)	((unsigned int) ((dev) & MINORMASK))
+
+#define D_PATH_BUF_SIZE 1024
+char d_path_buf[D_PATH_BUF_SIZE];
+__u32 pid;
+
+SEC("iter.s/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
+{
+	struct __vm_area_struct *vma = ctx->vma;
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	struct file *file = ctx->file;
+	char perm_str[] = "----";
+
+	if (task == (void *)0 || vma == (void *)0 || task->pid != pid)
+		return 0;
+
+	perm_str[0] = (vma->flags & VM_READ) ? 'r' : '-';
+	perm_str[1] = (vma->flags & VM_WRITE) ? 'w' : '-';
+	perm_str[2] = (vma->flags & VM_EXEC) ? 'x' : '-';
+	perm_str[3] = (vma->flags & VM_MAYSHARE) ? 's' : 'p';
+	BPF_SEQ_PRINTF(seq, "%08llx-%08llx %s ", vma->start, vma->end, perm_str);
+
+	if (file) {
+		__u32 dev = file->f_inode->i_sb->s_dev;
+
+		bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE);
+
+		BPF_SEQ_PRINTF(seq, "%08llx ", vma->pgoff << 12);
+		BPF_SEQ_PRINTF(seq, "%02x:%02x %u", MAJOR(dev), MINOR(dev),
+			       file->f_inode->i_ino);
+		BPF_SEQ_PRINTF(seq, "\t%s\n", d_path_buf);
+	} else {
+		BPF_SEQ_PRINTF(seq, "%08llx 00:00 0\n", 0ULL);
+	}
+	return 0;
+}
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma
  2020-12-15 23:36 [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Song Liu
                   ` (3 preceding siblings ...)
  2020-12-15 23:37 ` [PATCH v2 bpf-next 4/4] selftests/bpf: add test for bpf_iter_task_vma Song Liu
@ 2020-12-16 17:00 ` Yonghong Song
  2020-12-16 17:35   ` Song Liu
  4 siblings, 1 reply; 36+ messages in thread
From: Yonghong Song @ 2020-12-16 17:00 UTC (permalink / raw)
  To: Song Liu, bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team



On 12/15/20 3:36 PM, Song Liu wrote:
> This set introduces bpf_iter for task_vma, which can be used to generate
> information similar to /proc/pid/maps or /proc/pid/smaps. Patch 4/4 adds

I did not see an example for /proc/pid/smaps. It would be good if you 
can cover smaps as well since it is used by a lot of people.

> an example that mimics /proc/pid/maps.
> 
> Changes v1 => v2:
>    1. Small fixes in task_iter.c and the selftests. (Yonghong)
> 
> Song Liu (4):
>    bpf: introduce task_vma bpf_iter
>    bpf: allow bpf_d_path in sleepable bpf_iter program
>    libbpf: introduce section "iter.s/" for sleepable bpf_iter program
>    selftests/bpf: add test for bpf_iter_task_vma
> 
>   include/linux/bpf.h                           |   2 +-
>   kernel/bpf/task_iter.c                        | 205 +++++++++++++++++-
>   kernel/trace/bpf_trace.c                      |   5 +
>   tools/lib/bpf/libbpf.c                        |   5 +
>   .../selftests/bpf/prog_tests/bpf_iter.c       | 106 ++++++++-
>   tools/testing/selftests/bpf/progs/bpf_iter.h  |   9 +
>   .../selftests/bpf/progs/bpf_iter_task_vma.c   |  55 +++++
>   7 files changed, 375 insertions(+), 12 deletions(-)
>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
> 
> --
> 2.24.1
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma
  2020-12-16 17:00 ` [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Yonghong Song
@ 2020-12-16 17:35   ` Song Liu
  2020-12-16 18:31     ` Yonghong Song
  0 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2020-12-16 17:35 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team



> On Dec 16, 2020, at 9:00 AM, Yonghong Song <yhs@fb.com> wrote:
> 
> 
> 
> On 12/15/20 3:36 PM, Song Liu wrote:
>> This set introduces bpf_iter for task_vma, which can be used to generate
>> information similar to /proc/pid/maps or /proc/pid/smaps. Patch 4/4 adds
> 
> I did not see an example for /proc/pid/smaps. It would be good if you can cover smaps as well since it is used by a lot of people.

smaps is tricky, as it contains a lot of information, and some of this information
requires architecture- and configuration-specific logic, e.g., page table structure. 
To really mimic smaps, we will probably need a helper for smap_gather_stats().
However, I don't think that's really necessary. I think task_vma iterator is most
useful in gathering information that are not presented in smaps. For example, if a
vma covers mixed 2MB pages and 4kB pages, smaps won't show which address ranges are
backed by 2MB pages. 

I have a test BPF program that parses 4-level x86 page table for huge pages. Since
we need bounded-loop to parse the page table, the program won't work well for too
big vma. We can probably add this program to samples/bpf/, but I think it is not
a good fit for selftests. 

Thanks,
Song 


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-15 23:36 ` [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter Song Liu
@ 2020-12-16 17:36   ` Yonghong Song
  2020-12-16 19:41     ` Song Liu
  2020-12-17  0:34   ` Andrii Nakryiko
  2020-12-17 19:03   ` Alexei Starovoitov
  2 siblings, 1 reply; 36+ messages in thread
From: Yonghong Song @ 2020-12-16 17:36 UTC (permalink / raw)
  To: Song Liu, bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team



On 12/15/20 3:36 PM, Song Liu wrote:
> Introduce task_vma bpf_iter to print memory information of a process. It
> can be used to print customized information similar to /proc/<pid>/maps.
> 
> task_vma iterator releases mmap_lock before calling the BPF program.
> Therefore, we cannot pass vm_area_struct directly to the BPF program. A
> new __vm_area_struct is introduced to keep key information of a vma. On
> each iteration, task_vma gathers information in __vm_area_struct and
> passes it to the BPF program.
> 
> If the vma maps to a file, task_vma also holds a reference to the file
> while calling the BPF program.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   include/linux/bpf.h    |   2 +-
>   kernel/bpf/task_iter.c | 205 ++++++++++++++++++++++++++++++++++++++++-
>   2 files changed, 205 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 07cb5d15e7439..49dd1e29c8118 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1325,7 +1325,7 @@ enum bpf_iter_feature {
>   	BPF_ITER_RESCHED	= BIT(0),
>   };
>   
> -#define BPF_ITER_CTX_ARG_MAX 2
> +#define BPF_ITER_CTX_ARG_MAX 3
>   struct bpf_iter_reg {
>   	const char *target;
>   	bpf_iter_attach_target_t attach_target;
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 0458a40edf10a..15a066b442f75 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -304,9 +304,183 @@ static const struct seq_operations task_file_seq_ops = {
>   	.show	= task_file_seq_show,
>   };
>   
> +/*
> + * Key information from vm_area_struct. We need this because we cannot
> + * assume the vm_area_struct is still valid after each iteration.
> + */
> +struct __vm_area_struct {
> +	__u64 start;
> +	__u64 end;
> +	__u64 flags;
> +	__u64 pgoff;
> +};
> +
> +struct bpf_iter_seq_task_vma_info {
> +	/* The first field must be struct bpf_iter_seq_task_common.
> +	 * this is assumed by {init, fini}_seq_pidns() callback functions.
> +	 */
> +	struct bpf_iter_seq_task_common common;
> +	struct task_struct *task;
> +	struct __vm_area_struct vma;
> +	struct file *file;
> +	u32 tid;
> +};
> +
> +static struct __vm_area_struct *
> +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
> +{
> +	struct pid_namespace *ns = info->common.ns;
> +	struct task_struct *curr_task;
> +	struct vm_area_struct *vma;
> +	u32 curr_tid = info->tid;
> +	bool new_task = false;
> +
> +	/* If this function returns a non-NULL __vm_area_struct, it held
> +	 * a reference to the task_struct. If info->file is non-NULL, it
> +	 * also holds a reference to the file. If this function returns
> +	 * NULL, it does not hold any reference.
> +	 */
> +again:
> +	if (info->task) {
> +		curr_task = info->task;
> +	} else {
> +		curr_task = task_seq_get_next(ns, &curr_tid, true);
> +		if (!curr_task) {
> +			info->task = NULL;
> +			info->tid++;

Here, info->tid should be info->tid = curr_tid + 1.
For exmaple, suppose initial curr_tid = info->tid = 10, and the
above task_seq_get_next(...) returns NULL with curr_tid = 100
which means tid = 100 has been visited. So we would like
to set info->tid = 101 to avoid future potential redundant work.
Returning NULL here will signal end of iteration but user
space can still call read()...

> +			return NULL;
> +		}
> +
> +		if (curr_tid != info->tid) {
> +			info->tid = curr_tid;
> +			new_task = true;
> +		}
> +
> +		if (!curr_task->mm)
> +			goto next_task;
> +		info->task = curr_task;
> +	}
> +
> +	mmap_read_lock(curr_task->mm);
> +	if (new_task) {
> +		vma = curr_task->mm->mmap;
> +	} else {
> +		/* We drop the lock between each iteration, so it is
> +		 * necessary to use find_vma() to find the next vma. This
> +		 * is similar to the mechanism in show_smaps_rollup().
> +		 */
> +		vma = find_vma(curr_task->mm, info->vma.end - 1);
> +		/* same vma as previous iteration, use vma->next */
> +		if (vma && (vma->vm_start == info->vma.start))
> +			vma = vma->vm_next;

We may have some issues here if control is returned to user space
in the middle of iterations. For example,
    - seq_ops->next() sets info->vma properly (say corresponds to vma1 
of tid1)
    - control returns to user space
    - control backs to kernel and this is not a new task since
      tid is the same
    - but we skipped this vma for show().

I think the above skipping should be guarded. If the function
is called from seq_ops->next(), yes it can be skipped.
If the function is called from seq_ops->start(), it should not
be skipped.

Could you double check such a scenario with a smaller buffer
size for read() in user space?

> +	}
> +	if (!vma) {
> +		mmap_read_unlock(curr_task->mm);
> +		goto next_task;
> +	}
> +	info->task = curr_task;
> +	info->vma.start = vma->vm_start;
> +	info->vma.end = vma->vm_end;
> +	info->vma.pgoff = vma->vm_pgoff;
> +	info->vma.flags = vma->vm_flags;
> +	if (vma->vm_file)
> +		info->file = get_file(vma->vm_file);
> +	mmap_read_unlock(curr_task->mm);
> +	return &info->vma;
> +
> +next_task:
> +	put_task_struct(curr_task);
> +	info->task = NULL;
> +	curr_tid = ++(info->tid);
> +	goto again;
> +}
> +
> +static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct bpf_iter_seq_task_vma_info *info = seq->private;
> +	struct __vm_area_struct *vma;
> +
> +	vma = task_vma_seq_get_next(info);
> +	if (vma && *pos == 0)
> +		++*pos;
> +
> +	return vma;
> +}
> +
> +static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_iter_seq_task_vma_info *info = seq->private;
> +
> +	++*pos;
> +	if (info->file) {
> +		fput(info->file);
> +		info->file = NULL;
> +	}
> +	return task_vma_seq_get_next(info);
> +}
> +
> +struct bpf_iter__task_vma {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct task_struct *, task);
> +	__bpf_md_ptr(struct __vm_area_struct *, vma);
> +	__bpf_md_ptr(struct file *, file);
> +};
> +
> +DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
> +		     struct task_struct *task, struct __vm_area_struct *vma,
> +		     struct file *file)
> +
> +static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
> +{
> +	struct bpf_iter_seq_task_vma_info *info = seq->private;
> +	struct bpf_iter__task_vma ctx;
> +	struct bpf_iter_meta meta;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (!prog)
> +		return 0;
> +
> +	ctx.meta = &meta;
> +	ctx.task = info->task;
> +	ctx.vma = &info->vma;
> +	ctx.file = info->file;
> +	return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int task_vma_seq_show(struct seq_file *seq, void *v)
> +{
> +	return __task_vma_seq_show(seq, false);
> +}
> +
> +static void task_vma_seq_stop(struct seq_file *seq, void *v)
> +{
> +	struct bpf_iter_seq_task_vma_info *info = seq->private;
> +
> +	if (!v) {
> +		(void)__task_vma_seq_show(seq, true);
> +	} else {
> +		put_task_struct(info->task);
> +		if (info->file) {
> +			fput(info->file);
> +			info->file = NULL;
> +		}
> +		info->task = NULL;
> +	}
> +}
> +
[...]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program
  2020-12-15 23:37 ` [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program Song Liu
@ 2020-12-16 17:41   ` Yonghong Song
  2020-12-16 18:15   ` KP Singh
  2021-01-25 12:52   ` Jiri Olsa
  2 siblings, 0 replies; 36+ messages in thread
From: Yonghong Song @ 2020-12-16 17:41 UTC (permalink / raw)
  To: Song Liu, bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team



On 12/15/20 3:37 PM, Song Liu wrote:
> task_file and task_vma iter programs have access to file->f_path. Enable
> bpf_d_path to print paths of these file.
> 
> bpf_iter programs are generally called in sleepable context. However, it
> is still necessary to differentiate sleepable and non-sleepable bpf_iter
> programs: sleepable programs have access to bpf_d_path; non-sleepable
> programs have access to bpf_spin_lock.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>

Agreed. So far bpf_iter programs all called from process context.

Acked-by: Yonghong Song <yhs@fb.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 3/4] libbpf: introduce section "iter.s/" for sleepable bpf_iter program
  2020-12-15 23:37 ` [PATCH v2 bpf-next 3/4] libbpf: introduce section "iter.s/" for " Song Liu
@ 2020-12-16 17:42   ` Yonghong Song
  2020-12-16 18:00     ` KP Singh
  0 siblings, 1 reply; 36+ messages in thread
From: Yonghong Song @ 2020-12-16 17:42 UTC (permalink / raw)
  To: Song Liu, bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team



On 12/15/20 3:37 PM, Song Liu wrote:
> Sleepable iterator programs have access to helper functions like bpf_d_path.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>

Acked-by: Yonghong Song <yhs@fb.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 3/4] libbpf: introduce section "iter.s/" for sleepable bpf_iter program
  2020-12-16 17:42   ` Yonghong Song
@ 2020-12-16 18:00     ` KP Singh
  0 siblings, 0 replies; 36+ messages in thread
From: KP Singh @ 2020-12-16 18:00 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Song Liu, bpf, Networking, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, KP Singh, kernel-team

On Wed, Dec 16, 2020 at 6:44 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 12/15/20 3:37 PM, Song Liu wrote:
> > Sleepable iterator program have access to helper functions like bpf_d_path.
> >
> > Signed-off-by: Song Liu <songliubraving@fb.com>
>
> Acked-by: Yonghong Song <yhs@fb.com>

Acked-by: KP Singh <kpsingh@kernel.org>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program
  2020-12-15 23:37 ` [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program Song Liu
  2020-12-16 17:41   ` Yonghong Song
@ 2020-12-16 18:15   ` KP Singh
  2020-12-16 18:31     ` KP Singh
  2021-01-25 12:52   ` Jiri Olsa
  2 siblings, 1 reply; 36+ messages in thread
From: KP Singh @ 2020-12-16 18:15 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Networking, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, KP Singh, kernel-team

On Wed, Dec 16, 2020 at 1:06 AM Song Liu <songliubraving@fb.com> wrote:
>
> task_file and task_vma iter programs have access to file->f_path. Enable
> bpf_d_path to print paths of these file.
>
> bpf_iter programs are generally called in sleepable context. However, it
> is still necessary to differentiate sleepable and non-sleepable bpf_iter
> programs: sleepable programs have access to bpf_d_path; non-sleepable
> programs have access to bpf_spin_lock.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  kernel/trace/bpf_trace.c | 5 +++++
>  1 file changed, 5 insertions(+)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 4be771df5549a..9e5f9b968355f 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1191,6 +1191,11 @@ BTF_SET_END(btf_allowlist_d_path)
>
>  static bool bpf_d_path_allowed(const struct bpf_prog *prog)
>  {
> +       if (prog->type == BPF_PROG_TYPE_TRACING &&
> +           prog->expected_attach_type == BPF_TRACE_ITER &&
> +           prog->aux->sleepable)
> +               return true;

For the sleepable/non-sleepable we have been (until now) checking
this in bpf_tracing_func_proto (or bpf_lsm_func_proto)

eg.

case BPF_FUNC_copy_from_user:
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;

But even beyond that, I don't think this is needed.

We have originally exposed the helper to both sleepable and
non-sleepable LSM and tracing programs with an allow list.

For LSM the allow list is bpf_lsm_is_sleepable_hook, but
that's just an initial allow list and thus causes some confusion
w.r.t. sleepability (maybe we should add a comment there).

Based on the current logic, my understanding is that
it's okay to use the helper in the allowed hooks in both
"lsm.s/" and "lsm/" (and the same for
BPF_PROG_TYPE_TRACING).

We would have required sleepable only if this helper called "dput"
(which can sleep).

> +
>         if (prog->type == BPF_PROG_TYPE_LSM)
>                 return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
>
> --
> 2.24.1
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 4/4] selftests/bpf: add test for bpf_iter_task_vma
  2020-12-15 23:37 ` [PATCH v2 bpf-next 4/4] selftests/bpf: add test for bpf_iter_task_vma Song Liu
@ 2020-12-16 18:18   ` Yonghong Song
  2020-12-16 23:23     ` Song Liu
  0 siblings, 1 reply; 36+ messages in thread
From: Yonghong Song @ 2020-12-16 18:18 UTC (permalink / raw)
  To: Song Liu, bpf, netdev
  Cc: ast, daniel, andrii, john.fastabend, kpsingh, kernel-team



On 12/15/20 3:37 PM, Song Liu wrote:
> The test dumps information similar to /proc/pid/maps. The first line of
> the output is compared against the /proc file to make sure they match.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   .../selftests/bpf/prog_tests/bpf_iter.c       | 106 ++++++++++++++++--
>   tools/testing/selftests/bpf/progs/bpf_iter.h  |   9 ++
>   .../selftests/bpf/progs/bpf_iter_task_vma.c   |  55 +++++++++
>   3 files changed, 160 insertions(+), 10 deletions(-)
>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> index 0e586368948dd..7afd3abae1899 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> @@ -7,6 +7,7 @@
>   #include "bpf_iter_task.skel.h"
>   #include "bpf_iter_task_stack.skel.h"
>   #include "bpf_iter_task_file.skel.h"
> +#include "bpf_iter_task_vma.skel.h"
>   #include "bpf_iter_task_btf.skel.h"
>   #include "bpf_iter_tcp4.skel.h"
>   #include "bpf_iter_tcp6.skel.h"
> @@ -64,6 +65,22 @@ static void do_dummy_read(struct bpf_program *prog)
>   	bpf_link__destroy(link);
>   }
>   
> +static int read_fd_into_buffer(int fd, char *buf, int size)
> +{
> +	int bufleft = size;
> +	int len;
> +
> +	do {
> +		len = read(fd, buf, bufleft);
> +		if (len > 0) {
> +			buf += len;
> +			bufleft -= len;
> +		}
> +	} while (len > 0);
> +
> +	return len;
> +}
> +
>   static void test_ipv6_route(void)
>   {
>   	struct bpf_iter_ipv6_route *skel;
> @@ -177,7 +194,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
>   {
>   	struct bpf_program *prog = skel->progs.dump_task_struct;
>   	struct bpf_iter_task_btf__bss *bss = skel->bss;
> -	int iter_fd = -1, len = 0, bufleft = TASKBUFSZ;
> +	int iter_fd = -1, err;
>   	struct bpf_link *link;
>   	char *buf = taskbuf;
>   	int ret = 0;
> @@ -190,14 +207,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
>   	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
>   		goto free_link;
>   
> -	do {
> -		len = read(iter_fd, buf, bufleft);
> -		if (len > 0) {
> -			buf += len;
> -			bufleft -= len;
> -		}
> -	} while (len > 0);
> -
> +	err = read_fd_into_buffer(iter_fd, buf, TASKBUFSZ);
>   	if (bss->skip) {
>   		printf("%s:SKIP:no __builtin_btf_type_id\n", __func__);
>   		ret = 1;
> @@ -205,7 +215,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
>   		goto free_link;
>   	}
>   
> -	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
> +	if (CHECK(err < 0, "read", "read failed: %s\n", strerror(errno)))
>   		goto free_link;
>   
>   	CHECK(strstr(taskbuf, "(struct task_struct)") == NULL,
> @@ -1133,6 +1143,80 @@ static void test_buf_neg_offset(void)
>   		bpf_iter_test_kern6__destroy(skel);
>   }
>   
> +#define CMP_BUFFER_SIZE 1024
> +char task_vma_output[CMP_BUFFER_SIZE];
> +char proc_maps_output[CMP_BUFFER_SIZE];

The above two as 'static' variables to avoid potential conflicts
with other selftests?

> +
> +/* remove \0 and \t from str, and only keep the first line */
> +static void str_strip_first_line(char *str)
> +{
> +	char *dst = str, *src = str;
> +
> +	do {
> +		if (*src == ' ' || *src == '\t')
> +			src++;
> +		else
> +			*(dst++) = *(src++);
> +
> +	} while (*src != '\0' && *src != '\n');
> +
> +	*dst = '\0';
> +}
> +
> +static void test_task_vma(void)
> +{
> +	int err, iter_fd = -1, proc_maps_fd = -1;
> +	struct bpf_iter_task_vma *skel;
> +	char maps_path[64];
> +
> +	skel = bpf_iter_task_vma__open();
> +	if (CHECK(!skel, "bpf_iter_task_vma__open", "skeleton open failed\n"))
> +		return;
> +
> +	skel->bss->pid = getpid();
> +
> +	err = bpf_iter_task_vma__load(skel);
> +	if (CHECK(err, "bpf_iter_task_vma__load", "skeleton load failed\n"))
> +		goto out;
> +
> +	do_dummy_read(skel->progs.proc_maps);

This do_dummy_read() is not needed, right?

> +
> +	skel->links.proc_maps = bpf_program__attach_iter(
> +		skel->progs.proc_maps, NULL);
> +
> +	if (CHECK(IS_ERR(skel->links.proc_maps), "bpf_program__attach_iter",
> +		  "attach iterator failed\n"))
> +		goto out;
> +
> +	/* read 1kB from bpf_iter */

Maybe 1kB => CMP_BUFFER_SIZE(1kB)?
so future if people change CMP_BUFFER_SIZE value, then can find here
and adjust properly.

> +	iter_fd = bpf_iter_create(bpf_link__fd(skel->links.proc_maps));
> +	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
> +		goto out;
> +	err = read_fd_into_buffer(iter_fd, task_vma_output, CMP_BUFFER_SIZE);
> +	if (CHECK(err < 0, "read_iter_fd", "read_iter_fd failed\n"))
> +		goto out;
> +
> +	/* read 1kB from /proc/pid/maps */
> +	snprintf(maps_path, 64, "/proc/%u/maps", skel->bss->pid);
> +	proc_maps_fd = open(maps_path, O_RDONLY);
> +	if (CHECK(proc_maps_fd < 0, "open_proc_maps", "open_proc_maps failed\n"))
> +		goto out;
> +	err = read_fd_into_buffer(proc_maps_fd, proc_maps_output, CMP_BUFFER_SIZE);
> +	if (CHECK(err < 0, "read_prog_maps_fd", "read_prog_maps_fd failed\n"))
> +		goto out;
> +
> +	/* strip and compare the first line of the two files */
> +	str_strip_first_line(task_vma_output);
> +	str_strip_first_line(proc_maps_output);
> +
> +	CHECK(strcmp(task_vma_output, proc_maps_output), "compare_output",
> +	      "found mismatch\n");
> +out:
> +	close(proc_maps_fd);
> +	close(iter_fd);
> +	bpf_iter_task_vma__destroy(skel);
> +}
> +
>   void test_bpf_iter(void)
>   {
>   	if (test__start_subtest("btf_id_or_null"))
> @@ -1149,6 +1233,8 @@ void test_bpf_iter(void)
>   		test_task_stack();
>   	if (test__start_subtest("task_file"))
>   		test_task_file();
> +	if (test__start_subtest("task_vma"))
> +		test_task_vma();
>   	if (test__start_subtest("task_btf"))
>   		test_task_btf();
>   	if (test__start_subtest("tcp4"))
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h
> index 6a1255465fd6d..4dab0869c4fcb 100644
> --- a/tools/testing/selftests/bpf/progs/bpf_iter.h
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h
> @@ -7,6 +7,7 @@
>   #define bpf_iter__netlink bpf_iter__netlink___not_used
>   #define bpf_iter__task bpf_iter__task___not_used
>   #define bpf_iter__task_file bpf_iter__task_file___not_used
> +#define bpf_iter__task_vma bpf_iter__task_vma___not_used
>   #define bpf_iter__tcp bpf_iter__tcp___not_used
>   #define tcp6_sock tcp6_sock___not_used
>   #define bpf_iter__udp bpf_iter__udp___not_used
> @@ -26,6 +27,7 @@
>   #undef bpf_iter__netlink
>   #undef bpf_iter__task
>   #undef bpf_iter__task_file
> +#undef bpf_iter__task_vma
>   #undef bpf_iter__tcp
>   #undef tcp6_sock
>   #undef bpf_iter__udp
> @@ -67,6 +69,13 @@ struct bpf_iter__task_file {
>   	struct file *file;
>   } __attribute__((preserve_access_index));
>   
> +struct bpf_iter__task_vma {
> +	struct bpf_iter_meta *meta;
> +	struct task_struct *task;
> +	struct __vm_area_struct *vma;
> +	struct file *file;
> +} __attribute__((preserve_access_index));

Should we also define __vm_area_struct here since it is not available
in old kernels?

> +
>   struct bpf_iter__bpf_map {
>   	struct bpf_iter_meta *meta;
>   	struct bpf_map *map;
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
> new file mode 100644
> index 0000000000000..ba87afe01024c
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
> @@ -0,0 +1,55 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */
> +#include "bpf_iter.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +/* Copied from mm.h */
> +#define VM_READ		0x00000001
> +#define VM_WRITE	0x00000002
> +#define VM_EXEC		0x00000004
> +#define VM_MAYSHARE	0x00000080
> +
> +/* Copied from kdev_t.h */
> +#define MINORBITS	20
> +#define MINORMASK	((1U << MINORBITS) - 1)
> +#define MAJOR(dev)	((unsigned int) ((dev) >> MINORBITS))
> +#define MINOR(dev)	((unsigned int) ((dev) & MINORMASK))
> +
> +#define D_PATH_BUF_SIZE 1024
> +char d_path_buf[D_PATH_BUF_SIZE];
> +__u32 pid;

To please llvm10, maybe
   char d_path_buf[D_PATH_BUF_SIZE] = {};
   __u32 pid = 0;

> +
> +SEC("iter.s/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
> +{
> +	struct __vm_area_struct *vma = ctx->vma;
> +	struct seq_file *seq = ctx->meta->seq;
> +	struct task_struct *task = ctx->task;
> +	struct file *file = ctx->file;
> +	char perm_str[] = "----";
> +
> +	if (task == (void *)0 || vma == (void *)0 || task->pid != pid)

I suppose kernel already filtered all non-group-leader tasks, so here
we can have task->tgid != pid?

> +		return 0;

Using /proc system, user typically do cat /proc/pid/maps. How can we
have a similar user experience with vma_iter here? One way to do this
is:
    - We still have this bpf program, filtering based on user pid,
    - normal bpftool iter pin command pins the program to, say,
/sys/fs/bpf/task_vma
    - since "pid" is in a map, user can use bpftool to update "pid"
      with the target pid.
    - "cat /sys/fs/bpf/task_vma" will work.

One thing here is pid and d_path_buf are global (map) variables, so
if two users are trying to do "cat /sys/fs/bpf/task_vma" at the same
time, there will be interferences and it will not work.

One possible way is during BPF_ITER_CREATE, we duplicate all program
maps. But this is unnecessary as in most cases, the bpf_iter is not
pinned and private to applications.

Any other ideas?


> +
> +	perm_str[0] = (vma->flags & VM_READ) ? 'r' : '-';
> +	perm_str[1] = (vma->flags & VM_WRITE) ? 'w' : '-';
> +	perm_str[2] = (vma->flags & VM_EXEC) ? 'x' : '-';
> +	perm_str[3] = (vma->flags & VM_MAYSHARE) ? 's' : 'p';
> +	BPF_SEQ_PRINTF(seq, "%08llx-%08llx %s ", vma->start, vma->end, perm_str);
> +
> +	if (file) {
> +		__u32 dev = file->f_inode->i_sb->s_dev;
> +
> +		bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE);
> +
> +		BPF_SEQ_PRINTF(seq, "%08llx ", vma->pgoff << 12);
> +		BPF_SEQ_PRINTF(seq, "%02x:%02x %u", MAJOR(dev), MINOR(dev),
> +			       file->f_inode->i_ino);
> +		BPF_SEQ_PRINTF(seq, "\t%s\n", d_path_buf);
> +	} else {
> +		BPF_SEQ_PRINTF(seq, "%08llx 00:00 0\n", 0ULL);
> +	}
> +	return 0;
> +}
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma
  2020-12-16 17:35   ` Song Liu
@ 2020-12-16 18:31     ` Yonghong Song
  0 siblings, 0 replies; 36+ messages in thread
From: Yonghong Song @ 2020-12-16 18:31 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team



On 12/16/20 9:35 AM, Song Liu wrote:
> 
> 
>> On Dec 16, 2020, at 9:00 AM, Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 12/15/20 3:36 PM, Song Liu wrote:
>>> This set introduces bpf_iter for task_vma, which can be used to generate
>>> information similar to /proc/pid/maps or /proc/pid/smaps. Patch 4/4 adds
>>
>> I did not see an example for /proc/pid/smaps. It would be good if you can cover smaps as well since it is used by a lot of people.
> 
> smaps is tricky, as it contains a lot of information, and some of these information
> require architecture and configuration specific logic, e.g., page table structure.
> To really mimic smaps, we will probably need a helper for smap_gather_stats().
> However, I don't think that's really necessary. I think task_vma iterator is most
> useful in gathering information that are not presented in smaps. For example, if a
> vma covers mixed 2MB pages and 4kB pages, smaps won't show which address ranges are
> backed by 2MB pages.

Let us remove "/proc/pid/smaps" from cover letter description then.
Could you add the above information to the patch #1 (and possibly cover 
letter as well) of the series? This can serve one of reasons why we
introduce task_vma iter. Maybe you want to extend bpf programs to
cover this use case.

> 
> I have a test BPF program that parses 4-level x86 page table for huge pages. Since
> we need bounded-loop to parse the page table, the program won't work well for too
> big vma. We can probably add this program to samples/bpf/, but I think it is not
> a good fit for selftests.

This can be done later.

> 
> Thanks,
> Song
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program
  2020-12-16 18:15   ` KP Singh
@ 2020-12-16 18:31     ` KP Singh
  0 siblings, 0 replies; 36+ messages in thread
From: KP Singh @ 2020-12-16 18:31 UTC (permalink / raw)
  To: bpf, Networking; +Cc: KP Singh

On Wed, Dec 16, 2020 at 7:15 PM KP Singh <kpsingh@kernel.org> wrote:
>
> On Wed, Dec 16, 2020 at 1:06 AM Song Liu <songliubraving@fb.com> wrote:
> >
> > task_file and task_vma iter programs have access to file->f_path. Enable
> > bpf_d_path to print paths of these file.
> >
> > bpf_iter programs are generally called in sleepable context. However, it
> > is still necessary to differentiate sleepable and non-sleepable bpf_iter
> > programs: sleepable programs have access to bpf_d_path; non-sleepable
> > programs have access to bpf_spin_lock.
> >
> > Signed-off-by: Song Liu <songliubraving@fb.com>
> > ---
> >  kernel/trace/bpf_trace.c | 5 +++++
> >  1 file changed, 5 insertions(+)
> >
> > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > index 4be771df5549a..9e5f9b968355f 100644
> > --- a/kernel/trace/bpf_trace.c
> > +++ b/kernel/trace/bpf_trace.c
> > @@ -1191,6 +1191,11 @@ BTF_SET_END(btf_allowlist_d_path)
> >
> >  static bool bpf_d_path_allowed(const struct bpf_prog *prog)
> >  {
> > +       if (prog->type == BPF_PROG_TYPE_TRACING &&
> > +           prog->expected_attach_type == BPF_TRACE_ITER &&
> > +           prog->aux->sleepable)
> > +               return true;
>

Another try to send it on the list.

> For the sleepable/non-sleepable we have been (until now) checking
> this in bpf_tracing_func_proto (or bpf_lsm_func_proto)
>
> eg.
>
> case BPF_FUNC_copy_from_user:
> return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
>
> But even beyond that, I don't think this is needed.
>
> We have originally exposed the helper to both sleepable and
> non-sleepable LSM and tracing programs with an allow list.
>
> For LSM the allow list is bpf_lsm_is_sleepable_hook, but
> that's just an initial allow list and thus causes some confusion
> w.r.t. sleepability (maybe we should add a comment there).
>
> Based on the current logic, my understanding is that
> it's okay to use the helper in the allowed hooks in both
> "lsm.s/" and "lsm/" (and the same for
> BPF_PROG_TYPE_TRACING).
>
> We would have required sleepable only if this helper called "dput"
> (which can sleep).
>
> > +
> >         if (prog->type == BPF_PROG_TYPE_LSM)
> >                 return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
> >
> > --
> > 2.24.1
> >

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-16 17:36   ` Yonghong Song
@ 2020-12-16 19:41     ` Song Liu
  0 siblings, 0 replies; 36+ messages in thread
From: Song Liu @ 2020-12-16 19:41 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Networking, ast, daniel, andrii, john.fastabend, kpsingh,
	Kernel Team



> On Dec 16, 2020, at 9:36 AM, Yonghong Song <yhs@fb.com> wrote:
> 
[...]
>> +	if (info->task) {
>> +		curr_task = info->task;
>> +	} else {
>> +		curr_task = task_seq_get_next(ns, &curr_tid, true);
>> +		if (!curr_task) {
>> +			info->task = NULL;
>> +			info->tid++;
> 
> Here, info->tid should be info->tid = curr_tid + 1.
> For exmaple, suppose initial curr_tid = info->tid = 10, and the
> above task_seq_get_next(...) returns NULL with curr_tid = 100
> which means tid = 100 has been visited. So we would like
> to set info->tid = 101 to avoid future potential redundant work.
> Returning NULL here will signal end of iteration but user
> space can still call read()...

Make sense. Let me fix. 

> 
>> +			return NULL;
>> +		}
>> +
>> +		if (curr_tid != info->tid) {
>> +			info->tid = curr_tid;
>> +			new_task = true;
>> +		}
>> +
>> +		if (!curr_task->mm)
>> +			goto next_task;
>> +		info->task = curr_task;
>> +	}
>> +
>> +	mmap_read_lock(curr_task->mm);
>> +	if (new_task) {
>> +		vma = curr_task->mm->mmap;
>> +	} else {
>> +		/* We drop the lock between each iteration, so it is
>> +		 * necessary to use find_vma() to find the next vma. This
>> +		 * is similar to the mechanism in show_smaps_rollup().
>> +		 */
>> +		vma = find_vma(curr_task->mm, info->vma.end - 1);
>> +		/* same vma as previous iteration, use vma->next */
>> +		if (vma && (vma->vm_start == info->vma.start))
>> +			vma = vma->vm_next;
> 
> We may have some issues here if control is returned to user space
> in the middle of iterations. For example,
>   - seq_ops->next() sets info->vma properly (say corresponds to vma1 of tid1)
>   - control returns to user space
>   - control backs to kernel and this is not a new task since
>     tid is the same
>   - but we skipped this vma for show().
> 
> I think the above skipping should be guarded. If the function
> is called from seq_ops->next(), yes it can be skipped.
> If the function is called from seq_ops->start(), it should not
> be skipped.
> 
> Could you double check such a scenario with a smaller buffer
> size for read() in user space?

Yeah, this appeared to be a problem... Thanks for catching it! But I 
am not sure (yet) how to fix it. Let me think more about it. 

Thanks,
Song


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 4/4] selftests/bpf: add test for bpf_iter_task_vma
  2020-12-16 18:18   ` Yonghong Song
@ 2020-12-16 23:23     ` Song Liu
  0 siblings, 0 replies; 36+ messages in thread
From: Song Liu @ 2020-12-16 23:23 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Networking, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, kpsingh, Kernel Team



> On Dec 16, 2020, at 10:18 AM, Yonghong Song <yhs@fb.com> wrote:
> 

[...]

>> +
>> +	err = bpf_iter_task_vma__load(skel);
>> +	if (CHECK(err, "bpf_iter_task_vma__load", "skeleton load failed\n"))
>> +		goto out;
>> +
>> +	do_dummy_read(skel->progs.proc_maps);
> 
> This do_dummy_read() is not needed, right?

do_dummy_read() helped me catch a bug in an earlier version. I am planning to 
change the following to do smaller reads, then do_dummy_read() is no longer
needed. 

[...]

> 
>> +
>> +SEC("iter.s/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
>> +{
>> +	struct __vm_area_struct *vma = ctx->vma;
>> +	struct seq_file *seq = ctx->meta->seq;
>> +	struct task_struct *task = ctx->task;
>> +	struct file *file = ctx->file;
>> +	char perm_str[] = "----";
>> +
>> +	if (task == (void *)0 || vma == (void *)0 || task->pid != pid)
> 
> I suppose kernel already filtered all non-group-leader tasks, so here
> we can have task->tgid != pid?

Yeah, that works. 

> 
>> +		return 0;
> 
> Using /proc system, user typically do cat /proc/pid/maps. How can we
> have a similar user experience with vma_iter here? One way to do this
> is:
>   - We still have this bpf program, filtering based on user pid,
>   - normal bpftool iter pin command pins the program to, say, /sys/fs/bpf/task_vma
>   - since "pid" is in a map, user can use bpftool to update "pid"
>     with the target pid.
>   - "cat /sys/fs/bpf/task_vma" will work.
> 
> One thing here is pid and d_path_buf are global (map) variables, so
> if two users are trying to do "cat /sys/fs/bpf/task_vma" at the same
> time, there will be interferences and it will not work.
> 
> One possible way is during BPF_ITER_CREATE, we duplicate all program
> maps. But this is unnecessary as in most cases, the bpf_iter is not
> pinned and private to applications.
> 
> Any other ideas?

Maybe we can use task local storage for pid and d_path_buf? 

To make it more practical, we probably want in kernel filtering based 
on pid. IOW, let user specify which task to iterate. 

Thanks,
Song


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-15 23:36 ` [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter Song Liu
  2020-12-16 17:36   ` Yonghong Song
@ 2020-12-17  0:34   ` Andrii Nakryiko
  2020-12-17  1:51     ` Song Liu
  2020-12-17 19:03   ` Alexei Starovoitov
  2 siblings, 1 reply; 36+ messages in thread
From: Andrii Nakryiko @ 2020-12-17  0:34 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Networking, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, john fastabend, KP Singh, Kernel Team

On Tue, Dec 15, 2020 at 3:37 PM Song Liu <songliubraving@fb.com> wrote:
>
> Introduce task_vma bpf_iter to print memory information of a process. It
> can be used to print customized information similar to /proc/<pid>/maps.
>
> task_vma iterator releases mmap_lock before calling the BPF program.
> Therefore, we cannot pass vm_area_struct directly to the BPF program. A
> new __vm_area_struct is introduced to keep key information of a vma. On
> each iteration, task_vma gathers information in __vm_area_struct and
> passes it to the BPF program.
>
> If the vma maps to a file, task_vma also holds a reference to the file
> while calling the BPF program.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  include/linux/bpf.h    |   2 +-
>  kernel/bpf/task_iter.c | 205 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 205 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 07cb5d15e7439..49dd1e29c8118 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1325,7 +1325,7 @@ enum bpf_iter_feature {
>         BPF_ITER_RESCHED        = BIT(0),
>  };
>
> -#define BPF_ITER_CTX_ARG_MAX 2
> +#define BPF_ITER_CTX_ARG_MAX 3
>  struct bpf_iter_reg {
>         const char *target;
>         bpf_iter_attach_target_t attach_target;
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 0458a40edf10a..15a066b442f75 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -304,9 +304,183 @@ static const struct seq_operations task_file_seq_ops = {
>         .show   = task_file_seq_show,
>  };
>
> +/*
> + * Key information from vm_area_struct. We need this because we cannot
> + * assume the vm_area_struct is still valid after each iteration.
> + */
> +struct __vm_area_struct {
> +       __u64 start;
> +       __u64 end;
> +       __u64 flags;
> +       __u64 pgoff;

I'd keep the original names of the fields (vm_start, vm_end, etc). But
there are some more fields which seem useful, like vm_page_prot,
vm_mm, etc.

It's quite unfortunate, actually, that bpf_iter program doesn't get
access to the real vm_area_struct, because it won't be able to do much
beyond using fields that we pre-defined here. E.g., there could be
interesting things to do with vm_mm, but unfortunately it won't be
possible.

Is there any way to still provide access to the original
vm_area_struct and let BPF programs use BTF magic to follow all those
pointers (like vm_mm) safely?

> +};
> +

[...]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-17  0:34   ` Andrii Nakryiko
@ 2020-12-17  1:51     ` Song Liu
  0 siblings, 0 replies; 36+ messages in thread
From: Song Liu @ 2020-12-17  1:51 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: bpf, Networking, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, john fastabend, KP Singh, Kernel Team



> On Dec 16, 2020, at 4:34 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> 
> On Tue, Dec 15, 2020 at 3:37 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> Introduce task_vma bpf_iter to print memory information of a process. It
>> can be used to print customized information similar to /proc/<pid>/maps.
>> 
>> task_vma iterator releases mmap_lock before calling the BPF program.
>> Therefore, we cannot pass vm_area_struct directly to the BPF program. A
>> new __vm_area_struct is introduced to keep key information of a vma. On
>> each iteration, task_vma gathers information in __vm_area_struct and
>> passes it to the BPF program.
>> 
>> If the vma maps to a file, task_vma also holds a reference to the file
>> while calling the BPF program.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> include/linux/bpf.h    |   2 +-
>> kernel/bpf/task_iter.c | 205 ++++++++++++++++++++++++++++++++++++++++-
>> 2 files changed, 205 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 07cb5d15e7439..49dd1e29c8118 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1325,7 +1325,7 @@ enum bpf_iter_feature {
>>        BPF_ITER_RESCHED        = BIT(0),
>> };
>> 
>> -#define BPF_ITER_CTX_ARG_MAX 2
>> +#define BPF_ITER_CTX_ARG_MAX 3
>> struct bpf_iter_reg {
>>        const char *target;
>>        bpf_iter_attach_target_t attach_target;
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index 0458a40edf10a..15a066b442f75 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -304,9 +304,183 @@ static const struct seq_operations task_file_seq_ops = {
>>        .show   = task_file_seq_show,
>> };
>> 
>> +/*
>> + * Key information from vm_area_struct. We need this because we cannot
>> + * assume the vm_area_struct is still valid after each iteration.
>> + */
>> +struct __vm_area_struct {
>> +       __u64 start;
>> +       __u64 end;
>> +       __u64 flags;
>> +       __u64 pgoff;
> 
> I'd keep the original names of the fields (vm_start, vm_end, etc).

I thought about the names. Unlike the kernel fs/mm code, where there
are many different start/end/offset/flags, the prefix doesn't seem to 
be helpful in the BPF programs. In fact, it is probably easier for 
the developers to differentiate __vm_area_struct->start and 
vm_area_struct->vm_start.

Also, we have bpf_iter__task_vma->file, which is the same as 
vm_area_struct->vm_file. If we prefix __vm_area_struct members, it 
will be a little weird to name it either "vm_file" or "file".

Given these reasons, I decided to not have vm_ prefix. Does this make
sense? 

> But
> there are some more fields which seem useful, like vm_page_prot,
> vm_mm, etc.

vm_page_prot doesn't really provide extra information than vm_flags. 
Please refer to mm/mmap.c vm_get_page_prot(). 

We have the vm_mm in task->mm, so no need to add it to __vm_area_struct.

> 
> It's quite unfortunate, actually, that bpf_iter program doesn't get
> access to the real vm_area_struct, because it won't be able to do much
> beyond using fields that we pre-defined here. E.g., there could be
> interesting things to do with vm_mm, but unfortunately it won't be
> possible.
> 
> Is there any way to still provide access to the original
> vm_area_struct and let BPF programs use BTF magic to follow all those
> pointers (like vm_mm) safely?

We hold a reference to task, and the task holds a reference to task->mm,
so we can safely probe_read information in mm_struct, like the page 
table. 

Thanks,
Song


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-15 23:36 ` [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter Song Liu
  2020-12-16 17:36   ` Yonghong Song
  2020-12-17  0:34   ` Andrii Nakryiko
@ 2020-12-17 19:03   ` Alexei Starovoitov
  2020-12-17 22:08     ` Song Liu
  2 siblings, 1 reply; 36+ messages in thread
From: Alexei Starovoitov @ 2020-12-17 19:03 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, kernel-team

On Tue, Dec 15, 2020 at 03:36:59PM -0800, Song Liu wrote:
> +/*
> + * Key information from vm_area_struct. We need this because we cannot
> + * assume the vm_area_struct is still valid after each iteration.
> + */
> +struct __vm_area_struct {
> +	__u64 start;
> +	__u64 end;
> +	__u64 flags;
> +	__u64 pgoff;
> +};

Whether it's inside .c or exposed in uapi/bpf.h, it will become uapi
if it's used this way. Let's switch to arbitrary BTF-based access instead.

> +static struct __vm_area_struct *
> +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
> +{
> +	struct pid_namespace *ns = info->common.ns;
> +	struct task_struct *curr_task;
> +	struct vm_area_struct *vma;
> +	u32 curr_tid = info->tid;
> +	bool new_task = false;
> +
> +	/* If this function returns a non-NULL __vm_area_struct, it held
> +	 * a reference to the task_struct. If info->file is non-NULL, it
> +	 * also holds a reference to the file. If this function returns
> +	 * NULL, it does not hold any reference.
> +	 */
> +again:
> +	if (info->task) {
> +		curr_task = info->task;
> +	} else {
> +		curr_task = task_seq_get_next(ns, &curr_tid, true);
> +		if (!curr_task) {
> +			info->task = NULL;
> +			info->tid++;
> +			return NULL;
> +		}
> +
> +		if (curr_tid != info->tid) {
> +			info->tid = curr_tid;
> +			new_task = true;
> +		}
> +
> +		if (!curr_task->mm)
> +			goto next_task;
> +		info->task = curr_task;
> +	}
> +
> +	mmap_read_lock(curr_task->mm);

That will hurt. /proc readers do that and it causes all sorts
of production issues. We cannot take this lock.
There is no need to take it.
Switch the whole thing to probe_read style walking.
And reimplement find_vma with probe_read while omitting vmacache.
It will be short rbtree walk.
bpf prog doesn't need to see a stable struct. It will read it through ptr_to_btf_id
which will use probe_read equivalent underneath.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-17 19:03   ` Alexei Starovoitov
@ 2020-12-17 22:08     ` Song Liu
  2020-12-18  2:34       ` Alexei Starovoitov
  0 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2020-12-17 22:08 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team



> On Dec 17, 2020, at 11:03 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Tue, Dec 15, 2020 at 03:36:59PM -0800, Song Liu wrote:
>> +/*
>> + * Key information from vm_area_struct. We need this because we cannot
>> + * assume the vm_area_struct is still valid after each iteration.
>> + */
>> +struct __vm_area_struct {
>> +	__u64 start;
>> +	__u64 end;
>> +	__u64 flags;
>> +	__u64 pgoff;
>> +};
> 
> Where it's inside .c or exposed in uapi/bpf.h it will become uapi
> if it's used this way. Let's switch to arbitrary BTF-based access instead.
> 
>> +static struct __vm_area_struct *
>> +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>> +{
>> +	struct pid_namespace *ns = info->common.ns;
>> +	struct task_struct *curr_task;
>> +	struct vm_area_struct *vma;
>> +	u32 curr_tid = info->tid;
>> +	bool new_task = false;
>> +
>> +	/* If this function returns a non-NULL __vm_area_struct, it held
>> +	 * a reference to the task_struct. If info->file is non-NULL, it
>> +	 * also holds a reference to the file. If this function returns
>> +	 * NULL, it does not hold any reference.
>> +	 */
>> +again:
>> +	if (info->task) {
>> +		curr_task = info->task;
>> +	} else {
>> +		curr_task = task_seq_get_next(ns, &curr_tid, true);
>> +		if (!curr_task) {
>> +			info->task = NULL;
>> +			info->tid++;
>> +			return NULL;
>> +		}
>> +
>> +		if (curr_tid != info->tid) {
>> +			info->tid = curr_tid;
>> +			new_task = true;
>> +		}
>> +
>> +		if (!curr_task->mm)
>> +			goto next_task;
>> +		info->task = curr_task;
>> +	}
>> +
>> +	mmap_read_lock(curr_task->mm);
> 
> That will hurt. /proc readers do that and it causes all sorts
> of production issues. We cannot take this lock.
> There is no need to take it.
> Switch the whole thing to probe_read style walking.
> And reimplement find_vma with probe_read while omitting vmacache.
> It will be short rbtree walk.
> bpf prog doesn't need to see a stable struct. It will read it through ptr_to_btf_id
> which will use probe_read equivalent underneath.

rw_semaphore is designed to avoid write starvation, so read_lock should not cause
problems unless the lock was taken for an extended period. [1] was a recent fix that 
avoids /proc issue by releasing mmap_lock between iterations. We are using similar
mechanism here. BTW: I changed this to mmap_read_lock_killable() in the next version. 

On the other hand, we need a valid vm_file pointer for bpf_d_path. So walking the 
rbtree without taking any lock would not work. We can avoid taking the lock when 
some SPF like mechanism merged (hopefully soonish). 

Did I miss anything? 

We can improve bpf_iter with some mechanism to specify which task to iterate, so 
that we don't have to iterate through all tasks when the user only wants to inspect 
vmas in one task. 

Thanks,
Song

[1] ff9f47f6f00c ("mm: proc: smaps_rollup: do not stall write attempts on mmap_lock")


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-17 22:08     ` Song Liu
@ 2020-12-18  2:34       ` Alexei Starovoitov
  2020-12-18  3:15         ` Yonghong Song
  2020-12-18  4:33         ` Song Liu
  0 siblings, 2 replies; 36+ messages in thread
From: Alexei Starovoitov @ 2020-12-18  2:34 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team

On Thu, Dec 17, 2020 at 10:08:31PM +0000, Song Liu wrote:
> 
> 
> > On Dec 17, 2020, at 11:03 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > 
> > On Tue, Dec 15, 2020 at 03:36:59PM -0800, Song Liu wrote:
> >> +/*
> >> + * Key information from vm_area_struct. We need this because we cannot
> >> + * assume the vm_area_struct is still valid after each iteration.
> >> + */
> >> +struct __vm_area_struct {
> >> +	__u64 start;
> >> +	__u64 end;
> >> +	__u64 flags;
> >> +	__u64 pgoff;
> >> +};
> > 
> > Where it's inside .c or exposed in uapi/bpf.h it will become uapi
> > if it's used this way. Let's switch to arbitrary BTF-based access instead.
> > 
> >> +static struct __vm_area_struct *
> >> +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
> >> +{
> >> +	struct pid_namespace *ns = info->common.ns;
> >> +	struct task_struct *curr_task;
> >> +	struct vm_area_struct *vma;
> >> +	u32 curr_tid = info->tid;
> >> +	bool new_task = false;
> >> +
> >> +	/* If this function returns a non-NULL __vm_area_struct, it held
> >> +	 * a reference to the task_struct. If info->file is non-NULL, it
> >> +	 * also holds a reference to the file. If this function returns
> >> +	 * NULL, it does not hold any reference.
> >> +	 */
> >> +again:
> >> +	if (info->task) {
> >> +		curr_task = info->task;
> >> +	} else {
> >> +		curr_task = task_seq_get_next(ns, &curr_tid, true);
> >> +		if (!curr_task) {
> >> +			info->task = NULL;
> >> +			info->tid++;
> >> +			return NULL;
> >> +		}
> >> +
> >> +		if (curr_tid != info->tid) {
> >> +			info->tid = curr_tid;
> >> +			new_task = true;
> >> +		}
> >> +
> >> +		if (!curr_task->mm)
> >> +			goto next_task;
> >> +		info->task = curr_task;
> >> +	}
> >> +
> >> +	mmap_read_lock(curr_task->mm);
> > 
> > That will hurt. /proc readers do that and it causes all sorts
> > of production issues. We cannot take this lock.
> > There is no need to take it.
> > Switch the whole thing to probe_read style walking.
> > And reimplement find_vma with probe_read while omitting vmacache.
> > It will be short rbtree walk.
> > bpf prog doesn't need to see a stable struct. It will read it through ptr_to_btf_id
> > which will use probe_read equivalent underneath.
> 
> rw_semaphore is designed to avoid write starvation, so read_lock should not cause
> problem unless the lock was taken for extended period. [1] was a recent fix that 
> avoids /proc issue by releasing mmap_lock between iterations. We are using similar
> mechanism here. BTW: I changed this to mmap_read_lock_killable() in the next version. 
> 
> On the other hand, we need a valid vm_file pointer for bpf_d_path. So walking the 

ahh. I missed that. Makes sense.
vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.

> rbtree without taking any lock would not work. We can avoid taking the lock when 
> some SPF like mechanism merged (hopefully soonish). 
> 
> Did I miss anything? 
> 
> We can improve bpf_iter with some mechanism to specify which task to iterate, so 
> that we don't have to iterate through all tasks when the user only want to inspect 
> vmas in one task. 

yes. let's figure out how to make it parametrizable.
map_iter runs only for given map_fd.
Maybe vma_iter should run only for given pidfd?
I think all_task_all_vmas iter is nice to have, but we don't really need it?

> Thanks,
> Song
> 
> [1] ff9f47f6f00c ("mm: proc: smaps_rollup: do not stall write attempts on mmap_lock")

Thanks for this link. With "if (mmap_lock_is_contended())" check it should work indeed.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-18  2:34       ` Alexei Starovoitov
@ 2020-12-18  3:15         ` Yonghong Song
  2020-12-18  4:33         ` Song Liu
  1 sibling, 0 replies; 36+ messages in thread
From: Yonghong Song @ 2020-12-18  3:15 UTC (permalink / raw)
  To: Alexei Starovoitov, Song Liu
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team



On 12/17/20 6:34 PM, Alexei Starovoitov wrote:
> On Thu, Dec 17, 2020 at 10:08:31PM +0000, Song Liu wrote:
>>
>>
>>> On Dec 17, 2020, at 11:03 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>>
>>> On Tue, Dec 15, 2020 at 03:36:59PM -0800, Song Liu wrote:
>>>> +/*
>>>> + * Key information from vm_area_struct. We need this because we cannot
>>>> + * assume the vm_area_struct is still valid after each iteration.
>>>> + */
>>>> +struct __vm_area_struct {
>>>> +	__u64 start;
>>>> +	__u64 end;
>>>> +	__u64 flags;
>>>> +	__u64 pgoff;
>>>> +};
>>>
>>> Where it's inside .c or exposed in uapi/bpf.h it will become uapi
>>> if it's used this way. Let's switch to arbitrary BTF-based access instead.
>>>
>>>> +static struct __vm_area_struct *
>>>> +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>>>> +{
>>>> +	struct pid_namespace *ns = info->common.ns;
>>>> +	struct task_struct *curr_task;
>>>> +	struct vm_area_struct *vma;
>>>> +	u32 curr_tid = info->tid;
>>>> +	bool new_task = false;
>>>> +
>>>> +	/* If this function returns a non-NULL __vm_area_struct, it held
>>>> +	 * a reference to the task_struct. If info->file is non-NULL, it
>>>> +	 * also holds a reference to the file. If this function returns
>>>> +	 * NULL, it does not hold any reference.
>>>> +	 */
>>>> +again:
>>>> +	if (info->task) {
>>>> +		curr_task = info->task;
>>>> +	} else {
>>>> +		curr_task = task_seq_get_next(ns, &curr_tid, true);
>>>> +		if (!curr_task) {
>>>> +			info->task = NULL;
>>>> +			info->tid++;
>>>> +			return NULL;
>>>> +		}
>>>> +
>>>> +		if (curr_tid != info->tid) {
>>>> +			info->tid = curr_tid;
>>>> +			new_task = true;
>>>> +		}
>>>> +
>>>> +		if (!curr_task->mm)
>>>> +			goto next_task;
>>>> +		info->task = curr_task;
>>>> +	}
>>>> +
>>>> +	mmap_read_lock(curr_task->mm);
>>>
>>> That will hurt. /proc readers do that and it causes all sorts
>>> of production issues. We cannot take this lock.
>>> There is no need to take it.
>>> Switch the whole thing to probe_read style walking.
>>> And reimplement find_vma with probe_read while omitting vmacache.
>>> It will be short rbtree walk.
>>> bpf prog doesn't need to see a stable struct. It will read it through ptr_to_btf_id
>>> which will use probe_read equivalent underneath.
>>
>> rw_semaphore is designed to avoid write starvation, so read_lock should not cause
>> problem unless the lock was taken for extended period. [1] was a recent fix that
>> avoids /proc issue by releasing mmap_lock between iterations. We are using similar
>> mechanism here. BTW: I changed this to mmap_read_lock_killable() in the next version.
>>
>> On the other hand, we need a valid vm_file pointer for bpf_d_path. So walking the
> 
> ahh. I missed that. Makes sense.
> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
> 
>> rbtree without taking any lock would not work. We can avoid taking the lock when
>> some SPF like mechanism merged (hopefully soonish).
>>
>> Did I miss anything?
>>
>> We can improve bpf_iter with some mechanism to specify which task to iterate, so
>> that we don't have to iterate through all tasks when the user only want to inspect
>> vmas in one task.
> 
> yes. let's figure out how to make it parametrizable.
> map_iter runs only for given map_fd.
> Maybe vma_iter should run only for given pidfd?
> I think all_task_all_vmas iter is nice to have, but we don't really need it?

We could make pidfd optional? If pidfd == 0, all vmas of all tasks will 
be traversed. Otherwise, pidfd > 0, only vmas of that pidfd will be 
traversed.

> 
>> Thanks,
>> Song
>>
>> [1] ff9f47f6f00c ("mm: proc: smaps_rollup: do not stall write attempts on mmap_lock")
> 
> Thanks for this link. With "if (mmap_lock_is_contended())" check it should work indeed.
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-18  2:34       ` Alexei Starovoitov
  2020-12-18  3:15         ` Yonghong Song
@ 2020-12-18  4:33         ` Song Liu
  2020-12-18  5:23           ` Alexei Starovoitov
  1 sibling, 1 reply; 36+ messages in thread
From: Song Liu @ 2020-12-18  4:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team



> On Dec 17, 2020, at 6:34 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Thu, Dec 17, 2020 at 10:08:31PM +0000, Song Liu wrote:
>> 
>> 
>>> On Dec 17, 2020, at 11:03 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Tue, Dec 15, 2020 at 03:36:59PM -0800, Song Liu wrote:
>>>> +/*
>>>> + * Key information from vm_area_struct. We need this because we cannot
>>>> + * assume the vm_area_struct is still valid after each iteration.
>>>> + */
>>>> +struct __vm_area_struct {
>>>> +	__u64 start;
>>>> +	__u64 end;
>>>> +	__u64 flags;
>>>> +	__u64 pgoff;
>>>> +};
>>> 
>>> Where it's inside .c or exposed in uapi/bpf.h it will become uapi
>>> if it's used this way. Let's switch to arbitrary BTF-based access instead.
>>> 
>>>> +static struct __vm_area_struct *
>>>> +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>>>> +{
>>>> +	struct pid_namespace *ns = info->common.ns;
>>>> +	struct task_struct *curr_task;
>>>> +	struct vm_area_struct *vma;
>>>> +	u32 curr_tid = info->tid;
>>>> +	bool new_task = false;
>>>> +
>>>> +	/* If this function returns a non-NULL __vm_area_struct, it held
>>>> +	 * a reference to the task_struct. If info->file is non-NULL, it
>>>> +	 * also holds a reference to the file. If this function returns
>>>> +	 * NULL, it does not hold any reference.
>>>> +	 */
>>>> +again:
>>>> +	if (info->task) {
>>>> +		curr_task = info->task;
>>>> +	} else {
>>>> +		curr_task = task_seq_get_next(ns, &curr_tid, true);
>>>> +		if (!curr_task) {
>>>> +			info->task = NULL;
>>>> +			info->tid++;
>>>> +			return NULL;
>>>> +		}
>>>> +
>>>> +		if (curr_tid != info->tid) {
>>>> +			info->tid = curr_tid;
>>>> +			new_task = true;
>>>> +		}
>>>> +
>>>> +		if (!curr_task->mm)
>>>> +			goto next_task;
>>>> +		info->task = curr_task;
>>>> +	}
>>>> +
>>>> +	mmap_read_lock(curr_task->mm);
>>> 
>>> That will hurt. /proc readers do that and it causes all sorts
>>> of production issues. We cannot take this lock.
>>> There is no need to take it.
>>> Switch the whole thing to probe_read style walking.
>>> And reimplement find_vma with probe_read while omitting vmacache.
>>> It will be short rbtree walk.
>>> bpf prog doesn't need to see a stable struct. It will read it through ptr_to_btf_id
>>> which will use probe_read equivalent underneath.
>> 
>> rw_semaphore is designed to avoid write starvation, so read_lock should not cause
>> problem unless the lock was taken for extended period. [1] was a recent fix that 
>> avoids /proc issue by releasing mmap_lock between iterations. We are using similar
>> mechanism here. BTW: I changed this to mmap_read_lock_killable() in the next version. 
>> 
>> On the other hand, we need a valid vm_file pointer for bpf_d_path. So walking the 
> 
> ahh. I missed that. Makes sense.
> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.

Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we 
allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
allow access of vma->vm_file as a valid pointer to struct file. However, since the
vma might be freed, vma->vm_file could point to random data. 

> 
>> rbtree without taking any lock would not work. We can avoid taking the lock when 
>> some SPF like mechanism merged (hopefully soonish). 
>> 
>> Did I miss anything? 
>> 
>> We can improve bpf_iter with some mechanism to specify which task to iterate, so 
>> that we don't have to iterate through all tasks when the user only want to inspect 
>> vmas in one task. 
> 
> yes. let's figure out how to make it parametrizable.
> map_iter runs only for given map_fd.
> Maybe vma_iter should run only for given pidfd?
> I think all_task_all_vmas iter is nice to have, but we don't really need it?
> 
>> Thanks,
>> Song
>> 
>> [1] ff9f47f6f00c ("mm: proc: smaps_rollup: do not stall write attempts on mmap_lock")
> 
> Thanks for this link. With "if (mmap_lock_is_contended())" check it should work indeed.

To make sure we are on the same page: I am using slightly different mechanism in 
task_vma_iter, which doesn't require checking mmap_lock_is_contended(). In the 
smaps_rollup case, the code only unlock mmap_sem when the lock is contended. In 
task_iter, we always unlock mmap_sem between two iterations. This is because we 
don't want to hold mmap_sem while calling the BPF program, which may sleep (calling
bpf_d_path). 

Thanks,
Song



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-18  4:33         ` Song Liu
@ 2020-12-18  5:23           ` Alexei Starovoitov
  2020-12-18 16:38             ` Yonghong Song
  0 siblings, 1 reply; 36+ messages in thread
From: Alexei Starovoitov @ 2020-12-18  5:23 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team

On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
> >
> > ahh. I missed that. Makes sense.
> > vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
>
> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
> allow access of vma->vm_file as a valid pointer to struct file. However, since the
> vma might be freed, vma->vm_file could point to random data.

I don't think so. The proposed patch will do get_file() on it.
There is actually no need to assign it into a different variable.
Accessing it via vma->vm_file is safe and cleaner.

> >> [1] ff9f47f6f00c ("mm: proc: smaps_rollup: do not stall write attempts on mmap_lock")
> >
> > Thanks for this link. With "if (mmap_lock_is_contended())" check it should work indeed.
>
> To make sure we are on the same page: I am using slightly different mechanism in
> task_vma_iter, which doesn't require checking mmap_lock_is_contended(). In the
> smaps_rollup case, the code only unlock mmap_sem when the lock is contended. In
> task_iter, we always unlock mmap_sem between two iterations. This is because we
> don't want to hold mmap_sem while calling the BPF program, which may sleep (calling
> bpf_d_path).

That part is clear. I had to look into mmap_read_lock_killable() implementation
to realize that it's checking for lock_is_contended after acquiring
and releasing
if there is a contention. So it's the same behavior at the end.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-18  5:23           ` Alexei Starovoitov
@ 2020-12-18 16:38             ` Yonghong Song
  2020-12-18 17:23               ` Song Liu
  0 siblings, 1 reply; 36+ messages in thread
From: Yonghong Song @ 2020-12-18 16:38 UTC (permalink / raw)
  To: Alexei Starovoitov, Song Liu
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, Kernel Team



On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
>>>
>>> ahh. I missed that. Makes sense.
>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
>>
>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
>> vma might be freed, vma->vm_file could point to random data.
> 
> I don't think so. The proposed patch will do get_file() on it.
> There is actually no need to assign it into a different variable.
> Accessing it via vma->vm_file is safe and cleaner.

I did not check the code but do you have scenarios where vma is freed 
but old vma->vm_file is not freed due to reference counting, but
freed vma area is reused so vma->vm_file could be garbage?

> 
>>>> [1] ff9f47f6f00c ("mm: proc: smaps_rollup: do not stall write attempts on mmap_lock")
>>>
>>> Thanks for this link. With "if (mmap_lock_is_contended())" check it should work indeed.
>>
>> To make sure we are on the same page: I am using slightly different mechanism in
>> task_vma_iter, which doesn't require checking mmap_lock_is_contended(). In the
>> smaps_rollup case, the code only unlock mmap_sem when the lock is contended. In
>> task_iter, we always unlock mmap_sem between two iterations. This is because we
>> don't want to hold mmap_sem while calling the BPF program, which may sleep (calling
>> bpf_d_path).
> 
> That part is clear. I had to look into mmap_read_lock_killable() implementation
> to realize that it's checking for lock_is_contended after acquiring
> and releasing
> if there is a contention. So it's the same behavior at the end.
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-18 16:38             ` Yonghong Song
@ 2020-12-18 17:23               ` Song Liu
  2021-01-05  1:46                 ` Alexei Starovoitov
  0 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2020-12-18 17:23 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Alexei Starovoitov, bpf, netdev, ast, daniel, andrii,
	john.fastabend, kpsingh, Kernel Team



> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
> 
> 
> 
> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> ahh. I missed that. Makes sense.
>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
>>> 
>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
>>> vma might be freed, vma->vm_file could point to random data.
>> I don't think so. The proposed patch will do get_file() on it.
>> There is actually no need to assign it into a different variable.
>> Accessing it via vma->vm_file is safe and cleaner.
> 
> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
> freed vma area is reused so vma->vm_file could be garbage?

AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
or probe_read would not help with this?

Thanks,
Song

> 
>>>>> [1] ff9f47f6f00c ("mm: proc: smaps_rollup: do not stall write attempts on mmap_lock")
>>>> 
>>>> Thanks for this link. With "if (mmap_lock_is_contended())" check it should work indeed.
>>> 
>>> To make sure we are on the same page: I am using slightly different mechanism in
>>> task_vma_iter, which doesn't require checking mmap_lock_is_contended(). In the
>>> smaps_rollup case, the code only unlock mmap_sem when the lock is contended. In
>>> task_iter, we always unlock mmap_sem between two iterations. This is because we
>>> don't want to hold mmap_sem while calling the BPF program, which may sleep (calling
>>> bpf_d_path).
>> That part is clear. I had to look into mmap_read_lock_killable() implementation
>> to realize that it's checking for lock_is_contended after acquiring
>> and releasing
>> if there is a contention. So it's the same behavior at the end.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2020-12-18 17:23               ` Song Liu
@ 2021-01-05  1:46                 ` Alexei Starovoitov
  2021-01-05  5:47                   ` Song Liu
  0 siblings, 1 reply; 36+ messages in thread
From: Alexei Starovoitov @ 2021-01-05  1:46 UTC (permalink / raw)
  To: Song Liu
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team

On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
> 
> 
> > On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
> > 
> > 
> > 
> > On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
> >> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
> >>>> 
> >>>> ahh. I missed that. Makes sense.
> >>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
> >>> 
> >>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
> >>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
> >>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
> >>> vma might be freed, vma->vm_file could point to random data.
> >> I don't think so. The proposed patch will do get_file() on it.
> >> There is actually no need to assign it into a different variable.
> >> Accessing it via vma->vm_file is safe and cleaner.
> > 
> > I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
> > freed vma area is reused so vma->vm_file could be garbage?
> 
> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
> or probe_read would not help with this?

Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
valid" than the other ptr_to_btf_id, but the user experience will not be great.
Reading such bpf prog will not be obvious. I think it's better to run bpf prog
in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
better performance too. Instead of task_vma_seq_get_next() doing
mmap_lock/unlock at every vma. No need for get_file() either. And no
__vm_area_struct exposure.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2021-01-05  1:46                 ` Alexei Starovoitov
@ 2021-01-05  5:47                   ` Song Liu
  2021-01-05 16:27                     ` Alexei Starovoitov
  0 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2021-01-05  5:47 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team



> On Jan 4, 2021, at 5:46 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
>> 
>> 
>>> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
>>> 
>>> 
>>> 
>>> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
>>>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> ahh. I missed that. Makes sense.
>>>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
>>>>> 
>>>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
>>>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
>>>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
>>>>> vma might be freed, vma->vm_file could point to random data.
>>>> I don't think so. The proposed patch will do get_file() on it.
>>>> There is actually no need to assign it into a different variable.
>>>> Accessing it via vma->vm_file is safe and cleaner.
>>> 
>>> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
>>> freed vma area is reused so vma->vm_file could be garbage?
>> 
>> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
>> or probe_read would not help with this?
> 
> Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
> valid" than the other ptr_to_btf_id, but the user experience will not be great.
> Reading such bpf prog will not be obvious. I think it's better to run bpf prog
> in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
> bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
> better performance too. Instead of task_vma_seq_get_next() doing
> mmap_lock/unlock at every vma. No need for get_file() either. And no
> __vm_area_struct exposure.

I think there might be concern calling BPF program with mmap_lock, especially that 
the program is sleepable (for bpf_d_path). It shouldn't be a problem for common 
cases, but I am not 100% sure for corner cases (many instructions in BPF + sleep). 
Current version is designed to be very safe for the workload, which might be too
conservative. 

Thanks,
Song


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2021-01-05  5:47                   ` Song Liu
@ 2021-01-05 16:27                     ` Alexei Starovoitov
  2021-01-05 17:10                       ` Song Liu
  0 siblings, 1 reply; 36+ messages in thread
From: Alexei Starovoitov @ 2021-01-05 16:27 UTC (permalink / raw)
  To: Song Liu
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team

On Mon, Jan 4, 2021 at 9:47 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Jan 4, 2021, at 5:46 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
> >>
> >>
> >>> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
> >>>
> >>>
> >>>
> >>> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
> >>>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>>>
> >>>>>> ahh. I missed that. Makes sense.
> >>>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
> >>>>>
> >>>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
> >>>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
> >>>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
> >>>>> vma might be freed, vma->vm_file could point to random data.
> >>>> I don't think so. The proposed patch will do get_file() on it.
> >>>> There is actually no need to assign it into a different variable.
> >>>> Accessing it via vma->vm_file is safe and cleaner.
> >>>
> >>> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
> >>> freed vma area is reused so vma->vm_file could be garbage?
> >>
> >> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
> >> or probe_read would not help with this?
> >
> > Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
> > valid" than the other ptr_to_btf_id, but the user experience will not be great.
> > Reading such bpf prog will not be obvious. I think it's better to run bpf prog
> > in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
> > bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
> > better performance too. Instead of task_vma_seq_get_next() doing
> > mmap_lock/unlock at every vma. No need for get_file() either. And no
> > __vm_area_struct exposure.
>
> I think there might be concern calling BPF program with mmap_lock, especially that
> the program is sleepable (for bpf_d_path). It shouldn't be a problem for common
> cases, but I am not 100% sure for corner cases (many instructions in BPF + sleep).
> Current version is designed to be very safe for the workload, which might be too
> conservative.

I know and I agree with all that, but how do you propose to fix the
vm_file concern
without holding the lock while prog is running? I couldn't come up with a way.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2021-01-05 16:27                     ` Alexei Starovoitov
@ 2021-01-05 17:10                       ` Song Liu
  2021-01-05 17:27                         ` Alexei Starovoitov
  0 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2021-01-05 17:10 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team



> On Jan 5, 2021, at 8:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Mon, Jan 4, 2021 at 9:47 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Jan 4, 2021, at 5:46 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
>>>> 
>>>> 
>>>>> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
>>>>> 
>>>>> 
>>>>> 
>>>>> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
>>>>>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>>>> 
>>>>>>>> ahh. I missed that. Makes sense.
>>>>>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
>>>>>>> 
>>>>>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
>>>>>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
>>>>>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
>>>>>>> vma might be freed, vma->vm_file could point to random data.
>>>>>> I don't think so. The proposed patch will do get_file() on it.
>>>>>> There is actually no need to assign it into a different variable.
>>>>>> Accessing it via vma->vm_file is safe and cleaner.
>>>>> 
>>>>> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
>>>>> freed vma area is reused so vma->vm_file could be garbage?
>>>> 
>>>> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
>>>> or probe_read would not help with this?
>>> 
>>> Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
>>> valid" than the other ptr_to_btf_id, but the user experience will not be great.
>>> Reading such bpf prog will not be obvious. I think it's better to run bpf prog
>>> in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
>>> bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
>>> better performance too. Instead of task_vma_seq_get_next() doing
>>> mmap_lock/unlock at every vma. No need for get_file() either. And no
>>> __vm_area_struct exposure.
>> 
>> I think there might be concern calling BPF program with mmap_lock, especially that
>> the program is sleepable (for bpf_d_path). It shouldn't be a problem for common
>> cases, but I am not 100% sure for corner cases (many instructions in BPF + sleep).
>> Current version is designed to be very safe for the workload, which might be too
>> conservative.
> 
> I know and I agree with all that, but how do you propose to fix the
> vm_file concern
> without holding the lock while prog is running? I couldn't come up with a way.

I guess the gap here is that I don't see why __vm_area_struct exposure is an issue. 
It is similar to __sk_buff, and simpler (though we had more reasons to introduce
__sk_buff back when there wasn't BTF). 

If we can accept __vm_area_struct, current version should work, as it doesn't have 
problem with vm_file. 

Thanks,
Song

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2021-01-05 17:10                       ` Song Liu
@ 2021-01-05 17:27                         ` Alexei Starovoitov
  2021-01-05 19:38                           ` Song Liu
  0 siblings, 1 reply; 36+ messages in thread
From: Alexei Starovoitov @ 2021-01-05 17:27 UTC (permalink / raw)
  To: Song Liu
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team

On Tue, Jan 5, 2021 at 9:11 AM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Jan 5, 2021, at 8:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Jan 4, 2021 at 9:47 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Jan 4, 2021, at 5:46 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >>>
> >>> On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
> >>>>
> >>>>
> >>>>> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
> >>>>>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>>>>>
> >>>>>>>> ahh. I missed that. Makes sense.
> >>>>>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
> >>>>>>>
> >>>>>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
> >>>>>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
> >>>>>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
> >>>>>>> vma might be freed, vma->vm_file could point to random data.
> >>>>>> I don't think so. The proposed patch will do get_file() on it.
> >>>>>> There is actually no need to assign it into a different variable.
> >>>>>> Accessing it via vma->vm_file is safe and cleaner.
> >>>>>
> >>>>> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
> >>>>> freed vma area is reused so vma->vm_file could be garbage?
> >>>>
> >>>> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
> >>>> or probe_read would not help with this?
> >>>
> >>> Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
> >>> valid" than the other ptr_to_btf_id, but the user experience will not be great.
> >>> Reading such bpf prog will not be obvious. I think it's better to run bpf prog
> >>> in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
> >>> bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
> >>> better performance too. Instead of task_vma_seq_get_next() doing
> >>> mmap_lock/unlock at every vma. No need for get_file() either. And no
> >>> __vm_area_struct exposure.
> >>
> >> I think there might be concern calling BPF program with mmap_lock, especially that
> >> the program is sleepable (for bpf_d_path). It shouldn't be a problem for common
> >> cases, but I am not 100% sure for corner cases (many instructions in BPF + sleep).
> >> Current version is designed to be very safe for the workload, which might be too
> >> conservative.
> >
> > I know and I agree with all that, but how do you propose to fix the
> > vm_file concern
> > without holding the lock while prog is running? I couldn't come up with a way.
>
> I guess the gap here is that I don't see why __vm_area_struct exposure is an issue.
> It is similar to __sk_buff, and simpler (though we had more reasons to introduce
> __sk_buff back when there wasn't BTF).
>
> If we can accept __vm_area_struct, current version should work, as it doesn't have
> problem with vm_file

True. The problem with __vm_area_struct is that it is a hard coded
uapi with little to no
extensibility. In this form vma iterator is not really useful beyond
the example in selftest.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2021-01-05 17:27                         ` Alexei Starovoitov
@ 2021-01-05 19:38                           ` Song Liu
  2021-01-05 19:46                             ` Alexei Starovoitov
  0 siblings, 1 reply; 36+ messages in thread
From: Song Liu @ 2021-01-05 19:38 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team



> On Jan 5, 2021, at 9:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Tue, Jan 5, 2021 at 9:11 AM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Jan 5, 2021, at 8:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Mon, Jan 4, 2021 at 9:47 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Jan 4, 2021, at 5:46 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>>>> 
>>>>> On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
>>>>>> 
>>>>>> 
>>>>>>> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
>>>>>>> 
>>>>>>> 
>>>>>>> 
>>>>>>> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
>>>>>>>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>>>>>> 
>>>>>>>>>> ahh. I missed that. Makes sense.
>>>>>>>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
>>>>>>>>> 
>>>>>>>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
>>>>>>>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
>>>>>>>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
>>>>>>>>> vma might be freed, vma->vm_file could point to random data.
>>>>>>>> I don't think so. The proposed patch will do get_file() on it.
>>>>>>>> There is actually no need to assign it into a different variable.
>>>>>>>> Accessing it via vma->vm_file is safe and cleaner.
>>>>>>> 
>>>>>>> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
>>>>>>> freed vma area is reused so vma->vm_file could be garbage?
>>>>>> 
>>>>>> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
>>>>>> or probe_read would not help with this?
>>>>> 
>>>>> Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
>>>>> valid" than the other ptr_to_btf_id, but the user experience will not be great.
>>>>> Reading such bpf prog will not be obvious. I think it's better to run bpf prog
>>>>> in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
>>>>> bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
>>>>> better performance too. Instead of task_vma_seq_get_next() doing
>>>>> mmap_lock/unlock at every vma. No need for get_file() either. And no
>>>>> __vm_area_struct exposure.
>>>> 
>>>> I think there might be concern calling BPF program with mmap_lock, especially that
>>>> the program is sleepable (for bpf_d_path). It shouldn't be a problem for common
>>>> cases, but I am not 100% sure for corner cases (many instructions in BPF + sleep).
>>>> Current version is designed to be very safe for the workload, which might be too
>>>> conservative.
>>> 
>>> I know and I agree with all that, but how do you propose to fix the
>>> vm_file concern
>>> without holding the lock while prog is running? I couldn't come up with a way.
>> 
>> I guess the gap here is that I don't see why __vm_area_struct exposure is an issue.
>> It is similar to __sk_buff, and simpler (though we had more reasons to introduce
>> __sk_buff back when there wasn't BTF).
>> 
>> If we can accept __vm_area_struct, current version should work, as it doesn't have
>> problem with vm_file
> 
> True. The problem with __vm_area_struct is that it is a hard coded
> uapi with little to none
> extensibility. In this form vma iterator is not really useful beyond
> the example in selftest.

With __vm_area_struct, we can still probe_read the page table, so we can 
cover more use cases than the selftest. But I agree that it is not as
extensible as feeding real vm_area_struct with BTF to the BPF program. 
Let me try calling BPF program with mmap_lock. 

Thanks,
Song 


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2021-01-05 19:38                           ` Song Liu
@ 2021-01-05 19:46                             ` Alexei Starovoitov
  2021-01-05 19:51                               ` Song Liu
  0 siblings, 1 reply; 36+ messages in thread
From: Alexei Starovoitov @ 2021-01-05 19:46 UTC (permalink / raw)
  To: Song Liu
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team

On Tue, Jan 05, 2021 at 07:38:08PM +0000, Song Liu wrote:
> 
> 
> > On Jan 5, 2021, at 9:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > 
> > On Tue, Jan 5, 2021 at 9:11 AM Song Liu <songliubraving@fb.com> wrote:
> >> 
> >> 
> >> 
> >>> On Jan 5, 2021, at 8:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >>> 
> >>> On Mon, Jan 4, 2021 at 9:47 PM Song Liu <songliubraving@fb.com> wrote:
> >>>> 
> >>>> 
> >>>> 
> >>>>> On Jan 4, 2021, at 5:46 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >>>>> 
> >>>>> On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
> >>>>>> 
> >>>>>> 
> >>>>>>> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
> >>>>>>> 
> >>>>>>> 
> >>>>>>> 
> >>>>>>> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
> >>>>>>>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>>>>>>> 
> >>>>>>>>>> ahh. I missed that. Makes sense.
> >>>>>>>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
> >>>>>>>>> 
> >>>>>>>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
> >>>>>>>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
> >>>>>>>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
> >>>>>>>>> vma might be freed, vma->vm_file could point to random data.
> >>>>>>>> I don't think so. The proposed patch will do get_file() on it.
> >>>>>>>> There is actually no need to assign it into a different variable.
> >>>>>>>> Accessing it via vma->vm_file is safe and cleaner.
> >>>>>>> 
> >>>>>>> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
> >>>>>>> freed vma area is reused so vma->vm_file could be garbage?
> >>>>>> 
> >>>>>> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
> >>>>>> or probe_read would not help with this?
> >>>>> 
> >>>>> Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
> >>>>> valid" than the other ptr_to_btf_id, but the user experience will not be great.
> >>>>> Reading such bpf prog will not be obvious. I think it's better to run bpf prog
> >>>>> in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
> >>>>> bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
> >>>>> better performance too. Instead of task_vma_seq_get_next() doing
> >>>>> mmap_lock/unlock at every vma. No need for get_file() either. And no
> >>>>> __vm_area_struct exposure.
> >>>> 
> >>>> I think there might be concern calling BPF program with mmap_lock, especially that
> >>>> the program is sleepable (for bpf_d_path). It shouldn't be a problem for common
> >>>> cases, but I am not 100% sure for corner cases (many instructions in BPF + sleep).
> >>>> Current version is designed to be very safe for the workload, which might be too
> >>>> conservative.
> >>> 
> >>> I know and I agree with all that, but how do you propose to fix the
> >>> vm_file concern
> >>> without holding the lock while prog is running? I couldn't come up with a way.
> >> 
> >> I guess the gap here is that I don't see why __vm_area_struct exposure is an issue.
> >> It is similar to __sk_buff, and simpler (though we had more reasons to introduce
> >> __sk_buff back when there wasn't BTF).
> >> 
> >> If we can accept __vm_area_struct, current version should work, as it doesn't have
> >> problem with vm_file
> > 
> > True. The problem with __vm_area_struct is that it is a hard coded
> > uapi with little to none
> > extensibility. In this form vma iterator is not really useful beyond
> > the example in selftest.
> 
> With __vm_area_struct, we can still probe_read the page table, so we can 
> cover more use cases than the selftest. But I agree that it is not as
> extensible as feeding real vm_area_struct with BTF to the BPF program. 

Another confusing thing with __vm_area_struct is vm_flags field.
It's copied directly. As __vm_area_struct->flags this field is uapi field,
but its contents are kernel internal. We avoided such corner cases in the past.
When flags field is copied into uapi the kernel internal flags are encoded
and exposed as separate uapi flags. That was the case with
BPF_TCP_* flags. If we have to do this with vm_flags (VM_EXEC, etc) that
might kill the patchset due to abi concerns.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter
  2021-01-05 19:46                             ` Alexei Starovoitov
@ 2021-01-05 19:51                               ` Song Liu
  0 siblings, 0 replies; 36+ messages in thread
From: Song Liu @ 2021-01-05 19:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Yonghong Song, bpf, netdev, ast, daniel, andrii, john.fastabend,
	kpsingh, Kernel Team



> On Jan 5, 2021, at 11:46 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Tue, Jan 05, 2021 at 07:38:08PM +0000, Song Liu wrote:
>> 
>> 
>>> On Jan 5, 2021, at 9:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Tue, Jan 5, 2021 at 9:11 AM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Jan 5, 2021, at 8:27 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>>>> 
>>>>> On Mon, Jan 4, 2021 at 9:47 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On Jan 4, 2021, at 5:46 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Fri, Dec 18, 2020 at 05:23:25PM +0000, Song Liu wrote:
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> On Dec 18, 2020, at 8:38 AM, Yonghong Song <yhs@fb.com> wrote:
>>>>>>>>> 
>>>>>>>>> 
>>>>>>>>> 
>>>>>>>>> On 12/17/20 9:23 PM, Alexei Starovoitov wrote:
>>>>>>>>>> On Thu, Dec 17, 2020 at 8:33 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>>>>>>>> 
>>>>>>>>>>>> ahh. I missed that. Makes sense.
>>>>>>>>>>>> vm_file needs to be accurate, but vm_area_struct should be accessed as ptr_to_btf_id.
>>>>>>>>>>> 
>>>>>>>>>>> Passing pointer of vm_area_struct into BPF will be tricky. For example, shall we
>>>>>>>>>>> allow the user to access vma->vm_file? IIUC, with ptr_to_btf_id the verifier will
>>>>>>>>>>> allow access of vma->vm_file as a valid pointer to struct file. However, since the
>>>>>>>>>>> vma might be freed, vma->vm_file could point to random data.
>>>>>>>>>> I don't think so. The proposed patch will do get_file() on it.
>>>>>>>>>> There is actually no need to assign it into a different variable.
>>>>>>>>>> Accessing it via vma->vm_file is safe and cleaner.
>>>>>>>>> 
>>>>>>>>> I did not check the code but do you have scenarios where vma is freed but old vma->vm_file is not freed due to reference counting, but
>>>>>>>>> freed vma area is reused so vma->vm_file could be garbage?
>>>>>>>> 
>>>>>>>> AFAIK, once we unlock mmap_sem, the vma could be freed and reused. I guess ptr_to_btf_id
>>>>>>>> or probe_read would not help with this?
>>>>>>> 
>>>>>>> Theoretically we can hack the verifier to treat some ptr_to_btf_id as "less
>>>>>>> valid" than the other ptr_to_btf_id, but the user experience will not be great.
>>>>>>> Reading such bpf prog will not be obvious. I think it's better to run bpf prog
>>>>>>> in mmap_lock then and let it access vma->vm_file. After prog finishes the iter
>>>>>>> bit can do if (mmap_lock_is_contended()) before iterating. That will deliver
>>>>>>> better performance too. Instead of task_vma_seq_get_next() doing
>>>>>>> mmap_lock/unlock at every vma. No need for get_file() either. And no
>>>>>>> __vm_area_struct exposure.
>>>>>> 
>>>>>> I think there might be concern calling BPF program with mmap_lock, especially that
>>>>>> the program is sleepable (for bpf_d_path). It shouldn't be a problem for common
>>>>>> cases, but I am not 100% sure for corner cases (many instructions in BPF + sleep).
>>>>>> Current version is designed to be very safe for the workload, which might be too
>>>>>> conservative.
>>>>> 
>>>>> I know and I agree with all that, but how do you propose to fix the
>>>>> vm_file concern
>>>>> without holding the lock while prog is running? I couldn't come up with a way.
>>>> 
>>>> I guess the gap here is that I don't see why __vm_area_struct exposure is an issue.
>>>> It is similar to __sk_buff, and simpler (though we had more reasons to introduce
>>>> __sk_buff back when there wasn't BTF).
>>>> 
>>>> If we can accept __vm_area_struct, current version should work, as it doesn't have
>>>> problem with vm_file
>>> 
>>> True. The problem with __vm_area_struct is that it is a hard coded
>>> uapi with little to none
>>> extensibility. In this form vma iterator is not really useful beyond
>>> the example in selftest.
>> 
>> With __vm_area_struct, we can still probe_read the page table, so we can 
>> cover more use cases than the selftest. But I agree that it is not as
>> extensible as feeding real vm_area_struct with BTF to the BPF program. 
> 
> Another confusing thing with __vm_area_struct is vm_flags field.
> It's copied directly. As __vm_area_struct->flags this field is uapi field,
> but its contents are kernel internal. We avoided such corner cases in the past.
> When flags field is copied into uapi the kernel internal flags are encoded
> and exposed as separate uapi flags. That was the case with
> BPF_TCP_* flags. If we have to do this with vm_flags (VM_EXEC, etc) that
> might kill the patchset due to abi concerns.

This makes sense. It shouldn't be uapi without extra encoding. 

Song


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program
  2020-12-15 23:37 ` [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program Song Liu
  2020-12-16 17:41   ` Yonghong Song
  2020-12-16 18:15   ` KP Singh
@ 2021-01-25 12:52   ` Jiri Olsa
  2 siblings, 0 replies; 36+ messages in thread
From: Jiri Olsa @ 2021-01-25 12:52 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, ast, daniel, andrii, john.fastabend, kpsingh, kernel-team

On Tue, Dec 15, 2020 at 03:37:00PM -0800, Song Liu wrote:
> task_file and task_vma iter programs have access to file->f_path. Enable
> bpf_d_path to print paths of these files.
> 
> bpf_iter programs are generally called in sleepable context. However, it
> is still necessary to differentiate sleepable and non-sleepable bpf_iter
> programs: sleepable programs have access to bpf_d_path; non-sleepable
> programs have access to bpf_spin_lock.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  kernel/trace/bpf_trace.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 4be771df5549a..9e5f9b968355f 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1191,6 +1191,11 @@ BTF_SET_END(btf_allowlist_d_path)
>  
>  static bool bpf_d_path_allowed(const struct bpf_prog *prog)
>  {
> +	if (prog->type == BPF_PROG_TYPE_TRACING &&
> +	    prog->expected_attach_type == BPF_TRACE_ITER &&
> +	    prog->aux->sleepable)
> +		return true;
> +
>  	if (prog->type == BPF_PROG_TYPE_LSM)
>  		return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
>  
> -- 
> 2.24.1
> 

would be great to have this merged for bpftrace

Tested-by: Jiri Olsa <jolsa@redhat.com>

thanks,
jirka


^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2021-01-26  5:36 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-15 23:36 [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Song Liu
2020-12-15 23:36 ` [PATCH v2 bpf-next 1/4] bpf: introduce task_vma bpf_iter Song Liu
2020-12-16 17:36   ` Yonghong Song
2020-12-16 19:41     ` Song Liu
2020-12-17  0:34   ` Andrii Nakryiko
2020-12-17  1:51     ` Song Liu
2020-12-17 19:03   ` Alexei Starovoitov
2020-12-17 22:08     ` Song Liu
2020-12-18  2:34       ` Alexei Starovoitov
2020-12-18  3:15         ` Yonghong Song
2020-12-18  4:33         ` Song Liu
2020-12-18  5:23           ` Alexei Starovoitov
2020-12-18 16:38             ` Yonghong Song
2020-12-18 17:23               ` Song Liu
2021-01-05  1:46                 ` Alexei Starovoitov
2021-01-05  5:47                   ` Song Liu
2021-01-05 16:27                     ` Alexei Starovoitov
2021-01-05 17:10                       ` Song Liu
2021-01-05 17:27                         ` Alexei Starovoitov
2021-01-05 19:38                           ` Song Liu
2021-01-05 19:46                             ` Alexei Starovoitov
2021-01-05 19:51                               ` Song Liu
2020-12-15 23:37 ` [PATCH v2 bpf-next 2/4] bpf: allow bpf_d_path in sleepable bpf_iter program Song Liu
2020-12-16 17:41   ` Yonghong Song
2020-12-16 18:15   ` KP Singh
2020-12-16 18:31     ` KP Singh
2021-01-25 12:52   ` Jiri Olsa
2020-12-15 23:37 ` [PATCH v2 bpf-next 3/4] libbpf: introduce section "iter.s/" for " Song Liu
2020-12-16 17:42   ` Yonghong Song
2020-12-16 18:00     ` KP Singh
2020-12-15 23:37 ` [PATCH v2 bpf-next 4/4] selftests/bpf: add test for bpf_iter_task_vma Song Liu
2020-12-16 18:18   ` Yonghong Song
2020-12-16 23:23     ` Song Liu
2020-12-16 17:00 ` [PATCH v2 bpf-next 0/4] introduce bpf_iter for task_vma Yonghong Song
2020-12-16 17:35   ` Song Liu
2020-12-16 18:31     ` Yonghong Song

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).