bpf.vger.kernel.org archive mirror
* [PATCH bpf-next v2 0/3] Parameterize task iterators.
From: Kui-Feng Lee @ 2022-08-01 23:26 UTC
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Allow creating an iterator that loops through resources of one task/thread.

People could only create iterators to loop through all resources of
files, vma, and tasks in the system, even though they were interested in only the
resources of a specific task or process.  Passing the additional
parameters, people can now create an iterator to go through all
resources or only the resources of a task.
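
As a sketch of the intended usage (adapted from the selftests in patch
3; it assumes the UAPI and libbpf definitions added by this series, and
trims error handling):

#include <sys/syscall.h>
#include <unistd.h>
#include <bpf/libbpf.h>

/* Attach `prog` (an iter/task, iter/task_file, or iter/task_vma
 * program) so that it visits only the calling process.  Passing NULL
 * opts to bpf_program__attach_iter() keeps the old behavior and
 * iterates every task (BPF_TASK_ITER_ALL).
 */
static struct bpf_link *attach_one_task_iter(struct bpf_program *prog)
{
	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	union bpf_iter_link_info linfo = {};
	struct bpf_link *link;
	int pidfd;

	pidfd = syscall(SYS_pidfd_open, getpid(), 0);
	if (pidfd < 0)
		return NULL;

	linfo.task.pid_fd = pidfd;
	linfo.task.type = BPF_TASK_ITER_TID;
	opts.link_info = &linfo;
	opts.link_info_len = sizeof(linfo);

	/* the attach call resolves the pidfd to a tid, so the fd can be
	 * closed right afterwards
	 */
	link = bpf_program__attach_iter(prog, &opts);
	close(pidfd);
	return link;
}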

Major Changes:

 - Add new parameters in bpf_iter_link_info to indicate whether to go
   through all tasks or only a specific task.

 - Change the implementations of BPF iterators of vma, files, and
   tasks to allow going through only the resources of a specific task.

 - Provide the arguments of parameterized task iterators in
   bpf_link_info.

Kui-Feng Lee (3):
  bpf: Parameterize task iterators.
  bpf: Handle bpf_link_info for the parameterized task BPF iterators.
  selftests/bpf: Test parameterized task BPF iterators.

 include/linux/bpf.h                           |   4 +
 include/uapi/linux/bpf.h                      |  27 ++++
 kernel/bpf/task_iter.c                        | 103 ++++++++++---
 tools/include/uapi/linux/bpf.h                |  27 ++++
 .../selftests/bpf/prog_tests/bpf_iter.c       | 143 +++++++++++++++---
 .../selftests/bpf/prog_tests/btf_dump.c       |   2 +-
 .../selftests/bpf/progs/bpf_iter_task.c       |   9 ++
 .../selftests/bpf/progs/bpf_iter_task_file.c  |   7 +
 .../selftests/bpf/progs/bpf_iter_task_vma.c   |   6 +-
 9 files changed, 282 insertions(+), 46 deletions(-)

-- 
2.30.2



* [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
From: Kui-Feng Lee @ 2022-08-01 23:26 UTC
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Allow creating an iterator that loops through resources of one task/thread.

People could only create iterators to loop through all resources of
files, vma, and tasks in the system, even though they were interested
in only the resources of a specific task or process.  Passing the
additional parameters, people can now create an iterator to go
through all resources or only the resources of a task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 include/linux/bpf.h            |  4 ++
 include/uapi/linux/bpf.h       | 23 +++++++++
 kernel/bpf/task_iter.c         | 93 ++++++++++++++++++++++++++--------
 tools/include/uapi/linux/bpf.h | 23 +++++++++
 4 files changed, 121 insertions(+), 22 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 11950029284f..3c26dbfc9cef 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
 
 struct bpf_iter_aux_info {
 	struct bpf_map *map;
+	struct {
+		u32	tid;
+		u8	type;
+	} task;
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ffcbf79a556b..ed5ba501609f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_task_iter_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+	/*
+	 * Parameters of task iterators.
+	 */
+	struct {
+		__u32   pid_fd;
+		/*
+		 * The type of the iterator.
+		 *
+		 * It can be one of enum bpf_task_iter_type.
+		 *
+		 * BPF_TASK_ITER_ALL (default)
+		 *	The iterator iterates over resources of every process.
+		 *
+		 * BPF_TASK_ITER_TID
+		 *	You should also set *pid_fd* to iterate over one task.
+		 */
+		__u8	type;	/* BPF_TASK_ITER_* */
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..9942601e1dfb 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -12,6 +12,8 @@
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
+	u32	tid;
+	u8	type;
 };
 
 struct bpf_iter_seq_task_info {
@@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
 	u32 tid;
 };
 
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
 					     u32 *tid,
 					     bool skip_if_dup_files)
 {
 	struct task_struct *task = NULL;
 	struct pid *pid;
 
+	if (common->type == BPF_TASK_ITER_TID) {
+		if (*tid && *tid != common->tid)
+			return NULL;
+		rcu_read_lock();
+		pid = find_pid_ns(common->tid, common->ns);
+		if (pid) {
+			task = get_pid_task(pid, PIDTYPE_PID);
+			*tid = common->tid;
+		}
+		rcu_read_unlock();
+		return task;
+	}
+
 	rcu_read_lock();
 retry:
-	pid = find_ge_pid(*tid, ns);
+	pid = find_ge_pid(*tid, common->ns);
 	if (pid) {
-		*tid = pid_nr_ns(pid, ns);
+		*tid = pid_nr_ns(pid, common->ns);
 		task = get_pid_task(pid, PIDTYPE_PID);
 		if (!task) {
 			++*tid;
@@ -56,7 +71,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_task_info *info = seq->private;
 	struct task_struct *task;
 
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+	task = task_seq_get_next(&info->common, &info->tid, false);
+
 	if (!task)
 		return NULL;
 
@@ -73,7 +89,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	++*pos;
 	++info->tid;
 	put_task_struct((struct task_struct *)v);
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+
+	task = task_seq_get_next(&info->common, &info->tid, false);
 	if (!task)
 		return NULL;
 
@@ -117,6 +134,30 @@ static void task_seq_stop(struct seq_file *seq, void *v)
 		put_task_struct((struct task_struct *)v);
 }
 
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+				union bpf_iter_link_info *linfo,
+				struct bpf_iter_aux_info *aux)
+{
+	unsigned int flags;
+	struct task_struct *tsk;
+
+	if (linfo->task.type == BPF_TASK_ITER_ALL && linfo->task.pid_fd != 0)
+		return -EINVAL;
+
+	aux->task.type = linfo->task.type;
+
+	if (linfo->task.type == BPF_TASK_ITER_TID) {
+		tsk = pidfd_get_task(linfo->task.pid_fd, &flags);
+		if (IS_ERR(tsk))
+			return PTR_ERR(tsk);
+
+		aux->task.tid = tsk->pid;
+		put_task_struct(tsk);
+	}
+
+	return 0;
+}
+
 static const struct seq_operations task_seq_ops = {
 	.start	= task_seq_start,
 	.next	= task_seq_next,
@@ -137,8 +178,7 @@ struct bpf_iter_seq_task_file_info {
 static struct file *
 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 	struct task_struct *curr_task;
 	unsigned int curr_fd = info->fd;
 
@@ -151,21 +191,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 		curr_task = info->task;
 		curr_fd = info->fd;
 	} else {
-                curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
                 if (!curr_task) {
                         info->task = NULL;
-                        info->tid = curr_tid;
                         return NULL;
                 }
 
-                /* set info->task and info->tid */
+		/* set info->task */
 		info->task = curr_task;
-		if (curr_tid == info->tid) {
+		if (saved_tid == info->tid)
 			curr_fd = info->fd;
-		} else {
-			info->tid = curr_tid;
+		else
 			curr_fd = 0;
-		}
 	}
 
 	rcu_read_lock();
@@ -186,9 +223,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 	/* the current task is done, go to the next task */
 	rcu_read_unlock();
 	put_task_struct(curr_task);
+
+	if (info->common.type == BPF_TASK_ITER_TID) {
+		info->task = NULL;
+		return NULL;
+	}
+
 	info->task = NULL;
 	info->fd = 0;
-	curr_tid = ++(info->tid);
+	saved_tid = ++(info->tid);
 	goto again;
 }
 
@@ -269,6 +312,8 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
 	struct bpf_iter_seq_task_common *common = priv_data;
 
 	common->ns = get_pid_ns(task_active_pid_ns(current));
+	common->type = aux->task.type;
+	common->tid = aux->task.tid;
 	return 0;
 }
 
@@ -307,11 +352,10 @@ enum bpf_task_vma_iter_find_op {
 static struct vm_area_struct *
 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
 	enum bpf_task_vma_iter_find_op op;
 	struct vm_area_struct *curr_vma;
 	struct task_struct *curr_task;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 
 	/* If this function returns a non-NULL vma, it holds a reference to
 	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +415,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 		}
 	} else {
 again:
-		curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
 		if (!curr_task) {
-			info->tid = curr_tid + 1;
+			info->tid++;
 			goto finish;
 		}
 
-		if (curr_tid != info->tid) {
-			info->tid = curr_tid;
+		if (saved_tid != info->tid) {
 			/* new task, process the first vma */
 			op = task_vma_iter_first_vma;
 		} else {
@@ -430,9 +473,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 	return curr_vma;
 
 next_task:
+	if (info->common.type == BPF_TASK_ITER_TID)
+		goto finish;
+
 	put_task_struct(curr_task);
 	info->task = NULL;
-	curr_tid++;
+	info->tid++;
 	goto again;
 
 finish:
@@ -533,6 +579,7 @@ static const struct bpf_iter_seq_info task_seq_info = {
 
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
@@ -551,6 +598,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
 
 static struct bpf_iter_reg task_file_reg_info = {
 	.target			= "task_file",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
@@ -571,6 +619,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
 
 static struct bpf_iter_reg task_vma_reg_info = {
 	.target			= "task_vma",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ffcbf79a556b..ed5ba501609f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_task_iter_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+	/*
+	 * Parameters of task iterators.
+	 */
+	struct {
+		__u32   pid_fd;
+		/*
+		 * The type of the iterator.
+		 *
+		 * It can be one of enum bpf_task_iter_type.
+		 *
+		 * BPF_TASK_ITER_ALL (default)
+		 *	The iterator iterates over resources of every process.
+		 *
+		 * BPF_TASK_ITER_TID
+		 *	You should also set *pid_fd* to iterate over one task.
+		 */
+		__u8	type;	/* BPF_TASK_ITER_* */
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
-- 
2.30.2



* [PATCH bpf-next v2 2/3] bpf: Handle bpf_link_info for the parameterized task BPF iterators.
From: Kui-Feng Lee @ 2022-08-01 23:26 UTC
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Add new fields to bpf_link_info so that users can query them through
bpf_obj_get_info_by_fd().
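
A minimal sketch of the intended query flow (the selftest in patch 3
exercises this same path; `link` is assumed to be a link returned by
bpf_program__attach_iter(), and error handling is omitted):

	struct bpf_link_info info = {};
	__u32 info_len = sizeof(info);

	if (!bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len) &&
	    info.iter.task.type == BPF_TASK_ITER_TID)
		printf("iterating over tid %u\n", info.iter.task.tid);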

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/task_iter.c         | 10 ++++++++++
 tools/include/uapi/linux/bpf.h |  4 ++++
 3 files changed, 18 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ed5ba501609f..33e6325d8d87 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6158,6 +6158,10 @@ struct bpf_link_info {
 				struct {
 					__u32 map_id;
 				} map;
+				struct {
+					__u32 tid;
+					__u8 type; /* one of enum bpf_task_iter_type */
+				} task;
 			};
 		} iter;
 		struct  {
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 9942601e1dfb..610ec96333f3 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -577,6 +577,13 @@ static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
 };
 
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+	info->iter.task.tid = aux->task.tid;
+	info->iter.task.type = aux->task.type;
+	return 0;
+}
+
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
 	.attach_target		= bpf_iter_attach_task,
@@ -587,6 +594,7 @@ static struct bpf_iter_reg task_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -608,6 +616,7 @@ static struct bpf_iter_reg task_file_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_file_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -629,6 +638,7 @@ static struct bpf_iter_reg task_vma_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_vma_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ed5ba501609f..33e6325d8d87 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6158,6 +6158,10 @@ struct bpf_link_info {
 				struct {
 					__u32 map_id;
 				} map;
+				struct {
+					__u32 tid;
+					__u8 type; /* one of enum bpf_task_iter_type */
+				} task;
 			};
 		} iter;
 		struct  {
-- 
2.30.2



* [PATCH bpf-next v2 3/3] selftests/bpf: Test parameterized task BPF iterators.
From: Kui-Feng Lee @ 2022-08-01 23:26 UTC
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Test the vma, files, and task iterators for a specific task.

Ensure the API works appropriately for both going through all tasks and
going through a particular task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 .../selftests/bpf/prog_tests/bpf_iter.c       | 143 +++++++++++++++---
 .../selftests/bpf/prog_tests/btf_dump.c       |   2 +-
 .../selftests/bpf/progs/bpf_iter_task.c       |   9 ++
 .../selftests/bpf/progs/bpf_iter_task_file.c  |   7 +
 .../selftests/bpf/progs/bpf_iter_task_vma.c   |   6 +-
 5 files changed, 143 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index a33874b081b6..effffbfad901 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 #include <test_progs.h>
+#include <sys/syscall.h>
+#include <unistd.h>
 #include "bpf_iter_ipv6_route.skel.h"
 #include "bpf_iter_netlink.skel.h"
 #include "bpf_iter_bpf_map.skel.h"
@@ -42,13 +44,13 @@ static void test_btf_id_or_null(void)
 	}
 }
 
-static void do_dummy_read(struct bpf_program *prog)
+static void do_dummy_read(struct bpf_program *prog, struct bpf_iter_attach_opts *opts)
 {
 	struct bpf_link *link;
 	char buf[16] = {};
 	int iter_fd, len;
 
-	link = bpf_program__attach_iter(prog, NULL);
+	link = bpf_program__attach_iter(prog, opts);
 	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		return;
 
@@ -91,7 +93,7 @@ static void test_ipv6_route(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_ipv6_route__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_ipv6_route);
+	do_dummy_read(skel->progs.dump_ipv6_route, NULL);
 
 	bpf_iter_ipv6_route__destroy(skel);
 }
@@ -104,7 +106,7 @@ static void test_netlink(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_netlink__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_netlink);
+	do_dummy_read(skel->progs.dump_netlink, NULL);
 
 	bpf_iter_netlink__destroy(skel);
 }
@@ -117,22 +119,79 @@ static void test_bpf_map(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_map__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_bpf_map);
+	do_dummy_read(skel->progs.dump_bpf_map, NULL);
 
 	bpf_iter_bpf_map__destroy(skel);
 }
 
+static int pidfd_open(pid_t pid, unsigned int flags)
+{
+	return syscall(SYS_pidfd_open, pid, flags);
+}
+
+static void check_bpf_link_info(const struct bpf_program *prog)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+	struct bpf_link_info info = {};
+	__u32 info_len;
+	struct bpf_link *link;
+	int err;
+
+	linfo.task.pid_fd = pidfd_open(getpid(), 0);
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(prog, &opts);
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
+		return;
+
+	info_len = sizeof(info);
+	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
+	if (ASSERT_OK(err, "bpf_obj_get_info_by_fd")) {
+		ASSERT_EQ(info.iter.task.type, BPF_TASK_ITER_TID, "check_task_type");
+		ASSERT_EQ(info.iter.task.tid, getpid(), "check_task_tid");
+	}
+
+	bpf_link__destroy(link);
+	close(linfo.task.pid_fd);
+}
+
 static void test_task(void)
 {
 	struct bpf_iter_task *skel;
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
 
 	skel = bpf_iter_task__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_task);
+	linfo.task.pid_fd = pidfd_open(getpid(), 0);
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	skel->bss->tid = getpid();
+
+	do_dummy_read(skel->progs.dump_task, &opts);
+
+	ASSERT_EQ(skel->bss->num_unknown_tid, 0, "check_num_unknown_tid");
+	ASSERT_EQ(skel->bss->num_known_tid, 1, "check_num_known_tid");
+
+	skel->bss->num_unknown_tid = 0;
+	skel->bss->num_known_tid = 0;
+
+	do_dummy_read(skel->progs.dump_task, NULL);
+
+	ASSERT_GE(skel->bss->num_unknown_tid, 0, "check_num_unknown_tid");
+	ASSERT_EQ(skel->bss->num_known_tid, 1, "check_num_known_tid");
+
+	check_bpf_link_info(skel->progs.dump_task);
 
 	bpf_iter_task__destroy(skel);
+	close(linfo.task.pid_fd);
 }
 
 static void test_task_sleepable(void)
@@ -143,7 +202,7 @@ static void test_task_sleepable(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_task_sleepable);
+	do_dummy_read(skel->progs.dump_task_sleepable, NULL);
 
 	ASSERT_GT(skel->bss->num_expected_failure_copy_from_user_task, 0,
 		  "num_expected_failure_copy_from_user_task");
@@ -161,8 +220,8 @@ static void test_task_stack(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_stack__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_task_stack);
-	do_dummy_read(skel->progs.get_task_user_stacks);
+	do_dummy_read(skel->progs.dump_task_stack, NULL);
+	do_dummy_read(skel->progs.get_task_user_stacks, NULL);
 
 	bpf_iter_task_stack__destroy(skel);
 }
@@ -174,7 +233,9 @@ static void *do_nothing(void *arg)
 
 static void test_task_file(void)
 {
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
 	struct bpf_iter_task_file *skel;
+	union bpf_iter_link_info linfo;
 	pthread_t thread_id;
 	void *ret;
 
@@ -188,16 +249,33 @@ static void test_task_file(void)
 		  "pthread_create"))
 		goto done;
 
-	do_dummy_read(skel->progs.dump_task_file);
+	linfo.task.pid_fd = pidfd_open(getpid(), 0);
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	do_dummy_read(skel->progs.dump_task_file, &opts);
 
 	if (!ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL,
 		  "pthread_join"))
 		goto done;
 
 	ASSERT_EQ(skel->bss->count, 0, "check_count");
+	ASSERT_EQ(skel->bss->unique_tgid_count, 1, "check_unique_tgid_count");
+
+	skel->bss->count = 0;
+	skel->bss->unique_tgid_count = 0;
+
+	do_dummy_read(skel->progs.dump_task_file, NULL);
 
-done:
+	ASSERT_GE(skel->bss->count, 0, "check_count");
+	ASSERT_GE(skel->bss->unique_tgid_count, 1, "check_unique_tgid_count");
+
+	check_bpf_link_info(skel->progs.dump_task_file);
+
+ done:
 	bpf_iter_task_file__destroy(skel);
+	close(linfo.task.pid_fd);
 }
 
 #define TASKBUFSZ		32768
@@ -274,7 +352,7 @@ static void test_tcp4(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_tcp4__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_tcp4);
+	do_dummy_read(skel->progs.dump_tcp4, NULL);
 
 	bpf_iter_tcp4__destroy(skel);
 }
@@ -287,7 +365,7 @@ static void test_tcp6(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_tcp6__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_tcp6);
+	do_dummy_read(skel->progs.dump_tcp6, NULL);
 
 	bpf_iter_tcp6__destroy(skel);
 }
@@ -300,7 +378,7 @@ static void test_udp4(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_udp4__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_udp4);
+	do_dummy_read(skel->progs.dump_udp4, NULL);
 
 	bpf_iter_udp4__destroy(skel);
 }
@@ -313,7 +391,7 @@ static void test_udp6(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_udp6__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_udp6);
+	do_dummy_read(skel->progs.dump_udp6, NULL);
 
 	bpf_iter_udp6__destroy(skel);
 }
@@ -326,7 +404,7 @@ static void test_unix(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_unix__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_unix);
+	do_dummy_read(skel->progs.dump_unix, NULL);
 
 	bpf_iter_unix__destroy(skel);
 }
@@ -988,7 +1066,7 @@ static void test_bpf_sk_storage_get(void)
 	if (!ASSERT_OK(err, "bpf_map_update_elem"))
 		goto close_socket;
 
-	do_dummy_read(skel->progs.fill_socket_owner);
+	do_dummy_read(skel->progs.fill_socket_owner, NULL);
 
 	err = bpf_map_lookup_elem(map_fd, &sock_fd, &val);
 	if (CHECK(err || val != getpid(), "bpf_map_lookup_elem",
@@ -996,7 +1074,7 @@ static void test_bpf_sk_storage_get(void)
 	    getpid(), val, err))
 		goto close_socket;
 
-	do_dummy_read(skel->progs.negate_socket_local_storage);
+	do_dummy_read(skel->progs.negate_socket_local_storage, NULL);
 
 	err = bpf_map_lookup_elem(map_fd, &sock_fd, &val);
 	CHECK(err || val != -getpid(), "bpf_map_lookup_elem",
@@ -1116,7 +1194,7 @@ static void test_link_iter(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_link__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_bpf_link);
+	do_dummy_read(skel->progs.dump_bpf_link, NULL);
 
 	bpf_iter_bpf_link__destroy(skel);
 }
@@ -1129,7 +1207,7 @@ static void test_ksym_iter(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_ksym__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_ksym);
+	do_dummy_read(skel->progs.dump_ksym, NULL);
 
 	bpf_iter_ksym__destroy(skel);
 }
@@ -1154,7 +1232,7 @@ static void str_strip_first_line(char *str)
 	*dst = '\0';
 }
 
-static void test_task_vma(void)
+static void test_task_vma_(struct bpf_iter_attach_opts *opts)
 {
 	int err, iter_fd = -1, proc_maps_fd = -1;
 	struct bpf_iter_task_vma *skel;
@@ -1166,13 +1244,14 @@ static void test_task_vma(void)
 		return;
 
 	skel->bss->pid = getpid();
+	skel->bss->one_task = opts ? 1 : 0;
 
 	err = bpf_iter_task_vma__load(skel);
 	if (!ASSERT_OK(err, "bpf_iter_task_vma__load"))
 		goto out;
 
 	skel->links.proc_maps = bpf_program__attach_iter(
-		skel->progs.proc_maps, NULL);
+		skel->progs.proc_maps, opts);
 
 	if (!ASSERT_OK_PTR(skel->links.proc_maps, "bpf_program__attach_iter")) {
 		skel->links.proc_maps = NULL;
@@ -1211,12 +1290,32 @@ static void test_task_vma(void)
 	str_strip_first_line(proc_maps_output);
 
 	ASSERT_STREQ(task_vma_output, proc_maps_output, "compare_output");
+
+	check_bpf_link_info(skel->progs.proc_maps);
+
 out:
 	close(proc_maps_fd);
 	close(iter_fd);
 	bpf_iter_task_vma__destroy(skel);
 }
 
+static void test_task_vma(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.pid_fd = pidfd_open(getpid(), 0);
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	test_task_vma_(&opts);
+	test_task_vma_(NULL);
+
+	close(linfo.task.pid_fd);
+}
+
 void test_bpf_iter(void)
 {
 	if (test__start_subtest("btf_id_or_null"))
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index 5fce7008d1ff..d7c2954a352b 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d,
 
 	/* union with nested struct */
 	TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT,
-			   "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}",
+			   "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.task = (struct){.pid_fd = (__u32)1,},}",
 			   { .map = { .map_fd = 1 }});
 
 	/* struct skb with nested structs/unions; because type output is so
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
index d22741272692..96131b9a1caa 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -6,6 +6,10 @@
 
 char _license[] SEC("license") = "GPL";
 
+uint32_t tid = 0;
+int num_unknown_tid = 0;
+int num_known_tid = 0;
+
 SEC("iter/task")
 int dump_task(struct bpf_iter__task *ctx)
 {
@@ -18,6 +22,11 @@ int dump_task(struct bpf_iter__task *ctx)
 		return 0;
 	}
 
+	if (task->pid != tid)
+		num_unknown_tid++;
+	else
+		num_known_tid++;
+
 	if (ctx->meta->seq_num == 0)
 		BPF_SEQ_PRINTF(seq, "    tgid      gid\n");
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
index 6e7b400888fe..031455ed8748 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -7,6 +7,8 @@ char _license[] SEC("license") = "GPL";
 
 int count = 0;
 int tgid = 0;
+int last_tgid = -1;
+int unique_tgid_count = 0;
 
 SEC("iter/task_file")
 int dump_task_file(struct bpf_iter__task_file *ctx)
@@ -27,6 +29,11 @@ int dump_task_file(struct bpf_iter__task_file *ctx)
 	if (tgid == task->tgid && task->tgid != task->pid)
 		count++;
 
+	if (last_tgid != task->tgid) {
+		last_tgid = task->tgid;
+		unique_tgid_count++;
+	}
+
 	BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
 		       (long)file->f_op);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
index 4ea6a37d1345..44f4a31c2ddd 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
@@ -20,6 +20,7 @@ char _license[] SEC("license") = "GPL";
 #define D_PATH_BUF_SIZE 1024
 char d_path_buf[D_PATH_BUF_SIZE] = {};
 __u32 pid = 0;
+__u32 one_task = 0;
 
 SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
 {
@@ -33,8 +34,11 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
 		return 0;
 
 	file = vma->vm_file;
-	if (task->tgid != pid)
+	if (task->tgid != pid) {
+		if (one_task)
+			BPF_SEQ_PRINTF(seq, "unexpected task (%d != %d)", task->tgid, pid);
 		return 0;
+	}
 	perm_str[0] = (vma->vm_flags & VM_READ) ? 'r' : '-';
 	perm_str[1] = (vma->vm_flags & VM_WRITE) ? 'w' : '-';
 	perm_str[2] = (vma->vm_flags & VM_EXEC) ? 'x' : '-';
-- 
2.30.2



* Re: [PATCH bpf-next v2 0/3] Parameterize task iterators.
From: Kui-Feng Lee @ 2022-08-01 23:35 UTC
  To: daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Mon, 2022-08-01 at 16:26 -0700, Kui-Feng Lee wrote:
> Allow creating an iterator that loops through resources of one
> task/thread.
> 
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the
> resources of a specific task or process.  Passing the additional
> parameters, people can now create an iterator to go through all
> resources or only the resources of a task.
> 
> Major Changes:
> 
>  - Add new parameters in bpf_iter_link_info to indicate whether to go
>    through all tasks or only a specific task.
> 
>  - Change the implementations of BPF iterators of vma, files, and
>    tasks to allow going through only the resources of a specific
> task.
> 
>  - Provide the arguments of parameterized task iterators in
>    bpf_link_info.

The major changes since v1:
 - Fix the issue of task_seq_get_next() returning NULL when called
   more than once.
 - Use pidfd instead of pid/tid while keeping the 'type'.


> 
> Kui-Feng Lee (3):
>   bpf: Parameterize task iterators.
>   bpf: Handle bpf_link_info for the parameterized task BPF iterators.
>   selftests/bpf: Test parameterized task BPF iterators.
> 
>  include/linux/bpf.h                           |   4 +
>  include/uapi/linux/bpf.h                      |  27 ++++
>  kernel/bpf/task_iter.c                        | 103 ++++++++++---
>  tools/include/uapi/linux/bpf.h                |  27 ++++
>  .../selftests/bpf/prog_tests/bpf_iter.c       | 143 +++++++++++++++-
> --
>  .../selftests/bpf/prog_tests/btf_dump.c       |   2 +-
>  .../selftests/bpf/progs/bpf_iter_task.c       |   9 ++
>  .../selftests/bpf/progs/bpf_iter_task_file.c  |   7 +
>  .../selftests/bpf/progs/bpf_iter_task_vma.c   |   6 +-
>  9 files changed, 282 insertions(+), 46 deletions(-)
> 



* Re: [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
From: Alexei Starovoitov @ 2022-08-02  1:49 UTC
  To: Kui-Feng Lee
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Kernel Team, Yonghong Song

On Mon, Aug 1, 2022 at 4:27 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> Allow creating an iterator that loops through resources of one task/thread.
>
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>  include/linux/bpf.h            |  4 ++
>  include/uapi/linux/bpf.h       | 23 +++++++++
>  kernel/bpf/task_iter.c         | 93 ++++++++++++++++++++++++++--------
>  tools/include/uapi/linux/bpf.h | 23 +++++++++
>  4 files changed, 121 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 11950029284f..3c26dbfc9cef 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>
>  struct bpf_iter_aux_info {
>         struct bpf_map *map;
> +       struct {
> +               u32     tid;
> +               u8      type;
> +       } task;
>  };
>
>  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index ffcbf79a556b..ed5ba501609f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
>         __u32   attach_type;            /* program attach type (enum bpf_attach_type) */
>  };
>
> +enum bpf_task_iter_type {
> +       BPF_TASK_ITER_ALL = 0,
> +       BPF_TASK_ITER_TID,
> +};
> +
>  union bpf_iter_link_info {
>         struct {
>                 __u32   map_fd;
>         } map;
> +       /*
> +        * Parameters of task iterators.
> +        */
> +       struct {
> +               __u32   pid_fd;
> +               /*
> +                * The type of the iterator.
> +                *
> +                * It can be one of enum bpf_task_iter_type.
> +                *
> +                * BPF_TASK_ITER_ALL (default)
> +                *      The iterator iterates over resources of every process.
> +                *
> +                * BPF_TASK_ITER_TID
> +                *      You should also set *pid_fd* to iterate over one task.
> +                */
> +               __u8    type;   /* BPF_TASK_ITER_* */

__u8 might be a pain for future extensibility.
big vs little endian will be another potential issue.

Maybe use enum bpf_task_iter_type type; here and
move the comment to enum def ?
Or rename it to '__u32 flags;' ?


* Re: [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
From: Andrii Nakryiko @ 2022-08-02  3:30 UTC
  To: Kui-Feng Lee; +Cc: bpf, ast, daniel, andrii, kernel-team, yhs

On Mon, Aug 1, 2022 at 4:27 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> Allow creating an iterator that loops through resources of one task/thread.
>
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>  include/linux/bpf.h            |  4 ++
>  include/uapi/linux/bpf.h       | 23 +++++++++
>  kernel/bpf/task_iter.c         | 93 ++++++++++++++++++++++++++--------
>  tools/include/uapi/linux/bpf.h | 23 +++++++++
>  4 files changed, 121 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 11950029284f..3c26dbfc9cef 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>
>  struct bpf_iter_aux_info {
>         struct bpf_map *map;
> +       struct {
> +               u32     tid;
> +               u8      type;
> +       } task;
>  };
>
>  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index ffcbf79a556b..ed5ba501609f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
>         __u32   attach_type;            /* program attach type (enum bpf_attach_type) */
>  };
>
> +enum bpf_task_iter_type {
> +       BPF_TASK_ITER_ALL = 0,
> +       BPF_TASK_ITER_TID,
> +};
> +
>  union bpf_iter_link_info {
>         struct {
>                 __u32   map_fd;
>         } map;
> +       /*
> +        * Parameters of task iterators.
> +        */
> +       struct {
> +               __u32   pid_fd;

I was a bit late to the discussion about pidfd vs plain pid. I think
we should support both in this API. While pid_fd has some nice
guarantees like avoiding the risk of accidental PID reuse, in a lot
(if not all) cases where task/task_vma/task_file iterators are going
to be used this is never a risk, because pid will usually come from
some tracing BPF program (kprobe/tp/fentry/etc), like in case of
profiling, and then will be used by user-space almost immediately to
query some additional information (fetching relevant vma information
for the profiling use case). So the main benefit of pidfd is not that relevant
for BPF tracing use cases, because PIDs are not going to be reused so
fast within such a short time frame.

But pidfd does have downsides. It requires 2 syscalls (pidfd_open and
close) for each PID, it creates struct file for each such active
pidfd. So it will have non-trivial overhead for high-frequency BPF
iterator use cases (imagine querying some simple stats for a big set
of tasks, frequently: you'll spend more time in pidfd syscalls and
more resources just keeping corresponding struct file open than
actually doing useful BPF work). For simple BPF iter cases it will
unnecessarily complicate the program flow while giving no benefit.

So I propose we support both in UAPI. Internally either way we resolve
to plain pid/tid, so this won't cause added maintenance burden. But
simple cases will stay simple, while more long-lived and/or
complicated ones will still be supported. We then can have
BPF_TASK_ITER_PIDFD vs BPF_TASK_ITER_TID to differentiate whether the
above __u32 pid_fd (which we should probably rename to something more
generic like "target") is pid FD or TID/PID. See also below about TID
vs PID.
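
Roughly something like this (an illustrative sketch only; the names
are placeholders, not a final proposal):

enum bpf_task_iter_type {
	BPF_TASK_ITER_ALL = 0,
	BPF_TASK_ITER_TID,	/* 'target' is a plain tid */
	BPF_TASK_ITER_PIDFD,	/* 'target' is a pidfd */
};

union bpf_iter_link_info {
	struct {
		__u32	map_fd;
	} map;
	struct {
		__u32	target;	/* tid or pidfd, depending on 'type' */
		__u32	type;	/* one of enum bpf_task_iter_type */
	} task;
};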

> +               /*
> +                * The type of the iterator.
> +                *
> +                * It can be one of enum bpf_task_iter_type.
> +                *
> +                * BPF_TASK_ITER_ALL (default)
> +                *      The iterator iterates over resources of every process.
> +                *
> +                * BPF_TASK_ITER_TID
> +                *      You should also set *pid_fd* to iterate over one task.

naming nit: we should decide whether we use TID (thread) and PID
(process) terminology (more usual for user-space) or PID (process ==
task == user-space thread) and TGID (thread group, i.e. user-space
process). I haven't investigated much what's we use most consistently,
but curious to hear what others think.

Also I can see use-cases where we want to iterate just specified task
(i.e., just specified thread) vs all the tasks that belong to the same
process group (i.e., thread within process). Naming TBD, but we should
have BPF_TASK_ITER_TID and BPF_TASK_ITER_TGID (or some other naming).

One might ask why do we need single-task mode if we can always stop
iteration from BPF program, but this is trivial only for iter/task,
while for iter/task_vma and iter/task_file it becomes inconvenient to
detect switch from one task to another. It costs us essentially
nothing to support this mode, so I advocate to do that.

I have similar thoughts about cgroup iteration modes and actually
supporting cgroup_fd as target for task iterators (which will mean
iterating tasks belonging to provided cgroup(s)), but I'll reply on
cgroup iterator patch first, and we can just reuse the same cgroup
target specification between iter/cgroup and iter/task afterwards.


> +                */
> +               __u8    type;   /* BPF_TASK_ITER_* */
> +       } task;
>  };
>

[...]


* Re: [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
From: Kui-Feng Lee @ 2022-08-02 16:42 UTC
  To: andrii.nakryiko; +Cc: daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Mon, 2022-08-01 at 20:30 -0700, Andrii Nakryiko wrote:
> On Mon, Aug 1, 2022 at 4:27 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > Allow creating an iterator that loops through resources of one
> > task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were
> > interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > ---
> >  include/linux/bpf.h            |  4 ++
> >  include/uapi/linux/bpf.h       | 23 +++++++++
> >  kernel/bpf/task_iter.c         | 93 ++++++++++++++++++++++++++----
> > ----
> >  tools/include/uapi/linux/bpf.h | 23 +++++++++
> >  4 files changed, 121 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..3c26dbfc9cef 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > *pathname, int flags);
> > 
> >  struct bpf_iter_aux_info {
> >         struct bpf_map *map;
> > +       struct {
> > +               u32     tid;
> > +               u8      type;
> > +       } task;
> >  };
> > 
> >  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index ffcbf79a556b..ed5ba501609f 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
> >         __u32   attach_type;            /* program attach type
> > (enum bpf_attach_type) */
> >  };
> > 
> > +enum bpf_task_iter_type {
> > +       BPF_TASK_ITER_ALL = 0,
> > +       BPF_TASK_ITER_TID,
> > +};
> > +
> >  union bpf_iter_link_info {
> >         struct {
> >                 __u32   map_fd;
> >         } map;
> > +       /*
> > +        * Parameters of task iterators.
> > +        */
> > +       struct {
> > +               __u32   pid_fd;
> 
> I was a bit late to the discussion about pidfd vs plain pid. I think
> we should support both in this API. While pid_fd has some nice
> guarantees like avoiding the risk of accidental PID reuse, in a lot
> (if not all) cases where task/task_vma/task_file iterators are going
> to be used this is never a risk, because pid will usually come from
> some tracing BPF program (kprobe/tp/fentry/etc), like in case of
> profiling, and then will be used by user-space almost immediately to
> query some additional information (fetching relevant vma information
> for profiling use case). So main benefit of pidfd is not that
> relevant
> for BPF tracing use cases, because PIDs are not going to be reused so
> fast within such a short time frame.
> 
> But pidfd does have downsides. It requires 2 syscalls (pidfd_open and
> close) for each PID, it creates struct file for each such active
> pidfd. So it will have non-trivial overhead for high-frequency BPF
> iterator use cases (imagine querying some simple stats for a big set
> of tasks, frequently: you'll spend more time in pidfd syscalls and
> more resources just keeping corresponding struct file open than
> actually doing useful BPF work). For simple BPF iter cases it will
> unnecessarily complicate program flow while giving no benefit
> instead.

It is a good point to have more syscalls.

> 
> So I propose we support both in UAPI. Internally either way we
> resolve
> to plain pid/tid, so this won't cause added maintenance burden. But
> simple cases will keep simple, while more long-lived and/or
> complicated ones will still be supported. We then can have
> BPF_TASK_ITER_PIDFD vs BPF_TASK_ITER_TID to differentiate whether the
> above __u32 pid_fd (which we should probably rename to something more
> generic like "target") is pid FD or TID/PID. See also below about TID
> vs PID.
> 
> > +               /*
> > +                * The type of the iterator.
> > +                *
> > +                * It can be one of enum bpf_task_iter_type.
> > +                *
> > +                * BPF_TASK_ITER_ALL (default)
> > +                *      The iterator iterates over resources of
> > every process.
> > +                *
> > +                * BPF_TASK_ITER_TID
> > +                *      You should also set *pid_fd* to iterate
> > over one task.
> 
> naming nit: we should decide whether we use TID (thread) and PID
> (process) terminology (more usual for user-space) or PID (process ==
> task == user-space thread) and TGID (thread group, i.e. user-space
> process). I haven't investigated much what's we use most
> consistently,
> but curious to hear what others think.
> 
> Also I can see use-cases where we want to iterate just specified task
> (i.e., just specified thread) vs all the tasks that belong to the
> same
> process group (i.e., thread within process). Naming TBD, but we
> should
> have BPF_TASK_ITER_TID and BPF_TASK_ITER_TGID (or some other naming).


I discussed iterators over the resources of all tasks of a process
with Yonghong.  User code should create an iterator for each thread of
the process if necessary.  We may add support for tgid if it is highly
demanded.

In the discussion of using pidfd, people mentioned extending pidfd to
threads if there is a good use-case.  That also applies to our case.
Most of the time, if not always, vma & files are shared by all threads
of a process.  So, iterating over all resources of every thread of a
process has no obvious benefit.  The same is true for an iterator over
the resources of a specific thread instead of a process.

> 
> One might ask why do we need single-task mode if we can always stop
> iteration from BPF program, but this is trivial only for iter/task,
> while for iter/task_vma and iter/task_file it becomes inconvenient to
> detect switch from one task to another. It costs us essentially
> nothing to support this mode, so I advocate to do that.
> 
> I have similar thoughts about cgroup iteration modes and actually
> supporting cgroup_fd as target for task iterators (which will mean
> iterating tasks belonging to provided cgroup(s)), but I'll reply on
> cgroup iterator patch first, and we can just reuse the same cgroup
> target specification between iter/cgroup and iter/task afterwards.
> 
> 
> > +                */
> > +               __u8    type;   /* BPF_TASK_ITER_* */
> > +       } task;
> >  };
> > 
> 
> [...]



* Re: [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
From: Kui-Feng Lee @ 2022-08-02 16:47 UTC
  To: alexei.starovoitov; +Cc: daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Mon, 2022-08-01 at 18:49 -0700, Alexei Starovoitov wrote:
> On Mon, Aug 1, 2022 at 4:27 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > Allow creating an iterator that loops through resources of one
> > task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were
> > interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > ---
> >  include/linux/bpf.h            |  4 ++
> >  include/uapi/linux/bpf.h       | 23 +++++++++
> >  kernel/bpf/task_iter.c         | 93 ++++++++++++++++++++++++++----
> > ----
> >  tools/include/uapi/linux/bpf.h | 23 +++++++++
> >  4 files changed, 121 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..3c26dbfc9cef 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > *pathname, int flags);
> > 
> >  struct bpf_iter_aux_info {
> >         struct bpf_map *map;
> > +       struct {
> > +               u32     tid;
> > +               u8      type;
> > +       } task;
> >  };
> > 
> >  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index ffcbf79a556b..ed5ba501609f 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
> >         __u32   attach_type;            /* program attach type
> > (enum bpf_attach_type) */
> >  };
> > 
> > +enum bpf_task_iter_type {
> > +       BPF_TASK_ITER_ALL = 0,
> > +       BPF_TASK_ITER_TID,
> > +};
> > +
> >  union bpf_iter_link_info {
> >         struct {
> >                 __u32   map_fd;
> >         } map;
> > +       /*
> > +        * Parameters of task iterators.
> > +        */
> > +       struct {
> > +               __u32   pid_fd;
> > +               /*
> > +                * The type of the iterator.
> > +                *
> > +                * It can be one of enum bpf_task_iter_type.
> > +                *
> > +                * BPF_TASK_ITER_ALL (default)
> > +                *      The iterator iterates over resources of
> > every process.
> > +                *
> > +                * BPF_TASK_ITER_TID
> > +                *      You should also set *pid_fd* to iterate
> > over one task.
> > +                */
> > +               __u8    type;   /* BPF_TASK_ITER_* */
> 
> __u8 might be a pain for future extensibility.

Do you mean the problem caused by padding?

> big vs little endian will be another potential issue.

Do we need binary compatibility across different platforms?
I don't get the point about endianness.  Could you explain it more?

> 
> Maybe use enum bpf_task_iter_type type; here and
> move the comment to enum def ?
> Or rename it to '__u32 flags;' ?



* Re: [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
From: Andrii Nakryiko @ 2022-08-02 21:17 UTC
  To: Kui-Feng Lee; +Cc: daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Tue, Aug 2, 2022 at 9:42 AM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Mon, 2022-08-01 at 20:30 -0700, Andrii Nakryiko wrote:
> > On Mon, Aug 1, 2022 at 4:27 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
> > >
> > > Allow creating an iterator that loops through resources of one
> > > task/thread.
> > >
> > > People could only create iterators to loop through all resources of
> > > files, vma, and tasks in the system, even though they were
> > > interested
> > > in only the resources of a specific task or process.  Passing the
> > > additional parameters, people can now create an iterator to go
> > > through all resources or only the resources of a task.
> > >
> > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > ---
> > >  include/linux/bpf.h            |  4 ++
> > >  include/uapi/linux/bpf.h       | 23 +++++++++
> > >  kernel/bpf/task_iter.c         | 93 ++++++++++++++++++++++++++----
> > > ----
> > >  tools/include/uapi/linux/bpf.h | 23 +++++++++
> > >  4 files changed, 121 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 11950029284f..3c26dbfc9cef 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > *pathname, int flags);
> > >
> > >  struct bpf_iter_aux_info {
> > >         struct bpf_map *map;
> > > +       struct {
> > > +               u32     tid;
> > > +               u8      type;
> > > +       } task;
> > >  };
> > >
> > >  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index ffcbf79a556b..ed5ba501609f 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
> > >         __u32   attach_type;            /* program attach type
> > > (enum bpf_attach_type) */
> > >  };
> > >
> > > +enum bpf_task_iter_type {
> > > +       BPF_TASK_ITER_ALL = 0,
> > > +       BPF_TASK_ITER_TID,
> > > +};
> > > +
> > >  union bpf_iter_link_info {
> > >         struct {
> > >                 __u32   map_fd;
> > >         } map;
> > > +       /*
> > > +        * Parameters of task iterators.
> > > +        */
> > > +       struct {
> > > +               __u32   pid_fd;
> >
> > I was a bit late to the discussion about pidfd vs plain pid. I think
> > we should support both in this API. While pid_fd has some nice
> > guarantees like avoiding the risk of accidental PID reuse, in a lot
> > (if not all) cases where task/task_vma/task_file iterators are going
> > to be used this is never a risk, because pid will usually come from
> > some tracing BPF program (kprobe/tp/fentry/etc), like in case of
> > profiling, and then will be used by user-space almost immediately to
> > query some additional information (fetching relevant vma information
> > for profiling use case). So main benefit of pidfd is not that
> > relevant
> > for BPF tracing use cases, because PIDs are not going to be reused so
> > fast within such a short time frame.
> >
> > But pidfd does have downsides. It requires 2 syscalls (pidfd_open and
> > close) for each PID, it creates struct file for each such active
> > pidfd. So it will have non-trivial overhead for high-frequency BPF
> > iterator use cases (imagine querying some simple stats for a big set
> > of tasks, frequently: you'll spend more time in pidfd syscalls and
> > more resources just keeping corresponding struct file open than
> > actually doing useful BPF work). For simple BPF iter cases it will
> > unnecessarily complicate program flow while giving no benefit
> > instead.
>
> It is a good point to have more syscalls.
>
> >
> > So I propose we support both in UAPI. Internally either way we
> > resolve
> > to plain pid/tid, so this won't cause added maintenance burden. But
> > simple cases will keep simple, while more long-lived and/or
> > complicated ones will still be supported. We then can have
> > BPF_TASK_ITER_PIDFD vs BPF_TASK_ITER_TID to differentiate whether the
> > above __u32 pid_fd (which we should probably rename to something more
> > generic like "target") is pid FD or TID/PID. See also below about TID
> > vs PID.
> >
> > > +               /*
> > > +                * The type of the iterator.
> > > +                *
> > > +                * It can be one of enum bpf_task_iter_type.
> > > +                *
> > > +                * BPF_TASK_ITER_ALL (default)
> > > +                *      The iterator iterates over resources of
> > > every process.
> > > +                *
> > > +                * BPF_TASK_ITER_TID
> > > +                *      You should also set *pid_fd* to iterate
> > > over one task.
> >
> > naming nit: we should decide whether we use TID (thread) and PID
> > (process) terminology (more usual for user-space) or PID (process ==
> > task == user-space thread) and TGID (thread group, i.e. user-space
> > process). I haven't investigated much what we use most
> > consistently,
> > but curious to hear what others think.
> >
> > Also I can see use-cases where we want to iterate just the specified
> > task (i.e., just the specified thread) vs all the tasks that belong
> > to the same process group (i.e., the threads within a process).
> > Naming TBD, but we should have BPF_TASK_ITER_TID and
> > BPF_TASK_ITER_TGID (or some other naming).
>
>
> I discussed iterators over the resources of all tasks of a process
> with Yonghong.  User code should create iterators for each thread of
> the process if necessary.  We may add tgid support if it is highly
> demanded.
>
> In a discussion about using pidfd, people mentioned extending pidfd
> to threads if there is a good use-case.  That also applies to our
> case.  Most of the time, if not always, vma & files are shared by all
> threads of a process, so iterating over the resources of every thread
> of a process brings no obvious benefit.  The same holds for an
> iterator over the resources of a specific thread instead of a
> process.
>

Ok, so two different points here.

First, TID (thread) vs TGID (process) modes. I'd define TGID mode as:
the user specifies some TID and we resolve it to the thread group
leader's TID (that is, we resolve the thread to its process), and then
iterate all threads within that process. In TID (thread) mode, we
accept the specified TID as exactly the thread we iterate (even if
it's the thread group leader, we iterate only that specific thread,
not all threads in the process).
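
A quick sketch of that resolution step (illustrative only, not the
actual patch; find_task_by_vpid() must run under the RCU read lock):

	struct task_struct *task;
	pid_t tgid = -1;

	rcu_read_lock();
	task = find_task_by_vpid(tid);	/* accept any thread's id */
	if (task)
		tgid = task->tgid;	/* thread group leader's id */
	rcu_read_unlock();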

Second, about the point that all threads within a process share vma,
file table, etc. That's true. But you are forgetting about iter/task,
which iterates just tasks; TGID mode is very useful for that use case.
For task_vma/task_file we can probably keep the logic we have today:
if a thread has the same file table or mm_struct as the thread group
leader, we skip that thread when iterating vmas and files.
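
A sketch of that skip test as it might look for task_file
(hypothetical helper, not the actual kernel code; a real check would
also need proper locking around ->files):

	static bool task_file_is_duplicate(struct task_struct *task)
	{
		/* A thread sharing the leader's file table would only
		 * repeat the leader's files, so skip it.
		 */
		return task != task->group_leader &&
		       task->files == task->group_leader->files;
	}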

Thoughts?


> >
> > One might ask why we need a single-task mode if we can always stop
> > iteration from the BPF program, but that is trivial only for iter/task,
> > while for iter/task_vma and iter/task_file it becomes inconvenient to
> > detect the switch from one task to another. It costs us essentially
> > nothing to support this mode, so I advocate doing that.
> >
> > I have similar thoughts about cgroup iteration modes and actually
> > supporting cgroup_fd as target for task iterators (which will mean
> > iterating tasks belonging to provided cgroup(s)), but I'll reply on
> > cgroup iterator patch first, and we can just reuse the same cgroup
> > target specification between iter/cgroup and iter/task afterwards.
> >
> >
> > > +                */
> > > +               __u8    type;   /* BPF_TASK_ITER_* */
> > > +       } task;
> > >  };
> > >
> >
> > [...]
>
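
For reference, a rough userspace sketch of attaching with the v2
parameters discussed above (a hedged sketch, assuming libbpf's
bpf_iter_attach_opts; the skeleton and program names are
placeholders, and pid_fd is a pidfd as the v2 diff defines it):

	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
	union bpf_iter_link_info linfo;
	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	struct bpf_link *link;

	memset(&linfo, 0, sizeof(linfo));
	linfo.task.type = BPF_TASK_ITER_TID;
	linfo.task.pid_fd = pidfd;	/* v2 takes a pidfd, per the diff above */
	opts.link_info = &linfo;
	opts.link_info_len = sizeof(linfo);

	/* skel->progs.dump_task stands in for any iter/task program */
	link = bpf_program__attach_iter(skel->progs.dump_task, &opts);
	if (!link)
		return -errno;	/* libbpf sets errno on failure */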

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
  2022-08-02 16:47     ` Kui-Feng Lee
@ 2022-08-02 21:19       ` Andrii Nakryiko
  0 siblings, 0 replies; 12+ messages in thread
From: Andrii Nakryiko @ 2022-08-02 21:19 UTC (permalink / raw)
  To: Kui-Feng Lee
  Cc: alexei.starovoitov, daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Tue, Aug 2, 2022 at 9:48 AM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Mon, 2022-08-01 at 18:49 -0700, Alexei Starovoitov wrote:
> > On Mon, Aug 1, 2022 at 4:27 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
> > >
> > > Allow creating an iterator that loops through resources of one
> > > task/thread.
> > >
> > > People could only create iterators to loop through all resources of
> > > files, vma, and tasks in the system, even though they were
> > > interested
> > > in only the resources of a specific task or process.  By passing
> > > the additional parameters, people can now create an iterator to go
> > > through all resources or only the resources of a task.
> > >
> > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > ---
> > >  include/linux/bpf.h            |  4 ++
> > >  include/uapi/linux/bpf.h       | 23 +++++++++
> > >  kernel/bpf/task_iter.c         | 93 ++++++++++++++++++++++++++----
> > > ----
> > >  tools/include/uapi/linux/bpf.h | 23 +++++++++
> > >  4 files changed, 121 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 11950029284f..3c26dbfc9cef 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > *pathname, int flags);
> > >
> > >  struct bpf_iter_aux_info {
> > >         struct bpf_map *map;
> > > +       struct {
> > > +               u32     tid;
> > > +               u8      type;
> > > +       } task;
> > >  };
> > >
> > >  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index ffcbf79a556b..ed5ba501609f 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
> > >         __u32   attach_type;            /* program attach type
> > > (enum bpf_attach_type) */
> > >  };
> > >
> > > +enum bpf_task_iter_type {
> > > +       BPF_TASK_ITER_ALL = 0,
> > > +       BPF_TASK_ITER_TID,
> > > +};
> > > +
> > >  union bpf_iter_link_info {
> > >         struct {
> > >                 __u32   map_fd;
> > >         } map;
> > > +       /*
> > > +        * Parameters of task iterators.
> > > +        */
> > > +       struct {
> > > +               __u32   pid_fd;
> > > +               /*
> > > +                * The type of the iterator.
> > > +                *
> > > +                * It can be one of enum bpf_task_iter_type.
> > > +                *
> > > +                * BPF_TASK_ITER_ALL (default)
> > > +                *      The iterator iterates over resources of
> > > +                *      every process.
> > > +                *
> > > +                * BPF_TASK_ITER_TID
> > > +                *      You should also set *pid_fd* to iterate
> > > over one task.
> > > +                */
> > > +               __u8    type;   /* BPF_TASK_ITER_* */
> >
> > __u8 might be a pain for future extensibility.
>
> Do you mean the problem caused by padding?

Not Alexei, but I agree that there is no reason to try to save a few
bytes here. Let's use u32 or just a plain 32-bit enum. Please also put
it in front of pid_fd (first field in this substruct), so that it's
easier to extend this with more information about "iteration target"
(e.g., if we later want to iterate tasks within a cgroup, we might end
up specifying cgroup_id, which I believe is 64-bit, so it would be
nice to be able to just do a union across {pid_fd, pid, cgroup_fd,
cgroup_id}).
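
A sketch of what that suggested layout could look like (field names
illustrative, not a final UAPI):

	union bpf_iter_link_info {
		struct {
			__u32	map_fd;
		} map;
		struct {
			__u32	type;	/* enum bpf_task_iter_type; first
					 * field, for extensibility */
			union {
				__u32	pid_fd;
				__u32	tid;
				__u32	cgroup_fd;
				__u64	cgroup_id;
			};
		} task;
	};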

>
> > big vs little endian will be another potential issue.
>
> Do we need binary compatibility across different platforms?
> I don't get the point about endianness.  Could you explain it more?
>
> >
> > Maybe use enum bpf_task_iter_type type; here and
> > move the comment to enum def ?
> > Or rename it to '__u32 flags;' ?
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH bpf-next v2 1/3] bpf: Parameterize task iterators.
  2022-08-02 21:17       ` Andrii Nakryiko
@ 2022-08-04 23:05         ` Kui-Feng Lee
  0 siblings, 0 replies; 12+ messages in thread
From: Kui-Feng Lee @ 2022-08-04 23:05 UTC (permalink / raw)
  To: andrii.nakryiko; +Cc: daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Tue, 2022-08-02 at 14:17 -0700, Andrii Nakryiko wrote:
> On Tue, Aug 2, 2022 at 9:42 AM Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Mon, 2022-08-01 at 20:30 -0700, Andrii Nakryiko wrote:
> > > On Mon, Aug 1, 2022 at 4:27 PM Kui-Feng Lee <kuifeng@fb.com>
> > > wrote:
> > > > 
> > > > Allow creating an iterator that loops through resources of one
> > > > task/thread.
> > > > 
> > > > People could only create iterators to loop through all
> > > > resources of
> > > > files, vma, and tasks in the system, even though they were
> > > > interested
> > > > in only the resources of a specific task or process.  By passing
> > > > the additional parameters, people can now create an iterator to go
> > > > through all resources or only the resources of a task.
> > > > 
> > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > ---
> > > >  include/linux/bpf.h            |  4 ++
> > > >  include/uapi/linux/bpf.h       | 23 +++++++++
> > > >  kernel/bpf/task_iter.c         | 93
> > > > ++++++++++++++++++++++++++----
> > > > ----
> > > >  tools/include/uapi/linux/bpf.h | 23 +++++++++
> > > >  4 files changed, 121 insertions(+), 22 deletions(-)
> > > > 
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index 11950029284f..3c26dbfc9cef 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > > *pathname, int flags);
> > > > 
> > > >  struct bpf_iter_aux_info {
> > > >         struct bpf_map *map;
> > > > +       struct {
> > > > +               u32     tid;
> > > > +               u8      type;
> > > > +       } task;
> > > >  };
> > > > 
> > > >  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > > > diff --git a/include/uapi/linux/bpf.h
> > > > b/include/uapi/linux/bpf.h
> > > > index ffcbf79a556b..ed5ba501609f 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
> > > >         __u32   attach_type;            /* program attach type
> > > > (enum bpf_attach_type) */
> > > >  };
> > > > 
> > > > +enum bpf_task_iter_type {
> > > > +       BPF_TASK_ITER_ALL = 0,
> > > > +       BPF_TASK_ITER_TID,
> > > > +};
> > > > +
> > > >  union bpf_iter_link_info {
> > > >         struct {
> > > >                 __u32   map_fd;
> > > >         } map;
> > > > +       /*
> > > > +        * Parameters of task iterators.
> > > > +        */
> > > > +       struct {
> > > > +               __u32   pid_fd;
> > > 
> > > I was a bit late to the discussion about pidfd vs plain pid. I
> > > think
> > > we should support both in this API. While pid_fd has some nice
> > > guarantees like avoiding the risk of accidental PID reuse, in a
> > > lot
> > > (if not all) cases where task/task_vma/task_file iterators are
> > > going
> > > to be used this is never a risk, because pid will usually come
> > > from
> > > some tracing BPF program (kprobe/tp/fentry/etc), like in case of
> > > profiling, and then will be used by user-space almost immediately
> > > to
> > > query some additional information (fetching relevant vma
> > > information
> > > for profiling use case). So the main benefit of pidfd is not
> > > that relevant
> > > for BPF tracing use cases, because PIDs are not going to be
> > > reused so
> > > fast within such a short time frame.
> > > 
> > > But pidfd does have downsides. It requires 2 syscalls (pidfd_open
> > > and
> > > close) for each PID, it creates struct file for each such active
> > > pidfd. So it will have non-trivial overhead for high-frequency
> > > BPF
> > > iterator use cases (imagine querying some simple stats for a big
> > > set
> > > of tasks, frequently: you'll spend more time in pidfd syscalls
> > > and
> > > more resources just keeping corresponding struct file open than
> > > actually doing useful BPF work). For simple BPF iter cases it
> > > will
> > > unnecessarily complicate program flow while giving no benefit in
> > > return.
> > 
> > That is a good point about the extra syscalls.
> > 
> > > 
> > > So I propose we support both in UAPI. Internally either way we
> > > resolve
> > > to plain pid/tid, so this won't cause added maintenance burden.
> > > But
> > > simple cases will stay simple, while more long-lived and/or
> > > complicated ones will still be supported. We then can have
> > > BPF_TASK_ITER_PIDFD vs BPF_TASK_ITER_TID to differentiate whether
> > > the
> > > above __u32 pid_fd (which we should probably rename to something
> > > more
> > > generic like "target") is pid FD or TID/PID. See also below about
> > > TID
> > > vs PID.
> > > 
> > > > +               /*
> > > > +                * The type of the iterator.
> > > > +                *
> > > > +                * It can be one of enum bpf_task_iter_type.
> > > > +                *
> > > > +                * BPF_TASK_ITER_ALL (default)
> > > > +                *      The iterator iterates over resources of
> > > > +                *      every process.
> > > > +                *
> > > > +                * BPF_TASK_ITER_TID
> > > > +                *      You should also set *pid_fd* to iterate
> > > > over one task.
> > > 
> > > naming nit: we should decide whether we use TID (thread) and PID
> > > (process) terminology (more usual for user-space) or PID (process
> > > ==
> > > task == user-space thread) and TGID (thread group, i.e. user-
> > > space
> > > process). I haven't investigated much what we use most
> > > consistently,
> > > but curious to hear what others think.
> > > 
> > > Also I can see use-cases where we want to iterate just the
> > > specified task (i.e., just the specified thread) vs all the tasks
> > > that belong to the same process group (i.e., the threads within a
> > > process). Naming TBD, but we should have BPF_TASK_ITER_TID and
> > > BPF_TASK_ITER_TGID (or some other naming).
> > 
> > 
> > I discussed iterators over the resources of all tasks of a process
> > with Yonghong.  User code should create iterators for each thread
> > of the process if necessary.  We may add tgid support if it is
> > highly demanded.
> >
> > In a discussion about using pidfd, people mentioned extending pidfd
> > to threads if there is a good use-case.  That also applies to our
> > case.  Most of the time, if not always, vma & files are shared by
> > all threads of a process, so iterating over the resources of every
> > thread of a process brings no obvious benefit.  The same holds for
> > an iterator over the resources of a specific thread instead of a
> > process.
> > 
> 
> Ok, so two different points here.
> 
> First, TID (thread) vs TGID (process) modes. I'd define TGID mode as:
> the user specifies some TID and we resolve it to the thread group
> leader's TID (that is, we resolve the thread to its process), and then
> iterate all threads within that process. In TID (thread) mode, we
> accept the specified TID as exactly the thread we iterate (even if
> it's the thread group leader, we iterate only that specific thread,
> not all threads in the process).
> 
> Second, about the point that all threads within a process share vma,
> file table, etc. That's true. But you are forgetting about iter/task,
> which iterates just tasks; TGID mode is very useful for that use
> case. For task_vma/task_file we can probably keep the logic we have
> today: if a thread has the same file table or mm_struct as the thread
> group leader, we skip that thread when iterating vmas and files.

Yes, you are right.  Iterating all tasks in a process is useful.
As in our offline discussion, it is worth supporting pidfd, tid, and
tgid.  For pidfd, it would work just like tgid; the kernel would
simply translate the pidfd to a tgid.
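
A minimal sketch of that translation using the existing pidfd helpers
(illustrative only; not necessarily the code that lands):

	static int pidfd_to_tgid(int pidfd, pid_t *tgid)
	{
		unsigned int f_flags;
		struct task_struct *task;
		struct pid *pid;

		pid = pidfd_get_pid(pidfd, &f_flags);	/* takes a pid ref */
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		task = get_pid_task(pid, PIDTYPE_TGID);
		put_pid(pid);
		if (!task)
			return -ESRCH;

		*tgid = task->tgid;
		put_task_struct(task);
		return 0;
	}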

> 
> Thoughts?
> 
> 
> > > 
> > > One might ask why we need a single-task mode if we can always
> > > stop iteration from the BPF program, but that is trivial only for
> > > iter/task, while for iter/task_vma and iter/task_file it becomes
> > > inconvenient to detect the switch from one task to another. It
> > > costs us essentially nothing to support this mode, so I advocate
> > > doing that.
> > > 
> > > I have similar thoughts about cgroup iteration modes and actually
> > > supporting cgroup_fd as target for task iterators (which will
> > > mean
> > > iterating tasks belonging to provided cgroup(s)), but I'll reply
> > > on
> > > cgroup iterator patch first, and we can just reuse the same
> > > cgroup
> > > target specification between iter/cgroup and iter/task
> > > afterwards.
> > > 
> > > 
> > > > +                */
> > > > +               __u8    type;   /* BPF_TASK_ITER_* */
> > > > +       } task;
> > > >  };
> > > > 
> > > 
> > > [...]
> > 


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread

Thread overview: 12+ messages
2022-08-01 23:26 [PATCH bpf-next v2 0/3] Parameterize task iterators Kui-Feng Lee
2022-08-01 23:26 ` [PATCH bpf-next v2 1/3] bpf: " Kui-Feng Lee
2022-08-02  1:49   ` Alexei Starovoitov
2022-08-02 16:47     ` Kui-Feng Lee
2022-08-02 21:19       ` Andrii Nakryiko
2022-08-02  3:30   ` Andrii Nakryiko
2022-08-02 16:42     ` Kui-Feng Lee
2022-08-02 21:17       ` Andrii Nakryiko
2022-08-04 23:05         ` Kui-Feng Lee
2022-08-01 23:26 ` [PATCH bpf-next v2 2/3] bpf: Handle bpf_link_info for the parameterized task BPF iterators Kui-Feng Lee
2022-08-01 23:26 ` [PATCH bpf-next v2 3/3] selftests/bpf: Test " Kui-Feng Lee
2022-08-01 23:35 ` [PATCH bpf-next v2 0/3] Parameterize task iterators Kui-Feng Lee
