All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf-next 0/3] Parameterize task iterators.
@ 2022-07-26  5:17 Kui-Feng Lee
  2022-07-26  5:17 ` [PATCH bpf-next 1/3] bpf: " Kui-Feng Lee
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-26  5:17 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Allow creating an iterator that loops through resources of one task/thread.

People could only create iterators to loop through all resources of
files, vma, and tasks in the system, even though they were interested in only the
resources of a specific task or process.  Passing the additional
parameters, people can now create an iterator to go through all
resources or only the resources of a task.

Major Changes:

 - Add new parameters in bpf_iter_link_info to indicate to go through
   all tasks or to go through a specific task.

 - Change the implementations of BPF iterators of vma, files, and
   tasks to allow going through only the resources of a specific task.

 - Provide the arguments of parameterized task iterators in
   bpf_link_info.

Kui-Feng Lee (3):
  bpf: Parameterize task iterators.
  bpf: Handle bpf_link_info for the parameterized task BPF iterators.
  selftests/bpf: Test parameterized task BPF iterators.

 include/linux/bpf.h                           |   4 +
 include/uapi/linux/bpf.h                      |  27 ++++
 kernel/bpf/task_iter.c                        |  91 +++++++++---
 tools/include/uapi/linux/bpf.h                |  27 ++++
 .../selftests/bpf/prog_tests/bpf_iter.c       | 131 +++++++++++++++---
 .../selftests/bpf/prog_tests/btf_dump.c       |   2 +-
 .../selftests/bpf/progs/bpf_iter_task.c       |   9 ++
 .../selftests/bpf/progs/bpf_iter_task_file.c  |   7 +
 .../selftests/bpf/progs/bpf_iter_task_vma.c   |   6 +-
 9 files changed, 258 insertions(+), 46 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-26  5:17 [PATCH bpf-next 0/3] Parameterize task iterators Kui-Feng Lee
@ 2022-07-26  5:17 ` Kui-Feng Lee
  2022-07-26 12:13   ` Jiri Olsa
  2022-07-26  5:17 ` [PATCH bpf-next 2/3] bpf: Handle bpf_link_info for the parameterized task BPF iterators Kui-Feng Lee
  2022-07-26  5:17 ` [PATCH bpf-next 3/3] selftests/bpf: Test " Kui-Feng Lee
  2 siblings, 1 reply; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-26  5:17 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Allow creating an iterator that loops through resources of one task/thread.

People could only create iterators to loop through all resources of
files, vma, and tasks in the system, even though they were interested
in only the resources of a specific task or process.  Passing the
additional parameters, people can now create an iterator to go
through all resources or only the resources of a task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 include/linux/bpf.h            |  4 ++
 include/uapi/linux/bpf.h       | 23 ++++++++++
 kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++---------
 tools/include/uapi/linux/bpf.h | 23 ++++++++++
 4 files changed, 109 insertions(+), 22 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 11950029284f..c8d164404e20 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
 
 struct bpf_iter_aux_info {
 	struct bpf_map *map;
+	struct {
+		__u32	tid;
+		u8	type;
+	} task;
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ffcbf79a556b..991141cacb9e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_task_iter_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+	/*
+	 * Parameters of task iterators.
+	 */
+	struct {
+		__u32	tid;
+		/*
+		 * The type of the iterator.
+		 *
+		 * It can be one of enum bpf_task_iter_type.
+		 *
+		 * BPF_TASK_ITER_ALL (default)
+		 *	The iterator iterates over resources of every process.
+		 *
+		 * BPF_TASK_ITER_TID
+		 *	You should also set *tid* to iterate over one task.
+		 */
+		__u8	type;	/* BPF_TASK_ITER_* */
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..7979aacb651e 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -12,6 +12,8 @@
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
+	u32	tid;
+	u8	type;
 };
 
 struct bpf_iter_seq_task_info {
@@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
 	u32 tid;
 };
 
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
 					     u32 *tid,
 					     bool skip_if_dup_files)
 {
 	struct task_struct *task = NULL;
 	struct pid *pid;
 
+	if (common->type == BPF_TASK_ITER_TID) {
+		if (*tid)
+			return NULL;
+		rcu_read_lock();
+		pid = find_pid_ns(common->tid, common->ns);
+		if (pid) {
+			task = get_pid_task(pid, PIDTYPE_PID);
+			*tid = common->tid;
+		}
+		rcu_read_unlock();
+		return task;
+	}
+
 	rcu_read_lock();
 retry:
-	pid = find_ge_pid(*tid, ns);
+	pid = find_ge_pid(*tid, common->ns);
 	if (pid) {
-		*tid = pid_nr_ns(pid, ns);
+		*tid = pid_nr_ns(pid, common->ns);
 		task = get_pid_task(pid, PIDTYPE_PID);
 		if (!task) {
 			++*tid;
@@ -56,7 +71,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_task_info *info = seq->private;
 	struct task_struct *task;
 
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+	task = task_seq_get_next(&info->common, &info->tid, false);
+
 	if (!task)
 		return NULL;
 
@@ -73,7 +89,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	++*pos;
 	++info->tid;
 	put_task_struct((struct task_struct *)v);
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+
+	task = task_seq_get_next(&info->common, &info->tid, false);
 	if (!task)
 		return NULL;
 
@@ -117,6 +134,18 @@ static void task_seq_stop(struct seq_file *seq, void *v)
 		put_task_struct((struct task_struct *)v);
 }
 
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+				union bpf_iter_link_info *linfo,
+				struct bpf_iter_aux_info *aux)
+{
+	if (linfo->task.type == BPF_TASK_ITER_ALL && linfo->task.tid)
+		return -EINVAL;
+
+	aux->task.type = linfo->task.type;
+	aux->task.tid = linfo->task.tid;
+	return 0;
+}
+
 static const struct seq_operations task_seq_ops = {
 	.start	= task_seq_start,
 	.next	= task_seq_next,
@@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info {
 static struct file *
 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 	struct task_struct *curr_task;
 	unsigned int curr_fd = info->fd;
 
@@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 		curr_task = info->task;
 		curr_fd = info->fd;
 	} else {
-                curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
                 if (!curr_task) {
                         info->task = NULL;
-                        info->tid = curr_tid;
                         return NULL;
                 }
 
-                /* set info->task and info->tid */
+		/* set info->task */
 		info->task = curr_task;
-		if (curr_tid == info->tid) {
+		if (saved_tid == info->tid)
 			curr_fd = info->fd;
-		} else {
-			info->tid = curr_tid;
+		else
 			curr_fd = 0;
-		}
 	}
 
 	rcu_read_lock();
@@ -186,9 +211,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 	/* the current task is done, go to the next task */
 	rcu_read_unlock();
 	put_task_struct(curr_task);
+
+	if (info->common.type == BPF_TASK_ITER_TID) {
+		info->task = NULL;
+		return NULL;
+	}
+
 	info->task = NULL;
 	info->fd = 0;
-	curr_tid = ++(info->tid);
+	saved_tid = ++(info->tid);
 	goto again;
 }
 
@@ -269,6 +300,8 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
 	struct bpf_iter_seq_task_common *common = priv_data;
 
 	common->ns = get_pid_ns(task_active_pid_ns(current));
+	common->type = aux->task.type;
+	common->tid = aux->task.tid;
 	return 0;
 }
 
@@ -307,11 +340,10 @@ enum bpf_task_vma_iter_find_op {
 static struct vm_area_struct *
 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
 	enum bpf_task_vma_iter_find_op op;
 	struct vm_area_struct *curr_vma;
 	struct task_struct *curr_task;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 
 	/* If this function returns a non-NULL vma, it holds a reference to
 	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +403,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 		}
 	} else {
 again:
-		curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
 		if (!curr_task) {
-			info->tid = curr_tid + 1;
+			info->tid++;
 			goto finish;
 		}
 
-		if (curr_tid != info->tid) {
-			info->tid = curr_tid;
+		if (saved_tid != info->tid) {
 			/* new task, process the first vma */
 			op = task_vma_iter_first_vma;
 		} else {
@@ -430,9 +461,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 	return curr_vma;
 
 next_task:
+	if (info->common.type == BPF_TASK_ITER_TID)
+		goto finish;
+
 	put_task_struct(curr_task);
 	info->task = NULL;
-	curr_tid++;
+	info->tid++;
 	goto again;
 
 finish:
@@ -533,6 +567,7 @@ static const struct bpf_iter_seq_info task_seq_info = {
 
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
@@ -551,6 +586,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
 
 static struct bpf_iter_reg task_file_reg_info = {
 	.target			= "task_file",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
@@ -571,6 +607,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
 
 static struct bpf_iter_reg task_vma_reg_info = {
 	.target			= "task_vma",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ffcbf79a556b..991141cacb9e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_task_iter_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+	/*
+	 * Parameters of task iterators.
+	 */
+	struct {
+		__u32	tid;
+		/*
+		 * The type of the iterator.
+		 *
+		 * It can be one of enum bpf_task_iter_type.
+		 *
+		 * BPF_TASK_ITER_ALL (default)
+		 *	The iterator iterates over resources of every process.
+		 *
+		 * BPF_TASK_ITER_TID
+		 *	You should also set *tid* to iterate over one task.
+		 */
+		__u8	type;	/* BPF_TASK_ITER_* */
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH bpf-next 2/3] bpf: Handle bpf_link_info for the parameterized task BPF iterators.
  2022-07-26  5:17 [PATCH bpf-next 0/3] Parameterize task iterators Kui-Feng Lee
  2022-07-26  5:17 ` [PATCH bpf-next 1/3] bpf: " Kui-Feng Lee
@ 2022-07-26  5:17 ` Kui-Feng Lee
  2022-07-26  5:17 ` [PATCH bpf-next 3/3] selftests/bpf: Test " Kui-Feng Lee
  2 siblings, 0 replies; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-26  5:17 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Add new fields to bpf_link_info that users can query it through
bpf_obj_get_info_by_fd().

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/task_iter.c         | 10 ++++++++++
 tools/include/uapi/linux/bpf.h |  4 ++++
 3 files changed, 18 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 991141cacb9e..3121e9c66b7e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6158,6 +6158,10 @@ struct bpf_link_info {
 				struct {
 					__u32 map_id;
 				} map;
+				struct {
+					__u32 tid;
+					__u8 type; /* one of enum bpf_task_iter_type */
+				} task;
 			};
 		} iter;
 		struct  {
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 7979aacb651e..26655d681776 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -565,6 +565,13 @@ static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
 };
 
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+	info->iter.task.tid = aux->task.tid;
+	info->iter.task.type = aux->task.type;
+	return 0;
+}
+
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
 	.attach_target		= bpf_iter_attach_task,
@@ -575,6 +582,7 @@ static struct bpf_iter_reg task_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -596,6 +604,7 @@ static struct bpf_iter_reg task_file_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_file_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -617,6 +626,7 @@ static struct bpf_iter_reg task_vma_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_vma_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 991141cacb9e..3121e9c66b7e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6158,6 +6158,10 @@ struct bpf_link_info {
 				struct {
 					__u32 map_id;
 				} map;
+				struct {
+					__u32 tid;
+					__u8 type; /* one of enum bpf_task_iter_type */
+				} task;
 			};
 		} iter;
 		struct  {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH bpf-next 3/3] selftests/bpf: Test parameterized task BPF iterators.
  2022-07-26  5:17 [PATCH bpf-next 0/3] Parameterize task iterators Kui-Feng Lee
  2022-07-26  5:17 ` [PATCH bpf-next 1/3] bpf: " Kui-Feng Lee
  2022-07-26  5:17 ` [PATCH bpf-next 2/3] bpf: Handle bpf_link_info for the parameterized task BPF iterators Kui-Feng Lee
@ 2022-07-26  5:17 ` Kui-Feng Lee
  2 siblings, 0 replies; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-26  5:17 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, kernel-team, yhs; +Cc: Kui-Feng Lee

Test iterators of vma, files, and tasks of a specific task.

Ensure the API works appropriately for both going through all tasks and
going through a particular task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 .../selftests/bpf/prog_tests/bpf_iter.c       | 131 +++++++++++++++---
 .../selftests/bpf/prog_tests/btf_dump.c       |   2 +-
 .../selftests/bpf/progs/bpf_iter_task.c       |   9 ++
 .../selftests/bpf/progs/bpf_iter_task_file.c  |   7 +
 .../selftests/bpf/progs/bpf_iter_task_vma.c   |   6 +-
 5 files changed, 131 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index a33874b081b6..f1116354f982 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -42,13 +42,13 @@ static void test_btf_id_or_null(void)
 	}
 }
 
-static void do_dummy_read(struct bpf_program *prog)
+static void do_dummy_read(struct bpf_program *prog, struct bpf_iter_attach_opts *opts)
 {
 	struct bpf_link *link;
 	char buf[16] = {};
 	int iter_fd, len;
 
-	link = bpf_program__attach_iter(prog, NULL);
+	link = bpf_program__attach_iter(prog, opts);
 	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		return;
 
@@ -91,7 +91,7 @@ static void test_ipv6_route(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_ipv6_route__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_ipv6_route);
+	do_dummy_read(skel->progs.dump_ipv6_route, NULL);
 
 	bpf_iter_ipv6_route__destroy(skel);
 }
@@ -104,7 +104,7 @@ static void test_netlink(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_netlink__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_netlink);
+	do_dummy_read(skel->progs.dump_netlink, NULL);
 
 	bpf_iter_netlink__destroy(skel);
 }
@@ -117,20 +117,70 @@ static void test_bpf_map(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_map__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_bpf_map);
+	do_dummy_read(skel->progs.dump_bpf_map, NULL);
 
 	bpf_iter_bpf_map__destroy(skel);
 }
 
+static void check_bpf_link_info(const struct bpf_program *prog)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+	struct bpf_link_info info = {};
+	__u32 info_len;
+	struct bpf_link *link;
+	int err;
+
+	linfo.task.tid = getpid();
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(prog, &opts);
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
+		return;
+
+	info_len = sizeof(info);
+	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
+	if (ASSERT_OK(err, "bpf_obj_get_info_by_fd")) {
+		ASSERT_EQ(info.iter.task.type, BPF_TASK_ITER_TID, "check_task_type");
+		ASSERT_EQ(info.iter.task.tid, getpid(), "check_task_tid");
+	}
+
+	bpf_link__destroy(link);
+}
+
 static void test_task(void)
 {
 	struct bpf_iter_task *skel;
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
 
 	skel = bpf_iter_task__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_task);
+	linfo.task.tid = getpid();
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	skel->bss->tid = getpid();
+
+	do_dummy_read(skel->progs.dump_task, &opts);
+
+	ASSERT_EQ(skel->bss->num_unknown_tid, 0, "check_num_unknown_tid");
+	ASSERT_EQ(skel->bss->num_known_tid, 1, "check_num_known_tid");
+
+	skel->bss->num_unknown_tid = 0;
+	skel->bss->num_known_tid = 0;
+
+	do_dummy_read(skel->progs.dump_task, NULL);
+
+	ASSERT_GE(skel->bss->num_unknown_tid, 0, "check_num_unknown_tid");
+	ASSERT_EQ(skel->bss->num_known_tid, 1, "check_num_known_tid");
+
+	check_bpf_link_info(skel->progs.dump_task);
 
 	bpf_iter_task__destroy(skel);
 }
@@ -143,7 +193,7 @@ static void test_task_sleepable(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_task_sleepable);
+	do_dummy_read(skel->progs.dump_task_sleepable, NULL);
 
 	ASSERT_GT(skel->bss->num_expected_failure_copy_from_user_task, 0,
 		  "num_expected_failure_copy_from_user_task");
@@ -161,8 +211,8 @@ static void test_task_stack(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_task_stack__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_task_stack);
-	do_dummy_read(skel->progs.get_task_user_stacks);
+	do_dummy_read(skel->progs.dump_task_stack, NULL);
+	do_dummy_read(skel->progs.get_task_user_stacks, NULL);
 
 	bpf_iter_task_stack__destroy(skel);
 }
@@ -174,7 +224,9 @@ static void *do_nothing(void *arg)
 
 static void test_task_file(void)
 {
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
 	struct bpf_iter_task_file *skel;
+	union bpf_iter_link_info linfo;
 	pthread_t thread_id;
 	void *ret;
 
@@ -188,15 +240,31 @@ static void test_task_file(void)
 		  "pthread_create"))
 		goto done;
 
-	do_dummy_read(skel->progs.dump_task_file);
+	linfo.task.tid = getpid();
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	do_dummy_read(skel->progs.dump_task_file, &opts);
 
 	if (!ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL,
 		  "pthread_join"))
 		goto done;
 
 	ASSERT_EQ(skel->bss->count, 0, "check_count");
+	ASSERT_EQ(skel->bss->unique_tgid_count, 1, "check_unique_tgid_count");
+
+	skel->bss->count = 0;
+	skel->bss->unique_tgid_count = 0;
+
+	do_dummy_read(skel->progs.dump_task_file, NULL);
 
-done:
+	ASSERT_GE(skel->bss->count, 0, "check_count");
+	ASSERT_GE(skel->bss->unique_tgid_count, 1, "check_unique_tgid_count");
+
+	check_bpf_link_info(skel->progs.dump_task_file);
+
+ done:
 	bpf_iter_task_file__destroy(skel);
 }
 
@@ -274,7 +342,7 @@ static void test_tcp4(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_tcp4__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_tcp4);
+	do_dummy_read(skel->progs.dump_tcp4, NULL);
 
 	bpf_iter_tcp4__destroy(skel);
 }
@@ -287,7 +355,7 @@ static void test_tcp6(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_tcp6__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_tcp6);
+	do_dummy_read(skel->progs.dump_tcp6, NULL);
 
 	bpf_iter_tcp6__destroy(skel);
 }
@@ -300,7 +368,7 @@ static void test_udp4(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_udp4__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_udp4);
+	do_dummy_read(skel->progs.dump_udp4, NULL);
 
 	bpf_iter_udp4__destroy(skel);
 }
@@ -313,7 +381,7 @@ static void test_udp6(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_udp6__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_udp6);
+	do_dummy_read(skel->progs.dump_udp6, NULL);
 
 	bpf_iter_udp6__destroy(skel);
 }
@@ -326,7 +394,7 @@ static void test_unix(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_unix__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_unix);
+	do_dummy_read(skel->progs.dump_unix, NULL);
 
 	bpf_iter_unix__destroy(skel);
 }
@@ -988,7 +1056,7 @@ static void test_bpf_sk_storage_get(void)
 	if (!ASSERT_OK(err, "bpf_map_update_elem"))
 		goto close_socket;
 
-	do_dummy_read(skel->progs.fill_socket_owner);
+	do_dummy_read(skel->progs.fill_socket_owner, NULL);
 
 	err = bpf_map_lookup_elem(map_fd, &sock_fd, &val);
 	if (CHECK(err || val != getpid(), "bpf_map_lookup_elem",
@@ -996,7 +1064,7 @@ static void test_bpf_sk_storage_get(void)
 	    getpid(), val, err))
 		goto close_socket;
 
-	do_dummy_read(skel->progs.negate_socket_local_storage);
+	do_dummy_read(skel->progs.negate_socket_local_storage, NULL);
 
 	err = bpf_map_lookup_elem(map_fd, &sock_fd, &val);
 	CHECK(err || val != -getpid(), "bpf_map_lookup_elem",
@@ -1116,7 +1184,7 @@ static void test_link_iter(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_link__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_bpf_link);
+	do_dummy_read(skel->progs.dump_bpf_link, NULL);
 
 	bpf_iter_bpf_link__destroy(skel);
 }
@@ -1129,7 +1197,7 @@ static void test_ksym_iter(void)
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_ksym__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_ksym);
+	do_dummy_read(skel->progs.dump_ksym, NULL);
 
 	bpf_iter_ksym__destroy(skel);
 }
@@ -1154,7 +1222,7 @@ static void str_strip_first_line(char *str)
 	*dst = '\0';
 }
 
-static void test_task_vma(void)
+static void test_task_vma_(struct bpf_iter_attach_opts *opts)
 {
 	int err, iter_fd = -1, proc_maps_fd = -1;
 	struct bpf_iter_task_vma *skel;
@@ -1166,13 +1234,14 @@ static void test_task_vma(void)
 		return;
 
 	skel->bss->pid = getpid();
+	skel->bss->one_task = opts ? 1 : 0;
 
 	err = bpf_iter_task_vma__load(skel);
 	if (!ASSERT_OK(err, "bpf_iter_task_vma__load"))
 		goto out;
 
 	skel->links.proc_maps = bpf_program__attach_iter(
-		skel->progs.proc_maps, NULL);
+		skel->progs.proc_maps, opts);
 
 	if (!ASSERT_OK_PTR(skel->links.proc_maps, "bpf_program__attach_iter")) {
 		skel->links.proc_maps = NULL;
@@ -1211,12 +1280,30 @@ static void test_task_vma(void)
 	str_strip_first_line(proc_maps_output);
 
 	ASSERT_STREQ(task_vma_output, proc_maps_output, "compare_output");
+
+	check_bpf_link_info(skel->progs.proc_maps);
+
 out:
 	close(proc_maps_fd);
 	close(iter_fd);
 	bpf_iter_task_vma__destroy(skel);
 }
 
+static void test_task_vma(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.tid = getpid();
+	linfo.task.type = BPF_TASK_ITER_TID;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	test_task_vma_(&opts);
+	test_task_vma_(NULL);
+}
+
 void test_bpf_iter(void)
 {
 	if (test__start_subtest("btf_id_or_null"))
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index 5fce7008d1ff..32c34ce9cbeb 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d,
 
 	/* union with nested struct */
 	TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT,
-			   "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}",
+			   "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.task = (struct){.tid = (__u32)1,},}",
 			   { .map = { .map_fd = 1 }});
 
 	/* struct skb with nested structs/unions; because type output is so
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
index d22741272692..96131b9a1caa 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -6,6 +6,10 @@
 
 char _license[] SEC("license") = "GPL";
 
+uint32_t tid = 0;
+int num_unknown_tid = 0;
+int num_known_tid = 0;
+
 SEC("iter/task")
 int dump_task(struct bpf_iter__task *ctx)
 {
@@ -18,6 +22,11 @@ int dump_task(struct bpf_iter__task *ctx)
 		return 0;
 	}
 
+	if (task->pid != tid)
+		num_unknown_tid++;
+	else
+		num_known_tid++;
+
 	if (ctx->meta->seq_num == 0)
 		BPF_SEQ_PRINTF(seq, "    tgid      gid\n");
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
index 6e7b400888fe..031455ed8748 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -7,6 +7,8 @@ char _license[] SEC("license") = "GPL";
 
 int count = 0;
 int tgid = 0;
+int last_tgid = -1;
+int unique_tgid_count = 0;
 
 SEC("iter/task_file")
 int dump_task_file(struct bpf_iter__task_file *ctx)
@@ -27,6 +29,11 @@ int dump_task_file(struct bpf_iter__task_file *ctx)
 	if (tgid == task->tgid && task->tgid != task->pid)
 		count++;
 
+	if (last_tgid != task->tgid) {
+		last_tgid = task->tgid;
+		unique_tgid_count++;
+	}
+
 	BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
 		       (long)file->f_op);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
index 4ea6a37d1345..44f4a31c2ddd 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
@@ -20,6 +20,7 @@ char _license[] SEC("license") = "GPL";
 #define D_PATH_BUF_SIZE 1024
 char d_path_buf[D_PATH_BUF_SIZE] = {};
 __u32 pid = 0;
+__u32 one_task = 0;
 
 SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
 {
@@ -33,8 +34,11 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
 		return 0;
 
 	file = vma->vm_file;
-	if (task->tgid != pid)
+	if (task->tgid != pid) {
+		if (one_task)
+			BPF_SEQ_PRINTF(seq, "unexpected task (%d != %d)", task->tgid, pid);
 		return 0;
+	}
 	perm_str[0] = (vma->vm_flags & VM_READ) ? 'r' : '-';
 	perm_str[1] = (vma->vm_flags & VM_WRITE) ? 'w' : '-';
 	perm_str[2] = (vma->vm_flags & VM_EXEC) ? 'x' : '-';
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-26  5:17 ` [PATCH bpf-next 1/3] bpf: " Kui-Feng Lee
@ 2022-07-26 12:13   ` Jiri Olsa
  2022-07-26 13:19     ` Jiri Olsa
  2022-07-27  6:56     ` Kui-Feng Lee
  0 siblings, 2 replies; 22+ messages in thread
From: Jiri Olsa @ 2022-07-26 12:13 UTC (permalink / raw)
  To: Kui-Feng Lee; +Cc: bpf, ast, daniel, andrii, kernel-team, yhs

On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> Allow creating an iterator that loops through resources of one task/thread.
> 
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
> 
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>  include/linux/bpf.h            |  4 ++
>  include/uapi/linux/bpf.h       | 23 ++++++++++
>  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++---------
>  tools/include/uapi/linux/bpf.h | 23 ++++++++++
>  4 files changed, 109 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 11950029284f..c8d164404e20 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>  
>  struct bpf_iter_aux_info {
>  	struct bpf_map *map;
> +	struct {
> +		__u32	tid;

should be just u32 ?

> +		u8	type;
> +	} task;
>  };
>  

SNIP

>  
>  /* BPF syscall commands, see bpf(2) man-page for more details. */
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 8c921799def4..7979aacb651e 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -12,6 +12,8 @@
>  
>  struct bpf_iter_seq_task_common {
>  	struct pid_namespace *ns;
> +	u32	tid;
> +	u8	type;
>  };
>  
>  struct bpf_iter_seq_task_info {
> @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
>  	u32 tid;
>  };
>  
> -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
>  					     u32 *tid,
>  					     bool skip_if_dup_files)
>  {
>  	struct task_struct *task = NULL;
>  	struct pid *pid;
>  
> +	if (common->type == BPF_TASK_ITER_TID) {
> +		if (*tid)
> +			return NULL;

I tested and this condition breaks it for fd iterations, not sure about
the task and vma, because they share this function

if bpf_seq_read is called with small buffer there will be multiple calls
to task_file_seq_get_next and second one will stop in here, even if there
are more files to be displayed for the task in filter

it'd be nice to have some test for this ;-) or perhaps compare with the
not filtered output

SNIP

>  static const struct seq_operations task_seq_ops = {
>  	.start	= task_seq_start,
>  	.next	= task_seq_next,
> @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info {
>  static struct file *
>  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
>  {
> -	struct pid_namespace *ns = info->common.ns;
> -	u32 curr_tid = info->tid;
> +	u32 saved_tid = info->tid;
>  	struct task_struct *curr_task;
>  	unsigned int curr_fd = info->fd;
>  
> @@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
>  		curr_task = info->task;
>  		curr_fd = info->fd;
>  	} else {
> -                curr_task = task_seq_get_next(ns, &curr_tid, true);
> +		curr_task = task_seq_get_next(&info->common, &info->tid, true);
>                  if (!curr_task) {
>                          info->task = NULL;
> -                        info->tid = curr_tid;
>                          return NULL;
>                  }

nit, looks like we're missing proper indent in here


>  
> -                /* set info->task and info->tid */
> +		/* set info->task */
>  		info->task = curr_task;
> -		if (curr_tid == info->tid) {
> +		if (saved_tid == info->tid)
>  			curr_fd = info->fd;
> -		} else {
> -			info->tid = curr_tid;
> +		else

SNIP

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-26 12:13   ` Jiri Olsa
@ 2022-07-26 13:19     ` Jiri Olsa
  2022-07-27  6:39       ` Kui-Feng Lee
  2022-07-27  6:56     ` Kui-Feng Lee
  1 sibling, 1 reply; 22+ messages in thread
From: Jiri Olsa @ 2022-07-26 13:19 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: Kui-Feng Lee, bpf, ast, daniel, andrii, kernel-team, yhs

On Tue, Jul 26, 2022 at 02:13:17PM +0200, Jiri Olsa wrote:
> On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > Allow creating an iterator that loops through resources of one task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > ---
> >  include/linux/bpf.h            |  4 ++
> >  include/uapi/linux/bpf.h       | 23 ++++++++++
> >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++---------
> >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> >  4 files changed, 109 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..c8d164404e20 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> >  
> >  struct bpf_iter_aux_info {
> >  	struct bpf_map *map;
> > +	struct {
> > +		__u32	tid;
> 
> should be just u32 ?
> 
> > +		u8	type;
> > +	} task;
> >  };
> >  
> 
> SNIP
> 
> >  
> >  /* BPF syscall commands, see bpf(2) man-page for more details. */
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 8c921799def4..7979aacb651e 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -12,6 +12,8 @@
> >  
> >  struct bpf_iter_seq_task_common {
> >  	struct pid_namespace *ns;
> > +	u32	tid;
> > +	u8	type;
> >  };
> >  
> >  struct bpf_iter_seq_task_info {
> > @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
> >  	u32 tid;
> >  };
> >  
> > -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> > +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
> >  					     u32 *tid,
> >  					     bool skip_if_dup_files)
> >  {
> >  	struct task_struct *task = NULL;
> >  	struct pid *pid;
> >  
> > +	if (common->type == BPF_TASK_ITER_TID) {
> > +		if (*tid)
> > +			return NULL;
> 
> I tested and this condition breaks it for fd iterations, not sure about
> the task and vma, because they share this function
> 
> if bpf_seq_read is called with small buffer there will be multiple calls
> to task_file_seq_get_next and second one will stop in here, even if there
> are more files to be displayed for the task in filter

I mean there will be multiple calls of following sequence:

  bpf_seq_read
    task_file_seq_start
      task_seq_get_next

and 2nd one will return NULL in task_seq_get_next,
because info->tid is already set
 
jirka

> 
> it'd be nice to have some test for this ;-) or perhaps compare with the
> not filtered output
> 
> SNIP
> 
> >  static const struct seq_operations task_seq_ops = {
> >  	.start	= task_seq_start,
> >  	.next	= task_seq_next,
> > @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info {
> >  static struct file *
> >  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
> >  {
> > -	struct pid_namespace *ns = info->common.ns;
> > -	u32 curr_tid = info->tid;
> > +	u32 saved_tid = info->tid;
> >  	struct task_struct *curr_task;
> >  	unsigned int curr_fd = info->fd;
> >  
> > @@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
> >  		curr_task = info->task;
> >  		curr_fd = info->fd;
> >  	} else {
> > -                curr_task = task_seq_get_next(ns, &curr_tid, true);
> > +		curr_task = task_seq_get_next(&info->common, &info->tid, true);
> >                  if (!curr_task) {
> >                          info->task = NULL;
> > -                        info->tid = curr_tid;
> >                          return NULL;
> >                  }
> 
> nit, looks like we're missing proper indent in here
> 
> 
> >  
> > -                /* set info->task and info->tid */
> > +		/* set info->task */
> >  		info->task = curr_task;
> > -		if (curr_tid == info->tid) {
> > +		if (saved_tid == info->tid)
> >  			curr_fd = info->fd;
> > -		} else {
> > -			info->tid = curr_tid;
> > +		else
> 
> SNIP

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-26 13:19     ` Jiri Olsa
@ 2022-07-27  6:39       ` Kui-Feng Lee
  0 siblings, 0 replies; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-27  6:39 UTC (permalink / raw)
  To: olsajiri; +Cc: daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Tue, 2022-07-26 at 15:19 +0200, Jiri Olsa wrote:
> On Tue, Jul 26, 2022 at 02:13:17PM +0200, Jiri Olsa wrote:
> > > -static struct task_struct *task_seq_get_next(struct
> > > pid_namespace *ns,
> > > +static struct task_struct *task_seq_get_next(struct
> > > bpf_iter_seq_task_common *common,
> > >                                              u32 *tid,
> > >                                              bool
> > > skip_if_dup_files)
> > >  {
> > >         struct task_struct *task = NULL;
> > >         struct pid *pid;
> > >  
> > > +       if (common->type == BPF_TASK_ITER_TID) {
> > > +               if (*tid)
> > > +                       return NULL;
> > 
> > I tested and this condition breaks it for fd iterations, not sure
> > about
> > the task and vma, because they share this function
> > 
> > if bpf_seq_read is called with small buffer there will be multiple
> > calls
> > to task_file_seq_get_next and second one will stop in here, even if
> > there
> > are more files to be displayed for the task in filter
> 
> I mean there will be multiple calls of following sequence:
> 
>   bpf_seq_read
>     task_file_seq_start
>       task_seq_get_next
> 
> and 2nd one will return NULL in task_seq_get_next,
> because info->tid is already set

Ok!  I got your point.  I will fix it ASAP.




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-26 12:13   ` Jiri Olsa
  2022-07-26 13:19     ` Jiri Olsa
@ 2022-07-27  6:56     ` Kui-Feng Lee
  2022-07-27  8:19       ` Kumar Kartikeya Dwivedi
  1 sibling, 1 reply; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-27  6:56 UTC (permalink / raw)
  To: olsajiri; +Cc: daniel, Kernel Team, Yonghong Song, ast, andrii, bpf

On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > Allow creating an iterator that loops through resources of one
> > task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were
> > interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > ---
> >  include/linux/bpf.h            |  4 ++
> >  include/uapi/linux/bpf.h       | 23 ++++++++++
> >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-----
> > ----
> >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> >  4 files changed, 109 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..c8d164404e20 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > *pathname, int flags);
> >  
> >  struct bpf_iter_aux_info {
> >         struct bpf_map *map;
> > +       struct {
> > +               __u32   tid;
> 
> should be just u32 ?


Or, should change the following 'type' to __u8?

> 
> > +               u8      type;
> > +       } task;
> >  };
> >  
> 
> SNIP
> 
> >  
> >  /* BPF syscall commands, see bpf(2) man-page for more details. */
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 8c921799def4..7979aacb651e 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -12,6 +12,8 @@
> >  
> >  struct bpf_iter_seq_task_common {
> >         struct pid_namespace *ns;
> > +       u32     tid;
> > +       u8      type;
> >  };
> >  
> >  struct bpf_iter_seq_task_info {
> > @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
> >         u32 tid;
> >  };
> >  
> > -static struct task_struct *task_seq_get_next(struct pid_namespace
> > *ns,
> > +static struct task_struct *task_seq_get_next(struct
> > bpf_iter_seq_task_common *common,
> >                                              u32 *tid,
> >                                              bool
> > skip_if_dup_files)
> >  {
> >         struct task_struct *task = NULL;
> >         struct pid *pid;
> >  
> > +       if (common->type == BPF_TASK_ITER_TID) {
> > +               if (*tid)
> > +                       return NULL;
> 
> I tested and this condition breaks it for fd iterations, not sure
> about
> the task and vma, because they share this function
> 
> if bpf_seq_read is called with small buffer there will be multiple
> calls
> to task_file_seq_get_next and second one will stop in here, even if
> there
> are more files to be displayed for the task in filter
> 
> it'd be nice to have some test for this ;-) or perhaps compare with
> the
> not filtered output
> 
> SNIP
> 
> >  static const struct seq_operations task_seq_ops = {
> >         .start  = task_seq_start,
> >         .next   = task_seq_next,
> > @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info {
> >  static struct file *
> >  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
> >  {
> > -       struct pid_namespace *ns = info->common.ns;
> > -       u32 curr_tid = info->tid;
> > +       u32 saved_tid = info->tid;
> >         struct task_struct *curr_task;
> >         unsigned int curr_fd = info->fd;
> >  
> > @@ -151,21 +179,18 @@ task_file_seq_get_next(struct
> > bpf_iter_seq_task_file_info *info)
> >                 curr_task = info->task;
> >                 curr_fd = info->fd;
> >         } else {
> > -                curr_task = task_seq_get_next(ns, &curr_tid,
> > true);
> > +               curr_task = task_seq_get_next(&info->common, &info-
> > >tid, true);
> >                  if (!curr_task) {
> >                          info->task = NULL;
> > -                        info->tid = curr_tid;
> >                          return NULL;
> >                  }
> 
> nit, looks like we're missing proper indent in here

Yes, we mixed spaces and tabs.  Should I change these lines to tabs?
> 
> 
> >  
> > -                /* set info->task and info->tid */
> > +               /* set info->task */
> >                 info->task = curr_task;
> > -               if (curr_tid == info->tid) {
> > +               if (saved_tid == info->tid)
> >                         curr_fd = info->fd;
> > -               } else {
> > -                       info->tid = curr_tid;
> > +               else
> 
> SNIP


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-27  6:56     ` Kui-Feng Lee
@ 2022-07-27  8:19       ` Kumar Kartikeya Dwivedi
  2022-07-28  5:25         ` Kui-Feng Lee
  0 siblings, 1 reply; 22+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-07-27  8:19 UTC (permalink / raw)
  To: Kui-Feng Lee
  Cc: olsajiri, daniel, Kernel Team, Yonghong Song, ast, andrii, bpf, brauner

On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > Allow creating an iterator that loops through resources of one
> > > task/thread.
> > >
> > > People could only create iterators to loop through all resources of
> > > files, vma, and tasks in the system, even though they were
> > > interested
> > > in only the resources of a specific task or process.  Passing the
> > > additional parameters, people can now create an iterator to go
> > > through all resources or only the resources of a task.
> > >
> > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > ---
> > >  include/linux/bpf.h            |  4 ++
> > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-----
> > > ----
> > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 11950029284f..c8d164404e20 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > *pathname, int flags);
> > >
> > >  struct bpf_iter_aux_info {
> > >         struct bpf_map *map;
> > > +       struct {
> > > +               __u32   tid;
> >
> > should be just u32 ?
>
> Or, should change the following 'type' to __u8?

Would it be better to use a pidfd instead of a tid here? Unset pidfd
would mean going over all tasks, and any fd > 0 implies attaching to a
specific task (as is the convention in BPF land). Most of the new
UAPIs working on processes are using pidfds (to work with a stable
handle instead of a reusable ID).
The iterator taking an fd also gives an opportunity to BPF LSMs to
attach permissions/policies to it (once we have a file local storage
map) e.g. whether creating a task iterator for that specific pidfd
instance (backed by the struct file) would be allowed or not.
You are using getpid in the selftest and keeping track of last_tgid in
the iterator, so I guess you don't even need to extend pidfd_open to
work on thread IDs right now for your use case (and fdtable and mm are
shared for POSIX threads anyway, so for those two it won't make a
difference).

What is your opinion?

>
> [...]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-27  8:19       ` Kumar Kartikeya Dwivedi
@ 2022-07-28  5:25         ` Kui-Feng Lee
  2022-07-28  8:47           ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-28  5:25 UTC (permalink / raw)
  To: memxor
  Cc: brauner, Yonghong Song, daniel, olsajiri, Kernel Team, ast, bpf, andrii

On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > Allow creating an iterator that loops through resources of one
> > > > task/thread.
> > > > 
> > > > People could only create iterators to loop through all
> > > > resources of
> > > > files, vma, and tasks in the system, even though they were
> > > > interested
> > > > in only the resources of a specific task or process.  Passing
> > > > the
> > > > additional parameters, people can now create an iterator to go
> > > > through all resources or only the resources of a task.
> > > > 
> > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > ---
> > > >  include/linux/bpf.h            |  4 ++
> > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-
> > > > ----
> > > > ----
> > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > 
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index 11950029284f..c8d164404e20 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > > *pathname, int flags);
> > > > 
> > > >  struct bpf_iter_aux_info {
> > > >         struct bpf_map *map;
> > > > +       struct {
> > > > +               __u32   tid;
> > > 
> > > should be just u32 ?
> > 
> > Or, should change the following 'type' to __u8?
> 
> Would it be better to use a pidfd instead of a tid here? Unset pidfd
> would mean going over all tasks, and any fd > 0 implies attaching to
> a
> specific task (as is the convention in BPF land). Most of the new
> UAPIs working on processes are using pidfds (to work with a stable
> handle instead of a reusable ID).
> The iterator taking an fd also gives an opportunity to BPF LSMs to
> attach permissions/policies to it (once we have a file local storage
> map) e.g. whether creating a task iterator for that specific pidfd
> instance (backed by the struct file) would be allowed or not.
> You are using getpid in the selftest and keeping track of last_tgid
> in
> the iterator, so I guess you don't even need to extend pidfd_open to
> work on thread IDs right now for your use case (and fdtable and mm
> are
> shared for POSIX threads anyway, so for those two it won't make a
> difference).
> 
> What is your opinion?

Do you mean remove both tid and type, and replace them with a pidfd?
We can do that in uapi, struct bpf_link_info.  But, the internal types,
ex. bpf_iter_aux_info, still need to use tid or struct file to avoid
getting file from the per-process fdtable.  Is that what you mean?


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28  5:25         ` Kui-Feng Lee
@ 2022-07-28  8:47           ` Kumar Kartikeya Dwivedi
  2022-07-28 15:16             ` Kui-Feng Lee
  0 siblings, 1 reply; 22+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-07-28  8:47 UTC (permalink / raw)
  To: Kui-Feng Lee
  Cc: brauner, Yonghong Song, daniel, olsajiri, Kernel Team, ast, bpf, andrii

On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > >
> > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > Allow creating an iterator that loops through resources of one
> > > > > task/thread.
> > > > >
> > > > > People could only create iterators to loop through all
> > > > > resources of
> > > > > files, vma, and tasks in the system, even though they were
> > > > > interested
> > > > > in only the resources of a specific task or process.  Passing
> > > > > the
> > > > > additional parameters, people can now create an iterator to go
> > > > > through all resources or only the resources of a task.
> > > > >
> > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > ---
> > > > >  include/linux/bpf.h            |  4 ++
> > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-
> > > > > ----
> > > > > ----
> > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > index 11950029284f..c8d164404e20 100644
> > > > > --- a/include/linux/bpf.h
> > > > > +++ b/include/linux/bpf.h
> > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > > > *pathname, int flags);
> > > > >
> > > > >  struct bpf_iter_aux_info {
> > > > >         struct bpf_map *map;
> > > > > +       struct {
> > > > > +               __u32   tid;
> > > >
> > > > should be just u32 ?
> > >
> > > Or, should change the following 'type' to __u8?
> >
> > Would it be better to use a pidfd instead of a tid here? Unset pidfd
> > would mean going over all tasks, and any fd > 0 implies attaching to
> > a
> > specific task (as is the convention in BPF land). Most of the new
> > UAPIs working on processes are using pidfds (to work with a stable
> > handle instead of a reusable ID).
> > The iterator taking an fd also gives an opportunity to BPF LSMs to
> > attach permissions/policies to it (once we have a file local storage
> > map) e.g. whether creating a task iterator for that specific pidfd
> > instance (backed by the struct file) would be allowed or not.
> > You are using getpid in the selftest and keeping track of last_tgid
> > in
> > the iterator, so I guess you don't even need to extend pidfd_open to
> > work on thread IDs right now for your use case (and fdtable and mm
> > are
> > shared for POSIX threads anyway, so for those two it won't make a
> > difference).
> >
> > What is your opinion?
>
> Do you mean remove both tid and type, and replace them with a pidfd?
> We can do that in uapi, struct bpf_link_info.  But, the internal
> ex. bpf_iter_aux_info, still need to use tid or struct file to avoid
> getting file from the per-process fdtable.  Is that what you mean?
>

Yes, just for the UAPI, it is similar to taking map_fd for map iter.
In bpf_link_info we should report just the tid, just like map iter
reports map_id.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28  8:47           ` Kumar Kartikeya Dwivedi
@ 2022-07-28 15:16             ` Kui-Feng Lee
  2022-07-28 16:22               ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-28 15:16 UTC (permalink / raw)
  To: memxor
  Cc: Yonghong Song, brauner, daniel, olsajiri, ast, Kernel Team, bpf, andrii

On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > wrote:
> > > > 
> > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > > Allow creating an iterator that loops through resources of
> > > > > > one
> > > > > > task/thread.
> > > > > > 
> > > > > > People could only create iterators to loop through all
> > > > > > resources of
> > > > > > files, vma, and tasks in the system, even though they were
> > > > > > interested
> > > > > > in only the resources of a specific task or process. 
> > > > > > Passing
> > > > > > the
> > > > > > additional parameters, people can now create an iterator to
> > > > > > go
> > > > > > through all resources or only the resources of a task.
> > > > > > 
> > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > ---
> > > > > >  include/linux/bpf.h            |  4 ++
> > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > +++++++++++++++++++++++++-
> > > > > > ----
> > > > > > ----
> > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > --- a/include/linux/bpf.h
> > > > > > +++ b/include/linux/bpf.h
> > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > __user
> > > > > > *pathname, int flags);
> > > > > > 
> > > > > >  struct bpf_iter_aux_info {
> > > > > >         struct bpf_map *map;
> > > > > > +       struct {
> > > > > > +               __u32   tid;
> > > > > 
> > > > > should be just u32 ?
> > > > 
> > > > Or, should change the following 'type' to __u8?
> > > 
> > > Would it be better to use a pidfd instead of a tid here? Unset
> > > pidfd
> > > would mean going over all tasks, and any fd > 0 implies attaching
> > > to
> > > a
> > > specific task (as is the convention in BPF land). Most of the new
> > > UAPIs working on processes are using pidfds (to work with a
> > > stable
> > > handle instead of a reusable ID).
> > > The iterator taking an fd also gives an opportunity to BPF LSMs
> > > to
> > > attach permissions/policies to it (once we have a file local
> > > storage
> > > map) e.g. whether creating a task iterator for that specific
> > > pidfd
> > > instance (backed by the struct file) would be allowed or not.
> > > You are using getpid in the selftest and keeping track of
> > > last_tgid
> > > in
> > > the iterator, so I guess you don't even need to extend pidfd_open
> > > to
> > > work on thread IDs right now for your use case (and fdtable and
> > > mm
> > > are
> > > shared for POSIX threads anyway, so for those two it won't make a
> > > difference).
> > > 
> > > What is your opinion?
> > 
> > Do you mean remove both tid and type, and replace them with a
> > pidfd?
> > We can do that in uapi, struct bpf_link_info.  But, the internal
> > types,
> > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > avoid
> > getting file from the per-process fdtable.  Is that what you mean?
> > 
> 
> Yes, just for the UAPI, it is similar to taking map_fd for map iter.
> In bpf_link_info we should report just the tid, just like map iter
> reports map_id.

It sounds good to me.

One thing I need clarification on. You mentioned that an fd > 0 implies
attaching to a specific task; however, fd can be 0. So, it should be fd
>= 0. So, it forces the user to initialize the value of pidfd to -1. 
So, for convenience, we still need a field like 'type' to make it easy
to create iterators without a filter.



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 15:16             ` Kui-Feng Lee
@ 2022-07-28 16:22               ` Kumar Kartikeya Dwivedi
  2022-07-28 16:40                 ` Kui-Feng Lee
  2022-07-28 18:01                 ` Hao Luo
  0 siblings, 2 replies; 22+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-07-28 16:22 UTC (permalink / raw)
  To: Kui-Feng Lee
  Cc: Yonghong Song, brauner, daniel, olsajiri, ast, Kernel Team, bpf, andrii

On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > >
> > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > > wrote:
> > > > >
> > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > > > Allow creating an iterator that loops through resources of
> > > > > > > one
> > > > > > > task/thread.
> > > > > > >
> > > > > > > People could only create iterators to loop through all
> > > > > > > resources of
> > > > > > > files, vma, and tasks in the system, even though they were
> > > > > > > interested
> > > > > > > in only the resources of a specific task or process.
> > > > > > > Passing
> > > > > > > the
> > > > > > > additional parameters, people can now create an iterator to
> > > > > > > go
> > > > > > > through all resources or only the resources of a task.
> > > > > > >
> > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > ---
> > > > > > >  include/linux/bpf.h            |  4 ++
> > > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > > +++++++++++++++++++++++++-
> > > > > > > ----
> > > > > > > ----
> > > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > >
> > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > --- a/include/linux/bpf.h
> > > > > > > +++ b/include/linux/bpf.h
> > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > > __user
> > > > > > > *pathname, int flags);
> > > > > > >
> > > > > > >  struct bpf_iter_aux_info {
> > > > > > >         struct bpf_map *map;
> > > > > > > +       struct {
> > > > > > > +               __u32   tid;
> > > > > >
> > > > > > should be just u32 ?
> > > > >
> > > > > Or, should change the following 'type' to __u8?
> > > >
> > > > Would it be better to use a pidfd instead of a tid here? Unset
> > > > pidfd
> > > > would mean going over all tasks, and any fd > 0 implies attaching
> > > > to
> > > > a
> > > > specific task (as is the convention in BPF land). Most of the new
> > > > UAPIs working on processes are using pidfds (to work with a
> > > > stable
> > > > handle instead of a reusable ID).
> > > > The iterator taking an fd also gives an opportunity to BPF LSMs
> > > > to
> > > > attach permissions/policies to it (once we have a file local
> > > > storage
> > > > map) e.g. whether creating a task iterator for that specific
> > > > pidfd
> > > > instance (backed by the struct file) would be allowed or not.
> > > > You are using getpid in the selftest and keeping track of
> > > > last_tgid
> > > > in
> > > > the iterator, so I guess you don't even need to extend pidfd_open
> > > > to
> > > > work on thread IDs right now for your use case (and fdtable and
> > > > mm
> > > > are
> > > > shared for POSIX threads anyway, so for those two it won't make a
> > > > difference).
> > > >
> > > > What is your opinion?
> > >
> > > Do you mean remove both tid and type, and replace them with a
> > > pidfd?
> > > We can do that in uapi, struct bpf_link_info.  But, the internal
> > > types,
> > > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > > avoid
> > > getting file from the per-process fdtable.  Is that what you mean?
> > >
> >
> > Yes, just for the UAPI, it is similar to taking map_fd for map iter.
> > In bpf_link_info we should report just the tid, just like map iter
> > reports map_id.
>
> It sounds good to me.
>
> One thing I need clarification on. You mentioned that an fd > 0 implies
> attaching to a specific task, however fd can be 0. So, it should be fd
> >= 0. So, it forces the user to initialize the value of pidfd to -1.
> So, for convenience, we still need a field like 'type' to make it easy
> to create iterators without a filter.
>

Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
is fine to rely on that assumption. For e.g. even for map_fd,
bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
the type field.

>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 16:22               ` Kumar Kartikeya Dwivedi
@ 2022-07-28 16:40                 ` Kui-Feng Lee
  2022-07-28 17:08                   ` Yonghong Song
       [not found]                   ` <CAP01T74HRHapKDAfj104KNGnzCgNQSu_M5-KfEvGBNzLWNfd+Q@mail.gmail.com>
  2022-07-28 18:01                 ` Hao Luo
  1 sibling, 2 replies; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-28 16:40 UTC (permalink / raw)
  To: memxor
  Cc: Yonghong Song, brauner, daniel, olsajiri, ast, Kernel Team, bpf, andrii

On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com>
> > > wrote:
> > > > 
> > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi
> > > > wrote:
> > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > > > wrote:
> > > > > > 
> > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee
> > > > > > > wrote:
> > > > > > > > Allow creating an iterator that loops through resources
> > > > > > > > of
> > > > > > > > one
> > > > > > > > task/thread.
> > > > > > > > 
> > > > > > > > People could only create iterators to loop through all
> > > > > > > > resources of
> > > > > > > > files, vma, and tasks in the system, even though they
> > > > > > > > were
> > > > > > > > interested
> > > > > > > > in only the resources of a specific task or process.
> > > > > > > > Passing
> > > > > > > > the
> > > > > > > > additional parameters, people can now create an
> > > > > > > > iterator to
> > > > > > > > go
> > > > > > > > through all resources or only the resources of a task.
> > > > > > > > 
> > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > > ---
> > > > > > > >  include/linux/bpf.h            |  4 ++
> > > > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > > > +++++++++++++++++++++++++-
> > > > > > > > ----
> > > > > > > > ----
> > > > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > > --- a/include/linux/bpf.h
> > > > > > > > +++ b/include/linux/bpf.h
> > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > > > __user
> > > > > > > > *pathname, int flags);
> > > > > > > > 
> > > > > > > >  struct bpf_iter_aux_info {
> > > > > > > >         struct bpf_map *map;
> > > > > > > > +       struct {
> > > > > > > > +               __u32   tid;
> > > > > > > 
> > > > > > > should be just u32 ?
> > > > > > 
> > > > > > Or, should change the following 'type' to __u8?
> > > > > 
> > > > > Would it be better to use a pidfd instead of a tid here?
> > > > > Unset
> > > > > pidfd
> > > > > would mean going over all tasks, and any fd > 0 implies
> > > > > attaching
> > > > > to
> > > > > a
> > > > > specific task (as is the convention in BPF land). Most of the
> > > > > new
> > > > > UAPIs working on processes are using pidfds (to work with a
> > > > > stable
> > > > > handle instead of a reusable ID).
> > > > > The iterator taking an fd also gives an opportunity to BPF
> > > > > LSMs
> > > > > to
> > > > > attach permissions/policies to it (once we have a file local
> > > > > storage
> > > > > map) e.g. whether creating a task iterator for that specific
> > > > > pidfd
> > > > > instance (backed by the struct file) would be allowed or not.
> > > > > You are using getpid in the selftest and keeping track of
> > > > > last_tgid
> > > > > in
> > > > > the iterator, so I guess you don't even need to extend
> > > > > pidfd_open
> > > > > to
> > > > > work on thread IDs right now for your use case (and fdtable
> > > > > and
> > > > > mm
> > > > > are
> > > > > shared for POSIX threads anyway, so for those two it won't
> > > > > make a
> > > > > difference).
> > > > > 
> > > > > What is your opinion?
> > > > 
> > > > Do you mean removing both tid and type, and replacing them with a
> > > > pidfd?
> > > > We can do that in uapi, struct bpf_link_info.  But, the internal
> > > > types,
> > > > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > > > avoid
> > > > getting file from the per-process fdtable.  Is that what you
> > > > mean?
> > > > 
> > > 
> > > Yes, just for the UAPI, it is similar to taking map_fd for map
> > > iter.
> > > In bpf_link_info we should report just the tid, just like map
> > > iter
> > > reports map_id.
> > 
> > It sounds good to me.
> > 
> > One thing I need a clarification. You mentioned that a fd > 0
> > implies
> > attaching to a specific task, however fd can be 0. So, it should be
> > fd
> > > = 0. So, it forces the user to initialize the value of pidfd to -
> > > 1.
> > So, for convenience, we still need a field like 'type' to make it
> > easy
> > to create iterators without a filter.
> > 
> 
> Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
> is fine to rely on that assumption. For e.g. even for map_fd,
> bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
> the type field.

I just realized that pidfd may be meaningless for the bpf_link_info
returned by bpf_obj_get_info_by_fd() since the original fd might be
closed already.  So, I will always set it to a value of 0.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 16:40                 ` Kui-Feng Lee
@ 2022-07-28 17:08                   ` Yonghong Song
  2022-07-28 17:52                     ` Kumar Kartikeya Dwivedi
       [not found]                   ` <CAP01T74HRHapKDAfj104KNGnzCgNQSu_M5-KfEvGBNzLWNfd+Q@mail.gmail.com>
  1 sibling, 1 reply; 22+ messages in thread
From: Yonghong Song @ 2022-07-28 17:08 UTC (permalink / raw)
  To: Kui-Feng Lee, memxor
  Cc: brauner, daniel, olsajiri, ast, Kernel Team, bpf, andrii



On 7/28/22 9:40 AM, Kui-Feng Lee wrote:
> On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote:
>> On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
>>>
>>> On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
>>>> On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com>
>>>> wrote:
>>>>>
>>>>> On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi
>>>>> wrote:
>>>>>> On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
>>>>>>>> On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee
>>>>>>>> wrote:
>>>>>>>>> Allow creating an iterator that loops through resources
>>>>>>>>> of
>>>>>>>>> one
>>>>>>>>> task/thread.
>>>>>>>>>
>>>>>>>>> People could only create iterators to loop through all
>>>>>>>>> resources of
>>>>>>>>> files, vma, and tasks in the system, even though they
>>>>>>>>> were
>>>>>>>>> interested
>>>>>>>>> in only the resources of a specific task or process.
>>>>>>>>> Passing
>>>>>>>>> the
>>>>>>>>> additional parameters, people can now create an
>>>>>>>>> iterator to
>>>>>>>>> go
>>>>>>>>> through all resources or only the resources of a task.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
>>>>>>>>> ---
>>>>>>>>>   include/linux/bpf.h            |  4 ++
>>>>>>>>>   include/uapi/linux/bpf.h       | 23 ++++++++++
>>>>>>>>>   kernel/bpf/task_iter.c         | 81
>>>>>>>>> +++++++++++++++++++++++++-
>>>>>>>>> ----
>>>>>>>>> ----
>>>>>>>>>   tools/include/uapi/linux/bpf.h | 23 ++++++++++
>>>>>>>>>   4 files changed, 109 insertions(+), 22 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>>>>>>> index 11950029284f..c8d164404e20 100644
>>>>>>>>> --- a/include/linux/bpf.h
>>>>>>>>> +++ b/include/linux/bpf.h
>>>>>>>>> @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
>>>>>>>>> __user
>>>>>>>>> *pathname, int flags);
>>>>>>>>>
>>>>>>>>>   struct bpf_iter_aux_info {
>>>>>>>>>          struct bpf_map *map;
>>>>>>>>> +       struct {
>>>>>>>>> +               __u32   tid;
>>>>>>>>
>>>>>>>> should be just u32 ?
>>>>>>>
>>>>>>> Or, should change the following 'type' to __u8?
>>>>>>
>>>>>> Would it be better to use a pidfd instead of a tid here?
>>>>>> Unset
>>>>>> pidfd
>>>>>> would mean going over all tasks, and any fd > 0 implies
>>>>>> attaching
>>>>>> to
>>>>>> a
>>>>>> specific task (as is the convention in BPF land). Most of the
>>>>>> new
>>>>>> UAPIs working on processes are using pidfds (to work with a
>>>>>> stable
>>>>>> handle instead of a reusable ID).
>>>>>> The iterator taking an fd also gives an opportunity to BPF
>>>>>> LSMs
>>>>>> to
>>>>>> attach permissions/policies to it (once we have a file local
>>>>>> storage
>>>>>> map) e.g. whether creating a task iterator for that specific
>>>>>> pidfd
>>>>>> instance (backed by the struct file) would be allowed or not.
>>>>>> You are using getpid in the selftest and keeping track of
>>>>>> last_tgid
>>>>>> in
>>>>>> the iterator, so I guess you don't even need to extend
>>>>>> pidfd_open
>>>>>> to
>>>>>> work on thread IDs right now for your use case (and fdtable
>>>>>> and
>>>>>> mm
>>>>>> are
>>>>>> shared for POSIX threads anyway, so for those two it won't
>>>>>> make a
>>>>>> difference).

There is one problem here. The current pidfd_open syscall
only supports thread-group leader, i.e., main thread, i.e.,
it won't support any non-main-thread tid's. Yes, thread-group
leader and other threads should share the same vma and files
most of the time, but it is still possible that different threads
in the same process may have different files which is why
in current task_iter.c we have:
                 *tid = pid_nr_ns(pid, ns);
                 task = get_pid_task(pid, PIDTYPE_PID);
                 if (!task) {
                         ++*tid;
                         goto retry;
                 } else if (skip_if_dup_files && 
!thread_group_leader(task) &&
                            task->files == task->group_leader->files) {
                         put_task_struct(task);
                         task = NULL;
                         ++*tid;
                         goto retry;
                 }


Each thread (tid) will have some fields different from
thread-group leader (tgid), e.g., comm and most (if not all)
scheduling related fields.

So it would be good to support for each tid from the start
as it is not clear when pidfd_open will support non
thread-group leader.

If it worries wrap around, a reference to the task
can be held when tid passed to the kernel at link
create time. This is similar to pid is passed to
the kernel at pidfd_open syscall. But in practice,
I think taking the reference during read() should
also fine. The race always exist anyway.

Kumar, could you give more details about security
concerns? I am not sure about the tight relationship
between bpf_iter and security. bpf_iter just for
iterating kernel data structures.

>>>>>>
>>>>>> What is your opinion?
>>>>>
>>>>> Do you mean removing both tid and type, and replacing them with a
>>>>> pidfd?
>>>>> We can do that in uapi, struct bpf_link_info.  But, the internal
>>>>> types,
>>>>> ex. bpf_iter_aux_info, still need to use tid or struct file to
>>>>> avoid
>>>>> getting file from the per-process fdtable.  Is that what you
>>>>> mean?
>>>>>
>>>>
>>>> Yes, just for the UAPI, it is similar to taking map_fd for map
>>>> iter.
>>>> In bpf_link_info we should report just the tid, just like map
>>>> iter
>>>> reports map_id.
>>>
>>> It sounds good to me.
>>>
>>> One thing I need a clarification. You mentioned that a fd > 0
>>> implies
>>> attaching to a specific task, however fd can be 0. So, it should be
>>> fd
>>>> = 0. So, it forces the user to initialize the value of pidfd to -
>>>> 1.
>>> So, for convenience, we still need a field like 'type' to make it
>>> easy
>>> to create iterators without a filter.
>>>
>>
>> Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
>> is fine to rely on that assumption. For e.g. even for map_fd,
>> bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
>> the type field.
> 
> I just realized that pidfd may be meaningless for the bpf_link_info
> returned by bpf_obj_get_info_by_fd() since the original fd might be
> closed already.  So, I will always set it to a value of 0.
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 17:08                   ` Yonghong Song
@ 2022-07-28 17:52                     ` Kumar Kartikeya Dwivedi
  2022-07-28 19:11                       ` Hao Luo
  2022-07-28 22:54                       ` Yonghong Song
  0 siblings, 2 replies; 22+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-07-28 17:52 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Kui-Feng Lee, brauner, daniel, olsajiri, ast, Kernel Team, bpf, andrii

On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > [...]
>
> There is one problem here. The current pidfd_open syscall
> only supports thread-group leader, i.e., main thread, i.e.,
> it won't support any non-main-thread tid's. Yes, thread-group
> leader and other threads should share the same vma and files
> most of the time, but it is still possible that different threads
> in the same process may have different files which is why
> in current task_iter.c we have:
>                  *tid = pid_nr_ns(pid, ns);
>                  task = get_pid_task(pid, PIDTYPE_PID);
>                  if (!task) {
>                          ++*tid;
>                          goto retry;
>                  } else if (skip_if_dup_files &&
> !thread_group_leader(task) &&
>                             task->files == task->group_leader->files) {
>                          put_task_struct(task);
>                          task = NULL;
>                          ++*tid;
>                          goto retry;
>                  }
>
>
> Each thread (tid) will have some fields different from
> thread-group leader (tgid), e.g., comm and most (if not all)
> scheduling related fields.
>
> So it would be good to support for each tid from the start
> as it is not clear when pidfd_open will support non
> thread-group leader.

I think this is just a missing feature, not a design limitation. If we
just disable thread pidfd from waitid and pidfd_send_signal, I think
it is ok to use it elsewhere.

>
> If it worries wrap around, a reference to the task
> can be held when tid passed to the kernel at link
> create time. This is similar to pid is passed to
> the kernel at pidfd_open syscall. But in practice,
> I think taking the reference during read() should
> also fine. The race always exist anyway.
>
> Kumar, could you give more details about security
> concerns? I am not sure about the tight relationship
> between bpf_iter and security. bpf_iter just for
> iterating kernel data structures.
>

There is no security 'concern' per se. But having an fd which is used
to set up the iterator just gives a chance to a BPF LSM to easily
isolate permissions to attach the iterator to a task represented by
that fd. i.e. the fd is an object to which this permission can be
bound, the fd can be passed around (to share the same permission with
others but it is limited to the task corresponding to it), etc. The
permission management is even easier and faster once you have a file
local storage map (which I plan to send soon).

So you could have two pidfds, one which allows the process to attach
the iter to it, and one which doesn't, without relying on the task's
capabilities, the checks can become more fine grained, and the
original task can even drop its capabilities (because they are bound
to the fd instead), which allows privilege separation.

It becomes a bit difficult when kernel APIs take IDs because then you
don't have any subject (other than the task) to draw the permissions
from.

But anyway, all of this was just a suggestion (which is why I
solicited opinions in the original reply). Feel free to drop/ignore if
it is too much of a hassle to support (i.e. it is understandable you
wouldn't want to spend time extending pidfd_open for this).

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 16:22               ` Kumar Kartikeya Dwivedi
  2022-07-28 16:40                 ` Kui-Feng Lee
@ 2022-07-28 18:01                 ` Hao Luo
  1 sibling, 0 replies; 22+ messages in thread
From: Hao Luo @ 2022-07-28 18:01 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Kui-Feng Lee, Yonghong Song, brauner, daniel, olsajiri, ast,
	Kernel Team, bpf, andrii

On Thu, Jul 28, 2022 at 9:24 AM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
> >
> > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > > >
> > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > > > wrote:
> > > > > >
> > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > > > > Allow creating an iterator that loops through resources of
> > > > > > > > one
> > > > > > > > task/thread.
> > > > > > > >
> > > > > > > > People could only create iterators to loop through all
> > > > > > > > resources of
> > > > > > > > files, vma, and tasks in the system, even though they were
> > > > > > > > interested
> > > > > > > > in only the resources of a specific task or process.
> > > > > > > > Passing
> > > > > > > > the
> > > > > > > > additional parameters, people can now create an iterator to
> > > > > > > > go
> > > > > > > > through all resources or only the resources of a task.
> > > > > > > >
> > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > > ---
> > > > > > > >  include/linux/bpf.h            |  4 ++
> > > > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > > > +++++++++++++++++++++++++-
> > > > > > > > ----
> > > > > > > > ----
> > > > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > > --- a/include/linux/bpf.h
> > > > > > > > +++ b/include/linux/bpf.h
> > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > > > __user
> > > > > > > > *pathname, int flags);
> > > > > > > >
> > > > > > > >  struct bpf_iter_aux_info {
> > > > > > > >         struct bpf_map *map;
> > > > > > > > +       struct {
> > > > > > > > +               __u32   tid;
> > > > > > >
> > > > > > > should be just u32 ?
> > > > > >
> > > > > > Or, should change the following 'type' to __u8?
> > > > >
> > > > > Would it be better to use a pidfd instead of a tid here? Unset
> > > > > pidfd
> > > > > would mean going over all tasks, and any fd > 0 implies attaching
> > > > > to
> > > > > a
> > > > > specific task (as is the convention in BPF land). Most of the new
> > > > > UAPIs working on processes are using pidfds (to work with a
> > > > > stable
> > > > > handle instead of a reusable ID).
> > > > > The iterator taking an fd also gives an opportunity to BPF LSMs
> > > > > to
> > > > > attach permissions/policies to it (once we have a file local
> > > > > storage
> > > > > map) e.g. whether creating a task iterator for that specific
> > > > > pidfd
> > > > > instance (backed by the struct file) would be allowed or not.
> > > > > You are using getpid in the selftest and keeping track of
> > > > > last_tgid
> > > > > in
> > > > > the iterator, so I guess you don't even need to extend pidfd_open
> > > > > to
> > > > > work on thread IDs right now for your use case (and fdtable and
> > > > > mm
> > > > > are
> > > > > shared for POSIX threads anyway, so for those two it won't make a
> > > > > difference).
> > > > >
> > > > > What is your opinion?
> > > >
> > > > Do you mean removing both tid and type, and replacing them with a
> > > > pidfd?
> > > > We can do that in uapi, struct bpf_link_info.  But, the internal
> > > > types,
> > > > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > > > avoid
> > > > getting file from the per-process fdtable.  Is that what you mean?
> > > >
> > >
> > > Yes, just for the UAPI, it is similar to taking map_fd for map iter.
> > > In bpf_link_info we should report just the tid, just like map iter
> > > reports map_id.
> >
> > It sounds good to me.
> >
> > One thing I need a clarification. You mentioned that a fd > 0 implies
> > attaching to a specific task, however fd can be 0. So, it should be fd
> > >= 0. So, it forces the user to initialize the value of pidfd to -1.
> > So, for convenience, we still need a field like 'type' to make it easy
> > to create iterators without a filter.
> >
>
> Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
> is fine to rely on that assumption. For e.g. even for map_fd,
> bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
> the type field.
>

FD 0 is STDIN, unless someone explicitly close(0). IMO using fd 0 for
default behavior is fine.

Hao

> >

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 17:52                     ` Kumar Kartikeya Dwivedi
@ 2022-07-28 19:11                       ` Hao Luo
  2022-08-02 14:25                         ` Kumar Kartikeya Dwivedi
  2022-07-28 22:54                       ` Yonghong Song
  1 sibling, 1 reply; 22+ messages in thread
From: Hao Luo @ 2022-07-28 19:11 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Yonghong Song, Kui-Feng Lee, brauner, daniel, olsajiri, ast,
	Kernel Team, bpf, andrii

Hi Kumar,

On Thu, Jul 28, 2022 at 10:53 AM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > > [...]
> >
> > There is one problem here. The current pidfd_open syscall
> > only supports thread-group leader, i.e., main thread, i.e.,
> > it won't support any non-main-thread tid's. Yes, thread-group
> > leader and other threads should share the same vma and files
> > most of the time, but it is still possible that different threads
> > in the same process may have different files which is why
> > in current task_iter.c we have:
> >                  *tid = pid_nr_ns(pid, ns);
> >                  task = get_pid_task(pid, PIDTYPE_PID);
> >                  if (!task) {
> >                          ++*tid;
> >                          goto retry;
> >                  } else if (skip_if_dup_files &&
> > !thread_group_leader(task) &&
> >                             task->files == task->group_leader->files) {
> >                          put_task_struct(task);
> >                          task = NULL;
> >                          ++*tid;
> >                          goto retry;
> >                  }
> >
> >
> > Each thread (tid) will have some fields different from
> > thread-group leader (tgid), e.g., comm and most (if not all)
> > scheduling related fields.
> >
> > So it would be good to support for each tid from the start
> > as it is not clear when pidfd_open will support non
> > thread-group leader.
>
> I think this is just a missing feature, not a design limitation. If we
> just disable thread pifdfd from waitid and pidfd_send_signal, I think
> it is ok to use it elsewhere.
>
> >
> > If it worries wrap around, a reference to the task
> > can be held when tid passed to the kernel at link
> > create time. This is similar to pid is passed to
> > the kernel at pidfd_open syscall. But in practice,
> > I think taking the reference during read() should
> > also fine. The race always exist anyway.
> >
> > Kumar, could you give more details about security
> > concerns? I am not sure about the tight relationship
> > between bpf_iter and security. bpf_iter just for
> > iterating kernel data structures.
> >
>
> There is no security 'concern' per se. But having an fd which is used
> to set up the iterator just gives a chance to a BPF LSM to easily
> isolate permissions to attach the iterator to a task represented by
> that fd. i.e. the fd is an object to which this permission can be
> bound, the fd can be passed around (to share the same permission with
> others but it is limited to the task corresponding to it), etc. The
> permission management is even easier and faster once you have a file
> local storage map (which I plan to send soon).
>

I like the idea of a file local storage map. I have several questions
in mind, but don't want to digress from the discussion under
Kui-Feng's patchset. It probably will be clear when seeing your
change. Please cc me when you send it out. thanks.

> So you could have two pidfds, one which allows the process to attach
> the iter to it, and one which doesn't, without relying on the task's
> capabilities, the checks can become more fine grained, and the
> original task can even drop its capabilities (because they are bound
> to the fd instead), which allows privilege separation.
>
> It becomes a bit difficult when kernel APIs take IDs because then you
> don't have any subject (other than the task) to draw the permissions
> from.
>
> But anyway, all of this was just a suggestion (which is why I
> solicited opinions in the original reply). Feel free to drop/ignore if
> it is too much of a hassle to support (i.e. it is understandable you
> wouldn't want to spend time extending pidfd_open for this).

On another thread, I was having a discussion with Tejun on FD vs ID
for cgroup_iter. I am in favor of ID in general, because it's
stateless and matches the info reported by bpf_link_info. This is nice
from the userspace's perspective.

Hao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 17:52                     ` Kumar Kartikeya Dwivedi
  2022-07-28 19:11                       ` Hao Luo
@ 2022-07-28 22:54                       ` Yonghong Song
  2022-07-29  9:10                         ` Christian Brauner
  1 sibling, 1 reply; 22+ messages in thread
From: Yonghong Song @ 2022-07-28 22:54 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Kui-Feng Lee, brauner, daniel, olsajiri, ast, Kernel Team, bpf, andrii



On 7/28/22 10:52 AM, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
>>> [...]
>>
>> There is one problem here. The current pidfd_open syscall
>> only supports thread-group leader, i.e., main thread, i.e.,
>> it won't support any non-main-thread tid's. Yes, thread-group
>> leader and other threads should share the same vma and files
>> most of the time, but it is still possible that different threads
>> in the same process may have different files which is why
>> in current task_iter.c we have:
>>                   *tid = pid_nr_ns(pid, ns);
>>                   task = get_pid_task(pid, PIDTYPE_PID);
>>                   if (!task) {
>>                           ++*tid;
>>                           goto retry;
>>                   } else if (skip_if_dup_files &&
>> !thread_group_leader(task) &&
>>                              task->files == task->group_leader->files) {
>>                           put_task_struct(task);
>>                           task = NULL;
>>                           ++*tid;
>>                           goto retry;
>>                   }
>>
>>
>> Each thread (tid) will have some fields different from
>> thread-group leader (tgid), e.g., comm and most (if not all)
>> scheduling related fields.
>>
>> So it would be good to support for each tid from the start
>> as it is not clear when pidfd_open will support non
>> thread-group leader.
> 
> I think this is just a missing feature, not a design limitation. If we
> just disable thread pidfd from waitid and pidfd_send_signal, I think
> it is ok to use it elsewhere.

Yes, I agree this is a missing feature. It is desirable
that the missing feature gets implemented in kernel or
at least promising work is recognized before we use pidfd
for this task structure.

> 
>>
>> If it worries wrap around, a reference to the task
>> can be held when tid passed to the kernel at link
>> create time. This is similar to pid is passed to
>> the kernel at pidfd_open syscall. But in practice,
>> I think taking the reference during read() should
>> also fine. The race always exist anyway.
>>
>> Kumar, could you give more details about security
>> concerns? I am not sure about the tight relationship
>> between bpf_iter and security. bpf_iter just for
>> iterating kernel data structures.
>>
> 
> There is no security 'concern' per se. But having an fd which is used
> to set up the iterator just gives a chance to a BPF LSM to easily
> isolate permissions to attach the iterator to a task represented by
> that fd. i.e. the fd is an object to which this permission can be
> bound, the fd can be passed around (to share the same permission with
> others but it is limited to the task corresponding to it), etc. The
> permission management is even easier and faster once you have a file
> local storage map (which I plan to send soon).

So you mean with fd, bpf_lsm could add a hook in bpf iterator
to enforce policies about whether a particular task can be visited
or not, right? In such cases, I think the policy should be
against a task pointer and a bpf pointer, which have enough information
for the policy to work.

In typical use cases, user space gets a pid (the process own pid or
another process id). Yes, we could create a pidfd with pidfd_open().
And this is pidfd can be manipulated with permissions, and passed
to the bpf iter. The current pidfd creation looks like below:

int pidfd_create(struct pid *pid, unsigned int flags)
{
         int fd;

         if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
                 return -EINVAL;

         if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
                 return -EINVAL;

         fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
                               flags | O_RDWR | O_CLOEXEC);
         if (fd < 0)
                 put_pid(pid);

         return fd;
}

I am not sure how security policy can be easily applied to such a
fd. Probably the user needs to further manipulate the fd with fcntl() and I
think most users probably won't do that.
most users probably won't do that.

The following are some bpf program lsm hooks:

#ifdef CONFIG_BPF_SYSCALL
LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux 
*aux)
#endif /* CONFIG_BPF_SYSCALL */

I think task_struct ptr and prog ptr should be enough for the
potential LSM hook.

> 
> So you could have two pidfds, one which allows the process to attach
> the iter to it, and one which doesn't, without relying on the task's
> capabilities, the checks can become more fine grained, and the
> original task can even drop its capabilities (because they are bound
> to the fd instead), which allows privilege separation.
> 
> It becomes a bit difficult when kernel APIs take IDs because then you
> don't have any subject (other than the task) to draw the permissions
> from.

Maybe I missed something from the security perspective. But from the
design perspective, I am okay with pidfd since it pushes the
responsibility of the pid -> task_struct conversion to user space.
Although unlikely, there is still a chance that, inside the kernel, the
tid -> task_struct conversion may actually use a wrapped-around tid...

But since pidfd_open misses tid support, I hesitate to use pidfd
for now.

Maybe Daniel or Alexei can comment as well?

> 
> But anyway, all of this was just a suggestion (which is why I
> solicited opinions in the original reply). Feel free to drop/ignore if
> it is too much of a hassle to support (i.e. it is understandable you
> wouldn't want to spend time extending pidfd_open for this).

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 22:54                       ` Yonghong Song
@ 2022-07-29  9:10                         ` Christian Brauner
  0 siblings, 0 replies; 22+ messages in thread
From: Christian Brauner @ 2022-07-29  9:10 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Kumar Kartikeya Dwivedi, Kui-Feng Lee, daniel, olsajiri, ast,
	Kernel Team, bpf, andrii

On Thu, Jul 28, 2022 at 03:54:01PM -0700, Yonghong Song wrote:
> 
> 
> On 7/28/22 10:52 AM, Kumar Kartikeya Dwivedi wrote:
> > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > > > [...]
> > > 
> > > There is one problem here. The current pidfd_open syscall
> > > only supports thread-group leader, i.e., main thread, i.e.,
> > > it won't support any non-main-thread tid's. Yes, thread-group
> > > leader and other threads should share the same vma and files
> > > in most of times, but it still possible different threads
> > > in the same process may have different files which is why
> > > in current task_iter.c we have:
> > >                   *tid = pid_nr_ns(pid, ns);
> > >                   task = get_pid_task(pid, PIDTYPE_PID);
> > >                   if (!task) {
> > >                           ++*tid;
> > >                           goto retry;
> > >                   } else if (skip_if_dup_files &&
> > > !thread_group_leader(task) &&
> > >                              task->files == task->group_leader->files) {
> > >                           put_task_struct(task);
> > >                           task = NULL;
> > >                           ++*tid;
> > >                           goto retry;
> > >                   }
> > > 
> > > 
> > > Each thread (tid) will have some fields different from
> > > thread-group leader (tgid), e.g., comm and most (if not all)
> > > scheduling related fields.
> > > 
> > > So it would be good to support for each tid from the start
> > > as it is not clear when pidfd_open will support non
> > > thread-group leader.
> > 
> > I think this is just a missing feature, not a design limitation. If we
> > just disable thread pidfd from waitid and pidfd_send_signal, I think
> > it is ok to use it elsewhere.
> 
> Yes, I agree this is a missing feature. It is desirable
> that the missing feature gets implemented in kernel or
> at least promising work is recognized before we use pidfd
> for this task structure.

When I did pidfd_{open,getfd,send_signal}, CLONE_PIDFD, and the waitid
support we simply didn't enable support for pidfd to refer to individual
threads because there was no immediate use-case for it and we had some
concerns that I can't remember off the top of my head. Plus, we were
quite happy that we got the pidfd stuff in so we limited the scope of
what we wanted to do in the first iteration.

But if there is a good use-case for this then by all means we should
enable pidfds to refer to individual threads and I'm happy to route that
upstream. But this needs to be solid work as that area of the kernel can
be interesting technically and otherwise...

There have been requests for this before already.

I've added a wrapper pidfd_get_task() a while ago that encodes the
PIDTYPE_TGID restriction. If you grep for it you should find all places
that rely on pidfds (process_mrelease() and whatnot). All these places
need to be able to deal with individual threads if you change that.

But note, the merge window is coming up.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
       [not found]                   ` <CAP01T74HRHapKDAfj104KNGnzCgNQSu_M5-KfEvGBNzLWNfd+Q@mail.gmail.com>
@ 2022-07-30  2:46                     ` Kui-Feng Lee
  0 siblings, 0 replies; 22+ messages in thread
From: Kui-Feng Lee @ 2022-07-30  2:46 UTC (permalink / raw)
  To: memxor
  Cc: Yonghong Song, brauner, daniel, olsajiri, ast, Kernel Team, bpf, andrii

On Thu, 2022-07-28 at 19:00 +0200, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 18:40, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com>
> > > wrote:
> > > > 
> > > > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi
> > > > wrote:
> > > > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com>
> > > > > wrote:
> > > > > > 
> > > > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi
> > > > > > wrote:
> > > > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee
> > > > > > > <kuifeng@fb.com>
> > > > > > > wrote:
> > > > > > > > 
> > > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng
> > > > > > > > > Lee
> > > > > > > > > wrote:
> > > > > > > > > > Allow creating an iterator that loops through
> > > > > > > > > > resources
> > > > > > > > > > of
> > > > > > > > > > one
> > > > > > > > > > task/thread.
> > > > > > > > > > 
> > > > > > > > > > People could only create iterators to loop through
> > > > > > > > > > all
> > > > > > > > > > resources of
> > > > > > > > > > files, vma, and tasks in the system, even though
> > > > > > > > > > they
> > > > > > > > > > were
> > > > > > > > > > interested
> > > > > > > > > > in only the resources of a specific task or
> > > > > > > > > > process.
> > > > > > > > > > Passing
> > > > > > > > > > the
> > > > > > > > > > additional parameters, people can now create an
> > > > > > > > > > iterator to
> > > > > > > > > > go
> > > > > > > > > > through all resources or only the resources of a
> > > > > > > > > > task.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > > > > ---
> > > > > > > > > >   include/linux/bpf.h            |  4 ++
> > > > > > > > > >   include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > > > > >   kernel/bpf/task_iter.c         | 81
> > > > > > > > > > +++++++++++++++++++++++++-
> > > > > > > > > > ----
> > > > > > > > > > ----
> > > > > > > > > >   tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > > > > >   4 files changed, 109 insertions(+), 22
> > > > > > > > > > deletions(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/include/linux/bpf.h
> > > > > > > > > > b/include/linux/bpf.h
> > > > > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > > > > --- a/include/linux/bpf.h
> > > > > > > > > > +++ b/include/linux/bpf.h
> > > > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const
> > > > > > > > > > char
> > > > > > > > > > __user
> > > > > > > > > > *pathname, int flags);
> > > > > > > > > > 
> > > > > > > > > >   struct bpf_iter_aux_info {
> > > > > > > > > >          struct bpf_map *map;
> > > > > > > > > > +       struct {
> > > > > > > > > > +               __u32   tid;
> > > > > > > > > 
> > > > > > > > > should be just u32 ?
> > > > > > > > 
> > > > > > > > Or, should change the following 'type' to __u8?
> > > > > > > 
> > > > > > > Would it be better to use a pidfd instead of a tid here?
> > > > > > > Unset
> > > > > > > pidfd
> > > > > > > would mean going over all tasks, and any fd > 0 implies
> > > > > > > attaching
> > > > > > > to
> > > > > > > a
> > > > > > > specific task (as is the convention in BPF land). Most of
> > > > > > > the
> > > > > > > new
> > > > > > > UAPIs working on processes are using pidfds (to work with
> > > > > > > a
> > > > > > > stable
> > > > > > > handle instead of a reusable ID).
> > > > > > > The iterator taking an fd also gives an opportunity to
> > > > > > > BPF
> > > > > > > LSMs
> > > > > > > to
> > > > > > > attach permissions/policies to it (once we have a file
> > > > > > > local
> > > > > > > storage
> > > > > > > map) e.g. whether creating a task iterator for that
> > > > > > > specific
> > > > > > > pidfd
> > > > > > > instance (backed by the struct file) would be allowed or
> > > > > > > not.
> > > > > > > You are using getpid in the selftest and keeping track of
> > > > > > > last_tgid
> > > > > > > in
> > > > > > > the iterator, so I guess you don't even need to extend
> > > > > > > pidfd_open
> > > > > > > to
> > > > > > > work on thread IDs right now for your use case (and
> > > > > > > fdtable
> > > > > > > and
> > > > > > > mm
> > > > > > > are
> > > > > > > shared for POSIX threads anyway, so for those two it
> > > > > > > won't
> > > > > > > make a
> > > > > > > difference).
> > > > > > > 
> > > > > > > What is your opinion?
> > > > > > 
> > > > > > Do you mean removed both tid and type, and replace them
> > > > > > with a
> > > > > > pidfd?
> > > > > > We can do that in uapi, struct bpf_link_info.  But, the
> > > > > > interal
> > > > > > types,
> > > > > > ex. bpf_iter_aux_info, still need to use tid or struct file
> > > > > > to
> > > > > > avoid
> > > > > > getting file from the per-process fdtable.  Is that what
> > > > > > you
> > > > > > mean?
> > > > > > 
> > > > > 
> > > > > Yes, just for the UAPI, it is similar to taking map_fd for
> > > > > map
> > > > > iter.
> > > > > In bpf_link_info we should report just the tid, just like map
> > > > > iter
> > > > > reports map_id.
> > > > 
> > > > It sounds good to me.
> > > > 
> > > > One thing I need a clarification. You mentioned that a fd > 0
> > > > implies
> > > > attaching to a specific task, however fd can be 0. So, it
> > > > should be
> > > > fd
> > > > > = 0. So, it forces the user to initialize the value of pidfd
> > > > > to -
> > > > > 1.
> > > > So, for convenience, we still need a field like 'type' to make
> > > > it
> > > > easy
> > > > to create iterators without a filter.
> > > > 
> > > 
> > > Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so
> > > it
> > > is fine to rely on that assumption. For e.g. even for map_fd,
> > > bpf_map_elem iterator considers fd 0 to be unset. Then you don't
> > > need
> > > the type field.
> > 
> > I just realize that pidfd may be meaningless for the bpf_link_info
> > returned by bpf_obj_get_info_by_fd() since the origin fd might be
> > closed already.  So, I will always set it a value of 0.
> > 
> 
> For bpf_link_info, we should only be returning the tid of the task it
> is attached to, you cannot report the pidfd in bpf_link_info
> correctly (as you already realised). By default this would be 0,
> which is also an invalid tid, but when pidfd is set it will be the
> tid of the task it is attached to, so it works well.


We have had a lot of discussions around whether to use tid or pidfd.
Kumar also mentioned removing 'type'.
However, I have a feeling that we need to keep 'type' in struct
bpf_link_info.  I can imagine that we may like to create iterators of
tasks in a cgroup, or with other parameters, in the future.  'type' will
help us to tell the type of a parameter.



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: Parameterize task iterators.
  2022-07-28 19:11                       ` Hao Luo
@ 2022-08-02 14:25                         ` Kumar Kartikeya Dwivedi
  0 siblings, 0 replies; 22+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2022-08-02 14:25 UTC (permalink / raw)
  To: Hao Luo
  Cc: Yonghong Song, Kui-Feng Lee, brauner, daniel, olsajiri, ast,
	Kernel Team, bpf, andrii

On Thu, 28 Jul 2022 at 21:11, Hao Luo <haoluo@google.com> wrote:
>
> Hi Kumar,
>
> On Thu, Jul 28, 2022 at 10:53 AM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > > > [...]
> > >
> > > There is one problem here. The current pidfd_open syscall
> > > only supports thread-group leader, i.e., main thread, i.e.,
> > > it won't support any non-main-thread tid's. Yes, thread-group
> > > leader and other threads should share the same vma and files
> > > in most of times, but it still possible different threads
> > > in the same process may have different files which is why
> > > in current task_iter.c we have:
> > >                  *tid = pid_nr_ns(pid, ns);
> > >                  task = get_pid_task(pid, PIDTYPE_PID);
> > >                  if (!task) {
> > >                          ++*tid;
> > >                          goto retry;
> > >                  } else if (skip_if_dup_files &&
> > > !thread_group_leader(task) &&
> > >                             task->files == task->group_leader->files) {
> > >                          put_task_struct(task);
> > >                          task = NULL;
> > >                          ++*tid;
> > >                          goto retry;
> > >                  }
> > >
> > >
> > > Each thread (tid) will have some fields different from
> > > thread-group leader (tgid), e.g., comm and most (if not all)
> > > scheduling related fields.
> > >
> > > So it would be good to support for each tid from the start
> > > as it is not clear when pidfd_open will support non
> > > thread-group leader.
> >
> > I think this is just a missing feature, not a design limitation. If we
> > just disable thread pidfd from waitid and pidfd_send_signal, I think
> > it is ok to use it elsewhere.
> >
> > >
> > > If it worries wrap around, a reference to the task
> > > can be held when tid passed to the kernel at link
> > > create time. This is similar to pid is passed to
> > > the kernel at pidfd_open syscall. But in practice,
> > > I think taking the reference during read() should
> > > also fine. The race always exist anyway.
> > >
> > > Kumar, could you give more details about security
> > > concerns? I am not sure about the tight relationship
> > > between bpf_iter and security. bpf_iter just for
> > > iterating kernel data structures.
> > >
> >
> > There is no security 'concern' per se. But having an fd which is used
> > to set up the iterator just gives a chance to a BPF LSM to easily
> > isolate permissions to attach the iterator to a task represented by
> > that fd. i.e. the fd is an object to which this permission can be
> > bound, the fd can be passed around (to share the same permission with
> > others but it is limited to the task corresponding to it), etc. The
> > permission management is even easier and faster once you have a file
> > local storage map (which I plan to send soon).
> >
>
> I like the idea of a file local storage map. I have several questions
> in mind, but don't want to digress from the discussion under
> Kui-Feng's patchset. It probably will be clear when seeing your
> change. Please cc me when you send it out. thanks.
>

Thanks for the interest! I'll cc you when I send it out.

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2022-08-02 14:26 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-26  5:17 [PATCH bpf-next 0/3] Parameterize task iterators Kui-Feng Lee
2022-07-26  5:17 ` [PATCH bpf-next 1/3] bpf: " Kui-Feng Lee
2022-07-26 12:13   ` Jiri Olsa
2022-07-26 13:19     ` Jiri Olsa
2022-07-27  6:39       ` Kui-Feng Lee
2022-07-27  6:56     ` Kui-Feng Lee
2022-07-27  8:19       ` Kumar Kartikeya Dwivedi
2022-07-28  5:25         ` Kui-Feng Lee
2022-07-28  8:47           ` Kumar Kartikeya Dwivedi
2022-07-28 15:16             ` Kui-Feng Lee
2022-07-28 16:22               ` Kumar Kartikeya Dwivedi
2022-07-28 16:40                 ` Kui-Feng Lee
2022-07-28 17:08                   ` Yonghong Song
2022-07-28 17:52                     ` Kumar Kartikeya Dwivedi
2022-07-28 19:11                       ` Hao Luo
2022-08-02 14:25                         ` Kumar Kartikeya Dwivedi
2022-07-28 22:54                       ` Yonghong Song
2022-07-29  9:10                         ` Christian Brauner
     [not found]                   ` <CAP01T74HRHapKDAfj104KNGnzCgNQSu_M5-KfEvGBNzLWNfd+Q@mail.gmail.com>
2022-07-30  2:46                     ` Kui-Feng Lee
2022-07-28 18:01                 ` Hao Luo
2022-07-26  5:17 ` [PATCH bpf-next 2/3] bpf: Handle bpf_link_info for the parameterized task BPF iterators Kui-Feng Lee
2022-07-26  5:17 ` [PATCH bpf-next 3/3] selftests/bpf: Test " Kui-Feng Lee

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.