All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yonghong Song <yhs@fb.com>
To: Kui-Feng Lee <kuifeng@fb.com>,
	bpf@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net,
	andrii@kernel.org, kernel-team@fb.com
Subject: Re: [PATCH bpf-next v5 1/3] bpf: Parameterize task iterators.
Date: Sat, 13 Aug 2022 15:17:07 -0700	[thread overview]
Message-ID: <0f5123dc-5334-7e23-e143-c82002762242@fb.com> (raw)
In-Reply-To: <20220811001654.1316689-2-kuifeng@fb.com>



On 8/10/22 5:16 PM, Kui-Feng Lee wrote:
> Allow creating an iterator that loops through resources of one task/thread.
> 
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
> 
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>   include/linux/bpf.h            |  29 ++++++++
>   include/uapi/linux/bpf.h       |   8 +++
>   kernel/bpf/task_iter.c         | 126 ++++++++++++++++++++++++++-------
>   tools/include/uapi/linux/bpf.h |   8 +++
>   4 files changed, 147 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 11950029284f..6bbe53d06faa 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>   	extern int bpf_iter_ ## target(args);			\
>   	int __init bpf_iter_ ## target(args) { return 0; }
>   
> +/*
> + * The task type of iterators.
> + *
> + * For BPF task iterators, they can be parameterized with various
> + * parameters to visit only some of tasks.
> + *
> + * BPF_TASK_ITER_ALL (default)
> + *	Iterate over resources of every task.
> + *
> + * BPF_TASK_ITER_TID
> + *	Iterate over resources of a task/tid.
> + *
> + * BPF_TASK_ITER_TGID
> + *	Iterate over reosurces of evevry task of a process / task group.
> + */
> +enum bpf_iter_task_type {
> +	BPF_TASK_ITER_ALL = 0,
> +	BPF_TASK_ITER_TID,
> +	BPF_TASK_ITER_TGID,
> +};
> +
>   struct bpf_iter_aux_info {
>   	struct bpf_map *map;
> +	struct {
> +		enum bpf_iter_task_type	type;
> +		union {
> +			u32 tid;
> +			u32 tgid;
> +			u32 pid_fd;
> +		};
> +	} task;
>   };
>   
>   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index ffcbf79a556b..6328aca0cf5c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -91,6 +91,14 @@ union bpf_iter_link_info {
>   	struct {
>   		__u32	map_fd;
>   	} map;
> +	/*
> +	 * Parameters of task iterators.
> +	 */

The comment can be put into one line.

> +	struct {
> +		__u32	tid;
> +		__u32	tgid;
> +		__u32	pid_fd;

The above is a max of kernel and user space terminologies.
tid/pid are user space concept and tgid is a kernel space
concept.

In bpf uapi header, we have

struct bpf_pidns_info {
         __u32 pid;
         __u32 tgid;
};

which uses kernel terminologies.

So I suggest the bpf_iter_link_info.task can also
use pure kernel terminology pid/tgid/tgid_fd here.

Alternative, using pure user space terminology
can be tid/pid/pid_fd but seems the kernel terminology
might be better since we already have precedence.


> +	} task;
>   };
>   
>   /* BPF syscall commands, see bpf(2) man-page for more details. */
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 8c921799def4..f2e21efe075d 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -12,6 +12,12 @@
>   
>   struct bpf_iter_seq_task_common {
>   	struct pid_namespace *ns;
> +	enum bpf_iter_task_type	type;
> +	union {
> +		u32 tid;
> +		u32 tgid;
> +		u32 pid_fd;
> +	};
>   };
>   
>   struct bpf_iter_seq_task_info {
> @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info {
>   	u32 tid;
>   };
>   
> -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
>   					     u32 *tid,
>   					     bool skip_if_dup_files)
>   {
>   	struct task_struct *task = NULL;
>   	struct pid *pid;
>   
> +	if (common->type == BPF_TASK_ITER_TID) {
> +		if (*tid && *tid != common->tid)
> +			return NULL;
> +		rcu_read_lock();
> +		pid = find_pid_ns(common->tid, common->ns);
> +		if (pid) {
> +			task = get_pid_task(pid, PIDTYPE_PID);
> +			*tid = common->tid;
> +		}
> +		rcu_read_unlock();
> +		return task;
> +	}
> +
>   	rcu_read_lock();
>   retry:
> -	pid = find_ge_pid(*tid, ns);
> +	pid = find_ge_pid(*tid, common->ns);
>   	if (pid) {
> -		*tid = pid_nr_ns(pid, ns);
> +		*tid = pid_nr_ns(pid, common->ns);
>   		task = get_pid_task(pid, PIDTYPE_PID);
> +

This extra line is unnecessary.

>   		if (!task) {
>   			++*tid;
>   			goto retry;
> -		} else if (skip_if_dup_files && !thread_group_leader(task) &&
> -			   task->files == task->group_leader->files) {
> +		} else if ((skip_if_dup_files && !thread_group_leader(task) &&
> +			    task->files == task->group_leader->files) ||
> +			   (common->type == BPF_TASK_ITER_TGID &&
> +			    __task_pid_nr_ns(task, PIDTYPE_TGID, common->ns) != common->tgid)) {
>   			put_task_struct(task);
>   			task = NULL;
>   			++*tid;
> @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
>   	struct bpf_iter_seq_task_info *info = seq->private;
>   	struct task_struct *task;
>   
> -	task = task_seq_get_next(info->common.ns, &info->tid, false);
> +	task = task_seq_get_next(&info->common, &info->tid, false);
> +

Extra line?

>   	if (!task)
>   		return NULL;
>   
> @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>   	++*pos;
>   	++info->tid;
>   	put_task_struct((struct task_struct *)v);
> -	task = task_seq_get_next(info->common.ns, &info->tid, false);
> +

Extra line?

> +	task = task_seq_get_next(&info->common, &info->tid, false);
>   	if (!task)
>   		return NULL;
>   
> @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file *seq, void *v)
>   		put_task_struct((struct task_struct *)v);
>   }
>   
> +static int bpf_iter_attach_task(struct bpf_prog *prog,
> +				union bpf_iter_link_info *linfo,
> +				struct bpf_iter_aux_info *aux)
> +{
> +	unsigned int flags;
> +	struct pid_namespace *ns;
> +	struct pid *pid;
> +	pid_t tgid;

Follow reverse chrismas tree style?

> +
> +	if (linfo->task.tid != 0) {
> +		aux->task.type = BPF_TASK_ITER_TID;
> +		aux->task.tid = linfo->task.tid;
> +	} else if (linfo->task.tgid != 0) {
> +		aux->task.type = BPF_TASK_ITER_TGID;
> +		aux->task.tgid = linfo->task.tgid;
> +	} else if (linfo->task.pid_fd != 0) {
> +		aux->task.type = BPF_TASK_ITER_TGID;
> +		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
> +		if (IS_ERR(pid))
> +			return PTR_ERR(pid);
> +
> +		ns = task_active_pid_ns(current);
> +		if (IS_ERR(ns))
> +			return PTR_ERR(ns);
> +
> +		tgid = pid_nr_ns(pid, ns);
> +		if (tgid <= 0)
> +			return -EINVAL;

Is it possible that tgid <= 0? I think no, so
the above two lines are unnecessary.

> +
> +		aux->task.tgid = tgid;

We leaks the reference count for 'pid' here.
We need to add
		put_pid(pid);
to release the reference for pid.
	
> +	} else {
> +		aux->task.type = BPF_TASK_ITER_ALL;
> +	}

What will happen if two or all of task.tid, task.tgid and
task.pid_fd non-zero? Should we fail here?

> +
> +	return 0;
> +}
> +
>   static const struct seq_operations task_seq_ops = {
>   	.start	= task_seq_start,
>   	.next	= task_seq_next,
> @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info {
>   static struct file *
[...]
>   
> @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op {
>   static struct vm_area_struct *
>   task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>   {
> -	struct pid_namespace *ns = info->common.ns;
>   	enum bpf_task_vma_iter_find_op op;
>   	struct vm_area_struct *curr_vma;
>   	struct task_struct *curr_task;
> -	u32 curr_tid = info->tid;
> +	u32 saved_tid = info->tid;
>   
>   	/* If this function returns a non-NULL vma, it holds a reference to
>   	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
> @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>   		}
>   	} else {
>   again:
> -		curr_task = task_seq_get_next(ns, &curr_tid, true);
> +		curr_task = task_seq_get_next(&info->common, &info->tid, true);
>   		if (!curr_task) {
> -			info->tid = curr_tid + 1;
> +			info->tid++;
>   			goto finish;
>   		}
>   
> -		if (curr_tid != info->tid) {
> -			info->tid = curr_tid;
> +		if (saved_tid != info->tid) {
>   			/* new task, process the first vma */
>   			op = task_vma_iter_first_vma;
>   		} else {
> @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>   	return curr_vma;
>   
>   next_task:
> +	if (info->common.type == BPF_TASK_ITER_TID)
> +		goto finish;
> +
>   	put_task_struct(curr_task);
>   	info->task = NULL;
> -	curr_tid++;
> +	info->tid++;

saved_tid = ++info->tid?

>   	goto again;
>   
>   finish:
[...]

  reply	other threads:[~2022-08-13 22:17 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-11  0:16 [PATCH bpf-next v5 0/3] Parameterize task iterators Kui-Feng Lee
2022-08-11  0:16 ` [PATCH bpf-next v5 1/3] bpf: " Kui-Feng Lee
2022-08-13 22:17   ` Yonghong Song [this message]
2022-08-15  1:01     ` Alexei Starovoitov
2022-08-16  4:42     ` Andrii Nakryiko
2022-08-18  3:40       ` Yonghong Song
2022-08-16  5:25     ` Andrii Nakryiko
2022-08-18  4:31       ` Yonghong Song
2022-08-25 17:50         ` Andrii Nakryiko
2022-08-16 17:00     ` Kui-Feng Lee
2022-08-14 20:24   ` Jiri Olsa
2022-08-16 17:21     ` Kui-Feng Lee
2022-08-15 23:08   ` Hao Luo
2022-08-16 19:11     ` Kui-Feng Lee
2022-08-16  5:02   ` Andrii Nakryiko
2022-08-16 18:45     ` Kui-Feng Lee
2022-08-11  0:16 ` [PATCH bpf-next v5 2/3] bpf: Handle bpf_link_info for the parameterized task BPF iterators Kui-Feng Lee
2022-08-13 22:23   ` Yonghong Song
2022-08-11  0:16 ` [PATCH bpf-next v5 3/3] selftests/bpf: Test " Kui-Feng Lee
2022-08-13 22:50   ` Yonghong Song
2022-08-18 17:24     ` Kui-Feng Lee
2022-08-16  5:15   ` Andrii Nakryiko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0f5123dc-5334-7e23-e143-c82002762242@fb.com \
    --to=yhs@fb.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    --cc=kuifeng@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.