Re: [PATCH bpf-next v5 1/3] bpf: Parameterize task iterators.

From: Yonghong Song <yhs@fb.com>
To: Kui-Feng Lee <kuifeng@fb.com>,
	bpf@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net,
	andrii@kernel.org, kernel-team@fb.com
Subject: Re: [PATCH bpf-next v5 1/3] bpf: Parameterize task iterators.
Date: Sat, 13 Aug 2022 15:17:07 -0700	[thread overview]
Message-ID: <0f5123dc-5334-7e23-e143-c82002762242@fb.com> (raw)
In-Reply-To: <20220811001654.1316689-2-kuifeng@fb.com>

On 8/10/22 5:16 PM, Kui-Feng Lee wrote:
> Allow creating an iterator that loops through resources of one task/thread.
> 
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
> 
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>   include/linux/bpf.h            |  29 ++++++++
>   include/uapi/linux/bpf.h       |   8 +++
>   kernel/bpf/task_iter.c         | 126 ++++++++++++++++++++++++++-------
>   tools/include/uapi/linux/bpf.h |   8 +++
>   4 files changed, 147 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 11950029284f..6bbe53d06faa 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>   	extern int bpf_iter_ ## target(args);			\
>   	int __init bpf_iter_ ## target(args) { return 0; }
>   
> +/*
> + * The task type of iterators.
> + *
> + * For BPF task iterators, they can be parameterized with various
> + * parameters to visit only some of tasks.
> + *
> + * BPF_TASK_ITER_ALL (default)
> + *	Iterate over resources of every task.
> + *
> + * BPF_TASK_ITER_TID
> + *	Iterate over resources of a task/tid.
> + *
> + * BPF_TASK_ITER_TGID
> + *	Iterate over reosurces of evevry task of a process / task group.
> + */
> +enum bpf_iter_task_type {
> +	BPF_TASK_ITER_ALL = 0,
> +	BPF_TASK_ITER_TID,
> +	BPF_TASK_ITER_TGID,
> +};
> +
>   struct bpf_iter_aux_info {
>   	struct bpf_map *map;
> +	struct {
> +		enum bpf_iter_task_type	type;
> +		union {
> +			u32 tid;
> +			u32 tgid;
> +			u32 pid_fd;
> +		};
> +	} task;
>   };
>   
>   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index ffcbf79a556b..6328aca0cf5c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -91,6 +91,14 @@ union bpf_iter_link_info {
>   	struct {
>   		__u32	map_fd;
>   	} map;
> +	/*
> +	 * Parameters of task iterators.
> +	 */

The comment can be put into one line.

> +	struct {
> +		__u32	tid;
> +		__u32	tgid;
> +		__u32	pid_fd;

The above is a mix of kernel and user space terminologies.
tid/pid are user space concepts and tgid is a kernel space
concept.

In bpf uapi header, we have

struct bpf_pidns_info {
         __u32 pid;
         __u32 tgid;
};

which uses kernel terminologies.

So I suggest the bpf_iter_link_info.task can also
use pure kernel terminology pid/tgid/tgid_fd here.

Alternatively, we could use pure user space terminology,
i.e. tid/pid/pid_fd, but the kernel terminology seems
better since we already have a precedent.

> +	} task;
>   };
>   
>   /* BPF syscall commands, see bpf(2) man-page for more details. */
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 8c921799def4..f2e21efe075d 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -12,6 +12,12 @@
>   
>   struct bpf_iter_seq_task_common {
>   	struct pid_namespace *ns;
> +	enum bpf_iter_task_type	type;
> +	union {
> +		u32 tid;
> +		u32 tgid;
> +		u32 pid_fd;
> +	};
>   };
>   
>   struct bpf_iter_seq_task_info {
> @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info {
>   	u32 tid;
>   };
>   
> -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
>   					     u32 *tid,
>   					     bool skip_if_dup_files)
>   {
>   	struct task_struct *task = NULL;
>   	struct pid *pid;
>   
> +	if (common->type == BPF_TASK_ITER_TID) {
> +		if (*tid && *tid != common->tid)
> +			return NULL;
> +		rcu_read_lock();
> +		pid = find_pid_ns(common->tid, common->ns);
> +		if (pid) {
> +			task = get_pid_task(pid, PIDTYPE_PID);
> +			*tid = common->tid;
> +		}
> +		rcu_read_unlock();
> +		return task;
> +	}
> +
>   	rcu_read_lock();
>   retry:
> -	pid = find_ge_pid(*tid, ns);
> +	pid = find_ge_pid(*tid, common->ns);
>   	if (pid) {
> -		*tid = pid_nr_ns(pid, ns);
> +		*tid = pid_nr_ns(pid, common->ns);
>   		task = get_pid_task(pid, PIDTYPE_PID);
> +

This extra line is unnecessary.

>   		if (!task) {
>   			++*tid;
>   			goto retry;
> -		} else if (skip_if_dup_files && !thread_group_leader(task) &&
> -			   task->files == task->group_leader->files) {
> +		} else if ((skip_if_dup_files && !thread_group_leader(task) &&
> +			    task->files == task->group_leader->files) ||
> +			   (common->type == BPF_TASK_ITER_TGID &&
> +			    __task_pid_nr_ns(task, PIDTYPE_TGID, common->ns) != common->tgid)) {
>   			put_task_struct(task);
>   			task = NULL;
>   			++*tid;
> @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
>   	struct bpf_iter_seq_task_info *info = seq->private;
>   	struct task_struct *task;
>   
> -	task = task_seq_get_next(info->common.ns, &info->tid, false);
> +	task = task_seq_get_next(&info->common, &info->tid, false);
> +

Extra line?

>   	if (!task)
>   		return NULL;
>   
> @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>   	++*pos;
>   	++info->tid;
>   	put_task_struct((struct task_struct *)v);
> -	task = task_seq_get_next(info->common.ns, &info->tid, false);
> +

Extra line?

> +	task = task_seq_get_next(&info->common, &info->tid, false);
>   	if (!task)
>   		return NULL;
>   
> @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file *seq, void *v)
>   		put_task_struct((struct task_struct *)v);
>   }
>   
> +static int bpf_iter_attach_task(struct bpf_prog *prog,
> +				union bpf_iter_link_info *linfo,
> +				struct bpf_iter_aux_info *aux)
> +{
> +	unsigned int flags;
> +	struct pid_namespace *ns;
> +	struct pid *pid;
> +	pid_t tgid;

Follow reverse Christmas tree style?

> +
> +	if (linfo->task.tid != 0) {
> +		aux->task.type = BPF_TASK_ITER_TID;
> +		aux->task.tid = linfo->task.tid;
> +	} else if (linfo->task.tgid != 0) {
> +		aux->task.type = BPF_TASK_ITER_TGID;
> +		aux->task.tgid = linfo->task.tgid;
> +	} else if (linfo->task.pid_fd != 0) {
> +		aux->task.type = BPF_TASK_ITER_TGID;
> +		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
> +		if (IS_ERR(pid))
> +			return PTR_ERR(pid);
> +
> +		ns = task_active_pid_ns(current);
> +		if (IS_ERR(ns))
> +			return PTR_ERR(ns);
> +
> +		tgid = pid_nr_ns(pid, ns);
> +		if (tgid <= 0)
> +			return -EINVAL;

Is it possible that tgid <= 0? I think no, so
the above two lines are unnecessary.

> +
> +		aux->task.tgid = tgid;

We leak the reference count for 'pid' here.
We need to add
		put_pid(pid);
to release the reference for pid.

> +	} else {
> +		aux->task.type = BPF_TASK_ITER_ALL;
> +	}

What will happen if two or all of task.tid, task.tgid and
task.pid_fd are non-zero? Should we fail here?

> +
> +	return 0;
> +}
> +
>   static const struct seq_operations task_seq_ops = {
>   	.start	= task_seq_start,
>   	.next	= task_seq_next,
> @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info {
>   static struct file *
[...]
>   
> @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op {
>   static struct vm_area_struct *
>   task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>   {
> -	struct pid_namespace *ns = info->common.ns;
>   	enum bpf_task_vma_iter_find_op op;
>   	struct vm_area_struct *curr_vma;
>   	struct task_struct *curr_task;
> -	u32 curr_tid = info->tid;
> +	u32 saved_tid = info->tid;
>   
>   	/* If this function returns a non-NULL vma, it holds a reference to
>   	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
> @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>   		}
>   	} else {
>   again:
> -		curr_task = task_seq_get_next(ns, &curr_tid, true);
> +		curr_task = task_seq_get_next(&info->common, &info->tid, true);
>   		if (!curr_task) {
> -			info->tid = curr_tid + 1;
> +			info->tid++;
>   			goto finish;
>   		}
>   
> -		if (curr_tid != info->tid) {
> -			info->tid = curr_tid;
> +		if (saved_tid != info->tid) {
>   			/* new task, process the first vma */
>   			op = task_vma_iter_first_vma;
>   		} else {
> @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>   	return curr_vma;
>   
>   next_task:
> +	if (info->common.type == BPF_TASK_ITER_TID)
> +		goto finish;
> +
>   	put_task_struct(curr_task);
>   	info->task = NULL;
> -	curr_tid++;
> +	info->tid++;

saved_tid = ++info->tid?

>   	goto again;
>   
>   finish:
[...]