Re: [PATCH bpf-next v5 1/3] bpf: Parameterize task iterators.

From: Hao Luo <haoluo@google.com>
To: Kui-Feng Lee <kuifeng@fb.com>
Cc: bpf@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net,
	andrii@kernel.org, kernel-team@fb.com, yhs@fb.com
Subject: Re: [PATCH bpf-next v5 1/3] bpf: Parameterize task iterators.
Date: Mon, 15 Aug 2022 16:08:12 -0700	[thread overview]
Message-ID: <CA+khW7jRFFLsF=th2WKi7ryXYJzG4LcgJSLoLxjhnAsObLkC_Q@mail.gmail.com> (raw)
In-Reply-To: <20220811001654.1316689-2-kuifeng@fb.com>

Hi Kui-Feng,

On Wed, Aug 10, 2022 at 5:17 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> Allow creating an iterator that loops through resources of one task/thread.
>
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>  include/linux/bpf.h            |  29 ++++++++
>  include/uapi/linux/bpf.h       |   8 +++
>  kernel/bpf/task_iter.c         | 126 ++++++++++++++++++++++++++-------
>  tools/include/uapi/linux/bpf.h |   8 +++
>  4 files changed, 147 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 11950029284f..6bbe53d06faa 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>         extern int bpf_iter_ ## target(args);                   \
>         int __init bpf_iter_ ## target(args) { return 0; }
>
> +/*
> + * The task type of iterators.
> + *
> + * For BPF task iterators, they can be parameterized with various
> + * parameters to visit only some of tasks.
> + *
> + * BPF_TASK_ITER_ALL (default)
> + *     Iterate over resources of every task.
> + *
> + * BPF_TASK_ITER_TID
> + *     Iterate over resources of a task/tid.
> + *
> + * BPF_TASK_ITER_TGID
> + *     Iterate over reosurces of evevry task of a process / task group.

typos: resources and every.

> + */
> +enum bpf_iter_task_type {
> +       BPF_TASK_ITER_ALL = 0,
> +       BPF_TASK_ITER_TID,
> +       BPF_TASK_ITER_TGID,
> +};
> +
>  struct bpf_iter_aux_info {
>         struct bpf_map *map;
> +       struct {
> +               enum bpf_iter_task_type type;
> +               union {
> +                       u32 tid;
> +                       u32 tgid;
> +                       u32 pid_fd;
> +               };
> +       } task;
>  };
>
>  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index ffcbf79a556b..6328aca0cf5c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -91,6 +91,14 @@ union bpf_iter_link_info {
>         struct {
>                 __u32   map_fd;
>         } map;
> +       /*
> +        * Parameters of task iterators.
> +        */

We could remove this particular comment. It is kind of obvious.

> +       struct {
> +               __u32   tid;
> +               __u32   tgid;
> +               __u32   pid_fd;
> +       } task;
>  };
>
>  /* BPF syscall commands, see bpf(2) man-page for more details. */
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 8c921799def4..f2e21efe075d 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -12,6 +12,12 @@
>
>  struct bpf_iter_seq_task_common {
>         struct pid_namespace *ns;
> +       enum bpf_iter_task_type type;
> +       union {
> +               u32 tid;
> +               u32 tgid;
> +               u32 pid_fd;
> +       };
>  };
>
>  struct bpf_iter_seq_task_info {
> @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info {
>         u32 tid;
>  };
>
> -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
>                                              u32 *tid,
>                                              bool skip_if_dup_files)
>  {
>         struct task_struct *task = NULL;
>         struct pid *pid;
>
> +       if (common->type == BPF_TASK_ITER_TID) {
> +               if (*tid && *tid != common->tid)
> +                       return NULL;
> +               rcu_read_lock();
> +               pid = find_pid_ns(common->tid, common->ns);
> +               if (pid) {
> +                       task = get_pid_task(pid, PIDTYPE_PID);
> +                       *tid = common->tid;
> +               }
> +               rcu_read_unlock();

nit: this is ok. But I think the commonly used pattern (e.g. proc_pid_lookup) is

        rcu_read_lock();
        task = find_task_by_pid_ns(tid, ns);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

> +               return task;
> +       }
> +
>         rcu_read_lock();
>  retry:
> -       pid = find_ge_pid(*tid, ns);
> +       pid = find_ge_pid(*tid, common->ns);
>         if (pid) {
> -               *tid = pid_nr_ns(pid, ns);
> +               *tid = pid_nr_ns(pid, common->ns);
>                 task = get_pid_task(pid, PIDTYPE_PID);
> +
>                 if (!task) {
>                         ++*tid;
>                         goto retry;
> -               } else if (skip_if_dup_files && !thread_group_leader(task) &&
> -                          task->files == task->group_leader->files) {
> +               } else if ((skip_if_dup_files && !thread_group_leader(task) &&
> +                           task->files == task->group_leader->files) ||
> +                          (common->type == BPF_TASK_ITER_TGID &&
> +                           __task_pid_nr_ns(task, PIDTYPE_TGID, common->ns) != common->tgid)) {

Use task_tgid_nr_ns instead of __task_pid_nr_ns?

>                         put_task_struct(task);
>                         task = NULL;
>                         ++*tid;
> @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
>         struct bpf_iter_seq_task_info *info = seq->private;
>         struct task_struct *task;
>
> -       task = task_seq_get_next(info->common.ns, &info->tid, false);
> +       task = task_seq_get_next(&info->common, &info->tid, false);
> +
>         if (!task)
>                 return NULL;
>
> @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>         ++*pos;
>         ++info->tid;
>         put_task_struct((struct task_struct *)v);
> -       task = task_seq_get_next(info->common.ns, &info->tid, false);
> +
> +       task = task_seq_get_next(&info->common, &info->tid, false);
>         if (!task)
>                 return NULL;
>
> @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file *seq, void *v)
>                 put_task_struct((struct task_struct *)v);
>  }
>
> +static int bpf_iter_attach_task(struct bpf_prog *prog,
> +                               union bpf_iter_link_info *linfo,
> +                               struct bpf_iter_aux_info *aux)
> +{
> +       unsigned int flags;
> +       struct pid_namespace *ns;
> +       struct pid *pid;
> +       pid_t tgid;
> +
> +       if (linfo->task.tid != 0) {
> +               aux->task.type = BPF_TASK_ITER_TID;
> +               aux->task.tid = linfo->task.tid;
> +       } else if (linfo->task.tgid != 0) {
> +               aux->task.type = BPF_TASK_ITER_TGID;
> +               aux->task.tgid = linfo->task.tgid;
> +       } else if (linfo->task.pid_fd != 0) {
> +               aux->task.type = BPF_TASK_ITER_TGID;
> +               pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
> +               if (IS_ERR(pid))
> +                       return PTR_ERR(pid);
> +
> +               ns = task_active_pid_ns(current);
> +               if (IS_ERR(ns))
> +                       return PTR_ERR(ns);
> +
> +               tgid = pid_nr_ns(pid, ns);
> +               if (tgid <= 0)
> +                       return -EINVAL;

Is this just pid_vnr?

> +
> +               aux->task.tgid = tgid;
> +       } else {
> +               aux->task.type = BPF_TASK_ITER_ALL;
> +       }

The same question as Yonghong has. Do we need to enforce that at most
one of {tid, tgid, pid_fd} is non-zero?

> +
> +       return 0;
> +}
> +
>  static const struct seq_operations task_seq_ops = {
>         .start  = task_seq_start,
>         .next   = task_seq_next,
> @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info {
>  static struct file *
>  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
>  {
> -       struct pid_namespace *ns = info->common.ns;
> -       u32 curr_tid = info->tid;
> +       u32 saved_tid = info->tid;
>         struct task_struct *curr_task;
>         unsigned int curr_fd = info->fd;
>
> @@ -151,21 +211,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
>                 curr_task = info->task;
>                 curr_fd = info->fd;
>         } else {
> -                curr_task = task_seq_get_next(ns, &curr_tid, true);
> +               curr_task = task_seq_get_next(&info->common, &info->tid, true);
>                  if (!curr_task) {
>                          info->task = NULL;
> -                        info->tid = curr_tid;
>                          return NULL;
>                  }
>
> -                /* set info->task and info->tid */
> +               /* set info->task */
>                 info->task = curr_task;
> -               if (curr_tid == info->tid) {
> +               if (saved_tid == info->tid)
>                         curr_fd = info->fd;
> -               } else {
> -                       info->tid = curr_tid;
> +               else
>                         curr_fd = 0;
> -               }
>         }
>
>         rcu_read_lock();
> @@ -186,9 +243,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
>         /* the current task is done, go to the next task */
>         rcu_read_unlock();
>         put_task_struct(curr_task);
> +
> +       if (info->common.type == BPF_TASK_ITER_TID) {
> +               info->task = NULL;
> +               return NULL;
> +       }
> +

Do we need to set info->fd to 0? I am not sure if the caller reads
info->fd anywhere. I think it would be good to do some refactoring on
task_file_seq_get_next().

>         info->task = NULL;
>         info->fd = 0;
> -       curr_tid = ++(info->tid);
> +       saved_tid = ++(info->tid);
>         goto again;
>  }
>
> @@ -269,6 +332,17 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
>         struct bpf_iter_seq_task_common *common = priv_data;
>
>         common->ns = get_pid_ns(task_active_pid_ns(current));
> +       common->type = aux->task.type;
> +       switch (common->type) {
> +       case BPF_TASK_ITER_TID:
> +               common->tid = aux->task.tid;
> +               break;
> +       case BPF_TASK_ITER_TGID:
> +               common->tgid = aux->task.tgid;
> +               break;
> +       default:

very nit: IMHO we could place a warning here.

> +               break;
> +       }
>         return 0;
>  }
>
> @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op {
>  static struct vm_area_struct *
>  task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>  {
> -       struct pid_namespace *ns = info->common.ns;
>         enum bpf_task_vma_iter_find_op op;
>         struct vm_area_struct *curr_vma;
>         struct task_struct *curr_task;
> -       u32 curr_tid = info->tid;
> +       u32 saved_tid = info->tid;
>

Why do we need to directly operate on info->tid while other task iters
(e.g. vma_iter) uses curr_tid? IMHO, prefer staying using curr_tid if
possible, for two reasons:
 - consistent with other iters.
 - decouple refactoring changes from the changes that introduce new features

>         /* If this function returns a non-NULL vma, it holds a reference to
>          * the task_struct, and holds read lock on vma->mm->mmap_lock.
> @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>                 }
>         } else {
>  again:
> -               curr_task = task_seq_get_next(ns, &curr_tid, true);
> +               curr_task = task_seq_get_next(&info->common, &info->tid, true);
>                 if (!curr_task) {
> -                       info->tid = curr_tid + 1;
> +                       info->tid++;
>                         goto finish;
>                 }
>
> -               if (curr_tid != info->tid) {
> -                       info->tid = curr_tid;
> +               if (saved_tid != info->tid) {
>                         /* new task, process the first vma */
>                         op = task_vma_iter_first_vma;
>                 } else {
> @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
>         return curr_vma;
>
>  next_task:
> +       if (info->common.type == BPF_TASK_ITER_TID)
> +               goto finish;
> +
>         put_task_struct(curr_task);
>         info->task = NULL;
> -       curr_tid++;
> +       info->tid++;
>         goto again;
>
>  finish:
> @@ -533,6 +608,7 @@ static const struct bpf_iter_seq_info task_seq_info = {
>
>  static struct bpf_iter_reg task_reg_info = {
>         .target                 = "task",
> +       .attach_target          = bpf_iter_attach_task,
>         .feature                = BPF_ITER_RESCHED,
>         .ctx_arg_info_size      = 1,
>         .ctx_arg_info           = {
> @@ -551,6 +627,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
>
>  static struct bpf_iter_reg task_file_reg_info = {
>         .target                 = "task_file",
> +       .attach_target          = bpf_iter_attach_task,
>         .feature                = BPF_ITER_RESCHED,
>         .ctx_arg_info_size      = 2,
>         .ctx_arg_info           = {
> @@ -571,6 +648,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
>
>  static struct bpf_iter_reg task_vma_reg_info = {
>         .target                 = "task_vma",
> +       .attach_target          = bpf_iter_attach_task,
>         .feature                = BPF_ITER_RESCHED,
>         .ctx_arg_info_size      = 2,
>         .ctx_arg_info           = {
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index ffcbf79a556b..6328aca0cf5c 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -91,6 +91,14 @@ union bpf_iter_link_info {
>         struct {
>                 __u32   map_fd;
>         } map;
> +       /*
> +        * Parameters of task iterators.
> +        */
> +       struct {
> +               __u32   tid;
> +               __u32   tgid;
> +               __u32   pid_fd;
> +       } task;
>  };
>
>  /* BPF syscall commands, see bpf(2) man-page for more details. */
> --
> 2.30.2
>