linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling
@ 2023-06-30  0:56 Suren Baghdasaryan
  2023-06-30  1:04 ` Suren Baghdasaryan
  2023-07-06  8:16 ` Peter Zijlstra
  0 siblings, 2 replies; 4+ messages in thread
From: Suren Baghdasaryan @ 2023-06-30  0:56 UTC (permalink / raw)
  To: peterz
  Cc: gregkh, tj, lujialin4, lizefan.x, hannes, mingo, ebiggers, oleg,
	akpm, viro, brauner, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, cgroups, linux-fsdevel, kernel-team, surenb

Destroying psi trigger in cgroup_file_release causes UAF issues when
a cgroup is removed from under a polling process. This is happening
because cgroup removal causes a call to cgroup_file_release while the
actual file is still alive. Destroying the trigger at this point would
also destroy its waitqueue head and if there is still a polling process
on that file accessing the waitqueue, it will step on the freed pointer:

do_select
  vfs_poll
                           do_rmdir
                             cgroup_rmdir
                               kernfs_drain_open_files
                                 cgroup_file_release
                                   cgroup_pressure_release
                                     psi_trigger_destroy
                                       wake_up_pollfree(&t->event_wait)
// vfs_poll is unblocked
                                       synchronize_rcu
                                       kfree(t)
  poll_freewait -> UAF access to the trigger's waitqueue head

Patch [1] fixed this issue for epoll() case using wake_up_pollfree(),
however the same issue exists for synchronous poll() case.
The root cause of this issue is that the lifecycles of the psi trigger's
waitqueue and of the file associated with the trigger are different. Fix
this by using kernfs_generic_poll function when polling on cgroup-specific
psi triggers. It internally uses kernfs_open_node->poll waitqueue head
with its lifecycle tied to the file's lifecycle. This also renders the
fix in [1] obsolete, so revert it.

[1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()")

Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: Lu Jialin <lujialin4@huawei.com>
Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/psi.h       |  5 +++--
 include/linux/psi_types.h |  3 +++
 kernel/cgroup/cgroup.c    |  2 +-
 kernel/sched/psi.c        | 29 +++++++++++++++++++++--------
 4 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index ab26200c2803..e0745873e3f2 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -23,8 +23,9 @@ void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
-struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, enum psi_res res, struct file *file);
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+				       enum psi_res res, struct file *file,
+				       struct kernfs_open_file *of);
 void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 040c089581c6..f1fd3a8044e0 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -137,6 +137,9 @@ struct psi_trigger {
 	/* Wait queue for polling */
 	wait_queue_head_t event_wait;
 
+	/* Kernfs file for cgroup triggers */
+	struct kernfs_open_file *of;
+
 	/* Pending event flag */
 	int event;
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index bfe3cd8ccf36..f55a40db065f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
 	}
 
 	psi = cgroup_psi(cgrp);
-	new = psi_trigger_create(psi, buf, res, of->file);
+	new = psi_trigger_create(psi, buf, res, of->file, of);
 	if (IS_ERR(new)) {
 		cgroup_put(cgrp);
 		return PTR_ERR(new);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 81fca77397f6..9bb3f2b3ccfc 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
 			continue;
 
 		/* Generate an event */
-		if (cmpxchg(&t->event, 0, 1) == 0)
-			wake_up_interruptible(&t->event_wait);
+		if (cmpxchg(&t->event, 0, 1) == 0) {
+			if (t->of)
+				kernfs_notify(t->of->kn);
+			else
+				wake_up_interruptible(&t->event_wait);
+		}
 		t->last_event_time = now;
 		/* Reset threshold breach flag once event got generated */
 		t->pending_event = false;
@@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	return 0;
 }
 
-struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, enum psi_res res, struct file *file)
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+				       enum psi_res res, struct file *file,
+				       struct kernfs_open_file *of)
 {
 	struct psi_trigger *t;
 	enum psi_states state;
@@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 
 	t->event = 0;
 	t->last_event_time = 0;
-	init_waitqueue_head(&t->event_wait);
+	t->of = of;
+	if (!of)
+		init_waitqueue_head(&t->event_wait);
 	t->pending_event = false;
 	t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
 
@@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t)
 	 * being accessed later. Can happen if cgroup is deleted from under a
 	 * polling process.
 	 */
-	wake_up_pollfree(&t->event_wait);
+	if (t->of)
+		kernfs_notify(t->of->kn);
+	else
+		wake_up_interruptible(&t->event_wait);
 
 	if (t->aggregator == PSI_AVGS) {
 		mutex_lock(&group->avgs_lock);
@@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
 	if (!t)
 		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
 
-	poll_wait(file, &t->event_wait, wait);
+	if (t->of)
+		kernfs_generic_poll(t->of, wait);
+	else
+		poll_wait(file, &t->event_wait, wait);
 
 	if (cmpxchg(&t->event, 1, 0) == 1)
 		ret |= EPOLLPRI;
@@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 		return -EBUSY;
 	}
 
-	new = psi_trigger_create(&psi_system, buf, res, file);
+	new = psi_trigger_create(&psi_system, buf, res, file, NULL);
 	if (IS_ERR(new)) {
 		mutex_unlock(&seq->lock);
 		return PTR_ERR(new);
-- 
2.41.0.255.g8b1d071c50-goog


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling
  2023-06-30  0:56 [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling Suren Baghdasaryan
@ 2023-06-30  1:04 ` Suren Baghdasaryan
  2023-07-10 20:22   ` Tejun Heo
  2023-07-06  8:16 ` Peter Zijlstra
  1 sibling, 1 reply; 4+ messages in thread
From: Suren Baghdasaryan @ 2023-06-30  1:04 UTC (permalink / raw)
  To: peterz
  Cc: gregkh, tj, lujialin4, lizefan.x, hannes, mingo, ebiggers, oleg,
	akpm, viro, brauner, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, cgroups, linux-fsdevel, kernel-team

On Thu, Jun 29, 2023 at 5:56 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> Destroying psi trigger in cgroup_file_release causes UAF issues when
> a cgroup is removed from under a polling process. This is happening
> because cgroup removal causes a call to cgroup_file_release while the
> actual file is still alive. Destroying the trigger at this point would
> also destroy its waitqueue head and if there is still a polling process
> on that file accessing the waitqueue, it will step on the freed pointer:
>
> do_select
>   vfs_poll
>                            do_rmdir
>                              cgroup_rmdir
>                                kernfs_drain_open_files
>                                  cgroup_file_release
>                                    cgroup_pressure_release
>                                      psi_trigger_destroy
>                                        wake_up_pollfree(&t->event_wait)
> // vfs_poll is unblocked
>                                        synchronize_rcu
>                                        kfree(t)
>   poll_freewait -> UAF access to the trigger's waitqueue head
>
> Patch [1] fixed this issue for epoll() case using wake_up_pollfree(),
> however the same issue exists for synchronous poll() case.
> The root cause of this issue is that the lifecycles of the psi trigger's
> waitqueue and of the file associated with the trigger are different. Fix
> this by using kernfs_generic_poll function when polling on cgroup-specific
> psi triggers. It internally uses kernfs_open_node->poll waitqueue head
> with its lifecycle tied to the file's lifecycle. This also renders the
> fix in [1] obsolete, so revert it.
>
> [1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()")
>
> Fixes: 0e94682b73bf ("psi: introduce psi monitor")
> Reported-by: Lu Jialin <lujialin4@huawei.com>
> Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

This patch is a replacement for the patchset posted at
https://lore.kernel.org/all/20230626201713.1204982-1-surenb@google.com/
The original patchset was more appropriate for Tejun's tree but this
one is mostly touching PSI-related code, so I changed the recipient to
PeterZ. That said, I would still greatly appreciate inputs from Tejun
and Greg, and anyone else of course.
Thanks,
Suren.

> ---
>  include/linux/psi.h       |  5 +++--
>  include/linux/psi_types.h |  3 +++
>  kernel/cgroup/cgroup.c    |  2 +-
>  kernel/sched/psi.c        | 29 +++++++++++++++++++++--------
>  4 files changed, 28 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/psi.h b/include/linux/psi.h
> index ab26200c2803..e0745873e3f2 100644
> --- a/include/linux/psi.h
> +++ b/include/linux/psi.h
> @@ -23,8 +23,9 @@ void psi_memstall_enter(unsigned long *flags);
>  void psi_memstall_leave(unsigned long *flags);
>
>  int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
> -struct psi_trigger *psi_trigger_create(struct psi_group *group,
> -                       char *buf, enum psi_res res, struct file *file);
> +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
> +                                      enum psi_res res, struct file *file,
> +                                      struct kernfs_open_file *of);
>  void psi_trigger_destroy(struct psi_trigger *t);
>
>  __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
> diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> index 040c089581c6..f1fd3a8044e0 100644
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -137,6 +137,9 @@ struct psi_trigger {
>         /* Wait queue for polling */
>         wait_queue_head_t event_wait;
>
> +       /* Kernfs file for cgroup triggers */
> +       struct kernfs_open_file *of;
> +
>         /* Pending event flag */
>         int event;
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index bfe3cd8ccf36..f55a40db065f 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
>         }
>
>         psi = cgroup_psi(cgrp);
> -       new = psi_trigger_create(psi, buf, res, of->file);
> +       new = psi_trigger_create(psi, buf, res, of->file, of);
>         if (IS_ERR(new)) {
>                 cgroup_put(cgrp);
>                 return PTR_ERR(new);
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 81fca77397f6..9bb3f2b3ccfc 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
>                         continue;
>
>                 /* Generate an event */
> -               if (cmpxchg(&t->event, 0, 1) == 0)
> -                       wake_up_interruptible(&t->event_wait);
> +               if (cmpxchg(&t->event, 0, 1) == 0) {
> +                       if (t->of)
> +                               kernfs_notify(t->of->kn);
> +                       else
> +                               wake_up_interruptible(&t->event_wait);
> +               }
>                 t->last_event_time = now;
>                 /* Reset threshold breach flag once event got generated */
>                 t->pending_event = false;
> @@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
>         return 0;
>  }
>
> -struct psi_trigger *psi_trigger_create(struct psi_group *group,
> -                       char *buf, enum psi_res res, struct file *file)
> +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
> +                                      enum psi_res res, struct file *file,
> +                                      struct kernfs_open_file *of)
>  {
>         struct psi_trigger *t;
>         enum psi_states state;
> @@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>
>         t->event = 0;
>         t->last_event_time = 0;
> -       init_waitqueue_head(&t->event_wait);
> +       t->of = of;
> +       if (!of)
> +               init_waitqueue_head(&t->event_wait);
>         t->pending_event = false;
>         t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
>
> @@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t)
>          * being accessed later. Can happen if cgroup is deleted from under a
>          * polling process.
>          */
> -       wake_up_pollfree(&t->event_wait);
> +       if (t->of)
> +               kernfs_notify(t->of->kn);
> +       else
> +               wake_up_interruptible(&t->event_wait);
>
>         if (t->aggregator == PSI_AVGS) {
>                 mutex_lock(&group->avgs_lock);
> @@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
>         if (!t)
>                 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
>
> -       poll_wait(file, &t->event_wait, wait);
> +       if (t->of)
> +               kernfs_generic_poll(t->of, wait);
> +       else
> +               poll_wait(file, &t->event_wait, wait);
>
>         if (cmpxchg(&t->event, 1, 0) == 1)
>                 ret |= EPOLLPRI;
> @@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
>                 return -EBUSY;
>         }
>
> -       new = psi_trigger_create(&psi_system, buf, res, file);
> +       new = psi_trigger_create(&psi_system, buf, res, file, NULL);
>         if (IS_ERR(new)) {
>                 mutex_unlock(&seq->lock);
>                 return PTR_ERR(new);
> --
> 2.41.0.255.g8b1d071c50-goog
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling
  2023-06-30  0:56 [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling Suren Baghdasaryan
  2023-06-30  1:04 ` Suren Baghdasaryan
@ 2023-07-06  8:16 ` Peter Zijlstra
  1 sibling, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2023-07-06  8:16 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: gregkh, tj, lujialin4, lizefan.x, hannes, mingo, ebiggers, oleg,
	akpm, viro, brauner, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, cgroups, linux-fsdevel, kernel-team

On Thu, Jun 29, 2023 at 05:56:12PM -0700, Suren Baghdasaryan wrote:
> Destroying psi trigger in cgroup_file_release causes UAF issues when
> a cgroup is removed from under a polling process. This is happening
> because cgroup removal causes a call to cgroup_file_release while the
> actual file is still alive. Destroying the trigger at this point would
> also destroy its waitqueue head and if there is still a polling process
> on that file accessing the waitqueue, it will step on the freed pointer:
> 
> do_select
>   vfs_poll
>                            do_rmdir
>                              cgroup_rmdir
>                                kernfs_drain_open_files
>                                  cgroup_file_release
>                                    cgroup_pressure_release
>                                      psi_trigger_destroy
>                                        wake_up_pollfree(&t->event_wait)
> // vfs_poll is unblocked
>                                        synchronize_rcu
>                                        kfree(t)
>   poll_freewait -> UAF access to the trigger's waitqueue head
> 
> Patch [1] fixed this issue for epoll() case using wake_up_pollfree(),
> however the same issue exists for synchronous poll() case.
> The root cause of this issue is that the lifecycles of the psi trigger's
> waitqueue and of the file associated with the trigger are different. Fix
> this by using kernfs_generic_poll function when polling on cgroup-specific
> psi triggers. It internally uses kernfs_open_node->poll waitqueue head
> with its lifecycle tied to the file's lifecycle. This also renders the
> fix in [1] obsolete, so revert it.
> 
> [1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()")
> 
> Fixes: 0e94682b73bf ("psi: introduce psi monitor")
> Reported-by: Lu Jialin <lujialin4@huawei.com>
> Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Thanks, I'll stuff it in sched/urgent after -rc1.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling
  2023-06-30  1:04 ` Suren Baghdasaryan
@ 2023-07-10 20:22   ` Tejun Heo
  0 siblings, 0 replies; 4+ messages in thread
From: Tejun Heo @ 2023-07-10 20:22 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: peterz, gregkh, lujialin4, lizefan.x, hannes, mingo, ebiggers,
	oleg, akpm, viro, brauner, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, cgroups, linux-fsdevel, kernel-team

On Thu, Jun 29, 2023 at 06:04:15PM -0700, Suren Baghdasaryan wrote:
> This patch is a replacement for the patchset posted at
> https://lore.kernel.org/all/20230626201713.1204982-1-surenb@google.com/
> The original patchset was more appropriate for Tejun's tree but this
> one is mostly touching PSI-related code, so I changed the recipient to
> PeterZ. That said, I would still greatly appreciate inputs from Tejun
> and Greg, and anyone else of course.

Yeah, this looks good to me.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-07-10 20:22 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-30  0:56 [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling Suren Baghdasaryan
2023-06-30  1:04 ` Suren Baghdasaryan
2023-07-10 20:22   ` Tejun Heo
2023-07-06  8:16 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).