All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
@ 2022-01-11  2:51 Suren Baghdasaryan
  2022-01-11  2:55   ` Linus Torvalds
  2022-01-11  3:12   ` Eric Biggers
  0 siblings, 2 replies; 8+ messages in thread
From: Suren Baghdasaryan @ 2022-01-11  2:51 UTC (permalink / raw)
  To: hannes
  Cc: torvalds, ebiggers, tj, lizefan.x, mingo, peterz, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	bristot, corbet, linux-doc, linux-kernel, cgroups, kernel-team,
	surenb, syzbot+cdb5dd11c97cc532efad

When a write operation on a psi file replaces an old trigger with a new
one, the lifetime of the old trigger's waitqueue is totally arbitrary.
Overwriting an existing trigger causes its waitqueue to be freed, and a
pending poll() will stumble on trigger->event_wait, which has already
been destroyed.
Fix this by disallowing redefinition of an existing psi trigger. If a
write operation is used on a file descriptor with an already existing psi
trigger, the operation will fail with an EBUSY error.
Also bypass the check for psi_disabled in psi_trigger_destroy(), as the
flag can be flipped after the trigger is created, leading to a memory
leak.

Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com
Analyzed-by: Eric Biggers <ebiggers@kernel.org>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 Documentation/accounting/psi.rst |  3 +-
 include/linux/psi.h              |  2 +-
 include/linux/psi_types.h        |  3 --
 kernel/cgroup/cgroup.c           | 11 ++++--
 kernel/sched/psi.c               | 68 +++++++++++++++-----------------
 5 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
index f2b3439edcc2..860fe651d645 100644
--- a/Documentation/accounting/psi.rst
+++ b/Documentation/accounting/psi.rst
@@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger
 for the same psi metric can be specified. However for each trigger a separate
 file descriptor is required to be able to poll it separately from others,
 therefore for each trigger a separate open() syscall should be made even
-when opening the same psi interface file.
+when opening the same psi interface file. Write operations to a file descriptor
+with an already existing psi trigger will fail with EBUSY.
 
 Monitors activate only when system enters stall state for the monitored
 psi metric and deactivates upon exit from the stall state. While system is
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 65eb1476ac70..370707902345 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -32,7 +32,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
 			char *buf, size_t nbytes, enum psi_res res);
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
+void psi_trigger_destroy(void **trigger_ptr);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
 			poll_table *wait);
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0a23300d49af..6537d0c92825 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -129,9 +129,6 @@ struct psi_trigger {
 	 * events to one per window
 	 */
 	u64 last_event_time;
-
-	/* Refcounting to prevent premature destruction */
-	struct kref refcount;
 };
 
 struct psi_group {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index cafb8c114a21..e6878238fb19 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
 	cgroup_get(cgrp);
 	cgroup_kn_unlock(of->kn);
 
+	/* Allow only one trigger per file descriptor */
+	if (READ_ONCE(ctx->psi.trigger)) {
+		cgroup_put(cgrp);
+		return -EBUSY;
+	}
+
 	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
 	new = psi_trigger_create(psi, buf, nbytes, res);
 	if (IS_ERR(new)) {
@@ -3649,8 +3655,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
 		return PTR_ERR(new);
 	}
 
-	psi_trigger_replace(&ctx->psi.trigger, new);
-
+	WRITE_ONCE(ctx->psi.trigger, new);
 	cgroup_put(cgrp);
 
 	return nbytes;
@@ -3689,7 +3694,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	struct cgroup_file_ctx *ctx = of->priv;
 
-	psi_trigger_replace(&ctx->psi.trigger, NULL);
+	psi_trigger_destroy(&ctx->psi.trigger);
 }
 
 bool cgroup_psi_enabled(void)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..882bf62cc247 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1151,7 +1151,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->event = 0;
 	t->last_event_time = 0;
 	init_waitqueue_head(&t->event_wait);
-	kref_init(&t->refcount);
 
 	mutex_lock(&group->trigger_lock);
 
@@ -1180,15 +1179,21 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	return t;
 }
 
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(void **trigger_ptr)
 {
-	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
-	struct psi_group *group = t->group;
+	struct psi_trigger *t;
+	struct psi_group *group;
 	struct task_struct *task_to_destroy = NULL;
 
-	if (static_branch_likely(&psi_disabled))
+	/*
+	 * We do not check psi_disabled since it might have been disabled after
+	 * the trigger got created.
+	 */
+	t = xchg(trigger_ptr, NULL);
+	if (!t)
 		return;
 
+	group = t->group;
 	/*
 	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted
 	 * from under a polling process.
@@ -1224,9 +1229,9 @@ static void psi_trigger_destroy(struct kref *ref)
 	mutex_unlock(&group->trigger_lock);
 
 	/*
-	 * Wait for both *trigger_ptr from psi_trigger_replace and
-	 * poll_task RCUs to complete their read-side critical sections
-	 * before destroying the trigger and optionally the poll_task
+	 * Wait for psi_schedule_poll_work RCU to complete its read-side
+	 * critical section before destroying the trigger and optionally the
+	 * poll_task.
 	 */
 	synchronize_rcu();
 	/*
@@ -1243,18 +1248,6 @@ static void psi_trigger_destroy(struct kref *ref)
 	kfree(t);
 }
 
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
-	struct psi_trigger *old = *trigger_ptr;
-
-	if (static_branch_likely(&psi_disabled))
-		return;
-
-	rcu_assign_pointer(*trigger_ptr, new);
-	if (old)
-		kref_put(&old->refcount, psi_trigger_destroy);
-}
-
 __poll_t psi_trigger_poll(void **trigger_ptr,
 				struct file *file, poll_table *wait)
 {
@@ -1264,24 +1257,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
 	if (static_branch_likely(&psi_disabled))
 		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
 
-	rcu_read_lock();
-
-	t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
-	if (!t) {
-		rcu_read_unlock();
+	t = READ_ONCE(*trigger_ptr);
+	if (!t)
 		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
-	}
-	kref_get(&t->refcount);
-
-	rcu_read_unlock();
 
 	poll_wait(file, &t->event_wait, wait);
 
 	if (cmpxchg(&t->event, 1, 0) == 1)
 		ret |= EPOLLPRI;
 
-	kref_put(&t->refcount, psi_trigger_destroy);
-
 	return ret;
 }
 
@@ -1305,14 +1289,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 
 	buf[buf_size - 1] = '\0';
 
-	new = psi_trigger_create(&psi_system, buf, nbytes, res);
-	if (IS_ERR(new))
-		return PTR_ERR(new);
-
 	seq = file->private_data;
+
 	/* Take seq->lock to protect seq->private from concurrent writes */
 	mutex_lock(&seq->lock);
-	psi_trigger_replace(&seq->private, new);
+
+	/* Allow only one trigger per file descriptor */
+	if (READ_ONCE(seq->private)) {
+		mutex_unlock(&seq->lock);
+		return -EBUSY;
+	}
+
+	new = psi_trigger_create(&psi_system, buf, nbytes, res);
+	if (IS_ERR(new)) {
+		mutex_unlock(&seq->lock);
+		return PTR_ERR(new);
+	}
+
+	WRITE_ONCE(seq->private, new);
 	mutex_unlock(&seq->lock);
 
 	return nbytes;
@@ -1347,7 +1341,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq = file->private_data;
 
-	psi_trigger_replace(&seq->private, NULL);
+	psi_trigger_destroy(&seq->private);
 	return single_release(inode, file);
 }
 
-- 
2.34.1.575.g55b058a8bb-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
  2022-01-11  2:51 [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled Suren Baghdasaryan
@ 2022-01-11  2:55   ` Linus Torvalds
  2022-01-11  3:12   ` Eric Biggers
  1 sibling, 0 replies; 8+ messages in thread
From: Linus Torvalds @ 2022-01-11  2:55 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Johannes Weiner, Eric Biggers, Tejun Heo, Zefan Li, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Jonathan Corbet,
	open list:DOCUMENTATION, Linux Kernel Mailing List, Cgroups,
	Android Kernel Team, syzbot

On Mon, Jan 10, 2022 at 6:51 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> Fix this by disallowing to redefine an existing psi trigger. If a write
> operation is used on a file descriptor with an already existing psi
> trigger, the operation will fail with EBUSY error.

Looks fine to me. Eric?

I assume I'll get it through the usual channels unless there are issues,

                Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
@ 2022-01-11  2:55   ` Linus Torvalds
  0 siblings, 0 replies; 8+ messages in thread
From: Linus Torvalds @ 2022-01-11  2:55 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Johannes Weiner, Eric Biggers, Tejun Heo, Zefan Li, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Jonathan Corbet,
	open list:DOCUMENTATION, Linux Kernel Mailing List, Cgroups,
	Android Kernel Team, syzbot

On Mon, Jan 10, 2022 at 6:51 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> Fix this by disallowing to redefine an existing psi trigger. If a write
> operation is used on a file descriptor with an already existing psi
> trigger, the operation will fail with EBUSY error.

Looks fine to me. Eric?

I assume I'll get it through the usual channels unless there are issues,

                Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
@ 2022-01-11  3:12   ` Eric Biggers
  0 siblings, 0 replies; 8+ messages in thread
From: Eric Biggers @ 2022-01-11  3:12 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: hannes, torvalds, tj, lizefan.x, mingo, peterz, juri.lelli,
	vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
	bristot, corbet, linux-doc, linux-kernel, cgroups, kernel-team,
	syzbot+cdb5dd11c97cc532efad

On Mon, Jan 10, 2022 at 06:51:38PM -0800, Suren Baghdasaryan wrote:
> With write operation on psi files replacing old trigger with a new one,
> the lifetime of its waitqueue is totally arbitrary. Overwriting an
> existing trigger causes its waitqueue to be freed and pending poll()
> will stumble on trigger->event_wait which was destroyed.
> Fix this by disallowing to redefine an existing psi trigger. If a write
> operation is used on a file descriptor with an already existing psi
> trigger, the operation will fail with EBUSY error.
> Also bypass a check for psi_disabled in the psi_trigger_destroy as the
> flag can be flipped after the trigger is created, leading to a memory
> leak.
> 
> Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com
> Analyzed-by: Eric Biggers <ebiggers@kernel.org>
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Please include Fixes and Cc stable tags.

> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index cafb8c114a21..e6878238fb19 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
>  	cgroup_get(cgrp);
>  	cgroup_kn_unlock(of->kn);
>  
> +	/* Allow only one trigger per file descriptor */
> +	if (READ_ONCE(ctx->psi.trigger)) {
> +		cgroup_put(cgrp);
> +		return -EBUSY;
> +	}
> +

Doesn't the task have exclusive access to the file at this point?  READ_ONCE()
is only needed instead of a plain load when the field can be concurrently
changed by another thread.

> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 1652f2bb54b7..882bf62cc247 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -1151,7 +1151,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>  	t->event = 0;
>  	t->last_event_time = 0;
>  	init_waitqueue_head(&t->event_wait);
> -	kref_init(&t->refcount);
>  
>  	mutex_lock(&group->trigger_lock);
>  
> @@ -1180,15 +1179,21 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>  	return t;
>  }
>  
> -static void psi_trigger_destroy(struct kref *ref)
> +void psi_trigger_destroy(void **trigger_ptr)
>  {
> -	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
> -	struct psi_group *group = t->group;
> +	struct psi_trigger *t;
> +	struct psi_group *group;
>  	struct task_struct *task_to_destroy = NULL;
>  
> -	if (static_branch_likely(&psi_disabled))
> +	/*
> +	 * We do not check psi_disabled since it might have been disabled after
> +	 * the trigger got created.
> +	 */
> +	t = xchg(trigger_ptr, NULL);
> +	if (!t)
>  		return;

Likewise, doesn't the task have exclusive access to the file at this point?
This is only called during ->release().

And why does this take a pointer to a pointer instead of just the pointer?

> @@ -1305,14 +1289,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
>  
>  	buf[buf_size - 1] = '\0';
>  
> -	new = psi_trigger_create(&psi_system, buf, nbytes, res);
> -	if (IS_ERR(new))
> -		return PTR_ERR(new);
> -
>  	seq = file->private_data;
> +
>  	/* Take seq->lock to protect seq->private from concurrent writes */
>  	mutex_lock(&seq->lock);
> -	psi_trigger_replace(&seq->private, new);
> +
> +	/* Allow only one trigger per file descriptor */
> +	if (READ_ONCE(seq->private)) {
> +		mutex_unlock(&seq->lock);
> +		return -EBUSY;
> +	}

Likewise, what does this race against that would require the use of READ_ONCE()?

- Eric

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
@ 2022-01-11  3:12   ` Eric Biggers
  0 siblings, 0 replies; 8+ messages in thread
From: Eric Biggers @ 2022-01-11  3:12 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: hannes-druUgvl0LCNAfugRpC6u6w,
	torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	tj-DgEjT+Ai2ygdnm+yROfE0A, lizefan.x-EC8Uxl6Npydl57MIdRCFDg,
	mingo-H+wXaHxf7aLQT0dZR+AlfA, peterz-wEGCiKHe2LqWVfeAwA7xHQ,
	juri.lelli-H+wXaHxf7aLQT0dZR+AlfA,
	vincent.guittot-QSEj5FYQhm4dnm+yROfE0A,
	dietmar.eggemann-5wv7dgnIgG8, rostedt-nx8X9YLhiw1AfugRpC6u6w,
	bsegall-hpIqsD4AKlfQT0dZR+AlfA, mgorman-l3A5Bk7waGM,
	bristot-H+wXaHxf7aLQT0dZR+AlfA, corbet-T1hC0tSOHrs,
	linux-doc-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	cgroups-u79uwXL29TY76Z2rM5mHXA,
	kernel-team-z5hGa2qSFaRBDgjK7y7TUQ,
	syzbot+cdb5dd11c97cc532efad-Pl5Pbv+GP7P466ipTTIvnc23WoclnBCfAL8bYrjMMd8

On Mon, Jan 10, 2022 at 06:51:38PM -0800, Suren Baghdasaryan wrote:
> With write operation on psi files replacing old trigger with a new one,
> the lifetime of its waitqueue is totally arbitrary. Overwriting an
> existing trigger causes its waitqueue to be freed and pending poll()
> will stumble on trigger->event_wait which was destroyed.
> Fix this by disallowing to redefine an existing psi trigger. If a write
> operation is used on a file descriptor with an already existing psi
> trigger, the operation will fail with EBUSY error.
> Also bypass a check for psi_disabled in the psi_trigger_destroy as the
> flag can be flipped after the trigger is created, leading to a memory
> leak.
> 
> Reported-by: syzbot+cdb5dd11c97cc532efad-Pl5Pbv+GP7P466ipTTIvnc23WoclnBCfAL8bYrjMMd8@public.gmane.org
> Analyzed-by: Eric Biggers <ebiggers-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Suggested-by: Linus Torvalds <torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> Signed-off-by: Suren Baghdasaryan <surenb-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Please include Fixes and Cc stable tags.

> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index cafb8c114a21..e6878238fb19 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
>  	cgroup_get(cgrp);
>  	cgroup_kn_unlock(of->kn);
>  
> +	/* Allow only one trigger per file descriptor */
> +	if (READ_ONCE(ctx->psi.trigger)) {
> +		cgroup_put(cgrp);
> +		return -EBUSY;
> +	}
> +

Doesn't the task have exclusive access to the file at this point?  READ_ONCE()
is only needed instead of a plain load when the field can be concurrently
changed by another thread.

> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 1652f2bb54b7..882bf62cc247 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -1151,7 +1151,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>  	t->event = 0;
>  	t->last_event_time = 0;
>  	init_waitqueue_head(&t->event_wait);
> -	kref_init(&t->refcount);
>  
>  	mutex_lock(&group->trigger_lock);
>  
> @@ -1180,15 +1179,21 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>  	return t;
>  }
>  
> -static void psi_trigger_destroy(struct kref *ref)
> +void psi_trigger_destroy(void **trigger_ptr)
>  {
> -	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
> -	struct psi_group *group = t->group;
> +	struct psi_trigger *t;
> +	struct psi_group *group;
>  	struct task_struct *task_to_destroy = NULL;
>  
> -	if (static_branch_likely(&psi_disabled))
> +	/*
> +	 * We do not check psi_disabled since it might have been disabled after
> +	 * the trigger got created.
> +	 */
> +	t = xchg(trigger_ptr, NULL);
> +	if (!t)
>  		return;

Likewise, doesn't the task have exclusive access to the file at this point?
This is only called during ->release().

And why does this take a pointer to a pointer instead of just the pointer?

> @@ -1305,14 +1289,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
>  
>  	buf[buf_size - 1] = '\0';
>  
> -	new = psi_trigger_create(&psi_system, buf, nbytes, res);
> -	if (IS_ERR(new))
> -		return PTR_ERR(new);
> -
>  	seq = file->private_data;
> +
>  	/* Take seq->lock to protect seq->private from concurrent writes */
>  	mutex_lock(&seq->lock);
> -	psi_trigger_replace(&seq->private, new);
> +
> +	/* Allow only one trigger per file descriptor */
> +	if (READ_ONCE(seq->private)) {
> +		mutex_unlock(&seq->lock);
> +		return -EBUSY;
> +	}

Likewise, what does this race against that would require the use of READ_ONCE()?

- Eric

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
  2022-01-11  3:12   ` Eric Biggers
  (?)
@ 2022-01-11  3:55   ` Suren Baghdasaryan
  2022-01-11  7:16       ` Suren Baghdasaryan
  -1 siblings, 1 reply; 8+ messages in thread
From: Suren Baghdasaryan @ 2022-01-11  3:55 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Johannes Weiner, Linus Torvalds, Tejun Heo, Zefan Li,
	Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Benjamin Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Jonathan Corbet, linux-doc, LKML,
	cgroups mailinglist, kernel-team, syzbot

On Mon, Jan 10, 2022 at 7:12 PM Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Mon, Jan 10, 2022 at 06:51:38PM -0800, Suren Baghdasaryan wrote:
> > With write operation on psi files replacing old trigger with a new one,
> > the lifetime of its waitqueue is totally arbitrary. Overwriting an
> > existing trigger causes its waitqueue to be freed and pending poll()
> > will stumble on trigger->event_wait which was destroyed.
> > Fix this by disallowing to redefine an existing psi trigger. If a write
> > operation is used on a file descriptor with an already existing psi
> > trigger, the operation will fail with EBUSY error.
> > Also bypass a check for psi_disabled in the psi_trigger_destroy as the
> > flag can be flipped after the trigger is created, leading to a memory
> > leak.
> >
> > Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com
> > Analyzed-by: Eric Biggers <ebiggers@kernel.org>
> > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> Please include Fixes and Cc stable tags.

Ack.

>
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index cafb8c114a21..e6878238fb19 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
> >       cgroup_get(cgrp);
> >       cgroup_kn_unlock(of->kn);
> >
> > +     /* Allow only one trigger per file descriptor */
> > +     if (READ_ONCE(ctx->psi.trigger)) {
> > +             cgroup_put(cgrp);
> > +             return -EBUSY;
> > +     }
> > +
>
> Doesn't the task have exclusive access to the file at this point?  READ_ONCE()
> is only needed instead of a plain load when the field can be concurrently
> changed by another thread.

Yeah, you are right. Concurrent writes are serialized by of->mutex, and
kernfs_release_file() documents "@of is guaranteed to have no other file
operations in flight", so ->release() can't race with ->write(). Will
fix.

>
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index 1652f2bb54b7..882bf62cc247 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -1151,7 +1151,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> >       t->event = 0;
> >       t->last_event_time = 0;
> >       init_waitqueue_head(&t->event_wait);
> > -     kref_init(&t->refcount);
> >
> >       mutex_lock(&group->trigger_lock);
> >
> > @@ -1180,15 +1179,21 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> >       return t;
> >  }
> >
> > -static void psi_trigger_destroy(struct kref *ref)
> > +void psi_trigger_destroy(void **trigger_ptr)
> >  {
> > -     struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
> > -     struct psi_group *group = t->group;
> > +     struct psi_trigger *t;
> > +     struct psi_group *group;
> >       struct task_struct *task_to_destroy = NULL;
> >
> > -     if (static_branch_likely(&psi_disabled))
> > +     /*
> > +      * We do not check psi_disabled since it might have been disabled after
> > +      * the trigger got created.
> > +      */
> > +     t = xchg(trigger_ptr, NULL);
> > +     if (!t)
> >               return;
>
> Likewise, doesn't the task have exclusive access to the file at this point?
> This is only called during ->release().

Yes, will fix.

>
> And why does this take a pointer to a pointer instead of just the pointer?

That was done to allow an atomic xchg(), but as you mentioned, it's not
needed here. Will change.

>
> > @@ -1305,14 +1289,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
> >
> >       buf[buf_size - 1] = '\0';
> >
> > -     new = psi_trigger_create(&psi_system, buf, nbytes, res);
> > -     if (IS_ERR(new))
> > -             return PTR_ERR(new);
> > -
> >       seq = file->private_data;
> > +
> >       /* Take seq->lock to protect seq->private from concurrent writes */
> >       mutex_lock(&seq->lock);
> > -     psi_trigger_replace(&seq->private, new);
> > +
> > +     /* Allow only one trigger per file descriptor */
> > +     if (READ_ONCE(seq->private)) {
> > +             mutex_unlock(&seq->lock);
> > +             return -EBUSY;
> > +     }
>
> Likewise, what does this race against that would require the use of READ_ONCE()?

Will fix.
Thanks!

>
> - Eric

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
@ 2022-01-11  7:16       ` Suren Baghdasaryan
  0 siblings, 0 replies; 8+ messages in thread
From: Suren Baghdasaryan @ 2022-01-11  7:16 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Johannes Weiner, Linus Torvalds, Tejun Heo, Zefan Li,
	Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Benjamin Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Jonathan Corbet, linux-doc, LKML,
	cgroups mailinglist, kernel-team, syzbot

On Mon, Jan 10, 2022 at 7:55 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Jan 10, 2022 at 7:12 PM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Mon, Jan 10, 2022 at 06:51:38PM -0800, Suren Baghdasaryan wrote:
> > > With write operation on psi files replacing old trigger with a new one,
> > > the lifetime of its waitqueue is totally arbitrary. Overwriting an
> > > existing trigger causes its waitqueue to be freed and pending poll()
> > > will stumble on trigger->event_wait which was destroyed.
> > > Fix this by disallowing to redefine an existing psi trigger. If a write
> > > operation is used on a file descriptor with an already existing psi
> > > trigger, the operation will fail with EBUSY error.
> > > Also bypass a check for psi_disabled in the psi_trigger_destroy as the
> > > flag can be flipped after the trigger is created, leading to a memory
> > > leak.
> > >
> > > Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com
> > > Analyzed-by: Eric Biggers <ebiggers@kernel.org>
> > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> >
> > Please include Fixes and Cc stable tags.
>
> Ack.
>
> >
> > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > > index cafb8c114a21..e6878238fb19 100644
> > > --- a/kernel/cgroup/cgroup.c
> > > +++ b/kernel/cgroup/cgroup.c
> > > @@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
> > >       cgroup_get(cgrp);
> > >       cgroup_kn_unlock(of->kn);
> > >
> > > +     /* Allow only one trigger per file descriptor */
> > > +     if (READ_ONCE(ctx->psi.trigger)) {
> > > +             cgroup_put(cgrp);
> > > +             return -EBUSY;
> > > +     }
> > > +
> >
> > Doesn't the task have exclusive access to the file at this point?  READ_ONCE()
> > is only needed instead of a plain load when the field can be concurrently
> > changed by another thread.
>
> Yeah, you are right. Concurrent writes are serialized by of->mutex and
> kernfs_release_file documents "@of is guaranteed to have no other file
> operations in flight", so ->release() can't race with ->write(). Will
> fix.
>
> >
> > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > index 1652f2bb54b7..882bf62cc247 100644
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -1151,7 +1151,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > >       t->event = 0;
> > >       t->last_event_time = 0;
> > >       init_waitqueue_head(&t->event_wait);
> > > -     kref_init(&t->refcount);
> > >
> > >       mutex_lock(&group->trigger_lock);
> > >
> > > @@ -1180,15 +1179,21 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > >       return t;
> > >  }
> > >
> > > -static void psi_trigger_destroy(struct kref *ref)
> > > +void psi_trigger_destroy(void **trigger_ptr)
> > >  {
> > > -     struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
> > > -     struct psi_group *group = t->group;
> > > +     struct psi_trigger *t;
> > > +     struct psi_group *group;
> > >       struct task_struct *task_to_destroy = NULL;
> > >
> > > -     if (static_branch_likely(&psi_disabled))
> > > +     /*
> > > +      * We do not check psi_disabled since it might have been disabled after
> > > +      * the trigger got created.
> > > +      */
> > > +     t = xchg(trigger_ptr, NULL);
> > > +     if (!t)
> > >               return;
> >
> > Likewise, doesn't the task have exclusive access to the file at this point?
> > This is only called during ->release().
>
> Yes, will fix.
>
> >
> > And why does this take a pointer to a pointer instead of just the pointer?
>
> That was done to do atomic xchg, but as you mentioned, it's not needed
> here. Will change.
>
> >
> > > @@ -1305,14 +1289,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
> > >
> > >       buf[buf_size - 1] = '\0';
> > >
> > > -     new = psi_trigger_create(&psi_system, buf, nbytes, res);
> > > -     if (IS_ERR(new))
> > > -             return PTR_ERR(new);
> > > -
> > >       seq = file->private_data;
> > > +
> > >       /* Take seq->lock to protect seq->private from concurrent writes */
> > >       mutex_lock(&seq->lock);
> > > -     psi_trigger_replace(&seq->private, new);
> > > +
> > > +     /* Allow only one trigger per file descriptor */
> > > +     if (READ_ONCE(seq->private)) {
> > > +             mutex_unlock(&seq->lock);
> > > +             return -EBUSY;
> > > +     }
> >
> > Likewise, what does this race against that would require the use of READ_ONCE()?
>
> Will fix.
> Thanks!

Posted v2 at https://lore.kernel.org/all/20220111071212.1210124-1-surenb@google.com

>
> >
> > - Eric

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled
@ 2022-01-11  7:16       ` Suren Baghdasaryan
  0 siblings, 0 replies; 8+ messages in thread
From: Suren Baghdasaryan @ 2022-01-11  7:16 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Johannes Weiner, Linus Torvalds, Tejun Heo, Zefan Li,
	Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Benjamin Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Jonathan Corbet,
	linux-doc, LKML, cgroups mailinglist,
	kernel-team, syzbot

On Mon, Jan 10, 2022 at 7:55 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Jan 10, 2022 at 7:12 PM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Mon, Jan 10, 2022 at 06:51:38PM -0800, Suren Baghdasaryan wrote:
> > > With write operation on psi files replacing old trigger with a new one,
> > > the lifetime of its waitqueue is totally arbitrary. Overwriting an
> > > existing trigger causes its waitqueue to be freed and pending poll()
> > > will stumble on trigger->event_wait which was destroyed.
> > > > Fix this by disallowing redefinition of an existing psi trigger. If a write
> > > > operation is used on a file descriptor with an already existing psi
> > > > trigger, the operation will fail with an EBUSY error.
> > > Also bypass a check for psi_disabled in the psi_trigger_destroy as the
> > > flag can be flipped after the trigger is created, leading to a memory
> > > leak.
> > >
> > > > Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com
> > > > Analyzed-by: Eric Biggers <ebiggers@kernel.org>
> > > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> >
> > Please include Fixes and Cc stable tags.
>
> Ack.
>
> >
> > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > > index cafb8c114a21..e6878238fb19 100644
> > > --- a/kernel/cgroup/cgroup.c
> > > +++ b/kernel/cgroup/cgroup.c
> > > @@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
> > >       cgroup_get(cgrp);
> > >       cgroup_kn_unlock(of->kn);
> > >
> > > +     /* Allow only one trigger per file descriptor */
> > > +     if (READ_ONCE(ctx->psi.trigger)) {
> > > +             cgroup_put(cgrp);
> > > +             return -EBUSY;
> > > +     }
> > > +
> >
> > Doesn't the task have exclusive access to the file at this point?  READ_ONCE()
> > is only needed instead of a plain load when the field can be concurrently
> > changed by another thread.
>
> Yeah, you are right. Concurrent writes are serialized by of->mutex and
> kernfs_release_file documents "@of is guaranteed to have no other file
> operations in flight", so ->release() can't race with ->write(). Will
> fix.
>
> >
> > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > index 1652f2bb54b7..882bf62cc247 100644
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -1151,7 +1151,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > >       t->event = 0;
> > >       t->last_event_time = 0;
> > >       init_waitqueue_head(&t->event_wait);
> > > -     kref_init(&t->refcount);
> > >
> > >       mutex_lock(&group->trigger_lock);
> > >
> > > @@ -1180,15 +1179,21 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > >       return t;
> > >  }
> > >
> > > -static void psi_trigger_destroy(struct kref *ref)
> > > +void psi_trigger_destroy(void **trigger_ptr)
> > >  {
> > > -     struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
> > > -     struct psi_group *group = t->group;
> > > +     struct psi_trigger *t;
> > > +     struct psi_group *group;
> > >       struct task_struct *task_to_destroy = NULL;
> > >
> > > -     if (static_branch_likely(&psi_disabled))
> > > +     /*
> > > +      * We do not check psi_disabled since it might have been disabled after
> > > +      * the trigger got created.
> > > +      */
> > > +     t = xchg(trigger_ptr, NULL);
> > > +     if (!t)
> > >               return;
> >
> > Likewise, doesn't the task have exclusive access to the file at this point?
> > This is only called during ->release().
>
> Yes, will fix.
>
> >
> > And why does this take a pointer to a pointer instead of just the pointer?
>
> That was done to do atomic xchg, but as you mentioned, it's not needed
> here. Will change.
>
> >
> > > @@ -1305,14 +1289,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
> > >
> > >       buf[buf_size - 1] = '\0';
> > >
> > > -     new = psi_trigger_create(&psi_system, buf, nbytes, res);
> > > -     if (IS_ERR(new))
> > > -             return PTR_ERR(new);
> > > -
> > >       seq = file->private_data;
> > > +
> > >       /* Take seq->lock to protect seq->private from concurrent writes */
> > >       mutex_lock(&seq->lock);
> > > -     psi_trigger_replace(&seq->private, new);
> > > +
> > > +     /* Allow only one trigger per file descriptor */
> > > +     if (READ_ONCE(seq->private)) {
> > > +             mutex_unlock(&seq->lock);
> > > +             return -EBUSY;
> > > +     }
> >
> > Likewise, what does this race against that would require the use of READ_ONCE()?
>
> Will fix.
> Thanks!

Posted v2 at https://lore.kernel.org/all/20220111071212.1210124-1-surenb@google.com

>
> >
> > - Eric

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-01-11  7:16 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-11  2:51 [PATCH 1/1] psi: Fix uaf issue when psi trigger is destroyed while being polled Suren Baghdasaryan
2022-01-11  2:55 ` Linus Torvalds
2022-01-11  2:55   ` Linus Torvalds
2022-01-11  3:12 ` Eric Biggers
2022-01-11  3:12   ` Eric Biggers
2022-01-11  3:55   ` Suren Baghdasaryan
2022-01-11  7:16     ` Suren Baghdasaryan
2022-01-11  7:16       ` Suren Baghdasaryan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.