linux-kernel.vger.kernel.org archive mirror
* [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon
@ 2021-05-17  9:04 Huangzhaoyang
  2021-05-17 18:36 ` Johannes Weiner
  0 siblings, 1 reply; 7+ messages in thread
From: Huangzhaoyang @ 2021-05-17  9:04 UTC (permalink / raw)
  To: Johannes Weiner, Zhaoyang Huang, Ziwei Dai, Ke Wang, linux-kernel

From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

A race was detected between psimon_new and psimon_old, as shown below, which
causes a panic by accessing an invalid psi_system->poll_wait->wait_queue_entry
and psi_system->poll_timer->entry->next. It is not necessary to reinitialize
the resources of psi_system in psi_trigger_create().

psi_trigger_create      psimon_new     psimon_old
 init_waitqueue_head                    finish_wait
                                          spin_lock(lock_old)
	spin_lock_init(lock_new)
 wake_up_process(psimon_new)

                        finish_wait
                          spin_lock(lock_new)
                            list_del       list_del

Signed-off-by: ziwei.dai <ziwei.dai@unisoc.com>
Signed-off-by: ke.wang <ke.wang@unisoc.com>
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 kernel/sched/psi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index cc25a3c..d00e585 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -182,6 +182,8 @@ struct psi_group psi_system = {
 
 static void psi_avgs_work(struct work_struct *work);
 
+static void poll_timer_fn(struct timer_list *t);
+
 static void group_init(struct psi_group *group)
 {
 	int cpu;
@@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
 	memset(group->polling_total, 0, sizeof(group->polling_total));
 	group->polling_next_update = ULLONG_MAX;
 	group->polling_until = 0;
+	init_waitqueue_head(&group->poll_wait);
+	timer_setup(&group->poll_timer, poll_timer_fn, 0);
 	rcu_assign_pointer(group->poll_task, NULL);
 }
 
@@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 			return ERR_CAST(task);
 		}
 		atomic_set(&group->poll_wakeup, 0);
-		init_waitqueue_head(&group->poll_wait);
 		wake_up_process(task);
 		timer_setup(&group->poll_timer, poll_timer_fn, 0);
 		rcu_assign_pointer(group->poll_task, task);
@@ -1233,7 +1236,6 @@ static void psi_trigger_destroy(struct kref *ref)
 		 * But it might have been already scheduled before
 		 * that - deschedule it cleanly before destroying it.
 		 */
-		del_timer_sync(&group->poll_timer);
 		kthread_stop(task_to_destroy);
 	}
 	kfree(t);
-- 
1.9.1



* Re: [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon
  2021-05-17  9:04 [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon Huangzhaoyang
@ 2021-05-17 18:36 ` Johannes Weiner
  2021-05-17 19:33   ` Suren Baghdasaryan
  0 siblings, 1 reply; 7+ messages in thread
From: Johannes Weiner @ 2021-05-17 18:36 UTC (permalink / raw)
  To: Huangzhaoyang
  Cc: Zhaoyang Huang, Ziwei Dai, Ke Wang, Suren Baghdasaryan, linux-kernel

CC Suren

On Mon, May 17, 2021 at 05:04:09PM +0800, Huangzhaoyang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> Race detected between psimon_new and psimon_old as shown below, which
> cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> and psi_system->poll_timer->entry->next. It is not necessary to reinit
> resource of psi_system when psi_trigger_create.
> 
> psi_trigger_create      psimon_new     psimon_old
>  init_waitqueue_head                    finish_wait
>                                           spin_lock(lock_old)
> 	spin_lock_init(lock_new)
>  wake_up_process(psimon_new)
> 
>                         finish_wait
>                           spin_lock(lock_new)
>                             list_del       list_del
> 
> Signed-off-by: ziwei.dai <ziwei.dai@unisoc.com>
> Signed-off-by: ke.wang <ke.wang@unisoc.com>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> ---
>  kernel/sched/psi.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index cc25a3c..d00e585 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -182,6 +182,8 @@ struct psi_group psi_system = {
>  
>  static void psi_avgs_work(struct work_struct *work);
>  
> +static void poll_timer_fn(struct timer_list *t);
> +
>  static void group_init(struct psi_group *group)
>  {
>  	int cpu;
> @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
>  	memset(group->polling_total, 0, sizeof(group->polling_total));
>  	group->polling_next_update = ULLONG_MAX;
>  	group->polling_until = 0;
> +	init_waitqueue_head(&group->poll_wait);
> +	timer_setup(&group->poll_timer, poll_timer_fn, 0);

This makes sense.

>  	rcu_assign_pointer(group->poll_task, NULL);
>  }
>  
> @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>  			return ERR_CAST(task);
>  		}
>  		atomic_set(&group->poll_wakeup, 0);
> -		init_waitqueue_head(&group->poll_wait);
>  		wake_up_process(task);
>  		timer_setup(&group->poll_timer, poll_timer_fn, 0);

This now looks unnecessary?

>  		rcu_assign_pointer(group->poll_task, task);
> @@ -1233,7 +1236,6 @@ static void psi_trigger_destroy(struct kref *ref)
>  		 * But it might have been already scheduled before
>  		 * that - deschedule it cleanly before destroying it.
>  		 */
> -		del_timer_sync(&group->poll_timer);

And this looks wrong. Did you mean to delete the timer_setup() line
instead?


* Re: [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon
  2021-05-17 18:36 ` Johannes Weiner
@ 2021-05-17 19:33   ` Suren Baghdasaryan
  2021-05-17 21:30     ` Suren Baghdasaryan
  0 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2021-05-17 19:33 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Huangzhaoyang, Zhaoyang Huang, Ziwei Dai, Ke Wang, LKML

On Mon, May 17, 2021 at 11:36 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> CC Suren

Thanks!

>
> On Mon, May 17, 2021 at 05:04:09PM +0800, Huangzhaoyang wrote:
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > Race detected between psimon_new and psimon_old as shown below, which
> > cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> > and psi_system->poll_timer->entry->next. It is not necessary to reinit
> > resource of psi_system when psi_trigger_create.

The resources of psi_system will not be reinitialized, because
init_waitqueue_head(&group->poll_wait) and friends are called
only during the creation of the first trigger for that group (see this
condition: https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L1119).

> >
> > psi_trigger_create      psimon_new     psimon_old
> >  init_waitqueue_head                    finish_wait
> >                                           spin_lock(lock_old)
> >       spin_lock_init(lock_new)
> >  wake_up_process(psimon_new)
> >
> >                         finish_wait
> >                           spin_lock(lock_new)
> >                             list_del       list_del

Could you please clarify this race a bit? I'm having trouble
deciphering this diagram. I'm guessing psimon_new/psimon_old refer to
a new trigger being created while an old one is being deleted, so it
seems like a race between psi_trigger_create/psi_trigger_destroy. The
combination of trigger_lock and RCU should be protecting us from that
but maybe I missed something?
I'm excluding the possibility of a race between psi_trigger_create and
another existing trigger on the same group, because the codepath that
calls init_waitqueue_head(&group->poll_wait) runs only when the first
trigger for that group is created. Therefore, if there is already an
existing trigger in that group, that codepath will not be taken.
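
For reference, that guard looks roughly like this (paraphrased from the
version the link above points at; error paths and trigger bookkeeping
elided):

mutex_lock(&group->trigger_lock);

if (!rcu_access_pointer(group->poll_task)) {
        struct task_struct *task;

        /* first trigger in this group: spawn psimon and set up its resources */
        task = kthread_create(psi_poll_worker, group, "psimon");
        /* error handling elided */
        atomic_set(&group->poll_wakeup, 0);
        init_waitqueue_head(&group->poll_wait);
        wake_up_process(task);
        timer_setup(&group->poll_timer, poll_timer_fn, 0);
        rcu_assign_pointer(group->poll_task, task);
}
/* ... the new trigger is then linked into the group ... */
mutex_unlock(&group->trigger_lock);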

> >
> > Signed-off-by: ziwei.dai <ziwei.dai@unisoc.com>
> > Signed-off-by: ke.wang <ke.wang@unisoc.com>
> > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > ---
> >  kernel/sched/psi.c | 6 ++++--
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index cc25a3c..d00e585 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> >
> >  static void psi_avgs_work(struct work_struct *work);
> >
> > +static void poll_timer_fn(struct timer_list *t);
> > +
> >  static void group_init(struct psi_group *group)
> >  {
> >       int cpu;
> > @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
> >       memset(group->polling_total, 0, sizeof(group->polling_total));
> >       group->polling_next_update = ULLONG_MAX;
> >       group->polling_until = 0;
> > +     init_waitqueue_head(&group->poll_wait);
> > +     timer_setup(&group->poll_timer, poll_timer_fn, 0);
>
> This makes sense.

Well, this means we initialize resources for triggers in each psi
group even if the user never creates any triggers. Current logic
initializes them when the first trigger in the group gets created.

>
> >       rcu_assign_pointer(group->poll_task, NULL);
> >  }
> >
> > @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> >                       return ERR_CAST(task);
> >               }
> >               atomic_set(&group->poll_wakeup, 0);
> > -             init_waitqueue_head(&group->poll_wait);
> >               wake_up_process(task);
> >               timer_setup(&group->poll_timer, poll_timer_fn, 0);
>
> This looks now unncessary?
>
> >               rcu_assign_pointer(group->poll_task, task);
> > @@ -1233,7 +1236,6 @@ static void psi_trigger_destroy(struct kref *ref)
> >                * But it might have been already scheduled before
> >                * that - deschedule it cleanly before destroying it.
> >                */
> > -             del_timer_sync(&group->poll_timer);
>
> And this looks wrong. Did you mean to delete the timer_setup() line
> instead?

I would like to get more details about this race before trying to fix
it. Please clarify.
Thanks!


* Re: [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon
  2021-05-17 19:33   ` Suren Baghdasaryan
@ 2021-05-17 21:30     ` Suren Baghdasaryan
  2021-05-18  0:40       ` Zhaoyang Huang
  0 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2021-05-17 21:30 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Huangzhaoyang, Zhaoyang Huang, Ziwei Dai, Ke Wang, LKML

On Mon, May 17, 2021 at 12:33 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, May 17, 2021 at 11:36 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > CC Suren
>
> Thanks!
>
> >
> > On Mon, May 17, 2021 at 05:04:09PM +0800, Huangzhaoyang wrote:
> > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > >
> > > Race detected between psimon_new and psimon_old as shown below, which
> > > cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> > > and psi_system->poll_timer->entry->next. It is not necessary to reinit
> > > resource of psi_system when psi_trigger_create.
>
> resource of psi_system will not be reinitialized because
> init_waitqueue_head(&group->poll_wait) and friends are initialized
> only during the creation of the first trigger for that group (see this
> condition: https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L1119).
>
> > >
> > > psi_trigger_create      psimon_new     psimon_old
> > >  init_waitqueue_head                    finish_wait
> > >                                           spin_lock(lock_old)
> > >       spin_lock_init(lock_new)
> > >  wake_up_process(psimon_new)
> > >
> > >                         finish_wait
> > >                           spin_lock(lock_new)
> > >                             list_del       list_del
>
> Could you please clarify this race a bit? I'm having trouble
> deciphering this diagram. I'm guessing psimon_new/psimon_old refer to
> a new trigger being created while an old one is being deleted, so it
> seems like a race between psi_trigger_create/psi_trigger_destroy. The
> combination of trigger_lock and RCU should be protecting us from that
> but maybe I missed something?
> I'm excluding a possibility of a race between psi_trigger_create with
> another existing trigger on the same group because the codepath
> calling init_waitqueue_head(&group->poll_wait) happens only when the
> first trigger for that group is created. Therefore if there is an
> existing trigger in that group that codepath will not be taken.

Ok, looking at the current code I think you can hit the following race
when psi_trigger_destroy is destroying the last trigger in a psi group
while racing with psi_trigger_create:

psi_trigger_destroy                      psi_trigger_create
mutex_lock(trigger_lock);
rcu_assign_pointer(poll_task, NULL);
mutex_unlock(trigger_lock);
                                         mutex_lock(trigger_lock);
                                         if (!rcu_access_pointer(group->poll_task)) {
                                                 timer_setup(poll_timer, poll_timer_fn, 0);
                                                 rcu_assign_pointer(poll_task, task);
                                         }
                                         mutex_unlock(trigger_lock);

synchronize_rcu();
del_timer_sync(poll_timer);  <-- poll_timer has been reinitialized by
                                 psi_trigger_create

So, trigger_lock/RCU correctly protects destruction of
group->poll_task but misses this race affecting poll_timer and
poll_wait.
Let me think if we can handle this without moving initialization into
group_init().
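
For context, the destroy side at that point looks roughly like this
(paraphrased, trigger-list bookkeeping elided), which is where that
window opens:

struct task_struct *task_to_destroy = NULL;

mutex_lock(&group->trigger_lock);
if (group->poll_states == 0) {  /* last trigger in the group is going away */
        task_to_destroy = rcu_dereference_protected(group->poll_task,
                        lockdep_is_held(&group->trigger_lock));
        rcu_assign_pointer(group->poll_task, NULL);
}
mutex_unlock(&group->trigger_lock);

synchronize_rcu();              /* psi_trigger_create() can slip in from here on */

if (task_to_destroy) {
        del_timer_sync(&group->poll_timer);     /* may hit a freshly re-setup timer */
        kthread_stop(task_to_destroy);
}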

>
> > >
> > > Signed-off-by: ziwei.dai <ziwei.dai@unisoc.com>
> > > Signed-off-by: ke.wang <ke.wang@unisoc.com>
> > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > ---
> > >  kernel/sched/psi.c | 6 ++++--
> > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > index cc25a3c..d00e585 100644
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> > >
> > >  static void psi_avgs_work(struct work_struct *work);
> > >
> > > +static void poll_timer_fn(struct timer_list *t);
> > > +
> > >  static void group_init(struct psi_group *group)
> > >  {
> > >       int cpu;
> > > @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
> > >       memset(group->polling_total, 0, sizeof(group->polling_total));
> > >       group->polling_next_update = ULLONG_MAX;
> > >       group->polling_until = 0;
> > > +     init_waitqueue_head(&group->poll_wait);
> > > +     timer_setup(&group->poll_timer, poll_timer_fn, 0);
> >
> > This makes sense.
>
> Well, this means we initialize resources for triggers in each psi
> group even if the user never creates any triggers. Current logic
> initializes them when the first trigger in the group gets created.
>
> >
> > >       rcu_assign_pointer(group->poll_task, NULL);
> > >  }
> > >
> > > @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > >                       return ERR_CAST(task);
> > >               }
> > >               atomic_set(&group->poll_wakeup, 0);
> > > -             init_waitqueue_head(&group->poll_wait);
> > >               wake_up_process(task);
> > >               timer_setup(&group->poll_timer, poll_timer_fn, 0);
> >
> > This looks now unncessary?
> >
> > >               rcu_assign_pointer(group->poll_task, task);
> > > @@ -1233,7 +1236,6 @@ static void psi_trigger_destroy(struct kref *ref)
> > >                * But it might have been already scheduled before
> > >                * that - deschedule it cleanly before destroying it.
> > >                */
> > > -             del_timer_sync(&group->poll_timer);
> >
> > And this looks wrong. Did you mean to delete the timer_setup() line
> > instead?
>
> I would like to get more details about this race before trying to fix
> it. Please clarify.
> Thanks!


* Re: [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon
  2021-05-17 21:30     ` Suren Baghdasaryan
@ 2021-05-18  0:40       ` Zhaoyang Huang
  2021-05-18  1:47         ` Suren Baghdasaryan
  0 siblings, 1 reply; 7+ messages in thread
From: Zhaoyang Huang @ 2021-05-18  0:40 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Johannes Weiner, Zhaoyang Huang, Ziwei Dai, Ke Wang, LKML

On Tue, May 18, 2021 at 5:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, May 17, 2021 at 12:33 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Mon, May 17, 2021 at 11:36 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > CC Suren
> >
> > Thanks!
> >
> > >
> > > On Mon, May 17, 2021 at 05:04:09PM +0800, Huangzhaoyang wrote:
> > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > >
> > > > Race detected between psimon_new and psimon_old as shown below, which
> > > > cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> > > > and psi_system->poll_timer->entry->next. It is not necessary to reinit
> > > > resource of psi_system when psi_trigger_create.
> >
> > resource of psi_system will not be reinitialized because
> > init_waitqueue_head(&group->poll_wait) and friends are initialized
> > only during the creation of the first trigger for that group (see this
> > condition: https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L1119).
> >
> > > >
> > > > psi_trigger_create      psimon_new     psimon_old
> > > >  init_waitqueue_head                    finish_wait
> > > >                                           spin_lock(lock_old)
> > > >       spin_lock_init(lock_new)
> > > >  wake_up_process(psimon_new)
> > > >
> > > >                         finish_wait
> > > >                           spin_lock(lock_new)
> > > >                             list_del       list_del
> >
> > Could you please clarify this race a bit? I'm having trouble
> > deciphering this diagram. I'm guessing psimon_new/psimon_old refer to
> > a new trigger being created while an old one is being deleted, so it
> > seems like a race between psi_trigger_create/psi_trigger_destroy. The
> > combination of trigger_lock and RCU should be protecting us from that
> > but maybe I missed something?
> > I'm excluding a possibility of a race between psi_trigger_create with
> > another existing trigger on the same group because the codepath
> > calling init_waitqueue_head(&group->poll_wait) happens only when the
> > first trigger for that group is created. Therefore if there is an
> > existing trigger in that group that codepath will not be taken.
>
> Ok, looking at the current code I think you can hit the following race
> when psi_trigger_destroy is destroying the last trigger in a psi group
> while racing with psi_trigger_create:
>
> psi_trigger_destroy                      psi_trigger_create
> mutex_lock(trigger_lock);
> rcu_assign_pointer(poll_task, NULL);
> mutex_unlock(trigger_lock);
>                                                     mutex_lock(trigger_lock);
>                                                     if
> (!rcu_access_pointer(group->poll_task)) {
>
> timer_setup(poll_timer, poll_timer_fn, 0);
>
> rcu_assign_pointer(poll_task, task);
>                                                     }
>                                                     mutex_unlock(trigger_lock);
>
> synchronize_rcu();
> del_timer_sync(poll_timer); <-- poll_timer has been reinitialized by
> psi_trigger_create
>
> So, trigger_lock/RCU correctly protects destruction of
> group->poll_task but misses this race affecting poll_timer and
> poll_wait.
> Let me think if we can handle this without moving initialization into
> group_init().
Right, this is exactly what we hit during a monkey test on an Android
system, where psimon gets destroyed and recreated as the psi_trigger is
unreferenced and recreated. IMHO, poll_timer and poll_wait should exist
for the whole lifetime of the group.
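
To make it concrete, the crash mode is roughly the following interleaving
(a simplified sketch, not the exact psi code):

/* old psimon, parked in psi_poll_worker() */
wait_event_interruptible(group->poll_wait,
                atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
                kthread_should_stop());
/* its wait_queue_entry is now linked into group->poll_wait */

                        /* psi_trigger_create(), racing */
                        init_waitqueue_head(&group->poll_wait);
                        /* waitqueue lock and list head are reinitialized
                           while the old entry is still linked */

/* old psimon is woken (e.g. by kthread_stop); finish_wait() now does
   list_del() on an entry whose head and lock were just reinitialized,
   which is the invalid wait_queue_entry access from the changelog */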
>
> >
> > > >
> > > > Signed-off-by: ziwei.dai <ziwei.dai@unisoc.com>
> > > > Signed-off-by: ke.wang <ke.wang@unisoc.com>
> > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > ---
> > > >  kernel/sched/psi.c | 6 ++++--
> > > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > index cc25a3c..d00e585 100644
> > > > --- a/kernel/sched/psi.c
> > > > +++ b/kernel/sched/psi.c
> > > > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> > > >
> > > >  static void psi_avgs_work(struct work_struct *work);
> > > >
> > > > +static void poll_timer_fn(struct timer_list *t);
> > > > +
> > > >  static void group_init(struct psi_group *group)
> > > >  {
> > > >       int cpu;
> > > > @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
> > > >       memset(group->polling_total, 0, sizeof(group->polling_total));
> > > >       group->polling_next_update = ULLONG_MAX;
> > > >       group->polling_until = 0;
> > > > +     init_waitqueue_head(&group->poll_wait);
> > > > +     timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > >
> > > This makes sense.
> >
> > Well, this means we initialize resources for triggers in each psi
> > group even if the user never creates any triggers. Current logic
> > initializes them when the first trigger in the group gets created.
> >
> > >
> > > >       rcu_assign_pointer(group->poll_task, NULL);
> > > >  }
> > > >
> > > > @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > >                       return ERR_CAST(task);
> > > >               }
> > > >               atomic_set(&group->poll_wakeup, 0);
> > > > -             init_waitqueue_head(&group->poll_wait);
> > > >               wake_up_process(task);
> > > >               timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > >
> > > This looks now unncessary?
> > >
> > > >               rcu_assign_pointer(group->poll_task, task);
> > > > @@ -1233,7 +1236,6 @@ static void psi_trigger_destroy(struct kref *ref)
> > > >                * But it might have been already scheduled before
> > > >                * that - deschedule it cleanly before destroying it.
> > > >                */
> > > > -             del_timer_sync(&group->poll_timer);
> > >
> > > And this looks wrong. Did you mean to delete the timer_setup() line
> > > instead?
> >
> > I would like to get more details about this race before trying to fix
> > it. Please clarify.
> > Thanks!


* Re: [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon
  2021-05-18  0:40       ` Zhaoyang Huang
@ 2021-05-18  1:47         ` Suren Baghdasaryan
  2021-05-18  2:08           ` Suren Baghdasaryan
  0 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2021-05-18  1:47 UTC (permalink / raw)
  To: Zhaoyang Huang; +Cc: Johannes Weiner, Zhaoyang Huang, Ziwei Dai, Ke Wang, LKML

On Mon, May 17, 2021 at 5:41 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Tue, May 18, 2021 at 5:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Mon, May 17, 2021 at 12:33 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Mon, May 17, 2021 at 11:36 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > CC Suren
> > >
> > > Thanks!
> > >
> > > >
> > > > On Mon, May 17, 2021 at 05:04:09PM +0800, Huangzhaoyang wrote:
> > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > >
> > > > > Race detected between psimon_new and psimon_old as shown below, which
> > > > > cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> > > > > and psi_system->poll_timer->entry->next. It is not necessary to reinit
> > > > > resource of psi_system when psi_trigger_create.
> > >
> > > resource of psi_system will not be reinitialized because
> > > init_waitqueue_head(&group->poll_wait) and friends are initialized
> > > only during the creation of the first trigger for that group (see this
> > > condition: https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L1119).
> > >
> > > > >
> > > > > psi_trigger_create      psimon_new     psimon_old
> > > > >  init_waitqueue_head                    finish_wait
> > > > >                                           spin_lock(lock_old)
> > > > >       spin_lock_init(lock_new)
> > > > >  wake_up_process(psimon_new)
> > > > >
> > > > >                         finish_wait
> > > > >                           spin_lock(lock_new)
> > > > >                             list_del       list_del
> > >
> > > Could you please clarify this race a bit? I'm having trouble
> > > deciphering this diagram. I'm guessing psimon_new/psimon_old refer to
> > > a new trigger being created while an old one is being deleted, so it
> > > seems like a race between psi_trigger_create/psi_trigger_destroy. The
> > > combination of trigger_lock and RCU should be protecting us from that
> > > but maybe I missed something?
> > > I'm excluding a possibility of a race between psi_trigger_create with
> > > another existing trigger on the same group because the codepath
> > > calling init_waitqueue_head(&group->poll_wait) happens only when the
> > > first trigger for that group is created. Therefore if there is an
> > > existing trigger in that group that codepath will not be taken.
> >
> > Ok, looking at the current code I think you can hit the following race
> > when psi_trigger_destroy is destroying the last trigger in a psi group
> > while racing with psi_trigger_create:
> >
> > psi_trigger_destroy                      psi_trigger_create
> > mutex_lock(trigger_lock);
> > rcu_assign_pointer(poll_task, NULL);
> > mutex_unlock(trigger_lock);
> >                                                     mutex_lock(trigger_lock);
> >                                                     if
> > (!rcu_access_pointer(group->poll_task)) {
> >
> > timer_setup(poll_timer, poll_timer_fn, 0);
> >
> > rcu_assign_pointer(poll_task, task);
> >                                                     }
> >                                                     mutex_unlock(trigger_lock);
> >
> > synchronize_rcu();
> > del_timer_sync(poll_timer); <-- poll_timer has been reinitialized by
> > psi_trigger_create
> >
> > So, trigger_lock/RCU correctly protects destruction of
> > group->poll_task but misses this race affecting poll_timer and
> > poll_wait.
> > Let me think if we can handle this without moving initialization into
> > group_init().
> Right, this is exactly what we met during a monkey test on an android
> system, where the psimon will be destroyed/recreated by unref/recreate
> the psi_trigger. IMHO,  poll_timer and poll_wait should exist during
> whole period

Ok, understood. I think it should be fine to initialize poll_wait and
poll_timer at group creation time. It looks like init_waitqueue_head()
and timer_setup() only initialize the fields; I don't think they
allocate any additional resources. Johannes pointed out some issues in
your original patch, so I've made some small modifications (see below).
del_timer_sync() was important back when we used kthread_worker; now,
even if the timer fires unnecessarily, it should be harmless after we
reset group->poll_task. So I think a del_timer() in
psi_trigger_destroy() should be enough:

@@ -181,6 +181,7 @@ struct psi_group psi_system = {
 };

 static void psi_avgs_work(struct work_struct *work);
+static void poll_timer_fn(struct timer_list *t);

 static void group_init(struct psi_group *group)
 {
@@ -202,6 +203,8 @@ static void group_init(struct psi_group *group)
         group->polling_next_update = ULLONG_MAX;
         group->polling_until = 0;
         rcu_assign_pointer(group->poll_task, NULL);
+        init_waitqueue_head(&group->poll_wait);
+        timer_setup(&group->poll_timer, poll_timer_fn, 0);
 }

 void __init psi_init(void)
@@ -1157,9 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
                         return ERR_CAST(task);
                 }
                 atomic_set(&group->poll_wakeup, 0);
-                init_waitqueue_head(&group->poll_wait);
                 wake_up_process(task);
-                timer_setup(&group->poll_timer, poll_timer_fn, 0);
                 rcu_assign_pointer(group->poll_task, task);
         }

@@ -1211,6 +1212,7 @@ static void psi_trigger_destroy(struct kref *ref)
                                         group->poll_task,
                                         lockdep_is_held(&group->trigger_lock));
                         rcu_assign_pointer(group->poll_task, NULL);
+                        del_timer(&group->poll_timer);
                 }
         }

@@ -1230,10 +1232,7 @@ static void psi_trigger_destroy(struct kref *ref)
                 /*
                  * After the RCU grace period has expired, the worker
                  * can no longer be found through group->poll_task.
-                 * But it might have been already scheduled before
-                 * that - deschedule it cleanly before destroying it.
                  */
-                del_timer_sync(&group->poll_timer);
                 kthread_stop(task_to_destroy);
         }
         kfree(t);
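
For completeness, the reason a late spurious fire is benign: the timer
callback only flags a wakeup for psimon, roughly (paraphrased):

static void poll_timer_fn(struct timer_list *t)
{
        struct psi_group *group = from_timer(group, t, poll_timer);

        atomic_set(&group->poll_wakeup, 1);
        wake_up_interruptible(&group->poll_wait);
}

With group->poll_task already reset and the worker on its way to being
stopped, that wakeup has nothing harmful left to do, so a plain
del_timer() under trigger_lock is sufficient.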

> >
> > >
> > > > >
> > > > > Signed-off-by: ziwei.dai <ziwei.dai@unisoc.com>
> > > > > Signed-off-by: ke.wang <ke.wang@unisoc.com>
> > > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > ---
> > > > >  kernel/sched/psi.c | 6 ++++--
> > > > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > > index cc25a3c..d00e585 100644
> > > > > --- a/kernel/sched/psi.c
> > > > > +++ b/kernel/sched/psi.c
> > > > > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> > > > >
> > > > >  static void psi_avgs_work(struct work_struct *work);
> > > > >
> > > > > +static void poll_timer_fn(struct timer_list *t);
> > > > > +
> > > > >  static void group_init(struct psi_group *group)
> > > > >  {
> > > > >       int cpu;
> > > > > @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
> > > > >       memset(group->polling_total, 0, sizeof(group->polling_total));
> > > > >       group->polling_next_update = ULLONG_MAX;
> > > > >       group->polling_until = 0;
> > > > > +     init_waitqueue_head(&group->poll_wait);
> > > > > +     timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > > >
> > > > This makes sense.
> > >
> > > Well, this means we initialize resources for triggers in each psi
> > > group even if the user never creates any triggers. Current logic
> > > initializes them when the first trigger in the group gets created.
> > >
> > > >
> > > > >       rcu_assign_pointer(group->poll_task, NULL);
> > > > >  }
> > > > >
> > > > > @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > > >                       return ERR_CAST(task);
> > > > >               }
> > > > >               atomic_set(&group->poll_wakeup, 0);
> > > > > -             init_waitqueue_head(&group->poll_wait);
> > > > >               wake_up_process(task);
> > > > >               timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > > >
> > > > This looks now unncessary?
> > > >
> > > > >               rcu_assign_pointer(group->poll_task, task);
> > > > > @@ -1233,7 +1236,6 @@ static void psi_trigger_destroy(struct kref *ref)
> > > > >                * But it might have been already scheduled before
> > > > >                * that - deschedule it cleanly before destroying it.
> > > > >                */
> > > > > -             del_timer_sync(&group->poll_timer);
> > > >
> > > > And this looks wrong. Did you mean to delete the timer_setup() line
> > > > instead?
> > >
> > > I would like to get more details about this race before trying to fix
> > > it. Please clarify.
> > > Thanks!


* Re: [[RFC]PATCH] psi: fix race between psi_trigger_create and psimon
  2021-05-18  1:47         ` Suren Baghdasaryan
@ 2021-05-18  2:08           ` Suren Baghdasaryan
  0 siblings, 0 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2021-05-18  2:08 UTC (permalink / raw)
  To: Zhaoyang Huang; +Cc: Johannes Weiner, Zhaoyang Huang, Ziwei Dai, Ke Wang, LKML

On Mon, May 17, 2021 at 6:47 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, May 17, 2021 at 5:41 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
> >
> > On Tue, May 18, 2021 at 5:30 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Mon, May 17, 2021 at 12:33 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Mon, May 17, 2021 at 11:36 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > >
> > > > > CC Suren
> > > >
> > > > Thanks!

When resending the patch, please run scripts/get_maintainer.pl against
your patch and CC the reported recipients.
Thanks!

> > > >
> > > > >
> > > > > On Mon, May 17, 2021 at 05:04:09PM +0800, Huangzhaoyang wrote:
> > > > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > >
> > > > > > Race detected between psimon_new and psimon_old as shown below, which
> > > > > > cause panic by accessing invalid psi_system->poll_wait->wait_queue_entry
> > > > > > and psi_system->poll_timer->entry->next. It is not necessary to reinit
> > > > > > resource of psi_system when psi_trigger_create.
> > > >
> > > > resource of psi_system will not be reinitialized because
> > > > init_waitqueue_head(&group->poll_wait) and friends are initialized
> > > > only during the creation of the first trigger for that group (see this
> > > > condition: https://elixir.bootlin.com/linux/latest/source/kernel/sched/psi.c#L1119).
> > > >
> > > > > >
> > > > > > psi_trigger_create      psimon_new     psimon_old
> > > > > >  init_waitqueue_head                    finish_wait
> > > > > >                                           spin_lock(lock_old)
> > > > > >       spin_lock_init(lock_new)
> > > > > >  wake_up_process(psimon_new)
> > > > > >
> > > > > >                         finish_wait
> > > > > >                           spin_lock(lock_new)
> > > > > >                             list_del       list_del
> > > >
> > > > Could you please clarify this race a bit? I'm having trouble
> > > > deciphering this diagram. I'm guessing psimon_new/psimon_old refer to
> > > > a new trigger being created while an old one is being deleted, so it
> > > > seems like a race between psi_trigger_create/psi_trigger_destroy. The
> > > > combination of trigger_lock and RCU should be protecting us from that
> > > > but maybe I missed something?
> > > > I'm excluding a possibility of a race between psi_trigger_create with
> > > > another existing trigger on the same group because the codepath
> > > > calling init_waitqueue_head(&group->poll_wait) happens only when the
> > > > first trigger for that group is created. Therefore if there is an
> > > > existing trigger in that group that codepath will not be taken.
> > >
> > > Ok, looking at the current code I think you can hit the following race
> > > when psi_trigger_destroy is destroying the last trigger in a psi group
> > > while racing with psi_trigger_create:
> > >
> > > psi_trigger_destroy                      psi_trigger_create
> > > mutex_lock(trigger_lock);
> > > rcu_assign_pointer(poll_task, NULL);
> > > mutex_unlock(trigger_lock);
> > >                                                     mutex_lock(trigger_lock);
> > >                                                     if
> > > (!rcu_access_pointer(group->poll_task)) {
> > >
> > > timer_setup(poll_timer, poll_timer_fn, 0);
> > >
> > > rcu_assign_pointer(poll_task, task);
> > >                                                     }
> > >                                                     mutex_unlock(trigger_lock);
> > >
> > > synchronize_rcu();
> > > del_timer_sync(poll_timer); <-- poll_timer has been reinitialized by
> > > psi_trigger_create
> > >
> > > So, trigger_lock/RCU correctly protects destruction of
> > > group->poll_task but misses this race affecting poll_timer and
> > > poll_wait.
> > > Let me think if we can handle this without moving initialization into
> > > group_init().
> > Right, this is exactly what we met during a monkey test on an android
> > system, where the psimon will be destroyed/recreated by unref/recreate
> > the psi_trigger. IMHO,  poll_timer and poll_wait should exist during
> > whole period
>
> Ok, understood. I think it should be ok to initialize poll_wait and
> poll_timer at the group creation time. Looks like
> init_waitqueue_head() and timer_setup() initialize the fields but I
> don't think they allocate some additional resources. Johannes pointed
> to some issues in your original patch, so I've made some small
> modifications (see below). del_timer_sync() was important back when we
> used kthread_worker, now even if timer fires unnecessarily it should
> be harmless after we reset group->poll_task. So I think a del_timer()
> in psi_trigger_destroy() should be enough:
>
> @@ -181,6 +181,7 @@ struct psi_group psi_system = {
>  };
>
>  static void psi_avgs_work(struct work_struct *work);
> +static void poll_timer_fn(struct timer_list *t);
>
>  static void group_init(struct psi_group *group)
>  {
> @@ -202,6 +203,8 @@ static void group_init(struct psi_group *group)
>          group->polling_next_update = ULLONG_MAX;
>          group->polling_until = 0;
>          rcu_assign_pointer(group->poll_task, NULL);
> +        init_waitqueue_head(&group->poll_wait);
> +        timer_setup(&group->poll_timer, poll_timer_fn, 0);
>  }
>
>  void __init psi_init(void)
> @@ -1157,9 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct
> psi_group *group,
>                          return ERR_CAST(task);
>                  }
>                  atomic_set(&group->poll_wakeup, 0);
> -                init_waitqueue_head(&group->poll_wait);
>                  wake_up_process(task);
> -                timer_setup(&group->poll_timer, poll_timer_fn, 0);
>                  rcu_assign_pointer(group->poll_task, task);
>          }
>
> @@ -1211,6 +1212,7 @@ static void psi_trigger_destroy(struct kref *ref)
>                                          group->poll_task,
>                                          lockdep_is_held(&group->trigger_lock));
>                          rcu_assign_pointer(group->poll_task, NULL);
> +                        del_timer(&group->poll_timer);
>                  }
>          }
>
> @@ -1230,10 +1232,7 @@ static void psi_trigger_destroy(struct kref *ref)
>                  /*
>                   * After the RCU grace period has expired, the worker
>                   * can no longer be found through group->poll_task.
> -                 * But it might have been already scheduled before
> -                 * that - deschedule it cleanly before destroying it.
>                   */
> -                del_timer_sync(&group->poll_timer);
>                  kthread_stop(task_to_destroy);
>          }
>          kfree(t);
>
> > >
> > > >
> > > > > >
> > > > > > Signed-off-by: ziwei.dai <ziwei.dai@unisoc.com>
> > > > > > Signed-off-by: ke.wang <ke.wang@unisoc.com>
> > > > > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > > > > ---
> > > > > >  kernel/sched/psi.c | 6 ++++--
> > > > > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > > > > >
> > > > > > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > > > > > index cc25a3c..d00e585 100644
> > > > > > --- a/kernel/sched/psi.c
> > > > > > +++ b/kernel/sched/psi.c
> > > > > > @@ -182,6 +182,8 @@ struct psi_group psi_system = {
> > > > > >
> > > > > >  static void psi_avgs_work(struct work_struct *work);
> > > > > >
> > > > > > +static void poll_timer_fn(struct timer_list *t);
> > > > > > +
> > > > > >  static void group_init(struct psi_group *group)
> > > > > >  {
> > > > > >       int cpu;
> > > > > > @@ -201,6 +203,8 @@ static void group_init(struct psi_group *group)
> > > > > >       memset(group->polling_total, 0, sizeof(group->polling_total));
> > > > > >       group->polling_next_update = ULLONG_MAX;
> > > > > >       group->polling_until = 0;
> > > > > > +     init_waitqueue_head(&group->poll_wait);
> > > > > > +     timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > > > >
> > > > > This makes sense.
> > > >
> > > > Well, this means we initialize resources for triggers in each psi
> > > > group even if the user never creates any triggers. Current logic
> > > > initializes them when the first trigger in the group gets created.
> > > >
> > > > >
> > > > > >       rcu_assign_pointer(group->poll_task, NULL);
> > > > > >  }
> > > > > >
> > > > > > @@ -1157,7 +1161,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
> > > > > >                       return ERR_CAST(task);
> > > > > >               }
> > > > > >               atomic_set(&group->poll_wakeup, 0);
> > > > > > -             init_waitqueue_head(&group->poll_wait);
> > > > > >               wake_up_process(task);
> > > > > >               timer_setup(&group->poll_timer, poll_timer_fn, 0);
> > > > >
> > > > > This looks now unncessary?
> > > > >
> > > > > >               rcu_assign_pointer(group->poll_task, task);
> > > > > > @@ -1233,7 +1236,6 @@ static void psi_trigger_destroy(struct kref *ref)
> > > > > >                * But it might have been already scheduled before
> > > > > >                * that - deschedule it cleanly before destroying it.
> > > > > >                */
> > > > > > -             del_timer_sync(&group->poll_timer);
> > > > >
> > > > > And this looks wrong. Did you mean to delete the timer_setup() line
> > > > > instead?
> > > >
> > > > I would like to get more details about this race before trying to fix
> > > > it. Please clarify.
> > > > Thanks!

