linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] sched/core: fix illegal RCU from offline CPUs
@ 2020-04-01 21:40 Qian Cai
  2020-04-02 11:24 ` Michael Ellerman
  2020-05-01 18:22 ` [tip: sched/core] sched/core: Fix " tip-bot2 for Peter Zijlstra
  0 siblings, 2 replies; 17+ messages in thread
From: Qian Cai @ 2020-04-01 21:40 UTC (permalink / raw)
  To: peterz, mingo
  Cc: juri.lelli, dietmar.eggemann, vincent.guittot, rostedt, bsegall,
	mgorman, paulmck, tglx, mpe, James.Bottomley, deller,
	linuxppc-dev, linux-parisc, linux-mm, linux-kernel, Qian Cai

From: Peter Zijlstra <peterz@infradead.org>

In the CPU-offline process, it calls mmdrop() after idle entry and the
subsequent call to cpuhp_report_idle_dead(). Once execution passes the
call to rcu_report_dead(), RCU is ignoring the CPU, which results in
lockdep complaining when mmdrop() uses RCU from either memcg or
debugobjects below.

Fix it by cleaning up the active_mm state from BP instead. Every arch
which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
from AP. The only exception is parisc because it switches them to
&init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
but the patch will still work there because it calls mmgrab(&init_mm) in
smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().

WARNING: suspicious RCU usage
-----------------------------
kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!

other info that might help us debug this:

RCU used illegally from offline CPU!
Call Trace:
 dump_stack+0xf4/0x164 (unreliable)
 lockdep_rcu_suspicious+0x140/0x164
 get_work_pool+0x110/0x150
 __queue_work+0x1bc/0xca0
 queue_work_on+0x114/0x120
 css_release+0x9c/0xc0
 percpu_ref_put_many+0x204/0x230
 free_pcp_prepare+0x264/0x570
 free_unref_page+0x38/0xf0
 __mmdrop+0x21c/0x2c0
 idle_task_exit+0x170/0x1b0
 pnv_smp_cpu_kill_self+0x38/0x2e0
 cpu_die+0x48/0x64
 arch_cpu_idle_dead+0x30/0x50
 do_idle+0x2f4/0x470
 cpu_startup_entry+0x38/0x40
 start_secondary+0x7a8/0xa80
 start_secondary_resume+0x10/0x14

<Peter to sign off here>
Signed-off-by: Qian Cai <cai@lca.pw>
---
 arch/powerpc/platforms/powernv/smp.c |  1 -
 include/linux/sched/mm.h             |  2 ++
 kernel/cpu.c                         | 18 +++++++++++++++++-
 kernel/sched/core.c                  |  5 +++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e251699346..b2ba3e95bda7 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
 	/* Standard hot unplug procedure */
 
 	idle_task_exit();
-	current->active_mm = NULL; /* for sanity */
 	cpu = smp_processor_id();
 	DBG("CPU%d offline\n", cpu);
 	generic_set_cpu_dead(cpu);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a3b510..a132d875d351 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+void mmdrop(struct mm_struct *mm);
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_sem for writing before modifying the
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2371292f30b0..244d30544377 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
  *
  * This code is licenced under the GPL.
  */
+#include <linux/sched/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/smp.h>
 #include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
 	return bringup_wait_for_ap(cpu);
 }
 
+static int finish_cpu(unsigned int cpu)
+{
+	struct task_struct *idle = idle_thread_get(cpu);
+	struct mm_struct *mm = idle->active_mm;
+
+	/*
+	 * idle_task_exit() will have switched to &init_mm, now
+	 * clean up any remaining active_mm state.
+	 */
+	if (mm != &init_mm)
+		idle->active_mm = &init_mm;
+	mmdrop(mm);
+	return 0;
+}
+
 /*
  * Hotplug state machine related functions
  */
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
 		.startup.single		= bringup_cpu,
-		.teardown.single	= NULL,
+		.teardown.single	= finish_cpu,
 		.cant_stop		= true,
 	},
 	/* Final state before CPU kills itself */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a2694ba82874..8787958339d5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6200,13 +6200,14 @@ void idle_task_exit(void)
 	struct mm_struct *mm = current->active_mm;
 
 	BUG_ON(cpu_online(smp_processor_id()));
+	BUG_ON(current != this_rq()->idle);
 
 	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
-		current->active_mm = &init_mm;
 		finish_arch_post_lock_switch();
 	}
-	mmdrop(mm);
+
+	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
 /*
-- 
2.21.0 (Apple Git-122.2)


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-04-01 21:40 [PATCH v2] sched/core: fix illegal RCU from offline CPUs Qian Cai
@ 2020-04-02 11:24 ` Michael Ellerman
  2020-04-02 14:00   ` Qian Cai
  2020-04-17 13:26   ` Qian Cai
  2020-05-01 18:22 ` [tip: sched/core] sched/core: Fix " tip-bot2 for Peter Zijlstra
  1 sibling, 2 replies; 17+ messages in thread
From: Michael Ellerman @ 2020-04-02 11:24 UTC (permalink / raw)
  To: Qian Cai, peterz, mingo
  Cc: juri.lelli, dietmar.eggemann, vincent.guittot, rostedt, bsegall,
	mgorman, paulmck, tglx, James.Bottomley, deller, linuxppc-dev,
	linux-parisc, linux-mm, linux-kernel, Qian Cai, Nicholas Piggin

Qian Cai <cai@lca.pw> writes:
> From: Peter Zijlstra <peterz@infradead.org>
>
> In the CPU-offline process, it calls mmdrop() after idle entry and the
> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
> lockdep complaining when mmdrop() uses RCU from either memcg or
> debugobjects below.
>
> Fix it by cleaning up the active_mm state from BP instead. Every arch
> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
> from AP. The only exception is parisc because it switches them to
> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
> but the patch will still work there because it calls mmgrab(&init_mm) in
> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().

Thanks for debugging this. How did you hit it in the first place?

A link to the original thread would have helped me:

  https://lore.kernel.org/lkml/20200113190331.12788-1-cai@lca.pw/

> WARNING: suspicious RCU usage
> -----------------------------
> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
>
> other info that might help us debug this:
>
> RCU used illegally from offline CPU!
> Call Trace:
>  dump_stack+0xf4/0x164 (unreliable)
>  lockdep_rcu_suspicious+0x140/0x164
>  get_work_pool+0x110/0x150
>  __queue_work+0x1bc/0xca0
>  queue_work_on+0x114/0x120
>  css_release+0x9c/0xc0
>  percpu_ref_put_many+0x204/0x230
>  free_pcp_prepare+0x264/0x570
>  free_unref_page+0x38/0xf0
>  __mmdrop+0x21c/0x2c0
>  idle_task_exit+0x170/0x1b0
>  pnv_smp_cpu_kill_self+0x38/0x2e0
>  cpu_die+0x48/0x64
>  arch_cpu_idle_dead+0x30/0x50
>  do_idle+0x2f4/0x470
>  cpu_startup_entry+0x38/0x40
>  start_secondary+0x7a8/0xa80
>  start_secondary_resume+0x10/0x14

Do we know when this started happening? ie. can we determine a Fixes
tag?

> <Peter to sign off here>
> Signed-off-by: Qian Cai <cai@lca.pw>
> ---
>  arch/powerpc/platforms/powernv/smp.c |  1 -
>  include/linux/sched/mm.h             |  2 ++
>  kernel/cpu.c                         | 18 +++++++++++++++++-
>  kernel/sched/core.c                  |  5 +++--
>  4 files changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> index 13e251699346..b2ba3e95bda7 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
>  	/* Standard hot unplug procedure */
>  
>  	idle_task_exit();
> -	current->active_mm = NULL; /* for sanity */

If I'm reading it right, we'll now be running with active_mm == init_mm
in the offline loop.

I guess that's fine, I can't think of any reason it would matter, and it
seems like we were NULL'ing it out just for paranoia's sake not because
of any actual problem.

Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)


cheers

> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index c49257a3b510..a132d875d351 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
>  		__mmdrop(mm);
>  }
>  
> +void mmdrop(struct mm_struct *mm);
> +
>  /*
>   * This has to be called after a get_task_mm()/mmget_not_zero()
>   * followed by taking the mmap_sem for writing before modifying the
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 2371292f30b0..244d30544377 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -3,6 +3,7 @@
>   *
>   * This code is licenced under the GPL.
>   */
> +#include <linux/sched/mm.h>
>  #include <linux/proc_fs.h>
>  #include <linux/smp.h>
>  #include <linux/init.h>
> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
>  	return bringup_wait_for_ap(cpu);
>  }
>  
> +static int finish_cpu(unsigned int cpu)
> +{
> +	struct task_struct *idle = idle_thread_get(cpu);
> +	struct mm_struct *mm = idle->active_mm;
> +
> +	/*
> +	 * idle_task_exit() will have switched to &init_mm, now
> +	 * clean up any remaining active_mm state.
> +	 */
> +	if (mm != &init_mm)
> +		idle->active_mm = &init_mm;
> +	mmdrop(mm);
> +	return 0;
> +}
> +
>  /*
>   * Hotplug state machine related functions
>   */
> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
>  	[CPUHP_BRINGUP_CPU] = {
>  		.name			= "cpu:bringup",
>  		.startup.single		= bringup_cpu,
> -		.teardown.single	= NULL,
> +		.teardown.single	= finish_cpu,
>  		.cant_stop		= true,
>  	},
>  	/* Final state before CPU kills itself */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a2694ba82874..8787958339d5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
>  	struct mm_struct *mm = current->active_mm;
>  
>  	BUG_ON(cpu_online(smp_processor_id()));
> +	BUG_ON(current != this_rq()->idle);
>  
>  	if (mm != &init_mm) {
>  		switch_mm(mm, &init_mm, current);
> -		current->active_mm = &init_mm;
>  		finish_arch_post_lock_switch();
>  	}
> -	mmdrop(mm);
> +
> +	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
>  }
>  
>  /*
> -- 
> 2.21.0 (Apple Git-122.2)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-04-02 11:24 ` Michael Ellerman
@ 2020-04-02 14:00   ` Qian Cai
  2020-04-02 15:54     ` Paul E. McKenney
  2020-04-17 13:26   ` Qian Cai
  1 sibling, 1 reply; 17+ messages in thread
From: Qian Cai @ 2020-04-02 14:00 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Peter Zijlstra, Ingo Molnar, juri.lelli, dietmar.eggemann,
	vincent.guittot, rostedt, bsegall, mgorman, paulmck, tglx,
	James.Bottomley@hansenpartnership.com, deller, linuxppc-dev,
	linux-parisc, linux-mm, linux-kernel, Nicholas Piggin



> On Apr 2, 2020, at 7:24 AM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Qian Cai <cai@lca.pw> writes:
>> From: Peter Zijlstra <peterz@infradead.org>
>> 
>> In the CPU-offline process, it calls mmdrop() after idle entry and the
>> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
>> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
>> lockdep complaining when mmdrop() uses RCU from either memcg or
>> debugobjects below.
>> 
>> Fix it by cleaning up the active_mm state from BP instead. Every arch
>> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
>> from AP. The only exception is parisc because it switches them to
>> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
>> but the patch will still work there because it calls mmgrab(&init_mm) in
>> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().
> 
> Thanks for debugging this. How did you hit it in the first place?

Just repeatedly offline/online CPUs which will eventually cause an idle thread
refcount goes to 0 and trigger __mmdrop() and of course it needs to enable
lockdep (PROVE_RCU?) as well as having luck to hit the cgroup, workqueue
or debugobject code paths to call RCU.

> 
> A link to the original thread would have helped me:
> 
>  https://lore.kernel.org/lkml/20200113190331.12788-1-cai@lca.pw/
> 
>> WARNING: suspicious RCU usage
>> -----------------------------
>> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
>> 
>> other info that might help us debug this:
>> 
>> RCU used illegally from offline CPU!
>> Call Trace:
>> dump_stack+0xf4/0x164 (unreliable)
>> lockdep_rcu_suspicious+0x140/0x164
>> get_work_pool+0x110/0x150
>> __queue_work+0x1bc/0xca0
>> queue_work_on+0x114/0x120
>> css_release+0x9c/0xc0
>> percpu_ref_put_many+0x204/0x230
>> free_pcp_prepare+0x264/0x570
>> free_unref_page+0x38/0xf0
>> __mmdrop+0x21c/0x2c0
>> idle_task_exit+0x170/0x1b0
>> pnv_smp_cpu_kill_self+0x38/0x2e0
>> cpu_die+0x48/0x64
>> arch_cpu_idle_dead+0x30/0x50
>> do_idle+0x2f4/0x470
>> cpu_startup_entry+0x38/0x40
>> start_secondary+0x7a8/0xa80
>> start_secondary_resume+0x10/0x14
> 
> Do we know when this started happening? ie. can we determine a Fixes
> tag?

I don’t know. I looked at some commits that it seems the code was like that
even 10-year ago. It must be nobody who cares to run lockdep (PROVE_RCU?)
with CPU hotplug very regularly.

> 
>> <Peter to sign off here>
>> Signed-off-by: Qian Cai <cai@lca.pw>
>> ---
>> arch/powerpc/platforms/powernv/smp.c |  1 -
>> include/linux/sched/mm.h             |  2 ++
>> kernel/cpu.c                         | 18 +++++++++++++++++-
>> kernel/sched/core.c                  |  5 +++--
>> 4 files changed, 22 insertions(+), 4 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
>> index 13e251699346..b2ba3e95bda7 100644
>> --- a/arch/powerpc/platforms/powernv/smp.c
>> +++ b/arch/powerpc/platforms/powernv/smp.c
>> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
>> 	/* Standard hot unplug procedure */
>> 
>> 	idle_task_exit();
>> -	current->active_mm = NULL; /* for sanity */
> 
> If I'm reading it right, we'll now be running with active_mm == init_mm
> in the offline loop.
> 
> I guess that's fine, I can't think of any reason it would matter, and it
> seems like we were NULL'ing it out just for paranoia's sake not because
> of any actual problem.
> 
> Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
> 
> 
> cheers
> 
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index c49257a3b510..a132d875d351 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
>> 		__mmdrop(mm);
>> }
>> 
>> +void mmdrop(struct mm_struct *mm);
>> +
>> /*
>>  * This has to be called after a get_task_mm()/mmget_not_zero()
>>  * followed by taking the mmap_sem for writing before modifying the
>> diff --git a/kernel/cpu.c b/kernel/cpu.c
>> index 2371292f30b0..244d30544377 100644
>> --- a/kernel/cpu.c
>> +++ b/kernel/cpu.c
>> @@ -3,6 +3,7 @@
>>  *
>>  * This code is licenced under the GPL.
>>  */
>> +#include <linux/sched/mm.h>
>> #include <linux/proc_fs.h>
>> #include <linux/smp.h>
>> #include <linux/init.h>
>> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
>> 	return bringup_wait_for_ap(cpu);
>> }
>> 
>> +static int finish_cpu(unsigned int cpu)
>> +{
>> +	struct task_struct *idle = idle_thread_get(cpu);
>> +	struct mm_struct *mm = idle->active_mm;
>> +
>> +	/*
>> +	 * idle_task_exit() will have switched to &init_mm, now
>> +	 * clean up any remaining active_mm state.
>> +	 */
>> +	if (mm != &init_mm)
>> +		idle->active_mm = &init_mm;
>> +	mmdrop(mm);
>> +	return 0;
>> +}
>> +
>> /*
>>  * Hotplug state machine related functions
>>  */
>> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
>> 	[CPUHP_BRINGUP_CPU] = {
>> 		.name			= "cpu:bringup",
>> 		.startup.single		= bringup_cpu,
>> -		.teardown.single	= NULL,
>> +		.teardown.single	= finish_cpu,
>> 		.cant_stop		= true,
>> 	},
>> 	/* Final state before CPU kills itself */
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index a2694ba82874..8787958339d5 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
>> 	struct mm_struct *mm = current->active_mm;
>> 
>> 	BUG_ON(cpu_online(smp_processor_id()));
>> +	BUG_ON(current != this_rq()->idle);
>> 
>> 	if (mm != &init_mm) {
>> 		switch_mm(mm, &init_mm, current);
>> -		current->active_mm = &init_mm;
>> 		finish_arch_post_lock_switch();
>> 	}
>> -	mmdrop(mm);
>> +
>> +	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
>> }
>> 
>> /*
>> -- 
>> 2.21.0 (Apple Git-122.2)


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-04-02 14:00   ` Qian Cai
@ 2020-04-02 15:54     ` Paul E. McKenney
  2020-04-02 16:19       ` Qian Cai
  0 siblings, 1 reply; 17+ messages in thread
From: Paul E. McKenney @ 2020-04-02 15:54 UTC (permalink / raw)
  To: Qian Cai
  Cc: Michael Ellerman, Peter Zijlstra, Ingo Molnar, juri.lelli,
	dietmar.eggemann, vincent.guittot, rostedt, bsegall, mgorman,
	tglx, James.Bottomley@hansenpartnership.com, deller,
	linuxppc-dev, linux-parisc, linux-mm, linux-kernel,
	Nicholas Piggin

On Thu, Apr 02, 2020 at 10:00:16AM -0400, Qian Cai wrote:
> 
> 
> > On Apr 2, 2020, at 7:24 AM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> > 
> > Qian Cai <cai@lca.pw> writes:
> >> From: Peter Zijlstra <peterz@infradead.org>
> >> 
> >> In the CPU-offline process, it calls mmdrop() after idle entry and the
> >> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
> >> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
> >> lockdep complaining when mmdrop() uses RCU from either memcg or
> >> debugobjects below.
> >> 
> >> Fix it by cleaning up the active_mm state from BP instead. Every arch
> >> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
> >> from AP. The only exception is parisc because it switches them to
> >> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
> >> but the patch will still work there because it calls mmgrab(&init_mm) in
> >> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().
> > 
> > Thanks for debugging this. How did you hit it in the first place?
> 
> Just repeatedly offline/online CPUs which will eventually cause an idle thread
> refcount goes to 0 and trigger __mmdrop() and of course it needs to enable
> lockdep (PROVE_RCU?) as well as having luck to hit the cgroup, workqueue
> or debugobject code paths to call RCU.
> 
> > 
> > A link to the original thread would have helped me:
> > 
> >  https://lore.kernel.org/lkml/20200113190331.12788-1-cai@lca.pw/
> > 
> >> WARNING: suspicious RCU usage
> >> -----------------------------
> >> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
> >> 
> >> other info that might help us debug this:
> >> 
> >> RCU used illegally from offline CPU!
> >> Call Trace:
> >> dump_stack+0xf4/0x164 (unreliable)
> >> lockdep_rcu_suspicious+0x140/0x164
> >> get_work_pool+0x110/0x150
> >> __queue_work+0x1bc/0xca0
> >> queue_work_on+0x114/0x120
> >> css_release+0x9c/0xc0
> >> percpu_ref_put_many+0x204/0x230
> >> free_pcp_prepare+0x264/0x570
> >> free_unref_page+0x38/0xf0
> >> __mmdrop+0x21c/0x2c0
> >> idle_task_exit+0x170/0x1b0
> >> pnv_smp_cpu_kill_self+0x38/0x2e0
> >> cpu_die+0x48/0x64
> >> arch_cpu_idle_dead+0x30/0x50
> >> do_idle+0x2f4/0x470
> >> cpu_startup_entry+0x38/0x40
> >> start_secondary+0x7a8/0xa80
> >> start_secondary_resume+0x10/0x14
> > 
> > Do we know when this started happening? ie. can we determine a Fixes
> > tag?
> 
> I don’t know. I looked at some commits that it seems the code was like that
> even 10-year ago. It must be nobody who cares to run lockdep (PROVE_RCU?)
> with CPU hotplug very regularly.

I do run this combination quite frequently, but only as part of
rcutorture, which might not be a representative workload.  For one thing,
it has a minimal userspace consisting only of a trivial init program.
I don't recall having ever seen this.  (I have seen one recent complaint
about an IPI being sent to an offline CPU, but I cannot prove that this
was not due to RCU bugs that I was chasing at the time.)

							Thanx, Paul

> >> <Peter to sign off here>
> >> Signed-off-by: Qian Cai <cai@lca.pw>
> >> ---
> >> arch/powerpc/platforms/powernv/smp.c |  1 -
> >> include/linux/sched/mm.h             |  2 ++
> >> kernel/cpu.c                         | 18 +++++++++++++++++-
> >> kernel/sched/core.c                  |  5 +++--
> >> 4 files changed, 22 insertions(+), 4 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> >> index 13e251699346..b2ba3e95bda7 100644
> >> --- a/arch/powerpc/platforms/powernv/smp.c
> >> +++ b/arch/powerpc/platforms/powernv/smp.c
> >> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
> >> 	/* Standard hot unplug procedure */
> >> 
> >> 	idle_task_exit();
> >> -	current->active_mm = NULL; /* for sanity */
> > 
> > If I'm reading it right, we'll now be running with active_mm == init_mm
> > in the offline loop.
> > 
> > I guess that's fine, I can't think of any reason it would matter, and it
> > seems like we were NULL'ing it out just for paranoia's sake not because
> > of any actual problem.
> > 
> > Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
> > 
> > 
> > cheers
> > 
> >> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> >> index c49257a3b510..a132d875d351 100644
> >> --- a/include/linux/sched/mm.h
> >> +++ b/include/linux/sched/mm.h
> >> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
> >> 		__mmdrop(mm);
> >> }
> >> 
> >> +void mmdrop(struct mm_struct *mm);
> >> +
> >> /*
> >>  * This has to be called after a get_task_mm()/mmget_not_zero()
> >>  * followed by taking the mmap_sem for writing before modifying the
> >> diff --git a/kernel/cpu.c b/kernel/cpu.c
> >> index 2371292f30b0..244d30544377 100644
> >> --- a/kernel/cpu.c
> >> +++ b/kernel/cpu.c
> >> @@ -3,6 +3,7 @@
> >>  *
> >>  * This code is licenced under the GPL.
> >>  */
> >> +#include <linux/sched/mm.h>
> >> #include <linux/proc_fs.h>
> >> #include <linux/smp.h>
> >> #include <linux/init.h>
> >> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
> >> 	return bringup_wait_for_ap(cpu);
> >> }
> >> 
> >> +static int finish_cpu(unsigned int cpu)
> >> +{
> >> +	struct task_struct *idle = idle_thread_get(cpu);
> >> +	struct mm_struct *mm = idle->active_mm;
> >> +
> >> +	/*
> >> +	 * idle_task_exit() will have switched to &init_mm, now
> >> +	 * clean up any remaining active_mm state.
> >> +	 */
> >> +	if (mm != &init_mm)
> >> +		idle->active_mm = &init_mm;
> >> +	mmdrop(mm);
> >> +	return 0;
> >> +}
> >> +
> >> /*
> >>  * Hotplug state machine related functions
> >>  */
> >> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
> >> 	[CPUHP_BRINGUP_CPU] = {
> >> 		.name			= "cpu:bringup",
> >> 		.startup.single		= bringup_cpu,
> >> -		.teardown.single	= NULL,
> >> +		.teardown.single	= finish_cpu,
> >> 		.cant_stop		= true,
> >> 	},
> >> 	/* Final state before CPU kills itself */
> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >> index a2694ba82874..8787958339d5 100644
> >> --- a/kernel/sched/core.c
> >> +++ b/kernel/sched/core.c
> >> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
> >> 	struct mm_struct *mm = current->active_mm;
> >> 
> >> 	BUG_ON(cpu_online(smp_processor_id()));
> >> +	BUG_ON(current != this_rq()->idle);
> >> 
> >> 	if (mm != &init_mm) {
> >> 		switch_mm(mm, &init_mm, current);
> >> -		current->active_mm = &init_mm;
> >> 		finish_arch_post_lock_switch();
> >> 	}
> >> -	mmdrop(mm);
> >> +
> >> +	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
> >> }
> >> 
> >> /*
> >> -- 
> >> 2.21.0 (Apple Git-122.2)
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-04-02 15:54     ` Paul E. McKenney
@ 2020-04-02 16:19       ` Qian Cai
  2020-04-02 16:57         ` Paul E. McKenney
  0 siblings, 1 reply; 17+ messages in thread
From: Qian Cai @ 2020-04-02 16:19 UTC (permalink / raw)
  To: paulmck
  Cc: Michael Ellerman, Peter Zijlstra, Ingo Molnar, juri.lelli,
	dietmar.eggemann, vincent.guittot, rostedt, bsegall, mgorman,
	tglx, James.Bottomley, deller, linuxppc-dev, linux-parisc,
	linux-mm, linux-kernel, Nicholas Piggin



> On Apr 2, 2020, at 11:54 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> 
> I do run this combination quite frequently, but only as part of
> rcutorture, which might not be a representative workload.  For one thing,
> it has a minimal userspace consisting only of a trivial init program.
> I don't recall having ever seen this.  (I have seen one recent complaint
> about an IPI being sent to an offline CPU, but I cannot prove that this
> was not due to RCU bugs that I was chasing at the time.)

Yes, a trivial init is tough while running systemd should be able to catch it as it will use cgroup.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-04-02 16:19       ` Qian Cai
@ 2020-04-02 16:57         ` Paul E. McKenney
  0 siblings, 0 replies; 17+ messages in thread
From: Paul E. McKenney @ 2020-04-02 16:57 UTC (permalink / raw)
  To: Qian Cai
  Cc: Michael Ellerman, Peter Zijlstra, Ingo Molnar, juri.lelli,
	dietmar.eggemann, vincent.guittot, rostedt, bsegall, mgorman,
	tglx, James.Bottomley, deller, linuxppc-dev, linux-parisc,
	linux-mm, linux-kernel, Nicholas Piggin

On Thu, Apr 02, 2020 at 12:19:54PM -0400, Qian Cai wrote:
> 
> 
> > On Apr 2, 2020, at 11:54 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> > 
> > I do run this combination quite frequently, but only as part of
> > rcutorture, which might not be a representative workload.  For one thing,
> > it has a minimal userspace consisting only of a trivial init program.
> > I don't recall having ever seen this.  (I have seen one recent complaint
> > about an IPI being sent to an offline CPU, but I cannot prove that this
> > was not due to RCU bugs that I was chasing at the time.)
> 
> Yes, a trivial init is tough while running systemd should be able to catch it as it will use cgroup.

Not planning to add systemd to my rcutorture runs.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-04-02 11:24 ` Michael Ellerman
  2020-04-02 14:00   ` Qian Cai
@ 2020-04-17 13:26   ` Qian Cai
  2020-04-21 13:56     ` Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Qian Cai @ 2020-04-17 13:26 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Peter Zijlstra, Ingo Molnar, juri.lelli, dietmar.eggemann,
	vincent.guittot, Steven Rostedt, bsegall, mgorman, paulmck, tglx,
	James.Bottomley@hansenpartnership.com, deller, linuxppc-dev,
	linux-parisc, linux-mm, linux-kernel, Nicholas Piggin



> On Apr 2, 2020, at 7:24 AM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Qian Cai <cai@lca.pw> writes:
>> From: Peter Zijlstra <peterz@infradead.org>
>> 
>> In the CPU-offline process, it calls mmdrop() after idle entry and the
>> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
>> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
>> lockdep complaining when mmdrop() uses RCU from either memcg or
>> debugobjects below.
>> 
>> Fix it by cleaning up the active_mm state from BP instead. Every arch
>> which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
>> from AP. The only exception is parisc because it switches them to
>> &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
>> but the patch will still work there because it calls mmgrab(&init_mm) in
>> smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().
> 
> Thanks for debugging this. How did you hit it in the first place?
> 
> A link to the original thread would have helped me:
> 
>  https://lore.kernel.org/lkml/20200113190331.12788-1-cai@lca.pw/
> 
>> WARNING: suspicious RCU usage
>> -----------------------------
>> kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!
>> 
>> other info that might help us debug this:
>> 
>> RCU used illegally from offline CPU!
>> Call Trace:
>> dump_stack+0xf4/0x164 (unreliable)
>> lockdep_rcu_suspicious+0x140/0x164
>> get_work_pool+0x110/0x150
>> __queue_work+0x1bc/0xca0
>> queue_work_on+0x114/0x120
>> css_release+0x9c/0xc0
>> percpu_ref_put_many+0x204/0x230
>> free_pcp_prepare+0x264/0x570
>> free_unref_page+0x38/0xf0
>> __mmdrop+0x21c/0x2c0
>> idle_task_exit+0x170/0x1b0
>> pnv_smp_cpu_kill_self+0x38/0x2e0
>> cpu_die+0x48/0x64
>> arch_cpu_idle_dead+0x30/0x50
>> do_idle+0x2f4/0x470
>> cpu_startup_entry+0x38/0x40
>> start_secondary+0x7a8/0xa80
>> start_secondary_resume+0x10/0x14
> 
> Do we know when this started happening? ie. can we determine a Fixes
> tag?
> 
>> <Peter to sign off here>
>> Signed-off-by: Qian Cai <cai@lca.pw>
>> ---
>> arch/powerpc/platforms/powernv/smp.c |  1 -
>> include/linux/sched/mm.h             |  2 ++
>> kernel/cpu.c                         | 18 +++++++++++++++++-
>> kernel/sched/core.c                  |  5 +++--
>> 4 files changed, 22 insertions(+), 4 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
>> index 13e251699346..b2ba3e95bda7 100644
>> --- a/arch/powerpc/platforms/powernv/smp.c
>> +++ b/arch/powerpc/platforms/powernv/smp.c
>> @@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
>> 	/* Standard hot unplug procedure */
>> 
>> 	idle_task_exit();
>> -	current->active_mm = NULL; /* for sanity */
> 
> If I'm reading it right, we'll now be running with active_mm == init_mm
> in the offline loop.
> 
> I guess that's fine, I can't think of any reason it would matter, and it
> seems like we were NULL'ing it out just for paranoia's sake not because
> of any actual problem.
> 
> Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)

Peter, can you take a look at this patch when you have a chance?

> 
> 
> cheers
> 
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index c49257a3b510..a132d875d351 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
>> 		__mmdrop(mm);
>> }
>> 
>> +void mmdrop(struct mm_struct *mm);
>> +
>> /*
>>  * This has to be called after a get_task_mm()/mmget_not_zero()
>>  * followed by taking the mmap_sem for writing before modifying the
>> diff --git a/kernel/cpu.c b/kernel/cpu.c
>> index 2371292f30b0..244d30544377 100644
>> --- a/kernel/cpu.c
>> +++ b/kernel/cpu.c
>> @@ -3,6 +3,7 @@
>>  *
>>  * This code is licenced under the GPL.
>>  */
>> +#include <linux/sched/mm.h>
>> #include <linux/proc_fs.h>
>> #include <linux/smp.h>
>> #include <linux/init.h>
>> @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
>> 	return bringup_wait_for_ap(cpu);
>> }
>> 
>> +static int finish_cpu(unsigned int cpu)
>> +{
>> +	struct task_struct *idle = idle_thread_get(cpu);
>> +	struct mm_struct *mm = idle->active_mm;
>> +
>> +	/*
>> +	 * idle_task_exit() will have switched to &init_mm, now
>> +	 * clean up any remaining active_mm state.
>> +	 */
>> +	if (mm != &init_mm)
>> +		idle->active_mm = &init_mm;
>> +	mmdrop(mm);
>> +	return 0;
>> +}
>> +
>> /*
>>  * Hotplug state machine related functions
>>  */
>> @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
>> 	[CPUHP_BRINGUP_CPU] = {
>> 		.name			= "cpu:bringup",
>> 		.startup.single		= bringup_cpu,
>> -		.teardown.single	= NULL,
>> +		.teardown.single	= finish_cpu,
>> 		.cant_stop		= true,
>> 	},
>> 	/* Final state before CPU kills itself */
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index a2694ba82874..8787958339d5 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6200,13 +6200,14 @@ void idle_task_exit(void)
>> 	struct mm_struct *mm = current->active_mm;
>> 
>> 	BUG_ON(cpu_online(smp_processor_id()));
>> +	BUG_ON(current != this_rq()->idle);
>> 
>> 	if (mm != &init_mm) {
>> 		switch_mm(mm, &init_mm, current);
>> -		current->active_mm = &init_mm;
>> 		finish_arch_post_lock_switch();
>> 	}
>> -	mmdrop(mm);
>> +
>> +	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
>> }
>> 
>> /*
>> -- 
>> 2.21.0 (Apple Git-122.2)


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-04-17 13:26   ` Qian Cai
@ 2020-04-21 13:56     ` Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2020-04-21 13:56 UTC (permalink / raw)
  To: Qian Cai
  Cc: Michael Ellerman, Ingo Molnar, juri.lelli, dietmar.eggemann,
	vincent.guittot, Steven Rostedt, bsegall, mgorman, paulmck, tglx,
	James.Bottomley@hansenpartnership.com, deller, linuxppc-dev,
	linux-parisc, linux-mm, linux-kernel, Nicholas Piggin

On Fri, Apr 17, 2020 at 09:26:56AM -0400, Qian Cai wrote:

> > Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
> 
> Peter, can you take a look at this patch when you have a chance?

Sorry, -ETOOMUCHEMAIL, got it now, thanks!

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tip: sched/core] sched/core: Fix illegal RCU from offline CPUs
  2020-04-01 21:40 [PATCH v2] sched/core: fix illegal RCU from offline CPUs Qian Cai
  2020-04-02 11:24 ` Michael Ellerman
@ 2020-05-01 18:22 ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 17+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-05-01 18:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Qian Cai, Michael Ellerman, x86, LKML

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     bf2c59fce4074e55d622089b34be3a6bc95484fb
Gitweb:        https://git.kernel.org/tip/bf2c59fce4074e55d622089b34be3a6bc95484fb
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 01 Apr 2020 17:40:33 -04:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 30 Apr 2020 20:14:41 +02:00

sched/core: Fix illegal RCU from offline CPUs

In the CPU-offline process, it calls mmdrop() after idle entry and the
subsequent call to cpuhp_report_idle_dead(). Once execution passes the
call to rcu_report_dead(), RCU is ignoring the CPU, which results in
lockdep complaining when mmdrop() uses RCU from either memcg or
debugobjects below.

Fix it by cleaning up the active_mm state from BP instead. Every arch
which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
from AP. The only exception is parisc because it switches them to
&init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
but the patch will still work there because it calls mmgrab(&init_mm) in
smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().

  WARNING: suspicious RCU usage
  -----------------------------
  kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!

  other info that might help us debug this:

  RCU used illegally from offline CPU!
  Call Trace:
   dump_stack+0xf4/0x164 (unreliable)
   lockdep_rcu_suspicious+0x140/0x164
   get_work_pool+0x110/0x150
   __queue_work+0x1bc/0xca0
   queue_work_on+0x114/0x120
   css_release+0x9c/0xc0
   percpu_ref_put_many+0x204/0x230
   free_pcp_prepare+0x264/0x570
   free_unref_page+0x38/0xf0
   __mmdrop+0x21c/0x2c0
   idle_task_exit+0x170/0x1b0
   pnv_smp_cpu_kill_self+0x38/0x2e0
   cpu_die+0x48/0x64
   arch_cpu_idle_dead+0x30/0x50
   do_idle+0x2f4/0x470
   cpu_startup_entry+0x38/0x40
   start_secondary+0x7a8/0xa80
   start_secondary_resume+0x10/0x14

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Link: https://lkml.kernel.org/r/20200401214033.8448-1-cai@lca.pw
---
 arch/powerpc/platforms/powernv/smp.c |  1 -
 include/linux/sched/mm.h             |  2 ++
 kernel/cpu.c                         | 18 +++++++++++++++++-
 kernel/sched/core.c                  |  5 +++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e2516..b2ba3e9 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
 	/* Standard hot unplug procedure */
 
 	idle_task_exit();
-	current->active_mm = NULL; /* for sanity */
 	cpu = smp_processor_id();
 	DBG("CPU%d offline\n", cpu);
 	generic_set_cpu_dead(cpu);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a..a132d87 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+void mmdrop(struct mm_struct *mm);
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_sem for writing before modifying the
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2371292..244d305 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
  *
  * This code is licenced under the GPL.
  */
+#include <linux/sched/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/smp.h>
 #include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
 	return bringup_wait_for_ap(cpu);
 }
 
+static int finish_cpu(unsigned int cpu)
+{
+	struct task_struct *idle = idle_thread_get(cpu);
+	struct mm_struct *mm = idle->active_mm;
+
+	/*
+	 * idle_task_exit() will have switched to &init_mm, now
+	 * clean up any remaining active_mm state.
+	 */
+	if (mm != &init_mm)
+		idle->active_mm = &init_mm;
+	mmdrop(mm);
+	return 0;
+}
+
 /*
  * Hotplug state machine related functions
  */
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
 		.startup.single		= bringup_cpu,
-		.teardown.single	= NULL,
+		.teardown.single	= finish_cpu,
 		.cant_stop		= true,
 	},
 	/* Final state before CPU kills itself */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e6ba9e..f6ae262 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6197,13 +6197,14 @@ void idle_task_exit(void)
 	struct mm_struct *mm = current->active_mm;
 
 	BUG_ON(cpu_online(smp_processor_id()));
+	BUG_ON(current != this_rq()->idle);
 
 	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
-		current->active_mm = &init_mm;
 		finish_arch_post_lock_switch();
 	}
-	mmdrop(mm);
+
+	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
 /*

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-03-30  2:42       ` Qian Cai
@ 2020-04-01 21:05         ` Qian Cai
  0 siblings, 0 replies; 17+ messages in thread
From: Qian Cai @ 2020-04-01 21:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, juri.lelli, vincent.guittot, dietmar.eggemann,
	Steven Rostedt (VMware),
	bsegall, mgorman, Paul E. McKenney, Thomas Gleixner,
	Linux Memory Management List, LKML



> On Mar 29, 2020, at 10:42 PM, Qian Cai <cai@lca.pw> wrote:
> 
> 
> 
>> On Jan 21, 2020, at 5:35 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>> On Mon, Jan 20, 2020 at 03:35:09PM -0500, Qian Cai wrote:
>>>> On Jan 20, 2020, at 5:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>>> 
>>>> Bah.. that's horrible. Surely we can find a better place to do this in
>>>> the whole hotplug machinery.
>>>> 
>>>> Perhaps you can have takedown_cpu() do the mmdrop()?
>>> 
>>> The problem is that no all arch_cpu_idle_dead() will call
>>> idle_task_exit(). For example, alpha and parisc are not, so it needs
>> 
>> How is that not broken? If the idle thread runs on an active_mm, we need
>> to drop that reference. This needs fixing regardless.
> 
> It turns out alpha does not support cpu hotplug (no CONFIG_HOTPLUG_CPU),
> and parisc will use &init_mm for idle tasks of secondary CPUs as in,
> 
> smp_callin()
>  smp_cpu_init()
> 
> /* Initialise the idle task for this CPU */	
> mmgrab(&init_mm);
> current->active_mm = &init_mm;
> BUG_ON(current->mm);
> enter_lazy_tlb(&init_mm, current);
> 
>> 
>>> to deal with some kind of ifdef dance in takedown_cpu() to
>>> conditionally call mmdrop() which sounds even more horrible?
>>> 
>>> If you really prefer it anyway, maybe something like touching every arch’s __cpu_die() to also call mmdrop() depends on arches?
>>> 
>>> BTW, how to obtain the other CPU’s current task mm? Is that cpu_rq(cpu)->curr->active_mm?
>> 
>> Something like this; except you'll need to go audit archs to make sure
>> they all call idle_task_exit() and/or put in comments on why they don't
>> have to (perhaps their bringup switches them to &init_mm unconditionally
>> and the switch_mm() is not required).
> 
> I have to modify your code slightly (below) to pass compilation, but then it will
> panic during cpu offline for some reasons,

OK, I found this little thing on powerpc needs to flip. I’ll post an v2 with Peter
as an author. Thus, need your signed-off.

> 
> [  258.972625][ T9006] Faulting instruction address: 0xc00000000010afc4
> [  258.972640][ T9006] Oops: Kernel access of bad area, sig: 11 [#1]
> [  258.972651][ T9006] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256 DEBUG_PAGEALLOC NUMA PowerNV
> [  258.972675][ T9006] Modules linked in: kvm_hv kvm ip_tables x_tables xfs sd_mod bnx2x ahci mdio libahci tg3 libata libphy firmware_class dm_mirror dm_region_hash dm_log dm_mod
> [  258.972716][ T9006] CPU: 84 PID: 9006 Comm: cpuhotplug04.sh Not tainted 5.6.0-rc7-next-20200327+ #7
> [  258.972741][ T9006] NIP:  c00000000010afc4 LR: c00000000010afa0 CTR: fffffffffffffffd
> [  258.972775][ T9006] REGS: c000201a54def7f0 TRAP: 0300   Not tainted  (5.6.0-rc7-next-20200327+)
> [  258.972809][ T9006] MSR:  900000000280b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 28224824  XER: 20040000
> [  258.972839][ T9006] CFAR: c00000000015951c DAR: 0000000000000050 DSISR: 40000000 IRQMASK: 0 
> [  258.972839][ T9006] GPR00: c00000000010afa0 c000201a54defa80 c00000000165bd00 c000000001604180 
> [  258.972839][ T9006] GPR04: c00000008160496f 0000000000000000 ffff0a01ffffff10 0000000000000020 
> [  258.972839][ T9006] GPR08: 0000000000000050 0000000000000000 c00000000162c0e0 0000000000000002 
> [  258.972839][ T9006] GPR12: 0000000088224428 c000201fff69a900 0000000040000000 000000011a029798 
> [  258.972839][ T9006] GPR16: 000000011a029724 0000000119fc6968 0000000119f5f230 000000011a02d568 
> [  258.972839][ T9006] GPR20: 00000001554c66f0 0000000000000000 c000001ffc957820 c00000000010af80 
> [  258.972839][ T9006] GPR24: 0000001ffb8a0000 c0000000014f34a8 0000000000000056 c0000000010b7820 
> [  258.972839][ T9006] GPR28: c00000000168c654 0000000000000000 c00000000168c3a8 0000000000000000 
> [  258.973070][ T9006] NIP [c00000000010afc4] finish_cpu+0x44/0x90
> atomic_dec_return_relaxed at arch/powerpc/include/asm/atomic.h:179
> (inlined by) atomic_dec_return at include/linux/atomic-fallback.h:518
> (inlined by) atomic_dec_and_test at include/linux/atomic-fallback.h:1035
> (inlined by) mmdrop at include/linux/sched/mm.h:48
> (inlined by) finish_cpu at kernel/cpu.c:579
> [  258.973092][ T9006] LR [c00000000010afa0] finish_cpu+0x20/0x90
> [  258.973113][ T9006] Call Trace:
> [  258.973132][ T9006] [c000201a54defa80] [c00000000010afa0] finish_cpu+0x20/0x90 (unreliable)
> [  258.973166][ T9006] [c000201a54defaa0] [c00000000010cd34] cpuhp_invoke_callback+0x194/0x15f0
> cpuhp_invoke_callback at kernel/cpu.c:173
> [  258.973202][ T9006] [c000201a54defb50] [c000000000111acc] _cpu_down.constprop.15+0x17c/0x410
> [  258.973237][ T9006] [c000201a54defbc0] [c00000000010ff84] cpu_down+0x64/0xb0
> [  258.973263][ T9006] [c000201a54defc00] [c000000000754760] cpu_subsys_offline+0x20/0x40
> [  258.973299][ T9006] [c000201a54defc20] [c00000000074ab10] device_offline+0x100/0x140
> [  258.973344][ T9006] [c000201a54defc60] [c00000000074ace0] online_store+0xa0/0x110
> [  258.973378][ T9006] [c000201a54defcb0] [c000000000744508] dev_attr_store+0x38/0x60
> [  258.973414][ T9006] [c000201a54defcd0] [c0000000005df770] sysfs_kf_write+0x70/0xb0
> [  258.973438][ T9006] [c000201a54defd10] [c0000000005de92c] kernfs_fop_write+0x11c/0x270
> [  258.973463][ T9006] [c000201a54defd60] [c0000000004c09bc] __vfs_write+0x3c/0x70
> [  258.973487][ T9006] [c000201a54defd80] [c0000000004c3dbc] vfs_write+0xcc/0x200
> [  258.973522][ T9006] [c000201a54defdd0] [c0000000004c415c] ksys_write+0x7c/0x140
> [  258.973557][ T9006] [c000201a54defe20] [c00000000000b3f8] system_call+0x5c/0x68
> [  258.973579][ T9006] Instruction dump:
> [  258.973598][ T9006] f8010010 f821ffe1 4804e4ed 60000000 3d42fffd 394a03e0 e9230560 7fa95000 
> [  258.973636][ T9006] 419e0030 f9430560 39090050 7c0004ac <7d404028> 314affff 7d40412d 40c2fff4 
> [  258.973675][ T9006] ---[ end trace 4f72c711066ac397 ]---
> [  259.076271][ T9006] 
> [  260.076362][ T9006] Kernel panic - not syncing: Fatal exception
> 
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index c49257a3b510..bc782562ae93 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
> 		__mmdrop(mm);
> }
> 
> +extern void mmdrop(struct mm_struct *mm);
> +
> /*
>  * This has to be called after a get_task_mm()/mmget_not_zero()
>  * followed by taking the mmap_sem for writing before modifying the
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 0cca1a2b4930..302007a602a5 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -3,6 +3,7 @@
>  *
>  * This code is licenced under the GPL.
>  */
> +#include <linux/sched/mm.h>
> #include <linux/proc_fs.h>
> #include <linux/smp.h>
> #include <linux/init.h>
> @@ -564,6 +565,22 @@ static int bringup_cpu(unsigned int cpu)
> 	return bringup_wait_for_ap(cpu);
> }
> 
> +static int finish_cpu(unsigned int cpu)
> +{
> +	struct task_struct *idle = idle_thread_get(cpu);
> +	struct mm_struct *mm = idle->active_mm;
> +
> +	/*
> +	 * idle_task_exit() will have switched to &init_mm, now
> +	 * clean up any remaining active_mm state.
> +	 */
> +	if (mm != &init_mm) {
> +		idle->active_mm = &init_mm;
> +		mmdrop(mm);
> +	}
> +	return 0;
> +}
> +
> /*
>  * Hotplug state machine related functions
>  */
> @@ -1549,7 +1566,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
> 	[CPUHP_BRINGUP_CPU] = {
> 		.name			= "cpu:bringup",
> 		.startup.single		= bringup_cpu,
> -		.teardown.single	= NULL,
> +		.teardown.single	= finish_cpu,
> 		.cant_stop		= true,
> 	},
> 	/* Final state before CPU kills itself */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ac8b65298774..16aad39216a2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6248,13 +6248,14 @@ void idle_task_exit(void)
> 	struct mm_struct *mm = current->active_mm;
> 
> 	BUG_ON(cpu_online(smp_processor_id()));
> +	BUG_ON(current != this_rq()->idle);
> 
> 	if (mm != &init_mm) {
> 		switch_mm(mm, &init_mm, current);
> -		current->active_mm = &init_mm;
> 		finish_arch_post_lock_switch();
> 	}
> -	mmdrop(mm);
> +
> +	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
> }
> 
> /*
> 
> 
>> 
>> ---
>> diff --git a/kernel/cpu.c b/kernel/cpu.c
>> index 9c706af713fb..2b4d8a69e8d9 100644
>> --- a/kernel/cpu.c
>> +++ b/kernel/cpu.c
>> @@ -564,6 +564,23 @@ static int bringup_cpu(unsigned int cpu)
>> 	return bringup_wait_for_ap(cpu);
>> }
>> 
>> +static int finish_cpu(unsigned int cpu)
>> +{
>> +	struct task_struct *idle = idle_thread_get(cpu);
>> +	struct mm_struct *mm = idle->active_mm;
>> +
>> +	/*
>> +	 * idle_task_exit() will have switched to &init_mm, now
>> +	 * clean up any remaining active_mm state.
>> +	 */
>> +
>> +	if (mm == &init_mm)
>> +		return;
>> +
>> +	idle->active_mm = &init_mm;
>> +	mmdrop(mm);
>> +}
>> +
>> /*
>> * Hotplug state machine related functions
>> */
>> @@ -1434,7 +1451,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
>> 	[CPUHP_BRINGUP_CPU] = {
>> 		.name			= "cpu:bringup",
>> 		.startup.single		= bringup_cpu,
>> -		.teardown.single	= NULL,
>> +		.teardown.single	= finish_cpu,
>> 		.cant_stop		= true,
>> 	},
>> 	/* Final state before CPU kills itself */
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index fc1dfc007604..8f049fb77a3d 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6188,13 +6188,14 @@ void idle_task_exit(void)
>> 	struct mm_struct *mm = current->active_mm;
>> 
>> 	BUG_ON(cpu_online(smp_processor_id()));
>> +	BUG_ON(current != this_rq()->idle);
>> 
>> 	if (mm != &init_mm) {
>> 		switch_mm(mm, &init_mm, current);
>> -		current->active_mm = &init_mm;
>> 		finish_arch_post_lock_switch();
>> 	}
>> -	mmdrop(mm);
>> +
>> +	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
>> }
>> 
>> /*


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-01-21 10:35     ` Peter Zijlstra
  2020-01-24  4:21       ` Qian Cai
@ 2020-03-30  2:42       ` Qian Cai
  2020-04-01 21:05         ` Qian Cai
  1 sibling, 1 reply; 17+ messages in thread
From: Qian Cai @ 2020-03-30  2:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, juri.lelli, vincent.guittot, dietmar.eggemann,
	Steven Rostedt (VMware),
	bsegall, mgorman, Paul E. McKenney, Thomas Gleixner,
	Linux Memory Management List, linux-kernel



> On Jan 21, 2020, at 5:35 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Mon, Jan 20, 2020 at 03:35:09PM -0500, Qian Cai wrote:
>>> On Jan 20, 2020, at 5:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> Bah.. that's horrible. Surely we can find a better place to do this in
>>> the whole hotplug machinery.
>>> 
>>> Perhaps you can have takedown_cpu() do the mmdrop()?
>> 
>> The problem is that no all arch_cpu_idle_dead() will call
>> idle_task_exit(). For example, alpha and parisc are not, so it needs
> 
> How is that not broken? If the idle thread runs on an active_mm, we need
> to drop that reference. This needs fixing regardless.

It turns out alpha does not support cpu hotplug (no CONFIG_HOTPLUG_CPU),
and parisc will use &init_mm for idle tasks of secondary CPUs as in,

smp_callin()
  smp_cpu_init()

/* Initialise the idle task for this CPU */	
mmgrab(&init_mm);
current->active_mm = &init_mm;
BUG_ON(current->mm);
enter_lazy_tlb(&init_mm, current);

> 
>> to deal with some kind of ifdef dance in takedown_cpu() to
>> conditionally call mmdrop() which sounds even more horrible?
>> 
>> If you really prefer it anyway, maybe something like touching every arch’s __cpu_die() to also call mmdrop() depends on arches?
>> 
>> BTW, how to obtain the other CPU’s current task mm? Is that cpu_rq(cpu)->curr->active_mm?
> 
> Something like this; except you'll need to go audit archs to make sure
> they all call idle_task_exit() and/or put in comments on why they don't
> have to (perhaps their bringup switches them to &init_mm unconditionally
> and the switch_mm() is not required).

I have to modify your code slightly (below) to pass compilation, but then it will
panic during cpu offline for some reasons,

[  258.972625][ T9006] Faulting instruction address: 0xc00000000010afc4
[  258.972640][ T9006] Oops: Kernel access of bad area, sig: 11 [#1]
[  258.972651][ T9006] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256 DEBUG_PAGEALLOC NUMA PowerNV
[  258.972675][ T9006] Modules linked in: kvm_hv kvm ip_tables x_tables xfs sd_mod bnx2x ahci mdio libahci tg3 libata libphy firmware_class dm_mirror dm_region_hash dm_log dm_mod
[  258.972716][ T9006] CPU: 84 PID: 9006 Comm: cpuhotplug04.sh Not tainted 5.6.0-rc7-next-20200327+ #7
[  258.972741][ T9006] NIP:  c00000000010afc4 LR: c00000000010afa0 CTR: fffffffffffffffd
[  258.972775][ T9006] REGS: c000201a54def7f0 TRAP: 0300   Not tainted  (5.6.0-rc7-next-20200327+)
[  258.972809][ T9006] MSR:  900000000280b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 28224824  XER: 20040000
[  258.972839][ T9006] CFAR: c00000000015951c DAR: 0000000000000050 DSISR: 40000000 IRQMASK: 0 
[  258.972839][ T9006] GPR00: c00000000010afa0 c000201a54defa80 c00000000165bd00 c000000001604180 
[  258.972839][ T9006] GPR04: c00000008160496f 0000000000000000 ffff0a01ffffff10 0000000000000020 
[  258.972839][ T9006] GPR08: 0000000000000050 0000000000000000 c00000000162c0e0 0000000000000002 
[  258.972839][ T9006] GPR12: 0000000088224428 c000201fff69a900 0000000040000000 000000011a029798 
[  258.972839][ T9006] GPR16: 000000011a029724 0000000119fc6968 0000000119f5f230 000000011a02d568 
[  258.972839][ T9006] GPR20: 00000001554c66f0 0000000000000000 c000001ffc957820 c00000000010af80 
[  258.972839][ T9006] GPR24: 0000001ffb8a0000 c0000000014f34a8 0000000000000056 c0000000010b7820 
[  258.972839][ T9006] GPR28: c00000000168c654 0000000000000000 c00000000168c3a8 0000000000000000 
[  258.973070][ T9006] NIP [c00000000010afc4] finish_cpu+0x44/0x90
atomic_dec_return_relaxed at arch/powerpc/include/asm/atomic.h:179
(inlined by) atomic_dec_return at include/linux/atomic-fallback.h:518
(inlined by) atomic_dec_and_test at include/linux/atomic-fallback.h:1035
(inlined by) mmdrop at include/linux/sched/mm.h:48
(inlined by) finish_cpu at kernel/cpu.c:579
[  258.973092][ T9006] LR [c00000000010afa0] finish_cpu+0x20/0x90
[  258.973113][ T9006] Call Trace:
[  258.973132][ T9006] [c000201a54defa80] [c00000000010afa0] finish_cpu+0x20/0x90 (unreliable)
[  258.973166][ T9006] [c000201a54defaa0] [c00000000010cd34] cpuhp_invoke_callback+0x194/0x15f0
cpuhp_invoke_callback at kernel/cpu.c:173
[  258.973202][ T9006] [c000201a54defb50] [c000000000111acc] _cpu_down.constprop.15+0x17c/0x410
[  258.973237][ T9006] [c000201a54defbc0] [c00000000010ff84] cpu_down+0x64/0xb0
[  258.973263][ T9006] [c000201a54defc00] [c000000000754760] cpu_subsys_offline+0x20/0x40
[  258.973299][ T9006] [c000201a54defc20] [c00000000074ab10] device_offline+0x100/0x140
[  258.973344][ T9006] [c000201a54defc60] [c00000000074ace0] online_store+0xa0/0x110
[  258.973378][ T9006] [c000201a54defcb0] [c000000000744508] dev_attr_store+0x38/0x60
[  258.973414][ T9006] [c000201a54defcd0] [c0000000005df770] sysfs_kf_write+0x70/0xb0
[  258.973438][ T9006] [c000201a54defd10] [c0000000005de92c] kernfs_fop_write+0x11c/0x270
[  258.973463][ T9006] [c000201a54defd60] [c0000000004c09bc] __vfs_write+0x3c/0x70
[  258.973487][ T9006] [c000201a54defd80] [c0000000004c3dbc] vfs_write+0xcc/0x200
[  258.973522][ T9006] [c000201a54defdd0] [c0000000004c415c] ksys_write+0x7c/0x140
[  258.973557][ T9006] [c000201a54defe20] [c00000000000b3f8] system_call+0x5c/0x68
[  258.973579][ T9006] Instruction dump:
[  258.973598][ T9006] f8010010 f821ffe1 4804e4ed 60000000 3d42fffd 394a03e0 e9230560 7fa95000 
[  258.973636][ T9006] 419e0030 f9430560 39090050 7c0004ac <7d404028> 314affff 7d40412d 40c2fff4 
[  258.973675][ T9006] ---[ end trace 4f72c711066ac397 ]---
[  259.076271][ T9006] 
[  260.076362][ T9006] Kernel panic - not syncing: Fatal exception

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a3b510..bc782562ae93 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+extern void mmdrop(struct mm_struct *mm);
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_sem for writing before modifying the
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0cca1a2b4930..302007a602a5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
  *
  * This code is licenced under the GPL.
  */
+#include <linux/sched/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/smp.h>
 #include <linux/init.h>
@@ -564,6 +565,22 @@ static int bringup_cpu(unsigned int cpu)
 	return bringup_wait_for_ap(cpu);
 }
 
+static int finish_cpu(unsigned int cpu)
+{
+	struct task_struct *idle = idle_thread_get(cpu);
+	struct mm_struct *mm = idle->active_mm;
+
+	/*
+	 * idle_task_exit() will have switched to &init_mm, now
+	 * clean up any remaining active_mm state.
+	 */
+	if (mm != &init_mm) {
+		idle->active_mm = &init_mm;
+		mmdrop(mm);
+	}
+	return 0;
+}
+
 /*
  * Hotplug state machine related functions
  */
@@ -1549,7 +1566,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
 		.startup.single		= bringup_cpu,
-		.teardown.single	= NULL,
+		.teardown.single	= finish_cpu,
 		.cant_stop		= true,
 	},
 	/* Final state before CPU kills itself */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ac8b65298774..16aad39216a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6248,13 +6248,14 @@ void idle_task_exit(void)
 	struct mm_struct *mm = current->active_mm;
 
 	BUG_ON(cpu_online(smp_processor_id()));
+	BUG_ON(current != this_rq()->idle);
 
 	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
-		current->active_mm = &init_mm;
 		finish_arch_post_lock_switch();
 	}
-	mmdrop(mm);
+
+	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
 /*


> 
> ---
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 9c706af713fb..2b4d8a69e8d9 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -564,6 +564,23 @@ static int bringup_cpu(unsigned int cpu)
> 	return bringup_wait_for_ap(cpu);
> }
> 
> +static int finish_cpu(unsigned int cpu)
> +{
> +	struct task_struct *idle = idle_thread_get(cpu);
> +	struct mm_struct *mm = idle->active_mm;
> +
> +	/*
> +	 * idle_task_exit() will have switched to &init_mm, now
> +	 * clean up any remaining active_mm state.
> +	 */
> +
> +	if (mm == &init_mm)
> +		return;
> +
> +	idle->active_mm = &init_mm;
> +	mmdrop(mm);
> +}
> +
> /*
>  * Hotplug state machine related functions
>  */
> @@ -1434,7 +1451,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
> 	[CPUHP_BRINGUP_CPU] = {
> 		.name			= "cpu:bringup",
> 		.startup.single		= bringup_cpu,
> -		.teardown.single	= NULL,
> +		.teardown.single	= finish_cpu,
> 		.cant_stop		= true,
> 	},
> 	/* Final state before CPU kills itself */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index fc1dfc007604..8f049fb77a3d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6188,13 +6188,14 @@ void idle_task_exit(void)
> 	struct mm_struct *mm = current->active_mm;
> 
> 	BUG_ON(cpu_online(smp_processor_id()));
> +	BUG_ON(current != this_rq()->idle);
> 
> 	if (mm != &init_mm) {
> 		switch_mm(mm, &init_mm, current);
> -		current->active_mm = &init_mm;
> 		finish_arch_post_lock_switch();
> 	}
> -	mmdrop(mm);
> +
> +	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
> }
> 
> /*


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-01-24  4:21       ` Qian Cai
@ 2020-01-24  5:02         ` Matthew Wilcox
  0 siblings, 0 replies; 17+ messages in thread
From: Matthew Wilcox @ 2020-01-24  5:02 UTC (permalink / raw)
  To: Qian Cai
  Cc: Peter Zijlstra, mingo, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, paulmck, tglx,
	linux-mm, linux-kernel, linux-alpha, Matt Turner, linux-parisc,
	Helge Deller

On Thu, Jan 23, 2020 at 11:21:35PM -0500, Qian Cai wrote:
> > On Jan 21, 2020, at 5:35 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > Something like this; except you'll need to go audit archs to make sure
> > they all call idle_task_exit() and/or put in comments on why they don't
> > have to (perhaps their bringup switches them to &init_mm unconditionally
> > and the switch_mm() is not required).
> 
> Damn, I am having a hard time to motivate myself to learn all about those two “dead“ arches from scratch. I suppose the first step we could put a dummy finish_cpu() for alpha and parisc if they don’t call idle_task_exit() in the first place anyway, so if it is a bug there it is another issue that could be dealt with in a separate patch later?

Or you could consult the maintainers of those architectures?  There are regular pull requests for parisc still, and alpha still gets odd fixes.

It would have helped had you not trimmed the context so aggressively.
For those seeing this thread for the first time, try:
https://lore.kernel.org/linux-mm/A72A7F42-A166-4403-B12C-32B2D7A662C4@lca.pw/T/#t

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-01-21 10:35     ` Peter Zijlstra
@ 2020-01-24  4:21       ` Qian Cai
  2020-01-24  5:02         ` Matthew Wilcox
  2020-03-30  2:42       ` Qian Cai
  1 sibling, 1 reply; 17+ messages in thread
From: Qian Cai @ 2020-01-24  4:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, paulmck, tglx, linux-mm, linux-kernel



> On Jan 21, 2020, at 5:35 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> Something like this; except you'll need to go audit archs to make sure
> they all call idle_task_exit() and/or put in comments on why they don't
> have to (perhaps their bringup switches them to &init_mm unconditionally
> and the switch_mm() is not required).

Damn, I am having a hard time to motivate myself to learn all about those two “dead“ arches from scratch. I suppose the first step we could put a dummy finish_cpu() for alpha and parisc if they don’t call idle_task_exit() in the first place anyway, so if it is a bug there it is another issue that could be dealt with in a separate patch later?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-01-20 20:35   ` Qian Cai
@ 2020-01-21 10:35     ` Peter Zijlstra
  2020-01-24  4:21       ` Qian Cai
  2020-03-30  2:42       ` Qian Cai
  0 siblings, 2 replies; 17+ messages in thread
From: Peter Zijlstra @ 2020-01-21 10:35 UTC (permalink / raw)
  To: Qian Cai
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, paulmck, tglx, linux-mm, linux-kernel

On Mon, Jan 20, 2020 at 03:35:09PM -0500, Qian Cai wrote:
> > On Jan 20, 2020, at 5:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > Bah.. that's horrible. Surely we can find a better place to do this in
> > the whole hotplug machinery.
> > 
> > Perhaps you can have takedown_cpu() do the mmdrop()?
> 
> The problem is that no all arch_cpu_idle_dead() will call
> idle_task_exit(). For example, alpha and parisc are not, so it needs

How is that not broken? If the idle thread runs on an active_mm, we need
to drop that reference. This needs fixing regardless.

> to deal with some kind of ifdef dance in takedown_cpu() to
> conditionally call mmdrop() which sounds even more horrible?
> 
> If you really prefer it anyway, maybe something like touching every arch’s __cpu_die() to also call mmdrop() depends on arches?
> 
> BTW, how to obtain the other CPU’s current task mm? Is that cpu_rq(cpu)->curr->active_mm?

Something like this; except you'll need to go audit archs to make sure
they all call idle_task_exit() and/or put in comments on why they don't
have to (perhaps their bringup switches them to &init_mm unconditionally
and the switch_mm() is not required).

---
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9c706af713fb..2b4d8a69e8d9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -564,6 +564,23 @@ static int bringup_cpu(unsigned int cpu)
 	return bringup_wait_for_ap(cpu);
 }
 
+static int finish_cpu(unsigned int cpu)
+{
+	struct task_struct *idle = idle_thread_get(cpu);
+	struct mm_struct *mm = idle->active_mm;
+
+	/*
+	 * idle_task_exit() will have switched to &init_mm, now
+	 * clean up any remaining active_mm state.
+	 */
+
+	if (mm == &init_mm)
+		return;
+
+	idle->active_mm = &init_mm;
+	mmdrop(mm);
+}
+
 /*
  * Hotplug state machine related functions
  */
@@ -1434,7 +1451,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
 		.startup.single		= bringup_cpu,
-		.teardown.single	= NULL,
+		.teardown.single	= finish_cpu,
 		.cant_stop		= true,
 	},
 	/* Final state before CPU kills itself */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fc1dfc007604..8f049fb77a3d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6188,13 +6188,14 @@ void idle_task_exit(void)
 	struct mm_struct *mm = current->active_mm;
 
 	BUG_ON(cpu_online(smp_processor_id()));
+	BUG_ON(current != this_rq()->idle);
 
 	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
-		current->active_mm = &init_mm;
 		finish_arch_post_lock_switch();
 	}
-	mmdrop(mm);
+
+	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
 /*

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-01-20 10:16 ` Peter Zijlstra
@ 2020-01-20 20:35   ` Qian Cai
  2020-01-21 10:35     ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Qian Cai @ 2020-01-20 20:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, paulmck, tglx, linux-mm, linux-kernel



> On Jan 20, 2020, at 5:17 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> Bah.. that's horrible. Surely we can find a better place to do this in
> the whole hotplug machinery.
> 
> Perhaps you can have takedown_cpu() do the mmdrop()?

The problem is that no all arch_cpu_idle_dead() will call idle_task_exit(). For example, alpha and parisc are not, so it needs to deal with some kind of ifdef dance in takedown_cpu() to conditionally call mmdrop() which sounds even more horrible?

If you really prefer it anyway, maybe something like touching every arch’s __cpu_die() to also call mmdrop() depends on arches?

BTW, how to obtain the other CPU’s current task mm? Is that cpu_rq(cpu)->curr->active_mm?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] sched/core: fix illegal RCU from offline CPUs
  2020-01-13 19:03 [PATCH v2] sched/core: fix " Qian Cai
@ 2020-01-20 10:16 ` Peter Zijlstra
  2020-01-20 20:35   ` Qian Cai
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2020-01-20 10:16 UTC (permalink / raw)
  To: Qian Cai
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, paulmck, tglx, linux-mm, linux-kernel

On Mon, Jan 13, 2020 at 02:03:31PM -0500, Qian Cai wrote:
> In the CPU-offline process, it calls mmdrop() after idle entry and the
> subsequent call to cpuhp_report_idle_dead(). Once execution passes the
> call to rcu_report_dead(), RCU is ignoring the CPU, which results in
> lockdep complaints when mmdrop() uses RCU from either memcg or
> debugobjects, so it by scheduling mmdrop() on another online CPU.
> 
> According to the commit a79e53d85683 ("x86/mm: Fix pgd_lock deadlock"),
> mmdrop() is not interrupt-safe, and called from
> smp_call_function_single() could end up running mmdrop() from the IPI
> interrupt handler.
> 

<deletes ~100 lines of gunk>

Surely the critical information contained in these nearly 100 lines of
splat can be more consicely represented?


> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 90e4b00ace89..1863a6fc4d82 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6194,7 +6194,8 @@ void idle_task_exit(void)
>  		current->active_mm = &init_mm;
>  		finish_arch_post_lock_switch();
>  	}
> -	mmdrop(mm);
> +	smp_call_function_single(cpumask_first(cpu_online_mask),
> +				 (void (*)(void *))mmdrop_async, mm, 0);
>  }

Bah.. that's horrible. Surely we can find a better place to do this in
the whole hotplug machinery.

Perhaps you can have takedown_cpu() do the mmdrop()?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v2] sched/core: fix illegal RCU from offline CPUs
@ 2020-01-13 19:03 Qian Cai
  2020-01-20 10:16 ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Qian Cai @ 2020-01-13 19:03 UTC (permalink / raw)
  To: peterz, mingo
  Cc: juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
	mgorman, paulmck, tglx, linux-mm, linux-kernel, Qian Cai

In the CPU-offline process, it calls mmdrop() after idle entry and the
subsequent call to cpuhp_report_idle_dead(). Once execution passes the
call to rcu_report_dead(), RCU is ignoring the CPU, which results in
lockdep complaints when mmdrop() uses RCU from either memcg or
debugobjects, so it by scheduling mmdrop() on another online CPU.

According to the commit a79e53d85683 ("x86/mm: Fix pgd_lock deadlock"),
mmdrop() is not interrupt-safe, and called from
smp_call_function_single() could end up running mmdrop() from the IPI
interrupt handler.

 Call Trace:
  mmdrop+0x58/0x80
  flush_smp_call_function_queue+0x128/0x2e0
  smp_ipi_demux_relaxed+0x84/0xf0
  doorbell_exception+0x118/0x588
  h_doorbell_common+0x124/0x130
  --- interrupt: e81 at replay_interrupt_return+0x0/0x4
      LR = arch_local_irq_restore.part.8+0x78/0x90
  arch_local_irq_restore.part.8+0x34/0x90 (unreliable)
  _raw_spin_unlock_irq+0x50/0x80
  finish_task_switch+0xd8/0x340
  __schedule+0x4bc/0xba0
  schedule_idle+0x38/0x70
  do_idle+0x2a4/0x470
  cpu_startup_entry+0x3c/0x40
  start_secondary+0x7a8/0xa80
  start_secondary_resume+0x10/0x14

Therefore, use mmdrop_async() instead to run mmdrop() from a process
context similar to the commit 7283094ec3db ("kernel, oom: fix potential
pgd_lock deadlock from __mmdrop").

=============================
 WARNING: suspicious RCU usage
 -----------------------------
 kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!

 other info that might help us debug this:

 RCU used illegally from offline CPU!
 rcu_scheduler_active = 2, debug_locks = 1
 2 locks held by swapper/37/0:
  #0: c0000000010af608 (rcu_read_lock){....}, at:
      percpu_ref_put_many+0x8/0x230
  #1: c0000000010af608 (rcu_read_lock){....}, at:
      __queue_work+0x7c/0xca0

 stack backtrace:
 Call Trace:
  dump_stack+0xf4/0x164 (unreliable)
  lockdep_rcu_suspicious+0x140/0x164
  get_work_pool+0x110/0x150
  __queue_work+0x1bc/0xca0
  queue_work_on+0x114/0x120
  css_release+0x9c/0xc0
  percpu_ref_put_many+0x204/0x230
  free_pcp_prepare+0x264/0x570
  free_unref_page+0x38/0xf0
  __mmdrop+0x21c/0x2c0
  idle_task_exit+0x170/0x1b0
  pnv_smp_cpu_kill_self+0x38/0x2e0
  cpu_die+0x48/0x64
  arch_cpu_idle_dead+0x30/0x50
  do_idle+0x2f4/0x470
  cpu_startup_entry+0x38/0x40
  start_secondary+0x7a8/0xa80
  start_secondary_resume+0x10/0x14

 =============================
 WARNING: suspicious RCU usage
 -----------------------------
 kernel/sched/core.c:562 suspicious rcu_dereference_check() usage!

 other info that might help us debug this:

 RCU used illegally from offline CPU!
 rcu_scheduler_active = 2, debug_locks = 1
 2 locks held by swapper/94/0:
  #0: c000201cc77dc118 (&base->lock){-.-.}, at:
      lock_timer_base+0x114/0x1f0
  #1: c0000000010af608 (rcu_read_lock){....}, at:
      get_nohz_timer_target+0x3c/0x2d0

 stack backtrace:
 Call Trace:
  dump_stack+0xf4/0x164 (unreliable)
  lockdep_rcu_suspicious+0x140/0x164
  get_nohz_timer_target+0x248/0x2d0
  add_timer+0x24c/0x470
  __queue_delayed_work+0x8c/0x110
  queue_delayed_work_on+0x128/0x130
  __debug_check_no_obj_freed+0x2ec/0x320
  free_pcp_prepare+0x1b4/0x570
  free_unref_page+0x38/0xf0
  __mmdrop+0x21c/0x2c0
  idle_task_exit+0x170/0x1b0
  pnv_smp_cpu_kill_self+0x38/0x2e0
  cpu_die+0x48/0x64
  arch_cpu_idle_dead+0x30/0x50
  do_idle+0x2f4/0x470
  cpu_startup_entry+0x38/0x40
  start_secondary+0x7a8/0xa80
  start_secondary_prolog+0x10/0x14

Signed-off-by: Qian Cai <cai@lca.pw>
---

v2: use mmdrop_async() thanks to Tetsuo.

 include/linux/sched/mm.h | 2 ++
 kernel/fork.c            | 2 +-
 kernel/sched/core.c      | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a3b510..205de134348c 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+extern void mmdrop_async(struct mm_struct *mm);
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_sem for writing before modifying the
diff --git a/kernel/fork.c b/kernel/fork.c
index 080809560072..9a823a955b7c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -707,7 +707,7 @@ static void mmdrop_async_fn(struct work_struct *work)
 	__mmdrop(mm);
 }
 
-static void mmdrop_async(struct mm_struct *mm)
+void mmdrop_async(struct mm_struct *mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
 		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90e4b00ace89..1863a6fc4d82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6194,7 +6194,8 @@ void idle_task_exit(void)
 		current->active_mm = &init_mm;
 		finish_arch_post_lock_switch();
 	}
-	mmdrop(mm);
+	smp_call_function_single(cpumask_first(cpu_online_mask),
+				 (void (*)(void *))mmdrop_async, mm, 0);
 }
 
 /*
-- 
2.21.0 (Apple Git-122.2)


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-05-01 18:23 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-01 21:40 [PATCH v2] sched/core: fix illegal RCU from offline CPUs Qian Cai
2020-04-02 11:24 ` Michael Ellerman
2020-04-02 14:00   ` Qian Cai
2020-04-02 15:54     ` Paul E. McKenney
2020-04-02 16:19       ` Qian Cai
2020-04-02 16:57         ` Paul E. McKenney
2020-04-17 13:26   ` Qian Cai
2020-04-21 13:56     ` Peter Zijlstra
2020-05-01 18:22 ` [tip: sched/core] sched/core: Fix " tip-bot2 for Peter Zijlstra
  -- strict thread matches above, loose matches on Subject: below --
2020-01-13 19:03 [PATCH v2] sched/core: fix " Qian Cai
2020-01-20 10:16 ` Peter Zijlstra
2020-01-20 20:35   ` Qian Cai
2020-01-21 10:35     ` Peter Zijlstra
2020-01-24  4:21       ` Qian Cai
2020-01-24  5:02         ` Matthew Wilcox
2020-03-30  2:42       ` Qian Cai
2020-04-01 21:05         ` Qian Cai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).