All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
@ 2014-06-20 14:28 Boris Ostrovsky
  2014-06-20 15:23 ` Borislav Petkov
  2014-07-24 23:36 ` [tip:x86/urgent] x86, MCE: Robustify mcheck_init_device tip-bot for Borislav Petkov
  0 siblings, 2 replies; 15+ messages in thread
From: Boris Ostrovsky @ 2014-06-20 14:28 UTC (permalink / raw)
  To: bp, tony.luck; +Cc: linux-kernel, linux-edac, mattieu.souchaud, boris.ostrovsky

Commit 9c15a24b038f4d8da93a2bc2554731f8953a7c17 (x86/mce: Improve
mcheck_init_device() error handling) unregisters (or never registers)
MCE's hotplug notifier if an error is encountered.

Since unplugging a CPU would normally result in the notifier deleting
MCE timer we are now left with the timer running if a CPU is removed on
a system where mcheck_init_device() had failed.

If we later hotplug this CPU back we add this timer again in
mcheck_cpu_init()). Eventually the two timers start intefering with each
other, causing soft lockups or system hangs.

We should leave the notifier always on and, in fact, set it up early
during the boot.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 42 ++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38..0d2828a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1677,6 +1677,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 void (*machine_check_vector)(struct pt_regs *, long error_code) =
 						unexpected_machine_check;
 
+static int
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu);
+static struct notifier_block mce_cpu_notifier = {
+	.notifier_call = mce_cpu_callback,
+};
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off:
@@ -1704,6 +1709,9 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_timer();
 	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
 	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
+
+	if (c == &boot_cpu_data)
+		register_cpu_notifier(&mce_cpu_notifier); /* pre-SMP */
 }
 
 /*
@@ -1951,6 +1959,7 @@ static struct miscdevice mce_chrdev_device = {
 	"mcelog",
 	&mce_chrdev_ops,
 };
+static bool is_mce_chrdev_set;
 
 static void __mce_disable_bank(void *arg)
 {
@@ -2376,14 +2385,18 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-		mce_device_create(cpu);
-		if (threshold_cpu_callback)
-			threshold_cpu_callback(action, cpu);
+		if (is_mce_chrdev_set) {
+			mce_device_create(cpu);
+			if (threshold_cpu_callback)
+				threshold_cpu_callback(action, cpu);
+		}
 		break;
 	case CPU_DEAD:
-		if (threshold_cpu_callback)
-			threshold_cpu_callback(action, cpu);
-		mce_device_remove(cpu);
+		if (is_mce_chrdev_set) {
+			if (threshold_cpu_callback)
+				threshold_cpu_callback(action, cpu);
+			mce_device_remove(cpu);
+		}
 		mce_intel_hcpu_update(cpu);
 		break;
 	case CPU_DOWN_PREPARE:
@@ -2404,10 +2417,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-static struct notifier_block mce_cpu_notifier = {
-	.notifier_call = mce_cpu_callback,
-};
-
 static __init void mce_init_banks(void)
 {
 	int i;
@@ -2447,18 +2456,12 @@ static __init int mcheck_init_device(void)
 	if (err)
 		goto err_out_mem;
 
-	cpu_notifier_register_begin();
 	for_each_online_cpu(i) {
 		err = mce_device_create(i);
-		if (err) {
-			cpu_notifier_register_done();
+		if (err)
 			goto err_device_create;
-		}
 	}
 
-	__register_hotcpu_notifier(&mce_cpu_notifier);
-	cpu_notifier_register_done();
-
 	register_syscore_ops(&mce_syscore_ops);
 
 	/* register character device /dev/mcelog */
@@ -2466,15 +2469,12 @@ static __init int mcheck_init_device(void)
 	if (err)
 		goto err_register;
 
+	is_mce_chrdev_set = true;
 	return 0;
 
 err_register:
 	unregister_syscore_ops(&mce_syscore_ops);
 
-	cpu_notifier_register_begin();
-	__unregister_hotcpu_notifier(&mce_cpu_notifier);
-	cpu_notifier_register_done();
-
 err_device_create:
 	/*
 	 * We didn't keep track of which devices were created above, but
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 14:28 [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path Boris Ostrovsky
@ 2014-06-20 15:23 ` Borislav Petkov
  2014-06-20 15:41   ` Boris Ostrovsky
  2014-07-24 23:36 ` [tip:x86/urgent] x86, MCE: Robustify mcheck_init_device tip-bot for Borislav Petkov
  1 sibling, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2014-06-20 15:23 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On Fri, Jun 20, 2014 at 10:28:13AM -0400, Boris Ostrovsky wrote:
> Commit 9c15a24b038f4d8da93a2bc2554731f8953a7c17 (x86/mce: Improve
> mcheck_init_device() error handling) unregisters (or never registers)
> MCE's hotplug notifier if an error is encountered.

Well, mcheck_init_device() did encounter errors before that commit too,
can you please go into detail on how exactly you're triggering this?
Which error are you talking about exactly?

Lemme guess: some xen special handling which baremetal doesn't need.

> Since unplugging a CPU would normally result in the notifier deleting
> MCE timer we are now left with the timer running if a CPU is removed on
> a system where mcheck_init_device() had failed.
> 
> If we later hotplug this CPU back we add this timer again in
> mcheck_cpu_init()). Eventually the two timers start intefering with each
> other, causing soft lockups or system hangs.
> 
> We should leave the notifier always on and, in fact, set it up early
> during the boot.

We do leave it always on - we only unregister it if we've encountered an
error.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 15:23 ` Borislav Petkov
@ 2014-06-20 15:41   ` Boris Ostrovsky
  2014-06-20 15:58     ` Borislav Petkov
  0 siblings, 1 reply; 15+ messages in thread
From: Boris Ostrovsky @ 2014-06-20 15:41 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On 06/20/2014 11:23 AM, Borislav Petkov wrote:
> On Fri, Jun 20, 2014 at 10:28:13AM -0400, Boris Ostrovsky wrote:
>> Commit 9c15a24b038f4d8da93a2bc2554731f8953a7c17 (x86/mce: Improve
>> mcheck_init_device() error handling) unregisters (or never registers)
>> MCE's hotplug notifier if an error is encountered.
> Well, mcheck_init_device() did encounter errors before that commit too,
> can you please go into detail on how exactly you're triggering this?
> Which error are you talking about exactly?

You can simulate this on baremetal by having, for example, 
misc_register() fail (just add 'err = -EOI' after the call). Or you can 
return an error right upon entry to mcheck_init_device() (I haven't 
tested that though).

Then, after you are booted do a couple of
     echo 0 > /sys/devices/system/cpu/cpu1/online
     echo 1 > /sys/devices/system/cpu/cpu1/online

Then sit still for about 10 minutes. I don't think any activity is 
necessary.

You are dead now. If you are lucky you may see messages about soft 
lockups or RCU stalls but often nothing.

> Lemme guess: some xen special handling which baremetal doesn't need.

Only in the sense that on Xen misc_register() often fails. But any 
failure on baremetal will result in the same behavior.

>
>> Since unplugging a CPU would normally result in the notifier deleting
>> MCE timer we are now left with the timer running if a CPU is removed on
>> a system where mcheck_init_device() had failed.
>>
>> If we later hotplug this CPU back we add this timer again in
>> mcheck_cpu_init()). Eventually the two timers start intefering with each
>> other, causing soft lockups or system hangs.
>>
>> We should leave the notifier always on and, in fact, set it up early
>> during the boot.
> We do leave it always on - we only unregister it if we've encountered an
> error.

Right. And I think we shouldn't because we leave undeleted timers.

-boris


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 15:41   ` Boris Ostrovsky
@ 2014-06-20 15:58     ` Borislav Petkov
  2014-06-20 16:16       ` Boris Ostrovsky
  0 siblings, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2014-06-20 15:58 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On Fri, Jun 20, 2014 at 11:41:27AM -0400, Boris Ostrovsky wrote:
> Only in the sense that on Xen misc_register() often fails. But any
> failure on baremetal will result in the same behavior.

Ok, thanks for explaining the details.

> Right. And I think we shouldn't because we leave undeleted timers.

I wonder if we could simply move the oneliner:

	__register_hotcpu_notifier(&mce_cpu_notifier);

to mcheck_init()? We don't need any locking there because we're pre-SMP
then and from looking at notifier_chain_register(), it seems there's
nothing special that needs initializing for the call to be too early -
we're simply adding the callback to a list...

Hmm, because if that would work, the fix is almost trivial. :)

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 15:58     ` Borislav Petkov
@ 2014-06-20 16:16       ` Boris Ostrovsky
  2014-06-20 17:52         ` Borislav Petkov
  0 siblings, 1 reply; 15+ messages in thread
From: Boris Ostrovsky @ 2014-06-20 16:16 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On 06/20/2014 11:58 AM, Borislav Petkov wrote:
> On Fri, Jun 20, 2014 at 11:41:27AM -0400, Boris Ostrovsky wrote:
>> Only in the sense that on Xen misc_register() often fails. But any
>> failure on baremetal will result in the same behavior.
> Ok, thanks for explaining the details.
>
>> Right. And I think we shouldn't because we leave undeleted timers.
> I wonder if we could simply move the oneliner:
>
> 	__register_hotcpu_notifier(&mce_cpu_notifier);
>
> to mcheck_init()? We don't need any locking there because we're pre-SMP
> then and from looking at notifier_chain_register(), it seems there's
> nothing special that needs initializing for the call to be too early -
> we're simply adding the callback to a list...
>
> Hmm, because if that would work, the fix is almost trivial. :)

That's cleaner than what I had in mcheck_cpu_init() (checking for 
boot_cpu_data).

But I think you still need to do the dance in the notifier to make sure 
you are not trying to add/remove device if mcheck_init_device() had 
failed earlier.

-boris


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 16:16       ` Boris Ostrovsky
@ 2014-06-20 17:52         ` Borislav Petkov
  2014-06-20 19:39           ` Boris Ostrovsky
  0 siblings, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2014-06-20 17:52 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On Fri, Jun 20, 2014 at 12:16:39PM -0400, Boris Ostrovsky wrote:
> But I think you still need to do the dance in the notifier to make
> sure you are not trying to add/remove device if mcheck_init_device()
> had failed earlier.

mce_device_remove should be smart enough. Hint: mce_device_initialized
cpumask.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 17:52         ` Borislav Petkov
@ 2014-06-20 19:39           ` Boris Ostrovsky
  2014-06-20 20:03             ` Borislav Petkov
  0 siblings, 1 reply; 15+ messages in thread
From: Boris Ostrovsky @ 2014-06-20 19:39 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On 06/20/2014 01:52 PM, Borislav Petkov wrote:
> On Fri, Jun 20, 2014 at 12:16:39PM -0400, Boris Ostrovsky wrote:
>> But I think you still need to do the dance in the notifier to make
>> sure you are not trying to add/remove device if mcheck_init_device()
>> had failed earlier.
> mce_device_remove should be smart enough. Hint: mce_device_initialized
> cpumask.

What about mce_device_add()?

-boris


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 19:39           ` Boris Ostrovsky
@ 2014-06-20 20:03             ` Borislav Petkov
  2014-06-20 20:16               ` Boris Ostrovsky
  0 siblings, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2014-06-20 20:03 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On Fri, Jun 20, 2014 at 03:39:34PM -0400, Boris Ostrovsky wrote:
> What about mce_device_add()?

What is a mce_device_add()? There's no such function.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 20:03             ` Borislav Petkov
@ 2014-06-20 20:16               ` Boris Ostrovsky
  2014-06-20 20:29                 ` Borislav Petkov
  0 siblings, 1 reply; 15+ messages in thread
From: Boris Ostrovsky @ 2014-06-20 20:16 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On 06/20/2014 04:03 PM, Borislav Petkov wrote:
> On Fri, Jun 20, 2014 at 03:39:34PM -0400, Boris Ostrovsky wrote:
>> What about mce_device_add()?
> What is a mce_device_add()? There's no such function.
>


Sorry, mce_device_create().

We can't call it in the notifier until mcheck_init_device() has been 
successfully executed (we need subsys_system_register(&mce_subsys)). I 
don't know whether we can call subsys_system_register() in mcheck_init() 
-- it is quite early in the boot.

-boris

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 20:16               ` Boris Ostrovsky
@ 2014-06-20 20:29                 ` Borislav Petkov
  2014-06-20 20:43                   ` Boris Ostrovsky
  0 siblings, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2014-06-20 20:29 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On Fri, Jun 20, 2014 at 04:16:50PM -0400, Boris Ostrovsky wrote:
> Sorry, mce_device_create().
> 
> We can't call it in the notifier until mcheck_init_device() has been
> successfully executed (we need subsys_system_register(&mce_subsys)). I don't
> know whether we can call subsys_system_register() in mcheck_init() -- it is
> quite early in the boot.

I don't think it matters: we want to add only this oneliner to
mcheck_init():

	__register_hotcpu_notifier(&mce_cpu_notifier);

and remove it from mcheck_init_device(), nothing else. And we don't need
the synchronization even because we're BSP only then.

I mean, we won't be able to offline CPUs that early anyway - thus
call mce_device_create() in the notifier callback - as we don't have
userspace to do "echo 0 > ..."

The rest of the code remains and mcheck_init_device() executes when it
does. Unless I'm missing something, of course...

Oh, not quite. We probably should remove the

	__unregister_hotcpu_notifier(&mce_cpu_notifier);

from the error path too, as you suggest.

When you do, please hold that down in the commit message so that it is
clear what we're doing.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 20:29                 ` Borislav Petkov
@ 2014-06-20 20:43                   ` Boris Ostrovsky
  2014-06-20 21:11                     ` Borislav Petkov
  0 siblings, 1 reply; 15+ messages in thread
From: Boris Ostrovsky @ 2014-06-20 20:43 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On 06/20/2014 04:29 PM, Borislav Petkov wrote:
> On Fri, Jun 20, 2014 at 04:16:50PM -0400, Boris Ostrovsky wrote:
>> Sorry, mce_device_create().
>>
>> We can't call it in the notifier until mcheck_init_device() has been
>> successfully executed (we need subsys_system_register(&mce_subsys)). I don't
>> know whether we can call subsys_system_register() in mcheck_init() -- it is
>> quite early in the boot.
> I don't think it matters: we want to add only this oneliner to
> mcheck_init():
>
> 	__register_hotcpu_notifier(&mce_cpu_notifier);
>
> and remove it from mcheck_init_device(), nothing else. And we don't need
> the synchronization even because we're BSP only then.
>
> I mean, we won't be able to offline CPUs that early anyway - thus
> call mce_device_create() in the notifier callback - as we don't have
> userspace to do "echo 0 > ..."
>
> The rest of the code remains and mcheck_init_device() executes when it
> does. Unless I'm missing something, of course...

We are getting CPU_ONLINE notifier for ASPs during boot:


[   14.489595] cpu 1 spinlock event irq 48
[   14.502908] BUG: unable to handle kernel NULL pointer dereference at 
0000000000000060
[   14.527373] IP: [<ffffffff8144deec>] bus_add_device+0xfc/0x1e0
[   14.545859] PGD 0
[   14.552380] Oops: 0000 [#1] SMP
[   14.562711] Modules linked in:
[   14.572494] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
3.16.0-rc1-pmu-dom0 #195
[   14.595307] Hardware name: Intel Corporation Shark Bay Client 
platform/Flathead Creek Crb, BIOS HSWLPTU1.86C.0109.R03.1301282055 
01/28/2013
[   14.634718] task: ffff88022f5a0000 ti: ffff88022f53c000 task.ti: 
ffff88022f53c000
[   14.658364] RIP: e030:[<ffffffff8144deec>] [<ffffffff8144deec>] 
bus_add_device+0xfc/0x1e0
[   14.684457] RSP: e02b:ffff88022f53fc68  EFLAGS: 00010246
[   14.701310] RAX: 0000000000000000 RBX: ffff88023d411810 RCX: 
00000000d7c6bb9d
[   14.723875] RDX: ffff88023d402a60 RSI: ffff88023d411810 RDI: 
ffff88023d411810
[   14.746427] RBP: ffff88022f53fc98 R08: 0000000000000000 R09: 
0000000000000000
[   14.768962] R10: ffffffff8133bbc0 R11: ffffea0008bd9600 R12: 
ffff88023d411800
[   14.791522] R13: ffffffff81c284b8 R14: ffffffff81c284a0 R15: 
0000000000000000
[   14.814087] FS:  0000000000000000(0000) GS:ffff88023da00000(0000) 
knlGS:0000000000000000
[   14.839632] CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.857845] CR2: 0000000000000060 CR3: 0000000001c10000 CR4: 
0000000000042660
[   14.880413] Stack:
[   14.886913]  ffff88023d411800 ffff88023d411800 0000000000000000 
0000000000000000
[   14.910293]  ffff88023d411810 0000000000000000 ffff88022f53fcf8 
ffffffff8144be3f
[   14.933692]  00000000fffffffb 0000000000000000 ffff88022f53fcd8 
ffffffff81459c85
[   14.957075] Call Trace:
[   14.964971]  [<ffffffff8144be3f>] device_add+0x43f/0x5e0
[   14.981809]  [<ffffffff81459c85>] ? pm_runtime_init+0xe5/0xf0
[   15.000014]  [<ffffffff8144c1be>] device_register+0x1e/0x30
[   15.017697]  [<ffffffff8103b04c>] mce_device_create+0x7c/0x1c0
[   15.036168]  [<ffffffff8103b2a8>] mce_cpu_callback+0x118/0x140
[   15.054636]  [<ffffffff810abb3d>] notifier_call_chain+0x4d/0x70
[   15.073371]  [<ffffffff810abc4e>] __raw_notifier_call_chain+0xe/0x10
[   15.093466]  [<ffffffff81085460>] __cpu_notify+0x20/0x40
[   15.110321]  [<ffffffff81085495>] cpu_notify+0x15/0x20
[   15.126613]  [<ffffffff81085767>] _cpu_up+0x107/0x160
[   15.142649]  [<ffffffff81085819>] cpu_up+0x59/0x80
[   15.157870]  [<ffffffff81d46fdf>] smp_init+0x60/0x8c
[   15.173620]  [<ffffffff81d2616a>] kernel_init_freeable+0xfa/0x20d
[   15.192908]  [<ffffffff8100332e>] ? xen_end_context_switch+0x1e/0x30
[   15.213023]  [<ffffffff816aecf0>] ? rest_init+0x80/0x80
[   15.229592]  [<ffffffff816aecfe>] kernel_init+0xe/0xf0
[   15.245904]  [<ffffffff816c0ebc>] ret_from_fork+0x7c/0xb0
[   15.263034]  [<ffffffff816aecf0>] ? rest_init+0x80/0x80
[   15.279607] Code: d2 ff ff 85 c0 41 89 c7 0f 85 88 00 00 00 49 8b 54 
24 50 48 85 d2 0f 84 93 00 00 00 49 8b 86 90 00 00 00 49 8d 5c 24 10 48 
89 de <48> 8b 78 60 48 83 c7 18 e8 c7 00 e0 ff 85 c0 41 89 c7 74 10 4c
[   15.338846] RIP  [<ffffffff8144deec>] bus_add_device+0xfc/0x1e0
[   15.357605]  RSP <ffff88022f53fc68>
[   15.368729] CR2: 0000000000000060
[   15.379338] ---[ end trace d288f65f5999f472 ]---
[   15.394005] Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x00000009



-boris


>
> Oh, not quite. We probably should remove the
>
> 	__unregister_hotcpu_notifier(&mce_cpu_notifier);
>
> from the error path too, as you suggest.
>
> When you do, please hold that down in the commit message so that it is
> clear what we're doing.
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 20:43                   ` Boris Ostrovsky
@ 2014-06-20 21:11                     ` Borislav Petkov
  2014-06-21  2:04                       ` Boris Ostrovsky
  0 siblings, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2014-06-20 21:11 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On Fri, Jun 20, 2014 at 04:43:37PM -0400, Boris Ostrovsky wrote:
> We are getting CPU_ONLINE notifier for ASPs during boot:

Bah, that's craptastic. Hmm, ok, let's try this instead:

--
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38153b2..9a79c8dbd8e8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2451,6 +2451,12 @@ static __init int mcheck_init_device(void)
 	for_each_online_cpu(i) {
 		err = mce_device_create(i);
 		if (err) {
+			/*
+			 * Register notifier anyway (and do not unreg it) so
+			 * that we don't leave undeleted timers, see notifier
+			 * callback above.
+			 */
+			__register_hotcpu_notifier(&mce_cpu_notifier);
 			cpu_notifier_register_done();
 			goto err_device_create;
 		}
@@ -2471,10 +2477,6 @@ static __init int mcheck_init_device(void)
 err_register:
 	unregister_syscore_ops(&mce_syscore_ops);
 
-	cpu_notifier_register_begin();
-	__unregister_hotcpu_notifier(&mce_cpu_notifier);
-	cpu_notifier_register_done();
-
 err_device_create:
 	/*
 	 * We didn't keep track of which devices were created above, but

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-20 21:11                     ` Borislav Petkov
@ 2014-06-21  2:04                       ` Boris Ostrovsky
  2014-06-21 10:08                         ` Borislav Petkov
  0 siblings, 1 reply; 15+ messages in thread
From: Boris Ostrovsky @ 2014-06-21  2:04 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On 06/20/2014 05:11 PM, Borislav Petkov wrote:
> On Fri, Jun 20, 2014 at 04:43:37PM -0400, Boris Ostrovsky wrote:
>> We are getting CPU_ONLINE notifier for ASPs during boot:
> Bah, that's craptastic. Hmm, ok, let's try this instead:

I'll try it later but this doesn't look sufficient to me: we might not 
reach this point if subsys_system_register() or zalloc_cpumask_var() 
fail.  We could register the notifier as the first thing in this routine 
(probably after mce_available() succeeds).


-boris

>
> --
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index bb92f38153b2..9a79c8dbd8e8 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -2451,6 +2451,12 @@ static __init int mcheck_init_device(void)
>   	for_each_online_cpu(i) {
>   		err = mce_device_create(i);
>   		if (err) {
> +			/*
> +			 * Register notifier anyway (and do not unreg it) so
> +			 * that we don't leave undeleted timers, see notifier
> +			 * callback above.
> +			 */
> +			__register_hotcpu_notifier(&mce_cpu_notifier);
>   			cpu_notifier_register_done();
>   			goto err_device_create;
>   		}
> @@ -2471,10 +2477,6 @@ static __init int mcheck_init_device(void)
>   err_register:
>   	unregister_syscore_ops(&mce_syscore_ops);
>   
> -	cpu_notifier_register_begin();
> -	__unregister_hotcpu_notifier(&mce_cpu_notifier);
> -	cpu_notifier_register_done();
> -
>   err_device_create:
>   	/*
>   	 * We didn't keep track of which devices were created above, but
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path
  2014-06-21  2:04                       ` Boris Ostrovsky
@ 2014-06-21 10:08                         ` Borislav Petkov
  0 siblings, 0 replies; 15+ messages in thread
From: Borislav Petkov @ 2014-06-21 10:08 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: tony.luck, linux-kernel, linux-edac, mattieu.souchaud

On Fri, Jun 20, 2014 at 10:04:37PM -0400, Boris Ostrovsky wrote:
> I'll try it later but this doesn't look sufficient to me: we might not
> reach this point if subsys_system_register() or zalloc_cpumask_var()
> fail.

If those fail, I'd say we have a much bigger problem than undeleted
timers.

> We could register the notifier as the first thing in this routine
> (probably after mce_available() succeeds).

I guess...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [tip:x86/urgent] x86, MCE: Robustify mcheck_init_device
  2014-06-20 14:28 [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path Boris Ostrovsky
  2014-06-20 15:23 ` Borislav Petkov
@ 2014-07-24 23:36 ` tip-bot for Borislav Petkov
  1 sibling, 0 replies; 15+ messages in thread
From: tip-bot for Borislav Petkov @ 2014-07-24 23:36 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, boris.ostrovsky, tglx, bp

Commit-ID:  51cbe7e7c400def749950ab6b2c120624dbe21a7
Gitweb:     http://git.kernel.org/tip/51cbe7e7c400def749950ab6b2c120624dbe21a7
Author:     Borislav Petkov <bp@suse.de>
AuthorDate: Fri, 20 Jun 2014 23:16:45 +0200
Committer:  Borislav Petkov <bp@suse.de>
CommitDate: Mon, 21 Jul 2014 18:14:32 +0200

x86, MCE: Robustify mcheck_init_device

BorisO reports that misc_register() fails often on xen. The current code
unregisters the CPU hotplug notifier in that case. If then a CPU is
offlined and onlined back again, we end up with a second timer running
on that CPU, leading to soft lockups and system hangs.

So let's leave the hotcpu notifier always registered - even if
mce_device_create failed for some cores and never unreg it so that we
can deal with the timer handling accordingly.

Reported-and-Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: http://lkml.kernel.org/r/1403274493-1371-1-git-send-email-boris.ostrovsky@oracle.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38..9a79c8d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2451,6 +2451,12 @@ static __init int mcheck_init_device(void)
 	for_each_online_cpu(i) {
 		err = mce_device_create(i);
 		if (err) {
+			/*
+			 * Register notifier anyway (and do not unreg it) so
+			 * that we don't leave undeleted timers, see notifier
+			 * callback above.
+			 */
+			__register_hotcpu_notifier(&mce_cpu_notifier);
 			cpu_notifier_register_done();
 			goto err_device_create;
 		}
@@ -2471,10 +2477,6 @@ static __init int mcheck_init_device(void)
 err_register:
 	unregister_syscore_ops(&mce_syscore_ops);
 
-	cpu_notifier_register_begin();
-	__unregister_hotcpu_notifier(&mce_cpu_notifier);
-	cpu_notifier_register_done();
-
 err_device_create:
 	/*
 	 * We didn't keep track of which devices were created above, but

^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2014-07-24 23:37 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-20 14:28 [PATCH] x86/mce: Don't unregister CPU hotplug notifier in error path Boris Ostrovsky
2014-06-20 15:23 ` Borislav Petkov
2014-06-20 15:41   ` Boris Ostrovsky
2014-06-20 15:58     ` Borislav Petkov
2014-06-20 16:16       ` Boris Ostrovsky
2014-06-20 17:52         ` Borislav Petkov
2014-06-20 19:39           ` Boris Ostrovsky
2014-06-20 20:03             ` Borislav Petkov
2014-06-20 20:16               ` Boris Ostrovsky
2014-06-20 20:29                 ` Borislav Petkov
2014-06-20 20:43                   ` Boris Ostrovsky
2014-06-20 21:11                     ` Borislav Petkov
2014-06-21  2:04                       ` Boris Ostrovsky
2014-06-21 10:08                         ` Borislav Petkov
2014-07-24 23:36 ` [tip:x86/urgent] x86, MCE: Robustify mcheck_init_device tip-bot for Borislav Petkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.