linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
@ 2014-11-11 20:56 Andy Lutomirski
  2014-11-11 21:36 ` Borislav Petkov
  2014-11-12 22:00 ` Oleg Nesterov
  0 siblings, 2 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-11 20:56 UTC (permalink / raw)
  To: Borislav Petkov, x86
  Cc: linux-kernel, Peter Zijlstra, Oleg Nesterov, Tony Luck,
	Andi Kleen, Andy Lutomirski

This causes all non-NMI kernel entries from userspace to run on the
normal kernel stack.

This means that machine check recovery can happen in non-atomic
context.  It also obviates the need for the paranoid_userspace path.

Borislav has referred to this idea as the tail wagging the dog.  I
think that's okay -- the dog was pretty ugly.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/kernel/entry_64.S | 61 +++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb03fb3..9c0a7311af0d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1064,6 +1064,9 @@ ENTRY(\sym)
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 
 	.if \paranoid
+	CFI_REMEMBER_STATE
+	testl $3, CS(%rsp)		/* If coming from userspace, switch */
+	jnz 1f				/* stacks. */
 	call save_paranoid
 	.else
 	call error_entry
@@ -1104,6 +1107,36 @@ ENTRY(\sym)
 	jmp error_exit			/* %ebx: no swapgs flag */
 	.endif
 
+	.if \paranoid
+	CFI_RESTORE_STATE
+	/*
+	 * Paranoid entry from userspace.  Switch stacks and treat it
+	 * as a normal entry.  This means that paranoid handlers
+	 * run in real process context if user_mode(regs).
+	 */
+1:
+	call error_entry
+
+	DEFAULT_FRAME 0
+
+	movq %rsp,%rdi			/* pt_regs pointer */
+	call sync_regs
+	movq %rax,%rsp			/* switch stack */
+
+	movq %rsp,%rdi			/* pt_regs pointer */
+
+	.if \has_error_code
+	movq ORIG_RAX(%rsp),%rsi	/* get error code */
+	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	.else
+	xorl %esi,%esi			/* no error code */
+	.endif
+
+	call \do_sym
+
+	jmp error_exit			/* %ebx: no swapgs flag */
+	.endif
+
 	CFI_ENDPROC
 END(\sym)
 .endm
@@ -1324,8 +1357,6 @@ ENTRY(paranoid_exit)
 	TRACE_IRQS_OFF_DEBUG
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz paranoid_restore
-	testl $3,CS(%rsp)
-	jnz   paranoid_userspace
 paranoid_swapgs:
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
@@ -1335,32 +1366,6 @@ paranoid_restore:
 	TRACE_IRQS_IRETQ_DEBUG 0
 	RESTORE_ALL 8
 	jmp irq_return
-paranoid_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz paranoid_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz paranoid_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl %esi,%esi 			/* arg2: oldset */
-	movq %rsp,%rdi 			/* arg1: &pt_regs */
-	call do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp paranoid_userspace
-paranoid_schedule:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	SCHEDULE_USER
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	jmp paranoid_userspace
 	CFI_ENDPROC
 END(paranoid_exit)
 
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 20:56 [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace Andy Lutomirski
@ 2014-11-11 21:36 ` Borislav Petkov
  2014-11-11 22:00   ` Luck, Tony
  2014-11-11 22:12   ` Andy Lutomirski
  2014-11-12 22:00 ` Oleg Nesterov
  1 sibling, 2 replies; 63+ messages in thread
From: Borislav Petkov @ 2014-11-11 21:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: x86, linux-kernel, Peter Zijlstra, Oleg Nesterov, Tony Luck, Andi Kleen

A very big hmmm...

On Tue, Nov 11, 2014 at 12:56:52PM -0800, Andy Lutomirski wrote:
> This causes all non-NMI kernel entries from userspace to run on the
> normal kernel stack.

So one of the reasons #MC has its own stack is because we need a
known-good stack in such situations. What if the normal kernel stack is
corrupted too due to a #MC?

> This means that machine check recovery can happen in non-atomic
> context.  It also obviates the need for the paranoid_userspace path.
> 
> Borislav has referred to this idea as the tail wagging the dog.  I
> think that's okay -- the dog was pretty ugly.

And I still am not sure about this: so the #MC handler makes implicit
assumptions that while it is running nothing is going to interrupt it
and it can access MCA MSRs. If you switch to process context, another
#MC will preempt it and overwrite MCA MSRs. Which is a no-no.

So unless I'm missing something - and I probably am - I don't think
we can run #MC handler in process context. #MC is the highest prio
abort-type exception along with processor reset for a reason.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 21:36 ` Borislav Petkov
@ 2014-11-11 22:00   ` Luck, Tony
  2014-11-11 22:15     ` Andy Lutomirski
  2014-11-11 22:12   ` Andy Lutomirski
  1 sibling, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-11 22:00 UTC (permalink / raw)
  To: Borislav Petkov, Andy Lutomirski
  Cc: x86, linux-kernel, Peter Zijlstra, Oleg Nesterov, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1791 bytes --]

So here is the flow:

1) A machine check happens - it is (currently) broadcast to all logical cpus on all sockets

2) First cpu to execute "order = atomic_inc_return(&mce_callin);" in mce_start() gets to be the "monarch" and directs things during the handler.

3) Every cpu gets to scan all the machine check banks to see what happened. If the error was a fatal one we are going to panic - this isn't the interesting case.

4) There are two kinds of recoverable error
4a) Ones not in execution context (SRAO = Software Recoverable Action Optional) -  these also aren't very interesting - save the address in a NMI safe ring buffer to process later
4b) In execution context (SRAR = Software Recoverable Action Required) - this is where we need to do some real work to convert from the physical address logged to the list of affected processes.

Now when we get to step 4b - we need to let all the other processors return from the machine check handler (they may have been interrupted in kernel context and could hold locks that we need).

We also need to clear the MSR MCG_STATUS (on each logical cpu) to indicate we are done with this machine check.


Andy - with your RFC patch - can we just make the bottom end of do_machine_check() look like this:

	/* collected everything we need from banks - re-enable machine check on all cpus */
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);

	if (we are *not* the thread with the SRAR error)
		return;

	/* do all the things that were previously in mce_notify_process() here */
}

and if we do this - what happens if we get another machine check while we are in the "do all the things" bit?

-Tony
[-- Undecodable mail-footer bytes (mangled encoding) omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 21:36 ` Borislav Petkov
  2014-11-11 22:00   ` Luck, Tony
@ 2014-11-11 22:12   ` Andy Lutomirski
  2014-11-11 22:33     ` Borislav Petkov
  1 sibling, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-11 22:12 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, linux-kernel, Peter Zijlstra, Oleg Nesterov, Tony Luck,
	Andi Kleen

On Tue, Nov 11, 2014 at 1:36 PM, Borislav Petkov <bp@alien8.de> wrote:
> A very big hmmm...
>
> On Tue, Nov 11, 2014 at 12:56:52PM -0800, Andy Lutomirski wrote:
>> This causes all non-NMI kernel entries from userspace to run on the
>> normal kernel stack.
>
> So one of the reasons #MC has its own stack is because we need a
> known-good stack in such situations. What if the normal kernel stack is
> corrupted too due to a #MC?

I don't see why it would be any more likely for the normal kernel
stack to be corrupted due to a hardware issue that interrupted ring 3
code than that the IST stack is corrupted.

>
>> This means that machine check recovery can happen in non-atomic
>> context.  It also obviates the need for the paranoid_userspace path.
>>
>> Borislav has referred to this idea as the tail wagging the dog.  I
>> think that's okay -- the dog was pretty ugly.
>
> And I still am not sure about this: so the #MC handler makes implicit
> assumptions that while it is running nothing is going to interrupt it
> and it can access MCA MSRs. If you switch to process context, another
> #MC will preempt it and overwrite MCA MSRs. Which is a no-no.
>
> So unless I'm missing something - and I probably am - I don't think
> we can run #MC handler in process context. #MC is the highest prio
> abort-type exception along with processor reset for a reason.
>

I don't know what, if anything, masks and unmasks #MC, but certainly
switching to process context like this patch does will not unmask it.
Of course, if you sleep, then all bets are off.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 22:00   ` Luck, Tony
@ 2014-11-11 22:15     ` Andy Lutomirski
  0 siblings, 0 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-11 22:15 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, x86, linux-kernel, Peter Zijlstra,
	Oleg Nesterov, Andi Kleen

On Tue, Nov 11, 2014 at 2:00 PM, Luck, Tony <tony.luck@intel.com> wrote:
> So here is the flow:
>
> 1) A machine check happens - it is (currently) broadcast to all logical cpus on all sockets
>
> 2) First cpu to execute "order = atomic_inc_return(&mce_callin);" in mce_start() gets to be the "monarch" and directs things during the handler.

This is a bit funky, since any number of #MCs might be delivered in
process context, and any one of them, or none of them, might win the
election.  Can this be made to not matter, or can the actual faulting
CPU be made to win the election?

>
> 3) Every cpu gets to scan all the machine check banks to see what happened. If the error was a fatal one we are going to panic - this isn't the interesting case.
>
> 4) There are two kinds of recoverable error
> 4a) Ones not in execution context (SRAO = Software Recoverable Action Optional) -  these also aren't very interesting - save the address in a NMI safe ring buffer to process later
> 4b) In execution context (SRAR = Software Recoverable Action Required) - this is where we need to do some real work to convert from the physical address logged to the list of affected processes.
>
> Now when we get to step 4b - we need to let all the other processors return from the machine check handler (they may have been interrupted in kernel context and could hold locks that we need).
>
> We also need to clear the MSR MCG_STATUS (on each logical cpu) to indicate we are done with this machine check.
>
>
> Andy - with your RFC patch - can we just make the bottom end of do_machine_check() look like this:
>
>         /* collected everything we need from banks - re-enable machine check on all cpus */
>         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
>
>         if (we are *not* the thread with the SRAR error)
>                 return;
>
>         /* do all the things that were previously in mce_notify_process() here */
> }

That's the intent.

>
> and if we do this - what happens if we get another machine check while we are in the "do all the things" bit?
>

Is it the case that another #MC can't happen until you clear IA32_MCG_STATUS?

In any event, this should be no more or less safe than the previous
approach of doing the main part of #MC handling, then switching
stacks, then handling the rest in user context.  But you might be able
to get away with waiting to clear IA32_MCG_STATUS until after you're
done, as long as you don't let yourself be preempted.

What are the semantics of #MC nesting and masking, anyway?  Last time
I looked at the SDM, I couldn't figure it out.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 22:12   ` Andy Lutomirski
@ 2014-11-11 22:33     ` Borislav Petkov
  2014-11-11 22:40       ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-11 22:33 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, linux-kernel, Peter Zijlstra, Oleg Nesterov, Tony Luck,
	Andi Kleen

On Tue, Nov 11, 2014 at 02:12:18PM -0800, Andy Lutomirski wrote:
> I don't see why it would be any more likely for the normal kernel
> stack to be corrupted due to a hardware issue that interrupted ring 3
> code than that the IST stack is corrupted.

The IST stack is, well, used solely for the vectors it is
assigned for. Maybe the probability of it getting bad is a bit
lower..., who knows.

> I don't know what, if anything, masks and unmasks #MC, but certainly
> switching to process context like this patch does will not unmask it.

Manuals say to clear MCG_STATUS[MCIP] before you return but you also
have to IRET. Because not having cleared MCIP and returning would shut
down the machine on another #MC.

But then what does it bring me to run on the kernel stack if I'm still
in atomic context and I can't take locks? That doesn't help me with the
memory_failure() thing.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 22:33     ` Borislav Petkov
@ 2014-11-11 22:40       ` Andy Lutomirski
  2014-11-11 23:09         ` Borislav Petkov
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-11 22:40 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, linux-kernel, Peter Zijlstra, Oleg Nesterov, Tony Luck,
	Andi Kleen

On Tue, Nov 11, 2014 at 2:33 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Nov 11, 2014 at 02:12:18PM -0800, Andy Lutomirski wrote:
>> I don't see why it would be any more likely for the normal kernel
>> stack to be corrupted due to a hardware issue that interrupted ring 3
>> code than that the IST stack is corrupted.
>
> The IST stack is, well, used solely for the vectors it is
> assigned for. Maybe the probability of it getting bad is a bit
> lower..., who knows.
>
>> I don't know what, if anything, masks and unmasks #MC, but certainly
>> switching to process context like this patch does will not unmask it.
>
> Manuals say to clear MCG_STATUS[MCIP] before you return but you also
> have to IRET. Because not having cleared MCIP and returning would shut
> down the machine on another #MC.

I wonder what the IRET is for.  There had better not be another magic
IRET unmask thing.  I'm guessing that the actual semantics are that
nothing whatsoever can mask #MC, but that a second #MC when MCIP is
still set is a shutdown condition.

>
> But then what does it bring me to run on the kernel stack if I'm still
> in atomic context and I can't take locks? That doesn't help me with the
> memory_failure() thing.

Define "atomic".

You're still running with irqs off and MCIP set.  At some point,
you're presumably done with all of the machine check registers, and
you can clear MCIP.  Now, if current == victim, you can enable irqs
and do whatever you want.

In my mind, the benefit is that you don't need to think about how to
save your information and arrange to get called back the next time
that the victim task is a non-atomic context, since you *are* the
victim task and you're running in normal irqs-disabled kernel mode.

In contrast, with the current entry code, if you enable IRQs or so
anything that could sleep, you're on the wrong stack, so you'll crash.
That means that taking mutexes, even after clearing MCIP, is
impossible.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 22:40       ` Andy Lutomirski
@ 2014-11-11 23:09         ` Borislav Petkov
  2014-11-11 23:21           ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-11 23:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, linux-kernel, Peter Zijlstra, Oleg Nesterov, Tony Luck,
	Andi Kleen

On Tue, Nov 11, 2014 at 02:40:12PM -0800, Andy Lutomirski wrote:
> I wonder what the IRET is for.  There had better not be another magic
> IRET unmask thing.  I'm guessing that the actual semantics are that
> nothing whatsoever can mask #MC, but that a second #MC when MCIP is
> still set is a shutdown condition.

Hmmm, both manuals are unclear as to what exactly reenables #MC. So
forget about IRET and look at this: "When the processor receives a
machine check when MCIP is set, it automatically enters the shutdown
state." so this really reads like a second #MC while the first is
happening would shutdown the system - regardless whether I'm still in
#MC context or not, running the first #MC handler.

I guess I needz me some hw people to actually confirm.

> Define "atomic".
> 
> You're still running with irqs off and MCIP set.  At some point,

Yes, I need to be atomic wrt another #MC so that I can be able to
read out the MCA MSRs in time and undisturbed.

> you're presumably done with all of the machine check registers, and
> you can clear MCIP.  Now, if current == victim, you can enable irqs
> and do whatever you want.

This is the key: if I enable irqs and the process gets scheduled on
another CPU, I lose. So I have to be able to say: before you run this
task on any CPU, kill it.

> In my mind, the benefit is that you don't need to think about how to
> save your information and arrange to get called back the next time
> that the victim task is a non-atomic context, since you *are* the
> victim task and you're running in normal irqs-disabled kernel mode.
> 
> In contrast, with the current entry code, if you enable IRQs or so
> anything that could sleep, you're on the wrong stack, so you'll crash.
> That means that taking mutexes, even after clearing MCIP, is
> impossible.

Hmm, it is late here and I need to think about this on a clear head
again but I think I can see the benefit of this to a certain extent.
However(!), I need to be able to run undisturbed and do the minimum work
in the #MC handler before I reenable MCEs.

But Tony also has a valid point as in what is going to happen if I
get another MCE while doing the memory_failure() dance. I guess if
memory_failure() takes proper locks, the second #MC will get to wait
until the first is done. But who knows in reality ...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 23:09         ` Borislav Petkov
@ 2014-11-11 23:21           ` Andy Lutomirski
  2014-11-12  0:22             ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-11 23:21 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, linux-kernel, Peter Zijlstra, Oleg Nesterov, Tony Luck,
	Andi Kleen

On Tue, Nov 11, 2014 at 3:09 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Nov 11, 2014 at 02:40:12PM -0800, Andy Lutomirski wrote:
>> I wonder what the IRET is for.  There had better not be another magic
>> IRET unmask thing.  I'm guessing that the actual semantics are that
>> nothing whatsoever can mask #MC, but that a second #MC when MCIP is
>> still set is a shutdown condition.
>
> Hmmm, both manuals are unclear as to what exactly reenables #MC. So
> forget about IRET and look at this: "When the processor receives a
> machine check when MCIP is set, it automatically enters the shutdown
> state." so this really reads like a second #MC while the first is
> happening would shutdown the system - regardless whether I'm still in
> #MC context or not, running the first #MC handler.
>
> I guess I needz me some hw people to actually confirm.
>
>> Define "atomic".
>>
>> You're still running with irqs off and MCIP set.  At some point,
>
> Yes, I need to be atomic wrt to another #MC so that I can be able to
> read out the MCA MSRs in time and undisturbed.
>
>> you're presumably done with all of the machine check registers, and
>> you can clear MCIP.  Now, if current == victim, you can enable irqs
>> and do whatever you want.
>
> This is the key: if I enable irqs and the process gets scheduled on
> another CPU, I lose. So I have to be able to say: before you run this
> task on any CPU, kill it.

Why do you lose?  With my patch applied, you are that process, and the
process can't possibly return to user space until you return from
do_machine_check.  In other words, it works kind of like a page fault.

>
>> In my mind, the benefit is that you don't need to think about how to
>> save your information and arrange to get called back the next time
>> that the victim task is a non-atomic context, since you *are* the
>> victim task and you're running in normal irqs-disabled kernel mode.
>>
>> In contrast, with the current entry code, if you enable IRQs or so
>> anything that could sleep, you're on the wrong stack, so you'll crash.
>> That means that taking mutexes, even after clearing MCIP, is
>> impossible.
>
> Hmm, it is late here and I need to think about this on a clear head
> again but I think I can see the benefit of this to a certain extent.
> However(!), I need to be able to run undisturbed and do the minimum work
> in the #MC handler before I reenable MCEs.
>
> But Tony also has a valid point as in what is going to happen if I
> get another MCE while doing the memory_failure() dance. I guess if
> memory_failure() takes proper locks, the second #MC will get to wait
> until the first is done. But who knows in reality ...

Yeah.  But if you haven't cleared MCIP, you go boom, which is the same
with pretty much any approach.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 23:21           ` Andy Lutomirski
@ 2014-11-12  0:22             ` Luck, Tony
  2014-11-12  0:40               ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-12  0:22 UTC (permalink / raw)
  To: Andy Lutomirski, Borislav Petkov
  Cc: X86 ML, linux-kernel, Peter Zijlstra, Oleg Nesterov, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1492 bytes --]

Andy said:
> Yeah.  But if you haven't cleared MCIP, you go boom, which is the same
> with pretty much any approach.

The current code has an ugly hole at the moment.  End of do_machine_check()
clears MCG_STATUS.  At that point we are still running on the magic stack for
machine check exceptions ... if we take a machine check in the small window
from there until we get off this stack (iret) ... then we will enter the handler
back on the same stack that we haven't finished using yet. Bad things ensue.

Andy's RFC patch removes this window.  We are already off on the normal stack
when we clear MCG_STATUS.MCIP ... so we enter the new machine check on the
magic stack, but then (I hope) transition to the kernel stack (pushing a new frame
below the other one).

Boris said:
> This is the key: if I enable irqs and the process gets scheduled on
> another CPU, I lose. So I have to be able to say: before you run this
> task on any CPU, kill it.

I don't think it matters if we sleep and schedule this task on another cpu. When
we run there we'll keep running the memory_failure() code that we were
in the middle of when we slept.  The task can move around - we just need to
make sure it doesn't *return to the user-mode instruction* that hit the machine
check before we find the pte and zero it and mark the page as POISON.

-Tony

[-- Undecodable mail-footer bytes (mangled encoding) omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12  0:22             ` Luck, Tony
@ 2014-11-12  0:40               ` Andy Lutomirski
  2014-11-12  1:06                 ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-12  0:40 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, X86 ML, linux-kernel, Peter Zijlstra,
	Oleg Nesterov, Andi Kleen

On Tue, Nov 11, 2014 at 4:22 PM, Luck, Tony <tony.luck@intel.com> wrote:
> Andy said:
>> Yeah.  But if you haven't cleared MCIP, you go boom, which is the same
>> with pretty much any approach.
>
> The current code has an ugly hole at the moment.  End of do_machine_check()
> clears MCG_STATUS.  At that point we are still running on the magic stack for
> machine check exceptions ... if we take a machine check in the small window
> from there until we get off this stack (iret) ... then we will enter the handler
> back on the same stack that we haven't finished using yet. Bad things ensue.
>
> Andy's RFC patch removes this window.  We are already off on the normal stack
> when we clear MCG_STATUS.MCIP ...

Only if the first #MC came from user mode.

> so we enter the new machine check on the
> magic stack, but then (I hope) transition to the kernel stack (pushing a new frame
> below the other one).

We could, in theory, do this, but it seems really dangerous.  First,
there's a significant risk of overflowing the stack, since it makes
stack usage impossible to analyze.  Second, there are contexts in
which the kernel stack is unusable (the syscall prologue and epilogue
are major examples).

If the first #MC hits in user mode, then we'll transition to the
kernel stack.  If the second #MC hits in kernel mode, then we'll
handle it on the IST stack.

I think that the real nasty case is when there's a broadcast MCE that
hits a non-offending CPU that's running in kernel mode.  The best that
I can come up with is to find a way to schedule a work item from the
extremely atomic context that we're running in and defer clearing MCIP
to that work item.

I've thought about one sneaky option.  If we can reliably determine
that we're an innocent bystander of a broadcast #MC, can we send an
IPI-to-self and return without clearing MCIP?  Then we get another
interrupt as soon as interrupts are enabled, and we can clear MCIP at
a time when we're definitely not running on the IST stack.

>
> Boris said:
>> This is the key: if I enable irqs and the process gets scheduled on
>> another CPU, I lose. So I have to be able to say: before you run this
>> task on any CPU, kill it.
>
> I don't think it matters if sleep and schedule this task on another cpu. When
> we run there we'll keep running the memory_failure() code that we were
> in the middle of when we slept.  The task can move around - we just need to
> make sure it doesn't *return to the user-mode instruction* that hit the machine
> check before we find the pte and zero it and mark the page as POISON.

Yeah, this is the idea.

But, damnit, machine check broadcast is the worst idea ever.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12  0:40               ` Andy Lutomirski
@ 2014-11-12  1:06                 ` Luck, Tony
  2014-11-12  2:01                   ` Andy Lutomirski
  2014-11-12  2:06                   ` Tony Luck
  0 siblings, 2 replies; 63+ messages in thread
From: Luck, Tony @ 2014-11-12  1:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, X86 ML, linux-kernel, Peter Zijlstra,
	Oleg Nesterov, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1643 bytes --]

> I've thought about one sneaky option.  If we can reliably determine
> that we're an innocent bystander of a broadcast #MC, can we send an
> IPI-to-self and return without clearing MCIP?  Then we get another
> interrupt as soon as interrupts are enabled, and we can clear MCIP at
> a time when we're definitely not running on the IST stack.

Innocent bystanders have RIPV=1, EIPV=0 in MCG_STATUS ... so they
are quite easy to spot.  Perhaps we might look at subverting the silly
broadcast by just having them immediately clear MCG_STATUS and iret
(i.e. not go to do_machine_check() at all).  That would require lots of
surgery to do_machine_check() and friends - now it wouldn't be sure
how many processors to expect to show up.  It also opens a different
window - once they are back running normal code they might trip another
machine check while the victims of the first are still processing - so
another "boom, you're dead".  The advantage of hitting everyone
with the machine check is that it lessens the chance that another will
happen as everyone is running looking at a few pages of kernel code
& data.

The worrying part in that is "as soon as interrupts are enabled". Until
we do clear MCIP we're sitting in a mode where another machine check
means instant death no saving throw.  Nominally better than the "we'll
mess the stack up for you" that we are trying to avoid - but the old window
is quite short and known to be bounded. The new one might be a lot bigger.

-Tony

[-- Undecodable mail-footer bytes (mangled encoding) omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12  1:06                 ` Luck, Tony
@ 2014-11-12  2:01                   ` Andy Lutomirski
  2014-11-12  2:06                   ` Tony Luck
  1 sibling, 0 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-12  2:01 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, X86 ML, linux-kernel, Peter Zijlstra,
	Oleg Nesterov, Andi Kleen

On Tue, Nov 11, 2014 at 5:06 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> I've thought about one sneaky option.  If we can reliably determine
>> that we're an innocent bystander of a broadcast #MC, can we send an
>> IPI-to-self and return without clearing MCIP?  Then we get another
>> interrupt as soon as interrupts are enabled, and we can clear MCIP at
>> a time when we're definitely not running on the IST stack.
>
> Innocent bystanders have RIPV=1, EIPV=0 in MCG_STATUS ... so they
> are quite easy to spot.  Perhaps we might look at subverting the silly
> broadcast by just having them immediately clear MCG_STATUS and iret
> (i.e. not go to do_machine_check() at all).  That would require lots of
> surgery to do_machine_check() and friends - now it wouldn't be sure
> how many processors to expect to show up.  It also opens a different
> window - once they are back running normal code they might trip another
> machine check while the victims of the first are still processing - so
> another "boom, you're dead".  The advantage of hitting everyone
> with the machine check is that it lessens the chance that another will
> happen as everyone is running looking at a few pages of kernel code
> & data.
>
> The worrying part in that is "as soon as interrupts are enabled". Until
> we do clear MCIP we're sitting in a mode where another machine check
> means instant death, no saving throw.  Nominally better than the "we'll
> mess the stack up for you" that we are trying to avoid - but the old window
> is quite short and known to be bounded. The new one might be a lot bigger.

Yeah, fair enough.

The annoying thing is that there's no way to atomically return from
interrupt and clear MCIP.

Here's a different idea.  In do_machine_check, check if (regs->sp
points at the machine check IST stack && !user_mode(regs)) and, if so,
declare the machine check to be unrecoverable.  There are a couple
ways this can happen:

 - This is a second #MC that hit after clearing MCIP and before
returning.  It's genuinely unrecoverable (we're well and truly screwed
at this point), but we probably won't actually crash unless we try to
return.

 - This is a normal #MC that hit in kernel mode during a time when sp
was bogus and coincidentally pointed at the #MC IST stack.

This isn't perfect.  A malicious user can do dummy syscalls in a loop
on one CPU with rsp pointing at the IST stack and try to cause a
machine check on a different CPU, causing the system to panic when it
thinks that the first CPU had a recursive IST usage.  I think that we
probably have bigger problems if a malicious user can cause machine
checks, though.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12  1:06                 ` Luck, Tony
  2014-11-12  2:01                   ` Andy Lutomirski
@ 2014-11-12  2:06                   ` Tony Luck
  2014-11-12 10:30                     ` Borislav Petkov
  1 sibling, 1 reply; 63+ messages in thread
From: Tony Luck @ 2014-11-12  2:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, X86 ML, linux-kernel, Peter Zijlstra,
	Oleg Nesterov, Andi Kleen

> Innocent bystanders have RIPV=1, EIPV=0 in MCG_STATUS ... so they
> are quite easy to spot.

Bother ... except for the SRAO cases where *everyone* is an innocent
bystander - but someone should go look for the error and queue up
a page offline event.  Perhaps for this we'd do the self-ipi trick and
then scan the banks to look for SRAO signatures to process as well
as clearing MCIP.

-Tony

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12  2:06                   ` Tony Luck
@ 2014-11-12 10:30                     ` Borislav Petkov
  2014-11-12 15:48                       ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-12 10:30 UTC (permalink / raw)
  To: Tony Luck, Andy Lutomirski
  Cc: X86 ML, linux-kernel, Peter Zijlstra, Oleg Nesterov, Andi Kleen

On Tue, Nov 11, 2014 at 06:06:48PM -0800, Tony Luck wrote:
> > Innocent bystanders have RIPV=1, EIPV=0 in MCG_STATUS ... so they
> > are quite easy to spot.
> 
> Bother ... except for the SRAO cases where *everyone* is an innocent
> bystander - but someone should go look for the error and queue up
> a page offline event.  Perhaps for this we'd do the self-ipi trick and
> then scan the banks to look for SRAO signatures to process as well
> as clearing MCIP.

First of all, good morning :-)

So I'm reading all those notes again and I can't say I'm convinced about
the change. And all those "but but, what might happen if" cases are what
bother me.

First, the stack: switching to a possibly almost full kernel stack
where we have other stuff already allocated is a bad idea IMO. There's
a good reason to have a known-good, solely-for-our-purpose stack which
we get to use. It is always there and it gives us the optimal conditions
possible for recovery. Switching back to the possibly crowded kernel
stack and causing stack overflows while handling an MCE is not an
improvement in my book.

Tony:
> The current code has an ugly hole at the moment. End of
> do_machine_check() clears MCG_STATUS. At that point we are still
> running on the magic stack for machine check exceptions ... if we take
> a machine check in the small window from there until we get off this
> stack (iret) ... then we will enter the handler back on the same stack
> that we haven't finished using yet. Bad things ensue.

I'm not crazy about it either but how often does this actually happen in
practice? And if it does happen, then, well, we die, so be it. It can
happen with other MCEs coming in back-to-back like poisoned data (1st
MCE) being consumed on another core (2nd MCE). I know, Intel does the
broadcasting but that is going to change, I hear, and AMD raises an MCE
on the core which detects it only.

Another big issue I see with this is verifying the locking works and
generally testing this serious change. MCA is core kernel infrastructure
and we cannot break this left and right. And changing stacks we're
running on and the whole dynamics of the code might open a whole other
can of worms.

Oh, and we shouldn't forget why we're doing this: just to save two
members to task_struct and getting rid of paranoid_userspace. Yep, still
not convinced the effort is worth it.

Now, I think Andy had a much better/simpler/cleaner suggestion
yesterday: task_work_add(). It is basically what _TIF_MCE_NOTIFY does
and it is called in the same path - do_notify_resume() - but generic and
we can drop _TIF_MCE_NOTIFY then and use the generic thing.

Much less intrusive change.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 10:30                     ` Borislav Petkov
@ 2014-11-12 15:48                       ` Andy Lutomirski
  2014-11-12 16:22                         ` Borislav Petkov
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-12 15:48 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Tony Luck,
	Oleg Nesterov

On Nov 12, 2014 2:30 AM, "Borislav Petkov" <bp@alien8.de> wrote:
>
> On Tue, Nov 11, 2014 at 06:06:48PM -0800, Tony Luck wrote:
> > > Innocent bystanders have RIPV=1, EIPV=0 in MCG_STATUS ... so they
> > > are quite easy to spot.
> >
> > Bother ... except for the SRAO cases where *everyone* is an innocent
> > bystander - but someone should go look for the error and queue up
> > a page offline event.  Perhaps for this we'd do the self-ipi trick and
> > then scan the banks to look for SRAO signatures to process as well
> > as clearing MCIP
>
> First of all, good morning :-)
>
> So I'm reading all those notes again and I can't say I'm convinced about
> the change. And all those "but but, what might happen if" cases are what
> bother me.
>
> First, the stack: switching to a possibly almost full kernel stack
> where we have other stuff already allocated is a bad idea IMO. There's
> a good reason to have a known-good, solely-for-our-purpose stack which
> we get to use. It is always there and it gives us the optimal conditions
> possible for recovery. Switching back to the possibly crowded kernel
> stack and causing stack overflows while handling an MCE is not an
> improvement in my book.

I only switch stacks on entry from userspace, and the kernel stack is
completely empty if that happens.

>
> Tony:
> > The current code has an ugly hole at the moment. End of
> > do_machine_check() clears MCG_STATUS. At that point we are still
> > running on the magic stack for machine check exceptions ... if we take
> > a machine check in the small window from there until we get off this
> > stack (iret) ... then we will enter the handler back on the same stack
> > that we haven't finished using yet. Bad things ensue.
>
> I'm not crazy about it either but how often does this actually happen in
> practice? And if it does happen, then, well, we die, so be it. It can
> happen with other MCEs coming in back-to-back like poisoned data (1st
> MCE) being consumed on another core (2nd MCE). I know, Intel does the
> broadcasting but that is going to change, I hear, and AMD raises an MCE
> on the core which detects it only.
>
> Another big issue I see with this is verifying the locking works and
> generally testing this serious change. MCA is core kernel infrastructure
> and we cannot break this left and right. And changing stacks we're
> running on and the whole dynamics of the code might open a whole other
> can of worms.

One nice thing for testing is that my patch applies to int3 from
userspace as well, and that's easy to test.

>
> Oh, and we shouldn't forget why we're doing this: just to save two
> members to task_struct and getting rid of paranoid_userspace. Yep, still
> not convinced the effort is worth it.

I think I want to make this change anyway, though, since it may
simplify fsgsbase support enough to justify it solely on that account.
I don't think that the machine check code needs to change at all to
accommodate a stack switch, but I think it makes some simplifications
possible.

>
> Now, I think Andy had a much better/simpler/cleaner suggestion
> yesterday: task_work_add(). It is basically what _TIF_MCE_NOTIFY does
> and it is called in the same path - do_notify_resume() - but generic and
> we can drop _TIF_MCE_NOTIFY then and use the generic thing.
>
> Much less intrusive change.

Less intrusive is certainly true.

--Andy

>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 15:48                       ` Andy Lutomirski
@ 2014-11-12 16:22                         ` Borislav Petkov
  2014-11-12 17:17                           ` Luck, Tony
  2014-11-13 18:04                           ` Borislav Petkov
  0 siblings, 2 replies; 63+ messages in thread
From: Borislav Petkov @ 2014-11-12 16:22 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Tony Luck,
	Oleg Nesterov

On Wed, Nov 12, 2014 at 07:48:15AM -0800, Andy Lutomirski wrote:
> I only switch stacks on entry from userspace, and the kernel stack is
> completely empty if that happens.

Ok, fair enough. There's still the argument that something might've
corrupted the kernel stack memory while the MCE_STACK is used only by
#MC.

Btw, we could try something else like making the duration we run on the
IST stack shorter by simply reading out the MCA MSRs, then switch stacks
on exit and do the rest of the processing on the kernel stack. I have no
idea whether something like that would even work/be better.

> One nice thing for testing is that my patch applies to int3 from
> userspace as well, and that's easy to test.

Not that easy for testing the #MC path - there we have to inject real
MCEs and then noodle through the memory_failure() code. I'd be very much
interested to see what would happen if two MCEs happen back-to-back with
your change, the second one being raised when we're on the kernel stack
and in memory_failure()...

> I think I want to make this change anyway, though, since it may
> simplify fsgsbase support enough to justify it solely on that account.
> I don't think that the machine check code needs to change at all to
> accommodate a stack switch, but I think it makes some simplifications
> possible.

Right, I'm very nervous when touching this with non-trivial changes.

> Less intrusive is certainly true.

Right, I can do it in the meantime and we can always experiment more
later. Getting rid of _TIF_MCE_NOTIFY is a good thing already.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 16:22                         ` Borislav Petkov
@ 2014-11-12 17:17                           ` Luck, Tony
  2014-11-12 17:30                             ` Borislav Petkov
  2014-11-13 18:04                           ` Borislav Petkov
  1 sibling, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-12 17:17 UTC (permalink / raw)
  To: Borislav Petkov, Andy Lutomirski
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Oleg Nesterov

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1401 bytes --]

> Not that easy for testing the #MC path - there we have to inject real
> MCEs and then noodle through the memory_failure() code. I'd be very much
> interested to see what would happen if two MCEs happen back-to-back with
> your change, the second one being raised when we're on the kernel stack
> and in memory_failure()...

If the second one hits before we clear MCG_STATUS, then the processor resets.

If the second one is caused by the recovery thread somewhere in memory_failure(),
then Andy won't switch stacks - but we will declare this a fatal error and panic (we have
no recovery from machine checks in the kernel).

Otherwise the memory_failure() thread is the innocent bystander. If the affected thread
decides to do recovery, then the first thread will be allowed to return and continue.

I might worry a bit if the second error is another thread hitting the *same* page which
hasn't finished processing yet ... then the second will chase along behind the first trying
to fix the same problem.  I *think* the first will complete and the second will just end
up here:

	if (TestSetPageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
		return 0;
	}

which is really early in memory_failure().

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 17:17                           ` Luck, Tony
@ 2014-11-12 17:30                             ` Borislav Petkov
  0 siblings, 0 replies; 63+ messages in thread
From: Borislav Petkov @ 2014-11-12 17:30 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Andy Lutomirski, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

On Wed, Nov 12, 2014 at 05:17:55PM +0000, Luck, Tony wrote:
> > Not that easy for testing the #MC path - there we have to inject real
> > MCEs and then noodle through the memory_failure() code. I'd be very much
> > interested to see what would happen if two MCEs happen back-to-back with
> > your change, the second one being raised when we're on the kernel stack
> > and in memory_failure()...
> 
> If the second one hits before we clear MCG_STATUS, then the processor resets.
> 
> If the second one is caused by the recovery thread somewhere in memory_failure(),
> then Andy won't switch stacks - but we will declare this a fatal error and panic (we have
> no recovery from machine checks in the kernel).
> 
> Otherwise the memory_failure() thread is the innocent bystander. If the affected thread
> decides to do recovery, then the first thread will be allowed to return and continue.
> 
> I might worry a bit if the second error is another thread hitting the *same* page which
> hasn't finished processing yet ... then the second will chase along behind the first trying
> to fix the same problem.  I *think* the first will complete and the second will just end
> up here:
> 
> 	if (TestSetPageHWPoison(p)) {
> 		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
> 		return 0;
> 	}
> 
> which is really early in memory_failure().

Yeah, I meant this case: when we have switched stacks, exited
do_machine_check() and running the recovery code. Exactly then we get
another MCE. And the code might handle it, as you say, but I'd like to
see this in action first to be sure - it is not trivial code.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-11 20:56 [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace Andy Lutomirski
  2014-11-11 21:36 ` Borislav Petkov
@ 2014-11-12 22:00 ` Oleg Nesterov
  2014-11-12 23:17   ` Andy Lutomirski
  1 sibling, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2014-11-12 22:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, x86, linux-kernel, Peter Zijlstra, Tony Luck,
	Andi Kleen

Andy,

As I said many times I do not understand asm ;) so most probably I missed
something but let me ask anyway.

On 11/11, Andy Lutomirski wrote:
>
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1064,6 +1064,9 @@ ENTRY(\sym)
>  	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
>
>  	.if \paranoid
> +	CFI_REMEMBER_STATE
> +	testl $3, CS(%rsp)		/* If coming from userspace, switch */
> +	jnz 1f				/* stacks. */
>  	call save_paranoid
>  	.else
>  	call error_entry
> @@ -1104,6 +1107,36 @@ ENTRY(\sym)
>  	jmp error_exit			/* %ebx: no swapgs flag */
>  	.endif
>
> +	.if \paranoid
> +	CFI_RESTORE_STATE
> +	/*
> +	 * Paranoid entry from userspace.  Switch stacks and treat it
> +	 * as a normal entry.  This means that paranoid handlers
> +	 * run in real process context if user_mode(regs).
> +	 */
> +1:
> +	call error_entry
> +
> +	DEFAULT_FRAME 0
> +
> +	movq %rsp,%rdi			/* pt_regs pointer */
> +	call sync_regs

Can't we simplify sync_regs() then?

> @@ -1324,8 +1357,6 @@ ENTRY(paranoid_exit)
>  	TRACE_IRQS_OFF_DEBUG
>  	testl %ebx,%ebx				/* swapgs needed? */
>  	jnz paranoid_restore
> -	testl $3,CS(%rsp)
> -	jnz   paranoid_userspace
>  paranoid_swapgs:

Looks like this label can die.

> -paranoid_userspace:
> -	GET_THREAD_INFO(%rcx)
> -	movl TI_flags(%rcx),%ebx
> -	andl $_TIF_WORK_MASK,%ebx
> -	jz paranoid_swapgs
> -	movq %rsp,%rdi			/* &pt_regs */
> -	call sync_regs
> -	movq %rax,%rsp			/* switch stack for scheduling */
> -	testl $_TIF_NEED_RESCHED,%ebx
> -	jnz paranoid_schedule
> -	movl %ebx,%edx			/* arg3: thread flags */
> -	TRACE_IRQS_ON
> -	ENABLE_INTERRUPTS(CLBR_NONE)
> -	xorl %esi,%esi 			/* arg2: oldset */
> -	movq %rsp,%rdi 			/* arg1: &pt_regs */
> -	call do_notify_resume

So, before this patch we use _TIF_WORK_MASK to decide if we need to call
do_notify_resume().

After this patch we jump to error_exit and it checks the same _TIF_WORK_MASK.
But note that retint_careful->retint_careful checks another mask,
_TIF_DO_NOTIFY_MASK.

So it seems to me we can miss (say) TIF_UPROBE after int3 handler, no?

Yes, even _if_ I am right we should blame these masks, _TIF_DO_NOTIFY_MASK
should probably include _TIF_UPROBE (and afaics in this case we can remove
set_thread_flag(TIF_NOTIFY_RESUME) in uprobe_deny_signal()).

And in any case, can't we cleanup _TIF_WORK_MASK and _TIF_ALLWORK_MASK?
IMHO, they should clearly define which bits we want to check.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 22:00 ` Oleg Nesterov
@ 2014-11-12 23:17   ` Andy Lutomirski
  2014-11-12 23:41     ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-12 23:17 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Borislav Petkov, X86 ML, linux-kernel, Peter Zijlstra, Tony Luck,
	Andi Kleen

On Wed, Nov 12, 2014 at 2:00 PM, Oleg Nesterov <oleg@redhat.com> wrote:
> Andy,
>
> As I said many times I do not understand asm ;) so most probably I missed
> something but let me ask anyway.

You must be the most competent non-asm-speaking asm reviewer in the world :)

>
> On 11/11, Andy Lutomirski wrote:
>>
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -1064,6 +1064,9 @@ ENTRY(\sym)
>>       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
>>
>>       .if \paranoid
>> +     CFI_REMEMBER_STATE
>> +     testl $3, CS(%rsp)              /* If coming from userspace, switch */
>> +     jnz 1f                          /* stacks. */
>>       call save_paranoid
>>       .else
>>       call error_entry
>> @@ -1104,6 +1107,36 @@ ENTRY(\sym)
>>       jmp error_exit                  /* %ebx: no swapgs flag */
>>       .endif
>>
>> +     .if \paranoid
>> +     CFI_RESTORE_STATE
>> +     /*
>> +      * Paranoid entry from userspace.  Switch stacks and treat it
>> +      * as a normal entry.  This means that paranoid handlers
>> +      * run in real process context if user_mode(regs).
>> +      */
>> +1:
>> +     call error_entry
>> +
>> +     DEFAULT_FRAME 0
>> +
>> +     movq %rsp,%rdi                  /* pt_regs pointer */
>> +     call sync_regs
>
> Can't we simplify sync_regs() then?

Yes.  Will do.

>
>> @@ -1324,8 +1357,6 @@ ENTRY(paranoid_exit)
>>       TRACE_IRQS_OFF_DEBUG
>>       testl %ebx,%ebx                         /* swapgs needed? */
>>       jnz paranoid_restore
>> -     testl $3,CS(%rsp)
>> -     jnz   paranoid_userspace
>>  paranoid_swapgs:
>
> Looks like this label can die.
>

Yes.

>> -paranoid_userspace:
>> -     GET_THREAD_INFO(%rcx)
>> -     movl TI_flags(%rcx),%ebx
>> -     andl $_TIF_WORK_MASK,%ebx
>> -     jz paranoid_swapgs
>> -     movq %rsp,%rdi                  /* &pt_regs */
>> -     call sync_regs
>> -     movq %rax,%rsp                  /* switch stack for scheduling */
>> -     testl $_TIF_NEED_RESCHED,%ebx
>> -     jnz paranoid_schedule
>> -     movl %ebx,%edx                  /* arg3: thread flags */
>> -     TRACE_IRQS_ON
>> -     ENABLE_INTERRUPTS(CLBR_NONE)
>> -     xorl %esi,%esi                  /* arg2: oldset */
>> -     movq %rsp,%rdi                  /* arg1: &pt_regs */
>> -     call do_notify_resume
>
> So, before this patch we use _TIF_WORK_MASK to decide if we need to call
> do_notify_resume().
>
> After this patch we jump to error_exit and it checks the same _TIF_WORK_MASK.
> But note that retint_careful->retint_careful checks another mask,
> _TIF_DO_NOTIFY_MASK.
>
> So it seems to me we can miss (say) TIF_UPROBE after int3 handler, no?

Only if we didn't call uprobe_deny_signal.  Presumably the
TIF_NOTIFY_RESUME got added there because it didn't work on x86 due to
exactly this issue.

>
> Yes, even _if_ I am right we should blame these masks, _TIF_DO_NOTIFY_MASK
> should probably include _TIF_UPROBE (and afaics in this case we can remove
> set_thread_flag(TIF_NOTIFY_RESUME) in uprobe_deny_signal()).

Ugh, yes.  I'll fix that with a separate patch.  Fortunately, the
other uprobe architectures seem to be okay.

>
> And in any case, can't we cleanup _TIF_WORK_MASK and _TIF_ALLWORK_MASK?
> IMHO, they should clearly define which bits we want to check.

Almost certainly.  I'll leave that for another day, though.

v2 coming soon with these changes and some additional comment cleanups.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 23:17   ` Andy Lutomirski
@ 2014-11-12 23:41     ` Luck, Tony
  2014-11-13  0:02       ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-12 23:41 UTC (permalink / raw)
  To: Andy Lutomirski, Oleg Nesterov
  Cc: Borislav Petkov, X86 ML, linux-kernel, Peter Zijlstra, Andi Kleen

[-- Attachment #1: Type: text/plain, Size: 785 bytes --]

> v2 coming soon with these changes and some additional comment cleanups.

So v1 + do_machine_check change is not surviving some real testing.  I'm injecting and
consuming errors sequentially with a small delay in between - so no fancy corner cases with
multiple errors being processed ... we get all the way done with one error before we start
the next.  Test only survives about 400ish recoveries before Linux dies complaining:
    "Timeout synchronizing machine check over CPUs". 
This probably means that some cpu wandered into the weeds and never showed up in the
handler.

patch just moves the recovery to the end of do_machine_check() instead of
doing the whole TIF_MCE_NOTIFY thing - and deletes all the TIF_MCE_NOTIFY
bits that are no longer needed.

-Tony



[-- Attachment #2: mce-stack.patch --]
[-- Type: application/octet-stream, Size: 6556 bytes --]

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 958b90f761e5..be5a51a4a219 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -187,7 +187,6 @@ enum mcp_flags {
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
-void mce_notify_process(void);
 
 DECLARE_PER_CPU(struct mce, injectm);
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 854053889d4d..116973fe8819 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -75,7 +75,6 @@ struct thread_info {
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
-#define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_UPROBE		12	/* breakpointed or singlestepping */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
@@ -100,7 +99,6 @@ struct thread_info {
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
-#define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
@@ -140,8 +138,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
-	 _TIF_USER_RETURN_NOTIFY)
+	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |	_TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 61a9668cebfd..f7378aadaa5d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -956,51 +956,6 @@ static void mce_clear_state(unsigned long *toclear)
 }
 
 /*
- * Need to save faulting physical address associated with a process
- * in the machine check handler some place where we can grab it back
- * later in mce_notify_process()
- */
-#define	MCE_INFO_MAX	16
-
-struct mce_info {
-	atomic_t		inuse;
-	struct task_struct	*t;
-	__u64			paddr;
-	int			restartable;
-} mce_info[MCE_INFO_MAX];
-
-static void mce_save_info(__u64 addr, int c)
-{
-	struct mce_info *mi;
-
-	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
-		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
-			mi->t = current;
-			mi->paddr = addr;
-			mi->restartable = c;
-			return;
-		}
-	}
-
-	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
-}
-
-static struct mce_info *mce_find_info(void)
-{
-	struct mce_info *mi;
-
-	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
-		if (atomic_read(&mi->inuse) && mi->t == current)
-			return mi;
-	return NULL;
-}
-
-static void mce_clear_info(struct mce_info *mi)
-{
-	atomic_set(&mi->inuse, 0);
-}
-
-/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
@@ -1037,6 +992,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
+	u64 recover_paddr = ~0ull;
+	int flags = MF_ACTION_REQUIRED;
 
 	this_cpu_inc(mce_exception_count);
 
@@ -1155,9 +1112,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (no_way_out)
 			mce_panic("Fatal machine check on current CPU", &m, msg);
 		if (worst == MCE_AR_SEVERITY) {
-			/* schedule action before return to userland */
-			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
-			set_thread_flag(TIF_MCE_NOTIFY);
+			recover_paddr = m.addr;
+			if (!(m.mcgstatus & MCG_STATUS_RIPV))
+				flags |= MF_MUST_KILL;
 		} else if (kill_it) {
 			force_sig(SIGBUS, current);
 		}
@@ -1168,6 +1125,21 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
 	sync_core();
+
+	if (recover_paddr == ~0ull)
+		return;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		 recover_paddr);
+	/*
+	 * We must call memory_failure() here even if the current process is
+	 * doomed. We still need to mark the page as poisoned and alert any
+	 * other users of the page.
+	 */
+	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
@@ -1185,42 +1157,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 #endif
 
 /*
- * Called in process context that interrupted by MCE and marked with
- * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * This code is allowed to sleep.
- * Attempt possible recovery such as calling the high level VM handler to
- * process any corrupted pages, and kill/signal current process if required.
- * Action required errors are handled here.
- */
-void mce_notify_process(void)
-{
-	unsigned long pfn;
-	struct mce_info *mi = mce_find_info();
-	int flags = MF_ACTION_REQUIRED;
-
-	if (!mi)
-		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
-	pfn = mi->paddr >> PAGE_SHIFT;
-
-	clear_thread_flag(TIF_MCE_NOTIFY);
-
-	pr_err("Uncorrected hardware memory error in user-access at %llx",
-		 mi->paddr);
-	/*
-	 * We must call memory_failure() here even if the current process is
-	 * doomed. We still need to mark the page as poisoned and alert any
-	 * other users of the page.
-	 */
-	if (!mi->restartable)
-		flags |= MF_MUST_KILL;
-	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
-		pr_err("Memory error not recovered");
-		force_sig(SIGBUS, current);
-	}
-	mce_clear_info(mi);
-}
-
-/*
  * Action optional processing happens here (picking up
  * from the list of faulting pages that do_machine_check()
  * placed into the "ring").
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..2a33c8f68319 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
 	user_exit();
 
-#ifdef CONFIG_X86_MCE
-	/* notify userspace of pending MCEs */
-	if (thread_info_flags & _TIF_MCE_NOTIFY)
-		mce_notify_process();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
 	if (thread_info_flags & _TIF_UPROBE)
 		uprobe_notify_resume(regs);
 

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 23:41     ` Luck, Tony
@ 2014-11-13  0:02       ` Andy Lutomirski
  2014-11-13  0:31         ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-13  0:02 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Wed, Nov 12, 2014 at 3:41 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> v2 coming soon with these changes and some additional comment cleanups.
>

v2's not going to make a difference unless you're using uprobes at the
same time.

> So v1 + do_machine_check change is not surviving some real testing.  I'm injecting and
> consuming errors sequentially with a small delay in between - so no fancy corner cases with
> multiple errors being processed ... we get all the way done with one error before we start
> the next.  Test only survives about 400ish recoveries before Linux dies complaining:
>     "Timeout synchronizing machine check over CPUs".
> This probably means that some cpu wandered into the weeds and never showed up in the
> handler.

In the interest of my sanity, can you add something like
BUG_ON(!user_mode_vm(regs)) or the mce_panic equivalent before calling
memory_failure?

What happens if there's a shared bank but the actual offender has a
higher order than the cpu that finds the error?

Is this something I can try under KVM?

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13  0:02       ` Andy Lutomirski
@ 2014-11-13  0:31         ` Luck, Tony
  2014-11-13  1:34           ` Andy Lutomirski
                             ` (2 more replies)
  0 siblings, 3 replies; 63+ messages in thread
From: Luck, Tony @ 2014-11-13  0:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1430 bytes --]

> v2's not going to make a difference unless you're using uprobes at the
> same time.

Not (knowingly) using uprobes. System is installed with a RHEL7 userspace ... but is essentially
idle except for my test program.

> In the interest of my sanity, can you add something like
> BUG_ON(!user_mode_vm(regs)) or the mce_panic equivalent before calling
> memory_failure?

I don't think that can possibly trip - we can only end up with a recoverable error from
a user-mode access.  But I'll see about adding it anyway.

> What happens if there's a shared bank but the actual offender has a
> higher order than the cpu that finds the error?

This test case injects a memory error which is logged in bank1. This bank is shared by the
two hyperthreads that are on the same core.  The mce_severity() function distinguishes
which is the active thread and which the innocent bystander by looking at MCG_STATUS.
In the active thread MCG_STATUS.EIPV is 1, in the bystander it is 0. The returned severity
is MCE_AR_SEVERITY for the thread that hit the error, MCE_KEEP_SEVERITY for the bystander.
So it doesn't matter which thread has the lower order and sees it first.

> Is this something I can try under KVM?

I don't know if KVM has a way to simulate a machine check event.

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13  0:31         ` Luck, Tony
@ 2014-11-13  1:34           ` Andy Lutomirski
  2014-11-13  3:03           ` Andy Lutomirski
  2014-11-13 10:59           ` Borislav Petkov
  2 siblings, 0 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-13  1:34 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Wed, Nov 12, 2014 at 4:31 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> v2's not going to make a difference unless you're using uprobes at the
>> same time.
>
> Not (knowingly) using uprobes. System is installed with a RHEL7 userspace ... but is essentially
> idle except for my test program.
>
>> In the interest of my sanity, can you add something like
>> BUG_ON(!user_mode_vm(regs)) or the mce_panic equivalent before calling
>> memory_failure?
>
> I don't think that can possibly trip - we can only end up with a recoverable error from
> a user mode access.  But I'll see about adding it anyway
>
>> What happens if there's a shared bank but the actual offender has a
>> higher order than the cpu that finds the error?
>
> This test case injects a memory error which is logged in bank1. This bank is shared by the
> two hyperthreads that are on the same core.  The mce_severity() function distinguishes
> which is the active thread and which the innocent bystander by looking at MCG_STATUS.
> In the active thread MCG_STATUS.EIPV is 1, in the bystander it is 0. The returned severity
> is MCE_AR_SEVERITY for the thread that hit the error, MCE_KEEP_SEVERITY for the bystander.
> So it doesn't matter which thread has the lower order and sees it first.
>
>> Is this something I can try under KVM?
>
> I don't know if KVM has a way to simulate a machine check event.

With mce-inject and this input:

CPU 1
BANK 0
STATUS uc s ar val en 0x0134
MCGSTATUS mcip eipv
ADDR 4096
MISC 0
IN_PROC
EXCP

I blow up because mce-inject doesn't have a credible emulation of
process context (i.e. IN_PROC doesn't work).

--Andy

>
> -Tony



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13  0:31         ` Luck, Tony
  2014-11-13  1:34           ` Andy Lutomirski
@ 2014-11-13  3:03           ` Andy Lutomirski
  2014-11-13 18:43             ` Luck, Tony
  2014-11-14 10:34             ` Borislav Petkov
  2014-11-13 10:59           ` Borislav Petkov
  2 siblings, 2 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-13  3:03 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Wed, Nov 12, 2014 at 4:31 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> v2's not going to make a difference unless you're using uprobes at the
>> same time.
>
> Not (knowingly) using uprobes. System is installed with a RHEL7 userspace ... but is essentially
> idle except for my test program.
>
>> In the interest of my sanity, can you add something like
>> BUG_ON(!user_mode_vm(regs)) or the mce_panic equivalent before calling
>> memory_failure?
>
> I don't think that can possibly trip - we can only end up with a recoverable error from
> a user mode access.  But I'll see about adding it anyway
>
>> What happens if there's a shared bank but the actual offender has a
>> higher order than the cpu that finds the error?
>
> This test case injects a memory error which is logged in bank1. This bank is shared by the
> two hyperthreads that are on the same core.  The mce_severity() function distinguishes
> which is the active thread and which the innocent bystander by looking at MCG_STATUS.
> In the active thread MCG_STATUS.EIPV is 1, in the bystander it is 0. The returned severity
> is MCE_AR_SEVERITY for the thread that hit the error, MCE_KEEP_SEVERITY for the bystander.
> So it doesn't matter which thread has the lower order and sees it first.
>
>> Is this something I can try under KVM?
>
> I don't know if KVM has a way to simulate a machine check event.

printk seems to work just fine in do_machine_check.  Any chance you
can instrument, for each cpu, all entries to do_machine_check, all
calls to do_machine_check, all returns, and everything that tries to
do memory_failure?

Also, shouldn't there be a local_irq_enable before memory_failure and
a local_irq_disable after it?  It wouldn't surprise me if you've
deadlocked somewhere.  Lockdep could also have something interesting
to say.

(Although I'm a bit confused.  A deadlock in memory_failure shouldn't
cause the particular failure mode you're seeing, since a new #MC
should still be deliverable.  Is it possible that we really need an
IRET to unmask NMIs?  This seems unlikely.)

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13  0:31         ` Luck, Tony
  2014-11-13  1:34           ` Andy Lutomirski
  2014-11-13  3:03           ` Andy Lutomirski
@ 2014-11-13 10:59           ` Borislav Petkov
  2014-11-13 21:23             ` Borislav Petkov
  2 siblings, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-13 10:59 UTC (permalink / raw)
  To: Luck, Tony, Andy Lutomirski
  Cc: Oleg Nesterov, X86 ML, linux-kernel, Peter Zijlstra, Andi Kleen, kvm ML

On Thu, Nov 13, 2014 at 12:31:30AM +0000, Luck, Tony wrote:
> > Is this something I can try under KVM?
> 
> I don't know if KVM has a way to simulate a machine check event.

I've been thinking about it recently too - adding MCA functionality to
qemu/kvm could be very useful, especially the thresholding stuff, for
testing RAS kernel code.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-12 16:22                         ` Borislav Petkov
  2014-11-12 17:17                           ` Luck, Tony
@ 2014-11-13 18:04                           ` Borislav Petkov
  2014-11-14 21:56                             ` Luck, Tony
  1 sibling, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-13 18:04 UTC (permalink / raw)
  To: Andy Lutomirski, Tony Luck
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Oleg Nesterov

On Wed, Nov 12, 2014 at 05:22:25PM +0100, Borislav Petkov wrote:
> > Less intrusive is certainly true.
> 
> Right, I can do it in the meantime and we can always experiment more
> later. Getting rid of _TIF_MCE_NOTIFY is a good thing already.

Yep, it looks pretty simple - not tested yet, it builds though.

---
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 276392f121fb..d74c85def853 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -186,7 +186,6 @@ enum mcp_flags {
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
-void mce_notify_process(void);
 
 DECLARE_PER_CPU(struct mce, injectm);
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 854053889d4d..9a121e3cdf1e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -75,7 +75,7 @@ struct thread_info {
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
-#define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+/* unused, was #define TIF_MCE_NOTIFY		10	* notify userspace of an MCE */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_UPROBE		12	/* breakpointed or singlestepping */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
@@ -100,7 +100,6 @@ struct thread_info {
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
-#define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
@@ -140,8 +139,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
-	 _TIF_USER_RETURN_NOTIFY)
+	(_TIF_SIGPENDING |  _TIF_NOTIFY_RESUME | _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 61a9668cebfd..0e82c2cc6b0c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -14,6 +14,7 @@
 #include <linux/capability.h>
 #include <linux/miscdevice.h>
 #include <linux/ratelimit.h>
+#include <linux/task_work.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
 #include <linux/kobject.h>
@@ -111,6 +112,8 @@ static DEFINE_PER_CPU(struct work_struct, mce_work);
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 
+static struct callback_head mce_task_work;
+
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
@@ -1157,7 +1160,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (worst == MCE_AR_SEVERITY) {
 			/* schedule action before return to userland */
 			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
-			set_thread_flag(TIF_MCE_NOTIFY);
+			task_work_add(current, &mce_task_work, true);
 		} else if (kill_it) {
 			force_sig(SIGBUS, current);
 		}
@@ -1185,14 +1188,13 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 #endif
 
 /*
- * Called in process context that interrupted by MCE and marked with
- * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * This code is allowed to sleep.
+ * Called in process context that interrupted by MCE just before returning to
+ * erroneous userland. This code is allowed to sleep.
  * Attempt possible recovery such as calling the high level VM handler to
  * process any corrupted pages, and kill/signal current process if required.
  * Action required errors are handled here.
  */
-void mce_notify_process(void)
+static void mce_notify_process(struct callback_head *unused)
 {
 	unsigned long pfn;
 	struct mce_info *mi = mce_find_info();
@@ -1202,8 +1204,6 @@ void mce_notify_process(void)
 		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
 	pfn = mi->paddr >> PAGE_SHIFT;
 
-	clear_thread_flag(TIF_MCE_NOTIFY);
-
 	pr_err("Uncorrected hardware memory error in user-access at %llx",
 		 mi->paddr);
 	/*
@@ -1704,6 +1704,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_timer();
 	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
 	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
+	init_task_work(&mce_task_work, mce_notify_process);
 }
 
 /*
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..2a33c8f68319 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
 	user_exit();
 
-#ifdef CONFIG_X86_MCE
-	/* notify userspace of pending MCEs */
-	if (thread_info_flags & _TIF_MCE_NOTIFY)
-		mce_notify_process();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
 	if (thread_info_flags & _TIF_UPROBE)
 		uprobe_notify_resume(regs);

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13  3:03           ` Andy Lutomirski
@ 2014-11-13 18:43             ` Luck, Tony
  2014-11-13 22:23               ` Andy Lutomirski
  2014-11-14 10:34             ` Borislav Petkov
  1 sibling, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-13 18:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1784 bytes --]

> printk seems to work just fine in do_machine_check.  Any chance you
> can instrument, for each cpu, all entries to do_machine_check, all
> calls to do_machine_check, all returns, and everything that tries to
> do memory_failure?

I first added a printk() just for the cpu that calls do_machine_check()

   printk("MCE: regs = %p\n", regs);

to see if something went wonky when jumping to the kernel stack.
But that printed the same value every time (same process is consuming
and recovering from errors).  Maybe this took longer to hit the problem
case - I ran to 1500ish errors instead of just 400 in the previous two tests.
But not sure if that is a significant change.

Then I added printk() for every entry/return on all cpus.  This just locked
up on the third injection.  Serial console looked to have stopped printing
after the first - so I put in bigger delays into my test program between injection
and consumption, and before looping around for the next cycle to give
time for all the messages (4-socket HSW-EX  ... there are a lot of cpus
printing messages). But now it is taking a lot longer to get through
injection/consumption iterations. At 226 now and counting.

> Also, shouldn't there be a local_irq_enable before memory_failure and
> a local_irq_disable after it?  It wouldn't surprise me if you've
> deadlocked somewhere.  Lockdep could also have something interesting
> to say.
Added enable/disable.

> should still be deliverable.  Is it possible that we really need an
> IRET to unmask NMIs?  This seems unlikely.)

If that were the problem, wouldn't we fail on iteration 2, instead of
400+?

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 10:59           ` Borislav Petkov
@ 2014-11-13 21:23             ` Borislav Petkov
  0 siblings, 0 replies; 63+ messages in thread
From: Borislav Petkov @ 2014-11-13 21:23 UTC (permalink / raw)
  To: Luck, Tony, Andy Lutomirski
  Cc: Oleg Nesterov, X86 ML, linux-kernel, Peter Zijlstra, Andi Kleen, kvm ML

On Thu, Nov 13, 2014 at 11:59:37AM +0100, Borislav Petkov wrote:
> I've been thinking about it recently too - adding MCA functionality to
> qemu/kvm could be very useful, especially the thresholding stuff, for
> testing RAS kernel code.

Btw, qemu monitor has a mce injection command with which I was able
to tickle some response from the guest kernel. I'll play more with it
tomorrow and try to tickle a response from the memory failure code.

[  195.328466] Disabling lock debugging due to kernel taint
[  195.328466] [Hardware Error]: System Fatal error.
[  195.328466] [Hardware Error]: CPU:1 (10:2:3) MC4_STATUS[Over|UE|MiscV|PCC|AddrV|UECC]: 0xfe002000001f012b
[  195.328466] [Hardware Error]: MC4_ADDR: 0x0000000000000000
[  195.328466] [Hardware Error]: MC4 Error (node 1): ECC Error in the Probe Filter directory.
[  195.328466] [Hardware Error]: cache level: L3/GEN, tx: GEN, mem-tx: WR
[  195.328466] mce: [Hardware Error]: CPU 1: Machine Check Exception: 3 Bank 4: fe002000001f012b
[  195.328466] mce: [Hardware Error]: RIP 10:<ffffffff8100e0b5> {default_idle+0x25/0x240}
[  195.328466] mce: [Hardware Error]: TSC b9e2f56f95 MISC d1d1dad1deadbeef 
[  195.328466] mce: [Hardware Error]: PROCESSOR 2:100f23 TIME 1415915466 SOCKET 1 APIC 1 microcode 1000065
[  195.328466] [Hardware Error]: System Fatal error.
[  195.328466] [Hardware Error]: CPU:1 (10:2:3) MC4_STATUS[Over|UE|MiscV|PCC|AddrV|UECC]: 0xfe002000001f012b
[  195.328466] [Hardware Error]: MC4_ADDR: 0x0000000000000000
[  195.328466] [Hardware Error]: MC4 Error (node 1): ECC Error in the Probe Filter directory.
[  195.328466] [Hardware Error]: cache level: L3/GEN, tx: GEN, mem-tx: WR
[  195.328466] mce: [Hardware Error]: Machine check: Invalid
[  195.328466] Kernel panic - not syncing: Fatal machine check on current CPU
[  195.328466] Kernel Offset: 0x0 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffff9fffffff)
[  195.328466] ---[ end Kernel panic - not syncing: Fatal machine check on current CPU

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 18:43             ` Luck, Tony
@ 2014-11-13 22:23               ` Andy Lutomirski
  2014-11-13 22:25                 ` Andy Lutomirski
  2014-11-13 22:33                 ` Luck, Tony
  0 siblings, 2 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-13 22:23 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Thu, Nov 13, 2014 at 10:43 AM, Luck, Tony <tony.luck@intel.com> wrote:
>> printk seems to work just fine in do_machine_check.  Any chance you
>> can instrument, for each cpu, all entries to do_machine_check, all
>> calls to do_machine_check, all returns, and everything that tries to
>> do memory_failure?
>
> I first added a printk() just for the cpu that calls do_machine_check()
>
>    printk("MCE: regs = %p\n", regs);
>
> to see if something went wonky when jumping to the kernel stack.
> But that printed the same value every time (same process is consuming
> and recovering from errors).  Maybe this took longer to hit the problem
> case - I ran to 1500ish errors instead of just 400 in the previous two tests.
> But not sure if that is a significant change.
>
> Then I added printk() for every entry/return on all cpus.  This just locked
> up on the third injection.  Serial console looked to have stopped printing
> after the first - so I put in bigger delays into my test program between injection
> and consumption, and before looping around for the next cycle to give
> time for all the messages (4-socket HSW-EX  ... there are a lot of cpus
> printing messages). But now it is taking a lot longer to get through
> injection/consumption iterations. At 226 now and counting.
>
>> Also, shouldn't there be a local_irq_enable before memory_failure and
>> a local_irq_disable after it?  It wouldn't surprise me if you've
>> deadlocked somewhere.  Lockdep could also have something interesting
>> to say.
> Added enable/disable.
>
>> should still be deliverable.  Is it possible that we really need an
>> IRET to unmask NMIs?  This seems unlikely.)
>
> If that were the problem, wouldn't we fail on iteration 2, instead of
> 400+ ?
>
> -Tony

There could be a timer interrupt or something.  But I agree, it seems
implausible.

Are you sure that this works in an unmodified kernel?  The timeout
code seems highly questionable to me.  For example, there's this:

        if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
            cfg->monarch_timeout < 0)
            cfg->monarch_timeout = USEC_PER_SEC;

which presumably determines monarch_timeout on your system and sets it
to 1000000.  But then there's this:

#define SPINUNIT 100    /* 100ns */

which smells like unit error to me.  On top of that, it seems likely
to me that the cpu could execute a loop iteration in much less than
100ns, since the only thing that should be anything other than an L1
hit or a correctly predicted branch is the rmb(), which is lfence,
which is probably just a few ns.  So you have 10k iterations at, say,
10ns each, allowing about 100µs to synchronize, and if an SMI hits at
an inopportune time, boom.

Also, rmb, seriously?  I would understand smp_rmb() or cpu_relax() or
even barrier(), but rmb() seems completely bogus if harmless.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 22:23               ` Andy Lutomirski
@ 2014-11-13 22:25                 ` Andy Lutomirski
  2014-11-13 22:33                 ` Luck, Tony
  1 sibling, 0 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-13 22:25 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Thu, Nov 13, 2014 at 2:23 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Nov 13, 2014 at 10:43 AM, Luck, Tony <tony.luck@intel.com> wrote:
>>> printk seems to work just fine in do_machine_check.  Any chance you
>>> can instrument, for each cpu, all entries to do_machine_check, all
>>> calls to do_machine_check, all returns, and everything that tries to
>>> do memory_failure?
>>
>> I first added a printk() just for the cpu that calls do_machine_check()
>>
>>    printk("MCE: regs = %p\n", regs);
>>
>> to see if something went wonky when jumping to the kernel stack.
>> But that printed the same value every time (same process is consuming
>> and recovering from errors).  Maybe this took longer to hit the problem
>> case - I ran to 1500ish errors instead of just 400 in the previous two tests.
>> But not sure if that is a significant change.
>>
>> Then I added printk() for every entry/return on all cpus.  This just locked
>> up on the third injection.  Serial console looked to have stopped printing
>> after the first - so I put in bigger delays into my test program between injection
>> and consumption, and before looping around for the next cycle to give
>> time for all the messages (4-socket HSW-EX  ... there are a lot of cpus
>> printing messages). But now it is taking a lot longer to get through
>> injection/consumption iterations. At 226 now and counting.
>>
>>> Also, shouldn't there be a local_irq_enable before memory_failure and
>>> a local_irq_disable after it?  It wouldn't surprise me if you've
>>> deadlocked somewhere.  Lockdep could also have something interesting
>>> to say.
>> Added enable/disable.
>>
>>> should still be deliverable.  Is it possible that we really need an
>>> IRET to unmask NMIs?  This seems unlikely.)
>>
>> If that were the problem, wouldn't we fail on iteration 2, instead of
>> 400+ ?
>>
>> -Tony
>
> There could be a timer interrupt or something.  But I agree, it seems
> implausible.
>
> Are you sure that this works in an unmodified kernel?  The timeout
> code seems highly questionable to me.  For example, there's this:
>
>         if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
>             cfg->monarch_timeout < 0)
>             cfg->monarch_timeout = USEC_PER_SEC;
>
> which presumably determines monarch_timeout on your system and sets it
> to 1000000.  But then there's this:
>
> #define SPINUNIT 100    /* 100ns */
>
> which smells like unit error to me.  On top of that, it seems likely
> to me that the cpu could execute a loop iteration in much less than
> 100ns, since the only thing that should be anything other than an L1
> hit or a correctly predicted branch is the rmb(), which is lfence,
> which is probably just a few ns.  So you have 10k iterations at, say,
> 10ns each, allowing about 100µs to synchronize, and if an SMI hits at
> an inopportune time, boom.

This theory is consistent with the very quick failure with an extra
printk on entry.

--Andy

>
> Also, rmb, seriously?  I would understand smp_rmb() or cpu_relax() or
> even barrier(), but rmb() seems completely bogus if harmless.
>
> --Andy
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 22:23               ` Andy Lutomirski
  2014-11-13 22:25                 ` Andy Lutomirski
@ 2014-11-13 22:33                 ` Luck, Tony
  2014-11-13 22:47                   ` Andy Lutomirski
  1 sibling, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-13 22:33 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

[-- Attachment #1: Type: text/plain, Size: 414 bytes --]

> Are you sure that this works in an unmodified kernel

Unmodified kernel has run tens of thousands of injection/consumption/recovery cycles.

I did get a crash with the entry/exit traces you asked for.  Last 20000 lines of console log
attached.  There are a couple of OOPs before things fall apart completely.  I haven't yet
counted all the entry/exits from the last cycle to see if they match.

-Tony


[-- Attachment #2: console.log.bz2 --]
[-- Type: application/octet-stream, Size: 80843 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 22:33                 ` Luck, Tony
@ 2014-11-13 22:47                   ` Andy Lutomirski
  2014-11-13 23:13                     ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-13 22:47 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Thu, Nov 13, 2014 at 2:33 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> Are you sure that this works in an unmodified kernel
>
> Unmodified kernel has run tens of thousands of injection/consumption/recovery cycles.
>
> I did get a crash with the entry/exit traces you asked for.  Last 20000 lines of console log
> attached.  There are a couple of OOPs before things fall apart completely.  I haven't yet
> counted all the entry/exits from the last cycle to see if they match.
>

That log was a good hint, and I am a fool.  I'll send a v3 once I test it.

I'm still unconvinced by the timeout code, though...

---Andy

> -Tony
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 22:47                   ` Andy Lutomirski
@ 2014-11-13 23:13                     ` Andy Lutomirski
  2014-11-14  0:50                       ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-13 23:13 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Thu, Nov 13, 2014 at 2:47 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Nov 13, 2014 at 2:33 PM, Luck, Tony <tony.luck@intel.com> wrote:
>>> Are you sure that this works in an unmodified kernel
>>
>> Unmodified kernel has run tens of thousands of injection/consumption/recovery cycles.
>>
>> I did get a crash with the entry/exit traces you asked for.  Last 20000 lines of console log
>> attached.  There are a couple of OOPs before things fall apart completely.  I haven't yet
>> counted all the entry/exits from the last cycle to see if they match.
>>
>
> That log was a good hint, and I am a fool.  I'll send a v3 once I test it.

...or not.  I confused myself there.  I thought I had a bug, but I was wrong.

I'm stress-testing sleeping in an int3 handler that entered from user
space, and I'm not seeing any problems, even with perf firing lots of
NMIs.  I'm also passing the kprobes smoke test with my patch applied,
and the stack switching code is correctly not switching stacks.

Any chance you could try to trigger this this again with regs->sp,
regs->ip, and regs->cs added to the cpu=%d regs=... message?  I feel
like I'm missing something weird here.

--Andy

>
> I'm still unconvinced by the timeout code, though...
>
> ---Andy
>
>> -Tony
>>
>
>
>
> --
> Andy Lutomirski
> AMA Capital Management, LLC



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 23:13                     ` Andy Lutomirski
@ 2014-11-14  0:50                       ` Andy Lutomirski
  2014-11-14  1:20                         ` Luck, Tony
                                           ` (2 more replies)
  0 siblings, 3 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-14  0:50 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Thu, Nov 13, 2014 at 3:13 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Nov 13, 2014 at 2:47 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Thu, Nov 13, 2014 at 2:33 PM, Luck, Tony <tony.luck@intel.com> wrote:
>>>> Are you sure that this works in an unmodified kernel
>>>
>>> Unmodified kernel has run tens of thousands of injection/consumption/recovery cycles.
>>>
>>> I did get a crash with the entry/exit traces you asked for.  Last 20000 lines of console log
>>> attached.  There are a couple of OOPs before things fall apart completely.  I haven't yet
>>> counted all the entry/exits from the last cycle to see if they match.
>>>
>>
>> That log was a good hint, and I am a fool.  I'll send a v3 once I test it.
>
> ...or not.  I confused myself there.  I thought I had a bug, but I was wrong.
>
> I'm stress-testing sleeping in an int3 handler that entered from user
> space, and I'm not seeing any problems, even with perf firing lots of
> NMIs.  I'm also passing the kprobes smoke test with my patch applied,
> and the stack switching code is correctly not switching stacks.
>
> Any chance you could try to trigger this again with regs->sp,
> regs->ip, and regs->cs added to the cpu=%d regs=... message?  I feel
> like I'm missing something weird here.

Can you also try rebasing onto what will probably be v3?

https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/tag/?id=paranoid-stack-v2.9

It adds debugging for inappropriate reschedules from the wrong stack.
Setting CONFIG_DEBUG_ATOMIC_SLEEP might also be a good idea.

It seems plausible to me that the failure mode is worst ==
MCE_AR_SEVERITY but regs->cs == 0 (i.e. in kernel).  This could blow
up in one of two ways:

1. current is the idle thread.  This causes the warning you saw.

2. current is a real user process, but that MCE was nested inside
another exception somehow or otherwise didn't switch stacks.  Now
we're on an IST stack and we schedule.  So far so good, but the next
thing that tries to use that IST stack causes lots of corruption.

It looks like if bug 1 is happening, then you might never notice it
without mce-stack.patch applied -- you'll set TIF_MCE_NOTIFY on the
idle thread, but the idle thread never returns to userspace, so the
mce notifier never has a chance to crash.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14  0:50                       ` Andy Lutomirski
@ 2014-11-14  1:20                         ` Luck, Tony
  2014-11-14  1:36                           ` Andy Lutomirski
  2014-11-14 17:49                         ` Luck, Tony
  2014-11-14 18:27                         ` Luck, Tony
  2 siblings, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-14  1:20 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

"worst ==
MCE_AR_SEVERITY but regs->cs == 0 (i.e. in kernel)"

This can't happen. We can only declare AR severity for a user mode fault.

Sent from my iPhone

> On Nov 13, 2014, at 16:50, Andy Lutomirski <luto@amacapital.net> wrote:
> 
> worst ==
> MCE_AR_SEVERITY but regs->cs == 0 (i.e. in kernel)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14  1:20                         ` Luck, Tony
@ 2014-11-14  1:36                           ` Andy Lutomirski
  0 siblings, 0 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-14  1:36 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Thu, Nov 13, 2014 at 5:20 PM, Luck, Tony <tony.luck@intel.com> wrote:
> "worst ==
> MCE_AR_SEVERITY but regs->cs == 0 (i.e. in kernel)"
>
> This can't happen. We can only declare AR severity for a user mode fault.

I believe you, and I see that in the code, but the code is mightily twisted.

Anyway, my v3 will also catch a failure in which my asm fails to
switch to the right stack for some other reason.

Without the scheduling in the idle thread warning, I'd believe that
the problem is that some cpu is going out to lunch or failing to
receive the #MC, which would make me want to consider IRET issues (w/
my patch, there might not be an IRET until after memory_failure, and
maybe it sometimes takes a really long time).  But I don't see why
that would cause a scheduling in the idle thread warning.

>
> Sent from my iPhone
>
>> On Nov 13, 2014, at 16:50, Andy Lutomirski <luto@amacapital.net> wrote:
>>
>> worst ==
>> MCE_AR_SEVERITY but regs->cs == 0 (i.e. in kernel)



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13  3:03           ` Andy Lutomirski
  2014-11-13 18:43             ` Luck, Tony
@ 2014-11-14 10:34             ` Borislav Petkov
  2014-11-14 17:18               ` Andy Lutomirski
  1 sibling, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-14 10:34 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Luck, Tony, Oleg Nesterov, X86 ML, linux-kernel, Peter Zijlstra,
	Andi Kleen

On Wed, Nov 12, 2014 at 07:03:21PM -0800, Andy Lutomirski wrote:
> printk seems to work just fine in do_machine_check.

That must be pure luck. Has anything changed which I missed to make
printk NMI-safe?

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 10:34             ` Borislav Petkov
@ 2014-11-14 17:18               ` Andy Lutomirski
  2014-11-14 17:24                 ` Borislav Petkov
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-14 17:18 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Luck, Tony,
	Oleg Nesterov

On Nov 14, 2014 2:34 AM, "Borislav Petkov" <bp@alien8.de> wrote:
>
> On Wed, Nov 12, 2014 at 07:03:21PM -0800, Andy Lutomirski wrote:
> > printk seems to work just fine in do_machine_check.
>
> That must be pure luck. Has anything changed which I missed to make
> printk NMI-safe?

Heh.  Probably not.  Now I wonder whether that might have caused the
scheduling in idle cpu thing.

Grr.  Do you or Tony have any pointers for how to test this myself?  I
don't know enough about the acpi error injection thing, which I assume
is what Tony is using.

--Andy

>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 17:18               ` Andy Lutomirski
@ 2014-11-14 17:24                 ` Borislav Petkov
  2014-11-14 17:26                   ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-14 17:24 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Luck, Tony,
	Oleg Nesterov

On Fri, Nov 14, 2014 at 09:18:51AM -0800, Andy Lutomirski wrote:
> Grr. Do you or Tony have any pointers for how to test this myself? I
> don't know enough about the acpi error injection thing, which I assume
> is what Tony is using.

Maybe that would help:

Documentation/acpi/apei/einj.txt

provided your box's BIOS even has EINJ support ...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 17:24                 ` Borislav Petkov
@ 2014-11-14 17:26                   ` Andy Lutomirski
  2014-11-14 18:53                     ` Borislav Petkov
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-14 17:26 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Luck, Tony,
	Oleg Nesterov

On Fri, Nov 14, 2014 at 9:24 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Fri, Nov 14, 2014 at 09:18:51AM -0800, Andy Lutomirski wrote:
>> Grr. Do you or Tony have any pointers for how to test this myself? I
>> don't know enough about the acpi error injection thing, which I assume
>> is what Tony is using.
>
> Maybe that would help:
>
> Documentation/acpi/apei/einj.txt
>
> provided your box's BIOS even has EINJ support ...
>

I was hoping for an actual worked-out example of what the parameters
should be :)  I'll see if my weird server-ish test machine can do
this.

--Andy

> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14  0:50                       ` Andy Lutomirski
  2014-11-14  1:20                         ` Luck, Tony
@ 2014-11-14 17:49                         ` Luck, Tony
  2014-11-14 19:10                           ` Andy Lutomirski
  2014-11-14 18:27                         ` Luck, Tony
  2 siblings, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-14 17:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 11106 bytes --]

> Can you also try rebasing onto what will probably be v3?
>
> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/tag/?id=paranoid-stack-v2.9

Built that - with none of my other changes ... i.e. still use TIF_NOTIFY_MCE etc. No printk()
in the MCE context.

System ran 736 injection/consumption/recovery cycles and then got an RCU
stall - followed by a zillion soft lockups.

[  203.326117] mce: Uncorrected hardware memory error in user-access at 100f07f800
[  203.326193] MCE 0x100f07f: Killing harderrors:12052 due to hardware memory corruption
[  203.326195] MCE 0x100f07f: dirty LRU page recovery: Recovered
[  204.721893] mce: Uncorrected hardware memory error in user-access at 100f7073c0
[  204.721906] INFO: rcu_sched self-detected stall on CPU { 91}  (t=60002 jiffies g=5125 c=5124 q=0)
[  204.721908] Task dump for CPU 91:
[  204.721911] kworker/91:1    R  running task        0  1033      2 0x00000008
[  204.721925] Workqueue: events_power_efficient fb_flashcursor
[  204.721929]  ffff880c6767def0 00000000c74bfa96 ffff880c6fa63d68 ffffffff81099d68
[  204.721930]  000000000000005b ffffffff819d1140 ffff880c6fa63d88 ffffffff8109d38d
[  204.721932]  0000000000000087 000000000000000c ffff880c6fa63db8 ffffffff810caed0
[  204.721933] Call Trace:
[  204.721946]  <IRQ>  [<ffffffff81099d68>] sched_show_task+0xa8/0x110
[  204.721951]  [<ffffffff8109d38d>] dump_cpu_task+0x3d/0x50
[  204.721961]  [<ffffffff810caed0>] rcu_dump_cpu_stacks+0x90/0xd0
[  204.721967]  [<ffffffff810cec17>] rcu_check_callbacks+0x497/0x710
[  204.721974]  [<ffffffff810d3b7b>] update_process_times+0x4b/0x80
[  204.721986]  [<ffffffff810e37c5>] tick_sched_handle.isra.19+0x25/0x60
[  204.721989]  [<ffffffff810e3845>] tick_sched_timer+0x45/0x80
[  204.721992]  [<ffffffff810d4887>] __run_hrtimer+0x77/0x1d0
[  204.721995]  [<ffffffff810e3800>] ? tick_sched_handle.isra.19+0x60/0x60
[  204.721997]  [<ffffffff810d4c77>] hrtimer_interrupt+0xf7/0x240
[  204.722008]  [<ffffffff810455ab>] local_apic_timer_interrupt+0x3b/0x70
[  204.722018]  [<ffffffff8165f8d5>] smp_apic_timer_interrupt+0x45/0x60
[  204.722020]  [<ffffffff8165d91d>] apic_timer_interrupt+0x6d/0x80
[  204.722034]  <EOI>  [<ffffffff810c1a38>] ? console_unlock+0x418/0x460
[  204.722037]  [<ffffffff8135600d>] fb_flashcursor+0x5d/0x140
[  204.722040]  [<ffffffff8135b8e0>] ? bit_clear+0x120/0x120
[  204.722049]  [<ffffffff81086b5e>] process_one_work+0x14e/0x3f0
[  204.722051]  [<ffffffff8108726b>] worker_thread+0x11b/0x510
[  204.722053]  [<ffffffff81087150>] ? rescuer_thread+0x350/0x350
[  204.722057]  [<ffffffff8108c9f1>] kthread+0xe1/0x100
[  204.722059]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
[  204.722074]  [<ffffffff8165c97c>] ret_from_fork+0x7c/0xb0
[  204.722076]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
[  227.462386] NMI watchdog: BUG: soft lockup - CPU#18 stuck for 22s! [migration/18:134]
[  227.462452] Modules linked in: einj ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack cfg80211 rfkill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_mangle iptable_security iptable_raw sg iptable_filter ip_tables vfat fat iTCO_wdt iTCO_vendor_support x86_pkg_temp_thermal coretemp kvm ixgbe crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ptp lrw gf128mul pps_core glue_helper mdio dca ablk_helper sb_edac cryptd edac_core lpc_ich pcspkr shpchp i2c_i801 mfd_core ipmi_si wmi ipmi_msghandler acpi_pad xfs libcrc32c sd_mod mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper sr_mod cdrom ttm drm ahci libahci mpt2sas libata raid_class i2c_core scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod
[  227.462470] CPU: 18 PID: 134 Comm: migration/18 Tainted: G   M    W      3.18.0-rc3 #1
[  227.462472] Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0058.D01.1410201505 10/20/2014
[  227.462474] task: ffff880c68605ef0 ti: ffff880c67d9c000 task.ti: ffff880c67d9c000
[  227.462484] RIP: 0010:[<ffffffff81105570>]  [<ffffffff81105570>] multi_cpu_stop+0x70/0xf0
[  227.462485] RSP: 0018:ffff880c67d9fd68  EFLAGS: 00000293
[  227.462487] RAX: 0000000000000000 RBX: ffff880c6f814840 RCX: ffffffffffffffff
[  227.462488] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffff81ab3320
[  227.462489] RBP: ffff880c67d9fd88 R08: ffffffff81ab3328 R09: ffff881467e58d90
[  227.462490] R10: ffffffff81ab3320 R11: 0000000000000001 R12: 0000000000000000
[  227.462492] R13: ffff880c677c7800 R14: ffff880c67000800 R15: ffff880c00000000
[  227.462494] FS:  0000000000000000(0000) GS:ffff880c6f800000(0000) knlGS:0000000000000000
[  227.462495] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  227.462496] CR2: 00007f2147fcce90 CR3: 0000000001978000 CR4: 00000000001407e0
[  227.462498] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  227.462500] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  227.462500] Stack:
[  227.462503]  ffff880c65a8fd20 ffff880c6f80f0a0 ffff880c65a8fdb8 ffff880c6f80f0a8
[  227.462505]  ffff880c67d9fe58 ffffffff81105778 ffffffff81095387 0000000000000010
[  227.462507]  0000000000000282 ffff880c67d9fdc8 0000000000000018 0000000000000000
[  227.462508] Call Trace:
[  227.462512]  [<ffffffff81105778>] cpu_stopper_thread+0x78/0x150
[  227.462516]  [<ffffffff81095387>] ? finish_task_switch+0x57/0x180
[  227.462522]  [<ffffffff81657f67>] ? __schedule+0x2f7/0x7e0
[  227.462531]  [<ffffffff8109096f>] smpboot_thread_fn+0xff/0x1b0
[  227.462534]  [<ffffffff81090870>] ? SyS_setgroups+0x1a0/0x1a0
[  227.462537]  [<ffffffff8108c9f1>] kthread+0xe1/0x100
[  227.462539]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
[  227.462544]  [<ffffffff8165c97c>] ret_from_fork+0x7c/0xb0
[  227.462547]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
[  227.462572] Code: 23 66 2e 0f 1f 84 00 00 00 00 00 83 fb 03 75 05 45 84 ed 75 66 f0 41 ff 4c 24 24 74 26 89 da 83 fa 04 74 3d f3 90 41 8b 5c 24 20 <39> d3 74 f0 83 fb 02 75 d7 fa 66 0f 1f 44 00 00 eb d8 66 0f 1f 
[  227.478401] NMI watchdog: BUG: soft lockup - CPU#19 stuck for 22s! [migration/19:142]
[  227.478437] Modules linked in: einj ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack cfg80211 rfkill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_mangle iptable_security iptable_raw sg iptable_filter ip_tables vfat fat iTCO_wdt iTCO_vendor_support x86_pkg_temp_thermal coretemp kvm ixgbe crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ptp lrw gf128mul pps_core glue_helper mdio dca ablk_helper sb_edac cryptd edac_core lpc_ich pcspkr shpchp i2c_i801 mfd_core ipmi_si wmi ipmi_msghandler acpi_pad xfs libcrc32c sd_mod mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper sr_mod cdrom ttm drm ahci libahci mpt2sas libata raid_class i2c_core scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod
[  227.478448] CPU: 19 PID: 142 Comm: migration/19 Tainted: G   M    W    L 3.18.0-rc3 #1
[  227.478449] Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0058.D01.1410201505 10/20/2014
[  227.478451] task: ffff880c67dc1b20 ti: ffff880c67dd0000 task.ti: ffff880c67dd0000
[  227.478456] RIP: 0010:[<ffffffff81105570>]  [<ffffffff81105570>] multi_cpu_stop+0x70/0xf0
[  227.478457] RSP: 0018:ffff880c67dd3d68  EFLAGS: 00000293
[  227.478459] RAX: 0000000000000000 RBX: ffff880c6f834840 RCX: ffffffffffffffff
[  227.478460] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffff81ab3320
[  227.478461] RBP: ffff880c67dd3d88 R08: ffffffff81ab3328 R09: ffff881467e59b20
[  227.478462] R10: 0000000000000004 R11: 0000000000000005 R12: 0000000000000000
[  227.478463] R13: ffff880c677c6000 R14: ffff880c67002800 R15: ffff880c00000000
[  227.478464] FS:  0000000000000000(0000) GS:ffff880c6f820000(0000) knlGS:0000000000000000
[  227.478466] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  227.478467] CR2: 00007f09b6e2eef0 CR3: 0000000001978000 CR4: 00000000001407e0
[  227.478468] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  227.478469] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  227.478470] Stack:
[  227.478472]  ffff880c65a8fd20 ffff880c6f82f0a0 ffff880c65a8fdb8 ffff880c6f82f0a8
[  227.478474]  ffff880c67dd3e58 ffffffff81105778 ffffffff81095387 0000000000000010
[  227.478476]  0000000000000216 ffff880c67dd3dc8 0000000000000018 0000000000000000
[  227.478477] Call Trace:
[  227.478480]  [<ffffffff81105778>] cpu_stopper_thread+0x78/0x150
[  227.478483]  [<ffffffff81095387>] ? finish_task_switch+0x57/0x180
[  227.478486]  [<ffffffff81657f67>] ? __schedule+0x2f7/0x7e0
[  227.478491]  [<ffffffff8109096f>] smpboot_thread_fn+0xff/0x1b0
[  227.478494]  [<ffffffff81090870>] ? SyS_setgroups+0x1a0/0x1a0
[  227.478496]  [<ffffffff8108c9f1>] kthread+0xe1/0x100
[  227.478498]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
[  227.478502]  [<ffffffff8165c97c>] ret_from_fork+0x7c/0xb0
[  227.478504]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
[  227.478526] Code: 23 66 2e 0f 1f 84 00 00 00 00 00 83 fb 03 75 05 45 84 ed 75 66 f0 41 ff 4c 24 24 74 26 89 da 83 fa 04 74 3d f3 90 41 8b 5c 24 20 <39> d3 74 f0 83 fb 02 75 d7 fa 66 0f 1f 44 00 00 eb d8 66 0f 1f 
[  227.493414] NMI watchdog: BUG: soft lockup - CPU#20 stuck for 22s! [migration/20:149]
[  227.493448] Modules linked in: einj ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack cfg80211 rfkill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_mangle iptable_security iptable_raw sg iptable_filter ip_tables vfat fat iTCO_wdt iTCO_vendor_support x86_pkg_temp_thermal coretemp kvm ixgbe crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ptp lrw gf128mul pps_core glue_helper mdio dca ablk_helper sb_edac cryptd edac_core lpc_ich pcspkr shpchp i2c_i801 mfd_core ipmi_si wmi ipmi_msghandler acpi_pad xfs libcrc32c sd_mod mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper sr_mod cdrom ttm drm ahci libahci mpt2sas libata raid_class i2c_core scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod
[  227.493460] CPU: 20 PID: 149 Comm: migration/20 Tainted: G   M    W    L 3.18.0-rc3 #1

> It adds debugging for inappropriate reschedules from the wrong stack.
> Setting CONFIG_DEBUG_ATOMIC_SLEEP might also be a good idea.

Will add that for next build/test

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14  0:50                       ` Andy Lutomirski
  2014-11-14  1:20                         ` Luck, Tony
  2014-11-14 17:49                         ` Luck, Tony
@ 2014-11-14 18:27                         ` Luck, Tony
  2 siblings, 0 replies; 63+ messages in thread
From: Luck, Tony @ 2014-11-14 18:27 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 445 bytes --]

>> It adds debugging for inappropriate reschedules from the wrong stack.
>> Setting CONFIG_DEBUG_ATOMIC_SLEEP might also be a good idea.
>
> Will add that for next build/test

Didn't see anything new.  System died at 1108 recoveries with the
"Timeout synchronization ..." panic

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 17:26                   ` Andy Lutomirski
@ 2014-11-14 18:53                     ` Borislav Petkov
  0 siblings, 0 replies; 63+ messages in thread
From: Borislav Petkov @ 2014-11-14 18:53 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Luck, Tony,
	Oleg Nesterov

On Fri, Nov 14, 2014 at 09:26:26AM -0800, Andy Lutomirski wrote:
> I was hoping for an actual worked-out example of what the parameters
> should be :)

Sorry, I haven't played with this myself either - haven't had a box with
EINJ yet. Maybe Tony has something.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 17:49                         ` Luck, Tony
@ 2014-11-14 19:10                           ` Andy Lutomirski
  2014-11-14 19:37                             ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-14 19:10 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

On Fri, Nov 14, 2014 at 9:49 AM, Luck, Tony <tony.luck@intel.com> wrote:
>> Can you also try rebasing onto what will probably be v3?
>>
>> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/tag/?id=paranoid-stack-v2.9
>
> Built that - with none of my other changes ... i.e. still use TIF_NOTIFY_MCE etc. No printk()
> in the MCE context.
>
> System ran 736 injection/consumption/recovery cycles and then got an RCU
> stall - followed by a zillion soft lockups.

Grr.  I must be missing something rather basic here.

So far, the only thing I've come up with is that do_machine_check
seems to be missing exception_enter or the equivalent.  Do you have
CONFIG_CONTEXT_TRACKING on and/or full nohz enabled?  I don't think
that this explains my bug, though.

These traces seem to suggest that you're in stop_machine.  I wonder
how you got there.  This could also be an artifact of whatever
corruption is screwing everything up in the first place, though.

I'm still not seeing the bug.  I'll see if I can get EINJ working this
afternoon.

--Andy


>
> [  203.326117] mce: Uncorrected hardware memory error in user-access at 100f07f800
> [  203.326193] MCE 0x100f07f: Killing harderrors:12052 due to hardware memory corruption
> [  203.326195] MCE 0x100f07f: dirty LRU page recovery: Recovered
> [  204.721893] mce: Uncorrected hardware memory error in user-access at 100f7073c0
> [  204.721906] INFO: rcu_sched self-detected stall on CPU { 91}  (t=60002 jiffies g=5125 c=5124 q=0)
> [  204.721908] Task dump for CPU 91:
> [  204.721911] kworker/91:1    R  running task        0  1033      2 0x00000008
> [  204.721925] Workqueue: events_power_efficient fb_flashcursor
> [  204.721929]  ffff880c6767def0 00000000c74bfa96 ffff880c6fa63d68 ffffffff81099d68
> [  204.721930]  000000000000005b ffffffff819d1140 ffff880c6fa63d88 ffffffff8109d38d
> [  204.721932]  0000000000000087 000000000000000c ffff880c6fa63db8 ffffffff810caed0
> [  204.721933] Call Trace:
> [  204.721946]  <IRQ>  [<ffffffff81099d68>] sched_show_task+0xa8/0x110
> [  204.721951]  [<ffffffff8109d38d>] dump_cpu_task+0x3d/0x50
> [  204.721961]  [<ffffffff810caed0>] rcu_dump_cpu_stacks+0x90/0xd0
> [  204.721967]  [<ffffffff810cec17>] rcu_check_callbacks+0x497/0x710
> [  204.721974]  [<ffffffff810d3b7b>] update_process_times+0x4b/0x80
> [  204.721986]  [<ffffffff810e37c5>] tick_sched_handle.isra.19+0x25/0x60
> [  204.721989]  [<ffffffff810e3845>] tick_sched_timer+0x45/0x80
> [  204.721992]  [<ffffffff810d4887>] __run_hrtimer+0x77/0x1d0
> [  204.721995]  [<ffffffff810e3800>] ? tick_sched_handle.isra.19+0x60/0x60
> [  204.721997]  [<ffffffff810d4c77>] hrtimer_interrupt+0xf7/0x240
> [  204.722008]  [<ffffffff810455ab>] local_apic_timer_interrupt+0x3b/0x70
> [  204.722018]  [<ffffffff8165f8d5>] smp_apic_timer_interrupt+0x45/0x60
> [  204.722020]  [<ffffffff8165d91d>] apic_timer_interrupt+0x6d/0x80
> [  204.722034]  <EOI>  [<ffffffff810c1a38>] ? console_unlock+0x418/0x460
> [  204.722037]  [<ffffffff8135600d>] fb_flashcursor+0x5d/0x140
> [  204.722040]  [<ffffffff8135b8e0>] ? bit_clear+0x120/0x120
> [  204.722049]  [<ffffffff81086b5e>] process_one_work+0x14e/0x3f0
> [  204.722051]  [<ffffffff8108726b>] worker_thread+0x11b/0x510
> [  204.722053]  [<ffffffff81087150>] ? rescuer_thread+0x350/0x350
> [  204.722057]  [<ffffffff8108c9f1>] kthread+0xe1/0x100
> [  204.722059]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
> [  204.722074]  [<ffffffff8165c97c>] ret_from_fork+0x7c/0xb0
> [  204.722076]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
> [  227.462386] NMI watchdog: BUG: soft lockup - CPU#18 stuck for 22s! [migration/18:134]
> [  227.462452] Modules linked in: einj ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack cfg80211 rfkill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_mangle iptable_security iptable_raw sg iptable_filter ip_tables vfat fat iTCO_wdt iTCO_vendor_support x86_pkg_temp_thermal coretemp kvm ixgbe crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ptp lrw gf128mul pps_core glue_helper mdio dca ablk_helper sb_edac cryptd edac_core lpc_ich pcspkr shpchp i2c_i801 mfd_core ipmi_si wmi ipmi_msghandler acpi_pad xfs libcrc32c sd_mod mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper sr_mod cdrom ttm drm ahci libahci mpt2sas libata raid_class i2c_core scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod
> [  227.462470] CPU: 18 PID: 134 Comm: migration/18 Tainted: G   M    W      3.18.0-rc3 #1
> [  227.462472] Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0058.D01.1410201505 10/20/2014
> [  227.462474] task: ffff880c68605ef0 ti: ffff880c67d9c000 task.ti: ffff880c67d9c000
> [  227.462484] RIP: 0010:[<ffffffff81105570>]  [<ffffffff81105570>] multi_cpu_stop+0x70/0xf0
> [  227.462485] RSP: 0018:ffff880c67d9fd68  EFLAGS: 00000293
> [  227.462487] RAX: 0000000000000000 RBX: ffff880c6f814840 RCX: ffffffffffffffff
> [  227.462488] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffff81ab3320
> [  227.462489] RBP: ffff880c67d9fd88 R08: ffffffff81ab3328 R09: ffff881467e58d90
> [  227.462490] R10: ffffffff81ab3320 R11: 0000000000000001 R12: 0000000000000000
> [  227.462492] R13: ffff880c677c7800 R14: ffff880c67000800 R15: ffff880c00000000
> [  227.462494] FS:  0000000000000000(0000) GS:ffff880c6f800000(0000) knlGS:0000000000000000
> [  227.462495] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  227.462496] CR2: 00007f2147fcce90 CR3: 0000000001978000 CR4: 00000000001407e0
> [  227.462498] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  227.462500] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  227.462500] Stack:
> [  227.462503]  ffff880c65a8fd20 ffff880c6f80f0a0 ffff880c65a8fdb8 ffff880c6f80f0a8
> [  227.462505]  ffff880c67d9fe58 ffffffff81105778 ffffffff81095387 0000000000000010
> [  227.462507]  0000000000000282 ffff880c67d9fdc8 0000000000000018 0000000000000000
> [  227.462508] Call Trace:
> [  227.462512]  [<ffffffff81105778>] cpu_stopper_thread+0x78/0x150
> [  227.462516]  [<ffffffff81095387>] ? finish_task_switch+0x57/0x180
> [  227.462522]  [<ffffffff81657f67>] ? __schedule+0x2f7/0x7e0
> [  227.462531]  [<ffffffff8109096f>] smpboot_thread_fn+0xff/0x1b0
> [  227.462534]  [<ffffffff81090870>] ? SyS_setgroups+0x1a0/0x1a0
> [  227.462537]  [<ffffffff8108c9f1>] kthread+0xe1/0x100
> [  227.462539]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
> [  227.462544]  [<ffffffff8165c97c>] ret_from_fork+0x7c/0xb0
> [  227.462547]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
> [  227.462572] Code: 23 66 2e 0f 1f 84 00 00 00 00 00 83 fb 03 75 05 45 84 ed 75 66 f0 41 ff 4c 24 24 74 26 89 da 83 fa 04 74 3d f3 90 41 8b 5c 24 20 <39> d3 74 f0 83 fb 02 75 d7 fa 66 0f 1f 44 00 00 eb d8 66 0f 1f
> [  227.478401] NMI watchdog: BUG: soft lockup - CPU#19 stuck for 22s! [migration/19:142]
> [  227.478437] Modules linked in: einj ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack cfg80211 rfkill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_mangle iptable_security iptable_raw sg iptable_filter ip_tables vfat fat iTCO_wdt iTCO_vendor_support x86_pkg_temp_thermal coretemp kvm ixgbe crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ptp lrw gf128mul pps_core glue_helper mdio dca ablk_helper sb_edac cryptd edac_core lpc_ich pcspkr shpchp i2c_i801 mfd_core ipmi_si wmi ipmi_msghandler acpi_pad xfs libcrc32c sd_mod mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper sr_mod cdrom ttm drm ahci libahci mpt2sas libata raid_class i2c_core scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod
> [  227.478448] CPU: 19 PID: 142 Comm: migration/19 Tainted: G   M    W    L 3.18.0-rc3 #1
> [  227.478449] Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0058.D01.1410201505 10/20/2014
> [  227.478451] task: ffff880c67dc1b20 ti: ffff880c67dd0000 task.ti: ffff880c67dd0000
> [  227.478456] RIP: 0010:[<ffffffff81105570>]  [<ffffffff81105570>] multi_cpu_stop+0x70/0xf0
> [  227.478457] RSP: 0018:ffff880c67dd3d68  EFLAGS: 00000293
> [  227.478459] RAX: 0000000000000000 RBX: ffff880c6f834840 RCX: ffffffffffffffff
> [  227.478460] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffff81ab3320
> [  227.478461] RBP: ffff880c67dd3d88 R08: ffffffff81ab3328 R09: ffff881467e59b20
> [  227.478462] R10: 0000000000000004 R11: 0000000000000005 R12: 0000000000000000
> [  227.478463] R13: ffff880c677c6000 R14: ffff880c67002800 R15: ffff880c00000000
> [  227.478464] FS:  0000000000000000(0000) GS:ffff880c6f820000(0000) knlGS:0000000000000000
> [  227.478466] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  227.478467] CR2: 00007f09b6e2eef0 CR3: 0000000001978000 CR4: 00000000001407e0
> [  227.478468] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  227.478469] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  227.478470] Stack:
> [  227.478472]  ffff880c65a8fd20 ffff880c6f82f0a0 ffff880c65a8fdb8 ffff880c6f82f0a8
> [  227.478474]  ffff880c67dd3e58 ffffffff81105778 ffffffff81095387 0000000000000010
> [  227.478476]  0000000000000216 ffff880c67dd3dc8 0000000000000018 0000000000000000
> [  227.478477] Call Trace:
> [  227.478480]  [<ffffffff81105778>] cpu_stopper_thread+0x78/0x150
> [  227.478483]  [<ffffffff81095387>] ? finish_task_switch+0x57/0x180
> [  227.478486]  [<ffffffff81657f67>] ? __schedule+0x2f7/0x7e0
> [  227.478491]  [<ffffffff8109096f>] smpboot_thread_fn+0xff/0x1b0
> [  227.478494]  [<ffffffff81090870>] ? SyS_setgroups+0x1a0/0x1a0
> [  227.478496]  [<ffffffff8108c9f1>] kthread+0xe1/0x100
> [  227.478498]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
> [  227.478502]  [<ffffffff8165c97c>] ret_from_fork+0x7c/0xb0
> [  227.478504]  [<ffffffff8108c910>] ? kthread_create_on_node+0x1b0/0x1b0
> [  227.478526] Code: 23 66 2e 0f 1f 84 00 00 00 00 00 83 fb 03 75 05 45 84 ed 75 66 f0 41 ff 4c 24 24 74 26 89 da 83 fa 04 74 3d f3 90 41 8b 5c 24 20 <39> d3 74 f0 83 fb 02 75 d7 fa 66 0f 1f 44 00 00 eb d8 66 0f 1f
> [  227.493414] NMI watchdog: BUG: soft lockup - CPU#20 stuck for 22s! [migration/20:149]
> [  227.493448] Modules linked in: einj ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack cfg80211 rfkill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_mangle iptable_security iptable_raw sg iptable_filter ip_tables vfat fat iTCO_wdt iTCO_vendor_support x86_pkg_temp_thermal coretemp kvm ixgbe crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ptp lrw gf128mul pps_core glue_helper mdio dca ablk_helper sb_edac cryptd edac_core lpc_ich pcspkr shpchp i2c_i801 mfd_core ipmi_si wmi ipmi_msghandler acpi_pad xfs libcrc32c sd_mod mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper sr_mod cdrom ttm drm ahci libahci mpt2sas libata raid_class i2c_core scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod
> [  227.493460] CPU: 20 PID: 149 Comm: migration/20 Tainted: G   M    W    L 3.18.0-rc3 #1
>
>> It adds debugging for inappropriate reschedules from the wrong stack.
>> Setting CONFIG_DEBUG_ATOMIC_SLEEP might also be a good idea.
>
> Will add that for next build/test
>
> -Tony

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 19:10                           ` Andy Lutomirski
@ 2014-11-14 19:37                             ` Luck, Tony
  0 siblings, 0 replies; 63+ messages in thread
From: Luck, Tony @ 2014-11-14 19:37 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Oleg Nesterov, Borislav Petkov, X86 ML, linux-kernel,
	Peter Zijlstra, Andi Kleen

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 936 bytes --]

> So far, the only thing I've come up with is that do_machine_check
> seems to be missing exception_enter or the equivalent.  Do you have
> CONFIG_CONTEXT_TRACKING on and/or full nohz enabled?  I don't think
> that this explains my bug, though.

Yes to both:

$ grep CONTEXT_TRACK .config
CONFIG_CONTEXT_TRACKING=y
# CONFIG_CONTEXT_TRACKING_FORCE is not set
CONFIG_HAVE_CONTEXT_TRACKING=y

$ grep HZ .config
CONFIG_NO_HZ_COMMON=y
# CONFIG_HZ_PERIODIC is not set
# CONFIG_NO_HZ_IDLE is not set
CONFIG_NO_HZ_FULL=y
# CONFIG_NO_HZ_FULL_ALL is not set
# CONFIG_NO_HZ_FULL_SYSIDLE is not set
CONFIG_NO_HZ=y
# CONFIG_RCU_FAST_NO_HZ is not set
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
CONFIG_HZ_1000=y
CONFIG_HZ=1000
CONFIG_MACHZ_WDT=m

-Tony
[-- mis-encoded non-ASCII signature footer omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-13 18:04                           ` Borislav Petkov
@ 2014-11-14 21:56                             ` Luck, Tony
  2014-11-14 22:07                               ` Andy Lutomirski
  2014-11-17 18:50                               ` Borislav Petkov
  0 siblings, 2 replies; 63+ messages in thread
From: Luck, Tony @ 2014-11-14 21:56 UTC (permalink / raw)
  To: Borislav Petkov, Andy Lutomirski
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Oleg Nesterov

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1320 bytes --]

>> Right, I can do it in the meantime and we can always experiment more
>> later. Getting rid of _TIF_MCE_NOTIFY is a good thing already.
>
> Yep, it looks pretty simple - not tested yet, it builds though.

It seems pretty solid under test so far.

Can we make it pass the address/flag to mce_notify_process() too? So
we can get rid of mce_save_info() and mce_find_info().

We'd need to wrap mce_task_work inside a bigger structure with fields
to have the address and flags - then use "container_of" inside mce_notify_process().

But I think that means we need more than one of these structures ... we may not
be done with one before a new machine check occurs. So we'd have to make an
NMI-safe allocator to grab one for use inside do_machine_check()

-Tony

General testing note - one thing I did see was that if I inject 1000 errors at 0.3s interval from
my ssh'd login ... the serial console keeps streaming messages for about 40 seconds
after my test says it is all done. This might be a factor in the other tests I've been
running against the stack-switching code (especially with extra debug) ... at some
point __log_buf must get full - what happens then?
[-- mis-encoded non-ASCII signature footer omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 21:56                             ` Luck, Tony
@ 2014-11-14 22:07                               ` Andy Lutomirski
  2014-11-17 18:50                               ` Borislav Petkov
  1 sibling, 0 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-14 22:07 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

On Fri, Nov 14, 2014 at 1:56 PM, Luck, Tony <tony.luck@intel.com> wrote:
>>> Right, I can do it in the meantime and we can always experiment more
>>> later. Getting rid of _TIF_MCE_NOTIFY is a good thing already.
>>
>> Yep, it looks pretty simple - not tested yet, it builds though.
>
> It seems pretty solid under test so far.
>
> Can we make it pass the address/flag to mce_notify_process() too? So
> we can get rid of mce_save_info() and mce_find_info().
>
> We'd need to wrap mce_task_work inside a bigger structure with fields
> to have the address and flags - then use "container_of" inside mce_notify_process().
>
> But I think that means we need more than one of these structures ... we may not
> be done with one before a new machine check occurs. So we'd have to make an
> NMI-safe allocator to grab one for use inside do_machine_check()

I think that this is a considerable benefit of my code, aside from the
non-working-ness part.  :-/

--Andy

>
> -Tony
>
> General testing note - one thing I did see was that if inject 1000 errors at 0.3s interval from
> my ssh'd login ... the serial console keeps streaming messages for about 40 seconds
> after my test says it is all done. This might be a factor in the other tests I've been
> running against the stack-switching code (especially with extra debug) ... at some
> point __log_buf must get full - what happens then?



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-14 21:56                             ` Luck, Tony
  2014-11-14 22:07                               ` Andy Lutomirski
@ 2014-11-17 18:50                               ` Borislav Petkov
  2014-11-17 19:57                                 ` Andy Lutomirski
  1 sibling, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-17 18:50 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Andy Lutomirski, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

On Fri, Nov 14, 2014 at 09:56:38PM +0000, Luck, Tony wrote:
> ...
> But I think that means we need more than one of these structures ...
> we may not be done with one before a new machine check occurs. So
> we'd have to make an NMI-safe allocator to grab one for use inside
> do_machine_check()

Well, I think we might do something with a lockless list as it is being
done in ghes.c.

It allocates entries from its own pool in the NMI handler and
llist_add's them to a list.

Then, in user context it does llist_del_all and then looks at each of
the elements at leisure and stress-free :-)

Pool alloc/free is NMI-safe too so we should be good. It looks pretty
clean, I'd give it a try.

> General testing note - one thing I did see was that if inject 1000
> errors at 0.3s interval from my ssh'd login ... the serial console
> keeps streaming messages for about 40 seconds after my test says it is
> all done. This might be a factor in the other tests I've been running
> against the stack-switching code (especially with extra debug) ... at
> some point __log_buf must get full - what happens then?

Start gets overwritten AFAICR.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 18:50                               ` Borislav Petkov
@ 2014-11-17 19:57                                 ` Andy Lutomirski
  2014-11-17 20:03                                   ` Borislav Petkov
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-17 19:57 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Luck, Tony, Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra,
	Oleg Nesterov

On Mon, Nov 17, 2014 at 10:50 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Fri, Nov 14, 2014 at 09:56:38PM +0000, Luck, Tony wrote:
>> ...
>> But I think that means we need more than one of these structures ...
>> we may not be done with one before a new machine check occurs. So
>> we'd have to make an NMI-safe allocator to grab one for use inside
>> do_machine_check()
>
> Well, I think we might do something with a lockless list as it is being
> done in ghes.c.
>
> It allocates entries from its own pool in the NMI handler and
> llist_add's them to a list.
>
> Then, in user context it does llist_del_all and then looks at each of
> the elements at leisure and stress-free :-)
>
> Pool alloc/free is NMI-safe too so we should be good. It looks pretty
> clean, I'd give it a try.

Would it be worth making a decision on task_work_add vs. stack switching first?

Stack switching pros: all this lockless allocation stuff is completely
unnecessary, and it's plausible that the stack switching code will be
added anyway.

task_work_add pros: conceptually simpler mce.c diff.

Tony, did the code survive your new stress test?

--Andy

>
>> General testing note - one thing I did see was that if inject 1000
>> errors at 0.3s interval from my ssh'd login ... the serial console
>> keeps streaming messages for about 40 seconds after my test says it is
>> all done. This might be a factor in the other tests I've been running
>> against the stack-switching code (especially with extra debug) ... at
>> some point __log_buf must get full - what happens then?
>
> Start gets overwritten AFAICR.
>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 19:57                                 ` Andy Lutomirski
@ 2014-11-17 20:03                                   ` Borislav Petkov
  2014-11-17 20:05                                     ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Borislav Petkov @ 2014-11-17 20:03 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Luck, Tony, Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra,
	Oleg Nesterov

On Mon, Nov 17, 2014 at 11:57:22AM -0800, Andy Lutomirski wrote:
> Would it be worth making a decision on task_work_add vs. stack
> switching first?

Probably a prudent thing to do in order to save unnecessary cycles :-)

> Stack switching pros: all this lockless allocation stuff is completely
> unnecessary, and it's plausible that the stack switching code will be
> added anyway.

Yes.

However, I'd like to be very sure this thing doesn't introduce any
regressions to the MCA code. So even if Tony's testing passes, I'd like
to be very conservative here and stress it more than usual. Because once
this thing hits upstream and stuff starts breaking, it'll be a serious
PITA reverting it.

I hope you can understand my concerns.

Btw, which branch has your latest version - I'd like to take a look at
it in detail.

> task_work_add pros: conceptually simpler mce.c diff.

Right, that's the safe path.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 20:03                                   ` Borislav Petkov
@ 2014-11-17 20:05                                     ` Andy Lutomirski
  2014-11-17 21:55                                       ` Luck, Tony
  2014-11-18 16:12                                       ` Borislav Petkov
  0 siblings, 2 replies; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-17 20:05 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Luck, Tony, Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra,
	Oleg Nesterov

On Mon, Nov 17, 2014 at 12:03 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Nov 17, 2014 at 11:57:22AM -0800, Andy Lutomirski wrote:
>> Would it be worth making a decision on task_work_add vs. stack
>> switching first?
>
> Probably a prudent thing to do in order to save unnecessary cycles :-)
>
>> Stack switching pros: all this lockless allocation stuff is completely
>> unnecessary, and it's plausible that the stack switching code will be
>> added anyway.
>
> Yes.
>
> However, I'd like to be very sure this thing doesn't introduce any
> regressions to the MCA code. So even if Tony's testing passes, I'd like
> to be very conservative here and stress it more than usual. Because once
> this thing hits upstream and stuff starts breaking, it'll be a serious
> PITA reverting it.
>
> I hope you can understand my concerns.

I agree completely.

>
> Btw, which branch has your latest version - I'd like to take a look at
> it in detail.

https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/log/?h=x86/paranoid

I'm not quite ready to send v3.  I want to do two things first:

1. Consider disabling the stack switch for double_fault.

2. Clean up the macros.  I'll validate this by ensuring that the
generated code is identical to the current version.

IOW, I don't expect the asm for machine_check to change.

--Andy

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 20:05                                     ` Andy Lutomirski
@ 2014-11-17 21:55                                       ` Luck, Tony
  2014-11-17 22:26                                         ` Andy Lutomirski
  2014-11-18 16:12                                       ` Borislav Petkov
  1 sibling, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-17 21:55 UTC (permalink / raw)
  To: Andy Lutomirski, Borislav Petkov
  Cc: Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra, Oleg Nesterov

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 971 bytes --]

>> However, I'd like to be very sure this thing doesn't introduce any
>> regressions to the MCA code. So even if Tony's testing passes, I'd like
>> to be very conservative here and stress it more than usual. Because once
>> this thing hits upstream and stuff starts breaking, it'll be a serious
>> PITA reverting it.

The test I left running on Friday was just running the stack-switch asm
patch, without any mce.c changes.  It died at 16000 iterations with the
mce synchronization issue.

This morning I started a new test with all the mce changes (no TIF_MCE_NOTIFY,
just process the recovery in the tail of do_machine_check().

It just passed the 18000 point, and it still going.  In addition I've been throwing
the odd "make -j144" kernel build at the machine so we check out the non-idle
paths too.

-Tony
[-- mis-encoded non-ASCII signature footer omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 21:55                                       ` Luck, Tony
@ 2014-11-17 22:26                                         ` Andy Lutomirski
  2014-11-17 23:16                                           ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-17 22:26 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

On Mon, Nov 17, 2014 at 1:55 PM, Luck, Tony <tony.luck@intel.com> wrote:
>>> However, I'd like to be very sure this thing doesn't introduce any
>>> regressions to the MCA code. So even if Tony's testing passes, I'd like
>>> to be very conservative here and stress it more than usual. Because once
>>> this thing hits upstream and stuff starts breaking, it'll be a serious
>>> PITA reverting it.
>
> The test I left running on Friday was just running the stack-switch asm
> patch, without any mce.c changes.  It died at 16000 iterations with the
> mce synchronization issue.

I still wonder whether the timeout code is the real culprit.  My patch
will slow down entry into do_machine_check by tens of cycles, several
cachelines, and possibly a couple of TLB misses.  Given that the
timing seemed marginal to me, it's possible (albeit not that likely)
that it pushed the time needed for synchronization into the range of
unreliability.

Any chance you can retry it at some point with that USEC_PER_SEC thing
changed to NSEC_PER_SEC and SPINUNIT set to something closer to 10
than 100?

--Andy

>
> This morning I started a new test with all the mce changes (no TIF_MCE_NOTIFY,
> just process the recovery in the tail of do_machine_check().
>
> It just passed the 18000 point, and it still going.  In addition I've been throwing
> the odd "make -j144" kernel build at the machine so we check out the non-idle
> paths too.
>
> -Tony



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 22:26                                         ` Andy Lutomirski
@ 2014-11-17 23:16                                           ` Luck, Tony
  2014-11-18  0:05                                             ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-17 23:16 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 2267 bytes --]

> I still wonder whether the timeout code is the real culprit.  My patch
> will slow down entry into do_machine_check by tens of cycles, several
> cachelines, and possibly a couple of TLB misses.  Given that the
> timing seemed marginal to me, it's possible (albeit not that likely)
> that it pushed the time needed for synchronization into the range of
> unreliability.

I think we have very conservative timeouts.  Here are the significant bits from mce

First SPINUNIT ... how many nanoseconds to spin "ndelay(SPINUNIT)"  before pounding on an atomic op to see if we are done:
#define SPINUNIT 100    /* 100ns */

Initialization of our timeout - comment above this claims we are aiming for "one second"

	cfg->monarch_timeout = USEC_PER_SEC;

Inside mce_start() - set up our timeout. Math says this will be 1e9

	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

Now the actual loop:
        while (atomic_read(&mce_callin) != cpus) {
                if (mce_timed_out(&timeout)) {
                        atomic_set(&global_nwo, 0);
                        return -1;
                }
                ndelay(SPINUNIT);
        }

And the inside of mce_timed_out() ... we decrement our 1e9 by 100 for each call:

        if ((s64)*t < SPINUNIT) {
                if (mca_cfg.tolerant <= 1)
                        mce_panic("Timeout synchronizing machine check over CPUs",
                                  NULL, NULL);
                cpu_missing = 1;
                return 1;
        }
        *t -= SPINUNIT;

Not sure what the worst case is for waiting for other processors to show up. Perhaps it is
when the other cpu is in some deep C-state and we have to wait for it to power the core
back on and resync its clock.  Maybe it is when a cpu has just started a "wbinvd" instruction
with a cache entirely full of dirty lines that belong on some other processor and so have to
be written back across QPI (maybe when QPI links are congested with 10gbit network traffic?).

But a full second seems like a lot ... you adding a handful of cache and TLB misses shouldn't
push us over the edge.

-Tony

   
[-- mis-encoded non-ASCII signature footer omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 23:16                                           ` Luck, Tony
@ 2014-11-18  0:05                                             ` Andy Lutomirski
  2014-11-18  0:22                                               ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-18  0:05 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

On Mon, Nov 17, 2014 at 3:16 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> I still wonder whether the timeout code is the real culprit.  My patch
>> will slow down entry into do_machine_check by tens of cycles, several
>> cachelines, and possibly a couple of TLB misses.  Given that the
>> timing seemed marginal to me, it's possible (albeit not that likely)
>> that it pushed the time needed for synchronization into the range of
>> unreliability.
>
> I think we have very conservative timeouts.  Here are the significant bits from mce
>
> First SPINUNIT ... how many nanoseconds to spin "ndelay(SPINUNIT)"  before pounding on an atomic op to see if we are done:
> #define SPINUNIT 100    /* 100ns */
>
> Initialization of our timeout - comment above this claims we are aiming for "one second"
>
>         cfg->monarch_timeout = USEC_PER_SEC;
>
> Inside mce_start() - set up our timeout. Math says this will be 1e9
>
>         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

Aha.  I missed the extra multiplication.

>
> Now the actual loop:
>         while (atomic_read(&mce_callin) != cpus) {
>                 if (mce_timed_out(&timeout)) {
>                         atomic_set(&global_nwo, 0);
>                         return -1;
>                 }
>                 ndelay(SPINUNIT);
>         }

And I missed the ndelay.

*sigh* reading comprehension wasn't so good there.

From a debugging POV, it would be kind of nice to know which of the
four mce_timed_out calls is reporting the failure.  IIUC, the first
one would mean that a cpu didn't make it in to do_machine_check at
all, whereas the second one would mean that a cpu got lost after
entering do_machine_check.  Also, the second one presumably knows how
far it got (i.e. mce_executing and order).

It could also be interesting to tweak mce_panic to not actually panic
the machine but to try to return and stop the test instead.  Then real
debugging could be possible :)

I still wish I could test this.  The mce-inject code is completely
useless here, since it calls do_machine_check directly instead of
going through the MCE code.

>
> And the inside of mce_timed_out() ... we decrement our 1e9 by 100 for each call:
>
>         if ((s64)*t < SPINUNIT) {
>                 if (mca_cfg.tolerant <= 1)
>                         mce_panic("Timeout synchronizing machine check over CPUs",
>                                   NULL, NULL);
>                 cpu_missing = 1;
>                 return 1;
>         }
>         *t -= SPINUNIT;
>
> Not sure what the worst case is for waiting for other processors to show up. Perhaps it is
> when the other cpu is in some deep C-state and we have to wait for it to power the core
> back on and resync clock.  Maybe it is when a cpu has just started a "wbinvd" instruction
> with a cache entirely full of dirty lines that belong on some other processor so have to
> be written back across QPI (may when QPI links are congested with 10gbit network traffic?
>
> But a full second seems like a lot ... you adding a handful of cache and TLB misses shouldn't
> push us over the edge.

Agreed.

>
> -Tony
>
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-18  0:05                                             ` Andy Lutomirski
@ 2014-11-18  0:22                                               ` Luck, Tony
  2014-11-18  0:55                                                 ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-18  0:22 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 455 bytes --]

> It could also be interesting to tweak mce_panic to not actually panic
> the machine but to try to return and stop the test instead.  Then real
> debugging could be possible :)

The lost cpu is *really* lost.  Warm reset doesn't fix the machine, I usually
have to do a full power cycle.

-Tony
[-- mis-encoded non-ASCII signature footer omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-18  0:22                                               ` Luck, Tony
@ 2014-11-18  0:55                                                 ` Andy Lutomirski
  2014-11-18 18:30                                                   ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-18  0:55 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

On Mon, Nov 17, 2014 at 4:22 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> It could also be interesting to tweak mce_panic to not actually panic
>> the machine but to try to return and stop the test instead.  Then real
>> debugging could be possible :)
>
> The lost cpu is *really* lost.  Warm reset doesn't fix the machine, I usually
> have to do a full power cycle.

How is it even possible that I did that with a few lines of asm?

Could this be a hardware bug?  Is there some condition that causes #MC
delivery to wedge hard enough that even INIT/RESET stops working?  Or
possibly some CPU got stuck in SMM -- I have no idea what warm reset
does these days.

My initial attempts to test machine_check in KVM using IPIs are having
some issues, probably because I'm not acking the interrupt.  I can do
it once, but then it stops working.

Here's the patch to improve the timeout messages, but given the degree
of wedgedness, I can guess what it'll say:

https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/paranoid&id=e5cbd9d141bde651ecb20f0b65ad13bcef2468d0

--Andy

>
> -Tony



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-17 20:05                                     ` Andy Lutomirski
  2014-11-17 21:55                                       ` Luck, Tony
@ 2014-11-18 16:12                                       ` Borislav Petkov
  1 sibling, 0 replies; 63+ messages in thread
From: Borislav Petkov @ 2014-11-18 16:12 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Luck, Tony, Andi Kleen, linux-kernel, X86 ML, Peter Zijlstra,
	Oleg Nesterov

On Mon, Nov 17, 2014 at 12:05:59PM -0800, Andy Lutomirski wrote:
> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/log/?h=x86/paranoid
> 
> I'm not quite ready to send v3.  I want to do two things first:
> 
> 1. Consider disabling the stack switch for double_fault.

Sounds conservatively nice :)

> 2. Clean up the macros.  I'll validate this by ensuring that the
> generated code is identical to the current version.
> 
> IOW, I don't expect the asm for machine_check to change.

Right, so I don't see anything wrong with the patch but entry_64.S is
nasty so don't take my word for it.

I guess handlers should need to do "if (user_mode_vm(regs))" after your
change now, in order to know what to do.

I'll try to do some error injection here too, on my boxes to see how
this behaves.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-18  0:55                                                 ` Andy Lutomirski
@ 2014-11-18 18:30                                                   ` Luck, Tony
  2014-11-18 23:04                                                     ` Andy Lutomirski
  0 siblings, 1 reply; 63+ messages in thread
From: Luck, Tony @ 2014-11-18 18:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1405 bytes --]

>> The lost cpu is *really* lost.  Warm reset doesn't fix the machine, I usually
>> have to do a full power cycle.

> How is it even possible that I did that with a few lines of asm?

Probably not directly your fault - some cascade of errors may have occurred.

> Could this be a hardware bug?  Is there some condition that causes #MC
> delivery to wedge hard enough that even INIT/RESET stops working?  Or
> possibly some CPU got stuck in SMM -- I have no idea what warm reset
> does these days.

I'm not even sure what kind of reset the remote management i/f I used
actually applied.

> Here's the patch to improve the timeout messages, but given the degree
> of wedgedness, I can guess what it'll say:
>
> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/paranoid&id=e5cbd9d141bde651ecb20f0b65ad13bcef2468d0

Heh - I'd already put in some hacky printk()s to do similar. Mine aren't upstream quality, but do print the value of mce_callin/mce_executing
as appropriate.  But I got some confusing results - reporter complained that only 142 of 144 had shown up. So two threads missing,
maybe means one core went into h/w shutdown.  Need to dig further to see if the missing duo really are from the same core.

-Tony
[-- mis-encoded non-ASCII signature footer omitted --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-18 18:30                                                   ` Luck, Tony
@ 2014-11-18 23:04                                                     ` Andy Lutomirski
  2014-11-18 23:26                                                       ` Luck, Tony
  0 siblings, 1 reply; 63+ messages in thread
From: Andy Lutomirski @ 2014-11-18 23:04 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

On Tue, Nov 18, 2014 at 10:30 AM, Luck, Tony <tony.luck@intel.com> wrote:
>>> The lost cpu is *really* lost.  Warm reset doesn't fix the machine, I usually
>>> have to do a full power cycle.
>
>> How is it even possible that I did that with a few lines of asm?
>
> Probably not your directly your fault - some cascade of errors may have occurred.

I went and read the manual.  Here's a hypothesis:

Your test case is presumably doing something that involves setting
undocumented registers* to program the CPU or memory controller to
generate a machine check on access to some address.  Presumably this
is done by broadcasting an SMI and programming the registers in SMM.

Now SMM is rather strange.  The docs list a large set of interrupt
sources that are disabled on SMM entry, and this list does not include
#MC.  So presumably #MC is actually left enabled on entry to SMM.
That means that, unless SMRAM has an interrupt table that has a
working machine check handler (which seems highly unlikely), then
there is at least some window in which a #MC delivered in SMM will
cause some kind of failure.  This could really happen: a broadcast #MC
could easily race a broadcast SMI and do this.

If you crash your SMM code, then I wouldn't be at all surprised if the
CPU wedges hard enough that even your remote management thing can't
reset it.

* These are probably the registers that are supposed to be documented
in volume 2 section 4.4.9 of the Xeon E5 1600/2600 datasheet,
reference 326509-003, but the docs are extremely incomplete.

--Andy

>
>> Could this be a hardware bug?  Is there some condition that causes #MC
>> delivery to wedge hard enough that even INIT/RESET stops working?  Or
>> possibly some CPU got stuck in SMM -- I have no idea what warm reset
>> does these days.
>
> I'm not even sure what kind of reset the remote management i/f I used
> actually applied.
>
>> Here's the patch to improve the timeout messages, but given the degree
>> of wedgedness, I can guess what it'll say:
>>
>> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/paranoid&id=e5cbd9d141bde651ecb20f0b65ad13bcef2468d0
>
> Heh - I'd already put in some hacky printk()s to do similar. Mine aren't upstream quality, but do print the value of mce_callin/mce_executing
> as appropriate.  But I got some confusing results - reporter complained that only 142 of 144 had shown up. So two threads missing,
> maybe means one core went into h/w shutdown.  Need to dig further to see if the missing duo really are from the same core.
>
> -Tony



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 63+ messages in thread

* RE: [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace
  2014-11-18 23:04                                                     ` Andy Lutomirski
@ 2014-11-18 23:26                                                       ` Luck, Tony
  0 siblings, 0 replies; 63+ messages in thread
From: Luck, Tony @ 2014-11-18 23:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Andi Kleen, linux-kernel, X86 ML,
	Peter Zijlstra, Oleg Nesterov

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1163 bytes --]

> Your test case is presumably doing something that involves setting
> undocumented registers* to program the CPU or memory controller to
> generate a machine check on access to some address.  Presumably this
> is done by broadcasting an SMI and programming the registers in SMM.

Good theory - but not quite how it works.  The ACPI/EINJ table does trigger
an SMI so the BIOS can do the injection.  What BIOS actually does is to play
with the memory controller so that the next write to the target address will
flip some ECC bits in an unnatural way (to either plant a correctable error
with just one bit flipped, or a UC error with two bits flipped).  Then the SMI
returns.

Then my application reads the target address, and we see CMCI or MCE
when the ECC check fails.

Hopefully this keeps the SMI path decoupled from the MCE ... I even sleep
a little after injection and before consumption just in case there are any
stragglers late returning from the (broadcast) SMI that planted the error.

-Tony
[-- mangled non-text signature trailer omitted (charset decoding error in archive) --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2014-11-18 23:27 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-11 20:56 [RFC PATCH] x86, entry: Switch stacks on a paranoid entry from userspace Andy Lutomirski
2014-11-11 21:36 ` Borislav Petkov
2014-11-11 22:00   ` Luck, Tony
2014-11-11 22:15     ` Andy Lutomirski
2014-11-11 22:12   ` Andy Lutomirski
2014-11-11 22:33     ` Borislav Petkov
2014-11-11 22:40       ` Andy Lutomirski
2014-11-11 23:09         ` Borislav Petkov
2014-11-11 23:21           ` Andy Lutomirski
2014-11-12  0:22             ` Luck, Tony
2014-11-12  0:40               ` Andy Lutomirski
2014-11-12  1:06                 ` Luck, Tony
2014-11-12  2:01                   ` Andy Lutomirski
2014-11-12  2:06                   ` Tony Luck
2014-11-12 10:30                     ` Borislav Petkov
2014-11-12 15:48                       ` Andy Lutomirski
2014-11-12 16:22                         ` Borislav Petkov
2014-11-12 17:17                           ` Luck, Tony
2014-11-12 17:30                             ` Borislav Petkov
2014-11-13 18:04                           ` Borislav Petkov
2014-11-14 21:56                             ` Luck, Tony
2014-11-14 22:07                               ` Andy Lutomirski
2014-11-17 18:50                               ` Borislav Petkov
2014-11-17 19:57                                 ` Andy Lutomirski
2014-11-17 20:03                                   ` Borislav Petkov
2014-11-17 20:05                                     ` Andy Lutomirski
2014-11-17 21:55                                       ` Luck, Tony
2014-11-17 22:26                                         ` Andy Lutomirski
2014-11-17 23:16                                           ` Luck, Tony
2014-11-18  0:05                                             ` Andy Lutomirski
2014-11-18  0:22                                               ` Luck, Tony
2014-11-18  0:55                                                 ` Andy Lutomirski
2014-11-18 18:30                                                   ` Luck, Tony
2014-11-18 23:04                                                     ` Andy Lutomirski
2014-11-18 23:26                                                       ` Luck, Tony
2014-11-18 16:12                                       ` Borislav Petkov
2014-11-12 22:00 ` Oleg Nesterov
2014-11-12 23:17   ` Andy Lutomirski
2014-11-12 23:41     ` Luck, Tony
2014-11-13  0:02       ` Andy Lutomirski
2014-11-13  0:31         ` Luck, Tony
2014-11-13  1:34           ` Andy Lutomirski
2014-11-13  3:03           ` Andy Lutomirski
2014-11-13 18:43             ` Luck, Tony
2014-11-13 22:23               ` Andy Lutomirski
2014-11-13 22:25                 ` Andy Lutomirski
2014-11-13 22:33                 ` Luck, Tony
2014-11-13 22:47                   ` Andy Lutomirski
2014-11-13 23:13                     ` Andy Lutomirski
2014-11-14  0:50                       ` Andy Lutomirski
2014-11-14  1:20                         ` Luck, Tony
2014-11-14  1:36                           ` Andy Lutomirski
2014-11-14 17:49                         ` Luck, Tony
2014-11-14 19:10                           ` Andy Lutomirski
2014-11-14 19:37                             ` Luck, Tony
2014-11-14 18:27                         ` Luck, Tony
2014-11-14 10:34             ` Borislav Petkov
2014-11-14 17:18               ` Andy Lutomirski
2014-11-14 17:24                 ` Borislav Petkov
2014-11-14 17:26                   ` Andy Lutomirski
2014-11-14 18:53                     ` Borislav Petkov
2014-11-13 10:59           ` Borislav Petkov
2014-11-13 21:23             ` Borislav Petkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).