Re: [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas

From: Andy Lutomirski <luto@kernel.org>
To: Tony Luck <tony.luck@intel.com>, Borislav Petkov <bp@alien8.de>
Cc: linux-kernel@vger.kernel.org, linux-edac@vger.kernel.org, x86@kernel.org
Subject: Re: [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
Date: Wed, 11 Nov 2015 20:19:35 -0800	[thread overview]
Message-ID: <56441357.70201@kernel.org> (raw)
In-Reply-To: <e916478b9587ef006b30255a7adbee6d84268d7c.1447093568.git.tony.luck@intel.com>

On 11/06/2015 01:01 PM, Tony Luck wrote:
> Extend the severity checking code to add a new context IN_KERNEL_RECOV
> which is used to indicate that the machine check was triggered by code
> in the kernel with a fixup entry.
>
> Add code to check for this situation and respond by altering the return
> IP to the fixup address and changing the regs->ax so that the recovery
> code knows the physical address of the error. Note that we also set bit
> 63 because 0x0 is a legal physical address.
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>   arch/x86/kernel/cpu/mcheck/mce-severity.c | 19 +++++++++++++++++--
>   arch/x86/kernel/cpu/mcheck/mce.c          | 13 ++++++++++---
>   2 files changed, 27 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
> index 9c682c222071..1e83842310e8 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
> @@ -12,6 +12,7 @@
>   #include <linux/kernel.h>
>   #include <linux/seq_file.h>
>   #include <linux/init.h>
> +#include <linux/module.h>
>   #include <linux/debugfs.h>
>   #include <asm/mce.h>
>
> @@ -29,7 +30,7 @@
>    * panic situations)
>    */
>
> -enum context { IN_KERNEL = 1, IN_USER = 2 };
> +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
>   enum ser { SER_REQUIRED = 1, NO_SER = 2 };
>   enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
>
> @@ -48,6 +49,7 @@ static struct severity {
>   #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
>   #define  KERNEL		.context = IN_KERNEL
>   #define  USER		.context = IN_USER
> +#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
>   #define  SER		.ser = SER_REQUIRED
>   #define  NOSER		.ser = NO_SER
>   #define  EXCP		.excp = EXCP_CONTEXT
> @@ -87,6 +89,10 @@ static struct severity {
>   		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
>   		),
>   	MCESEV(
> +		PANIC, "In kernel and no restart IP",
> +		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
> +		),
> +	MCESEV(
>   		DEFERRED, "Deferred error",
>   		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
>   		),
> @@ -123,6 +129,11 @@ static struct severity {
>   		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
>   		),
>   	MCESEV(
> +		AR, "Action required: data load error recoverable area of kernel",
> +		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
> +		KERNEL_RECOV
> +		),
> +	MCESEV(
>   		AR, "Action required: data load error in a user process",
>   		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
>   		USER
> @@ -183,7 +194,11 @@ static struct severity {
>    */
>   static int error_context(struct mce *m)
>   {
> -	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
> +	if ((m->cs & 3) == 3)
> +		return IN_USER;
> +	if (search_mcexception_tables(m->ip))
> +		return IN_KERNEL_RECOV;
> +	return IN_KERNEL;
>   }
>
>   /*
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 9d014b82a124..472d11150b7a 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -31,6 +31,7 @@
>   #include <linux/types.h>
>   #include <linux/slab.h>
>   #include <linux/init.h>
> +#include <linux/module.h>
>   #include <linux/kmod.h>
>   #include <linux/poll.h>
>   #include <linux/nmi.h>
> @@ -1132,9 +1133,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
>   		if (no_way_out)
>   			mce_panic("Fatal machine check on current CPU", &m, msg);
>   		if (worst == MCE_AR_SEVERITY) {
> -			recover_paddr = m.addr;
> -			if (!(m.mcgstatus & MCG_STATUS_RIPV))
> -				flags |= MF_MUST_KILL;
> +			if ((m.cs & 3) == 3) {
> +				recover_paddr = m.addr;
> +				if (!(m.mcgstatus & MCG_STATUS_RIPV))
> +					flags |= MF_MUST_KILL;
> +			} else if (fixup_mcexception(regs)) {
> +				regs->ax = BIT(63) | m.addr;
> +			} else
> +				mce_panic("Failed kernel mode recovery",
> +					  &m, NULL);

Maybe I'm misunderstanding this, but presumably you shouldn't call
fixup_mcexception() unless you've first verified that RIPV is set (i.e.
that the IP you're looking up in the table is valid).

Also... I find the general flow of this code very hard to follow.  It's
critical that an MCE hitting kernel mode not get as far as
ist_begin_non_atomic.  It was already hard enough to tell that the code
follows that rule, and now it's even harder.  Would it make sense to add
clear assertions that m.cs == regs->cs and that user_mode(regs) is true
by the time you get to the end (i.e. just before ist_begin_non_atomic)?
Simplifying the control flow might also be nice.

>   		} else if (kill_it) {
>   			force_sig(SIGBUS, current);
>   		}
>

I would argue that the force_sig(SIGBUS, current) call should happen in
the non-atomic section.  It's probably okay where it is as long as we
came from user mode, but it's more obviously safe in the non-atomic
section.

--Andy