All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-11-09 18:26 [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
@ 2015-11-06 20:57 ` Tony Luck
  2015-11-10 11:21   ` Borislav Petkov
  2015-11-12  4:14   ` Andy Lutomirski
  2015-11-06 21:01 ` [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas Tony Luck
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 29+ messages in thread
From: Tony Luck @ 2015-11-06 20:57 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

Copy the existing page fault fixup mechanisms to create a new table
to be used when fixing machine checks. Note:
1) At this time we only provide a macro to annotate assembly code
2) We assume all fixups will be in code built in to the kernel.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/asm.h        |  7 +++++++
 arch/x86/include/asm/uaccess.h    |  1 +
 arch/x86/mm/extable.c             | 16 ++++++++++++++++
 include/asm-generic/vmlinux.lds.h |  6 ++++++
 include/linux/module.h            |  1 +
 kernel/extable.c                  | 14 ++++++++++++++
 6 files changed, 45 insertions(+)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..f2fa7973f18f 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -58,6 +58,13 @@
 	.long (to) - . + 0x7ffffff0 ;				\
 	.popsection
 
+# define _ASM_MCEXTABLE(from, to)				\
+	.pushsection "__mcex_table", "a" ;			\
+	.balign 8 ;						\
+	.long (from) - . ;					\
+	.long (to) - . ;					\
+	.popsection
+
 # define _ASM_NOKPROBE(entry)					\
 	.pushsection "_kprobe_blacklist","aw" ;			\
 	_ASM_ALIGN ;						\
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a8df874f3e88..b8231301a224 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -111,6 +111,7 @@ struct exception_table_entry {
 #define ARCH_HAS_SEARCH_EXTABLE
 
 extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_mcexception(struct pt_regs *regs);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..5b328ae00365 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -49,6 +49,22 @@ int fixup_exception(struct pt_regs *regs)
 	return 0;
 }
 
+int fixup_mcexception(struct pt_regs *regs)
+{
+	const struct exception_table_entry *fixup;
+	unsigned long new_ip;
+
+	fixup = search_mcexception_tables(regs->ip);
+	if (fixup) {
+		new_ip = ex_fixup_addr(fixup);
+
+		regs->ip = new_ip;
+		return 1;
+	}
+
+	return 0;
+}
+
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 1781e54ea6d3..21bb20d1172a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -473,6 +473,12 @@
 		VMLINUX_SYMBOL(__start___ex_table) = .;			\
 		*(__ex_table)						\
 		VMLINUX_SYMBOL(__stop___ex_table) = .;			\
+	}								\
+	. = ALIGN(align);						\
+	__mcex_table : AT(ADDR(__mcex_table) - LOAD_OFFSET) {		\
+		VMLINUX_SYMBOL(__start___mcex_table) = .;		\
+		*(__mcex_table)						\
+		VMLINUX_SYMBOL(__stop___mcex_table) = .;		\
 	}
 
 /*
diff --git a/include/linux/module.h b/include/linux/module.h
index 3a19c79918e0..ffecbfcc462c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -270,6 +270,7 @@ extern const typeof(name) __mod_##type##__##name##_device_table		\
 
 /* Given an address, look for it in the exception tables */
 const struct exception_table_entry *search_exception_tables(unsigned long add);
+const struct exception_table_entry *search_mcexception_tables(unsigned long a);
 
 struct notifier_block;
 
diff --git a/kernel/extable.c b/kernel/extable.c
index e820ccee9846..261c3e2816db 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -34,6 +34,8 @@ DEFINE_MUTEX(text_mutex);
 
 extern struct exception_table_entry __start___ex_table[];
 extern struct exception_table_entry __stop___ex_table[];
+extern struct exception_table_entry __start___mcex_table[];
+extern struct exception_table_entry __stop___mcex_table[];
 
 /* Cleared by build time tools if the table is already sorted. */
 u32 __initdata __visible main_extable_sort_needed = 1;
@@ -45,6 +47,8 @@ void __init sort_main_extable(void)
 		pr_notice("Sorting __ex_table...\n");
 		sort_extable(__start___ex_table, __stop___ex_table);
 	}
+	if (__stop___mcex_table > __start___mcex_table)
+		sort_extable(__start___mcex_table, __stop___mcex_table);
 }
 
 /* Given an address, look for it in the exception tables. */
@@ -58,6 +62,16 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
+/* Given an address, look for it in the machine check exception tables. */
+const struct exception_table_entry *search_mcexception_tables(
+				    unsigned long addr)
+{
+	const struct exception_table_entry *e;
+
+	e = search_extable(__start___mcex_table, __stop___mcex_table-1, addr);
+	return e;
+}
+
 static inline int init_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext &&
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-11-09 18:26 [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
  2015-11-06 20:57 ` [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables Tony Luck
@ 2015-11-06 21:01 ` Tony Luck
  2015-11-10 11:21   ` Borislav Petkov
  2015-11-12  4:19   ` Andy Lutomirski
  2015-11-06 21:08 ` [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks Tony Luck
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 29+ messages in thread
From: Tony Luck @ 2015-11-06 21:01 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

Extend the severity checking code to add a new context IN_KERNEL_RECOV
which is used to indicate that the machine check was triggered by code
in the kernel with a fixup entry.

Add code to check for this situation and respond by altering the return
IP to the fixup address and changing the regs->ax so that the recovery
code knows the physical address of the error. Note that we also set bit
63 because 0x0 is a legal physical address.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 19 +++++++++++++++++--
 arch/x86/kernel/cpu/mcheck/mce.c          | 13 ++++++++++---
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 9c682c222071..1e83842310e8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <linux/debugfs.h>
 #include <asm/mce.h>
 
@@ -29,7 +30,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +49,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL		.context = IN_KERNEL
 #define  USER		.context = IN_USER
+#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
 #define  SER		.ser = SER_REQUIRED
 #define  NOSER		.ser = NO_SER
 #define  EXCP		.excp = EXCP_CONTEXT
@@ -87,6 +89,10 @@ static struct severity {
 		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
 		),
 	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
 		DEFERRED, "Deferred error",
 		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
 		),
@@ -123,6 +129,11 @@ static struct severity {
 		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
 		),
 	MCESEV(
+		AR, "Action required: data load error recoverable area of kernel",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		KERNEL_RECOV
+		),
+	MCESEV(
 		AR, "Action required: data load error in a user process",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
@@ -183,7 +194,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+	if ((m->cs & 3) == 3)
+		return IN_USER;
+	if (search_mcexception_tables(m->ip))
+		return IN_KERNEL_RECOV;
+	return IN_KERNEL;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9d014b82a124..472d11150b7a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
 #include <linux/nmi.h>
@@ -1132,9 +1133,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (no_way_out)
 			mce_panic("Fatal machine check on current CPU", &m, msg);
 		if (worst == MCE_AR_SEVERITY) {
-			recover_paddr = m.addr;
-			if (!(m.mcgstatus & MCG_STATUS_RIPV))
-				flags |= MF_MUST_KILL;
+			if ((m.cs & 3) == 3) {
+				recover_paddr = m.addr;
+				if (!(m.mcgstatus & MCG_STATUS_RIPV))
+					flags |= MF_MUST_KILL;
+			} else if (fixup_mcexception(regs)) {
+				regs->ax = BIT(63) | m.addr;
+			} else
+				mce_panic("Failed kernel mode recovery",
+					  &m, NULL);
 		} else if (kill_it) {
 			force_sig(SIGBUS, current);
 		}
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-11-09 18:26 [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
  2015-11-06 20:57 ` [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables Tony Luck
  2015-11-06 21:01 ` [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas Tony Luck
@ 2015-11-06 21:08 ` Tony Luck
  2015-11-12  7:53   ` Ingo Molnar
  2015-11-09 18:48 ` [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
  2015-11-10 11:21 ` Borislav Petkov
  4 siblings, 1 reply; 29+ messages in thread
From: Tony Luck @ 2015-11-06 21:08 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

Using __copy_user_nocache() as inspiration create a memory copy
routine for use by kernel code with annotations to allow for
recovery from machine checks.

Notes:
1) Unlike the original we make no attempt to copy all the bytes
   up to the faulting address. The original achieves that by
   re-executing the failing part as a byte-by-byte copy,
   which will take another page fault. We don't want to have
   a second machine check!
2) Likewise the return value for the original indicates exactly
   how many bytes were not copied. Instead we provide the physical
   address of the fault (thanks to help from do_machine_check()).

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/uaccess_64.h |  3 ++
 arch/x86/kernel/x8664_ksyms_64.c  |  2 +
 arch/x86/lib/copy_user_64.S       | 91 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f2f9b39b274a..79517954e652 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -216,6 +216,9 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 extern long __copy_user_nocache(void *dst, const void __user *src,
 				unsigned size, int zerorest);
 
+extern phys_addr_t mcsafe_memcpy(void *dst, const void __user *src,
+				unsigned size);
+
 static inline int
 __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
 {
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..ec988c92c055 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+EXPORT_SYMBOL(mcsafe_memcpy);
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 982ce34f4a9b..ffce93cbc9a5 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -319,3 +319,94 @@ ENTRY(__copy_user_nocache)
 	_ASM_EXTABLE(21b,50b)
 	_ASM_EXTABLE(22b,50b)
 ENDPROC(__copy_user_nocache)
+
+/*
+ * mcsafe_memcpy - Uncached memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ * This will force destination/source out of cache for more performance.
+ */
+ENTRY(mcsafe_memcpy)
+	cmpl $8,%edx
+	jb 20f		/* less than 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 100b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	movnti %r8,(%rdi)
+	movnti %r9,1*8(%rdi)
+	movnti %r10,2*8(%rdi)
+	movnti %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	movnti %r8,4*8(%rdi)
+	movnti %r9,5*8(%rdi)
+	movnti %r10,6*8(%rdi)
+	movnti %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	movnti %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorl %eax,%eax
+	sfence
+	ret
+
+	.section .fixup,"ax"
+30:
+	sfence
+	/* do_machine_check() sets %eax return value */
+	ret
+	.previous
+
+	_ASM_MCEXTABLE(0b,30b)
+	_ASM_MCEXTABLE(1b,30b)
+	_ASM_MCEXTABLE(2b,30b)
+	_ASM_MCEXTABLE(3b,30b)
+	_ASM_MCEXTABLE(4b,30b)
+	_ASM_MCEXTABLE(9b,30b)
+	_ASM_MCEXTABLE(10b,30b)
+	_ASM_MCEXTABLE(11b,30b)
+	_ASM_MCEXTABLE(12b,30b)
+	_ASM_MCEXTABLE(18b,30b)
+	_ASM_MCEXTABLE(21b,30b)
+ENDPROC(mcsafe_memcpy)
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
@ 2015-11-09 18:26 Tony Luck
  2015-11-06 20:57 ` [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables Tony Luck
                   ` (4 more replies)
  0 siblings, 5 replies; 29+ messages in thread
From: Tony Luck @ 2015-11-09 18:26 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

This is a first draft to show the direction I'm taking to
make it possible for the kernel to recover from machine
checks taken while kernel code is executing.

In a nutshell I'm duplicating the existing annotation mechanism
we use to handle faults when copying to/from user space and
then having the machine check handler check to see if we were
running code in a marked region to fudge the return IP to
point to the recovery path.

Note that I also fudge the return value.  I'd like in the future
to be able to write a "mcsafe_copy_from_user()" function that
would be annotated both for page faults, to return a count of
bytes uncopied, or an indication that there was a machine check.
Hence the BIT(63) bit.  Internal feedback suggested we'd need
some IS_ERR() like macros to help users decode what happened
to take the right action.  But this is "RFC" to see if people
have better ideas on how to handle this.

Almost certainly breaks 32-bit kernels ... while we need to
not break them, I don't see that we need to make this work for
them. Machine check recovery only works on Xeon systems that
have a minimum memory size too big for a 32-bit kernel to even boot.

Tony Luck (3):
  x86, ras: Add new infrastructure for machine check fixup tables
  x86, ras: Extend machine check recovery code to annotated ring0 areas
  x86, ras: Add mcsafe_memcpy() function to recover from machine checks

 arch/x86/include/asm/asm.h                |  7 +++
 arch/x86/include/asm/uaccess.h            |  1 +
 arch/x86/include/asm/uaccess_64.h         |  3 +
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 19 ++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          | 13 ++++-
 arch/x86/kernel/x8664_ksyms_64.c          |  2 +
 arch/x86/lib/copy_user_64.S               | 91 +++++++++++++++++++++++++++++++
 arch/x86/mm/extable.c                     | 16 ++++++
 include/asm-generic/vmlinux.lds.h         |  6 ++
 include/linux/module.h                    |  1 +
 kernel/extable.c                          | 14 +++++
 11 files changed, 168 insertions(+), 5 deletions(-)

-- 
2.1.4


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
  2015-11-09 18:26 [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
                   ` (2 preceding siblings ...)
  2015-11-06 21:08 ` [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks Tony Luck
@ 2015-11-09 18:48 ` Tony Luck
  2015-11-10 11:21 ` Borislav Petkov
  4 siblings, 0 replies; 29+ messages in thread
From: Tony Luck @ 2015-11-09 18:48 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Linux Kernel Mailing List, Linux Edac Mailing List, X86-ML

[-- Attachment #1: Type: text/plain, Size: 365 bytes --]

On Mon, Nov 9, 2015 at 10:26 AM, Tony Luck <tony.luck@intel.com> wrote:
> This is a first draft to show the direction I'm taking to
> make it possible for the kernel to recover from machine
> checks taken while kernel code is executing.

Simple test case to show it actually works.  You need a Xeon E7 class system
and to enable error injection in the BIOS.

-Tony

[-- Attachment #2: mcsafe_memcpy_test.tgz --]
[-- Type: application/x-gzip, Size: 1046 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
  2015-11-09 18:26 [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
                   ` (3 preceding siblings ...)
  2015-11-09 18:48 ` [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
@ 2015-11-10 11:21 ` Borislav Petkov
  2015-11-10 21:55   ` Luck, Tony
  4 siblings, 1 reply; 29+ messages in thread
From: Borislav Petkov @ 2015-11-10 11:21 UTC (permalink / raw)
  To: Tony Luck; +Cc: linux-kernel, linux-edac, x86

On Mon, Nov 09, 2015 at 10:26:08AM -0800, Tony Luck wrote:
> This is a first draft to show the direction I'm taking to
> make it possible for the kernel to recover from machine
> checks taken while kernel code is executing.

Just a general, why-do-we-do-this, question: on big systems, the memory
occupied by the kernel is a very small percentage compared to whole RAM,
right? And yet we want to recover from there too? Not, say, kexec...

> Note that I also fudge the return value.  I'd like in the future
> to be able to write a "mcsafe_copy_from_user()" function that
> would be annotated both for page faults, to return a count of
> bytes uncopied, or an indication that there was a machine check.
> Hence the BIT(63) bit.  Internal feedback suggested we'd need
> some IS_ERR() like macros to help users decode what happened
> to take the right action.  But this is "RFC" to see if people
> have better ideas on how to handle this.

Hmm, shouldn't this be using MF_ACTION_REQUIRED or even maybe a new MF_
flag which is converted into a BUS_MCEERR_AR si_code and thus current
gets a signal?

Only setting bit 63 looks a bit flaky to me...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-11-06 20:57 ` [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables Tony Luck
@ 2015-11-10 11:21   ` Borislav Petkov
  2015-11-10 22:05     ` Luck, Tony
  2015-11-12  4:14   ` Andy Lutomirski
  1 sibling, 1 reply; 29+ messages in thread
From: Borislav Petkov @ 2015-11-10 11:21 UTC (permalink / raw)
  To: Tony Luck; +Cc: linux-kernel, linux-edac, x86

On Fri, Nov 06, 2015 at 12:57:03PM -0800, Tony Luck wrote:
> Copy the existing page fault fixup mechanisms to create a new table
> to be used when fixing machine checks. Note:
> 1) At this time we only provide a macro to annotate assembly code
> 2) We assume all fixups will be in code built in to the kernel.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/asm.h        |  7 +++++++
>  arch/x86/include/asm/uaccess.h    |  1 +
>  arch/x86/mm/extable.c             | 16 ++++++++++++++++
>  include/asm-generic/vmlinux.lds.h |  6 ++++++
>  include/linux/module.h            |  1 +
>  kernel/extable.c                  | 14 ++++++++++++++
>  6 files changed, 45 insertions(+)
> 
> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> index 189679aba703..f2fa7973f18f 100644
> --- a/arch/x86/include/asm/asm.h
> +++ b/arch/x86/include/asm/asm.h
> @@ -58,6 +58,13 @@
>  	.long (to) - . + 0x7ffffff0 ;				\
>  	.popsection
>  
> +# define _ASM_MCEXTABLE(from, to)				\

Maybe add an intermediary macro which abstracts the table name:

#define __ASM_EXTABLE(from, to, table)
...

and then do

#define _ASM_EXTABLE(from, to)		__ASM_EXTABLE(from, to, "__ex_table")
#define _ASM_MCEXTABLE(from, to)	__ASM_EXTABLE(from, to, "__mcex_table")

> +	.pushsection "__mcex_table", "a" ;			\
> +	.balign 8 ;						\
> +	.long (from) - . ;					\
> +	.long (to) - . ;					\
> +	.popsection
> +
>  # define _ASM_NOKPROBE(entry)					\
>  	.pushsection "_kprobe_blacklist","aw" ;			\
>  	_ASM_ALIGN ;						\
> diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
> index a8df874f3e88..b8231301a224 100644
> --- a/arch/x86/include/asm/uaccess.h
> +++ b/arch/x86/include/asm/uaccess.h
> @@ -111,6 +111,7 @@ struct exception_table_entry {
>  #define ARCH_HAS_SEARCH_EXTABLE
>  
>  extern int fixup_exception(struct pt_regs *regs);
> +extern int fixup_mcexception(struct pt_regs *regs);
>  extern int early_fixup_exception(unsigned long *ip);
>  
>  /*
> diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
> index 903ec1e9c326..5b328ae00365 100644
> --- a/arch/x86/mm/extable.c
> +++ b/arch/x86/mm/extable.c
> @@ -49,6 +49,22 @@ int fixup_exception(struct pt_regs *regs)
>  	return 0;
>  }
>  
> +int fixup_mcexception(struct pt_regs *regs)
> +{
> +	const struct exception_table_entry *fixup;
> +	unsigned long new_ip;
> +
> +	fixup = search_mcexception_tables(regs->ip);
> +	if (fixup) {
> +		new_ip = ex_fixup_addr(fixup);
> +
> +		regs->ip = new_ip;
> +		return 1;
> +	}
> +
> +	return 0;
> +}

Yeah, all that duplication might raise some brows but I'd guess
special-handling MCA in the normal exception paths might make the code
a bit too ugly...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-11-06 21:01 ` [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas Tony Luck
@ 2015-11-10 11:21   ` Borislav Petkov
  2015-11-10 22:11     ` Luck, Tony
  2015-11-12  4:19   ` Andy Lutomirski
  1 sibling, 1 reply; 29+ messages in thread
From: Borislav Petkov @ 2015-11-10 11:21 UTC (permalink / raw)
  To: Tony Luck; +Cc: linux-kernel, linux-edac, x86

On Fri, Nov 06, 2015 at 01:01:55PM -0800, Tony Luck wrote:
> Extend the severity checking code to add a new context IN_KERNEL_RECOV
> which is used to indicate that the machine check was triggered by code
> in the kernel with a fixup entry.
> 
> Add code to check for this situation and respond by altering the return
> IP to the fixup address and changing the regs->ax so that the recovery
> code knows the physical address of the error. Note that we also set bit
> 63 because 0x0 is a legal physical address.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/kernel/cpu/mcheck/mce-severity.c | 19 +++++++++++++++++--
>  arch/x86/kernel/cpu/mcheck/mce.c          | 13 ++++++++++---
>  2 files changed, 27 insertions(+), 5 deletions(-)

...

> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 9d014b82a124..472d11150b7a 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -31,6 +31,7 @@
>  #include <linux/types.h>
>  #include <linux/slab.h>
>  #include <linux/init.h>
> +#include <linux/module.h>
>  #include <linux/kmod.h>
>  #include <linux/poll.h>
>  #include <linux/nmi.h>
> @@ -1132,9 +1133,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)

You could save a precious indentation level here:

	if (cfg->tolerant == 3)
		goto clear;

and add the "clear" label below.

clear:
        if (worst > 0)
                mce_report_event(regs);
        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0)

>  		if (no_way_out)
>  			mce_panic("Fatal machine check on current CPU", &m, msg);
>  		if (worst == MCE_AR_SEVERITY) {
> -			recover_paddr = m.addr;
> -			if (!(m.mcgstatus & MCG_STATUS_RIPV))
> -				flags |= MF_MUST_KILL;
> +			if ((m.cs & 3) == 3) {
> +				recover_paddr = m.addr;
> +				if (!(m.mcgstatus & MCG_STATUS_RIPV))
> +					flags |= MF_MUST_KILL;
> +			} else if (fixup_mcexception(regs)) {
> +				regs->ax = BIT(63) | m.addr;
> +			} else
> +				mce_panic("Failed kernel mode recovery",
> +					  &m, NULL);
>  		} else if (kill_it) {
>  			force_sig(SIGBUS, current);
>  		}
> -- 
> 2.1.4
> 
> 

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
  2015-11-10 11:21 ` Borislav Petkov
@ 2015-11-10 21:55   ` Luck, Tony
  2015-11-11 20:41     ` Borislav Petkov
  0 siblings, 1 reply; 29+ messages in thread
From: Luck, Tony @ 2015-11-10 21:55 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

On Tue, Nov 10, 2015 at 12:21:01PM +0100, Borislav Petkov wrote:
> Just a general, why-do-we-do-this, question: on big systems, the memory
> occupied by the kernel is a very small percentage compared to whole RAM,
> right? And yet we want to recover from there too? Not, say, kexec...

I need to add more to the motivation part of this. The people who want
this are playing with NVDIMMs as storage. So think of many GBytes of
non-volatile memory on the source end of the memcpy(). People are used
to disk errors just giving them a -EIO error. They'll be unhappy if an
NVDIMM error crashes the machine.

> > Note that I also fudge the return value.  I'd like in the future
> > to be able to write a "mcsafe_copy_from_user()" function that
> > would be annotated both for page faults, to return a count of
> > bytes uncopied, or an indication that there was a machine check.
> > Hence the BIT(63) bit.  Internal feedback suggested we'd need
> > some IS_ERR() like macros to help users decode what happened
> > to take the right action.  But this is "RFC" to see if people
> > have better ideas on how to handle this.
> 
> Hmm, shouldn't this be using MF_ACTION_REQUIRED or even maybe a new MF_
> flag which is converted into a BUS_MCEERR_AR si_code and thus current
> gets a signal?
> 
> Only setting bit 63 looks a bit flaky to me...

It will be up to the caller to figure out what action to take. In
the NVDIMM filesystem scenario outlined above the result may be -EIO
for a data block ... something more drastic if we were reading metadata.

When I get around to writing mcsafe_copy_from_user() the code might
end up like:

some_syscall_e_g_write(void __user *buf, size_t cnt)
{
	u64 ret;

	ret = mcsafe_copy_from_user(kbuf, buf, cnt);

	if (ret & BIT(63)) {
		do some machine check thing ... e.g.
		send a SIGBUS to this process and return -EINTR
		This is where we use the address (after converting
		back to a user virtual address).
	} else if (ret) {
		user gave us a bad buffer: return -EFAULT
	} else {
		success!!!
	}
}

Which all looks quite ugly in long-hand ... I'm hoping that with
some pretty macros we can make it pretty.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-11-10 11:21   ` Borislav Petkov
@ 2015-11-10 22:05     ` Luck, Tony
  0 siblings, 0 replies; 29+ messages in thread
From: Luck, Tony @ 2015-11-10 22:05 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

On Tue, Nov 10, 2015 at 12:21:16PM +0100, Borislav Petkov wrote:
> > +# define _ASM_MCEXTABLE(from, to)				\
> 
> Maybe add an intermediary macro which abstracts the table name:
> 
> #define __ASM_EXTABLE(from, to, table)
> ...
> 
> and then do
> 
> #define _ASM_EXTABLE(from, to)		__ASM_EXTABLE(from, to, "__ex_table")
> #define _ASM_MCEXTABLE(from, to)	__ASM_EXTABLE(from, to, "__mcex_table")

That looks a bit nicer.
> 
> Yeah, all that duplication might raise some brows but I'd guess
> special-handling MCA in the normal exception paths might make the code
> a bit too ugly...

The 0-day robot berated me for bloating the i386-tinyconfig by 88 bytes.
I guess I can put the new functions inside #ifdef CONFIG_MEMORY_FAILURE
to save them from that. Enterprise kernels that turn this option on can
probably live with 88 bytes.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-11-10 11:21   ` Borislav Petkov
@ 2015-11-10 22:11     ` Luck, Tony
  2015-11-11 11:01       ` Borislav Petkov
  0 siblings, 1 reply; 29+ messages in thread
From: Luck, Tony @ 2015-11-10 22:11 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

On Tue, Nov 10, 2015 at 12:21:42PM +0100, Borislav Petkov wrote:
> You could save a precious indentation level here:
> 
> 	if (cfg->tolerant == 3)
> 		goto clear;
> 
> and add the "clear" label below.
> 
> clear:
>         if (worst > 0)
>                 mce_report_event(regs);
>         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0)
> 
> >  		if (no_way_out)
> >  			mce_panic("Fatal machine check on current CPU", &m, msg);
> >  		if (worst == MCE_AR_SEVERITY) {
> > -			recover_paddr = m.addr;
> > -			if (!(m.mcgstatus & MCG_STATUS_RIPV))
> > -				flags |= MF_MUST_KILL;
> > +			if ((m.cs & 3) == 3) {
> > +				recover_paddr = m.addr;
> > +				if (!(m.mcgstatus & MCG_STATUS_RIPV))
> > +					flags |= MF_MUST_KILL;
> > +			} else if (fixup_mcexception(regs)) {
> > +				regs->ax = BIT(63) | m.addr;
> > +			} else
> > +				mce_panic("Failed kernel mode recovery",
> > +					  &m, NULL);
> >  		} else if (kill_it) {
> >  			force_sig(SIGBUS, current);
> >  		}

That would be tidier ... the inside of the "if" has been gradually growing
with added recovery paths.  I had to fold the mce_panic() line to shut
checkpatch up.

But I'm not really sure what tolerant==3 people really want here. By skipping
the recovery code they doom themselves to hitting the machine check again.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-11-10 22:11     ` Luck, Tony
@ 2015-11-11 11:01       ` Borislav Petkov
  0 siblings, 0 replies; 29+ messages in thread
From: Borislav Petkov @ 2015-11-11 11:01 UTC (permalink / raw)
  To: Luck, Tony; +Cc: linux-kernel, linux-edac, x86

On Tue, Nov 10, 2015 at 02:11:35PM -0800, Luck, Tony wrote:
> That would be tidier ... the inside of the "if" has been gradually growing
> with added recovery paths.  I had to fold the mce_panic() line to shut
> checkpatch up.

Bah, I don't take checkpatch seriously anymore. In that case, you
could've left the line stick out, for all I know. Our monitors can do
more pixels since the 1980s I believe. :)

> But I'm not really sure what tolerant==3 people really want here. By skipping
> the recovery code they doom themselves to hitting the machine check again.

Looks like a loop to me, fun.

I guess the user application should be aware of BUS_MCEERR_AR and
not attempt the same access to not cause a loop with tolerant==3.
Alternatively, we can still kill it to prevent the looping...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
  2015-11-10 21:55   ` Luck, Tony
@ 2015-11-11 20:41     ` Borislav Petkov
  2015-11-11 21:48       ` Luck, Tony
  0 siblings, 1 reply; 29+ messages in thread
From: Borislav Petkov @ 2015-11-11 20:41 UTC (permalink / raw)
  To: Luck, Tony; +Cc: linux-kernel, linux-edac, x86

On Tue, Nov 10, 2015 at 01:55:46PM -0800, Luck, Tony wrote:
> I need to add more to the motivation part of this. The people who want
> this are playing with NVDIMMs as storage. So think of many GBytes of
> non-volatile memory on the source end of the memcpy(). People are used
> to disk errors just giving them a -EIO error. They'll be unhappy if an
> NVDIMM error crashes the machine.

Ah.

Btw, there's no flag, by chance, somewhere in the MCA regs bunch at
error time which says that the error is originating from NVDIMM? Because
if there were, this patchset is moot. :)

> It will be up to the caller to figure out what action to take. In
> the NVDIMM filesystem scenario outlined above the result may be -EIO
> for a data block ... something more drastic if we were reading metadata.
> 
> When I get around to writing mcsafe_copy_from_user() the code might
> end up like:
> 
> some_syscall_e_g_write(void __user *buf, size_t cnt)
> {
> 	u64 ret;
> 
> 	ret = mcsafe_copy_from_user(kbuf, buf, cnt);
> 
> 	if (ret & BIT(63)) {
> 		do some machine check thing ... e.g.
> 		send a SIGBUS to this process and return -EINTR
> 		This is where we use the address (after converting
> 		back to a user virtual address).
> 	} else if (ret) {
> 		user gave us a bad buffer: return -EFAULT
> 	} else {
> 		success!!!
> 	}
> }

Oh ok, so bit 63 doesn't leave the kernel. Then it's all fine,
nevermind.

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
  2015-11-11 20:41     ` Borislav Petkov
@ 2015-11-11 21:48       ` Luck, Tony
  2015-11-11 22:28         ` Borislav Petkov
  0 siblings, 1 reply; 29+ messages in thread
From: Luck, Tony @ 2015-11-11 21:48 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

On Wed, Nov 11, 2015 at 09:41:58PM +0100, Borislav Petkov wrote:
> On Tue, Nov 10, 2015 at 01:55:46PM -0800, Luck, Tony wrote:
> > I need to add more to the motivation part of this. The people who want
> > this are playing with NVDIMMs as storage. So think of many GBytes of
> > non-volatile memory on the source end of the memcpy(). People are used
> > to disk errors just giving them a -EIO error. They'll be unhappy if an
> > NVDIMM error crashes the machine.
> 
> Ah.
> 
> Btw, there's no flag, by chance, somewhere in the MCA regs bunch at
> error time which says that the error is originating from NVDIMM? Because
> if there were, this patchset is moot. :)

No flag. We can search MCi_ADDR across the ranges to see whether this
was a normal RAM error or one on non-volatile memory. But that doesn't make this patch
moot. We still need to change the return address to go to the fixup code
instead of back to the place where we hit the error. The exception table
is a list of pairs of instruction pointers:

   [Instruction-that-may-fault, Address-of-fixup-code]

In my RFC code I only have one function that can fault, and all the fixup
addresses point to the same place. But that doesn't scale to adding more
functions (like mcsafe_copy_from_user()).

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
  2015-11-11 21:48       ` Luck, Tony
@ 2015-11-11 22:28         ` Borislav Petkov
  2015-11-11 22:32           ` Luck, Tony
  0 siblings, 1 reply; 29+ messages in thread
From: Borislav Petkov @ 2015-11-11 22:28 UTC (permalink / raw)
  To: Luck, Tony; +Cc: linux-kernel, linux-edac, x86

On Wed, Nov 11, 2015 at 01:48:04PM -0800, Luck, Tony wrote:
> No flag. We can search MCi_ADDR across the ranges to see whether this
> was a normal RAM error or one on non-volatile memory. But that doesn't make this patch
> moot. We still need to change the return address to go to the fixup code
> instead of back to the place where we hit the error.

Why?

If you know that it is in the nvdimm range, you can grade the error with
lower severity...

Or do you mean that without the exception table we'll return back to the
insn causing the error and loop indefinitely this way?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [RFC PATCH 0/3] Machine check recovery when kernel accesses poison
  2015-11-11 22:28         ` Borislav Petkov
@ 2015-11-11 22:32           ` Luck, Tony
  0 siblings, 0 replies; 29+ messages in thread
From: Luck, Tony @ 2015-11-11 22:32 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 626 bytes --]

> If you know that it is in the nvdimm range, you can grade the error with
> lower severity...

Grading the severity isn't the main issue.

> Or do you mean that without the exception table we'll return back to the
> insn causing the error and loop indefinitely this way?

Yes. We need to NOT return to the instruction that faulted. We need to pick
a different return address. We need to do that inside the #MC handler.

The exception table does that for us.

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-11-06 20:57 ` [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables Tony Luck
  2015-11-10 11:21   ` Borislav Petkov
@ 2015-11-12  4:14   ` Andy Lutomirski
  2015-11-12 19:44     ` Luck, Tony
  1 sibling, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2015-11-12  4:14 UTC (permalink / raw)
  To: Tony Luck, Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

On 11/06/2015 12:57 PM, Tony Luck wrote:
> Copy the existing page fault fixup mechanisms to create a new table
> to be used when fixing machine checks. Note:
> 1) At this time we only provide a macro to annotate assembly code
> 2) We assume all fixups will be in code built into the kernel.

Shouldn't the first step be to fixup failures during user memory access?

>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>   arch/x86/include/asm/asm.h        |  7 +++++++
>   arch/x86/include/asm/uaccess.h    |  1 +
>   arch/x86/mm/extable.c             | 16 ++++++++++++++++
>   include/asm-generic/vmlinux.lds.h |  6 ++++++
>   include/linux/module.h            |  1 +
>   kernel/extable.c                  | 14 ++++++++++++++
>   6 files changed, 45 insertions(+)
>
> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> index 189679aba703..f2fa7973f18f 100644
> --- a/arch/x86/include/asm/asm.h
> +++ b/arch/x86/include/asm/asm.h
> @@ -58,6 +58,13 @@
>   	.long (to) - . + 0x7ffffff0 ;				\
>   	.popsection
>
> +# define _ASM_MCEXTABLE(from, to)				\
> +	.pushsection "__mcex_table", "a" ;			\
> +	.balign 8 ;						\
> +	.long (from) - . ;					\
> +	.long (to) - . ;					\
> +	.popsection
> +

This does something really weird to rax.  (Also, what happens on 32-bit 
kernels?  There's no bit 63.)

Please at least document it clearly.

--Andy

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-11-06 21:01 ` [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas Tony Luck
  2015-11-10 11:21   ` Borislav Petkov
@ 2015-11-12  4:19   ` Andy Lutomirski
  2015-11-12 19:55     ` Luck, Tony
  1 sibling, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2015-11-12  4:19 UTC (permalink / raw)
  To: Tony Luck, Borislav Petkov; +Cc: linux-kernel, linux-edac, x86

On 11/06/2015 01:01 PM, Tony Luck wrote:
> Extend the severity checking code to add a new context IN_KERN_RECOV
> which is used to indicate that the machine check was triggered by code
> in the kernel with a fixup entry.
>
> Add code to check for this situation and respond by altering the return
> IP to the fixup address and changing the regs->ax so that the recovery
> code knows the physical address of the error. Note that we also set bit
> 63 because 0x0 is a legal physical address.
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>   arch/x86/kernel/cpu/mcheck/mce-severity.c | 19 +++++++++++++++++--
>   arch/x86/kernel/cpu/mcheck/mce.c          | 13 ++++++++++---
>   2 files changed, 27 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
> index 9c682c222071..1e83842310e8 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
> @@ -12,6 +12,7 @@
>   #include <linux/kernel.h>
>   #include <linux/seq_file.h>
>   #include <linux/init.h>
> +#include <linux/module.h>
>   #include <linux/debugfs.h>
>   #include <asm/mce.h>
>
> @@ -29,7 +30,7 @@
>    * panic situations)
>    */
>
> -enum context { IN_KERNEL = 1, IN_USER = 2 };
> +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
>   enum ser { SER_REQUIRED = 1, NO_SER = 2 };
>   enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
>
> @@ -48,6 +49,7 @@ static struct severity {
>   #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
>   #define  KERNEL		.context = IN_KERNEL
>   #define  USER		.context = IN_USER
> +#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
>   #define  SER		.ser = SER_REQUIRED
>   #define  NOSER		.ser = NO_SER
>   #define  EXCP		.excp = EXCP_CONTEXT
> @@ -87,6 +89,10 @@ static struct severity {
>   		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
>   		),
>   	MCESEV(
> +		PANIC, "In kernel and no restart IP",
> +		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
> +		),
> +	MCESEV(
>   		DEFERRED, "Deferred error",
>   		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
>   		),
> @@ -123,6 +129,11 @@ static struct severity {
>   		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
>   		),
>   	MCESEV(
> +		AR, "Action required: data load error recoverable area of kernel",
> +		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
> +		KERNEL_RECOV
> +		),
> +	MCESEV(
>   		AR, "Action required: data load error in a user process",
>   		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
>   		USER
> @@ -183,7 +194,11 @@ static struct severity {
>    */
>   static int error_context(struct mce *m)
>   {
> -	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
> +	if ((m->cs & 3) == 3)
> +		return IN_USER;
> +	if (search_mcexception_tables(m->ip))
> +		return IN_KERNEL_RECOV;
> +	return IN_KERNEL;
>   }
>
>   /*
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 9d014b82a124..472d11150b7a 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -31,6 +31,7 @@
>   #include <linux/types.h>
>   #include <linux/slab.h>
>   #include <linux/init.h>
> +#include <linux/module.h>
>   #include <linux/kmod.h>
>   #include <linux/poll.h>
>   #include <linux/nmi.h>
> @@ -1132,9 +1133,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
>   		if (no_way_out)
>   			mce_panic("Fatal machine check on current CPU", &m, msg);
>   		if (worst == MCE_AR_SEVERITY) {
> -			recover_paddr = m.addr;
> -			if (!(m.mcgstatus & MCG_STATUS_RIPV))
> -				flags |= MF_MUST_KILL;
> +			if ((m.cs & 3) == 3) {
> +				recover_paddr = m.addr;
> +				if (!(m.mcgstatus & MCG_STATUS_RIPV))
> +					flags |= MF_MUST_KILL;
> +			} else if (fixup_mcexception(regs)) {
> +				regs->ax = BIT(63) | m.addr;
> +			} else
> +				mce_panic("Failed kernel mode recovery",
> +					  &m, NULL);

Maybe I'm misunderstanding this, but presumably you shouldn't call 
fixup_mcexception unless you've first verified RIPV (i.e. that the ip 
you're looking up in the table is valid).

Also... I find the general flow of this code very hard to follow.  It's 
critical that an MCE hitting kernel mode not get as far as 
ist_begin_non_atomic.  It was already hard enough to tell that the code 
follows that rule, and now it's even harder.  Would it make sense to add 
clear assertions that m.cs == regs->cs and that user_mode(regs) when you 
get to the end?  Simplifying the control flow might also be nice.

>   		} else if (kill_it) {
>   			force_sig(SIGBUS, current);
>   		}
>

I would argue that this should happen in the non-atomic section.  It's 
probably okay as long as we came from user mode, but it's more obviously 
safe in the non-atomic section.

--Andy


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-11-06 21:08 ` [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks Tony Luck
@ 2015-11-12  7:53   ` Ingo Molnar
  2015-11-12 20:01     ` Luck, Tony
  0 siblings, 1 reply; 29+ messages in thread
From: Ingo Molnar @ 2015-11-12  7:53 UTC (permalink / raw)
  To: Tony Luck; +Cc: Borislav Petkov, linux-kernel, linux-edac, x86


* Tony Luck <tony.luck@intel.com> wrote:

> Using __copy_user_nocache() as inspiration create a memory copy
> routine for use by kernel code with annotations to allow for
> recovery from machine checks.
> 
> Notes:
> 1) Unlike the original we make no attempt to copy all the bytes
>    up to the faulting address. The original achieves that by
>    re-executing the failing part as a byte-by-byte copy,
>    which will take another page fault. We don't want to have
>    a second machine check!
> 2) Likewise the return value for the original indicates exactly
>    how many bytes were not copied. Instead we provide the physical
>    address of the fault (thanks to help from do_machine_check()

> +extern phys_addr_t mcsafe_memcpy(void *dst, const void __user *src,
> +				unsigned size);

So what's the longer term purpose, where will mcsafe_memcpy() be used?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-11-12  4:14   ` Andy Lutomirski
@ 2015-11-12 19:44     ` Luck, Tony
  2015-11-12 20:04       ` Andy Lutomirski
  0 siblings, 1 reply; 29+ messages in thread
From: Luck, Tony @ 2015-11-12 19:44 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, linux-kernel, linux-edac, x86,
	Dan Williams <dan.j.williams@intel.com>

On Wed, Nov 11, 2015 at 08:14:56PM -0800, Andy Lutomirski wrote:
> On 11/06/2015 12:57 PM, Tony Luck wrote:
> >Copy the existing page fault fixup mechanisms to create a new table
> >to be used when fixing machine checks. Note:
> >1) At this time we only provide a macro to annotate assembly code
> >2) We assume all fixups will be in code built into the kernel.
> 
> Shouldn't the first step be to fixup failures during user memory access?

We already have code to recover from machine checks encountered
while the processor is executing ring3 code.

This series is gently extending to ring0 code in some places that look
to be high enough profile to warrant the attention (and that we have
some plan for a recovery action). Initial user will be filesystem code
using NVDIMM as storage. I.e. lots of memory accessed by a small amount
of code. If we get a machine check reading the NVDIMM, then we turn it
into -EIO.

> This does something really weird to rax.  (Also, what happens on 32-bit
> kernels?  There's no bit 63.)

32-bit kernels are out of luck for this - but I don't feel bad about it -
you simply cannot run a 32-bit kernel on machines that have this level
of recovery (they have too much memory to boot 32-bit kernels).

> Please at least document it clearly.

Will do.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-11-12  4:19   ` Andy Lutomirski
@ 2015-11-12 19:55     ` Luck, Tony
  0 siblings, 0 replies; 29+ messages in thread
From: Luck, Tony @ 2015-11-12 19:55 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Borislav Petkov, linux-kernel, linux-edac, x86

On Wed, Nov 11, 2015 at 08:19:35PM -0800, Andy Lutomirski wrote:
> >@@ -1132,9 +1133,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
> >  		if (no_way_out)
> >  			mce_panic("Fatal machine check on current CPU", &m, msg);
> >  		if (worst == MCE_AR_SEVERITY) {
> >-			recover_paddr = m.addr;
> >-			if (!(m.mcgstatus & MCG_STATUS_RIPV))
> >-				flags |= MF_MUST_KILL;
> >+			if ((m.cs & 3) == 3) {
> >+				recover_paddr = m.addr;
> >+				if (!(m.mcgstatus & MCG_STATUS_RIPV))
> >+					flags |= MF_MUST_KILL;
> >+			} else if (fixup_mcexception(regs)) {
> >+				regs->ax = BIT(63) | m.addr;
> >+			} else
> >+				mce_panic("Failed kernel mode recovery",
> >+					  &m, NULL);
> 
> Maybe I'm misunderstanding this, but presumably you shouldn't call
> fixup_mcexception unless you've first verified RIPV (i.e. that the ip you're
> looking up in the table is valid).

Good point. We can only arrive here with a AR_SEVERITY from some
kernel code if the code in mce_severity.c assigned that severity.
But it doesn't currently look at RIPV ... I should make it do that.
Actually I'll check for both RIPV and EIPV: we don't need to look for
a fixup entry for all the innocent bystander cpus that got dragged
into the exception handler because the exception was broadcast to
everyone.

> Also... I find the general flow of this code very hard to follow.  It's
> critical that an MCE hitting kernel mode not get as far as
> ist_begin_non_atomic.  It was already hard enough to tell that the code
> follows that rule, and now it's even harder.  Would it make sense to add
> clear assertions that m.cs == regs->cs and that user_mode(regs) when you get
> to the end?  Simplifying the control flow might also be nice.

Yes. This is a mess now. It works (because we only set recover_paddr
in the user mode case ... so we'll take the "goto done" for the kernel
case). But I agree that this is far from obvious.

> >  		} else if (kill_it) {
> >  			force_sig(SIGBUS, current);
> >  		}
> >
> 
> I would argue that this should happen in the non-atomic section.  It's
> probably okay as long as we came from user mode, but it's more obviously
> safe in the non-atomic section.

Will look at relocating this too when I restructure the tail of the
function.

Thanks for the review.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-11-12  7:53   ` Ingo Molnar
@ 2015-11-12 20:01     ` Luck, Tony
  2015-11-27 10:16       ` Ingo Molnar
  0 siblings, 1 reply; 29+ messages in thread
From: Luck, Tony @ 2015-11-12 20:01 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Borislav Petkov, linux-kernel, linux-edac, x86

On Thu, Nov 12, 2015 at 08:53:13AM +0100, Ingo Molnar wrote:
> > +extern phys_addr_t mcsafe_memcpy(void *dst, const void __user *src,
> > +				unsigned size);
> 
> So what's the longer term purpose, where will mcsafe_memcpy() be used?

The initial plan is to use this for file systems backed by NVDIMMs. They
will have a large amount of memory, and we have a practical recovery
path - return -EIO just like legacy h/w.

We can look for other places in the kernel where we read large amounts
of memory and have some idea how to recover if the memory turns out to
be bad.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-11-12 19:44     ` Luck, Tony
@ 2015-11-12 20:04       ` Andy Lutomirski
  2015-11-12 21:17         ` Luck, Tony
  0 siblings, 1 reply; 29+ messages in thread
From: Andy Lutomirski @ 2015-11-12 20:04 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Andy Lutomirski, Borislav Petkov, linux-kernel, linux-edac,
	X86 ML, Dan Williams <dan.j.williams@intel.com>

On Thu, Nov 12, 2015 at 11:44 AM, Luck, Tony <tony.luck@intel.com> wrote:
> On Wed, Nov 11, 2015 at 08:14:56PM -0800, Andy Lutomirski wrote:
>> On 11/06/2015 12:57 PM, Tony Luck wrote:
>> >Copy the existing page fault fixup mechanisms to create a new table
>> >to be used when fixing machine checks. Note:
>> >1) At this time we only provide a macro to annotate assembly code
>> >2) We assume all fixups will be in code built into the kernel.
>>
>> Shouldn't the first step be to fixup failures during user memory access?
>
> We already have code to recover from machine checks encountered
> while the processor is executing ring3 code.

I meant failures during copy_from_user, copy_to_user, etc.

--Andy

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-11-12 20:04       ` Andy Lutomirski
@ 2015-11-12 21:17         ` Luck, Tony
  0 siblings, 0 replies; 29+ messages in thread
From: Luck, Tony @ 2015-11-12 21:17 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Borislav Petkov, linux-kernel, linux-edac,
	X86 ML, Dan Williams <dan.j.williams@intel.com>

On Thu, Nov 12, 2015 at 12:04:36PM -0800, Andy Lutomirski wrote:
> > We already have code to recover from machine checks encountered
> > while the processor is executing ring3 code.
> 
> I meant failures during copy_from_user, copy_to_user, etc.

Yes.  copy_from_user() will be pretty interesting from a coverage point
of view.  We can recover by sending a SIGBUS to the process just like
we would have if the process had accessed the data directly rather than
passing the address to the kernel to access it.

copy_to_user() is a lot harder. The machine check is on the kernel side
of the copy. If we are copying from page cache as part of a read(2)
syscall from a regular file we can probably nuke the page from the cache
and return -EIO to the user.  Other cases may be possible, but I don't
immediately see any way to do it as a general case.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-11-12 20:01     ` Luck, Tony
@ 2015-11-27 10:16       ` Ingo Molnar
  2015-12-08 21:30         ` Dan Williams
  0 siblings, 1 reply; 29+ messages in thread
From: Ingo Molnar @ 2015-11-27 10:16 UTC (permalink / raw)
  To: Luck, Tony; +Cc: Borislav Petkov, linux-kernel, linux-edac, x86


* Luck, Tony <tony.luck@intel.com> wrote:

> On Thu, Nov 12, 2015 at 08:53:13AM +0100, Ingo Molnar wrote:
> > > +extern phys_addr_t mcsafe_memcpy(void *dst, const void __user *src,
> > > +				unsigned size);
> > 
> > So what's the longer term purpose, where will mcsafe_memcpy() be used?
> 
> The initial plan is to use this for file systems backed by NVDIMMs. They will 
> have a large amount of memory, and we have a practical recovery path - return 
> -EIO just like legacy h/w.
> 
> We can look for other places in the kernel where we read large amounts of memory 
> and have some idea how to recover if the memory turns out to be bad.

I see, that's sensible!

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-11-27 10:16       ` Ingo Molnar
@ 2015-12-08 21:30         ` Dan Williams
  2015-12-08 22:08             ` Luck, Tony
  2015-12-14  9:55           ` Ingo Molnar
  0 siblings, 2 replies; 29+ messages in thread
From: Dan Williams @ 2015-12-08 21:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Luck, Tony, Borislav Petkov, Linux Kernel Mailing List,
	linux-edac, the arch/x86 maintainers, Jens Axboe, Verma,
	Vishal L, Jeff Moyer, linux-nvdimm@lists.01.org

[ adding nvdimm folks ]

On Fri, Nov 27, 2015 at 2:16 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Luck, Tony <tony.luck@intel.com> wrote:
>
>> On Thu, Nov 12, 2015 at 08:53:13AM +0100, Ingo Molnar wrote:
>> > > +extern phys_addr_t mcsafe_memcpy(void *dst, const void __user *src,
>> > > +                         unsigned size);
>> >
>> > So what's the longer term purpose, where will mcsafe_memcpy() be used?
>>
>> The initial plan is to use this for file systems backed by NVDIMMs. They will
>> have a large amount of memory, and we have a practical recovery path - return
>> -EIO just like legacy h/w.
>>
>> We can look for other places in the kernel where we read large amounts of memory
>> and have some idea how to recover if the memory turns out to be bad.
>
> I see, that's sensible!
>
> Thanks,
>
>         Ingo

Is that an "Acked-by"?  I'd like to pull this plus Vishal's
gendisk-badblocks patches into a unified libnvdimm-error-handling
branch.  We're looking to have v4.5 able to avoid or survive nvdimm
media errors through the pmem driver and DAX paths.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-08 21:30         ` Dan Williams
@ 2015-12-08 22:08             ` Luck, Tony
  2015-12-14  9:55           ` Ingo Molnar
  1 sibling, 0 replies; 29+ messages in thread
From: Luck, Tony @ 2015-12-08 22:08 UTC (permalink / raw)
  To: Williams, Dan J, Ingo Molnar
  Cc: Borislav Petkov, Linux Kernel Mailing List, linux-edac,
	the arch/x86 maintainers, Jens Axboe, Verma, Vishal L,
	Jeff Moyer, linux-nvdimm@lists.01.org

> Is that an "Acked-by"?  I'd like to pull this plus Vishal's
> gendisk-badblocks patches into a unified libnvdimm-error-handling
> branch.  We're looking to have v4.5 able to avoid or survive nvdimm
> media errors through the pmem driver and DAX paths.

I'm making a V2 that fixes some build errors for 32-bit and addresses other commentary
from the community. Perhaps a couple more days before I finish it up ready to post.

-Tony

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
@ 2015-12-08 22:08             ` Luck, Tony
  0 siblings, 0 replies; 29+ messages in thread
From: Luck, Tony @ 2015-12-08 22:08 UTC (permalink / raw)
  To: Williams, Dan J, Ingo Molnar
  Cc: Borislav Petkov, Linux Kernel Mailing List, linux-edac,
	the arch/x86 maintainers, Jens Axboe, Verma, Vishal L,
	Jeff Moyer, linux-nvdimm@lists.01.org

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 593 bytes --]

> Is that an "Acked-by"?  I'd like to pull this plus Vishal's
> gendisk-badblocks patches into a unified libnvdimm-error-handling
> branch.  We're looking to have v4.5 able to avoid or survive nvdimm
> media errors through the pmem driver and DAX paths.

I'm making a V2 that fixes some build errors for 32-bit and addresses other commentary
from the community. Perhaps a couple more days before I finish it up ready to post.

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-08 21:30         ` Dan Williams
  2015-12-08 22:08             ` Luck, Tony
@ 2015-12-14  9:55           ` Ingo Molnar
  1 sibling, 0 replies; 29+ messages in thread
From: Ingo Molnar @ 2015-12-14  9:55 UTC (permalink / raw)
  To: Dan Williams
  Cc: Luck, Tony, Borislav Petkov, Linux Kernel Mailing List,
	linux-edac, the arch/x86 maintainers, Jens Axboe, Verma,
	Vishal L, Jeff Moyer, linux-nvdimm@lists.01.org


* Dan Williams <dan.j.williams@intel.com> wrote:

> [ adding nvdimm folks ]
> 
> On Fri, Nov 27, 2015 at 2:16 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > * Luck, Tony <tony.luck@intel.com> wrote:
> >
> >> On Thu, Nov 12, 2015 at 08:53:13AM +0100, Ingo Molnar wrote:
> >> > > +extern phys_addr_t mcsafe_memcpy(void *dst, const void __user *src,
> >> > > +                         unsigned size);
> >> >
> >> > So what's the longer term purpose, where will mcsafe_memcpy() be used?
> >>
> >> The initial plan is to use this for file systems backed by NVDIMMs. They will
> >> have a large amount of memory, and we have a practical recovery path - return
> >> -EIO just like legacy h/w.
> >>
> >> We can look for other places in the kernel where we read large amounts of memory
> >> and have some idea how to recover if the memory turns out to be bad.
> >
> > I see, that's sensible!
> >
> > Thanks,
> >
> >         Ingo
> 
> Is that an "Acked-by"?  I'd like to pull this plus Vishal's
> gendisk-badblocks patches into a unified libnvdimm-error-handling
> branch.  We're looking to have v4.5 able to avoid or survive nvdimm
> media errors through the pmem driver and DAX paths.

So there was some feedback for v2 as well - I'd like to see v3 before an Acked-by.

But yeah, this is progressing in the right direction, and I suspect it's a 
relatively urgent feature from an nvdimm POV?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2015-12-14  9:55 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-09 18:26 [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
2015-11-06 20:57 ` [PATCH 1/3] x86, ras: Add new infrastructure for machine check fixup tables Tony Luck
2015-11-10 11:21   ` Borislav Petkov
2015-11-10 22:05     ` Luck, Tony
2015-11-12  4:14   ` Andy Lutomirski
2015-11-12 19:44     ` Luck, Tony
2015-11-12 20:04       ` Andy Lutomirski
2015-11-12 21:17         ` Luck, Tony
2015-11-06 21:01 ` [PATCH 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas Tony Luck
2015-11-10 11:21   ` Borislav Petkov
2015-11-10 22:11     ` Luck, Tony
2015-11-11 11:01       ` Borislav Petkov
2015-11-12  4:19   ` Andy Lutomirski
2015-11-12 19:55     ` Luck, Tony
2015-11-06 21:08 ` [PATCH 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks Tony Luck
2015-11-12  7:53   ` Ingo Molnar
2015-11-12 20:01     ` Luck, Tony
2015-11-27 10:16       ` Ingo Molnar
2015-12-08 21:30         ` Dan Williams
2015-12-08 22:08           ` Luck, Tony
2015-12-08 22:08             ` Luck, Tony
2015-12-14  9:55           ` Ingo Molnar
2015-11-09 18:48 ` [RFC PATCH 0/3] Machine check recovery when kernel accesses poison Tony Luck
2015-11-10 11:21 ` Borislav Petkov
2015-11-10 21:55   ` Luck, Tony
2015-11-11 20:41     ` Borislav Petkov
2015-11-11 21:48       ` Luck, Tony
2015-11-11 22:28         ` Borislav Petkov
2015-11-11 22:32           ` Luck, Tony

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.