* [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-16 16:39 ` Tony Luck
@ 2015-12-16  1:29   ` Tony Luck
  -1 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2015-12-16  1:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

Copy the existing page fault fixup mechanisms to create a new table
to be used when fixing machine checks. Note:
1) At this time we only provide a macro to annotate assembly code
2) We assume all fixups will be in code built into the kernel.
3) Only for x86_64
4) New code under CONFIG_MCE_KERNEL_RECOVERY (default 'n')
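
For illustration (not part of this patch), an assembly routine would
use the new annotation macro roughly like this, in the same style as
existing _ASM_EXTABLE users ("example_load" is a made-up name):

	/* hypothetical sketch: recover from poison consumed by one load */
	ENTRY(example_load)
	0:	movq (%rdi),%rax	/* load that may machine check */
		ret
		.section .fixup,"ax"
	1:	movq $-1,%rax		/* fixup: tell the caller we failed */
		ret
		.previous
		_ASM_MCEXTABLE(0b,1b)	/* fault at 0: resumes at 1: */
	ENDPROC(example_load)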

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/Kconfig                  | 10 ++++++++++
 arch/x86/include/asm/asm.h        | 10 ++++++++--
 arch/x86/include/asm/mce.h        | 14 ++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c  | 16 ++++++++++++++++
 arch/x86/kernel/vmlinux.lds.S     |  6 +++++-
 arch/x86/mm/extable.c             | 19 +++++++++++++++++++
 include/asm-generic/vmlinux.lds.h | 12 +++++++-----
 7 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96d058a87100..42d26b4d1ec4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
 	  If you don't know what a machine check is and you don't do kernel
 	  QA it is safe to say n.
 
+config MCE_KERNEL_RECOVERY
+	bool "Recovery from machine checks in special kernel memory copy functions"
+	default n
+	depends on X86_MCE && X86_64
+	---help---
+	  This option provides a new memory copy function, mcsafe_memcpy(),
+	  annotated so that the machine check handler can divert execution
+	  to an alternate code path which returns an error to the caller
+	  instead of crashing the system. Say yes if you have a driver that uses this.
+
 config X86_THERMAL_VECTOR
 	def_bool y
 	depends on X86_MCE_INTEL
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..a5d483ac11fa 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -44,13 +44,19 @@
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)					\
-	.pushsection "__ex_table","a" ;				\
+# define __ASM_EXTABLE(from, to, table)				\
+	.pushsection table, "a" ;				\
 	.balign 8 ;						\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
 	.popsection
 
+# define _ASM_EXTABLE(from, to)					\
+	__ASM_EXTABLE(from, to, "__ex_table")
+
+# define _ASM_MCEXTABLE(from, to)				\
+	__ASM_EXTABLE(from, to, "__mcex_table")
+
 # define _ASM_EXTABLE_EX(from,to)				\
 	.pushsection "__ex_table","a" ;				\
 	.balign 8 ;						\
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2dbc0bf2b9f3..9dc11d2a9db1 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -279,4 +279,18 @@ struct cper_sec_mem_err;
 extern void apei_mce_report_mem_error(int corrected,
 				      struct cper_sec_mem_err *mem_err);
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+const struct exception_table_entry *search_mcexception_tables(unsigned long a);
+int fixup_mcexception(struct pt_regs *regs);
+#else
+static inline const struct exception_table_entry *search_mcexception_tables(unsigned long a)
+{
+	return NULL;
+}
+static inline int fixup_mcexception(struct pt_regs *regs)
+{
+	return 0;
+}
+#endif
+
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9d014b82a124..0111cd49ee94 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
 #include <linux/nmi.h>
@@ -2022,8 +2023,23 @@ static int __init mcheck_enable(char *str)
 }
 __setup("mce", mcheck_enable);
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+extern struct exception_table_entry __start___mcex_table[];
+extern struct exception_table_entry __stop___mcex_table[];
+
+/* Given an address, look for it in the machine check exception tables. */
+const struct exception_table_entry *search_mcexception_tables(unsigned long addr)
+{
+	return search_extable(__start___mcex_table, __stop___mcex_table-1, addr);
+}
+#endif
+
 int __init mcheck_init(void)
 {
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+	if (__stop___mcex_table > __start___mcex_table)
+		sort_extable(__start___mcex_table, __stop___mcex_table);
+#endif
 	mcheck_intel_therm_init();
 	mce_register_decode_chain(&mce_srao_nb);
 	mcheck_vendor_init_severity();
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 74e4bf11f562..a65fa0deda06 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -110,7 +110,11 @@ SECTIONS
 
 	NOTES :text :note
 
-	EXCEPTION_TABLE(16) :text = 0x9090
+	EXCEPTION_TABLE(16)
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+	NAMED_EXCEPTION_TABLE(16, mcex)
+#endif
+	:text = 0x9090
 
 #if defined(CONFIG_DEBUG_RODATA)
 	/* .text should occupy whole number of pages */
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..f8f8b72e88af 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,6 +2,7 @@
 #include <linux/spinlock.h>
 #include <linux/sort.h>
 #include <asm/uaccess.h>
+#include <asm/mce.h>
 
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
@@ -49,6 +50,24 @@ int fixup_exception(struct pt_regs *regs)
 	return 0;
 }
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+int fixup_mcexception(struct pt_regs *regs)
+{
+	const struct exception_table_entry *fixup;
+	unsigned long new_ip;
+
+	fixup = search_mcexception_tables(regs->ip);
+	if (fixup) {
+		new_ip = ex_fixup_addr(fixup);
+
+		regs->ip = new_ip;
+		return 1;
+	}
+
+	return 0;
+}
+#endif
+
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 1781e54ea6d3..42ef98de373a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -467,14 +467,16 @@
 /*
  * Exception table
  */
-#define EXCEPTION_TABLE(align)						\
+#define NAMED_EXCEPTION_TABLE(align, pfx)				\
 	. = ALIGN(align);						\
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {		\
-		VMLINUX_SYMBOL(__start___ex_table) = .;			\
-		*(__ex_table)						\
-		VMLINUX_SYMBOL(__stop___ex_table) = .;			\
+	__##pfx##_table : AT(ADDR(__##pfx##_table) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__start___##pfx##_table) = .;		\
+		*(__##pfx##_table)					\
+		VMLINUX_SYMBOL(__stop___##pfx##_table) = .;		\
 	}
 
+#define EXCEPTION_TABLE(align) NAMED_EXCEPTION_TABLE(align, ex)
+
 /*
  * Init task
  */
-- 
2.1.4



* [PATCHV3 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-12-16 16:39 ` Tony Luck
@ 2015-12-16  1:29   ` Tony Luck
  -1 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2015-12-16  1:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

Extend the severity checking code to add a new context IN_KERNEL_RECOV
which is used to indicate that the machine check was triggered by code
in the kernel with a fixup entry.

Add code to check for this situation and respond by altering the return
IP to the fixup address.

Major rework of the tail code in do_machine_check() to make all this
readable/maintainable. One functional change: tolerant=3 no longer
stops recovery actions, it now only skips sending SIGBUS to the
current process.
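
In outline, the new tail of do_machine_check() behaves like this
(a sketch of the logic in the mce.c hunk below, not additional code):

	if (worst != MCE_AR_SEVERITY && !kill_it)
		goto out_ist;			/* nothing more to do */
	if ((m.cs & 3) == 3)			/* fault was in user mode */
		do_memory_failure(&m);		/* may SIGBUS current task */
	else if (!fixup_mcexception(regs))	/* kernel mode needs a fixup */
		mce_panic("Failed kernel mode recovery", &m, NULL);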

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 21 +++++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          | 70 ++++++++++++++++---------------
 2 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 9c682c222071..cc7136351820 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -29,7 +29,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +48,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL		.context = IN_KERNEL
 #define  USER		.context = IN_USER
+#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
 #define  SER		.ser = SER_REQUIRED
 #define  NOSER		.ser = NO_SER
 #define  EXCP		.excp = EXCP_CONTEXT
@@ -87,6 +88,10 @@ static struct severity {
 		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
 		),
 	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
 		DEFERRED, "Deferred error",
 		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
 		),
@@ -123,6 +128,11 @@ static struct severity {
 		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
 		),
 	MCESEV(
+		AR, "Action required: data load in error recoverable area of kernel",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		KERNEL_RECOV
+		),
+	MCESEV(
 		AR, "Action required: data load error in a user process",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
@@ -170,6 +180,9 @@ static struct severity {
 		)	/* always matches. keep at end */
 };
 
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -183,7 +196,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+	if ((m->cs & 3) == 3)
+		return IN_USER;
+	if (mc_recoverable(m->mcgstatus) && search_mcexception_tables(m->ip))
+		return IN_KERNEL_RECOV;
+	return IN_KERNEL;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 0111cd49ee94..e848b583e840 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -959,6 +959,20 @@ static void mce_clear_state(unsigned long *toclear)
 	}
 }
 
+static int do_memory_failure(struct mce *m)
+{
+	int flags = MF_ACTION_REQUIRED;
+	int ret;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+	if (!(m->mcgstatus & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+	if (ret)
+		pr_err("Memory error not recovered");
+	return ret;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -996,8 +1010,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
-	u64 recover_paddr = ~0ull;
-	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
 	ist_enter(regs);
@@ -1124,22 +1136,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	}
 
 	/*
-	 * At insane "tolerant" levels we take no action. Otherwise
-	 * we only die if we have no other choice. For less serious
-	 * issues we try to recover, or limit damage to the current
-	 * process.
+	 * If tolerant is at an insane level we drop requests to kill
+	 * processes and continue even when there is no way out.
 	 */
-	if (cfg->tolerant < 3) {
-		if (no_way_out)
-			mce_panic("Fatal machine check on current CPU", &m, msg);
-		if (worst == MCE_AR_SEVERITY) {
-			recover_paddr = m.addr;
-			if (!(m.mcgstatus & MCG_STATUS_RIPV))
-				flags |= MF_MUST_KILL;
-		} else if (kill_it) {
-			force_sig(SIGBUS, current);
-		}
-	}
+	if (cfg->tolerant == 3)
+		kill_it = 0;
+	else if (no_way_out)
+		mce_panic("Fatal machine check on current CPU", &m, msg);
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1147,25 +1150,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 out:
 	sync_core();
 
-	if (recover_paddr == ~0ull)
-		goto done;
+	if (worst != MCE_AR_SEVERITY && !kill_it)
+		goto out_ist;
 
-	pr_err("Uncorrected hardware memory error in user-access at %llx",
-		 recover_paddr);
-	/*
-	 * We must call memory_failure() here even if the current process is
-	 * doomed. We still need to mark the page as poisoned and alert any
-	 * other users of the page.
-	 */
-	ist_begin_non_atomic(regs);
-	local_irq_enable();
-	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-		pr_err("Memory error not recovered");
-		force_sig(SIGBUS, current);
+	/* Fault was in user mode and we need to take some action */
+	if ((m.cs & 3) == 3) {
+		ist_begin_non_atomic(regs);
+		local_irq_enable();
+
+		if (kill_it || do_memory_failure(&m))
+			force_sig(SIGBUS, current);
+		local_irq_disable();
+		ist_end_non_atomic();
+	} else {
+		if (!fixup_mcexception(regs))
+			mce_panic("Failed kernel mode recovery", &m, NULL);
 	}
-	local_irq_disable();
-	ist_end_non_atomic();
-done:
+
+out_ist:
 	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
-- 
2.1.4



* [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-16 16:39 ` Tony Luck
@ 2015-12-16  1:30   ` Tony Luck
  -1 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2015-12-16  1:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

Using __copy_user_nocache() as inspiration, create a memory copy
routine for use by kernel code, with annotations that allow
recovery from machine checks.

Notes:
1) We align the source address rather than the destination. This
   means we never have to deal with a memory read that spans two
   cache lines ... so we can provide a precise indication of
   where the error occurred without having to re-execute at
   a byte-by-byte level to find the exact spot like the original
   did.
2) We 'or' BIT(63) into the return because this is the first
   in a series of machine check safe functions. Some will copy
   from user addresses, so may need to indicate an invalid user
   address instead of a machine check.
3) This code doesn't play any cache games. Future functions can
   use non-temporal loads/stores to meet the needs of different callers.
4) Provide helpful macros to decode the return value.
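
A caller would decode the return value roughly like this (a sketch
using the macros from the new mcsafe_copy.h below):

	u64 ret = mcsafe_memcpy(dst, src, len);

	if (COPY_HAD_MCHECK(ret)) {
		/* machine check hit: this many bytes were NOT copied */
		u64 remain = COPY_MCHECK_REMAIN(ret);
		...
	} else {
		/* ret == 0: all 'len' bytes were copied */
	}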

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/mcsafe_copy.h |  11 +++
 arch/x86/kernel/x8664_ksyms_64.c   |   5 ++
 arch/x86/lib/Makefile              |   1 +
 arch/x86/lib/mcsafe_copy.S         | 142 +++++++++++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+)
 create mode 100644 arch/x86/include/asm/mcsafe_copy.h
 create mode 100644 arch/x86/lib/mcsafe_copy.S

diff --git a/arch/x86/include/asm/mcsafe_copy.h b/arch/x86/include/asm/mcsafe_copy.h
new file mode 100644
index 000000000000..d4dbd5a667a3
--- /dev/null
+++ b/arch/x86/include/asm/mcsafe_copy.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_MCSAFE_COPY_H
+#define _ASM_X86_MCSAFE_COPY_H
+
+u64 mcsafe_memcpy(void *dst, const void *src, unsigned size);
+
+#define	COPY_MCHECK_ERRBIT	BIT(63)
+#define COPY_HAD_MCHECK(ret)	((ret) & COPY_MCHECK_ERRBIT)
+#define COPY_MCHECK_REMAIN(ret)	((ret) & ~COPY_MCHECK_ERRBIT)
+
+#endif /* _ASM_X86_MCSAFE_COPY_H */
+
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..afab8b25dbc0 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,11 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+#include <asm/mcsafe_copy.h>
+EXPORT_SYMBOL(mcsafe_memcpy);
+#endif
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2587888d987..82bb0bf46b6b 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -21,6 +21,7 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-$(CONFIG_MCE_KERNEL_RECOVERY) += mcsafe_copy.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
 
diff --git a/arch/x86/lib/mcsafe_copy.S b/arch/x86/lib/mcsafe_copy.S
new file mode 100644
index 000000000000..059b3a9642eb
--- /dev/null
+++ b/arch/x86/lib/mcsafe_copy.S
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2015 Intel Corporation
+ * Author: Tony Luck
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+/*
+ * mcsafe_memcpy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(mcsafe_memcpy)
+	cmpl $8,%edx
+	jb 20f		/* less than 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorl %eax,%eax
+	sfence
+	ret
+
+	.section .fixup,"ax"
+30:
+	addl %ecx,%edx
+	jmp 100f
+31:
+	shll $6,%ecx
+	addl %ecx,%edx
+	jmp 100f
+32:
+	shll $6,%ecx
+	leal -8(%ecx,%edx),%edx
+	jmp 100f
+33:
+	shll $6,%ecx
+	leal -16(%ecx,%edx),%edx
+	jmp 100f
+34:
+	shll $6,%ecx
+	leal -24(%ecx,%edx),%edx
+	jmp 100f
+35:
+	shll $6,%ecx
+	leal -32(%ecx,%edx),%edx
+	jmp 100f
+36:
+	shll $6,%ecx
+	leal -40(%ecx,%edx),%edx
+	jmp 100f
+37:
+	shll $6,%ecx
+	leal -48(%ecx,%edx),%edx
+	jmp 100f
+38:
+	shll $6,%ecx
+	leal -56(%ecx,%edx),%edx
+	jmp 100f
+39:
+	lea (%rdx,%rcx,8),%rdx
+	jmp 100f
+40:
+	movl %ecx,%edx
+100:
+	sfence
+	movabsq	$0x8000000000000000, %rax
+	orq %rdx,%rax
+	ret
+	.previous
+
+	_ASM_MCEXTABLE(0b,30b)
+	_ASM_MCEXTABLE(1b,31b)
+	_ASM_MCEXTABLE(2b,32b)
+	_ASM_MCEXTABLE(3b,33b)
+	_ASM_MCEXTABLE(4b,34b)
+	_ASM_MCEXTABLE(9b,35b)
+	_ASM_MCEXTABLE(10b,36b)
+	_ASM_MCEXTABLE(11b,37b)
+	_ASM_MCEXTABLE(12b,38b)
+	_ASM_MCEXTABLE(18b,39b)
+	_ASM_MCEXTABLE(21b,40b)
+ENDPROC(mcsafe_memcpy)
-- 
2.1.4



* [PATCHV3 0/3] Machine check recovery when kernel accesses poison
@ 2015-12-16 16:39 ` Tony Luck
  0 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2015-12-16 16:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

This series is initially targeted at the folks doing filesystems
on top of NVDIMMs. They really want to be able to return -EIO
when there is a h/w error (just like spinning rust and SSDs do).

I plan to use the same infrastructure in parts 1&2 to write a
machine check aware "copy_from_user()" that will SIGBUS the
calling application when a syscall touches poison in user space
(just like we do when the application touches the poison itself).
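
As a sketch of what such a caller might look like (pmem_read_block is
a hypothetical name; mcsafe_memcpy() and COPY_HAD_MCHECK() come from
part 3 of this series):

	static int pmem_read_block(void *dst, const void *pmem_src, unsigned size)
	{
		u64 ret = mcsafe_memcpy(dst, pmem_src, size);

		if (COPY_HAD_MCHECK(ret))
			return -EIO;	/* report like any other media error */
		return 0;
	}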

Changes V2->V3:

Andy:	Don't hack "regs->ax = BIT(63) | addr;" in the machine check
	handler.  Now have better fixup code that computes the number
	of remaining bytes (just like page-fault fixup).
Andy:	#define for BIT(63). Done, plus a couple of extra macros using it.
Boris:	Don't clutter up generic code (like mm/extable.c) with this.
	I moved everything under arch/x86 (the asm-generic change is
	a more generic #define).
Boris:	Dependencies for CONFIG_MCE_KERNEL_RECOVERY are too generic.
	I made it a real menu item with default "n". Dan Williams
	will use "select MCE_KERNEL_RECOVERY" from his persistent
	filesystem code.
Boris:	Simplify conditionals in mce.c by moving tolerant/kill_it
	checks earlier, with a skip to end if they aren't set.
Boris:	Miscellaneous grammar/punctuation. Fixed.
Boris:	Don't leak spurious __start_mcextable symbols into kernels
	that didn't configure MCE_KERNEL_RECOVERY. Done.
Tony:	New code doesn't belong in usercopy_64.S/uaccess*.h. Moved
	to new .S/.h files
Elliott: Caching behavior non-optimal. Could use movntdqa or vmovntdqa
	on source addresses. I didn't fix this yet. Think
	of the current mcsafe_memcpy() as the first of several functions.
	This one is useful for small copies (meta-data) where the overhead
	of saving SSE/AVX state isn't justified.

Changes V1->V2:

0-day:	Reported build errors and warnings on 32-bit systems. Fixed
0-day:	Reported bloat to tinyconfig. Fixed
Boris:	Suggestions to use extra macros to reduce code duplication in _ASM_*EXTABLE. Done
Boris:	Re-write "tolerant==3" check to reduce indentation level. See below.
Andy:	Check IP is valid before searching kernel exception tables. Done.
Andy:	Explain use of BIT(63) on return value from mcsafe_memcpy(). Done (added decode macros).
Andy:	Untangle mess of code in tail of do_machine_check() to make it
	clear what is going on (e.g. that we only enter the ist_begin_non_atomic()
	if we were called from user code, not from kernel!). Done.

Tony Luck (3):
  x86, ras: Add new infrastructure for machine check fixup tables
  x86, ras: Extend machine check recovery code to annotated ring0 areas
  x86, ras: Add mcsafe_memcpy() function to recover from machine checks

 arch/x86/Kconfig                          |  10 +++
 arch/x86/include/asm/asm.h                |  10 ++-
 arch/x86/include/asm/mce.h                |  14 +++
 arch/x86/include/asm/mcsafe_copy.h        |  11 +++
 arch/x86/kernel/cpu/mcheck/mce-severity.c |  21 ++++-
 arch/x86/kernel/cpu/mcheck/mce.c          |  86 +++++++++++-------
 arch/x86/kernel/vmlinux.lds.S             |   6 +-
 arch/x86/kernel/x8664_ksyms_64.c          |   5 ++
 arch/x86/lib/Makefile                     |   1 +
 arch/x86/lib/mcsafe_copy.S                | 142 ++++++++++++++++++++++++++++++
 arch/x86/mm/extable.c                     |  19 ++++
 include/asm-generic/vmlinux.lds.h         |  12 +--
 12 files changed, 293 insertions(+), 44 deletions(-)
 create mode 100644 arch/x86/include/asm/mcsafe_copy.h
 create mode 100644 arch/x86/lib/mcsafe_copy.S

-- 
2.1.4



* Re: [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-16  1:29   ` Tony Luck
@ 2015-12-16 17:55     ` Andy Lutomirski
  -1 siblings, 0 replies; 36+ messages in thread
From: Andy Lutomirski @ 2015-12-16 17:55 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Tue, Dec 15, 2015 at 5:29 PM, Tony Luck <tony.luck@intel.com> wrote:
> Copy the existing page fault fixup mechanisms to create a new table
> to be used when fixing machine checks. Note:
> 1) At this time we only provide a macro to annotate assembly code
> 2) We assume all fixups will be in code built into the kernel.
> 3) Only for x86_64
> 4) New code under CONFIG_MCE_KERNEL_RECOVERY (default 'n')
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>

Looks generally good.

Reviewed-by: Andy Lutomirski <luto@kernel.org>

with trivial caveats:


>  int __init mcheck_init(void)
>  {
> +#ifdef CONFIG_MCE_KERNEL_RECOVERY
> +       if (__stop___mcex_table > __start___mcex_table)
> +               sort_extable(__start___mcex_table, __stop___mcex_table);
> +#endif

This doesn't matter unless we sprout a lot of these, but it could be
worthwhile to update sortextable.h as well.

> +#ifdef CONFIG_MCE_KERNEL_RECOVERY
> +int fixup_mcexception(struct pt_regs *regs)
> +{
> +       const struct exception_table_entry *fixup;
> +       unsigned long new_ip;
> +
> +       fixup = search_mcexception_tables(regs->ip);
> +       if (fixup) {
> +               new_ip = ex_fixup_addr(fixup);
> +
> +               regs->ip = new_ip;

You could very easily save a line of code here :)

> +               return 1;
> +       }
> +
> +       return 0;
> +}
> +#endif
> +

--Andy


* RE: [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-16 17:55     ` Andy Lutomirski
@ 2015-12-16 22:51       ` Luck, Tony
  -1 siblings, 0 replies; 36+ messages in thread
From: Luck, Tony @ 2015-12-16 22:51 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Williams, Dan J, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML


> Looks generally good.
>
> Reviewed-by: Andy Lutomirski <luto@kernel.org>

You say that to part 1/3 ... what happens when you get to part 3/3 and you
read my attempts at writing x86 assembly code?

>> +#ifdef CONFIG_MCE_KERNEL_RECOVERY
>> +int fixup_mcexception(struct pt_regs *regs)
>> +{
>> +       const struct exception_table_entry *fixup;
>> +       unsigned long new_ip;
>> +
>> +       fixup = search_mcexception_tables(regs->ip);
>> +       if (fixup) {
>> +               new_ip = ex_fixup_addr(fixup);
>> +
>> +               regs->ip = new_ip;
>
>  You could very easily save a line of code here :)

Two lines (the declaration of the variable can go away as well).
Will include if we need a V4 when everyone else gets to commenting.
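
For reference, the condensed version would look something like:

	fixup = search_mcexception_tables(regs->ip);
	if (fixup) {
		regs->ip = ex_fixup_addr(fixup);
		return 1;
	}
	return 0;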

-Tony

* Re: [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-16 22:51       ` Luck, Tony
@ 2015-12-17 16:22         ` Andy Lutomirski
  -1 siblings, 0 replies; 36+ messages in thread
From: Andy Lutomirski @ 2015-12-17 16:22 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Williams, Dan J, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Wed, Dec 16, 2015 at 2:51 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> Looks generally good.
>>
>> Reviewed-by: Andy Lutomirski <luto@kernel.org>
>
> You say that to part 1/3 ... what happens when you get to part 3/3 and you
> read my attempts at writing x86 assembly code?

I'm not at all familiar with that code, and Borislav or someone else
(Denys Vlasenko?) can probably review it much better than I can.

--Andy


* Re: [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-16  1:29   ` Tony Luck
@ 2015-12-21 18:18     ` Borislav Petkov
  -1 siblings, 0 replies; 36+ messages in thread
From: Borislav Petkov @ 2015-12-21 18:18 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

On Tue, Dec 15, 2015 at 05:29:30PM -0800, Tony Luck wrote:
> Copy the existing page fault fixup mechanisms to create a new table
> to be used when fixing machine checks. Note:
> 1) At this time we only provide a macro to annotate assembly code
> 2) We assume all fixups will be in code built into the kernel.
> 3) Only for x86_64
> 4) New code under CONFIG_MCE_KERNEL_RECOVERY (default 'n')
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/Kconfig                  | 10 ++++++++++
>  arch/x86/include/asm/asm.h        | 10 ++++++++--
>  arch/x86/include/asm/mce.h        | 14 ++++++++++++++
>  arch/x86/kernel/cpu/mcheck/mce.c  | 16 ++++++++++++++++
>  arch/x86/kernel/vmlinux.lds.S     |  6 +++++-
>  arch/x86/mm/extable.c             | 19 +++++++++++++++++++
>  include/asm-generic/vmlinux.lds.h | 12 +++++++-----
>  7 files changed, 79 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 96d058a87100..42d26b4d1ec4 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
>  	  If you don't know what a machine check is and you don't do kernel
>  	  QA it is safe to say n.
>  
> +config MCE_KERNEL_RECOVERY
> +	bool "Recovery from machine checks in special kernel memory copy functions"
> +	default n
> +	depends on X86_MCE && X86_64

Still no dependency on CONFIG_LIBNVDIMM.

> +	---help---
> +	  This option provides a new memory copy function mcsafe_memcpy()
> +	  that is annotated to allow the machine check handler to return
> +	  to an alternate code path that returns an error to the caller instead
> +	  of crashing the system. Say yes if you have a driver that uses this.
> +

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-21 18:18     ` Borislav Petkov
@ 2015-12-21 19:16       ` Dan Williams
  -1 siblings, 0 replies; 36+ messages in thread
From: Dan Williams @ 2015-12-21 19:16 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski, Elliott,
	Robert, linux-kernel, Linux MM, linux-nvdimm, X86 ML

On Mon, Dec 21, 2015 at 10:18 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Dec 15, 2015 at 05:29:30PM -0800, Tony Luck wrote:
>> Copy the existing page fault fixup mechanisms to create a new table
>> to be used when fixing machine checks. Note:
>> 1) At this time we only provide a macro to annotate assembly code
>> 2) We assume all fixups will be in code built into the kernel.
>> 3) Only for x86_64
>> 4) New code under CONFIG_MCE_KERNEL_RECOVERY (default 'n')
>>
>> Signed-off-by: Tony Luck <tony.luck@intel.com>
>> ---
>>  arch/x86/Kconfig                  | 10 ++++++++++
>>  arch/x86/include/asm/asm.h        | 10 ++++++++--
>>  arch/x86/include/asm/mce.h        | 14 ++++++++++++++
>>  arch/x86/kernel/cpu/mcheck/mce.c  | 16 ++++++++++++++++
>>  arch/x86/kernel/vmlinux.lds.S     |  6 +++++-
>>  arch/x86/mm/extable.c             | 19 +++++++++++++++++++
>>  include/asm-generic/vmlinux.lds.h | 12 +++++++-----
>>  7 files changed, 79 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 96d058a87100..42d26b4d1ec4 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
>>         If you don't know what a machine check is and you don't do kernel
>>         QA it is safe to say n.
>>
>> +config MCE_KERNEL_RECOVERY
>> +     bool "Recovery from machine checks in special kernel memory copy functions"
>> +     default n
>> +     depends on X86_MCE && X86_64
>
> Still no dependency on CONFIG_LIBNVDIMM.

I suggested we reverse the dependency and have the driver optionally
"select MCE_KERNEL_RECOVERY".  There may be other drivers outside of
LIBNVDIMM that want this functionality enabled.
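
As a sketch of what that would look like (driver symbol hypothetical),
the pmem driver's Kconfig entry would pick up a select rather than this
option depending on it:

	config BLK_DEV_PMEM
		tristate "PMEM: Persistent memory block device support"
		select MCE_KERNEL_RECOVERY if X86_MCE && X86_64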

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-21 19:16       ` Dan Williams
@ 2015-12-21 20:15         ` Borislav Petkov
  -1 siblings, 0 replies; 36+ messages in thread
From: Borislav Petkov @ 2015-12-21 20:15 UTC (permalink / raw)
  To: Dan Williams
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski, Elliott,
	Robert, linux-kernel, Linux MM, linux-nvdimm, X86 ML

On Mon, Dec 21, 2015 at 11:16:44AM -0800, Dan Williams wrote:
> I suggested we reverse the dependency and have the driver optionally
> "select MCE_KERNEL_RECOVERY".  There may be other drivers outside of
> LIBNVDIMM that want this functionality enabled.

Ah ok, makes sense.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-16  1:30   ` Tony Luck
@ 2015-12-22 11:13     ` Borislav Petkov
  -1 siblings, 0 replies; 36+ messages in thread
From: Borislav Petkov @ 2015-12-22 11:13 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

On Tue, Dec 15, 2015 at 05:30:49PM -0800, Tony Luck wrote:
> Using __copy_user_nocache() as inspiration, create a memory copy
> routine for use by kernel code with annotations to allow for
> recovery from machine checks.
> 
> Notes:
> 1) We align the source address rather than the destination. This
>    means we never have to deal with a memory read that spans two
>    cache lines ... so we can provide a precise indication of
>    where the error occurred without having to re-execute at
>    a byte-by-byte level to find the exact spot like the original
>    did.
> 2) We 'or' BIT(63) into the return because this is the first
>    in a series of machine check safe functions. Some will copy
>    from user addresses, so may need to indicate an invalid user
>    address instead of a machine check.
> 3) This code doesn't play any cache games. Future functions can
>    use non-temporal loads/stores to meet needs of different callers.
> 4) Provide helpful macros to decode the return value.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/mcsafe_copy.h |  11 +++
>  arch/x86/kernel/x8664_ksyms_64.c   |   5 ++
>  arch/x86/lib/Makefile              |   1 +
>  arch/x86/lib/mcsafe_copy.S         | 142 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 159 insertions(+)
>  create mode 100644 arch/x86/include/asm/mcsafe_copy.h
>  create mode 100644 arch/x86/lib/mcsafe_copy.S
> 
> diff --git a/arch/x86/include/asm/mcsafe_copy.h b/arch/x86/include/asm/mcsafe_copy.h
> new file mode 100644
> index 000000000000..d4dbd5a667a3
> --- /dev/null
> +++ b/arch/x86/include/asm/mcsafe_copy.h
> @@ -0,0 +1,11 @@
> +#ifndef _ASM_X86_MCSAFE_COPY_H
> +#define _ASM_X86_MCSAFE_COPY_H
> +
> +u64 mcsafe_memcpy(void *dst, const void *src, unsigned size);
> +
> +#define	COPY_MCHECK_ERRBIT	BIT(63)

What happened to the landing pads Andy was talking about? They sound
like a cleaner design than that bit 63...

> +#define COPY_HAD_MCHECK(ret)	((ret) & COPY_MCHECK_ERRBIT)
> +#define COPY_MCHECK_REMAIN(ret)	((ret) & ~COPY_MCHECK_ERRBIT)
> +
> +#endif /* _ASM_MCSAFE_COPY_H */

This should all be in arch/x86/include/asm/string_64.h I guess. You can
save yourself the #include-ing.

> diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
> index a0695be19864..afab8b25dbc0 100644
> --- a/arch/x86/kernel/x8664_ksyms_64.c
> +++ b/arch/x86/kernel/x8664_ksyms_64.c
> @@ -37,6 +37,11 @@ EXPORT_SYMBOL(__copy_user_nocache);
>  EXPORT_SYMBOL(_copy_from_user);
>  EXPORT_SYMBOL(_copy_to_user);
>  
> +#ifdef CONFIG_MCE_KERNEL_RECOVERY
> +#include <asm/mcsafe_copy.h>
> +EXPORT_SYMBOL(mcsafe_memcpy);
> +#endif
> +
>  EXPORT_SYMBOL(copy_page);
>  EXPORT_SYMBOL(clear_page);
>  
> diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
> index f2587888d987..82bb0bf46b6b 100644
> --- a/arch/x86/lib/Makefile
> +++ b/arch/x86/lib/Makefile
> @@ -21,6 +21,7 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
>  lib-y += memcpy_$(BITS).o
>  lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
>  lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
> +lib-$(CONFIG_MCE_KERNEL_RECOVERY) += mcsafe_copy.o
>  
>  obj-y += msr.o msr-reg.o msr-reg-export.o
>  
> diff --git a/arch/x86/lib/mcsafe_copy.S b/arch/x86/lib/mcsafe_copy.S
> new file mode 100644
> index 000000000000..059b3a9642eb
> --- /dev/null
> +++ b/arch/x86/lib/mcsafe_copy.S

You probably should add that function to arch/x86/lib/memcpy_64.S where
we keep all those memcpy variants instead of a separate file.

> @@ -0,0 +1,142 @@
> +/*
> + * Copyright (C) 2015 Intel Corporation
> + * Author: Tony Luck
> + *
> + * This software may be redistributed and/or modified under the terms of
> + * the GNU General Public License ("GPL") version 2 only as published by the
> + * Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +
> +/*
> + * mcsafe_memcpy - memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + */
> +ENTRY(mcsafe_memcpy)
> +	cmpl $8,%edx
> +	jb 20f		/* less then 8 bytes, go to byte copy loop */
> +
> +	/* check for bad alignment of source */
> +	movl %esi,%ecx
> +	andl $7,%ecx
> +	jz 102f				/* already aligned */
> +	subl $8,%ecx
> +	negl %ecx
> +	subl %ecx,%edx
> +0:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 0b
> +102:
> +	movl %edx,%ecx
> +	andl $63,%edx
> +	shrl $6,%ecx
> +	jz 17f
> +1:	movq (%rsi),%r8
> +2:	movq 1*8(%rsi),%r9
> +3:	movq 2*8(%rsi),%r10
> +4:	movq 3*8(%rsi),%r11
> +	mov %r8,(%rdi)
> +	mov %r9,1*8(%rdi)
> +	mov %r10,2*8(%rdi)
> +	mov %r11,3*8(%rdi)
> +9:	movq 4*8(%rsi),%r8
> +10:	movq 5*8(%rsi),%r9
> +11:	movq 6*8(%rsi),%r10
> +12:	movq 7*8(%rsi),%r11
> +	mov %r8,4*8(%rdi)
> +	mov %r9,5*8(%rdi)
> +	mov %r10,6*8(%rdi)
> +	mov %r11,7*8(%rdi)
> +	leaq 64(%rsi),%rsi
> +	leaq 64(%rdi),%rdi
> +	decl %ecx
> +	jnz 1b
> +17:	movl %edx,%ecx
> +	andl $7,%edx
> +	shrl $3,%ecx
> +	jz 20f
> +18:	movq (%rsi),%r8
> +	mov %r8,(%rdi)
> +	leaq 8(%rsi),%rsi
> +	leaq 8(%rdi),%rdi
> +	decl %ecx
> +	jnz 18b
> +20:	andl %edx,%edx
> +	jz 23f
> +	movl %edx,%ecx
> +21:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 21b
> +23:	xorl %eax,%eax
> +	sfence
> +	ret
> +
> +	.section .fixup,"ax"
> +30:
> +	addl %ecx,%edx
> +	jmp 100f
> +31:
> +	shll $6,%ecx
> +	addl %ecx,%edx
> +	jmp 100f
> +32:
> +	shll $6,%ecx
> +	leal -8(%ecx,%edx),%edx
> +	jmp 100f
> +33:
> +	shll $6,%ecx
> +	leal -16(%ecx,%edx),%edx
> +	jmp 100f
> +34:
> +	shll $6,%ecx
> +	leal -24(%ecx,%edx),%edx
> +	jmp 100f
> +35:
> +	shll $6,%ecx
> +	leal -32(%ecx,%edx),%edx
> +	jmp 100f
> +36:
> +	shll $6,%ecx
> +	leal -40(%ecx,%edx),%edx
> +	jmp 100f
> +37:
> +	shll $6,%ecx
> +	leal -48(%ecx,%edx),%edx
> +	jmp 100f
> +38:
> +	shll $6,%ecx
> +	leal -56(%ecx,%edx),%edx
> +	jmp 100f
> +39:
> +	lea (%rdx,%rcx,8),%rdx
> +	jmp 100f
> +40:
> +	movl %ecx,%edx
> +100:
> +	sfence
> +	movabsq	$0x8000000000000000, %rax
> +	orq %rdx,%rax

I think you want to write this as:

	mov %rdx, %rax
	bts $63, %rax

It cuts down instruction bytes by almost half and it is a bit more
readable:

  5c:   48 b8 00 00 00 00 00    movabs $0x8000000000000000,%rax
  63:   00 00 80
  66:   48 09 d0                or     %rdx,%rax

  5c:   48 89 d0                mov    %rdx,%rax
  5f:   48 0f ba e8 3f          bts    $0x3f,%rax


> +	ret

Also, you can drop the "l" suffix - default operand size is 32-bit in
long mode:

        .section .fixup,"ax"
30:
        add %ecx,%edx
        jmp 100f
31:
        shl $6,%ecx
        add %ecx,%edx
        jmp 100f
32:
        shl $6,%ecx
        lea -8(%ecx,%edx),%edx
        jmp 100f
33:
        shl $6,%ecx
        lea -16(%ecx,%edx),%edx
        jmp 100f
34:
        shl $6,%ecx
        lea -24(%ecx,%edx),%edx
        jmp 100f
35:
        shl $6,%ecx
        lea -32(%ecx,%edx),%edx
        jmp 100f
36:
        shl $6,%ecx
        lea -40(%ecx,%edx),%edx
        jmp 100f
37:
        shl $6,%ecx
        lea -48(%ecx,%edx),%edx
        jmp 100f
38:
        shl $6,%ecx
        lea -56(%ecx,%edx),%edx
        jmp 100f
39:
        lea (%rdx,%rcx,8),%rdx
        jmp 100f
40:
        mov %ecx,%edx
100:

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables
  2015-12-16  1:29   ` Tony Luck
@ 2015-12-22 11:13     ` Borislav Petkov
  -1 siblings, 0 replies; 36+ messages in thread
From: Borislav Petkov @ 2015-12-22 11:13 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

On Tue, Dec 15, 2015 at 05:29:30PM -0800, Tony Luck wrote:
> Copy the existing page fault fixup mechanisms to create a new table
> to be used when fixing machine checks. Note:
> 1) At this time we only provide a macro to annotate assembly code
> 2) We assume all fixups will be in code built into the kernel.
> 3) Only for x86_64
> 4) New code under CONFIG_MCE_KERNEL_RECOVERY (default 'n')
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/Kconfig                  | 10 ++++++++++
>  arch/x86/include/asm/asm.h        | 10 ++++++++--
>  arch/x86/include/asm/mce.h        | 14 ++++++++++++++
>  arch/x86/kernel/cpu/mcheck/mce.c  | 16 ++++++++++++++++
>  arch/x86/kernel/vmlinux.lds.S     |  6 +++++-
>  arch/x86/mm/extable.c             | 19 +++++++++++++++++++
>  include/asm-generic/vmlinux.lds.h | 12 +++++++-----
>  7 files changed, 79 insertions(+), 8 deletions(-)

Reviewed-by: Borislav Petkov <bp@suse.de>

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas
  2015-12-16  1:29   ` Tony Luck
@ 2015-12-22 11:14     ` Borislav Petkov
  -1 siblings, 0 replies; 36+ messages in thread
From: Borislav Petkov @ 2015-12-22 11:14 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, linux-kernel, linux-mm, linux-nvdimm, x86

On Tue, Dec 15, 2015 at 05:29:59PM -0800, Tony Luck wrote:
> Extend the severity checking code to add a new context IN_KERN_RECOV
> which is used to indicate that the machine check was triggered by code
> in the kernel with a fixup entry.
> 
> Add code to check for this situation and respond by altering the return
> IP to the fixup address.
> 
> Major re-work to the tail code in do_machine_check() to make all this
> readable/maintainable. One functional change is that tolerant=3 no longer
> stops recovery actions. Revert to only skipping sending SIGBUS to the
> current process.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/kernel/cpu/mcheck/mce-severity.c | 21 +++++++++-
>  arch/x86/kernel/cpu/mcheck/mce.c          | 70 ++++++++++++++++---------------
>  2 files changed, 55 insertions(+), 36 deletions(-)

Reviewed-by: Borislav Petkov <bp@suse.de>

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-22 11:13     ` Borislav Petkov
@ 2015-12-22 19:38       ` Tony Luck
  -1 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2015-12-22 19:38 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Tue, Dec 22, 2015 at 3:13 AM, Borislav Petkov <bp@alien8.de> wrote:
>> +#define      COPY_MCHECK_ERRBIT      BIT(63)
>
> What happened to the landing pads Andy was talking about? They sound
> like cleaner design than that bit 63...

I interpreted that comment as "stop playing with %rax in the fault
handler ... just change the IP to point to the .fixup location" ...
the target of the fixup being the "landing pad".

Right now this function has only one set of fault fixups (for machine
checks). When I tackle copy_from_user() it will sprout a second
set for page faults, and then will look a bit more like Andy's dual
landing pad example.

I still need an indicator to the caller which type of fault happened
since their actions will be different. So BIT(63) lives on ... but is
now set in the .fixup section rather than in the machine check
code.
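
That is, the common exit in the .fixup section would end up roughly
like this (sketch, using the bts form you suggest):

100:
	sfence
	mov	%rdx,%rax	/* remaining byte count */
	bts	$63,%rax	/* flag that the fault was a machine check */
	ret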

I'll move the function and #defines as you suggest - we don't need
new files for these.  Also will fix the assembly code.
[In my defense that load immediate 0x8000000000000000 and 'or'
was what gcc -O2 generates from a simple bit of C code to set
bit 63 ... perhaps it is faster, or perhaps gcc is on drugs. In this
case code compactness wins over possible speed difference].
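
(The C in question was essentially just this sketch:

	u64 mark_mcheck(u64 remain)
	{
		return remain | 1ull << 63;
	}

for which gcc -O2 emits the movabs/or pair above.)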

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-22 19:38       ` Tony Luck
@ 2015-12-23 12:58         ` Borislav Petkov
  -1 siblings, 0 replies; 36+ messages in thread
From: Borislav Petkov @ 2015-12-23 12:58 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	Elliott, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Tue, Dec 22, 2015 at 11:38:07AM -0800, Tony Luck wrote:
> I interpreted that comment as "stop playing with %rax in the fault
> handler ... just change the IP to point to the .fixup location" ...
> the target of the fixup being the "landing pad".
> 
> Right now this function has only one set of fault fixups (for machine
> checks). When I tackle copy_from_user() it will sprout a second
> set for page faults, and then will look a bit more like Andy's dual
> landing pad example.
> 
> I still need an indicator to the caller which type of fault happened
> since their actions will be different. So BIT(63) lives on ... but is
> now set in the .fixup section rather than in the machine check
> code.

You mean this previous example of yours:

int copy_from_user(void *to, void *from, unsigned long n)
{
        u64 ret = mcsafe_memcpy(to, from, n);

        if (COPY_HAD_MCHECK(ret)) {
                if (memory_failure(COPY_MCHECK_PADDR(ret) >> PAGE_SHIFT, ...))
                        force_sig(SIGBUS, current);
                return something;
        } else
                return ret;
}

?

So what's wrong with mcsafe_memcpy() returning a proper retval which
says what type of fault happened?

I know, memcpy returns the ptr to @dest like a parrot but your version
mcsafe_memcpy() will be different. It can even be called __mcsafe_memcpy
and have a wrapper around it which fiddles out the proper retvals and
returns @dest after all. It would still be cleaner this way IMHO.
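
Something along these lines, roughly (the wrapper shape and -EIO are
only a sketch):

	void *mcsafe_memcpy(void *dst, const void *src, size_t n, int *err)
	{
		u64 ret = __mcsafe_memcpy(dst, src, n);

		/* -EIO is a placeholder for "a machine check hit" */
		*err = COPY_HAD_MCHECK(ret) ? -EIO : 0;
		return dst;
	}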

> I'll move the function and #defines as you suggest - we don't need
> new files for these.  Also will fix the assembly code.
> [In my defense that load immediate 0x8000000000000000 and 'or'
> was what gcc -O2 generates from a simple bit of C code to set
> bit 63 ... perhaps it is faster, or perhaps gcc is on drugs. In this
> case code compactness wins over possible speed difference].

Well, on second thought, the reason gcc uses that huge immediate could
be that BTS clobbers the carry flag in rFLAGS, and I guess we don't
want that. Although any following Jcc or other conditional instruction
that touches rFLAGS will overwrite that bit, so it won't really matter.

I've asked a gcc person; we'll see what interesting explanation comes
back.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-23 12:58         ` Borislav Petkov
@ 2015-12-23 19:31           ` Dan Williams
  -1 siblings, 0 replies; 36+ messages in thread
From: Dan Williams @ 2015-12-23 19:31 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski, Elliott,
	Robert, Linux Kernel Mailing List, linux-mm, linux-nvdimm,
	X86-ML

On Wed, Dec 23, 2015 at 4:58 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Dec 22, 2015 at 11:38:07AM -0800, Tony Luck wrote:
>> I interpreted that comment as "stop playing with %rax in the fault
>> handler ... just change the IP to point to the .fixup location" ...
>> the target of the fixup being the "landing pad".
>>
>> Right now this function has only one set of fault fixups (for machine
>> checks). When I tackle copy_from_user() it will sprout a second
>> set for page faults, and then will look a bit more like Andy's dual
>> landing pad example.
>>
>> I still need an indicator to the caller which type of fault happened
>> since their actions will be different. So BIT(63) lives on ... but is
>> now set in the .fixup section rather than in the machine check
>> code.
>
> You mean this previous example of yours:
>
> int copy_from_user(void *to, void *from, unsigned long n)
> {
>         u64 ret = mcsafe_memcpy(to, from, n);
>
>         if (COPY_HAD_MCHECK(ret)) {
>                 if (memory_failure(COPY_MCHECK_PADDR(ret) >> PAGE_SHIFT, ...))
>                         force_sig(SIGBUS, current);
>                 return something;
>         } else
>                 return ret;
> }
>
> ?
>
> So what's wrong with mcsafe_memcpy() returning a proper retval which
> says what type of fault happened?
>
> I know, memcpy returns the ptr to @dest like a parrot but your version
> mcsafe_memcpy() will be different. It can even be called __mcsafe_memcpy
> and have a wrapper around it which fiddles out the proper retvals and
> returns @dest after all. It would still be cleaner this way IMHO.

We might leave this to the consumer.  It's already the case that
mcsafe_memcpy() is arch-specific, so I'm having to wrap its return
value into a generic value.  My current thinking is to make
memcpy_from_pmem() return a pmem_cookie_t, and then have an arch
specific pmem_copy_error(pmem_cookie_t cookie) helper that interprets
the value.  This is similar to the situation we have with
dma_mapping_error().
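
Roughly like this (all names are a sketch, mirroring the
dma_mapping_error() pattern):

	typedef u64 pmem_cookie_t;

	pmem_cookie_t memcpy_from_pmem(void *dst, const void *src, size_t n);

	static inline bool pmem_copy_error(pmem_cookie_t cookie)
	{
		return COPY_HAD_MCHECK(cookie);	/* x86: bit 63 set */
	}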

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-23 19:31           ` Dan Williams
@ 2015-12-23 20:46             ` Tony Luck
  -1 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2015-12-23 20:46 UTC (permalink / raw)
  To: Dan Williams
  Cc: Borislav Petkov, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Elliott, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

> I know, memcpy returns the ptr to @dest like a parrot

Maybe I need to change the name to remove the
"memcpy" substring to avoid this confusion. How
about "mcsafe_copy()"? Perhaps with a "__" prefix
to point out it is a building block that will get various
wrappers around it?

Dan wants a copy_from_nvdimm() that either completes
the copy, or indicates where a machine check occurred.

I'm going to want a copy_from_user() that has two fault
options (user gave a bad address -> -EFAULT, or the
source address had an uncorrected error -> SIGBUS).
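
Shape-wise something like this (sketch, reusing the macros from this
patch and the __-prefixed building block):

	u64 ret = __mcsafe_copy(to, from, n);

	if (COPY_HAD_MCHECK(ret)) {
		force_sig(SIGBUS, current);
		return COPY_MCHECK_REMAIN(ret);
	}
	return ret;	/* non-zero means a bad user address, i.e. -EFAULT */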

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks
  2015-12-23 20:46             ` Tony Luck
@ 2015-12-24 13:37               ` Borislav Petkov
  -1 siblings, 0 replies; 36+ messages in thread
From: Borislav Petkov @ 2015-12-24 13:37 UTC (permalink / raw)
  To: Tony Luck
  Cc: Dan Williams, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Elliott, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Wed, Dec 23, 2015 at 12:46:20PM -0800, Tony Luck wrote:
> > I know, memcpy returns the ptr to @dest like a parrot
> 
> Maybe I need to change the name to remove the
> "memcpy" substring to avoid this confusion. How
> about "mcsafe_copy()"? Perhaps with a "__" prefix
> to point out it is a building block that will get various
> wrappers around it??
> 
> Dan wants a copy_from_nvdimm() that either completes
> the copy, or indicates where a machine check occurred.
> 
> I'm going to want a copy_from_user() that has two fault
> options (user gave a bad address -> -EFAULT, or the
> source address had an uncorrected error -> SIGBUS).

Sounds like standard kernel design to me. :)

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2015-12-24 13:37 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-16 16:39 [PATCHV3 0/3] Machine check recovery when kernel accesses poison Tony Luck
2015-12-16 16:39 ` Tony Luck
2015-12-16  1:29 ` [PATCHV3 1/3] x86, ras: Add new infrastructure for machine check fixup tables Tony Luck
2015-12-16  1:29   ` Tony Luck
2015-12-16 17:55   ` Andy Lutomirski
2015-12-16 17:55     ` Andy Lutomirski
2015-12-16 22:51     ` Luck, Tony
2015-12-16 22:51       ` Luck, Tony
2015-12-17 16:22       ` Andy Lutomirski
2015-12-17 16:22         ` Andy Lutomirski
2015-12-21 18:18   ` Borislav Petkov
2015-12-21 18:18     ` Borislav Petkov
2015-12-21 19:16     ` Dan Williams
2015-12-21 19:16       ` Dan Williams
2015-12-21 20:15       ` Borislav Petkov
2015-12-21 20:15         ` Borislav Petkov
2015-12-22 11:13   ` Borislav Petkov
2015-12-22 11:13     ` Borislav Petkov
2015-12-16  1:29 ` [PATCHV3 2/3] x86, ras: Extend machine check recovery code to annotated ring0 areas Tony Luck
2015-12-16  1:29   ` Tony Luck
2015-12-22 11:14   ` Borislav Petkov
2015-12-22 11:14     ` Borislav Petkov
2015-12-16  1:30 ` [PATCHV3 3/3] x86, ras: Add mcsafe_memcpy() function to recover from machine checks Tony Luck
2015-12-16  1:30   ` Tony Luck
2015-12-22 11:13   ` Borislav Petkov
2015-12-22 11:13     ` Borislav Petkov
2015-12-22 19:38     ` Tony Luck
2015-12-22 19:38       ` Tony Luck
2015-12-23 12:58       ` Borislav Petkov
2015-12-23 12:58         ` Borislav Petkov
2015-12-23 19:31         ` Dan Williams
2015-12-23 19:31           ` Dan Williams
2015-12-23 20:46           ` Tony Luck
2015-12-23 20:46             ` Tony Luck
2015-12-24 13:37             ` Borislav Petkov
2015-12-24 13:37               ` Borislav Petkov
