linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v8 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries
  2016-01-09  0:04 [PATCH v8 0/3] Machine check recovery when kernel accesses poison Tony Luck
@ 2015-12-31 19:40 ` Tony Luck
  2016-01-08 20:49 ` [PATCH v8 1/3] x86: Expand exception table to allow new handling options Tony Luck
  2016-01-08 21:18 ` [PATCH v8 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
  2 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2015-12-31 19:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Extend the severity checking code to add a new context IN_KERN_RECOV
which is used to indicate that the machine check was triggered by code
in the kernel with a EXTABLE_CLASS_FAULT fixup entry.

Major re-work to the tail code in do_machine_check() to make all this
readable/maintainable. One functional change is that tolerant=3 no longer
stops recovery actions. Revert to only skipping sending SIGBUS to the
current process.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 32 +++++++++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          | 71 ++++++++++++++++---------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 9c682c222071..e87aa060977c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -13,7 +13,9 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/debugfs.h>
+#include <linux/module.h>
 #include <asm/mce.h>
+#include <asm/uaccess.h>
 
 #include "mce-internal.h"
 
@@ -29,7 +31,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +50,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL		.context = IN_KERNEL
 #define  USER		.context = IN_USER
+#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
 #define  SER		.ser = SER_REQUIRED
 #define  NOSER		.ser = NO_SER
 #define  EXCP		.excp = EXCP_CONTEXT
@@ -87,6 +90,10 @@ static struct severity {
 		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
 		),
 	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
 		DEFERRED, "Deferred error",
 		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
 		),
@@ -123,6 +130,11 @@ static struct severity {
 		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
 		),
 	MCESEV(
+		AR, "Action required: data load in error recoverable area of kernel",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		KERNEL_RECOV
+		),
+	MCESEV(
 		AR, "Action required: data load error in a user process",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
@@ -170,6 +182,18 @@ static struct severity {
 		)	/* always matches. keep at end */
 };
 
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+static inline bool mce_in_kernel_recov(unsigned long addr)
+{
+	const struct exception_table_entry *fixup;
+
+	fixup = search_exception_tables(addr);
+
+	return fixup && fixup->handler == EXTABLE_CLASS_FAULT;
+}
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -183,7 +207,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+	if ((m->cs & 3) == 3)
+		return IN_USER;
+	if (mc_recoverable(m->mcgstatus) && mce_in_kernel_recov(m->ip))
+		return IN_KERNEL_RECOV;
+	return IN_KERNEL;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index c5b0d562dbf5..c95eb203b5c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
 #include <linux/nmi.h>
@@ -958,6 +959,20 @@ static void mce_clear_state(unsigned long *toclear)
 	}
 }
 
+static int do_memory_failure(struct mce *m)
+{
+	int flags = MF_ACTION_REQUIRED;
+	int ret;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+	if (!(m->mcgstatus & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+	if (ret)
+		pr_err("Memory error not recovered");
+	return ret;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -995,8 +1010,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
-	u64 recover_paddr = ~0ull;
-	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
 	ist_enter(regs);
@@ -1123,22 +1136,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	}
 
 	/*
-	 * At insane "tolerant" levels we take no action. Otherwise
-	 * we only die if we have no other choice. For less serious
-	 * issues we try to recover, or limit damage to the current
-	 * process.
+	 * If tolerant is at an insane level we drop requests to kill
+	 * processes and continue even when there is no way out.
 	 */
-	if (cfg->tolerant < 3) {
-		if (no_way_out)
-			mce_panic("Fatal machine check on current CPU", &m, msg);
-		if (worst == MCE_AR_SEVERITY) {
-			recover_paddr = m.addr;
-			if (!(m.mcgstatus & MCG_STATUS_RIPV))
-				flags |= MF_MUST_KILL;
-		} else if (kill_it) {
-			force_sig(SIGBUS, current);
-		}
-	}
+	if (cfg->tolerant == 3)
+		kill_it = 0;
+	else if (no_way_out)
+		mce_panic("Fatal machine check on current CPU", &m, msg);
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1146,25 +1150,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 out:
 	sync_core();
 
-	if (recover_paddr == ~0ull)
-		goto done;
+	if (worst != MCE_AR_SEVERITY && !kill_it)
+		goto out_ist;
 
-	pr_err("Uncorrected hardware memory error in user-access at %llx",
-		 recover_paddr);
-	/*
-	 * We must call memory_failure() here even if the current process is
-	 * doomed. We still need to mark the page as poisoned and alert any
-	 * other users of the page.
-	 */
-	ist_begin_non_atomic(regs);
-	local_irq_enable();
-	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-		pr_err("Memory error not recovered");
-		force_sig(SIGBUS, current);
+	/* Fault was in user mode and we need to take some action */
+	if ((m.cs & 3) == 3) {
+		ist_begin_non_atomic(regs);
+		local_irq_enable();
+
+		if (kill_it || do_memory_failure(&m))
+			force_sig(SIGBUS, current);
+		local_irq_disable();
+		ist_end_non_atomic();
+	} else {
+		if (!fixup_exception(regs, X86_TRAP_MC))
+			mce_panic("Failed kernel mode recovery", &m, NULL);
 	}
-	local_irq_disable();
-	ist_end_non_atomic();
-done:
+
+out_ist:
 	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09  0:04 [PATCH v8 0/3] Machine check recovery when kernel accesses poison Tony Luck
  2015-12-31 19:40 ` [PATCH v8 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
@ 2016-01-08 20:49 ` Tony Luck
  2016-01-09  1:52   ` Andy Lutomirski
  2016-01-08 21:18 ` [PATCH v8 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
  2 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-08 20:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Huge amounts of help from  Andy Lutomirski and Borislav Petkov to
produce this. Andy provided the inspiration to add classes to the
exception table with a clever bit-squeezing trick, Boris pointed
out how much cleaner it would all be if we just had a new field.

Linus Torvalds blessed the expansion with:
  I'd rather not be clever in order to save just a tiny amount of space
  in the exception table, which isn't really criticial for anybody.

The third field is a simple integer indexing into an array of handler
functions (I thought it couldn't be a relative pointer like the other
fields because a module may have its ex_table loaded more than 2GB away
from the handler function - but that may not be actually true. But the
integer is pretty flexible, we are only really using low two bits now).

We start out with three handlers:

0: Legacy - just jumps to the fixup IP
1: Fault - provide the trap number in %ax to the fixup code
2: Cleaned up legacy for the uaccess error hack

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/x86/exception-tables.txt | 34 ++++++++++++++
 arch/x86/include/asm/asm.h             | 44 +++++++++++-------
 arch/x86/include/asm/uaccess.h         | 13 +++---
 arch/x86/kernel/kprobes/core.c         |  2 +-
 arch/x86/kernel/traps.c                |  6 +--
 arch/x86/mm/extable.c                  | 84 ++++++++++++++++++++++------------
 arch/x86/mm/fault.c                    |  2 +-
 scripts/sortextable.c                  | 30 ++++++++++++
 8 files changed, 160 insertions(+), 55 deletions(-)

diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt
index 32901aa36f0a..340a6f5dc99b 100644
--- a/Documentation/x86/exception-tables.txt
+++ b/Documentation/x86/exception-tables.txt
@@ -290,3 +290,37 @@ Due to the way that the exception table is built and needs to be ordered,
 only use exceptions for code in the .text section.  Any other section
 will cause the exception table to not be sorted correctly, and the
 exceptions will fail.
+
+Things changed when 64-bit support was added to x86 Linux. Rather than
+double the size of the exception table by expanding the two entries
+from 32-bits to 64 bits, a clever trick was used to store addreesses
+as relative offsets from the table itself. The assembly code changed
+from:
+	.long 1b,3b
+to:
+        .long (from) - .
+        .long (to) - .
+and the C-code that uses these values converts back to absolute addresses
+like this:
+	ex_insn_addr(const struct exception_table_entry *x)
+	{
+		return (unsigned long)&x->insn + x->insn;
+	}
+
+In v4.5 the exception table entry was given a new field "handler".
+This is also 32-bits wide and contains an index into an array of
+handler functions that can perform specific operations in addition
+to re-writing the instruction pointer to jump to the fixup location.
+Initially there are three such functions:
+
+0) int ex_handler_default(const struct exception_table_entry *fixup,
+   This is legacy case that just jumps to the fixup code
+1) int ex_handler_fault(const struct exception_table_entry *fixup,
+   This case provides the fault number of the trap that occurred at
+   entry->insn. It is used to distinguish page faults from machine
+   check.
+2) int ex_handler_ext(const struct exception_table_entry *fixup,
+   This case is used to for uaccess_err ... we need to set a flag
+   in the task structure. Before the handler functions existed this
+   case was handled by adding a large offset to the fixup to tag
+   it as special.
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..0280f5c5d160 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -42,21 +42,28 @@
 #define _ASM_SI		__ASM_REG(si)
 #define _ASM_DI		__ASM_REG(di)
 
+#define EXTABLE_CLASS_DEFAULT	0	/* standard uaccess fixup */
+#define EXTABLE_CLASS_FAULT	1	/* provide trap number in %ax */
+#define EXTABLE_CLASS_EX	2	/* uaccess + set uaccess_err */
+
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)					\
+# define _ASM_EXTABLE_CLASS(from, to, class)			\
 	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
+	.balign 4 ;						\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
+	.long (class) ;						\
 	.popsection
 
-# define _ASM_EXTABLE_EX(from,to)				\
-	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
-	.long (from) - . ;					\
-	.long (to) - . + 0x7ffffff0 ;				\
-	.popsection
+# define _ASM_EXTABLE(from, to)					\
+	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_DEFAULT)
+
+# define _ASM_EXTABLE_FAULT(from, to)				\
+	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_FAULT)
+
+# define _ASM_EXTABLE_EX(from, to)				\
+	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_EX)
 
 # define _ASM_NOKPROBE(entry)					\
 	.pushsection "_kprobe_blacklist","aw" ;			\
@@ -89,19 +96,24 @@
 	.endm
 
 #else
-# define _ASM_EXTABLE(from,to)					\
+# define _EXPAND_EXTABLE_CLASS(x) #x
+# define _ASM_EXTABLE_CLASS(from, to, class)			\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
+	" .balign 4\n"						\
 	" .long (" #from ") - .\n"				\
 	" .long (" #to ") - .\n"				\
+	" .long (" _EXPAND_EXTABLE_CLASS(class) ")\n"		\
 	" .popsection\n"
 
-# define _ASM_EXTABLE_EX(from,to)				\
-	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
-	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - . + 0x7ffffff0\n"			\
-	" .popsection\n"
+# define _ASM_EXTABLE(from, to)					\
+	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_DEFAULT)
+
+# define _ASM_EXTABLE_FAULT(from, to)				\
+	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_FAULT)
+
+# define _ASM_EXTABLE_EX(from, to)				\
+	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_EX)
+
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 09b1b0ab94b7..315d1423377c 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -91,11 +91,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 
 /*
  * The exception table consists of pairs of addresses relative to the
- * exception table enty itself: the first is the address of an
- * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * exception table enty itself followed by an integer. The first address
+ * is of an instruction that is allowed to fault, the second is the target
+ * at which the program should continue. The integer is an index into an
+ * array of handler functions to deal with the fault referenced by the
+ * instruction in the first field.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -105,12 +105,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 
 struct exception_table_entry {
 	int insn, fixup;
+	u32 handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..0f05deeff5ce 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, trapnr))
 			return 1;
 
 		/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ade185a46b1d..211c11c7bba4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 	}
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
@@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	tsk = current;
 	if (!user_mode(regs)) {
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, X86_TRAP_GP))
 			return;
 
 		tsk->thread.error_code = error_code;
@@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	conditional_sti(regs);
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..cfaf5feace36 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -3,6 +3,9 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
+typedef int (*ex_handler_t)(const struct exception_table_entry *,
+			    struct pt_regs *, int);
+
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
 {
@@ -14,10 +17,39 @@ ex_fixup_addr(const struct exception_table_entry *x)
 	return (unsigned long)&x->fixup + x->fixup;
 }
 
-int fixup_exception(struct pt_regs *regs)
+static int ex_handler_default(const struct exception_table_entry *fixup,
+		       struct pt_regs *regs, int trapnr)
 {
-	const struct exception_table_entry *fixup;
-	unsigned long new_ip;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+
+static int ex_handler_fault(const struct exception_table_entry *fixup,
+		     struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return 1;
+}
+
+static int ex_handler_ext(const struct exception_table_entry *fixup,
+		   struct pt_regs *regs, int trapnr)
+{
+	/* Special hack for uaccess_err */
+	current_thread_info()->uaccess_err = 1;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+
+static ex_handler_t allhandlers[] = {
+	[EXTABLE_CLASS_DEFAULT] = ex_handler_default,
+	[EXTABLE_CLASS_FAULT] = ex_handler_fault,
+	[EXTABLE_CLASS_EX] = ex_handler_ext,
+};
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+	const struct exception_table_entry *e;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +65,34 @@ int fixup_exception(struct pt_regs *regs)
 	}
 #endif
 
-	fixup = search_exception_tables(regs->ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* Special hack for uaccess_err */
-			current_thread_info()->uaccess_err = 1;
-			new_ip -= 0x7ffffff0;
-		}
-		regs->ip = new_ip;
-		return 1;
-	}
+	e = search_exception_tables(regs->ip);
+	if (!e)
+		return 0;
 
-	return 0;
+	/* if exception table corrupted die here rather than jump into space */
+	BUG_ON(e->handler >= ARRAY_SIZE(allhandlers));
+
+	return allhandlers[e->handler](e, regs, trapnr);
 }
 
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
-	const struct exception_table_entry *fixup;
+	const struct exception_table_entry *e;
 	unsigned long new_ip;
 
-	fixup = search_exception_tables(*ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
+	e = search_exception_tables(*ip);
+	if (!e)
+		return 0;
 
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* uaccess handling not supported during early boot */
-			return 0;
-		}
+	new_ip  = ex_fixup_addr(e);
 
-		*ip = new_ip;
-		return 1;
-	}
+	/* special handling not supported during early boot */
+	if (e->handler != EXTABLE_CLASS_DEFAULT)
+		return 0;
 
-	return 0;
+	*ip = new_ip;
+	return 1;
 }
 
 /*
@@ -133,6 +157,8 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup += i;
 		i += 4;
+		/* p->handler doesn't need noodling */
+		i += 4;
 	}
 
 	sort(start, finish - start, sizeof(struct exception_table_entry),
@@ -145,6 +171,8 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup -= i;
 		i += 4;
+		/* p->handler doesn't need unnoodling */
+		i += 4;
 	}
 }
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index eef44d9a3f77..495946c3f9dd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs, X86_TRAP_PF)) {
 		/*
 		 * Any interrupt that takes a fault gets the fixup. This makes
 		 * the below recursive fault logic only apply to a faults from
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index c2423d913b46..b17b716959a4 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -209,6 +209,33 @@ static int compare_relative_table(const void *a, const void *b)
 	return 0;
 }
 
+static void x86_sort_relative_table(char *extab_image, int image_size)
+{
+	int i;
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) + i, loc);
+		w(r(loc + 1) + i + 4, loc + 1);
+
+		i += sizeof(uint32_t) * 3;
+	}
+
+	qsort(extab_image, image_size / 12, 12, compare_relative_table);
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) - i, loc);
+		w(r(loc + 1) - (i + 4), loc + 1);
+
+		i += sizeof(uint32_t) * 3;
+	}
+}
+
 static void sort_relative_table(char *extab_image, int image_size)
 {
 	int i;
@@ -281,6 +308,9 @@ do_file(char const *const fname)
 		break;
 	case EM_386:
 	case EM_X86_64:
+		custom_sort = x86_sort_relative_table;
+		break;
+
 	case EM_S390:
 		custom_sort = sort_relative_table;
 		break;
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09  0:04 [PATCH v8 0/3] Machine check recovery when kernel accesses poison Tony Luck
  2015-12-31 19:40 ` [PATCH v8 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
  2016-01-08 20:49 ` [PATCH v8 1/3] x86: Expand exception table to allow new handling options Tony Luck
@ 2016-01-08 21:18 ` Tony Luck
  2016-01-09  1:49   ` Andy Lutomirski
  2 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-08 21:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Make use of the EXTABLE_FAULT exception table entries. This routine
returns a structure to indicate the result of the copy:

struct mcsafe_ret {
        u64 trapnr;
        u64 remain;
};

If the copy is successful, then both 'trapnr' and 'remain' are zero.

If we faulted during the copy, then 'trapnr' will say which type
of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
bytes were not copied.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/string_64.h |   8 +++
 arch/x86/kernel/x8664_ksyms_64.c |   2 +
 arch/x86/lib/memcpy_64.S         | 133 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ff8b9a17dc4b..5b24039463a4 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,14 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+struct mcsafe_ret {
+	u64 trapnr;
+	u64 remain;
+};
+
+struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt);
+extern void __mcsafe_copy_end(void);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..96434edd7430 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+EXPORT_SYMBOL(__mcsafe_copy);
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 16698bba87de..195ff0144152 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -177,3 +177,136 @@ ENTRY(memcpy_orig)
 .Lend:
 	retq
 ENDPROC(memcpy_orig)
+
+/*
+ * __mcsafe_copy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(__mcsafe_copy)
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorq %rax, %rax
+	xorq %rdx, %rdx
+	sfence
+	/* copy successful. return 0 */
+	ret
+
+	.section .fixup,"ax"
+	/* fixups for machine check */
+30:
+	add %ecx,%edx
+	jmp 100f
+31:
+	shl $6,%ecx
+	add %ecx,%edx
+	jmp 100f
+32:
+	shl $6,%ecx
+	lea -8(%ecx,%edx),%edx
+	jmp 100f
+33:
+	shl $6,%ecx
+	lea -16(%ecx,%edx),%edx
+	jmp 100f
+34:
+	shl $6,%ecx
+	lea -24(%ecx,%edx),%edx
+	jmp 100f
+35:
+	shl $6,%ecx
+	lea -32(%ecx,%edx),%edx
+	jmp 100f
+36:
+	shl $6,%ecx
+	lea -40(%ecx,%edx),%edx
+	jmp 100f
+37:
+	shl $6,%ecx
+	lea -48(%ecx,%edx),%edx
+	jmp 100f
+38:
+	shl $6,%ecx
+	lea -56(%ecx,%edx),%edx
+	jmp 100f
+39:
+	lea (%rdx,%rcx,8),%rdx
+	jmp 100f
+40:
+	mov %ecx,%edx
+100:
+	sfence
+
+	/* %rax set the fault number in fixup_exception() */
+	ret
+	.previous
+
+	_ASM_EXTABLE_FAULT(0b,30b)
+	_ASM_EXTABLE_FAULT(1b,31b)
+	_ASM_EXTABLE_FAULT(2b,32b)
+	_ASM_EXTABLE_FAULT(3b,33b)
+	_ASM_EXTABLE_FAULT(4b,34b)
+	_ASM_EXTABLE_FAULT(9b,35b)
+	_ASM_EXTABLE_FAULT(10b,36b)
+	_ASM_EXTABLE_FAULT(11b,37b)
+	_ASM_EXTABLE_FAULT(12b,38b)
+	_ASM_EXTABLE_FAULT(18b,39b)
+	_ASM_EXTABLE_FAULT(21b,40b)
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v8 0/3] Machine check recovery when kernel accesses poison
@ 2016-01-09  0:04 Tony Luck
  2015-12-31 19:40 ` [PATCH v8 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
                   ` (2 more replies)
  0 siblings, 3 replies; 36+ messages in thread
From: Tony Luck @ 2016-01-09  0:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

This series is initially targeted at the folks doing filesystems
on top of NVDIMMs. They really want to be able to return -EIO
when there is a h/w error (just like spinning rust, and SSD does).

I plan to use the same infrastructure to write a machine check aware
"copy_from_user()" that will SIGBUS the calling application when a
syscall touches poison in user space (just like we do when the application
touches the poison itself).

Changes V7-V8
Boris:	Would be so much cleaner if we added a new field to the exception table
	instead of squeezing bits into the fixup field. New field added
Tony:	Documentation needs to be updated. Done

Changes V6-V7:
Boris:	Why add/subtract 0x20000000? Added better comment provided by Andy
Boris:	Churn. Part2 changes things only introduced in part1.
	Merged parts 1&2 into one patch.
Ingo:	Missing my sign off on part1. Added.

Changes V5-V6
Andy:	Provoked massive re-write by providing what is now part1 of this
	patch series. This frees up two bits in the exception table
	fixup field that can be used to tag exception table entries
	as different "classes". This means we don't need my separate
	exception table for machine checks. Also avoids duplicating
	fixup actions for #PF and #MC cases that were in version 5.
Andy:	Use C99 array initializers to tie the various class fixup
	functions back to the definitions of each class. Also give the
	functions meaningful names (not fixup_class0() etc.).
Boris:	Cleaned up my lousy assembly code removing many spurious 'l'
	modifiers on instructions.
Boris:	Provided some helper functions for the machine check severity
	calculation that make the code more readable.
Boris:	Have __mcsafe_copy() return a structure with the 'remaining bytes'
	in a separate field from the fault indicator. Boris had suggested
	Linux -EFAULT/-EINVAL ... but I thought it made more sense to return
	the exception number (X86_TRAP_MC, etc.)  This finally kills off
	BIT(63) which has been controversial throughout all the early versions
	of this patch series.

Changes V4-V5
Tony:	Extended __mcsafe_copy() to have fixup entries for both machine
	check and page fault.

Changes V3-V4:
Andy:   Simplify fixup_mcexception() by dropping used-once local variable
Andy:   "Reviewed-by" tag added to part1
Boris:  Moved new functions to memcpy_64.S and declaration to asm/string_64.h
Boris:  Changed name s/mcsafe_memcpy/__mcsafe_copy/ to make it clear that this
        is an internal function and that return value doesn't follow memcpy() semantics.
Boris:  "Reviewed-by" tag added to parts 1&2

Changes V2-V3:

Andy:   Don't hack "regs->ax = BIT(63) | addr;" in the machine check
        handler.  Now have better fixup code that computes the number
        of remaining bytes (just like page-fault fixup).
Andy:   #define for BIT(63). Done, plus couple of extra macros using it.
Boris:  Don't clutter up generic code (like mm/extable.c) with this.
        I moved everything under arch/x86 (the asm-generic change is
        a more generic #define).
Boris:  Dependencies for CONFIG_MCE_KERNEL_RECOVERY are too generic.
        I made it a real menu item with default "n". Dan Williams
        will use "select MCE_KERNEL_RECOVERY" from his persistent
        filesystem code.
Boris:  Simplify conditionals in mce.c by moving tolerant/kill_it
        checks earlier, with a skip to end if they aren't set.
Boris:  Miscellaneous grammar/punctuation. Fixed.
Boris:  Don't leak spurious __start_mcextable symbols into kernels
        that didn't configure MCE_KERNEL_RECOVERY. Done.
Tony:   New code doesn't belong in user_copy_64.S/uaccess*.h. Moved
        to new .S/.h files
Elliott:Caching behavior non-optimal. Could use movntdqa, vmovntdqa
        or vmovntdqa on source addresses. I didn't fix this yet. Think
        of the current mcsafe_memcpy() as the first of several functions.
        This one is useful for small copies (meta-data) where the overhead
        of saving SSE/AVX state isn't justified.

Changes V1->V2:

0-day:  Reported build errors and warnings on 32-bit systems. Fixed
0-day:  Reported bloat to tinyconfig. Fixed
Boris:  Suggestions to use extra macros to reduce code duplication in _ASM_*EXTABLE. Done
Boris:  Re-write "tolerant==3" check to reduce indentation level. See below.
Andy:   Check IP is valid before searching kernel exception tables. Done.
Andy:   Explain use of BIT(63) on return value from mcsafe_memcpy(). Done (added decode macros).
Andy:   Untangle mess of code in tail of do_machine_check() to make it
        clear what is going on (e.g. that we only enter the ist_begin_non_atomic()
        if we were called from user code, not from kernel!). Done.
Tony Luck (3):
  x86: Expand exception table to allow new handling options
  x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception
    table entries
  x86, mce: Add __mcsafe_copy()

 Documentation/x86/exception-tables.txt    |  34 ++++++++
 arch/x86/include/asm/asm.h                |  44 ++++++----
 arch/x86/include/asm/string_64.h          |   8 ++
 arch/x86/include/asm/uaccess.h            |  13 +--
 arch/x86/kernel/cpu/mcheck/mce-severity.c |  32 ++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          |  71 ++++++++--------
 arch/x86/kernel/kprobes/core.c            |   2 +-
 arch/x86/kernel/traps.c                   |   6 +-
 arch/x86/kernel/x8664_ksyms_64.c          |   2 +
 arch/x86/lib/memcpy_64.S                  | 133 ++++++++++++++++++++++++++++++
 arch/x86/mm/extable.c                     |  84 ++++++++++++-------
 arch/x86/mm/fault.c                       |   2 +-
 scripts/sortextable.c                     |  30 +++++++
 13 files changed, 370 insertions(+), 91 deletions(-)

-- 
2.1.4

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-08 21:18 ` [PATCH v8 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
@ 2016-01-09  1:49   ` Andy Lutomirski
  2016-01-09 17:48     ` Tony Luck
  2016-01-12  0:26     ` Luck, Tony
  0 siblings, 2 replies; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-09  1:49 UTC (permalink / raw)
  To: Tony Luck
  Cc: linux-nvdimm, Dan Williams, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Jan 8, 2016 4:19 PM, "Tony Luck" <tony.luck@intel.com> wrote:
>
> Make use of the EXTABLE_FAULT exception table entries. This routine
> returns a structure to indicate the result of the copy:

Perhaps this is silly, but could we make this feature depend on ERMS
and thus make the code a lot simpler?

Also, what's the sfence for?  You don't seem to be using any
non-temporal operations.

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-08 20:49 ` [PATCH v8 1/3] x86: Expand exception table to allow new handling options Tony Luck
@ 2016-01-09  1:52   ` Andy Lutomirski
  2016-01-09  3:39     ` Brian Gerst
                       ` (2 more replies)
  0 siblings, 3 replies; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-09  1:52 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Fri, Jan 8, 2016 at 12:49 PM, Tony Luck <tony.luck@intel.com> wrote:
> Huge amounts of help from  Andy Lutomirski and Borislav Petkov to
> produce this. Andy provided the inspiration to add classes to the
> exception table with a clever bit-squeezing trick, Boris pointed
> out how much cleaner it would all be if we just had a new field.
>
> Linus Torvalds blessed the expansion with:
>   I'd rather not be clever in order to save just a tiny amount of space
>   in the exception table, which isn't really critical for anybody.
>
> The third field is a simple integer indexing into an array of handler
> functions (I thought it couldn't be a relative pointer like the other
> fields because a module may have its ex_table loaded more than 2GB away
> from the handler function - but that may not be actually true. But the
> integer is pretty flexible, we are only really using low two bits now).
>
> We start out with three handlers:
>
> 0: Legacy - just jumps to the fixup IP
> 1: Fault - provide the trap number in %ax to the fixup code
> 2: Cleaned up legacy for the uaccess error hack

I think I preferred the relative function pointer approach.

Also, I think it would be nicer if the machine check code would invoke
the handler regardless of which handler (or class) is selected.  Then
the handlers that don't want to handle #MC can just reject them.

Also, can you make the handlers return bool instead of int?

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09  1:52   ` Andy Lutomirski
@ 2016-01-09  3:39     ` Brian Gerst
  2016-01-09  4:31       ` Brian Gerst
  2016-01-09 17:45     ` Tony Luck
  2016-01-11  0:25     ` Luck, Tony
  2 siblings, 1 reply; 36+ messages in thread
From: Brian Gerst @ 2016-01-09  3:39 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Borislav Petkov, Andrew Morton,
	Andy Lutomirski, Dan Williams, Robert, linux-kernel, linux-mm,
	linux-nvdimm, X86 ML

On Fri, Jan 8, 2016 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Jan 8, 2016 at 12:49 PM, Tony Luck <tony.luck@intel.com> wrote:
>> Huge amounts of help from  Andy Lutomirski and Borislav Petkov to
>> produce this. Andy provided the inspiration to add classes to the
>> exception table with a clever bit-squeezing trick, Boris pointed
>> out how much cleaner it would all be if we just had a new field.
>>
>> Linus Torvalds blessed the expansion with:
>>   I'd rather not be clever in order to save just a tiny amount of space
>>   in the exception table, which isn't really critical for anybody.
>>
>> The third field is a simple integer indexing into an array of handler
>> functions (I thought it couldn't be a relative pointer like the other
>> fields because a module may have its ex_table loaded more than 2GB away
>> from the handler function - but that may not be actually true. But the
>> integer is pretty flexible, we are only really using low two bits now).
>>
>> We start out with three handlers:
>>
>> 0: Legacy - just jumps to the fixup IP
>> 1: Fault - provide the trap number in %ax to the fixup code
>> 2: Cleaned up legacy for the uaccess error hack
>
> I think I preferred the relative function pointer approach.
>
> Also, I think it would be nicer if the machine check code would invoke
> the handler regardless of which handler (or class) is selected.  Then
> the handlers that don't want to handle #MC can just reject them.
>
> Also, can you make the handlers return bool instead of int?

I'm hashing up an idea that could eliminate a lot of text in the .fixup
section, but it needs the integer handler method to work.  We have
a lot of fixup code that does "mov $-EFAULT, reg; jmp xxxx".  If we
encode the register in the third word, the handler can be generic and
no fixup code for each user access would be needed.  That would
recover a lot of the memory used by expanding the exception table.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09  3:39     ` Brian Gerst
@ 2016-01-09  4:31       ` Brian Gerst
  2016-01-09  6:36         ` Andy Lutomirski
  0 siblings, 1 reply; 36+ messages in thread
From: Brian Gerst @ 2016-01-09  4:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Borislav Petkov, Andrew Morton,
	Andy Lutomirski, Dan Williams, Robert, linux-kernel, linux-mm,
	linux-nvdimm, X86 ML

On Fri, Jan 8, 2016 at 10:39 PM, Brian Gerst <brgerst@gmail.com> wrote:
> On Fri, Jan 8, 2016 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Jan 8, 2016 at 12:49 PM, Tony Luck <tony.luck@intel.com> wrote:
>>> Huge amounts of help from  Andy Lutomirski and Borislav Petkov to
>>> produce this. Andy provided the inspiration to add classes to the
>>> exception table with a clever bit-squeezing trick, Boris pointed
>>> out how much cleaner it would all be if we just had a new field.
>>>
>>> Linus Torvalds blessed the expansion with:
>>>   I'd rather not be clever in order to save just a tiny amount of space
>>>   in the exception table, which isn't really critical for anybody.
>>>
>>> The third field is a simple integer indexing into an array of handler
>>> functions (I thought it couldn't be a relative pointer like the other
>>> fields because a module may have its ex_table loaded more than 2GB away
>>> from the handler function - but that may not be actually true. But the
>>> integer is pretty flexible, we are only really using low two bits now).
>>>
>>> We start out with three handlers:
>>>
>>> 0: Legacy - just jumps to the fixup IP
>>> 1: Fault - provide the trap number in %ax to the fixup code
>>> 2: Cleaned up legacy for the uaccess error hack
>>
>> I think I preferred the relative function pointer approach.
>>
>> Also, I think it would be nicer if the machine check code would invoke
>> the handler regardless of which handler (or class) is selected.  Then
>> the handlers that don't want to handle #MC can just reject them.
>>
>> Also, can you make the handlers return bool instead of int?
>
> I'm hashing up an idea that could eliminate a lot of text in the .fixup
> section, but it needs the integer handler method to work.  We have
> a lot of fixup code that does "mov $-EFAULT, reg; jmp xxxx".  If we
> encode the register in the third word, the handler can be generic and
> no fixup code for each user access would be needed.  That would
> recover a lot of the memory used by expanding the exception table.

On second thought, this could still be implemented with a relative
function pointer.  We'd just need a separate function for each
register.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09  4:31       ` Brian Gerst
@ 2016-01-09  6:36         ` Andy Lutomirski
  2016-01-11 23:09           ` Brian Gerst
  0 siblings, 1 reply; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-09  6:36 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Dan Williams, Borislav Petkov, X86 ML, linux-mm, Robert,
	Andrew Morton, Tony Luck, Ingo Molnar, linux-kernel,
	linux-nvdimm

On Jan 8, 2016 8:31 PM, "Brian Gerst" <brgerst@gmail.com> wrote:
>
> On Fri, Jan 8, 2016 at 10:39 PM, Brian Gerst <brgerst@gmail.com> wrote:
> > On Fri, Jan 8, 2016 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> On Fri, Jan 8, 2016 at 12:49 PM, Tony Luck <tony.luck@intel.com> wrote:
> >>> Huge amounts of help from  Andy Lutomirski and Borislav Petkov to
> >>> produce this. Andy provided the inspiration to add classes to the
> >>> exception table with a clever bit-squeezing trick, Boris pointed
> >>> out how much cleaner it would all be if we just had a new field.
> >>>
> >>> Linus Torvalds blessed the expansion with:
> >>>   I'd rather not be clever in order to save just a tiny amount of space
> >>>   in the exception table, which isn't really critical for anybody.
> >>>
> >>> The third field is a simple integer indexing into an array of handler
> >>> functions (I thought it couldn't be a relative pointer like the other
> >>> fields because a module may have its ex_table loaded more than 2GB away
> >>> from the handler function - but that may not be actually true. But the
> >>> integer is pretty flexible, we are only really using low two bits now).
> >>>
> >>> We start out with three handlers:
> >>>
> >>> 0: Legacy - just jumps to the fixup IP
> >>> 1: Fault - provide the trap number in %ax to the fixup code
> >>> 2: Cleaned up legacy for the uaccess error hack
> >>
> >> I think I preferred the relative function pointer approach.
> >>
> >> Also, I think it would be nicer if the machine check code would invoke
> >> the handler regardless of which handler (or class) is selected.  Then
> >> the handlers that don't want to handle #MC can just reject them.
> >>
> >> Also, can you make the handlers return bool instead of int?
> >
> > I'm hashing up an idea that could eliminate a lot of text in the .fixup
> > section, but it needs the integer handler method to work.  We have
> > a lot of fixup code that does "mov $-EFAULT, reg; jmp xxxx".  If we
> > encode the register in the third word, the handler can be generic and
> > no fixup code for each user access would be needed.  That would
> > recover a lot of the memory used by expanding the exception table.
>
> On second thought, this could still be implemented with a relative
> function pointer.  We'd just need a separate function for each
> register.
>

If we could get gcc to play along (which, IIRC, it already can for
__put_user), we can do much better with jump labels -- the fixup
target would be a jump label.

Even without that, how about using @cc?  Do:

clc
mov whatever, wherever

The fixup sets the carry flag and skips the faulting instruction
(either by knowing the length or by decoding it), and the inline asm
causes gcc to emit jc to the error logic.

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09  1:52   ` Andy Lutomirski
  2016-01-09  3:39     ` Brian Gerst
@ 2016-01-09 17:45     ` Tony Luck
  2016-01-09 18:00       ` Andy Lutomirski
  2016-01-11  0:25     ` Luck, Tony
  2 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-09 17:45 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Fri, Jan 8, 2016 at 5:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> Also, I think it would be nicer if the machine check code would invoke
> the handler regardless of which handler (or class) is selected.  Then
> the handlers that don't want to handle #MC can just reject them.

The machine check code is currently a two pass process.

First we scan all the machine check banks (on all processors
at the moment because machine checks are broadcast). We
assess the severity of all errors found.

Then we take action. Panic if the most severe error was fatal,
recover if not.

This patch series tweaks the severity calculation. In-kernel
errors at IPs with a EXTABLE_CLASS_FAULT handler are
now ranked as recoverable. All other kernel errors remain
fatal.

I don't think it is right to unconditionally execute the fix code in the
severity assessment phase.

Perhaps later we can revisit the two pass process?

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09  1:49   ` Andy Lutomirski
@ 2016-01-09 17:48     ` Tony Luck
  2016-01-09 17:57       ` Andy Lutomirski
  2016-01-12  0:26     ` Luck, Tony
  1 sibling, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-09 17:48 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-nvdimm, Dan Williams, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Fri, Jan 8, 2016 at 5:49 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Jan 8, 2016 4:19 PM, "Tony Luck" <tony.luck@intel.com> wrote:
>>
>> Make use of the EXTABLE_FAULT exception table entries. This routine
>> returns a structure to indicate the result of the copy:
>
> Perhaps this is silly, but could we make this feature depend on ERMS
> and thus make the code a lot simpler?

ERMS?

> Also, what's the sfence for?  You don't seem to be using any
> non-temporal operations.

Ah - left over from the original function that this
was cloned from (which did use non-temporal
operations).  Will delete.

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09 17:48     ` Tony Luck
@ 2016-01-09 17:57       ` Andy Lutomirski
  2016-01-09 19:39         ` Tony Luck
  0 siblings, 1 reply; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-09 17:57 UTC (permalink / raw)
  To: Tony Luck
  Cc: linux-nvdimm, Dan Williams, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Sat, Jan 9, 2016 at 9:48 AM, Tony Luck <tony.luck@gmail.com> wrote:
> On Fri, Jan 8, 2016 at 5:49 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Jan 8, 2016 4:19 PM, "Tony Luck" <tony.luck@intel.com> wrote:
>>>
>>> Make use of the EXTABLE_FAULT exception table entries. This routine
>>> returns a structure to indicate the result of the copy:
>>
>> Perhaps this is silly, but could we make this feature depend on ERMS
>> and thus make the code a lot simpler?
>
> ERMS?

It's the fast string extension, aka Enhanced REP MOV STOS.  On CPUs
with that feature (and not disabled via MSR), plain ol' rep movs is
the fastest way to copy bytes.  I think this includes all Intel CPUs
from SNB onwards.

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09 17:45     ` Tony Luck
@ 2016-01-09 18:00       ` Andy Lutomirski
  2016-01-09 19:51         ` Tony Luck
  0 siblings, 1 reply; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-09 18:00 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Sat, Jan 9, 2016 at 9:45 AM, Tony Luck <tony.luck@gmail.com> wrote:
> On Fri, Jan 8, 2016 at 5:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> Also, I think it would be nicer if the machine check code would invoke
>> the handler regardless of which handler (or class) is selected.  Then
>> the handlers that don't want to handle #MC can just reject them.
>
> The machine check code is currently a two pass process.
>
> First we scan all the machine check banks (on all processors
> at the moment because machine checks are broadcast). We
> assess the severity of all errors found.
>
> Then we take action. Panic if the most severe error was fatal,
> recover if not.
>
> This patch series tweaks the severity calculation. In-kernel
> errors at IPs with a EXTABLE_CLASS_FAULT handler are
> now ranked as recoverable. All other kernel errors remain
> fatal.
>
> I don't think it is right to unconditionally execute the fix code in the
> severity assessment phase.

I would argue that unconditionally calling the handler would be
cleaner.  The handler would return 0 or false to indicate that it
refuses to fix the exception.

This is similar to the logic that, for regular user memory access, we
shouldn't fix up faults other than #PF.  Given that we're adding
flexible handler callbacks, lets push all the "is this an acceptable
fault to fix up" down into the callback.  Does that make sense?

>
> Perhaps later we can revisit the two pass process?

Oh, I see.  Is it the case that the MC code can't cleanly handle the
case where the error was nominally recoverable but the kernel doesn't
know how to recover from it due to the lack of a handler that's okay
with it, because the handler's refusal to handle the fault wouldn't be
known until too late?

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09 17:57       ` Andy Lutomirski
@ 2016-01-09 19:39         ` Tony Luck
  2016-01-09 22:15           ` Dan Williams
  0 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-09 19:39 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-nvdimm, Dan Williams, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Sat, Jan 9, 2016 at 9:57 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Sat, Jan 9, 2016 at 9:48 AM, Tony Luck <tony.luck@gmail.com> wrote:
>> ERMS?
>
> It's the fast string extension, aka Enhanced REP MOV STOS.  On CPUs
> with that feature (and not disabled via MSR), plain ol' rep movs is
> the fastest way to copy bytes.  I think this includes all Intel CPUs
> from SNB onwards.

Ah ... very fast at copying ... but currently not machine check recoverable.

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09 18:00       ` Andy Lutomirski
@ 2016-01-09 19:51         ` Tony Luck
  2016-01-09 22:32           ` Andy Lutomirski
  0 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-09 19:51 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

> Oh, I see.  Is it the case that the MC code can't cleanly handle the
> case where the error was nominally recoverable but the kernel doesn't
> know how to recover from it due to the lack of a handler that's okay
> with it, because the handler's refusal to handle the fault wouldn't be
> known until too late?

The code is just too clunky right now.  We have a table driven
severity calculator that we invoke on each machine check bank
that has some valid data to report.  Part of that calculation is
"what context am I in?". Which happens earlier in the sequence
than "Is MCi_STATUS.MCACOD some known recoverable type".
If I invoke the fixup code I'll change regs->ip right away ... even
if I'm executing on some innocent bystander processor that wasn't
the source of the machine check (the bystanders on the same
socket can usually see something logged in one of the memory
controller banks).

There are definitely some cleanups that should be done
in this code (e.g. figuring out our context just once, not once
per bank).  But I'm pretty sure I'll always want to know
"am I executing an instruction with a #MC recoverable
handler?" in a way that doesn't actually invoke the recovery.

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09 19:39         ` Tony Luck
@ 2016-01-09 22:15           ` Dan Williams
  2016-01-09 22:33             ` Andy Lutomirski
  0 siblings, 1 reply; 36+ messages in thread
From: Dan Williams @ 2016-01-09 22:15 UTC (permalink / raw)
  To: Tony Luck
  Cc: Andy Lutomirski, linux-nvdimm, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Sat, Jan 9, 2016 at 11:39 AM, Tony Luck <tony.luck@gmail.com> wrote:
> On Sat, Jan 9, 2016 at 9:57 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Sat, Jan 9, 2016 at 9:48 AM, Tony Luck <tony.luck@gmail.com> wrote:
>>> ERMS?
>>
>> It's the fast string extension, aka Enhanced REP MOV STOS.  On CPUs
>> with that feature (and not disabled via MSR), plain ol' rep movs is
>> the fastest way to copy bytes.  I think this includes all Intel CPUs
>> from SNB onwards.
>
> Ah ... very fast at copying .. but currently not machine check recoverable.

Hmm, I assume for the pmem driver I'll want to check at runtime if the
cpu has machine check recovery and fall back to the faster copy if
it's not available?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09 19:51         ` Tony Luck
@ 2016-01-09 22:32           ` Andy Lutomirski
  2016-01-10  1:15             ` Tony Luck
  0 siblings, 1 reply; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-09 22:32 UTC (permalink / raw)
  To: Tony Luck
  Cc: Dan Williams, Borislav Petkov, X86 ML, linux-mm, Robert,
	Andrew Morton, Ingo Molnar, linux-kernel, linux-nvdimm

On Jan 9, 2016 11:51 AM, "Tony Luck" <tony.luck@gmail.com> wrote:
>
> > Oh, I see.  Is it the case that the MC code can't cleanly handle the
> > case where the error was nominally recoverable but the kernel doesn't
> > know how to recover from it due to the lack of a handler that's okay
> > with it, because the handler's refusal to handle the fault wouldn't be
> > known until too late?
>
> The code is just too clunky right now.  We have a table driven
> severity calculator that we invoke on each machine check bank
> that has some valid data to report.  Part of that calculation is
> "what context am I in?". Which happens earlier in the sequence
> than "Is MCi_STATUS.MCACOD some known recoverable type".
> If I invoke the fixup code I'll change regs->ip right away ... even
> if I'm executing on some innocent bystander processor that wasn't
> the source of the machine check (the bystanders on the same
> socket can usually see something logged in one of the memory
> controller banks).

Makes sense, sort of.  But even if there is an MC fixup registered,
don't you still have to make sure to execute it on the actual victim
CPU?  After all, you don't want to fail an mcsafe copy just because a
different CPU coincidentally machine checked while the mcsafe copy has
the recoverable RIP value.

>
> There are definitely some cleanups that should be done
> in this code (e.g. figuring our context just once, not once
> per bank).  But I'm pretty sure I'll always want to know
> "am I executing an instruction with a #MC recoverable
> handler?" in a way that doesn't actually invoke the recovery.

What's wrong with:

Step 1: determine that the HW context is, in principle, recoverable.

Step 2: ask the handler to try to recover.

Step 3: if the handler doesn't recover, panic

I'm not saying that restructuring the code like this should be a
prerequisite for merging this, but I'm wondering whether it would make
sense at some point in the future.

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09 22:15           ` Dan Williams
@ 2016-01-09 22:33             ` Andy Lutomirski
  2016-01-10  0:23               ` Dan Williams
  0 siblings, 1 reply; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-09 22:33 UTC (permalink / raw)
  To: Dan Williams
  Cc: Tony Luck, linux-nvdimm, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Sat, Jan 9, 2016 at 2:15 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Sat, Jan 9, 2016 at 11:39 AM, Tony Luck <tony.luck@gmail.com> wrote:
>> On Sat, Jan 9, 2016 at 9:57 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Sat, Jan 9, 2016 at 9:48 AM, Tony Luck <tony.luck@gmail.com> wrote:
>>>> ERMS?
>>>
>>> It's the fast string extension, aka Enhanced REP MOV STOS.  On CPUs
>>> with that feature (and not disabled via MSR), plain ol' rep movs is
>>> the fastest way to copy bytes.  I think this includes all Intel CPUs
>>> from SNB onwards.
>>
>> Ah ... very fast at copying .. but currently not machine check recoverable.
>
> Hmm, I assume for the pmem driver I'll want to check at runtime if the
> cpu has machine check recovery and fall back to the faster copy if
> it's not available?

Shouldn't that logic live in the mcsafe_copy routine itself rather
than being delegated to callers?

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09 22:33             ` Andy Lutomirski
@ 2016-01-10  0:23               ` Dan Williams
  2016-01-10  1:40                 ` Tony Luck
  0 siblings, 1 reply; 36+ messages in thread
From: Dan Williams @ 2016-01-10  0:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, linux-nvdimm, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Sat, Jan 9, 2016 at 2:33 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Sat, Jan 9, 2016 at 2:15 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>> On Sat, Jan 9, 2016 at 11:39 AM, Tony Luck <tony.luck@gmail.com> wrote:
>>> On Sat, Jan 9, 2016 at 9:57 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>> On Sat, Jan 9, 2016 at 9:48 AM, Tony Luck <tony.luck@gmail.com> wrote:
>>>>> ERMS?
>>>>
>>>> It's the fast string extension, aka Enhanced REP MOV STOS.  On CPUs
>>>> with that feature (and not disabled via MSR), plain ol' rep movs is
>>>> the fastest way to copy bytes.  I think this includes all Intel CPUs
>>>> from SNB onwards.
>>>
>>> Ah ... very fast at copying .. but currently not machine check recoverable.
>>
>> Hmm, I assume for the pmem driver I'll want to check at runtime if the
>> cpu has machine check recovery and fall back to the faster copy if
>> it's not available?
>
> Shouldn't that logic live in the mcsafe_copy routine itself rather
> than being delegated to callers?
>

Yes, please.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09 22:32           ` Andy Lutomirski
@ 2016-01-10  1:15             ` Tony Luck
  0 siblings, 0 replies; 36+ messages in thread
From: Tony Luck @ 2016-01-10  1:15 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Dan Williams, Borislav Petkov, X86 ML, linux-mm, Robert,
	Andrew Morton, Ingo Molnar, linux-kernel, linux-nvdimm

On Sat, Jan 9, 2016 at 2:32 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> Step 1: determine that the HW context is, in principle, recoverable.
>
> Step 2: ask the handler to try to recover.
>
> Step 3: if the handler doesn't recover, panic
>
> I'm not saying that restructuring the code like this should be a
> prerequisite for merging this, but I'm wondering whether it would make
> sense at some point in the future.

For the local machine check case this all looks simple. For the broadcast
case it's pretty incompatible with the current code structure. For a machine
check triggered someplace in the kernel w/o a new style fixup handler we'd
start by saying ... "sure, that's plausible to recover from". Then after we let
all the other CPUs return from the machine check handler we'd take it
back and say "just kidding, we're going down". It might work, but it would
be a messier panic than we have now.

Definitely food for thought for some future cleanups.

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-10  0:23               ` Dan Williams
@ 2016-01-10  1:40                 ` Tony Luck
  2016-01-10 11:26                   ` Borislav Petkov
  0 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-10  1:40 UTC (permalink / raw)
  To: Dan Williams
  Cc: Andy Lutomirski, linux-nvdimm, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Sat, Jan 9, 2016 at 4:23 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Sat, Jan 9, 2016 at 2:33 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> Shouldn't that logic live in the mcsafe_copy routine itself rather
>> than being delegated to callers?
>>
>
> Yes, please.

Yes - we should have some of that fancy self-patching code that
redirects to the optimal routine for the cpu model we are running
on.

BUT ... it's all going to be very messy.  We don't have any CPUID
capability bits to say whether we support recovery, or which instructions
are good/bad choices for recovery. You might think that MCG_CAP{24}
which is described as "software error recovery" (or some such) would
be a good clue, but you'd be wrong. The bit got a little overloaded and
there are cpus that set it, but won't recover.

Only Intel(R) Xeon(R) branded cpus can recover, but not all. The story so far:

Nehalem, Westmere: E7 models support SRAO recovery (patrol scrub,
cache eviction). Not relevant for this e-mail thread.

Sandy Bridge: Some "advanced RAS" skus will recover from poison reads
(these have E5 model names, there was no E7 in this generation)

Ivy Bridge: Xeon E5-* models do not recover. E7-* models do recover.
Note E5 and E7 have the same CPUID model number.

Haswell: Same as Ivy Bridge

Broadwell/Sky Lake: Xeon not released yet ... can't talk about them.

Linux code recently got some recovery bits for AMD cpus ... I don't
know what the story is on which models support this.

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-10  1:40                 ` Tony Luck
@ 2016-01-10 11:26                   ` Borislav Petkov
  2016-01-11 10:44                     ` Ingo Molnar
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2016-01-10 11:26 UTC (permalink / raw)
  To: Tony Luck
  Cc: Dan Williams, Andy Lutomirski, linux-nvdimm, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Sat, Jan 09, 2016 at 05:40:05PM -0800, Tony Luck wrote:
> BUT ... it's all going to be very messy.  We don't have any CPUID
> capability bits to say whether we support recovery, or which instructions
> are good/bad choices for recovery.

We can always define synthetic ones and set them after having checked
MCA capability bits, f/m/s, etc., maybe even based on the list you're
supplying...

> Linux code recently got some recovery bits for AMD cpus ... I don't
> know what the story is on which models support this,

You mean this?

                /*
                 * overflow_recov is supported for F15h Models 00h-0fh
                 * even though we don't have a CPUID bit for it.
                 */
                if (c->x86 == 0x15 && c->x86_model <= 0xf)
                        mce_flags.overflow_recov = 1;

If so, that's just an improvement which makes MCi_STATUS[Overflow] MCEs
non-fatal.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09  1:52   ` Andy Lutomirski
  2016-01-09  3:39     ` Brian Gerst
  2016-01-09 17:45     ` Tony Luck
@ 2016-01-11  0:25     ` Luck, Tony
  2 siblings, 0 replies; 36+ messages in thread
From: Luck, Tony @ 2016-01-11  0:25 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML, Brian Gerst

On Fri, Jan 08, 2016 at 05:52:54PM -0800, Andy Lutomirski wrote:
> I think I preferred the relative function pointer approach.
> 
> Also, I think it would be nicer if the machine check code would invoke
> the handler regardless of which handler (or class) is selected.  Then
> the handlers that don't want to handle #MC can just reject them.
> 
> Also, can you make the handlers return bool instead of int?

This patch is currently on top of the 3 patches ... just to check
direction. Obviously it needs to be folded into parts 1 & 2.
[and my Documentation changes updated]

So function pointers and bool returns are here.  Only bit that
I think Andy won't be "happy" with is ex_has_fault_handler().
But perhaps I've worn him down so he can be resigned to this
as a necessary evil until the machine check handler gets a
major overhaul.

Brian: You started to say you wanted the int fields, but then it
looked like you took it back, as you'd only need a half dozen
or so functions to cover all the possible "mov $-EFAULT,reg"
cases that you'd like to optimize.

-Tony

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 0280f5c5d160..f5063b6659eb 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -42,28 +42,24 @@
 #define _ASM_SI		__ASM_REG(si)
 #define _ASM_DI		__ASM_REG(di)
 
-#define EXTABLE_CLASS_DEFAULT	0	/* standard uaccess fixup */
-#define EXTABLE_CLASS_FAULT	1	/* provide trap number in %ax */
-#define EXTABLE_CLASS_EX	2	/* uaccess + set uaccess_err */
-
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE_CLASS(from, to, class)			\
+# define _ASM_EXTABLE_HANDLE(from, to, handler)			\
 	.pushsection "__ex_table","a" ;				\
 	.balign 4 ;						\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
-	.long (class) ;						\
+	.long (handler) - . ;					\
 	.popsection
 
 # define _ASM_EXTABLE(from, to)					\
-	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_DEFAULT)
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
 
 # define _ASM_EXTABLE_FAULT(from, to)				\
-	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_FAULT)
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
 
 # define _ASM_EXTABLE_EX(from, to)				\
-	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_EX)
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
 # define _ASM_NOKPROBE(entry)					\
 	.pushsection "_kprobe_blacklist","aw" ;			\
@@ -96,23 +92,23 @@
 	.endm
 
 #else
-# define _EXPAND_EXTABLE_CLASS(x) #x
-# define _ASM_EXTABLE_CLASS(from, to, class)			\
+# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _ASM_EXTABLE_HANDLE(from, to, handler)			\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
 	" .balign 4\n"						\
 	" .long (" #from ") - .\n"				\
 	" .long (" #to ") - .\n"				\
-	" .long (" _EXPAND_EXTABLE_CLASS(class) ")\n"		\
+	" .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n"	\
 	" .popsection\n"
 
 # define _ASM_EXTABLE(from, to)					\
-	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_DEFAULT)
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
 
 # define _ASM_EXTABLE_FAULT(from, to)				\
-	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_FAULT)
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
 
 # define _ASM_EXTABLE_EX(from, to)				\
-	_ASM_EXTABLE_CLASS(from, to, EXTABLE_CLASS_EX)
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 315d1423377c..565b382b2f35 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -104,14 +104,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  */
 
 struct exception_table_entry {
-	int insn, fixup;
-	u32 handler;
+	int insn, fixup, handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
 extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index e87aa060977c..bca8b3936740 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -185,15 +185,6 @@ static struct severity {
 #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
 				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
 
-static inline bool mce_in_kernel_recov(unsigned long addr)
-{
-	const struct exception_table_entry *fixup;
-
-	fixup = search_exception_tables(addr);
-
-	return fixup && fixup->handler == EXTABLE_CLASS_FAULT;
-}
-
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -209,7 +200,7 @@ static int error_context(struct mce *m)
 {
 	if ((m->cs & 3) == 3)
 		return IN_USER;
-	if (mc_recoverable(m->mcgstatus) && mce_in_kernel_recov(m->ip))
+	if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
 		return IN_KERNEL_RECOV;
 	return IN_KERNEL;
 }
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index cfaf5feace36..e1a996bc84e3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -3,7 +3,7 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
-typedef int (*ex_handler_t)(const struct exception_table_entry *,
+typedef bool (*ex_handler_t)(const struct exception_table_entry *,
 			    struct pt_regs *, int);
 
 static inline unsigned long
@@ -16,23 +16,30 @@ ex_fixup_addr(const struct exception_table_entry *x)
 {
 	return (unsigned long)&x->fixup + x->fixup;
 }
+static inline ex_handler_t
+ex_fixup_handler(const struct exception_table_entry *x)
+{
+	return (ex_handler_t)((unsigned long)&x->handler + x->handler);
+}
 
-static int ex_handler_default(const struct exception_table_entry *fixup,
+bool ex_handler_default(const struct exception_table_entry *fixup,
 		       struct pt_regs *regs, int trapnr)
 {
 	regs->ip = ex_fixup_addr(fixup);
 	return 1;
 }
+EXPORT_SYMBOL(ex_handler_default);
 
-static int ex_handler_fault(const struct exception_table_entry *fixup,
+bool ex_handler_fault(const struct exception_table_entry *fixup,
 		     struct pt_regs *regs, int trapnr)
 {
 	regs->ip = ex_fixup_addr(fixup);
 	regs->ax = trapnr;
 	return 1;
 }
+EXPORT_SYMBOL_GPL(ex_handler_fault);
 
-static int ex_handler_ext(const struct exception_table_entry *fixup,
+bool ex_handler_ext(const struct exception_table_entry *fixup,
 		   struct pt_regs *regs, int trapnr)
 {
 	/* Special hack for uaccess_err */
@@ -40,16 +47,25 @@ static int ex_handler_ext(const struct exception_table_entry *fixup,
 	regs->ip = ex_fixup_addr(fixup);
 	return 1;
 }
+EXPORT_SYMBOL(ex_handler_ext);
+
+bool ex_has_fault_handler(unsigned long ip)
+{
+	const struct exception_table_entry *e;
+	ex_handler_t handler;
 
-static ex_handler_t allhandlers[] = {
-	[EXTABLE_CLASS_DEFAULT] = ex_handler_default,
-	[EXTABLE_CLASS_FAULT] = ex_handler_fault,
-	[EXTABLE_CLASS_EX] = ex_handler_ext,
-};
+	e = search_exception_tables(ip);
+	if (!e)
+		return 0;
+	handler = ex_fixup_handler(e);
+
+	return handler == ex_handler_fault;
+}
 
 int fixup_exception(struct pt_regs *regs, int trapnr)
 {
 	const struct exception_table_entry *e;
+	ex_handler_t handler;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -69,10 +85,8 @@ int fixup_exception(struct pt_regs *regs, int trapnr)
 	if (!e)
 		return 0;
 
-	/* if exception table corrupted die here rather than jump into space */
-	BUG_ON(e->handler >= ARRAY_SIZE(allhandlers));
-
-	return allhandlers[e->handler](e, regs, trapnr);
+	handler = ex_fixup_handler(e);
+	return handler(e, regs, trapnr);
 }
 
 /* Restricted version used during very early boot */
@@ -80,15 +94,17 @@ int __init early_fixup_exception(unsigned long *ip)
 {
 	const struct exception_table_entry *e;
 	unsigned long new_ip;
+	ex_handler_t handler;
 
 	e = search_exception_tables(*ip);
 	if (!e)
 		return 0;
 
 	new_ip  = ex_fixup_addr(e);
+	handler = ex_fixup_handler(e);
 
 	/* special handling not supported during early boot */
-	if (e->handler != EXTABLE_CLASS_DEFAULT)
+	if (handler != ex_handler_default)
 		return 0;
 
 	*ip = new_ip;
@@ -157,7 +173,7 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup += i;
 		i += 4;
-		/* p->handler doesn't need noodling */
+		p->handler += i;
 		i += 4;
 	}
 
@@ -171,7 +187,7 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup -= i;
 		i += 4;
-		/* p->handler doesn't need unnoodling */
+		p->handler -= i;
 		i += 4;
 	}
 }
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index b17b716959a4..7b29fb14f870 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -219,6 +219,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size)
 
 		w(r(loc) + i, loc);
 		w(r(loc + 1) + i + 4, loc + 1);
+		w(r(loc + 2) + i + 8, loc + 2);
 
 		i += sizeof(uint32_t) * 3;
 	}
@@ -231,6 +232,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size)
 
 		w(r(loc) - i, loc);
 		w(r(loc + 1) - (i + 4), loc + 1);
+		w(r(loc + 2) - (i + 8), loc + 2);
 
 		i += sizeof(uint32_t) * 3;
 	}

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-10 11:26                   ` Borislav Petkov
@ 2016-01-11 10:44                     ` Ingo Molnar
  2016-01-13 23:22                       ` Tony Luck
  0 siblings, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2016-01-11 10:44 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, Dan Williams, Andy Lutomirski, linux-nvdimm,
	linux-kernel, Andrew Morton, Robert, linux-mm, X86 ML


* Borislav Petkov <bp@alien8.de> wrote:

> On Sat, Jan 09, 2016 at 05:40:05PM -0800, Tony Luck wrote:
> > BUT ... it's all going to be very messy.  We don't have any CPUID
> > capability bits to say whether we support recovery, or which instructions
> > are good/bad choices for recovery.
> 
> We can always define synthetic ones and set them after having checked
> MCA capability bits, f/m/s, etc., maybe even based on the list you're
> supplying...

So such a synthetic CPUID bit would definitely be useful.

Also, knowing whether a memcpy function is recoverable or not, should not be 
delegated to callers: there should be the regular memcpy APIs, plus new APIs that 
do everything they can to provide recoverable memory copies. Whether it's achieved 
via flag checking, a function pointer or code patching is an implementation detail 
that's not visible to drivers making use of the new facility.

I'd go for the simplest, most robust solution initially, also perhaps with boot 
time messages to make sure users know which variant is used and how.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-09  6:36         ` Andy Lutomirski
@ 2016-01-11 23:09           ` Brian Gerst
  2016-01-11 23:22             ` Andy Lutomirski
  2016-01-11 23:48             ` Luck, Tony
  0 siblings, 2 replies; 36+ messages in thread
From: Brian Gerst @ 2016-01-11 23:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Dan Williams, Borislav Petkov, X86 ML, linux-mm, Robert,
	Andrew Morton, Tony Luck, Ingo Molnar, linux-kernel,
	linux-nvdimm

On Sat, Jan 9, 2016 at 1:36 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Jan 8, 2016 8:31 PM, "Brian Gerst" <brgerst@gmail.com> wrote:
>>
>> On Fri, Jan 8, 2016 at 10:39 PM, Brian Gerst <brgerst@gmail.com> wrote:
>> > On Fri, Jan 8, 2016 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> >> On Fri, Jan 8, 2016 at 12:49 PM, Tony Luck <tony.luck@intel.com> wrote:
>> >>> Huge amounts of help from  Andy Lutomirski and Borislav Petkov to
>> >>> produce this. Andy provided the inspiration to add classes to the
>> >>> exception table with a clever bit-squeezing trick, Boris pointed
>> >>> out how much cleaner it would all be if we just had a new field.
>> >>>
>> >>> Linus Torvalds blessed the expansion with:
>> >>>   I'd rather not be clever in order to save just a tiny amount of space
>> >>>   in the exception table, which isn't really criticial for anybody.
>> >>>
>> >>> The third field is a simple integer indexing into an array of handler
>> >>> functions (I thought it couldn't be a relative pointer like the other
>> >>> fields because a module may have its ex_table loaded more than 2GB away
>> >>> from the handler function - but that may not be actually true. But the
>> >>> integer is pretty flexible, we are only really using low two bits now).
>> >>>
>> >>> We start out with three handlers:
>> >>>
>> >>> 0: Legacy - just jumps the to fixup IP
>> >>> 1: Fault - provide the trap number in %ax to the fixup code
>> >>> 2: Cleaned up legacy for the uaccess error hack
>> >>
>> >> I think I preferred the relative function pointer approach.
>> >>
>> >> Also, I think it would be nicer if the machine check code would invoke
>> >> the handler regardless of which handler (or class) is selected.  Then
>> >> the handlers that don't want to handle #MC can just reject them.
>> >>
>> >> Also, can you make the handlers return bool instead of int?
>> >
>> > I'm hashing up an idea that could eliminate alot of text in the .fixup
>> > section, but it needs the integer handler method to work.  We have
>> > alot of fixup code that does "mov $-EFAULT, reg; jmp xxxx".  If we
>> > encode the register in the third word, the handler can be generic and
>> > no fixup code for each user access would be needed.  That would
>> > recover alot of the memory used by expanding the exception table.
>>
>> On second thought, this could still be implemented with a relative
>> function pointer.  We'd just need a separate function for each
>> register.
>>
>
> If we could get gcc to play along (which, IIRC, it already can for
> __put_user), we can do much better with jump labels -- the fixup
> target would be a jump label.
>
> Even without that, how about using @cc?  Do:
>
> clc
> mov whatever, wherever
>
> The fixup sets the carry flag and skips the faulting instruction
> (either by knowing the length or by decoding it), and the inline asm
> causes gcc to emit jc to the error logic.
>
> --Andy

I agree that for at least put_user() using asm goto would be an even
better option.  get_user() on the other hand, will be much messier to
deal with, since asm goto statements can't have outputs, plus it
zeroes the output register on fault.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-11 23:09           ` Brian Gerst
@ 2016-01-11 23:22             ` Andy Lutomirski
  2016-01-11 23:48             ` Luck, Tony
  1 sibling, 0 replies; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-11 23:22 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Dan Williams, Borislav Petkov, X86 ML, linux-mm, Robert,
	Andrew Morton, Tony Luck, Ingo Molnar, linux-kernel,
	linux-nvdimm

On Mon, Jan 11, 2016 at 3:09 PM, Brian Gerst <brgerst@gmail.com> wrote:
> On Sat, Jan 9, 2016 at 1:36 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Jan 8, 2016 8:31 PM, "Brian Gerst" <brgerst@gmail.com> wrote:
>>>
>>> On Fri, Jan 8, 2016 at 10:39 PM, Brian Gerst <brgerst@gmail.com> wrote:
>>> > On Fri, Jan 8, 2016 at 8:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> >> On Fri, Jan 8, 2016 at 12:49 PM, Tony Luck <tony.luck@intel.com> wrote:
>>> >>> Huge amounts of help from  Andy Lutomirski and Borislav Petkov to
>>> >>> produce this. Andy provided the inspiration to add classes to the
>>> >>> exception table with a clever bit-squeezing trick, Boris pointed
>>> >>> out how much cleaner it would all be if we just had a new field.
>>> >>>
>>> >>> Linus Torvalds blessed the expansion with:
>>> >>>   I'd rather not be clever in order to save just a tiny amount of space
>>> >>>   in the exception table, which isn't really criticial for anybody.
>>> >>>
>>> >>> The third field is a simple integer indexing into an array of handler
>>> >>> functions (I thought it couldn't be a relative pointer like the other
>>> >>> fields because a module may have its ex_table loaded more than 2GB away
>>> >>> from the handler function - but that may not be actually true. But the
>>> >>> integer is pretty flexible, we are only really using low two bits now).
>>> >>>
>>> >>> We start out with three handlers:
>>> >>>
>>> >>> 0: Legacy - just jumps the to fixup IP
>>> >>> 1: Fault - provide the trap number in %ax to the fixup code
>>> >>> 2: Cleaned up legacy for the uaccess error hack
>>> >>
>>> >> I think I preferred the relative function pointer approach.
>>> >>
>>> >> Also, I think it would be nicer if the machine check code would invoke
>>> >> the handler regardless of which handler (or class) is selected.  Then
>>> >> the handlers that don't want to handle #MC can just reject them.
>>> >>
>>> >> Also, can you make the handlers return bool instead of int?
>>> >
>>> > I'm hashing up an idea that could eliminate alot of text in the .fixup
>>> > section, but it needs the integer handler method to work.  We have
>>> > alot of fixup code that does "mov $-EFAULT, reg; jmp xxxx".  If we
>>> > encode the register in the third word, the handler can be generic and
>>> > no fixup code for each user access would be needed.  That would
>>> > recover alot of the memory used by expanding the exception table.
>>>
>>> On second thought, this could still be implemented with a relative
>>> function pointer.  We'd just need a separate function for each
>>> register.
>>>
>>
>> If we could get gcc to play along (which, IIRC, it already can for
>> __put_user), we can do much better with jump labels -- the fixup
>> target would be a jump label.
>>
>> Even without that, how about using @cc?  Do:
>>
>> clc
>> mov whatever, wherever
>>
>> The fixup sets the carry flag and skips the faulting instruction
>> (either by knowing the length or by decoding it), and the inline asm
>> causes gcc to emit jc to the error logic.
>>
>> --Andy
>
> I agree that for at least put_user() using asm goto would be an even
> better option.  get_user() on the other hand, will be much messier to
> deal with, since asm goto statements can't have outputs, plus it
> zeroes the output register on fault.
>

The cc thing still works for get_user, I think.

int fault;
asm ("clc; mov whatever, wherever" : "=r" (out), "=@ccc" (fault) : "m" (in));
if (fault) {
  out = 0;
  return -EFAULT;
}

return 0;

You'd set the handler to a special handler that does regs->flags |=
X86_EFLAGS_CF in addition to jumping to the landing pad, which, in
this case, is immediately after the mov.

If you want to be *really* fancy, a post-compilation pass could detect
these things, observe that the landing pad points to jc, nop out the
jc, and move the landing pad to the jc target.  This gets most of the
speed benefit of what asm goto would do if gcc supported it without
relying on gcc support.

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH v8 1/3] x86: Expand exception table to allow new handling options
  2016-01-11 23:09           ` Brian Gerst
  2016-01-11 23:22             ` Andy Lutomirski
@ 2016-01-11 23:48             ` Luck, Tony
  1 sibling, 0 replies; 36+ messages in thread
From: Luck, Tony @ 2016-01-11 23:48 UTC (permalink / raw)
  To: Brian Gerst, Andy Lutomirski
  Cc: Williams, Dan J, Borislav Petkov, X86 ML, linux-mm, Robert,
	Andrew Morton, Ingo Molnar, linux-kernel, linux-nvdimm

> I agree that for at least put_user() using asm goto would be an even
> better option.  get_user() on the other hand, will be much messier to
> deal with, since asm goto statements can't have outputs, plus it
> zeroes the output register on fault.

get_user() is the much more interesting one for me. A read from
a poisoned user address that generates a machine check is something
that can be recovered (kill the process).  A write to user space doesn't
even generate a machine check.

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-09  1:49   ` Andy Lutomirski
  2016-01-09 17:48     ` Tony Luck
@ 2016-01-12  0:26     ` Luck, Tony
  2016-01-12  0:30       ` Andy Lutomirski
  2016-01-12  0:37       ` Andy Lutomirski
  1 sibling, 2 replies; 36+ messages in thread
From: Luck, Tony @ 2016-01-12  0:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-nvdimm, Dan Williams, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Fri, Jan 08, 2016 at 05:49:30PM -0800, Andy Lutomirski wrote:
> Also, what's the sfence for?  You don't seem to be using any
> non-temporal operations.

So I deleted the "sfence" and now I just have a comment
at the 100: label.

37:
        shl $6,%ecx
        lea -48(%ecx,%edx),%edx
        jmp 100f
38:
        shl $6,%ecx
        lea -56(%ecx,%edx),%edx
        jmp 100f
39:
        lea (%rdx,%rcx,8),%rdx
        jmp 100f
40:
        mov %ecx,%edx
100:
        /* %rax set the fault number in fixup_exception() */
        ret

Should I just change all the "jmp 100f" into "ret"?  There
aren't any tools that will be confused that the function
has 10 returns, are there?

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-12  0:26     ` Luck, Tony
@ 2016-01-12  0:30       ` Andy Lutomirski
  2016-01-12  0:37       ` Andy Lutomirski
  1 sibling, 0 replies; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-12  0:30 UTC (permalink / raw)
  To: Luck, Tony
  Cc: linux-nvdimm, Dan Williams, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Mon, Jan 11, 2016 at 4:26 PM, Luck, Tony <tony.luck@intel.com> wrote:
> On Fri, Jan 08, 2016 at 05:49:30PM -0800, Andy Lutomirski wrote:
>> Also, what's the sfence for?  You don't seem to be using any
>> non-temporal operations.
>
> So I deleted the "sfence" and now I just have a comment
> at the 100: label.
>
> 37:
>         shl $6,%ecx
>         lea -48(%ecx,%edx),%edx
>         jmp 100f
> 38:
>         shl $6,%ecx
>         lea -56(%ecx,%edx),%edx
>         jmp 100f
> 39:
>         lea (%rdx,%rcx,8),%rdx
>         jmp 100f
> 40:
>         mov %ecx,%edx
> 100:
>         /* %rax set the fault number in fixup_exception() */
>         ret
>
> Should I just change all the "jmp 100f" into "ret"?  There
> aren't any tools that will be confused that the function
> has 10 returns, are there?
>

Given that gcc does that too, it should be fine.

--Andy\

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-12  0:26     ` Luck, Tony
  2016-01-12  0:30       ` Andy Lutomirski
@ 2016-01-12  0:37       ` Andy Lutomirski
  1 sibling, 0 replies; 36+ messages in thread
From: Andy Lutomirski @ 2016-01-12  0:37 UTC (permalink / raw)
  To: Luck, Tony
  Cc: linux-nvdimm, Dan Williams, Borislav Petkov, linux-kernel,
	Andrew Morton, Robert, Ingo Molnar, linux-mm, X86 ML

On Mon, Jan 11, 2016 at 4:26 PM, Luck, Tony <tony.luck@intel.com> wrote:
> On Fri, Jan 08, 2016 at 05:49:30PM -0800, Andy Lutomirski wrote:
>> Also, what's the sfence for?  You don't seem to be using any
>> non-temporal operations.
>
> So I deleted the "sfence" and now I just have a comment
> at the 100: label.
>
> 37:
>         shl $6,%ecx
>         lea -48(%ecx,%edx),%edx
>         jmp 100f
> 38:
>         shl $6,%ecx
>         lea -56(%ecx,%edx),%edx
>         jmp 100f
> 39:
>         lea (%rdx,%rcx,8),%rdx
>         jmp 100f
> 40:
>         mov %ecx,%edx
> 100:
>         /* %rax set the fault number in fixup_exception() */
>         ret
>
> Should I just change all the "jmp 100f" into "ret"?  There
> aren't any tools that will be confused that the function
> has 10 returns, are there?
>

Given that gcc does that too, it should be fine.

--Andy\

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-11 10:44                     ` Ingo Molnar
@ 2016-01-13 23:22                       ` Tony Luck
  2016-01-14  4:39                         ` Borislav Petkov
  0 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-13 23:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Dan Williams, Andy Lutomirski, linux-nvdimm,
	linux-kernel, Andrew Morton, Robert, linux-mm, X86 ML

On Mon, Jan 11, 2016 at 2:44 AM, Ingo Molnar <mingo@kernel.org> wrote:
> So such a synthetic CPUID bit would definitely be useful.
>
> Also, knowing whether a memcpy function is recoverable or not, should not be
> delegated to callers: there should be the regular memcpy APIs, plus new APIs that
> do everything they can to provide recoverable memory copies. Whether it's achieved
> via flag checking, a function pointer or code patching is an implementation detail
> that's not visible to drivers making use of the new facility.
>
> I'd go for the simplest, most robust solution initially, also perhaps with boot
> time messages to make sure users know which variant is used and now.

Are there some examples of synthetic CPUID bits?  I grepped around and
found a handful of places making ad hoc decisions based on sub-strings of
x86_model_id[] ... but didn't find any systematic approach.

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-13 23:22                       ` Tony Luck
@ 2016-01-14  4:39                         ` Borislav Petkov
  2016-01-30  0:35                           ` Tony Luck
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2016-01-14  4:39 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Dan Williams, Andy Lutomirski, linux-nvdimm,
	linux-kernel, Andrew Morton, Robert, linux-mm, X86 ML

On Wed, Jan 13, 2016 at 03:22:58PM -0800, Tony Luck wrote:
> Are there some examples of synthetic CPUID bits?

X86_FEATURE_ALWAYS is one. The others got renamed into X86_BUG_* ones,
the remaining mechanism is the same, though.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-14  4:39                         ` Borislav Petkov
@ 2016-01-30  0:35                           ` Tony Luck
  2016-01-30 10:28                             ` Borislav Petkov
  0 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-01-30  0:35 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Dan Williams, Andy Lutomirski, linux-nvdimm,
	linux-kernel, Andrew Morton, Robert, linux-mm, X86 ML

On Wed, Jan 13, 2016 at 8:39 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Jan 13, 2016 at 03:22:58PM -0800, Tony Luck wrote:
>> Are there some examples of synthetic CPUID bits?
>
> X86_FEATURE_ALWAYS is one. The others got renamed into X86_BUG_* ones,
> the remaining mechanism is the same, though.

So something like this [gmail will line wrap, but should still be legible]

Then Dan will be able to use:

      if (cpu_has(c, X86_FEATURE_MCRECOVERY))

to decide whether to use the (slightly slower, but recovery capable)
__mcsafe_copy()
or just pick the fastest memcpy() instead.

-Tony


diff --git a/arch/x86/include/asm/cpufeature.h
b/arch/x86/include/asm/cpufeature.h
index 7ad8c9464297..621e05103633 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -106,6 +106,7 @@
 #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
 #define X86_FEATURE_EAGER_FPU  ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
 #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in
S3 state */
+#define X86_FEATURE_MCRECOVERY ( 3*32+31) /* cpu has recoverable
machine checks */

 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3       ( 4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a006f4cd792b..b8980d767240 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1694,6 +1694,14 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
                return;
        }

+       /*
+        * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
+        * whether this processor will actually generate recoverable
+        * machine checks. Check to see if this is an E7 model Xeon.
+        */
+       if (mca_cfg.ser && !strncmp(c->x86_model_id, "Intel(R) Xeon(R)
CPU E7-", 24))
+               set_cpu_cap(c, X86_FEATURE_MCRECOVERY);
+
        if (mce_gen_pool_init()) {
                mca_cfg.disabled = true;
                pr_emerg("Couldn't allocate MCE records pool!\n");

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-30  0:35                           ` Tony Luck
@ 2016-01-30 10:28                             ` Borislav Petkov
  2016-02-01 23:10                               ` Tony Luck
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2016-01-30 10:28 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Dan Williams, Andy Lutomirski, linux-nvdimm,
	linux-kernel, Andrew Morton, Robert, linux-mm, X86 ML

On Fri, Jan 29, 2016 at 04:35:35PM -0800, Tony Luck wrote:
> On Wed, Jan 13, 2016 at 8:39 PM, Borislav Petkov <bp@alien8.de> wrote:
> > On Wed, Jan 13, 2016 at 03:22:58PM -0800, Tony Luck wrote:
> >> Are there some examples of synthetic CPUID bits?
> >
> > X86_FEATURE_ALWAYS is one. The others got renamed into X86_BUG_* ones,
> > the remaining mechanism is the same, though.
> 
> So something like this [gmail will line wrap, but should still be legible]
> 
> Then Dan will be able to use:
> 
>       if (cpu_has(c, X86_FEATURE_MCRECOVERY))
> 
> to decide whether to use the (slightly slower, but recovery capable)
> __mcsafe_copy()
> or just pick the fastest memcpy() instead.

The most optimal way of alternatively calling two functions would be
something like this, IMO:

alternative_call(memcpy, __mcsafe_copy, X86_FEATURE_MCRECOVERY,
		 ASM_OUTPUT2("=a" (mcsafe_ret.trapnr), "=d" (mcsafe_ret.remain)),
		 "D" (dst), "S" (src), "d" (len));

I hope I've not messed up the calling convention but you want the inputs
in %rdi, %rsi, %rdx and the outputs in %rax, %rdx, respectively. Just
check the asm gcc generates and do not trust me :)

The other thing you probably would need to do is create our own
__memcpy() which returns struct mcsafe_ret so that the signatures of
both functions match.

Yeah, it is a bit of jumping through hoops but this way we do a CALL
<func_ptr> directly in asm, without any JMPs or NOPs padding the other
alternatives methods add.

But if you don't care about a small JMP and that is not a hot path, you
could do the simpler:

	if (static_cpu_has(X86_FEATURE_MCRECOVERY))
		return __mcsafe_copy(...);

	return memcpy();

which adds a JMP or a 5-byte NOP depending on the X86_FEATURE_MCRECOVERY
setting.

> diff --git a/arch/x86/include/asm/cpufeature.h
> b/arch/x86/include/asm/cpufeature.h
> index 7ad8c9464297..621e05103633 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -106,6 +106,7 @@
>  #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
>  #define X86_FEATURE_EAGER_FPU  ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
>  #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in
> S3 state */
> +#define X86_FEATURE_MCRECOVERY ( 3*32+31) /* cpu has recoverable

Why not write it out?

	X86_FEATURE_MCE_RECOVERY

> machine checks */
> 
>  /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
>  #define X86_FEATURE_XMM3       ( 4*32+ 0) /* "pni" SSE-3 */

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-30 10:28                             ` Borislav Petkov
@ 2016-02-01 23:10                               ` Tony Luck
  2016-02-01 23:16                                 ` Dan Williams
  0 siblings, 1 reply; 36+ messages in thread
From: Tony Luck @ 2016-02-01 23:10 UTC (permalink / raw)
  To: Dan Williams
  Cc: Ingo Molnar, Borislav Petkov, Andy Lutomirski, linux-nvdimm,
	linux-kernel, Andrew Morton, Robert, linux-mm, X86 ML

> The most optimal way of alternatively calling two functions would be
> something like this, IMO:
>
> alternative_call(memcpy, __mcsafe_copy, X86_FEATURE_MCRECOVERY,
>                  ASM_OUTPUT2("=a" (mcsafe_ret.trapnr), "=d" (mcsafe_ret.remain)),
>                  "D" (dst), "S" (src), "d" (len));
>
> I hope I've not messed up the calling convention but you want the inputs
> in %rdi, %rsi, %rdx and the outputs in %rax, %rdx, respectively. Just
> check the asm gcc generates and do not trust me :)
>
> The other thing you probably would need to do is create our own
> __memcpy() which returns struct mcsafe_ret so that the signatures of
> both functions match.
>
> Yeah, it is a bit of jumping through hoops but this way we do a CALL
> <func_ptr> directly in asm, without any JMPs or NOPs padding the other
> alternatives methods add.
>
> But if you don't care about a small JMP and that is not a hot path, you
> could do the simpler:
>
>         if (static_cpu_has(X86_FEATURE_MCRECOVERY))
>                 return __mcsafe_copy(...);
>
>         return memcpy();
>
> which adds a JMP or a 5-byte NOP depending on the X86_FEATURE_MCRECOVERY
> setting.

Dan,

What do you want the API to look like at the point you make a call
in the libnvdimm code?  Something like:

        r = nvcopy(dst, src, len);

where the innards of nvcopy() does the check for X86_FEATURE_MCE_RECOVERY?

What is useful to you in the return value? The low level __mcsafe_copy() returns
both a remainder and a trap number. But in your case I don't think you
need the trap
number (if the remaining count is not zero, then there must have been a #MC; #PF
isn't an option for you, right?)

-Tony

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v8 3/3] x86, mce: Add __mcsafe_copy()
  2016-02-01 23:10                               ` Tony Luck
@ 2016-02-01 23:16                                 ` Dan Williams
  0 siblings, 0 replies; 36+ messages in thread
From: Dan Williams @ 2016-02-01 23:16 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Borislav Petkov, Andy Lutomirski, linux-nvdimm,
	linux-kernel, Andrew Morton, Robert, linux-mm, X86 ML

On Mon, Feb 1, 2016 at 3:10 PM, Tony Luck <tony.luck@gmail.com> wrote:
>> The most optimal way of alternatively calling two functions would be
>> something like this, IMO:
>>
>> alternative_call(memcpy, __mcsafe_copy, X86_FEATURE_MCRECOVERY,
>>                  ASM_OUTPUT2("=a" (mcsafe_ret.trapnr), "=d" (mcsafe_ret.remain)),
>>                  "D" (dst), "S" (src), "d" (len));
>>
>> I hope I've not messed up the calling convention but you want the inputs
>> in %rdi, %rsi, %rdx and the outputs in %rax, %rdx, respectively. Just
>> check the asm gcc generates and do not trust me :)
>>
>> The other thing you probably would need to do is create our own
>> __memcpy() which returns struct mcsafe_ret so that the signatures of
>> both functions match.
>>
>> Yeah, it is a bit of jumping through hoops but this way we do a CALL
>> <func_ptr> directly in asm, without any JMPs or NOPs padding the other
>> alternatives methods add.
>>
>> But if you don't care about a small JMP and that is not a hot path, you
>> could do the simpler:
>>
>>         if (static_cpu_has(X86_FEATURE_MCRECOVERY))
>>                 return __mcsafe_copy(...);
>>
>>         return memcpy();
>>
>> which adds a JMP or a 5-byte NOP depending on the X86_FEATURE_MCRECOVERY
>> setting.
>
> Dan,
>
> What do you want the API to look like at the point you make a call
> in the libnvdimm code?  Something like:
>
>         r = nvcopy(dst, src, len);
>
> where the innards of nvcopy() does the check for X86_FEATURE_MCE_RECOVERY?
>
> What is useful to you in the return value? The low level __mcsafe_copy() returns
> both a remainder and a trap number. But in your case I don't think you
> need the trap
> number (if the remaining count is not zero, then there must have been a #MC; #PF
> isn't an option for you, right?)

Right, we don't need a trap number, just an error.  This is the v1
attempt at integrating mcsafe_copy:

https://lists.01.org/pipermail/linux-nvdimm/2016-January/003869.html

I think the only change needed is to use
static_cpu_has(X86_FEATURE_MCRECOVERY) like so:

+static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src,
+		size_t n)
+{
+	if (static_cpu_has(X86_FEATURE_MCRECOVERY)) {
+		struct mcsafe_ret ret;
+
+		ret = __mcsafe_copy(dst, (void __force *) src, n);
+		if (ret.remain)
+			return -EIO;
+		return 0;
+	}
+	memcpy(dst, (void __force *) src, n);
+	return 0;
+}

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2016-02-01 23:16 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-09  0:04 [PATCH v8 0/3] Machine check recovery when kernel accesses poison Tony Luck
2015-12-31 19:40 ` [PATCH v8 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
2016-01-08 20:49 ` [PATCH v8 1/3] x86: Expand exception table to allow new handling options Tony Luck
2016-01-09  1:52   ` Andy Lutomirski
2016-01-09  3:39     ` Brian Gerst
2016-01-09  4:31       ` Brian Gerst
2016-01-09  6:36         ` Andy Lutomirski
2016-01-11 23:09           ` Brian Gerst
2016-01-11 23:22             ` Andy Lutomirski
2016-01-11 23:48             ` Luck, Tony
2016-01-09 17:45     ` Tony Luck
2016-01-09 18:00       ` Andy Lutomirski
2016-01-09 19:51         ` Tony Luck
2016-01-09 22:32           ` Andy Lutomirski
2016-01-10  1:15             ` Tony Luck
2016-01-11  0:25     ` Luck, Tony
2016-01-08 21:18 ` [PATCH v8 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
2016-01-09  1:49   ` Andy Lutomirski
2016-01-09 17:48     ` Tony Luck
2016-01-09 17:57       ` Andy Lutomirski
2016-01-09 19:39         ` Tony Luck
2016-01-09 22:15           ` Dan Williams
2016-01-09 22:33             ` Andy Lutomirski
2016-01-10  0:23               ` Dan Williams
2016-01-10  1:40                 ` Tony Luck
2016-01-10 11:26                   ` Borislav Petkov
2016-01-11 10:44                     ` Ingo Molnar
2016-01-13 23:22                       ` Tony Luck
2016-01-14  4:39                         ` Borislav Petkov
2016-01-30  0:35                           ` Tony Luck
2016-01-30 10:28                             ` Borislav Petkov
2016-02-01 23:10                               ` Tony Luck
2016-02-01 23:16                                 ` Dan Williams
2016-01-12  0:26     ` Luck, Tony
2016-01-12  0:30       ` Andy Lutomirski
2016-01-12  0:37       ` Andy Lutomirski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).