* [PATCH v10 2/4] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries
  2016-02-04 20:36 ` Tony Luck
@ 2015-12-31 19:40   ` Tony Luck
  -1 siblings, 0 replies; 34+ messages in thread
From: Tony Luck @ 2015-12-31 19:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

Extend the severity checking code with a new context, IN_KERNEL_RECOV,
used to indicate that the machine check was triggered by kernel code
tagged with _ASM_EXTABLE_FAULT(), so that the ex_handler_fault() handler
will provide the fixup code with the trap number.

Major rework of the tail code in do_machine_check() to make all this
readable/maintainable. One functional change: tolerant=3 no longer
stops recovery actions; it now only skips sending SIGBUS to the
current process.
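
As an illustration (not part of this patch): a minimal sketch of how a
kernel-side load could be tagged so that this new recovery path applies.
The wrapper function and label choices below are hypothetical; only
_ASM_EXTABLE_FAULT() and the "trap number arrives in %eax at the fixup"
convention come from this series, and IN_KERNEL_RECOV additionally
requires the hardware to report a restartable state (RIPV|EIPV).

#include <linux/types.h>
#include <asm/asm.h>

/*
 * Hypothetical helper: read one u64 that may sit in poisoned memory.
 * A machine check (or page fault) on the load jumps to the fixup,
 * where ex_handler_fault() has already placed the trap number in %eax.
 * *dst is only meaningful when 0 is returned.
 */
static int read_quad_mcsafe(u64 *dst, const u64 *src)
{
	int trapnr = 0;

	asm volatile(
		"1:	movq (%[src]), %[val]\n"
		"2:\n"
		".section .fixup,\"ax\"\n"
		"3:	movl %%eax, %[trap]\n"	/* trap number from ex_handler_fault() */
		"	jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE_FAULT(1b, 3b)
		: [val] "=r" (*dst), [trap] "+r" (trapnr)
		: [src] "r" (src)
		: "rax", "memory");

	return trapnr;	/* 0 on success, else X86_TRAP_MC or X86_TRAP_PF */
}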

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 22 +++++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          | 70 ++++++++++++++++---------------
 2 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 9c682c222071..5119766d9889 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/debugfs.h>
 #include <asm/mce.h>
+#include <asm/uaccess.h>
 
 #include "mce-internal.h"
 
@@ -29,7 +30,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +49,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL		.context = IN_KERNEL
 #define  USER		.context = IN_USER
+#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
 #define  SER		.ser = SER_REQUIRED
 #define  NOSER		.ser = NO_SER
 #define  EXCP		.excp = EXCP_CONTEXT
@@ -87,6 +89,10 @@ static struct severity {
 		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
 		),
 	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
 		DEFERRED, "Deferred error",
 		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
 		),
@@ -123,6 +129,11 @@ static struct severity {
 		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
 		),
 	MCESEV(
+		AR, "Action required: data load in error recoverable area of kernel",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		KERNEL_RECOV
+		),
+	MCESEV(
 		AR, "Action required: data load error in a user process",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
@@ -170,6 +181,9 @@ static struct severity {
 		)	/* always matches. keep at end */
 };
 
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -183,7 +197,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+	if ((m->cs & 3) == 3)
+		return IN_USER;
+	if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+		return IN_KERNEL_RECOV;
+	return IN_KERNEL;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a006f4cd792b..905f3070f412 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear)
 	}
 }
 
+static int do_memory_failure(struct mce *m)
+{
+	int flags = MF_ACTION_REQUIRED;
+	int ret;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+	if (!(m->mcgstatus & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+	if (ret)
+		pr_err("Memory error not recovered");
+	return ret;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
-	u64 recover_paddr = ~0ull;
-	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
 	/* If this CPU is offline, just bail out. */
@@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	}
 
 	/*
-	 * At insane "tolerant" levels we take no action. Otherwise
-	 * we only die if we have no other choice. For less serious
-	 * issues we try to recover, or limit damage to the current
-	 * process.
+	 * If tolerant is at an insane level we drop requests to kill
+	 * processes and continue even when there is no way out.
 	 */
-	if (cfg->tolerant < 3) {
-		if (no_way_out)
-			mce_panic("Fatal machine check on current CPU", &m, msg);
-		if (worst == MCE_AR_SEVERITY) {
-			recover_paddr = m.addr;
-			if (!(m.mcgstatus & MCG_STATUS_RIPV))
-				flags |= MF_MUST_KILL;
-		} else if (kill_it) {
-			force_sig(SIGBUS, current);
-		}
-	}
+	if (cfg->tolerant == 3)
+		kill_it = 0;
+	else if (no_way_out)
+		mce_panic("Fatal machine check on current CPU", &m, msg);
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 out:
 	sync_core();
 
-	if (recover_paddr == ~0ull)
-		goto done;
+	if (worst != MCE_AR_SEVERITY && !kill_it)
+		goto out_ist;
 
-	pr_err("Uncorrected hardware memory error in user-access at %llx",
-		 recover_paddr);
-	/*
-	 * We must call memory_failure() here even if the current process is
-	 * doomed. We still need to mark the page as poisoned and alert any
-	 * other users of the page.
-	 */
-	ist_begin_non_atomic(regs);
-	local_irq_enable();
-	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-		pr_err("Memory error not recovered");
-		force_sig(SIGBUS, current);
+	/* Fault was in user mode and we need to take some action */
+	if ((m.cs & 3) == 3) {
+		ist_begin_non_atomic(regs);
+		local_irq_enable();
+
+		if (kill_it || do_memory_failure(&m))
+			force_sig(SIGBUS, current);
+		local_irq_disable();
+		ist_end_non_atomic();
+	} else {
+		if (!fixup_exception(regs, X86_TRAP_MC))
+			mce_panic("Failed kernel mode recovery", &m, NULL);
 	}
-	local_irq_disable();
-	ist_end_non_atomic();
-done:
+
+out_ist:
 	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
-- 
2.5.0

* [PATCH v10 1/4] x86: Expand exception table to allow new handling options
  2016-02-04 20:36 ` Tony Luck
@ 2016-01-08 20:49   ` Tony Luck
  -1 siblings, 0 replies; 34+ messages in thread
From: Tony Luck @ 2016-01-08 20:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

Huge amounts of help from Andy Lutomirski and Borislav Petkov to
produce this. Andy provided the inspiration to add classes to the
exception table with a clever bit-squeezing trick; Boris pointed
out how much cleaner it would all be if we just had a new field.

Linus Torvalds blessed the expansion with:
  I'd rather not be clever in order to save just a tiny amount of space
  in the exception table, which isn't really critical for anybody.

The third field is another relative function pointer, this one to a
handler that executes the actions.

We start out with three handlers:

1: Legacy - just jumps to the fixup IP
2: Fault - provide the trap number in %ax to the fixup code
3: Cleaned up legacy for the uaccess error hack
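
As an aside (not part of the patch): a handler is just a small function
with the ex_handler_t shape used below, so more can be added later. The
logging handler here is purely hypothetical; only the signature,
ex_fixup_addr() and the _ASM_EXTABLE_HANDLE() wiring come from this
patch, and such a function would live next to the existing handlers in
arch/x86/mm/extable.c.

/*
 * Hypothetical fourth handler: fix up the fault like the legacy case,
 * but also log where it happened. It would be attached to an
 * instruction with _ASM_EXTABLE_HANDLE(from, to, ex_handler_log).
 */
bool ex_handler_log(const struct exception_table_entry *fixup,
		    struct pt_regs *regs, int trapnr)
{
	pr_warn_once("exception fixup: trap %d at %pS\n",
		     trapnr, (void *)regs->ip);
	regs->ip = ex_fixup_addr(fixup);
	return true;
}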

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/x86/exception-tables.txt |  35 ++++++++++++
 arch/x86/include/asm/asm.h             |  40 +++++++------
 arch/x86/include/asm/uaccess.h         |  16 +++---
 arch/x86/kernel/kprobes/core.c         |   2 +-
 arch/x86/kernel/traps.c                |   6 +-
 arch/x86/mm/extable.c                  | 100 ++++++++++++++++++++++++---------
 arch/x86/mm/fault.c                    |   2 +-
 scripts/sortextable.c                  |  32 +++++++++++
 8 files changed, 176 insertions(+), 57 deletions(-)

diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt
index 32901aa36f0a..fed18187a8b8 100644
--- a/Documentation/x86/exception-tables.txt
+++ b/Documentation/x86/exception-tables.txt
@@ -290,3 +290,38 @@ Due to the way that the exception table is built and needs to be ordered,
 only use exceptions for code in the .text section.  Any other section
 will cause the exception table to not be sorted correctly, and the
 exceptions will fail.
+
+Things changed when 64-bit support was added to x86 Linux. Rather than
+double the size of the exception table by expanding the two entries
+from 32-bits to 64 bits, a clever trick was used to store addresses
+as relative offsets from the table itself. The assembly code changed
+from:
+	.long 1b,3b
+to:
+        .long (from) - .
+        .long (to) - .
+
+and the C-code that uses these values converts back to absolute addresses
+like this:
+
+	ex_insn_addr(const struct exception_table_entry *x)
+	{
+		return (unsigned long)&x->insn + x->insn;
+	}
+
+In v4.5 the exception table entry was given a new field "handler".
+This is also 32-bits wide and contains a third relative function
+pointer which points to one of:
+
+1) int ex_handler_default(const struct exception_table_entry *fixup)
+   This is legacy case that just jumps to the fixup code
+2) int ex_handler_fault(const struct exception_table_entry *fixup)
+   This case provides the fault number of the trap that occurred at
+   entry->insn. It is used to distinguish page faults from machine
+   check.
+3) int ex_handler_ext(const struct exception_table_entry *fixup)
+   This case is used for uaccess_err ... we need to set a flag
+   in the task structure. Before the handler functions existed this
+   case was handled by adding a large offset to the fixup to tag
+   it as special.
+More functions can easily be added.
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..f5063b6659eb 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -44,19 +44,22 @@
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)					\
+# define _ASM_EXTABLE_HANDLE(from, to, handler)			\
 	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
+	.balign 4 ;						\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
+	.long (handler) - . ;					\
 	.popsection
 
-# define _ASM_EXTABLE_EX(from,to)				\
-	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
-	.long (from) - . ;					\
-	.long (to) - . + 0x7ffffff0 ;				\
-	.popsection
+# define _ASM_EXTABLE(from, to)					\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)				\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)				\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
 # define _ASM_NOKPROBE(entry)					\
 	.pushsection "_kprobe_blacklist","aw" ;			\
@@ -89,19 +92,24 @@
 	.endm
 
 #else
-# define _ASM_EXTABLE(from,to)					\
+# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _ASM_EXTABLE_HANDLE(from, to, handler)			\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
+	" .balign 4\n"						\
 	" .long (" #from ") - .\n"				\
 	" .long (" #to ") - .\n"				\
+	" .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n"	\
 	" .popsection\n"
 
-# define _ASM_EXTABLE_EX(from,to)				\
-	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
-	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - . + 0x7ffffff0\n"			\
-	" .popsection\n"
+# define _ASM_EXTABLE(from, to)					\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)				\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)				\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a4a30e4b2d34..c0f27d7ea7ff 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 	likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
- * The exception table consists of pairs of addresses relative to the
- * exception table enty itself: the first is the address of an
- * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  */
 
 struct exception_table_entry {
-	int insn, fixup;
+	int insn, fixup, handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..0f05deeff5ce 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, trapnr))
 			return 1;
 
 		/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ade185a46b1d..211c11c7bba4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 	}
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
@@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	tsk = current;
 	if (!user_mode(regs)) {
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, X86_TRAP_GP))
 			return;
 
 		tsk->thread.error_code = error_code;
@@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	conditional_sti(regs);
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..9dd7e4b7fcde 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -3,6 +3,9 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
+typedef bool (*ex_handler_t)(const struct exception_table_entry *,
+			    struct pt_regs *, int);
+
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
 {
@@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x)
 {
 	return (unsigned long)&x->fixup + x->fixup;
 }
+static inline ex_handler_t
+ex_fixup_handler(const struct exception_table_entry *x)
+{
+	return (ex_handler_t)((unsigned long)&x->handler + x->handler);
+}
 
-int fixup_exception(struct pt_regs *regs)
+bool ex_handler_default(const struct exception_table_entry *fixup,
+		       struct pt_regs *regs, int trapnr)
 {
-	const struct exception_table_entry *fixup;
-	unsigned long new_ip;
+	regs->ip = ex_fixup_addr(fixup);
+	return true;
+}
+EXPORT_SYMBOL(ex_handler_default);
+
+bool ex_handler_fault(const struct exception_table_entry *fixup,
+		     struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fault);
+
+bool ex_handler_ext(const struct exception_table_entry *fixup,
+		   struct pt_regs *regs, int trapnr)
+{
+	/* Special hack for uaccess_err */
+	current_thread_info()->uaccess_err = 1;
+	regs->ip = ex_fixup_addr(fixup);
+	return true;
+}
+EXPORT_SYMBOL(ex_handler_ext);
+
+bool ex_has_fault_handler(unsigned long ip)
+{
+	const struct exception_table_entry *e;
+	ex_handler_t handler;
+
+	e = search_exception_tables(ip);
+	if (!e)
+		return false;
+	handler = ex_fixup_handler(e);
+
+	return handler == ex_handler_fault;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+	const struct exception_table_entry *e;
+	ex_handler_t handler;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs)
 	}
 #endif
 
-	fixup = search_exception_tables(regs->ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* Special hack for uaccess_err */
-			current_thread_info()->uaccess_err = 1;
-			new_ip -= 0x7ffffff0;
-		}
-		regs->ip = new_ip;
-		return 1;
-	}
+	e = search_exception_tables(regs->ip);
+	if (!e)
+		return 0;
 
-	return 0;
+	handler = ex_fixup_handler(e);
+	return handler(e, regs, trapnr);
 }
 
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
-	const struct exception_table_entry *fixup;
+	const struct exception_table_entry *e;
 	unsigned long new_ip;
+	ex_handler_t handler;
 
-	fixup = search_exception_tables(*ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
+	e = search_exception_tables(*ip);
+	if (!e)
+		return 0;
 
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* uaccess handling not supported during early boot */
-			return 0;
-		}
+	new_ip  = ex_fixup_addr(e);
+	handler = ex_fixup_handler(e);
 
-		*ip = new_ip;
-		return 1;
-	}
+	/* special handling not supported during early boot */
+	if (handler != ex_handler_default)
+		return 0;
 
-	return 0;
+	*ip = new_ip;
+	return 1;
 }
 
 /*
@@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup += i;
 		i += 4;
+		p->handler += i;
+		i += 4;
 	}
 
 	sort(start, finish - start, sizeof(struct exception_table_entry),
@@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup -= i;
 		i += 4;
+		p->handler -= i;
+		i += 4;
 	}
 }
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index eef44d9a3f77..495946c3f9dd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs, X86_TRAP_PF)) {
 		/*
 		 * Any interrupt that takes a fault gets the fixup. This makes
 		 * the below recursive fault logic only apply to a faults from
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index c2423d913b46..7b29fb14f870 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -209,6 +209,35 @@ static int compare_relative_table(const void *a, const void *b)
 	return 0;
 }
 
+static void x86_sort_relative_table(char *extab_image, int image_size)
+{
+	int i;
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) + i, loc);
+		w(r(loc + 1) + i + 4, loc + 1);
+		w(r(loc + 2) + i + 8, loc + 2);
+
+		i += sizeof(uint32_t) * 3;
+	}
+
+	qsort(extab_image, image_size / 12, 12, compare_relative_table);
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) - i, loc);
+		w(r(loc + 1) - (i + 4), loc + 1);
+		w(r(loc + 2) - (i + 8), loc + 2);
+
+		i += sizeof(uint32_t) * 3;
+	}
+}
+
 static void sort_relative_table(char *extab_image, int image_size)
 {
 	int i;
@@ -281,6 +310,9 @@ do_file(char const *const fname)
 		break;
 	case EM_386:
 	case EM_X86_64:
+		custom_sort = x86_sort_relative_table;
+		break;
+
 	case EM_S390:
 		custom_sort = sort_relative_table;
 		break;
-- 
2.5.0

* [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-02-04 20:36 ` Tony Luck
@ 2016-01-08 21:18   ` Tony Luck
  -1 siblings, 0 replies; 34+ messages in thread
From: Tony Luck @ 2016-01-08 21:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

Make use of the EXTABLE_FAULT exception table entries. This routine
returns a structure to indicate the result of the copy:

struct mcsafe_ret {
        u64 trapnr;
        u64 remain;
};

If the copy is successful, then both 'trapnr' and 'remain' are zero.

If we faulted during the copy, then 'trapnr' reports which type
of trap occurred (X86_TRAP_PF or X86_TRAP_MC) and 'remain' reports
how many bytes were not copied.

Note that this is probably the first of several copy functions.
We can make new ones for non-temporal cache handling etc.
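
Not part of the patch: a sketch of how a caller (say, a pmem read path)
might consume the return structure. The wrapper name and the error
mapping are invented, and the usual <asm/traps.h>/<asm/string_64.h>
includes are assumed; only struct mcsafe_ret, __mcsafe_copy() and the
trap numbers come from this series.

/*
 * Hypothetical caller: copy 'len' bytes out of persistent memory and
 * turn a machine check on the source into -EIO instead of a crash.
 * The __force cast is only because this version of the prototype
 * annotates the source as __user.
 */
static int copy_from_pmem_mcsafe(void *dst, const void *pmem_src, size_t len)
{
	struct mcsafe_ret ret;

	ret = __mcsafe_copy(dst, (__force const void __user *)pmem_src, len);
	if (ret.trapnr == 0)
		return 0;			/* all 'len' bytes copied */

	/* len - ret.remain bytes made it across before the fault */
	return ret.trapnr == X86_TRAP_MC ? -EIO : -EFAULT;
}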

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/string_64.h |   8 +++
 arch/x86/kernel/x8664_ksyms_64.c |   2 +
 arch/x86/lib/memcpy_64.S         | 134 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ff8b9a17dc4b..5b24039463a4 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,14 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+struct mcsafe_ret {
+	u64 trapnr;
+	u64 remain;
+};
+
+struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt);
+extern void __mcsafe_copy_end(void);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..fff245462a8c 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+EXPORT_SYMBOL_GPL(__mcsafe_copy);
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 16698bba87de..f576acad485e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -177,3 +177,137 @@ ENTRY(memcpy_orig)
 .Lend:
 	retq
 ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+/*
+ * __mcsafe_copy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(__mcsafe_copy)
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorq %rax, %rax
+	xorq %rdx, %rdx
+	/* copy successful. return 0 */
+	ret
+
+	.section .fixup,"ax"
+	/*
+	 * machine check handler loaded %rax with trap number
+	 * We just need to make sure %edx has the number of
+	 * bytes remaining
+	 */
+30:
+	add %ecx,%edx
+	ret
+31:
+	shl $6,%ecx
+	add %ecx,%edx
+	ret
+32:
+	shl $6,%ecx
+	lea -8(%ecx,%edx),%edx
+	ret
+33:
+	shl $6,%ecx
+	lea -16(%ecx,%edx),%edx
+	ret
+34:
+	shl $6,%ecx
+	lea -24(%ecx,%edx),%edx
+	ret
+35:
+	shl $6,%ecx
+	lea -32(%ecx,%edx),%edx
+	ret
+36:
+	shl $6,%ecx
+	lea -40(%ecx,%edx),%edx
+	ret
+37:
+	shl $6,%ecx
+	lea -48(%ecx,%edx),%edx
+	ret
+38:
+	shl $6,%ecx
+	lea -56(%ecx,%edx),%edx
+	ret
+39:
+	lea (%rdx,%rcx,8),%rdx
+	ret
+40:
+	mov %ecx,%edx
+	ret
+	.previous
+
+	_ASM_EXTABLE_FAULT(0b,30b)
+	_ASM_EXTABLE_FAULT(1b,31b)
+	_ASM_EXTABLE_FAULT(2b,32b)
+	_ASM_EXTABLE_FAULT(3b,33b)
+	_ASM_EXTABLE_FAULT(4b,34b)
+	_ASM_EXTABLE_FAULT(9b,35b)
+	_ASM_EXTABLE_FAULT(10b,36b)
+	_ASM_EXTABLE_FAULT(11b,37b)
+	_ASM_EXTABLE_FAULT(12b,38b)
+	_ASM_EXTABLE_FAULT(18b,39b)
+	_ASM_EXTABLE_FAULT(21b,40b)
+#endif
-- 
2.5.0

* [PATCH v10 4/4] x86: Create a new synthetic cpu capability for machine check recovery
  2016-02-04 20:36 ` Tony Luck
@ 2016-01-30  0:00   ` Tony Luck
  -1 siblings, 0 replies; 34+ messages in thread
From: Tony Luck @ 2016-01-30  0:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

The Intel Software Developer Manual describes bit 24 in the MCG_CAP
MSR:
   MCG_SER_P (software error recovery support present) flag,
   bit 24 — Indicates (when set) that the processor supports
   software error recovery

But only some models with this capability bit set will actually
generate recoverable machine checks.

Check the model name and set a synthetic capability bit. Provide
a command line option to set this bit anyway in case the kernel
doesn't recognise the model name.
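
For illustration only (not in this patch): once the synthetic bit is
set, other code can gate its recovery paths on it with the ordinary
cpufeature helpers. The helper below is hypothetical; only
X86_FEATURE_MCE_RECOVERY and the mce=recovery override come from this
patch.

/*
 * Hypothetical consumer: fall back to a plain memcpy() (and accept a
 * panic on poisoned memory) when the CPU is not flagged as able to
 * recover from a machine check on a memory read.
 */
static bool mce_can_recover_reads(void)
{
	/* set for recognised E7 Xeons, or forced with mce=recovery */
	return boot_cpu_has(X86_FEATURE_MCE_RECOVERY);
}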

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/x86/x86_64/boot-options.txt |  4 ++++
 arch/x86/include/asm/cpufeature.h         |  1 +
 arch/x86/include/asm/mce.h                |  1 +
 arch/x86/kernel/cpu/mcheck/mce.c          | 11 +++++++++++
 4 files changed, 17 insertions(+)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 68ed3114c363..8423c04ae7b3 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -60,6 +60,10 @@ Machine check
 		threshold to 1. Enabling this may make memory predictive failure
 		analysis less effective if the bios sets thresholds for memory
 		errors since we will not see details for all errors.
+   mce=recovery
+		Tell the kernel that this system can generate recoverable
+		machine checks (useful when the kernel doesn't recognize
+		the cpuid x86_model_id[])
 
    nomce (for compatibility with i386): same as mce=off
 
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7ad8c9464297..06c6c2d2fea0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -106,6 +106,7 @@
 #define X86_FEATURE_APERFMPERF	( 3*32+28) /* APERFMPERF */
 #define X86_FEATURE_EAGER_FPU	( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
 #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	( 4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2ea4527e462f..18d2ba9c8e44 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -113,6 +113,7 @@ struct mca_config {
 	bool ignore_ce;
 	bool disabled;
 	bool ser;
+	bool recovery;
 	bool bios_cmci_threshold;
 	u8 banks;
 	s8 bootlog;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 905f3070f412..16a3d0e29f84 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1696,6 +1696,15 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 		return;
 	}
 
+	/*
+	 * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
+	 * whether this processor will actually generate recoverable
+	 * machine checks. Check to see if this is an E7 model Xeon.
+	 */
+	if (mca_cfg.recovery || (mca_cfg.ser &&
+		!strncmp(c->x86_model_id, "Intel(R) Xeon(R) CPU E7-", 24)))
+		set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
+
 	if (mce_gen_pool_init()) {
 		mca_cfg.disabled = true;
 		pr_emerg("Couldn't allocate MCE records pool!\n");
@@ -2030,6 +2039,8 @@ static int __init mcheck_enable(char *str)
 		cfg->bootlog = (str[0] == 'b');
 	else if (!strcmp(str, "bios_cmci_threshold"))
 		cfg->bios_cmci_threshold = true;
+	else if (!strcmp(str, "recovery"))
+		cfg->recovery = true;
 	else if (isdigit(str[0])) {
 		if (get_option(&str, &cfg->tolerant) == 2)
 			get_option(&str, &(cfg->monarch_timeout));
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v10 0/4] Machine check recovery when kernel accesses poison
@ 2016-02-04 20:36 ` Tony Luck
  0 siblings, 0 replies; 34+ messages in thread
From: Tony Luck @ 2016-02-04 20:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

This series is initially targeted at the folks doing filesystems
on top of NVDIMMs. They really want to be able to return -EIO
when there is a h/w error (just like spinning rust and SSDs do).

I plan to use the same infrastructure to write a machine check aware
"copy_from_user()" that will SIGBUS the calling application when a
syscall touches poison in user space (just like we do when the application
touches the poison itself).
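
Such a wrapper might end up looking something like this (just a sketch to
show the shape; the name and the SIGBUS plumbing are made up here for
illustration and are not part of this series):

static inline unsigned long
mcsafe_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	struct mcsafe_ret r = __mcsafe_copy(to, from, n);

	/* we consumed poison while reading user memory: SIGBUS the caller */
	if (r.trapnr == X86_TRAP_MC)
		force_sig(SIGBUS, current);

	/* like copy_from_user(): number of bytes not copied */
	return r.remain;
}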

With this series applied Dan Williams can write:

static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src, size_t n)
{
	if (static_cpu_has(X86_FEATURE_MCE_RECOVERY)) {
		struct mcsafe_ret ret;

		ret = __mcsafe_copy(dst, (void __force *) src, n);
		if (ret.remain)
			return -EIO;
		return 0;
	}
	memcpy(dst, (void __force *) src, n);
	return 0;
}
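
Since 'trapnr' also says whether we stopped because of #MC or an ordinary
#PF, a caller could map the two cases to different errnos, e.g.
(hypothetical helper, name invented here):

static inline int mcsafe_ret_to_errno(struct mcsafe_ret r)
{
	if (!r.remain)
		return 0;
	/* poisoned source vs. plain fault on the source address */
	return (r.trapnr == X86_TRAP_MC) ? -EIO : -EFAULT;
}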


I've dropped off the "reviewed-by" tags that I collected back prior to
adding the new field to the exception table. Please send new ones
if you can.

Changes
V9-V10
Andy:	Commit comment in part 2 is stale - refers to "EXTABLE_CLASS_FAULT"
Boris:	Part1 - Numerous spelling, grammar, etc. fixes
Boris:	Part2 - No longer need #include <linux/module.h> (in either file).

V8->V9
Boris: Create a synthetic cpu capability for machine check recovery.

Changes V7-V8
Boris:  Would be so much cleaner if we added a new field to the exception table
        instead of squeezing bits into the fixup field. New field added
Tony:   Documentation needs to be updated. Done

Changes V6-V7:
Boris:  Why add/subtract 0x20000000? Added better comment provided by Andy
Boris:  Churn. Part2 changes things only introduced in part1.
        Merged parts 1&2 into one patch.
Ingo:   Missing my sign off on part1. Added.

Changes V5-V6
Andy:   Provoked massive re-write by providing what is now part1 of this
        patch series. This frees up two bits in the exception table
        fixup field that can be used to tag exception table entries
        as different "classes". This means we don't need my separate
        exception table for machine checks. Also avoids duplicating
        fixup actions for #PF and #MC cases that were in version 5.
Andy:   Use C99 array initializers to tie the various class fixup
        functions back to the definitions of each class. Also give the
        functions meaningful names (not fixup_class0() etc.).
Boris:  Cleaned up my lousy assembly code removing many spurious 'l'
        modifiers on instructions.
Boris:  Provided some helper functions for the machine check severity
        calculation that make the code more readable.
Boris:  Have __mcsafe_copy() return a structure with the 'remaining bytes'
        in a separate field from the fault indicator. Boris had suggested
        Linux -EFAULT/-EINVAL ... but I thought it made more sense to return
        the exception number (X86_TRAP_MC, etc.)  This finally kills off
        BIT(63) which has been controversial throughout all the early versions
        of this patch series.

Changes V4-V5
Tony:   Extended __mcsafe_copy() to have fixup entries for both machine
        check and page fault.

Changes V3-V4:
Andy:   Simplify fixup_mcexception() by dropping used-once local variable
Andy:   "Reviewed-by" tag added to part1
Boris:  Moved new functions to memcpy_64.S and declaration to asm/string_64.h
Boris:  Changed name s/mcsafe_memcpy/__mcsafe_copy/ to make it clear that this
        is an internal function and that return value doesn't follow memcpy() semantics.
Boris:  "Reviewed-by" tag added to parts 1&2

Changes V2-V3:

Andy:   Don't hack "regs->ax = BIT(63) | addr;" in the machine check
        handler.  Now have better fixup code that computes the number
        of remaining bytes (just like page-fault fixup).
Andy:   #define for BIT(63). Done, plus couple of extra macros using it.
Boris:  Don't clutter up generic code (like mm/extable.c) with this.
        I moved everything under arch/x86 (the asm-generic change is
        a more generic #define).
Boris:  Dependencies for CONFIG_MCE_KERNEL_RECOVERY are too generic.
        I made it a real menu item with default "n". Dan Williams
        will use "select MCE_KERNEL_RECOVERY" from his persistent
        filesystem code.
Boris:  Simplify conditionals in mce.c by moving tolerant/kill_it
        checks earlier, with a skip to end if they aren't set.
Boris:  Miscellaneous grammar/punctuation. Fixed.
Boris:  Don't leak spurious __start_mcextable symbols into kernels
        that didn't configure MCE_KERNEL_RECOVERY. Done.
Tony:   New code doesn't belong in user_copy_64.S/uaccess*.h. Moved
        to new .S/.h files
Elliott:Caching behavior non-optimal. Could use movntdqa, vmovntdqa
        or vmovntdqa on source addresses. I didn't fix this yet. Think
        of the current mcsafe_memcpy() as the first of several functions.
        This one is useful for small copies (meta-data) where the overhead
        of saving SSE/AVX state isn't justified.

Changes V1->V2:

0-day:  Reported build errors and warnings on 32-bit systems. Fixed
0-day:  Reported bloat to tinyconfig. Fixed
Boris:  Suggestions to use extra macros to reduce code duplication in _ASM_*EXTABLE. Done
Boris:  Re-write "tolerant==3" check to reduce indentation level. See below.
Andy:   Check IP is valid before searching kernel exception tables. Done.
Andy:   Explain use of BIT(63) on return value from mcsafe_memcpy(). Done (added decode macros).
Andy:   Untangle mess of code in tail of do_machine_check() to make it
        clear what is going on (e.g. that we only enter the ist_begin_non_atomic()
        if we were called from user code, not from kernel!). Done.

Tony Luck (4):
  x86: Expand exception table to allow new handling options
  x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception
    table entries
  x86, mce: Add __mcsafe_copy()
  x86: Create a new synthetic cpu capability for machine check recovery

 Documentation/x86/exception-tables.txt    |  35 ++++++++
 Documentation/x86/x86_64/boot-options.txt |   4 +
 arch/x86/include/asm/asm.h                |  40 +++++----
 arch/x86/include/asm/cpufeature.h         |   1 +
 arch/x86/include/asm/mce.h                |   1 +
 arch/x86/include/asm/string_64.h          |   8 ++
 arch/x86/include/asm/uaccess.h            |  16 ++--
 arch/x86/kernel/cpu/mcheck/mce-severity.c |  22 ++++-
 arch/x86/kernel/cpu/mcheck/mce.c          |  81 ++++++++++--------
 arch/x86/kernel/kprobes/core.c            |   2 +-
 arch/x86/kernel/traps.c                   |   6 +-
 arch/x86/kernel/x8664_ksyms_64.c          |   2 +
 arch/x86/lib/memcpy_64.S                  | 134 ++++++++++++++++++++++++++++++
 arch/x86/mm/extable.c                     | 100 +++++++++++++++-------
 arch/x86/mm/fault.c                       |   2 +-
 scripts/sortextable.c                     |  32 +++++++
 16 files changed, 393 insertions(+), 93 deletions(-)

-- 
2.5.0

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-01-08 21:18   ` Tony Luck
@ 2016-02-07 16:49     ` Borislav Petkov
  -1 siblings, 0 replies; 34+ messages in thread
From: Borislav Petkov @ 2016-02-07 16:49 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Fri, Jan 08, 2016 at 01:18:03PM -0800, Tony Luck wrote:
> Make use of the EXTABLE_FAULT exception table entries. This routine
> returns a structure to indicate the result of the copy:
> 
> struct mcsafe_ret {
>         u64 trapnr;
>         u64 remain;
> };
> 
> If the copy is successful, then both 'trapnr' and 'remain' are zero.
> 
> If we faulted during the copy, then 'trapnr' will say which type
> of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
> bytes were not copied.
> 
> Note that this is probably the first of several copy functions.
> We can make new ones for non-temporal cache handling etc.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/string_64.h |   8 +++
>  arch/x86/kernel/x8664_ksyms_64.c |   2 +
>  arch/x86/lib/memcpy_64.S         | 134 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 144 insertions(+)

...

> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 16698bba87de..f576acad485e 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -177,3 +177,137 @@ ENTRY(memcpy_orig)
>  .Lend:
>  	retq
>  ENDPROC(memcpy_orig)
> +
> +#ifndef CONFIG_UML
> +/*
> + * __mcsafe_copy - memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + */
> +ENTRY(__mcsafe_copy)
> +	cmpl $8,%edx
> +	jb 20f		/* less then 8 bytes, go to byte copy loop */
> +
> +	/* check for bad alignment of source */
> +	movl %esi,%ecx

You can save yourself this MOV here in what is, I'm assuming, the
general likely case where @src is aligned and do:

        /* check for bad alignment of source */
        testl $7, %esi
        /* already aligned? */
        jz 102f

        movl %esi,%ecx
        subl $8,%ecx
        negl %ecx
        subl %ecx,%edx
0:      movb (%rsi),%al
        movb %al,(%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz 0b

> +	andl $7,%ecx
> +	jz 102f				/* already aligned */

Please move side-comments over the line they're referring to.

> +	subl $8,%ecx
> +	negl %ecx
> +	subl %ecx,%edx
> +0:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 0b
> +102:
> +	movl %edx,%ecx
> +	andl $63,%edx
> +	shrl $6,%ecx
> +	jz 17f

Please add a \n after the JMPs for better readability - those blocks are
dense as it is. They could use some comments too.

> +1:	movq (%rsi),%r8
> +2:	movq 1*8(%rsi),%r9
> +3:	movq 2*8(%rsi),%r10
> +4:	movq 3*8(%rsi),%r11
> +	mov %r8,(%rdi)
> +	mov %r9,1*8(%rdi)
> +	mov %r10,2*8(%rdi)
> +	mov %r11,3*8(%rdi)

You can say "movq" too here, for consistency.

> +9:	movq 4*8(%rsi),%r8
> +10:	movq 5*8(%rsi),%r9
> +11:	movq 6*8(%rsi),%r10
> +12:	movq 7*8(%rsi),%r11

Why aren't we pushing %r12-%r15 on the stack after the "jz 17f" above
and using them too and thus copying a whole cacheline in one go?

We would need to restore them when we're done with the cacheline-wise
shuffle, of course.

> +	mov %r8,4*8(%rdi)
> +	mov %r9,5*8(%rdi)
> +	mov %r10,6*8(%rdi)
> +	mov %r11,7*8(%rdi)
> +	leaq 64(%rsi),%rsi
> +	leaq 64(%rdi),%rdi
> +	decl %ecx
> +	jnz 1b

...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-01-08 21:18   ` Tony Luck
@ 2016-02-07 16:55     ` Borislav Petkov
  -1 siblings, 0 replies; 34+ messages in thread
From: Borislav Petkov @ 2016-02-07 16:55 UTC (permalink / raw)
  To: Tony Luck, Richard Weinberger
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Fri, Jan 08, 2016 at 01:18:03PM -0800, Tony Luck wrote:
> Make use of the EXTABLE_FAULT exception table entries. This routine
> returns a structure to indicate the result of the copy:
> 
> struct mcsafe_ret {
>         u64 trapnr;
>         u64 remain;
> };
> 
> If the copy is successful, then both 'trapnr' and 'remain' are zero.
> 
> If we faulted during the copy, then 'trapnr' will say which type
> of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
> bytes were not copied.
> 
> Note that this is probably the first of several copy functions.
> We can make new ones for non-temporal cache handling etc.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/string_64.h |   8 +++
>  arch/x86/kernel/x8664_ksyms_64.c |   2 +
>  arch/x86/lib/memcpy_64.S         | 134 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 144 insertions(+)
> 
> diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
> index ff8b9a17dc4b..5b24039463a4 100644
> --- a/arch/x86/include/asm/string_64.h
> +++ b/arch/x86/include/asm/string_64.h
> @@ -78,6 +78,14 @@ int strcmp(const char *cs, const char *ct);
>  #define memset(s, c, n) __memset(s, c, n)
>  #endif
>  
> +struct mcsafe_ret {
> +	u64 trapnr;
> +	u64 remain;
> +};
> +
> +struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt);
> +extern void __mcsafe_copy_end(void);
> +
>  #endif /* __KERNEL__ */
>  
>  #endif /* _ASM_X86_STRING_64_H */
> diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
> index a0695be19864..fff245462a8c 100644
> --- a/arch/x86/kernel/x8664_ksyms_64.c
> +++ b/arch/x86/kernel/x8664_ksyms_64.c
> @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
>  EXPORT_SYMBOL(_copy_from_user);
>  EXPORT_SYMBOL(_copy_to_user);
>  
> +EXPORT_SYMBOL_GPL(__mcsafe_copy);
> +
>  EXPORT_SYMBOL(copy_page);
>  EXPORT_SYMBOL(clear_page);
>  
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 16698bba87de..f576acad485e 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -177,3 +177,137 @@ ENTRY(memcpy_orig)
>  .Lend:
>  	retq
>  ENDPROC(memcpy_orig)
> +
> +#ifndef CONFIG_UML

So this is because we get a link failure on UML:

arch/x86/um/built-in.o:(__ex_table+0x8): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x14): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x20): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x2c): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x38): undefined reference to `ex_handler_fault'
arch/x86/um/built-in.o:(__ex_table+0x44): more undefined references to `ex_handler_fault' follow
collect2: error: ld returned 1 exit status
make: *** [vmlinux] Error 1

due to those

> +     _ASM_EXTABLE_FAULT(0b,30b)
> +     _ASM_EXTABLE_FAULT(1b,31b)
> +     _ASM_EXTABLE_FAULT(2b,32b)
> +     _ASM_EXTABLE_FAULT(3b,33b)
> +     _ASM_EXTABLE_FAULT(4b,34b)

things below and that's because ex_handler_fault() is defined in
arch/x86/mm/extable.c and UML doesn't include that file in the build. It
takes kernel/extable.c and lib/extable.c only.

Richi, what's the usual way to address that in UML? I.e., make an
x86-only symbol visible to the UML build too? Define a dummy one, just
so that it builds?

> + * __mcsafe_copy - memory copy with machine check exception handling
> + * Note that we only catch machine checks when reading the source addresses.
> + * Writes to target are posted and don't generate machine checks.
> + */
> +ENTRY(__mcsafe_copy)
> +	cmpl $8,%edx
> +	jb 20f		/* less then 8 bytes, go to byte copy loop */
> +
> +	/* check for bad alignment of source */
> +	movl %esi,%ecx
> +	andl $7,%ecx
> +	jz 102f				/* already aligned */
> +	subl $8,%ecx
> +	negl %ecx
> +	subl %ecx,%edx
> +0:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 0b
> +102:
> +	movl %edx,%ecx
> +	andl $63,%edx
> +	shrl $6,%ecx
> +	jz 17f
> +1:	movq (%rsi),%r8
> +2:	movq 1*8(%rsi),%r9
> +3:	movq 2*8(%rsi),%r10
> +4:	movq 3*8(%rsi),%r11
> +	mov %r8,(%rdi)
> +	mov %r9,1*8(%rdi)
> +	mov %r10,2*8(%rdi)
> +	mov %r11,3*8(%rdi)
> +9:	movq 4*8(%rsi),%r8
> +10:	movq 5*8(%rsi),%r9
> +11:	movq 6*8(%rsi),%r10
> +12:	movq 7*8(%rsi),%r11
> +	mov %r8,4*8(%rdi)
> +	mov %r9,5*8(%rdi)
> +	mov %r10,6*8(%rdi)
> +	mov %r11,7*8(%rdi)
> +	leaq 64(%rsi),%rsi
> +	leaq 64(%rdi),%rdi
> +	decl %ecx
> +	jnz 1b
> +17:	movl %edx,%ecx
> +	andl $7,%edx
> +	shrl $3,%ecx
> +	jz 20f
> +18:	movq (%rsi),%r8
> +	mov %r8,(%rdi)
> +	leaq 8(%rsi),%rsi
> +	leaq 8(%rdi),%rdi
> +	decl %ecx
> +	jnz 18b
> +20:	andl %edx,%edx
> +	jz 23f
> +	movl %edx,%ecx
> +21:	movb (%rsi),%al
> +	movb %al,(%rdi)
> +	incq %rsi
> +	incq %rdi
> +	decl %ecx
> +	jnz 21b
> +23:	xorq %rax, %rax
> +	xorq %rdx, %rdx
> +	/* copy successful. return 0 */
> +	ret
> +
> +	.section .fixup,"ax"
> +	/*
> +	 * machine check handler loaded %rax with trap number
> +	 * We just need to make sure %edx has the number of
> +	 * bytes remaining
> +	 */
> +30:
> +	add %ecx,%edx
> +	ret
> +31:
> +	shl $6,%ecx
> +	add %ecx,%edx
> +	ret
> +32:
> +	shl $6,%ecx
> +	lea -8(%ecx,%edx),%edx
> +	ret
> +33:
> +	shl $6,%ecx
> +	lea -16(%ecx,%edx),%edx
> +	ret
> +34:
> +	shl $6,%ecx
> +	lea -24(%ecx,%edx),%edx
> +	ret
> +35:
> +	shl $6,%ecx
> +	lea -32(%ecx,%edx),%edx
> +	ret
> +36:
> +	shl $6,%ecx
> +	lea -40(%ecx,%edx),%edx
> +	ret
> +37:
> +	shl $6,%ecx
> +	lea -48(%ecx,%edx),%edx
> +	ret
> +38:
> +	shl $6,%ecx
> +	lea -56(%ecx,%edx),%edx
> +	ret
> +39:
> +	lea (%rdx,%rcx,8),%rdx
> +	ret
> +40:
> +	mov %ecx,%edx
> +	ret
> +	.previous
> +
> +	_ASM_EXTABLE_FAULT(0b,30b)
> +	_ASM_EXTABLE_FAULT(1b,31b)
> +	_ASM_EXTABLE_FAULT(2b,32b)
> +	_ASM_EXTABLE_FAULT(3b,33b)
> +	_ASM_EXTABLE_FAULT(4b,34b)
> +	_ASM_EXTABLE_FAULT(9b,35b)
> +	_ASM_EXTABLE_FAULT(10b,36b)
> +	_ASM_EXTABLE_FAULT(11b,37b)
> +	_ASM_EXTABLE_FAULT(12b,38b)
> +	_ASM_EXTABLE_FAULT(18b,39b)
> +	_ASM_EXTABLE_FAULT(21b,40b)
> +#endif
> -- 
> 2.5.0

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 4/4] x86: Create a new synthetic cpu capability for machine check recovery
  2016-01-30  0:00   ` Tony Luck
@ 2016-02-07 17:10     ` Borislav Petkov
  -1 siblings, 0 replies; 34+ messages in thread
From: Borislav Petkov @ 2016-02-07 17:10 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Fri, Jan 29, 2016 at 04:00:19PM -0800, Tony Luck wrote:
> The Intel Software Developer Manual describes bit 24 in the MCG_CAP
> MSR:
>    MCG_SER_P (software error recovery support present) flag,
>    bit 24 — Indicates (when set) that the processor supports
>    software error recovery
> But only some models with this capability bit set will actually
> generate recoverable machine checks.
> 
> Check the model name and set a synthetic capability bit. Provide
> a command line option to set this bit anyway in case the kernel
> doesn't recognise the model name.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  Documentation/x86/x86_64/boot-options.txt |  4 ++++
>  arch/x86/include/asm/cpufeature.h         |  1 +
>  arch/x86/include/asm/mce.h                |  1 +
>  arch/x86/kernel/cpu/mcheck/mce.c          | 11 +++++++++++
>  4 files changed, 17 insertions(+)
> 
> diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
> index 68ed3114c363..8423c04ae7b3 100644
> --- a/Documentation/x86/x86_64/boot-options.txt
> +++ b/Documentation/x86/x86_64/boot-options.txt
> @@ -60,6 +60,10 @@ Machine check
>  		threshold to 1. Enabling this may make memory predictive failure
>  		analysis less effective if the bios sets thresholds for memory
>  		errors since we will not see details for all errors.
> +   mce=recovery
> +		Tell the kernel that this system can generate recoverable
> +		machine checks (useful when the kernel doesn't recognize
> +		the cpuid x86_model_id[])

I'd say		"Force-enable generation of recoverable MCEs."

and not mention implementation details in the description text.

>     nomce (for compatibility with i386): same as mce=off
>  
> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> index 7ad8c9464297..06c6c2d2fea0 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -106,6 +106,7 @@
>  #define X86_FEATURE_APERFMPERF	( 3*32+28) /* APERFMPERF */
>  #define X86_FEATURE_EAGER_FPU	( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
>  #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
> +#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
>  
>  /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
>  #define X86_FEATURE_XMM3	( 4*32+ 0) /* "pni" SSE-3 */
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index 2ea4527e462f..18d2ba9c8e44 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -113,6 +113,7 @@ struct mca_config {
>  	bool ignore_ce;
>  	bool disabled;
>  	bool ser;
> +	bool recovery;
>  	bool bios_cmci_threshold;
>  	u8 banks;
>  	s8 bootlog;
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 905f3070f412..16a3d0e29f84 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -1696,6 +1696,15 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
>  		return;
>  	}
>  
> +	/*
> +	 * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
> +	 * whether this processor will actually generate recoverable
> +	 * machine checks. Check to see if this is an E7 model Xeon.
> +	 */
> +	if (mca_cfg.recovery || (mca_cfg.ser &&
> +		!strncmp(c->x86_model_id, "Intel(R) Xeon(R) CPU E7-", 24)))

Eeww, a model string check :-(

Lemme guess: those E7s can't be represented by a range of
model/steppings, can they?

Similar to AMD_MODEL_RANGE() thing in cpu/amd.c, for example.

In any case, that chunk belongs in the Intel part of
__mcheck_cpu_apply_quirks().
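
I.e., roughly something like this inside the existing Intel branch of that
function (untested sketch, same condition as in your patch, just relocated):

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * MCG_SER_P alone doesn't guarantee recoverable machine
		 * checks; so far only the E7 parts deliver them.
		 */
		if (mca_cfg.recovery || (mca_cfg.ser &&
		    !strncmp(c->x86_model_id, "Intel(R) Xeon(R) CPU E7-", 24)))
			set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
	}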

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-02-07 16:55     ` Borislav Petkov
@ 2016-02-07 20:54       ` Richard Weinberger
  -1 siblings, 0 replies; 34+ messages in thread
From: Richard Weinberger @ 2016-02-07 20:54 UTC (permalink / raw)
  To: Borislav Petkov, Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

Am 07.02.2016 um 17:55 schrieb Borislav Petkov:
> due to those
> 
>> +     _ASM_EXTABLE_FAULT(0b,30b)
>> +     _ASM_EXTABLE_FAULT(1b,31b)
>> +     _ASM_EXTABLE_FAULT(2b,32b)
>> +     _ASM_EXTABLE_FAULT(3b,33b)
>> +     _ASM_EXTABLE_FAULT(4b,34b)
> 
> things below and that's because ex_handler_fault() is defined in
> arch/x86/mm/extable.c and UML doesn't include that file in the build. It
> takes kernel/extable.c and lib/extable.c only.
> 
> Richi, what's the usual way to address that in UML? I.e., make an
> x86-only symbol visible to the UML build too? Define a dummy one, just
> so that it builds?

As discussed on IRC with Boris, UML offers only minimal extable support.
To get rid of the said #ifndef, UML would have to provide its own
extable.c (mostly copy&paste from arch/x86) and an advanced
struct exception_table_entry which includes the trap number.
This also implies that UML can no longer use uaccess.h from asm-generic
or has to add a new ifdef into uaccess.h to whiteout the minimal
struct exception_table_entry from there.

So, I'd vote to keep the #ifndef CONFIG_UML in memcpy_64.S.
As soon as you need another #ifndef, please ping me and I'll happily
bite the bullet and implement the advanced extable stuff for UML.
Deal?

Thanks,
//richard

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-02-07 16:49     ` Borislav Petkov
@ 2016-02-09 23:15       ` Luck, Tony
  -1 siblings, 0 replies; 34+ messages in thread
From: Luck, Tony @ 2016-02-09 23:15 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

> You can save yourself this MOV here in what is, I'm assuming, the
> general likely case where @src is aligned and do:
> 
>         /* check for bad alignment of source */
>         testl $7, %esi
>         /* already aligned? */
>         jz 102f
> 
>         movl %esi,%ecx
>         subl $8,%ecx
>         negl %ecx
>         subl %ecx,%edx
> 0:      movb (%rsi),%al
>         movb %al,(%rdi)
>         incq %rsi
>         incq %rdi
>         decl %ecx
>         jnz 0b

The "testl $7, %esi" just checks the low three bits ... it doesn't
change %esi.  But the code from the "subl $8" on down assumes that
%ecx is a number in [1..7] as the count of bytes to copy until we
achieve alignment.

So your "movl %esi,%ecx" needs to be something that just copies the
low three bits and zeroes the high part of %ecx.  Is there a cute
way to do that in x86 assembler?

> Why aren't we pushing %r12-%r15 on the stack after the "jz 17f" above
> and using them too and thus copying a whole cacheline in one go?
> 
> We would need to restore them when we're done with the cacheline-wise
> shuffle, of course.

I copied that loop from arch/x86/lib/copy_user_64.S:__copy_user_nocache().
I guess the answer depends on whether you generally copy enough
cache lines to save enough time to cover the cost of saving and
restoring those registers.

-Tony

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 4/4] x86: Create a new synthetic cpu capability for machine check recovery
  2016-02-07 17:10     ` Borislav Petkov
@ 2016-02-09 23:38       ` Luck, Tony
  -1 siblings, 0 replies; 34+ messages in thread
From: Luck, Tony @ 2016-02-09 23:38 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

> > +	if (mca_cfg.recovery || (mca_cfg.ser &&
> > +		!strncmp(c->x86_model_id, "Intel(R) Xeon(R) CPU E7-", 24)))
> 
> Eeww, a model string check :-(
> 
> Lemme guess: those E7s can't be represented by a range of
> model/steppings, can they?

We use the same model number for E5 and E7 series. E.g. 63 for Haswell.
The model_id string seems to be the only way to tell ahead of time
whether you will get a recoverable machine check or die when you
touch uncorrected memory.

-Tony

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-02-09 23:15       ` Luck, Tony
@ 2016-02-10 10:58         ` Borislav Petkov
  -1 siblings, 0 replies; 34+ messages in thread
From: Borislav Petkov @ 2016-02-10 10:58 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Tue, Feb 09, 2016 at 03:15:57PM -0800, Luck, Tony wrote:
> > You can save yourself this MOV here in what is, I'm assuming, the
> > general likely case where @src is aligned and do:
> > 
> >         /* check for bad alignment of source */
> >         testl $7, %esi
> >         /* already aligned? */
> >         jz 102f
> > 
> >         movl %esi,%ecx
> >         subl $8,%ecx
> >         negl %ecx
> >         subl %ecx,%edx
> > 0:      movb (%rsi),%al
> >         movb %al,(%rdi)
> >         incq %rsi
> >         incq %rdi
> >         decl %ecx
> >         jnz 0b
> 
> The "testl $7, %esi" just checks the low three bits ... it doesn't
> change %esi.  But the code from the "subl $8" on down assumes that
> %ecx is a number in [1..7] as the count of bytes to copy until we
> achieve alignment.

Grr, sorry about that, I actually forgot to copy-paste the AND:

        /* check for bad alignment of source */
        testl $7, %esi
        jz 102f                         /* already aligned */

        movl %esi,%ecx
        andl $7,%ecx
        subl $8,%ecx
        negl %ecx
        subl %ecx,%edx
0:      movb (%rsi),%al
        movb %al,(%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz 0b

I'm basically proposing to move the unlikely case out of line and
optimize the likely one.
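
FWIW, written out in C the prologue amounts to just this (a quick
sketch - helper name made up, memcpy() standing in for the aligned
cache-line loop):

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>

        static void copy_align_head(void *dst, const void *src, size_t len)
        {
                unsigned char *d = dst;
                const unsigned char *s = src;
                size_t mis = (uintptr_t)s & 7;          /* testl $7, %esi */

                if (mis && len) {
                        size_t head = 8 - mis;          /* subl $8 + negl: 1..7 */

                        if (head > len)
                                head = len;
                        len -= head;                    /* subl %ecx, %edx */
                        while (head--)                  /* the movb loop */
                                *d++ = *s++;
                }

                memcpy(d, s, len);                      /* the aligned loop */
        }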

> So your "movl %esi,%ecx" needs to be somthing that just copies the
> low three bits and zeroes the high part of %ecx.  Is there a cute
> way to do that in x86 assembler?

We could play some funky games with byte-sized moves, but those are
generally slower anyway, so doing the default operand size thing
should be OK.

> I copied that loop from arch/x86/lib/copy_user_64.S:__copy_user_nocache().
> I guess the answer depends on whether you generally copy enough
> cache lines to save enough time to cover the cost of saving and
> restoring those registers.

Well, that function will run on modern hardware with a stack engine,
so I'd assume those 4 pushes and pops would be paid for by the
increased register count available for the data shuffling.

But one could take that function out, do some microbenchmarking with
different sizes, once with the current version and once with the
pushes and pops of %r12-%r15, and see where the break-even point is.
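
Untested sketch of such a harness - plain memcpy() stands in for the
extracted routine here, so you'd build it once per copy variant:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <time.h>

        #define SZ      4096
        #define RUNS    10000

        static int cmp_ull(const void *a, const void *b)
        {
                unsigned long long x = *(const unsigned long long *)a;
                unsigned long long y = *(const unsigned long long *)b;

                return (x > y) - (x < y);
        }

        int main(void)
        {
                static unsigned char src[SZ], dst[SZ];
                static unsigned long long ns[RUNS];
                struct timespec t0, t1;
                int i;

                for (i = 0; i < RUNS; i++) {
                        /* dirty the source; clflush it for cold-cache runs */
                        memset(src, i, SZ);

                        clock_gettime(CLOCK_MONOTONIC, &t0);
                        memcpy(dst, src, SZ);   /* copy variant under test */
                        clock_gettime(CLOCK_MONOTONIC, &t1);

                        ns[i] = (t1.tv_sec - t0.tv_sec) * 1000000000ULL +
                                (t1.tv_nsec - t0.tv_nsec);
                }

                qsort(ns, RUNS, sizeof(ns[0]), cmp_ull);
                printf("median: %llu ns (dst[17]=%d)\n", ns[RUNS / 2], dst[17]);

                return 0;
        }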

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 4/4] x86: Create a new synthetic cpu capability for machine check recovery
  2016-02-09 23:38       ` Luck, Tony
@ 2016-02-10 11:06         ` Borislav Petkov
  -1 siblings, 0 replies; 34+ messages in thread
From: Borislav Petkov @ 2016-02-10 11:06 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Tue, Feb 09, 2016 at 03:38:57PM -0800, Luck, Tony wrote:
> We use the same model number for E5 and E7 series. E.g. 63 for Haswell.
> The model_id string seems to be the only way to tell ahead of time
> whether you will get a recoverable machine check or die when you
> touch uncorrected memory.

What about MSR_IA32_PLATFORM_ID or some other MSR or register, for
example?

I.e., isn't there some other, more reliable distinction between E5 and
E7 besides the model ID?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 4/4] x86: Create a new synthetic cpu capability for machine check recovery
  2016-02-10 11:06         ` Borislav Petkov
@ 2016-02-10 19:27           ` Luck, Tony
  -1 siblings, 0 replies; 34+ messages in thread
From: Luck, Tony @ 2016-02-10 19:27 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Feb 10, 2016 at 12:06:03PM +0100, Borislav Petkov wrote:
> What about MSR_IA32_PLATFORM_ID or some other MSR or register, for
> example?

Bits 52:50 give us "information concerning the intended platform
for the processor" ... but we don't seem to decode that vague
statement into anything that I can make use of.
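
Pulling the bits out is the easy part, of course - it's the decode
that is missing.  Something like (sketch):

        u64 plat;

        rdmsrl(MSR_IA32_PLATFORM_ID, plat);
        plat = (plat >> 50) & 0x7;      /* "intended platform" ... */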

> I.e., isn't there some other, more reliable distinction between E5 and
> E7 besides the model ID?

Digging in the data sheet I found the CAPID0 register, which does
indicate in bit 4 whether this is an "EX" (a.k.a. "E7") part. But
we invent a new PCI device ID for this every generation (0x0EC3 in
Ivy Bridge, 0x2fc0 in Haswell, 0x6fc0 in Broadwell). The offset
has stayed at 0x84 through all this.
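
Just to show what that would look like (a sketch using the device IDs
above; helper name made up, and the table would need extending every
generation):

        #include <linux/kernel.h>
        #include <linux/pci.h>
        #include <linux/pci_ids.h>

        /* PCI device IDs of the device carrying CAPID0, per generation */
        static const unsigned int capid0_devid[] = {
                0x0ec3,         /* Ivy Bridge */
                0x2fc0,         /* Haswell */
                0x6fc0,         /* Broadwell */
        };

        static bool cpu_is_e7(void)
        {
                struct pci_dev *pdev;
                u32 capid0;
                int i;

                for (i = 0; i < ARRAY_SIZE(capid0_devid); i++) {
                        pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
                                              capid0_devid[i], NULL);
                        if (!pdev)
                                continue;
                        pci_read_config_dword(pdev, 0x84, &capid0);
                        pci_dev_put(pdev);
                        return capid0 & (1 << 4);       /* "EX" part */
                }

                return false;
        }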

I don't think that hunting the ever-changing PCI-id is a
good choice ... the "E5/E7" naming convention has stuck for
four generations[1] (Sandy Bridge, Ivy Bridge, Haswell, Broadwell).

-Tony

[1] Although this probably means that marketing are about to
think of something new ... they generally do when people start
understanding the model names :-(

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-02-10 10:58         ` Borislav Petkov
@ 2016-02-10 19:39           ` Luck, Tony
  -1 siblings, 0 replies; 34+ messages in thread
From: Luck, Tony @ 2016-02-10 19:39 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Feb 10, 2016 at 11:58:43AM +0100, Borislav Petkov wrote:
> But one could take that function out, do some microbenchmarking with
> different sizes, once with the current version and once with the
> pushes and pops of %r12-%r15, and see where the break-even point is.

On a 4K page copy from a source address that isn't in the
cache I see all sorts of answers.

On my desktop (i7-3960X) it is ~50 cycles slower to push and pop the four
registers.

On my latest Xeon - I can't post benchmarks ... but also a bit slower.

On an older Xeon it is a few cycles faster (though even looking at
the median of 10,000 runs, I see more run-to-run variation than
difference between register choices).

Here's what I tested:

	push %r12
	push %r13
	push %r14
	push %r15

	/* Loop copying whole cache lines */
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
9:	movq 4*8(%rsi),%r12
10:	movq 5*8(%rsi),%r13
11:	movq 6*8(%rsi),%r14
12:	movq 7*8(%rsi),%r15
	movq %r8,(%rdi)
	movq %r9,1*8(%rdi)
	movq %r10,2*8(%rdi)
	movq %r11,3*8(%rdi)
	movq %r12,4*8(%rdi)
	movq %r13,5*8(%rdi)
	movq %r14,6*8(%rdi)
	movq %r15,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b

	pop %r15
	pop %r14
	pop %r13
	pop %r12
-Tony

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 3/4] x86, mce: Add __mcsafe_copy()
  2016-02-10 19:39           ` Luck, Tony
@ 2016-02-10 20:50             ` Borislav Petkov
  -1 siblings, 0 replies; 34+ messages in thread
From: Borislav Petkov @ 2016-02-10 20:50 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Feb 10, 2016 at 11:39:05AM -0800, Luck, Tony wrote:
> On Wed, Feb 10, 2016 at 11:58:43AM +0100, Borislav Petkov wrote:
> > But one could take that function out, do some microbenchmarking with
> > different sizes, once with the current version and once with the
> > pushes and pops of %r12-%r15, and see where the break-even point is.
> 
> On a 4K page copy from a source address that isn't in the
> cache I see all sorts of answers.
> 
> On my desktop (i7-3960X) it is ~50 cycles slower to push and pop the four
> registers.
> 
> On my latest Xeon - I can't post benchmarks ... but also a bit slower.
> 
> On an older Xeon it is a few cycles faster (though even looking at
> the median of 10,000 runs, I see more run-to-run variation than
> difference between register choices).

Hmm, strange. Can you check whether perf also shows no significant
difference? Something like:

perf stat --repeat 100 --sync --pre 'echo 3 > /proc/sys/vm/drop_caches' -- ./mcsafe_copy_1

and then

perf stat --repeat 100 --sync --pre 'echo 3 > /proc/sys/vm/drop_caches' -- ./mcsafe_copy_2

That'll be interesting...

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v10 4/4] x86: Create a new synthetic cpu capability for machine check recovery
  2016-02-10 19:27           ` Luck, Tony
@ 2016-02-11 11:55             ` Borislav Petkov
  -1 siblings, 0 replies; 34+ messages in thread
From: Borislav Petkov @ 2016-02-11 11:55 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, Brian Gerst, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Feb 10, 2016 at 11:27:50AM -0800, Luck, Tony wrote:
> Digging in the data sheet I found the CAPID0 register, which does
> indicate in bit 4 whether this is an "EX" (a.k.a. "E7") part. But
> we invent a new PCI device ID for this every generation (0x0EC3 in
> Ivy Bridge, 0x2fc0 in Haswell, 0x6fc0 in Broadwell). The offset
> has stayed at 0x84 through all this.
> 
> I don't think that hunting the ever-changing PCI-id is a
> good choice ...

Right :-\

> the "E5/E7" naming convention has stuck for
> four generations[1] (Sandy Bridge, Ivy Bridge, Haswell, Broadwell).
> 
> -Tony
> 
> [1] Although this probably means that marketing are about to
> think of something new ... they generally do when people start
> understanding the model names :-(

Yeah, customers shouldn't slack and relax into even thinking they know
the model names. Fortunately there's wikipedia...

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2016-02-11 11:55 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-04 20:36 [PATCH v10 0/4] Machine check recovery when kernel accesses poison Tony Luck
2016-02-04 20:36 ` Tony Luck
2015-12-31 19:40 ` [PATCH v10 2/4] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
2015-12-31 19:40   ` Tony Luck
2016-01-08 20:49 ` [PATCH v10 1/4] x86: Expand exception table to allow new handling options Tony Luck
2016-01-08 20:49   ` Tony Luck
2016-01-08 21:18 ` [PATCH v10 3/4] x86, mce: Add __mcsafe_copy() Tony Luck
2016-01-08 21:18   ` Tony Luck
2016-02-07 16:49   ` Borislav Petkov
2016-02-07 16:49     ` Borislav Petkov
2016-02-09 23:15     ` Luck, Tony
2016-02-09 23:15       ` Luck, Tony
2016-02-10 10:58       ` Borislav Petkov
2016-02-10 10:58         ` Borislav Petkov
2016-02-10 19:39         ` Luck, Tony
2016-02-10 19:39           ` Luck, Tony
2016-02-10 20:50           ` Borislav Petkov
2016-02-10 20:50             ` Borislav Petkov
2016-02-07 16:55   ` Borislav Petkov
2016-02-07 16:55     ` Borislav Petkov
2016-02-07 20:54     ` Richard Weinberger
2016-02-07 20:54       ` Richard Weinberger
2016-01-30  0:00 ` [PATCH v10 4/4] x86: Create a new synthetic cpu capability for machine check recovery Tony Luck
2016-01-30  0:00   ` Tony Luck
2016-02-07 17:10   ` Borislav Petkov
2016-02-07 17:10     ` Borislav Petkov
2016-02-09 23:38     ` Luck, Tony
2016-02-09 23:38       ` Luck, Tony
2016-02-10 11:06       ` Borislav Petkov
2016-02-10 11:06         ` Borislav Petkov
2016-02-10 19:27         ` Luck, Tony
2016-02-10 19:27           ` Luck, Tony
2016-02-11 11:55           ` Borislav Petkov
2016-02-11 11:55             ` Borislav Petkov
