linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-05  0:05 [PATCH v7 0/3] Machine check recovery when kernel accesses poison Tony Luck
@ 2015-12-30 17:59 ` Tony Luck
  2016-01-06 12:33   ` Borislav Petkov
  2016-01-06 12:36   ` Borislav Petkov
  2015-12-31 19:40 ` [PATCH v7 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
  2015-12-31 19:43 ` [PATCH v7 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
  2 siblings, 2 replies; 27+ messages in thread
From: Tony Luck @ 2015-12-30 17:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Starting with a patch from Andy Lutomirski <luto@amacapital.net>
that used linker relocation trickery to free up a couple of bits
in the "fixup" field of the exception table (and generalized the
uaccess_err hack to use one of the classes).

This patch allocates another one of the classes to provide
a mechanism to provide the fault number to the fixup code
in %rax.

Still one free class for the next brilliant idea. If more are
needed it should be possible to squeeze another bit or three
extending the same technique.

Originally-from: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/asm.h     | 102 +++++++++++++++++++++++++++++++----------
 arch/x86/include/asm/uaccess.h |  17 +++++--
 arch/x86/kernel/kprobes/core.c |   2 +-
 arch/x86/kernel/traps.c        |   6 +--
 arch/x86/mm/extable.c          |  66 ++++++++++++++++++--------
 arch/x86/mm/fault.c            |   2 +-
 6 files changed, 142 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..977273e36f87 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -43,19 +43,79 @@
 #define _ASM_DI		__ASM_REG(di)
 
 /* Exception table entry */
-#ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)					\
-	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
-	.long (from) - . ;					\
-	.long (to) - . ;					\
-	.popsection
 
-# define _ASM_EXTABLE_EX(from,to)				\
-	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
-	.long (from) - . ;					\
-	.long (to) - . + 0x7ffffff0 ;				\
+#define	_EXTABLE_BIAS	0x20000000
+/*
+ * An exception table entry is 64 bits.  The first 32 bits are the offset
+ * from that entry to the potentially faulting instruction.  sortextable
+ * relies on that exact encoding.  The second 32 bits encode the fault
+ * handler address.
+ *
+ * We want to stick two extra bits of handler class into the fault handler
+ * address.  All of these are generated by relocations, so we can only
+ * rely on addition.
+ *
+ * The offset to the fixup is signed, and we're trying to use the high
+ * bits for a different purpose.  In C, we could just do:
+ *
+ * u32 class_and_offset = ((target - here) & 0x3fffffff) | class;
+ *
+ * Then, to decode it, we'd mask off the class and sign-extend to recover
+ * the offset.
+ *
+ * In asm, we can't do that, because this all gets laundered through the
+ * linker, and there's no relocation type that supports this chicanery.
+ * Instead we cheat a bit.  We first add a large number to the offset
+ * (0x20000000).  The result is still nominally signed, but now it's
+ * always positive, and the two high bits are always clear.  We can then
+ * set high bits by ordinary addition or subtraction instead of using
+ * bitwise operations.  As far as the linker is concerned, all we're
+ * doing is adding a large constant to the difference between here (".")
+ * and the target, and that's a valid relocation type.
+ *
+ * We therefore emit:
+ * (target - here) + (class) + 0x20000000
+ *
+ * This has the property that the two high bits are the class (see
+ * ex_class().
+ *
+ * To get the offset back we just mask off the class bits and subtract
+ * 0x20000000. See ex_fixup_addr().
+ */
+
+/*
+ * There are two bits of extable entry class giving four classes
+ */
+#define EXTABLE_CLASS_DEFAULT	0	/* standard uaccess fixup */
+#define EXTABLE_CLASS_FAULT	1	/* provide fault number as well as fixup */
+#define EXTABLE_CLASS_EX	2	/* uaccess + set uaccess_err */
+#define EXTABLE_CLASS_UNUSED	3	/* available for something else */
+
+/*
+ * The biases are the class constants + _EXTABLE_BIAS, as signed integers.
+ * This can't use ordinary arithmetic -- the assembler isn't that smart.
+ */
+#define _EXTABLE_BIAS_DEFAULT	_EXTABLE_BIAS
+#define _EXTABLE_BIAS_FAULT	_EXTABLE_BIAS + 0x40000000
+#define _EXTABLE_BIAS_EX	_EXTABLE_BIAS - 0x80000000
+#define _EXTABLE_BIAS_UNUSED	_EXTABLE_BIAS - 0x40000000
+
+#define _ASM_EXTABLE(from,to)						\
+	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_DEFAULT)
+
+#define _ASM_EXTABLE_FAULT(from,to)					\
+	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_FAULT)
+
+#define _ASM_EXTABLE_EX(from,to)					\
+	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_EX)
+
+#ifdef __ASSEMBLY__
+# define _EXPAND_EXTABLE_BIAS(x) x
+# define _ASM_EXTABLE_CLASS(from,to,bias)				\
+	.pushsection "__ex_table","a" ;					\
+	.balign 8 ;							\
+	.long (from) - . ;						\
+	.long (to) - . + _EXPAND_EXTABLE_BIAS(bias) ;			\
 	.popsection
 
 # define _ASM_NOKPROBE(entry)					\
@@ -89,18 +149,12 @@
 	.endm
 
 #else
-# define _ASM_EXTABLE(from,to)					\
-	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
-	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - .\n"				\
-	" .popsection\n"
-
-# define _ASM_EXTABLE_EX(from,to)				\
-	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
-	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - . + 0x7ffffff0\n"			\
+# define _EXPAND_EXTABLE_BIAS(x) #x
+# define _ASM_EXTABLE_CLASS(from,to,bias)				\
+	" .pushsection \"__ex_table\",\"a\"\n"				\
+	" .balign 8\n"							\
+	" .long (" #from ") - .\n"					\
+	" .long (" #to ") - . + " _EXPAND_EXTABLE_BIAS(bias) "\n"	\
 	" .popsection\n"
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a8df874f3e88..b023300cd6f0 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -93,9 +93,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  * The exception table consists of pairs of addresses relative to the
  * exception table enty itself: the first is the address of an
  * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * at which the program should continue.  The exception table is linked
+ * soon after the fixup section, so we don't need a full 32-bit offset
+ * for the fixup. We steal the two upper bits so we can tag exception
+ * table entries with different classes. In the default class no registers
+ * are modified, so it is entirely up to the continuation code to figure
+ * out what to do.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -110,7 +113,13 @@ struct exception_table_entry {
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+static inline unsigned int
+ex_class(const struct exception_table_entry *x)
+{
+	return (u32)x->fixup >> 30;
+}
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..0f05deeff5ce 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, trapnr))
 			return 1;
 
 		/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 346eec73f7db..df25081e5970 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 	}
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
@@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	tsk = current;
 	if (!user_mode(regs)) {
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, X86_TRAP_GP))
 			return;
 
 		tsk->thread.error_code = error_code;
@@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	conditional_sti(regs);
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, X86_TRAP_DE)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..7a592ec193d5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -11,13 +11,51 @@ ex_insn_addr(const struct exception_table_entry *x)
 static inline unsigned long
 ex_fixup_addr(const struct exception_table_entry *x)
 {
-	return (unsigned long)&x->fixup + x->fixup;
+	long offset = (long)((u32)x->fixup & 0x3fffffff) - (long)_EXTABLE_BIAS;
+	return (unsigned long)&x->fixup + offset;
 }
 
-int fixup_exception(struct pt_regs *regs)
+/* Fixup functions for each exception class */
+static int fix_class_default(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+static int fix_class_fault(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return 1;
+}
+static int fix_class_ex(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	/* Special hack for uaccess_err */
+	current_thread_info()->uaccess_err = 1;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+static int fix_class_unused(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	/* can't happen unless exception table was corrupted */
+	BUG_ON(1);
+	return 0;
+}
+
+static int (*allclasses[])(const struct exception_table_entry *,
+			   struct pt_regs *, int) = {
+	[EXTABLE_CLASS_DEFAULT] = fix_class_default,
+	[EXTABLE_CLASS_FAULT] = fix_class_fault,
+	[EXTABLE_CLASS_EX] = fix_class_ex,
+	[EXTABLE_CLASS_UNUSED] = fix_class_unused
+};
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
 {
 	const struct exception_table_entry *fixup;
-	unsigned long new_ip;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -34,17 +72,8 @@ int fixup_exception(struct pt_regs *regs)
 #endif
 
 	fixup = search_exception_tables(regs->ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* Special hack for uaccess_err */
-			current_thread_info()->uaccess_err = 1;
-			new_ip -= 0x7ffffff0;
-		}
-		regs->ip = new_ip;
-		return 1;
-	}
+	if (fixup)
+		return allclasses[ex_class(fixup)](fixup, regs, trapnr);
 
 	return 0;
 }
@@ -53,18 +82,15 @@ int fixup_exception(struct pt_regs *regs)
 int __init early_fixup_exception(unsigned long *ip)
 {
 	const struct exception_table_entry *fixup;
-	unsigned long new_ip;
 
 	fixup = search_exception_tables(*ip);
 	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* uaccess handling not supported during early boot */
+		if (ex_class(fixup)) {
+			/* special handling not supported during early boot */
 			return 0;
 		}
 
-		*ip = new_ip;
+		*ip = ex_fixup_addr(fixup);
 		return 1;
 	}
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index eef44d9a3f77..495946c3f9dd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs, X86_TRAP_PF)) {
 		/*
 		 * Any interrupt that takes a fault gets the fixup. This makes
 		 * the below recursive fault logic only apply to a faults from
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v7 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries
  2016-01-05  0:05 [PATCH v7 0/3] Machine check recovery when kernel accesses poison Tony Luck
  2015-12-30 17:59 ` [PATCH v7 1/3] x86: Add classes to exception tables Tony Luck
@ 2015-12-31 19:40 ` Tony Luck
  2015-12-31 19:43 ` [PATCH v7 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
  2 siblings, 0 replies; 27+ messages in thread
From: Tony Luck @ 2015-12-31 19:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Extend the severity checking code to add a new context IN_KERN_RECOV
which is used to indicate that the machine check was triggered by code
in the kernel with a EXTABLE_CLASS_FAULT fixup entry.

Major re-work to the tail code in do_machine_check() to make all this
readable/maintainable. One functional change is that tolerant=3 no longer
stops recovery actions. Revert to only skipping sending SIGBUS to the
current process.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 32 +++++++++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          | 71 ++++++++++++++++---------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 9c682c222071..b6eee8e9b2a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -13,7 +13,9 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/debugfs.h>
+#include <linux/module.h>
 #include <asm/mce.h>
+#include <asm/uaccess.h>
 
 #include "mce-internal.h"
 
@@ -29,7 +31,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +50,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL		.context = IN_KERNEL
 #define  USER		.context = IN_USER
+#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
 #define  SER		.ser = SER_REQUIRED
 #define  NOSER		.ser = NO_SER
 #define  EXCP		.excp = EXCP_CONTEXT
@@ -87,6 +90,10 @@ static struct severity {
 		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
 		),
 	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
 		DEFERRED, "Deferred error",
 		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
 		),
@@ -123,6 +130,11 @@ static struct severity {
 		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
 		),
 	MCESEV(
+		AR, "Action required: data load in error recoverable area of kernel",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		KERNEL_RECOV
+		),
+	MCESEV(
 		AR, "Action required: data load error in a user process",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
@@ -170,6 +182,18 @@ static struct severity {
 		)	/* always matches. keep at end */
 };
 
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+static inline bool mce_in_kernel_recov(unsigned long addr)
+{
+	const struct exception_table_entry *fixup;
+
+	fixup = search_exception_tables(addr);
+
+	return fixup && ex_class(fixup) == EXTABLE_CLASS_FAULT;
+}
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -183,7 +207,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+	if ((m->cs & 3) == 3)
+		return IN_USER;
+	if (mc_recoverable(m->mcgstatus) && mce_in_kernel_recov(m->ip))
+		return IN_KERNEL_RECOV;
+	return IN_KERNEL;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9d014b82a124..d69a69b38c54 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
 #include <linux/nmi.h>
@@ -958,6 +959,20 @@ static void mce_clear_state(unsigned long *toclear)
 	}
 }
 
+static int do_memory_failure(struct mce *m)
+{
+	int flags = MF_ACTION_REQUIRED;
+	int ret;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+	if (!(m->mcgstatus & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+	if (ret)
+		pr_err("Memory error not recovered");
+	return ret;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -995,8 +1010,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
-	u64 recover_paddr = ~0ull;
-	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
 	ist_enter(regs);
@@ -1123,22 +1136,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	}
 
 	/*
-	 * At insane "tolerant" levels we take no action. Otherwise
-	 * we only die if we have no other choice. For less serious
-	 * issues we try to recover, or limit damage to the current
-	 * process.
+	 * If tolerant is at an insane level we drop requests to kill
+	 * processes and continue even when there is no way out.
 	 */
-	if (cfg->tolerant < 3) {
-		if (no_way_out)
-			mce_panic("Fatal machine check on current CPU", &m, msg);
-		if (worst == MCE_AR_SEVERITY) {
-			recover_paddr = m.addr;
-			if (!(m.mcgstatus & MCG_STATUS_RIPV))
-				flags |= MF_MUST_KILL;
-		} else if (kill_it) {
-			force_sig(SIGBUS, current);
-		}
-	}
+	if (cfg->tolerant == 3)
+		kill_it = 0;
+	else if (no_way_out)
+		mce_panic("Fatal machine check on current CPU", &m, msg);
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1146,25 +1150,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 out:
 	sync_core();
 
-	if (recover_paddr == ~0ull)
-		goto done;
+	if (worst != MCE_AR_SEVERITY && !kill_it)
+		goto out_ist;
 
-	pr_err("Uncorrected hardware memory error in user-access at %llx",
-		 recover_paddr);
-	/*
-	 * We must call memory_failure() here even if the current process is
-	 * doomed. We still need to mark the page as poisoned and alert any
-	 * other users of the page.
-	 */
-	ist_begin_non_atomic(regs);
-	local_irq_enable();
-	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-		pr_err("Memory error not recovered");
-		force_sig(SIGBUS, current);
+	/* Fault was in user mode and we need to take some action */
+	if ((m.cs & 3) == 3) {
+		ist_begin_non_atomic(regs);
+		local_irq_enable();
+
+		if (kill_it || do_memory_failure(&m))
+			force_sig(SIGBUS, current);
+		local_irq_disable();
+		ist_end_non_atomic();
+	} else {
+		if (!fixup_exception(regs, X86_TRAP_MC))
+			mce_panic("Failed kernel mode recovery", &m, NULL);
 	}
-	local_irq_disable();
-	ist_end_non_atomic();
-done:
+
+out_ist:
 	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v7 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-05  0:05 [PATCH v7 0/3] Machine check recovery when kernel accesses poison Tony Luck
  2015-12-30 17:59 ` [PATCH v7 1/3] x86: Add classes to exception tables Tony Luck
  2015-12-31 19:40 ` [PATCH v7 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
@ 2015-12-31 19:43 ` Tony Luck
  2016-01-06  4:42   ` Dan Williams
  2 siblings, 1 reply; 27+ messages in thread
From: Tony Luck @ 2015-12-31 19:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Make use of the EXTABLE_FAULT exception table entries. This routine
returns a structure to indicate the result of the copy:

struct mcsafe_ret {
	u64 trapnr;
	u64 remain;
};

If the copy is successful, then both 'trapnr' and 'remain' are zero.

If we faulted during the copy, then 'trapnr' will say which type
of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
bytes were not copied.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/Kconfig                 |  10 +++
 arch/x86/include/asm/string_64.h |  10 +++
 arch/x86/kernel/x8664_ksyms_64.c |   4 ++
 arch/x86/lib/memcpy_64.S         | 136 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 160 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96d058a87100..42d26b4d1ec4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
 	  If you don't know what a machine check is and you don't do kernel
 	  QA it is safe to say n.
 
+config MCE_KERNEL_RECOVERY
+	bool "Recovery from machine checks in special kernel memory copy functions"
+	default n
+	depends on X86_MCE && X86_64
+	---help---
+	  This option provides a new memory copy function mcsafe_memcpy()
+	  that is annotated to allow the machine check handler to return
+	  to an alternate code path to return an error to the caller instead
+	  of crashing the system. Say yes if you have a driver that uses this.
+
 config X86_THERMAL_VECTOR
 	def_bool y
 	depends on X86_MCE_INTEL
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ff8b9a17dc4b..16a8f0e56e4a 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,16 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+struct mcsafe_ret {
+	u64 trapnr;
+	u64 remain;
+};
+
+struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt);
+extern void __mcsafe_copy_end(void);
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..3d42d0ef3333 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,10 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+EXPORT_SYMBOL(__mcsafe_copy);
+#endif
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 16698bba87de..e5b1acad8b1e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -177,3 +177,139 @@ ENTRY(memcpy_orig)
 .Lend:
 	retq
 ENDPROC(memcpy_orig)
+
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+/*
+ * __mcsafe_copy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(__mcsafe_copy)
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorq %rax, %rax
+	xorq %rdx, %rdx
+	sfence
+	/* copy successful. return 0 */
+	ret
+
+	.section .fixup,"ax"
+	/* fixups for machine check */
+30:
+	add %ecx,%edx
+	jmp 100f
+31:
+	shl $6,%ecx
+	add %ecx,%edx
+	jmp 100f
+32:
+	shl $6,%ecx
+	lea -8(%ecx,%edx),%edx
+	jmp 100f
+33:
+	shl $6,%ecx
+	lea -16(%ecx,%edx),%edx
+	jmp 100f
+34:
+	shl $6,%ecx
+	lea -24(%ecx,%edx),%edx
+	jmp 100f
+35:
+	shl $6,%ecx
+	lea -32(%ecx,%edx),%edx
+	jmp 100f
+36:
+	shl $6,%ecx
+	lea -40(%ecx,%edx),%edx
+	jmp 100f
+37:
+	shl $6,%ecx
+	lea -48(%ecx,%edx),%edx
+	jmp 100f
+38:
+	shl $6,%ecx
+	lea -56(%ecx,%edx),%edx
+	jmp 100f
+39:
+	lea (%rdx,%rcx,8),%rdx
+	jmp 100f
+40:
+	mov %ecx,%edx
+100:
+	sfence
+
+	/* %rax set the fault number in fixup_exception() */
+	ret
+	.previous
+
+	_ASM_EXTABLE_FAULT(0b,30b)
+	_ASM_EXTABLE_FAULT(1b,31b)
+	_ASM_EXTABLE_FAULT(2b,32b)
+	_ASM_EXTABLE_FAULT(3b,33b)
+	_ASM_EXTABLE_FAULT(4b,34b)
+	_ASM_EXTABLE_FAULT(9b,35b)
+	_ASM_EXTABLE_FAULT(10b,36b)
+	_ASM_EXTABLE_FAULT(11b,37b)
+	_ASM_EXTABLE_FAULT(12b,38b)
+	_ASM_EXTABLE_FAULT(18b,39b)
+	_ASM_EXTABLE_FAULT(21b,40b)
+ENDPROC(__mcsafe_copy)
+#endif
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v7 0/3] Machine check recovery when kernel accesses poison
@ 2016-01-05  0:05 Tony Luck
  2015-12-30 17:59 ` [PATCH v7 1/3] x86: Add classes to exception tables Tony Luck
                   ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Tony Luck @ 2016-01-05  0:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

This series is initially targeted at the folks doing filesystems
on top of NVDIMMs. They really want to be able to return -EIO
when there is a h/w error (just like spinning rust, and SSD does).

I plan to use the same infrastructure to write a machine check aware
"copy_from_user()" that will SIGBUS the calling application when a
syscall touches poison in user space (just like we do when the application
touches the poison itself).

Changes V6-V7:
Boris:	Why add/subtract 0x20000000? Added better comment provided by Andy
Boris:	Churn. Part2 changes things only introduced in part1.
	Merged parts 1&2 into one patch.
Ingo:	Missing my sign off on part1. Added.

Changes V5-V6
Andy:	Provoked massive re-write by providing what is now part1 of this
	patch series. This frees up two bits in the exception table
	fixup field that can be used to tag exception table entries
	as different "classes". This means we don't need my separate
	exception table fro machine checks. Also avoids duplicating
	fixup actions for #PF and #MC cases that were in version 5.
Andy:	Use C99 array initializers to tie the various class fixup
	functions back to the defintions of each class. Also give the
	functions meanningful names (not fixup_class0() etc.).
Boris:	Cleaned up my lousy assembly code removing many spurious 'l'
	modifiers on instructions.
Boris:	Provided some helper functions for the machine check severity
	calculation that make the code more readable.
Boris:	Have __mcsafe_copy() return a structure with the 'remaining bytes'
	in a separate field from the fault indicator. Boris had suggested
	Linux -EFAULT/-EINVAL ... but I thought it made more sense to return
	the exception number (X86_TRAP_MC, etc.)  This finally kills off
	BIT(63) which has been controversial throughout all the early versions
	of this patch series.

Changes V4-V5
Tony:	Extended __mcsafe_copy() to have fixup entries for both machine
	check and page fault.

Changes V3-V4:
Andy:   Simplify fixup_mcexception() by dropping used-once local variable
Andy:   "Reviewed-by" tag added to part1
Boris:  Moved new functions to memcpy_64.S and declaration to asm/string_64.h
Boris:  Changed name s/mcsafe_memcpy/__mcsafe_copy/ to make it clear that this
        is an internal function and that return value doesn't follow memcpy() semantics.
Boris:  "Reviewed-by" tag added to parts 1&2

Changes V2-V3:

Andy:   Don't hack "regs->ax = BIT(63) | addr;" in the machine check
        handler.  Now have better fixup code that computes the number
        of remaining bytes (just like page-fault fixup).
Andy:   #define for BIT(63). Done, plus couple of extra macros using it.
Boris:  Don't clutter up generic code (like mm/extable.c) with this.
        I moved everything under arch/x86 (the asm-generic change is
        a more generic #define).
Boris:  Dependencies for CONFIG_MCE_KERNEL_RECOVERY are too generic.
        I made it a real menu item with default "n". Dan Williams
        will use "select MCE_KERNEL_RECOVERY" from his persistent
        filesystem code.
Boris:  Simplify conditionals in mce.c by moving tolerant/kill_it
        checks earlier, with a skip to end if they aren't set.
Boris:  Miscellaneous grammar/punctuation. Fixed.
Boris:  Don't leak spurious __start_mcextable symbols into kernels
        that didn't configure MCE_KERNEL_RECOVERY. Done.
Tony:   New code doesn't belong in user_copy_64.S/uaccess*.h. Moved
        to new .S/.h files
Elliott:Cacheing behavior non-optimal. Could use movntdqa, vmovntdqa
        or vmovntdqa on source addresses. I didn't fix this yet. Think
        of the current mcsafe_memcpy() as the first of several functions.
        This one is useful for small copies (meta-data) where the overhead
        of saving SSE/AVX state isn't justified.

Changes V1->V2:

0-day:  Reported build errors and warnings on 32-bit systems. Fixed
0-day:  Reported bloat to tinyconfig. Fixed
Boris:  Suggestions to use extra macros to reduce code duplication in _ASM_*EXTABLE. Done
Boris:  Re-write "tolerant==3" check to reduce indentation level. See below.
Andy:   Check IP is valid before searching kernel exception tables. Done.
Andy:   Explain use of BIT(63) on return value from mcsafe_memcpy(). Done (added decode macros).
Andy:   Untangle mess of code in tail of do_machine_check() to make it
        clear what is going on (e.g. that we only enter the ist_begin_non_atomic()
        if we were called from user code, not from kernel!). Done.

Tony Luck (3):
  x86: Add classes to exception tables
  x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception
    table entries
  x86, mce: Add __mcsafe_copy()

 arch/x86/Kconfig                          |  10 +++
 arch/x86/include/asm/asm.h                | 102 ++++++++++++++++------
 arch/x86/include/asm/string_64.h          |  10 +++
 arch/x86/include/asm/uaccess.h            |  17 +++-
 arch/x86/kernel/cpu/mcheck/mce-severity.c |  32 ++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          |  71 ++++++++--------
 arch/x86/kernel/kprobes/core.c            |   2 +-
 arch/x86/kernel/traps.c                   |   6 +-
 arch/x86/kernel/x8664_ksyms_64.c          |   4 +
 arch/x86/lib/memcpy_64.S                  | 136 ++++++++++++++++++++++++++++++
 arch/x86/mm/extable.c                     |  66 ++++++++++-----
 arch/x86/mm/fault.c                       |   2 +-
 12 files changed, 369 insertions(+), 89 deletions(-)

-- 
2.1.4


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 3/3] x86, mce: Add __mcsafe_copy()
  2015-12-31 19:43 ` [PATCH v7 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
@ 2016-01-06  4:42   ` Dan Williams
  2016-01-06  7:06     ` Luck, Tony
  0 siblings, 1 reply; 27+ messages in thread
From: Dan Williams @ 2016-01-06  4:42 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Elliott, Robert (Persistent Memory),
	linux-kernel, Linux MM, linux-nvdimm, X86 ML

On Thu, Dec 31, 2015 at 11:43 AM, Tony Luck <tony.luck@intel.com> wrote:
> Make use of the EXTABLE_FAULT exception table entries. This routine
> returns a structure to indicate the result of the copy:
>
> struct mcsafe_ret {
>         u64 trapnr;
>         u64 remain;
> };
>
> If the copy is successful, then both 'trapnr' and 'remain' are zero.
>
> If we faulted during the copy, then 'trapnr' will say which type
> of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
> bytes were not copied.
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/Kconfig                 |  10 +++
>  arch/x86/include/asm/string_64.h |  10 +++
>  arch/x86/kernel/x8664_ksyms_64.c |   4 ++
>  arch/x86/lib/memcpy_64.S         | 136 +++++++++++++++++++++++++++++++++++++++
>  4 files changed, 160 insertions(+)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 96d058a87100..42d26b4d1ec4 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
>           If you don't know what a machine check is and you don't do kernel
>           QA it is safe to say n.
>
> +config MCE_KERNEL_RECOVERY
> +       bool "Recovery from machine checks in special kernel memory copy functions"
> +       default n
> +       depends on X86_MCE && X86_64
> +       ---help---
> +         This option provides a new memory copy function mcsafe_memcpy()
> +         that is annotated to allow the machine check handler to return
> +         to an alternate code path to return an error to the caller instead
> +         of crashing the system. Say yes if you have a driver that uses this.
> +
>  config X86_THERMAL_VECTOR
>         def_bool y
>         depends on X86_MCE_INTEL
> diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
> index ff8b9a17dc4b..16a8f0e56e4a 100644
> --- a/arch/x86/include/asm/string_64.h
> +++ b/arch/x86/include/asm/string_64.h
> @@ -78,6 +78,16 @@ int strcmp(const char *cs, const char *ct);
>  #define memset(s, c, n) __memset(s, c, n)
>  #endif
>
> +#ifdef CONFIG_MCE_KERNEL_RECOVERY
> +struct mcsafe_ret {
> +       u64 trapnr;
> +       u64 remain;
> +};

Can we move this definition outside of the CONFIG_MCE_KERNEL_RECOVERY
ifdef guard?  On a test integration branch the kbuild robot caught the
following:

   In file included from include/linux/pmem.h:21:0,
                    from drivers/acpi/nfit.c:22:
   arch/x86/include/asm/pmem.h: In function 'arch_memcpy_from_pmem':
>> arch/x86/include/asm/pmem.h:55:21: error: storage size of 'ret' isn't known
      struct mcsafe_ret ret;
                        ^
>> arch/x86/include/asm/pmem.h:57:9: error: implicit declaration of function '__mcsafe_copy' [-Werror=implicit-function-declaration]
      ret = __mcsafe_copy(dst, (void __force *) src, n);
            ^
>> arch/x86/include/asm/pmem.h:55:21: warning: unused variable 'ret' [-Wunused-variable]
      struct mcsafe_ret ret;
                        ^
   cc1: some warnings being treated as errors

vim +55 arch/x86/include/asm/pmem.h

    49  }
    50
    51  static inline int arch_memcpy_from_pmem(void *dst, const void
__pmem *src,
    52                  size_t n)
    53  {
    54          if (IS_ENABLED(CONFIG_MCE_KERNEL_RECOVERY)) {
  > 55                  struct mcsafe_ret ret;
    56
  > 57                  ret = __mcsafe_copy(dst, (void __force *) src, n);
    58                  if (ret.remain)
    59                          return -EIO;
    60                  return 0;

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-06  4:42   ` Dan Williams
@ 2016-01-06  7:06     ` Luck, Tony
  2016-01-06  7:11       ` Dan Williams
  0 siblings, 1 reply; 27+ messages in thread
From: Luck, Tony @ 2016-01-06  7:06 UTC (permalink / raw)
  To: Williams, Dan J
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Elliott, Robert (Persistent Memory),
	linux-kernel, Linux MM, linux-nvdimm, X86 ML

You were heading towards:

ld: undefined __mcsafe_copy

since that is also inside the #ifdef. 

Weren't you going to "select" this?

I'm seriously wondering whether the ifdef still makes sense. Now I don't have an extra exception table and routines to sort/search/fixup, it doesn't seem as useful as it was a few iterations ago.

Sent from my iPhone

> On Jan 5, 2016, at 20:43, Dan Williams <dan.j.williams@intel.com> wrote:
> 
>> On Thu, Dec 31, 2015 at 11:43 AM, Tony Luck <tony.luck@intel.com> wrote:
>> Make use of the EXTABLE_FAULT exception table entries. This routine
>> returns a structure to indicate the result of the copy:
>> 
>> struct mcsafe_ret {
>>        u64 trapnr;
>>        u64 remain;
>> };
>> 
>> If the copy is successful, then both 'trapnr' and 'remain' are zero.
>> 
>> If we faulted during the copy, then 'trapnr' will say which type
>> of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
>> bytes were not copied.
>> 
>> Signed-off-by: Tony Luck <tony.luck@intel.com>
>> ---
>> arch/x86/Kconfig                 |  10 +++
>> arch/x86/include/asm/string_64.h |  10 +++
>> arch/x86/kernel/x8664_ksyms_64.c |   4 ++
>> arch/x86/lib/memcpy_64.S         | 136 +++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 160 insertions(+)
>> 
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 96d058a87100..42d26b4d1ec4 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
>>          If you don't know what a machine check is and you don't do kernel
>>          QA it is safe to say n.
>> 
>> +config MCE_KERNEL_RECOVERY
>> +       bool "Recovery from machine checks in special kernel memory copy functions"
>> +       default n
>> +       depends on X86_MCE && X86_64
>> +       ---help---
>> +         This option provides a new memory copy function mcsafe_memcpy()
>> +         that is annotated to allow the machine check handler to return
>> +         to an alternate code path to return an error to the caller instead
>> +         of crashing the system. Say yes if you have a driver that uses this.
>> +
>> config X86_THERMAL_VECTOR
>>        def_bool y
>>        depends on X86_MCE_INTEL
>> diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
>> index ff8b9a17dc4b..16a8f0e56e4a 100644
>> --- a/arch/x86/include/asm/string_64.h
>> +++ b/arch/x86/include/asm/string_64.h
>> @@ -78,6 +78,16 @@ int strcmp(const char *cs, const char *ct);
>> #define memset(s, c, n) __memset(s, c, n)
>> #endif
>> 
>> +#ifdef CONFIG_MCE_KERNEL_RECOVERY
>> +struct mcsafe_ret {
>> +       u64 trapnr;
>> +       u64 remain;
>> +};
> 
> Can we move this definition outside of the CONFIG_MCE_KERNEL_RECOVERY
> ifdef guard?  On a test integration branch the kbuild robot caught the
> following:
> 
>   In file included from include/linux/pmem.h:21:0,
>                    from drivers/acpi/nfit.c:22:
>   arch/x86/include/asm/pmem.h: In function 'arch_memcpy_from_pmem':
>>> arch/x86/include/asm/pmem.h:55:21: error: storage size of 'ret' isn't known
>      struct mcsafe_ret ret;
>                        ^
>>> arch/x86/include/asm/pmem.h:57:9: error: implicit declaration of function '__mcsafe_copy' [-Werror=implicit-function-declaration]
>      ret = __mcsafe_copy(dst, (void __force *) src, n);
>            ^
>>> arch/x86/include/asm/pmem.h:55:21: warning: unused variable 'ret' [-Wunused-variable]
>      struct mcsafe_ret ret;
>                        ^
>   cc1: some warnings being treated as errors
> 
> vim +55 arch/x86/include/asm/pmem.h
> 
>    49  }
>    50
>    51  static inline int arch_memcpy_from_pmem(void *dst, const void
> __pmem *src,
>    52                  size_t n)
>    53  {
>    54          if (IS_ENABLED(CONFIG_MCE_KERNEL_RECOVERY)) {
>> 55                  struct mcsafe_ret ret;
>    56
>> 57                  ret = __mcsafe_copy(dst, (void __force *) src, n);
>    58                  if (ret.remain)
>    59                          return -EIO;
>    60                  return 0;

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-06  7:06     ` Luck, Tony
@ 2016-01-06  7:11       ` Dan Williams
  2016-01-06 16:37         ` Dan Williams
  0 siblings, 1 reply; 27+ messages in thread
From: Dan Williams @ 2016-01-06  7:11 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Elliott, Robert (Persistent Memory),
	linux-kernel, Linux MM, linux-nvdimm, X86 ML

On Tue, Jan 5, 2016 at 11:06 PM, Luck, Tony <tony.luck@intel.com> wrote:
> You were heading towards:
>
> ld: undefined __mcsafe_copy

True, we'd also need a dummy mcsafe_copy() definition to compile it
out in the disabled case.

> since that is also inside the #ifdef.
>
> Weren't you going to "select" this?
>

I do select it, but by randconfig I still need to handle the
CONFIG_X86_MCE=n case.

> I'm seriously wondering whether the ifdef still makes sense. Now I don't have an extra exception table and routines to sort/search/fixup, it doesn't seem as useful as it was a few iterations ago.

Either way is ok with me.  That said, the extra definitions to allow
it compile out when not enabled don't seem too onerous.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2015-12-30 17:59 ` [PATCH v7 1/3] x86: Add classes to exception tables Tony Luck
@ 2016-01-06 12:33   ` Borislav Petkov
  2016-01-06 17:35     ` Luck, Tony
  2016-01-06 17:54     ` Andy Lutomirski
  2016-01-06 12:36   ` Borislav Petkov
  1 sibling, 2 replies; 27+ messages in thread
From: Borislav Petkov @ 2016-01-06 12:33 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Dec 30, 2015 at 09:59:29AM -0800, Tony Luck wrote:
> Starting with a patch from Andy Lutomirski <luto@amacapital.net>
> that used linker relocation trickery to free up a couple of bits
> in the "fixup" field of the exception table (and generalized the
> uaccess_err hack to use one of the classes).

So I still think that the other idea Andy gave with putting the handler
in the exception table is much cleaner and straightforward.

Here's a totally untested patch which at least builds here. I think this
approach is much more extensible and simpler for the price of a couple
of KBs of __ex_table size.

---
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..43b509c88b13 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -44,18 +44,20 @@
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)					\
+# define _ASM_EXTABLE(from,to)				\
 	.pushsection "__ex_table","a" ;				\
 	.balign 8 ;						\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
+	.long 0 - .;						\
 	.popsection
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	.pushsection "__ex_table","a" ;				\
 	.balign 8 ;						\
 	.long (from) - . ;					\
-	.long (to) - . + 0x7ffffff0 ;				\
+	.long (to) - . ;					\
+	.long ex_handler_ext - . ;				\
 	.popsection
 
 # define _ASM_NOKPROBE(entry)					\
@@ -94,13 +96,14 @@
 	" .balign 8\n"						\
 	" .long (" #from ") - .\n"				\
 	" .long (" #to ") - .\n"				\
+	" .long 0 - .\n"					\
 	" .popsection\n"
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
 	" .balign 8\n"						\
 	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - . + 0x7ffffff0\n"			\
+	" .long ex_handler_ext - .\n"				\
 	" .popsection\n"
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 09b1b0ab94b7..22b49c3b311a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -104,13 +104,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  */
 
 struct exception_table_entry {
-	int insn, fixup;
+	int insn, fixup, handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..0f05deeff5ce 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, trapnr))
 			return 1;
 
 		/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ade185a46b1d..211c11c7bba4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 	}
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
@@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	tsk = current;
 	if (!user_mode(regs)) {
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, X86_TRAP_GP))
 			return;
 
 		tsk->thread.error_code = error_code;
@@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	conditional_sti(regs);
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..191f4b7d1d2d 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -3,6 +3,8 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
+typedef int (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int);
+
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
 {
@@ -14,10 +16,39 @@ ex_fixup_addr(const struct exception_table_entry *x)
 	return (unsigned long)&x->fixup + x->fixup;
 }
 
-int fixup_exception(struct pt_regs *regs)
+inline ex_handler_t ex_fixup_handler(const struct exception_table_entry *x)
+{
+	return (ex_handler_t)&x->handler + x->handler;
+}
+
+int ex_handler_default(const struct exception_table_entry *fixup,
+		       struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+
+int ex_handler_fault(const struct exception_table_entry *fixup,
+		     struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return 1;
+}
+int ex_handler_ext(const struct exception_table_entry *fixup,
+		   struct pt_regs *regs, int trapnr)
 {
-	const struct exception_table_entry *fixup;
+	/* Special hack for uaccess_err */
+	current_thread_info()->uaccess_err = 1;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+	const struct exception_table_entry *e;
 	unsigned long new_ip;
+	ex_handler_t handler;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +64,40 @@ int fixup_exception(struct pt_regs *regs)
 	}
 #endif
 
-	fixup = search_exception_tables(regs->ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* Special hack for uaccess_err */
-			current_thread_info()->uaccess_err = 1;
-			new_ip -= 0x7ffffff0;
-		}
-		regs->ip = new_ip;
-		return 1;
-	}
+	e = search_exception_tables(regs->ip);
+	if (!e)
+		return 0;
+
+	new_ip  = ex_fixup_addr(e);
+	handler = ex_fixup_handler(e);
+
+	if (!handler)
+		handler = ex_handler_default;
+
+	return handler(e, regs, trapnr);
 
-	return 0;
 }
 
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
-	const struct exception_table_entry *fixup;
+	const struct exception_table_entry *e;
 	unsigned long new_ip;
+	ex_handler_t handler;
 
-	fixup = search_exception_tables(*ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
+	e = search_exception_tables(*ip);
+	if (!e)
+		return 0;
 
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* uaccess handling not supported during early boot */
-			return 0;
-		}
+	new_ip  = ex_fixup_addr(e);
+	handler = ex_fixup_handler(e);
 
-		*ip = new_ip;
-		return 1;
-	}
+	/* uaccess handling not supported during early boot */
+	if (handler && handler == ex_handler_ext)
+		return 0;
 
-	return 0;
+	*ip = new_ip;
+	return 1;
 }
 
 /*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index eef44d9a3f77..495946c3f9dd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs, X86_TRAP_PF)) {
 		/*
 		 * Any interrupt that takes a fault gets the fixup. This makes
 		 * the below recursive fault logic only apply to a faults from

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2015-12-30 17:59 ` [PATCH v7 1/3] x86: Add classes to exception tables Tony Luck
  2016-01-06 12:33   ` Borislav Petkov
@ 2016-01-06 12:36   ` Borislav Petkov
  1 sibling, 0 replies; 27+ messages in thread
From: Borislav Petkov @ 2016-01-06 12:36 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Dec 30, 2015 at 09:59:29AM -0800, Tony Luck wrote:
> Starting with a patch from Andy Lutomirski <luto@amacapital.net>
> that used linker relocation trickery to free up a couple of bits
> in the "fixup" field of the exception table (and generalized the
> uaccess_err hack to use one of the classes).
> 
> This patch allocates another one of the classes to provide
> a mechanism to provide the fault number to the fixup code
> in %rax.
> 
> Still one free class for the next brilliant idea. If more are
> needed it should be possible to squeeze another bit or three
> extending the same technique.
> 
> Originally-from: Andy Lutomirski <luto@amacapital.net>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/asm.h     | 102 +++++++++++++++++++++++++++++++----------
>  arch/x86/include/asm/uaccess.h |  17 +++++--
>  arch/x86/kernel/kprobes/core.c |   2 +-
>  arch/x86/kernel/traps.c        |   6 +--
>  arch/x86/mm/extable.c          |  66 ++++++++++++++++++--------
>  arch/x86/mm/fault.c            |   2 +-
>  6 files changed, 142 insertions(+), 53 deletions(-)

...

> @@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
>  	conditional_sti(regs);
>  
>  	if (!user_mode(regs)) {
> -		if (!fixup_exception(regs)) {
> +		if (!fixup_exception(regs, X86_TRAP_DE)) {

Whatever we end up doing, this needs to be trapnr above and not X86_TRAP_DE.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-06  7:11       ` Dan Williams
@ 2016-01-06 16:37         ` Dan Williams
  2016-01-06 16:57           ` Luck, Tony
  0 siblings, 1 reply; 27+ messages in thread
From: Dan Williams @ 2016-01-06 16:37 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Elliott, Robert (Persistent Memory),
	linux-kernel, Linux MM, linux-nvdimm, X86 ML

On Tue, Jan 5, 2016 at 11:11 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Jan 5, 2016 at 11:06 PM, Luck, Tony <tony.luck@intel.com> wrote:
>> You were heading towards:
>>
>> ld: undefined __mcsafe_copy
>
> True, we'd also need a dummy mcsafe_copy() definition to compile it
> out in the disabled case.
>
>> since that is also inside the #ifdef.
>>
>> Weren't you going to "select" this?
>>
>
> I do select it, but by randconfig I still need to handle the
> CONFIG_X86_MCE=n case.
>
>> I'm seriously wondering whether the ifdef still makes sense. Now I don't have an extra exception table and routines to sort/search/fixup, it doesn't seem as useful as it was a few iterations ago.
>
> Either way is ok with me.  That said, the extra definitions to allow
> it compile out when not enabled don't seem too onerous.

This works for me, because all we need is the definitions.  As long as
we don't attempt to link to mcsafe_copy() we get the benefit of
compiling this out when de-selected:


diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 16a8f0e56e4a..5b24039463a4 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,7 +78,6 @@ int strcmp(const char *cs, const char *ct);
#define memset(s, c, n) __memset(s, c, n)
#endif

-#ifdef CONFIG_MCE_KERNEL_RECOVERY
struct mcsafe_ret {
       u64 trapnr;
       u64 remain;
@@ -86,7 +85,6 @@ struct mcsafe_ret {

struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, size_t cnt);
extern void __mcsafe_copy_end(void);
-#endif

#endif /* __KERNEL__ */

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* RE: [PATCH v7 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-06 16:37         ` Dan Williams
@ 2016-01-06 16:57           ` Luck, Tony
  2016-01-06 17:05             ` Dan Williams
  0 siblings, 1 reply; 27+ messages in thread
From: Luck, Tony @ 2016-01-06 16:57 UTC (permalink / raw)
  To: Williams, Dan J
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Elliott, Robert (Persistent Memory),
	linux-kernel, Linux MM, linux-nvdimm, X86 ML

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1281 bytes --]

>> I do select it, but by randconfig I still need to handle the
>> CONFIG_X86_MCE=n case.
>>
>>> I'm seriously wondering whether the ifdef still makes sense. Now I don't have an extra exception table and routines to sort/search/fixup, it doesn't seem as useful as it was a few iterations ago.
>>
>> Either way is ok with me.  That said, the extra definitions to allow
>> it compile out when not enabled don't seem too onerous.
>
> This works for me, because all we need is the definitions.  As long as
> we don't attempt to link to mcsafe_copy() we get the benefit of
> compiling this out when de-selected:

It seems  that Kconfig's "select" statement doesn't auto-select other things
that are dependencies of the symbol you choose.

CONFIG_MCE_KERNEL_RECOVERY really is dependent on
CONFIG_X86_MCE ... having the code for the __mcsafe_copy()
linked into the kernel won't do you any good without a machine
check handler that jumps to the fixup code.

So I think you have to select X86_MCE as well (or Kconfig needs
to be taught to do it automatically ... but I have a nagging feeling
that this is known behavior).

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 3/3] x86, mce: Add __mcsafe_copy()
  2016-01-06 16:57           ` Luck, Tony
@ 2016-01-06 17:05             ` Dan Williams
  0 siblings, 0 replies; 27+ messages in thread
From: Dan Williams @ 2016-01-06 17:05 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Elliott, Robert (Persistent Memory),
	linux-kernel, Linux MM, linux-nvdimm, X86 ML

On Wed, Jan 6, 2016 at 8:57 AM, Luck, Tony <tony.luck@intel.com> wrote:
>>> I do select it, but by randconfig I still need to handle the
>>> CONFIG_X86_MCE=n case.
>>>
>>>> I'm seriously wondering whether the ifdef still makes sense. Now I don't have an extra exception table and routines to sort/search/fixup, it doesn't seem as useful as it was a few iterations ago.
>>>
>>> Either way is ok with me.  That said, the extra definitions to allow
>>> it compile out when not enabled don't seem too onerous.
>>
>> This works for me, because all we need is the definitions.  As long as
>> we don't attempt to link to mcsafe_copy() we get the benefit of
>> compiling this out when de-selected:
>
> It seems  that Kconfig's "select" statement doesn't auto-select other things
> that are dependencies of the symbol you choose.
>
> CONFIG_MCE_KERNEL_RECOVERY really is dependent on
> CONFIG_X86_MCE ... having the code for the __mcsafe_copy()
> linked into the kernel won't do you any good without a machine
> check handler that jumps to the fixup code.
>
> So I think you have to select X86_MCE as well (or Kconfig needs
> to be taught to do it automatically ... but I have a nagging feeling
> that this is known behavior).
>

I don't want to force it on, otherwise we might as well remove the
ability to configure it.  Instead I have this:

config BLK_DEV_PMEM
       select MCE_KERNEL_RECOVERY if X86_MCE && X86_64

...that way if you turn on X86_MCE and BLK_DEV_PMEM you get
MCE_KERNEL_RECOVERY by default.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-06 12:33   ` Borislav Petkov
@ 2016-01-06 17:35     ` Luck, Tony
  2016-01-06 17:48       ` Linus Torvalds
  2016-01-06 17:54     ` Andy Lutomirski
  1 sibling, 1 reply; 27+ messages in thread
From: Luck, Tony @ 2016-01-06 17:35 UTC (permalink / raw)
  To: Borislav Petkov, torvalds, hpa, mingo, tglx
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Jan 06, 2016 at 01:33:46PM +0100, Borislav Petkov wrote:
> On Wed, Dec 30, 2015 at 09:59:29AM -0800, Tony Luck wrote:
> > Starting with a patch from Andy Lutomirski <luto@amacapital.net>
> > that used linker relocation trickery to free up a couple of bits
> > in the "fixup" field of the exception table (and generalized the
> > uaccess_err hack to use one of the classes).
> 
> So I still think that the other idea Andy gave with putting the handler
> in the exception table is much cleaner and straightforward.
> 
> Here's a totally untested patch which at least builds here. I think this
> approach is much more extensible and simpler for the price of a couple
> of KBs of __ex_table size.

On my config (based on an enterprise distro config) the vmlinux
ex_table would grow from 8k to 12k.  There are a handful of modules
that would also see 50% expansion (the size is the first hex field after
the __ex_table):

arch/x86/kernel/test_nx.ko 4 __ex_table 00000008 0000000000000000 0000000000000000 000002a8 2**3
arch/x86/kvm/kvm-intel.ko 7 __ex_table 000009b8 0000000000000000 0000000000000000 0001ff60 2**3
arch/x86/kvm/kvm-amd.ko 6 __ex_table 00000070 0000000000000000 0000000000000000 0000a1e0 2**3
arch/x86/kvm/kvm.ko 23 __ex_table 00000080 0000000000000000 0000000000000000 00059940 2**3
drivers/char/ipmi/ipmi_devintf.ko 9 __ex_table 00000098 0000000000000000 0000000000000000 000017f0 2**3
drivers/gpu/drm/i915/i915.ko 20 __ex_table 00000060 0000000000000000 0000000000000000 000f8f10 2**3
drivers/gpu/drm/radeon/radeon.ko 19 __ex_table 000001d0 0000000000000000 0000000000000000 00145c88 2**3
drivers/gpu/drm/drm.ko 24 __ex_table 00000400 0000000000000000 0000000000000000 0003b688 2**3
drivers/media/usb/uvc/uvcvideo.ko 15 __ex_table 00000030 0000000000000000 0000000000000000 0000f100 2**3
drivers/scsi/sg.ko 12 __ex_table 00000060 0000000000000000 0000000000000000 00006548 2**3
drivers/vhost/vhost.ko 12 __ex_table 00000068 0000000000000000 0000000000000000 00003360 2**3
drivers/xen/xen-privcmd.ko 11 __ex_table 00000010 0000000000000000 0000000000000000 00000f70 2**3
net/sunrpc/sunrpc.ko 24 __ex_table 00000008 0000000000000000 0000000000000000 000321b0 2**3
sound/core/snd-pcm.ko 19 __ex_table 00000040 0000000000000000 0000000000000000 000108f0 2**3

Linus, Peter, Ingo, Thomas: Can we head this direction? The code is cleaner
and more flexible. Or should we stick with Andy's clever way to squeeze a
couple of "class" bits into the fixup field of the exception table?

-Tony

> 
> ---
> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> index 189679aba703..43b509c88b13 100644
> --- a/arch/x86/include/asm/asm.h
> +++ b/arch/x86/include/asm/asm.h
> @@ -44,18 +44,20 @@
>  
>  /* Exception table entry */
>  #ifdef __ASSEMBLY__
> -# define _ASM_EXTABLE(from,to)					\
> +# define _ASM_EXTABLE(from,to)				\
>  	.pushsection "__ex_table","a" ;				\
>  	.balign 8 ;						\
>  	.long (from) - . ;					\
>  	.long (to) - . ;					\
> +	.long 0 - .;						\
>  	.popsection
>  
>  # define _ASM_EXTABLE_EX(from,to)				\
>  	.pushsection "__ex_table","a" ;				\
>  	.balign 8 ;						\
>  	.long (from) - . ;					\
> -	.long (to) - . + 0x7ffffff0 ;				\
> +	.long (to) - . ;					\
> +	.long ex_handler_ext - . ;				\
>  	.popsection
>  
>  # define _ASM_NOKPROBE(entry)					\
> @@ -94,13 +96,14 @@
>  	" .balign 8\n"						\
>  	" .long (" #from ") - .\n"				\
>  	" .long (" #to ") - .\n"				\
> +	" .long 0 - .\n"					\
>  	" .popsection\n"
>  
>  # define _ASM_EXTABLE_EX(from,to)				\
>  	" .pushsection \"__ex_table\",\"a\"\n"			\
>  	" .balign 8\n"						\
>  	" .long (" #from ") - .\n"				\
> -	" .long (" #to ") - . + 0x7ffffff0\n"			\
> +	" .long ex_handler_ext - .\n"				\
>  	" .popsection\n"
>  /* For C file, we already have NOKPROBE_SYMBOL macro */
>  #endif
> diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
> index 09b1b0ab94b7..22b49c3b311a 100644
> --- a/arch/x86/include/asm/uaccess.h
> +++ b/arch/x86/include/asm/uaccess.h
> @@ -104,13 +104,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
>   */
>  
>  struct exception_table_entry {
> -	int insn, fixup;
> +	int insn, fixup, handler;
>  };
>  /* This is not the generic standard exception_table_entry format */
>  #define ARCH_HAS_SORT_EXTABLE
>  #define ARCH_HAS_SEARCH_EXTABLE
>  
> -extern int fixup_exception(struct pt_regs *regs);
> +extern int fixup_exception(struct pt_regs *regs, int trapnr);
>  extern int early_fixup_exception(unsigned long *ip);
>  
>  /*
> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
> index 1deffe6cc873..0f05deeff5ce 100644
> --- a/arch/x86/kernel/kprobes/core.c
> +++ b/arch/x86/kernel/kprobes/core.c
> @@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
>  		 * In case the user-specified fault handler returned
>  		 * zero, try to fix up.
>  		 */
> -		if (fixup_exception(regs))
> +		if (fixup_exception(regs, trapnr))
>  			return 1;
>  
>  		/*
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index ade185a46b1d..211c11c7bba4 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
>  	}
>  
>  	if (!user_mode(regs)) {
> -		if (!fixup_exception(regs)) {
> +		if (!fixup_exception(regs, trapnr)) {
>  			tsk->thread.error_code = error_code;
>  			tsk->thread.trap_nr = trapnr;
>  			die(str, regs, error_code);
> @@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
>  
>  	tsk = current;
>  	if (!user_mode(regs)) {
> -		if (fixup_exception(regs))
> +		if (fixup_exception(regs, X86_TRAP_GP))
>  			return;
>  
>  		tsk->thread.error_code = error_code;
> @@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
>  	conditional_sti(regs);
>  
>  	if (!user_mode(regs)) {
> -		if (!fixup_exception(regs)) {
> +		if (!fixup_exception(regs, trapnr)) {
>  			task->thread.error_code = error_code;
>  			task->thread.trap_nr = trapnr;
>  			die(str, regs, error_code);
> diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
> index 903ec1e9c326..191f4b7d1d2d 100644
> --- a/arch/x86/mm/extable.c
> +++ b/arch/x86/mm/extable.c
> @@ -3,6 +3,8 @@
>  #include <linux/sort.h>
>  #include <asm/uaccess.h>
>  
> +typedef int (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int);
> +
>  static inline unsigned long
>  ex_insn_addr(const struct exception_table_entry *x)
>  {
> @@ -14,10 +16,39 @@ ex_fixup_addr(const struct exception_table_entry *x)
>  	return (unsigned long)&x->fixup + x->fixup;
>  }
>  
> -int fixup_exception(struct pt_regs *regs)
> +inline ex_handler_t ex_fixup_handler(const struct exception_table_entry *x)
> +{
> +	return (ex_handler_t)&x->handler + x->handler;
> +}
> +
> +int ex_handler_default(const struct exception_table_entry *fixup,
> +		       struct pt_regs *regs, int trapnr)
> +{
> +	regs->ip = ex_fixup_addr(fixup);
> +	return 1;
> +}
> +
> +int ex_handler_fault(const struct exception_table_entry *fixup,
> +		     struct pt_regs *regs, int trapnr)
> +{
> +	regs->ip = ex_fixup_addr(fixup);
> +	regs->ax = trapnr;
> +	return 1;
> +}
> +int ex_handler_ext(const struct exception_table_entry *fixup,
> +		   struct pt_regs *regs, int trapnr)
>  {
> -	const struct exception_table_entry *fixup;
> +	/* Special hack for uaccess_err */
> +	current_thread_info()->uaccess_err = 1;
> +	regs->ip = ex_fixup_addr(fixup);
> +	return 1;
> +}
> +
> +int fixup_exception(struct pt_regs *regs, int trapnr)
> +{
> +	const struct exception_table_entry *e;
>  	unsigned long new_ip;
> +	ex_handler_t handler;
>  
>  #ifdef CONFIG_PNPBIOS
>  	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
> @@ -33,42 +64,40 @@ int fixup_exception(struct pt_regs *regs)
>  	}
>  #endif
>  
> -	fixup = search_exception_tables(regs->ip);
> -	if (fixup) {
> -		new_ip = ex_fixup_addr(fixup);
> -
> -		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
> -			/* Special hack for uaccess_err */
> -			current_thread_info()->uaccess_err = 1;
> -			new_ip -= 0x7ffffff0;
> -		}
> -		regs->ip = new_ip;
> -		return 1;
> -	}
> +	e = search_exception_tables(regs->ip);
> +	if (!e)
> +		return 0;
> +
> +	new_ip  = ex_fixup_addr(e);
> +	handler = ex_fixup_handler(e);
> +
> +	if (!handler)
> +		handler = ex_handler_default;
> +
> +	return handler(e, regs, trapnr);
>  
> -	return 0;
>  }
>  
>  /* Restricted version used during very early boot */
>  int __init early_fixup_exception(unsigned long *ip)
>  {
> -	const struct exception_table_entry *fixup;
> +	const struct exception_table_entry *e;
>  	unsigned long new_ip;
> +	ex_handler_t handler;
>  
> -	fixup = search_exception_tables(*ip);
> -	if (fixup) {
> -		new_ip = ex_fixup_addr(fixup);
> +	e = search_exception_tables(*ip);
> +	if (!e)
> +		return 0;
>  
> -		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
> -			/* uaccess handling not supported during early boot */
> -			return 0;
> -		}
> +	new_ip  = ex_fixup_addr(e);
> +	handler = ex_fixup_handler(e);
>  
> -		*ip = new_ip;
> -		return 1;
> -	}
> +	/* uaccess handling not supported during early boot */
> +	if (handler && handler == ex_handler_ext)
> +		return 0;
>  
> -	return 0;
> +	*ip = new_ip;
> +	return 1;
>  }
>  
>  /*
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index eef44d9a3f77..495946c3f9dd 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
>  	int sig;
>  
>  	/* Are we prepared to handle this kernel fault? */
> -	if (fixup_exception(regs)) {
> +	if (fixup_exception(regs, X86_TRAP_PF)) {
>  		/*
>  		 * Any interrupt that takes a fault gets the fixup. This makes
>  		 * the below recursive fault logic only apply to a faults from
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-06 17:35     ` Luck, Tony
@ 2016-01-06 17:48       ` Linus Torvalds
  0 siblings, 0 replies; 27+ messages in thread
From: Linus Torvalds @ 2016-01-06 17:48 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Peter Anvin, Ingo Molnar, Thomas Gleixner,
	Andrew Morton, Andy Lutomirski, Dan Williams, elliott,
	Linux Kernel Mailing List, linux-mm, linux-nvdimm@lists.01.org,
	the arch/x86 maintainers

On Wed, Jan 6, 2016 at 9:35 AM, Luck, Tony <tony.luck@intel.com> wrote:
>
> Linus, Peter, Ingo, Thomas: Can we head this direction? The code is cleaner
> and more flexible. Or should we stick with Andy's clever way to squeeze a
> couple of "class" bits into the fixup field of the exception table?

I'd rather not be clever in order to save just a tiny amount of space
in the exception table, which isn't really criticial for anybody.

So I think Borislav's patch has the advantage of being pretty
straightforward and allowing arbitrary fixups, in case we end up
having localized special cases..

               Linus

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-06 12:33   ` Borislav Petkov
  2016-01-06 17:35     ` Luck, Tony
@ 2016-01-06 17:54     ` Andy Lutomirski
  2016-01-06 17:59       ` Borislav Petkov
  1 sibling, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2016-01-06 17:54 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Wed, Jan 6, 2016 at 4:33 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Dec 30, 2015 at 09:59:29AM -0800, Tony Luck wrote:
>> Starting with a patch from Andy Lutomirski <luto@amacapital.net>
>> that used linker relocation trickery to free up a couple of bits
>> in the "fixup" field of the exception table (and generalized the
>> uaccess_err hack to use one of the classes).
>
> So I still think that the other idea Andy gave with putting the handler
> in the exception table is much cleaner and straightforward.
>
> Here's a totally untested patch which at least builds here. I think this
> approach is much more extensible and simpler for the price of a couple
> of KBs of __ex_table size.
>
> ---
> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> index 189679aba703..43b509c88b13 100644
> --- a/arch/x86/include/asm/asm.h
> +++ b/arch/x86/include/asm/asm.h
> @@ -44,18 +44,20 @@
>
>  /* Exception table entry */
>  #ifdef __ASSEMBLY__
> -# define _ASM_EXTABLE(from,to)                                 \
> +# define _ASM_EXTABLE(from,to)                         \
>         .pushsection "__ex_table","a" ;                         \
>         .balign 8 ;                                             \
>         .long (from) - . ;                                      \
>         .long (to) - . ;                                        \
> +       .long 0 - .;                                            \

I assume that this zero is to save the couple of bytes for the
relocation entry on relocatable kernels?

If so, ...

> +inline ex_handler_t ex_fixup_handler(const struct exception_table_entry *x)
> +{
> +       return (ex_handler_t)&x->handler + x->handler;

I would check for zero here, because...

> +       new_ip  = ex_fixup_addr(e);
> +       handler = ex_fixup_handler(e);
> +
> +       if (!handler)
> +               handler = ex_handler_default;

the !handler condition here will never trigger because the offset was
already applied.

Otherwise this looks generally sane.

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-06 17:54     ` Andy Lutomirski
@ 2016-01-06 17:59       ` Borislav Petkov
  2016-01-06 18:07         ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Borislav Petkov @ 2016-01-06 17:59 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Wed, Jan 06, 2016 at 09:54:19AM -0800, Andy Lutomirski wrote:
> I assume that this zero is to save the couple of bytes for the
> relocation entry on relocatable kernels?

I didn't want to touch all _ASM_EXTABLE() macro invocations by adding a
third param @handler which is redundant as we know which it is.

> > +       new_ip  = ex_fixup_addr(e);
> > +       handler = ex_fixup_handler(e);
> > +
> > +       if (!handler)
> > +               handler = ex_handler_default;
> 
> the !handler condition here will never trigger because the offset was
> already applied.

Actually, if I do "0 - .", that would overflow the int because current
location is virtual address and that's 64-bit. Or would gas simply
truncate it? Lemme check...

Anyway, what we should do instead is simply

	.long 0

to denote that the @handler is implicit.

Right?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-06 17:59       ` Borislav Petkov
@ 2016-01-06 18:07         ` Andy Lutomirski
  2016-01-06 19:42           ` Borislav Petkov
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2016-01-06 18:07 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Wed, Jan 6, 2016 at 9:59 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Jan 06, 2016 at 09:54:19AM -0800, Andy Lutomirski wrote:
>> I assume that this zero is to save the couple of bytes for the
>> relocation entry on relocatable kernels?
>
> I didn't want to touch all _ASM_EXTABLE() macro invocations by adding a
> third param @handler which is redundant as we know which it is.

I see.  You could shove the .long ex_handler_default - . into the
macro, but that would indeed bloat the kernel image a bit more
(although not the in-memory size of the kernel).

>
>> > +       new_ip  = ex_fixup_addr(e);
>> > +       handler = ex_fixup_handler(e);
>> > +
>> > +       if (!handler)
>> > +               handler = ex_handler_default;
>>
>> the !handler condition here will never trigger because the offset was
>> already applied.
>
> Actually, if I do "0 - .", that would overflow the int because current
> location is virtual address and that's 64-bit. Or would gas simply
> truncate it? Lemme check...
>
> Anyway, what we should do instead is simply
>
>         .long 0
>
> to denote that the @handler is implicit.
>
> Right?

Agreed.  I just think that your current fixup_ex_handler
implementation needs adjustment if you do it that way.

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-06 18:07         ` Andy Lutomirski
@ 2016-01-06 19:42           ` Borislav Petkov
  2016-01-07 12:11             ` Borislav Petkov
  0 siblings, 1 reply; 27+ messages in thread
From: Borislav Petkov @ 2016-01-06 19:42 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Wed, Jan 06, 2016 at 10:07:19AM -0800, Andy Lutomirski wrote:
> Agreed.  I just think that your current fixup_ex_handler
> implementation needs adjustment if you do it that way.

Right, and as you just mentioned on IRC, there's also sortextable.c
which needs adjusting. So I'll go stare at that code first to try to
figure out what exactly is being done there...

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-06 19:42           ` Borislav Petkov
@ 2016-01-07 12:11             ` Borislav Petkov
  2016-01-07 18:22               ` Luck, Tony
                                 ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Borislav Petkov @ 2016-01-07 12:11 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Wed, Jan 06, 2016 at 08:42:22PM +0100, Borislav Petkov wrote:
> On Wed, Jan 06, 2016 at 10:07:19AM -0800, Andy Lutomirski wrote:
> > Agreed.  I just think that your current fixup_ex_handler
> > implementation needs adjustment if you do it that way.
> 
> Right, and as you just mentioned on IRC, there's also sortextable.c
> which needs adjusting. So I'll go stare at that code first to try to
> figure out what exactly is being done there...

Sorry, it took me a while to figure out all that normalization and
denormalization of the offsets sortextable.c does. And with the bigger
table of 3 ints, we need to do the sorting a bit differently. Also, the
alignment of 8 makes gas turn ".long 0" into 8 bytes worth of zeroes so
I had to do ".balign 4". Is that going to be a problem...?

With it, the exception table looks ok to me:

8e9984          35 ae 91 ff 73 d5 ff ff 00 00 00 00
8e9990          3e af 91 ff 71 d5 ff ff 00 00 00 00
8e999c          4e af 91 ff 6c d5 ff ff 00 00 00 00
8e99a8          49 af 91 ff 67 d5 ff ff 00 00 00 00
8e99b4          ab b3 91 ff 62 d5 ff ff 00 00 00 00
8e99c0          b0 b3 91 ff 5d d5 ff ff 00 00 00 00
8e99cc          e7 b3 91 ff 5b d5 ff ff 00 00 00 00
8e99d8          82 b4 91 ff 59 d5 ff ff 00 00 00 00
8e99e4          8e b9 91 ff 8e b9 91 ff d8 41 96 ff
8e99f0          8a b9 91 ff 8a b9 91 ff cc 41 96 ff
8e99fc          86 b9 91 ff 86 b9 91 ff c0 41 96 ff
8e9a08          82 b9 91 ff 82 b9 91 ff b4 41 96 ff
8e9a14          81 b9 91 ff 81 b9 91 ff a8 41 96 ff
8e9a20          7d b9 91 ff 7d b9 91 ff 9c 41 96 ff
...

the last int not 00 is the offset to ex_handler_ext(). Regarding that, I
probably should change _ASM_EXTABLE_EX() to take a @handler argument so
that Tony can pass in the ex_handler_fault() too.

Anyway, here's what I have, it boots fine in a guest.

Btw, it seems I'm coming down with the cold and all that above could be
hallucinations so please double-check me.

Thanks.

---
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..20b638d49f24 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -46,16 +46,18 @@
 #ifdef __ASSEMBLY__
 # define _ASM_EXTABLE(from,to)					\
 	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
+	.balign 4 ;						\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
+	.long 0;						\
 	.popsection
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
+	.balign 4 ;						\
 	.long (from) - . ;					\
-	.long (to) - . + 0x7ffffff0 ;				\
+	.long (to) - . ;					\
+	.long ex_handler_ext - . ;				\
 	.popsection
 
 # define _ASM_NOKPROBE(entry)					\
@@ -91,16 +93,18 @@
 #else
 # define _ASM_EXTABLE(from,to)					\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
+	" .balign 4\n"						\
 	" .long (" #from ") - .\n"				\
 	" .long (" #to ") - .\n"				\
+	" .long 0\n"						\
 	" .popsection\n"
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
+	" .balign 4\n"						\
 	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - . + 0x7ffffff0\n"			\
+	" .long (" #to ") - .\n"				\
+	" .long ex_handler_ext - .\n"				\
 	" .popsection\n"
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 660458af425d..35d67026bfa5 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -104,13 +104,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  */
 
 struct exception_table_entry {
-	int insn, fixup;
+	int insn, fixup, handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..0f05deeff5ce 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, trapnr))
 			return 1;
 
 		/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ade185a46b1d..211c11c7bba4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 	}
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
@@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	tsk = current;
 	if (!user_mode(regs)) {
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, X86_TRAP_GP))
 			return;
 
 		tsk->thread.error_code = error_code;
@@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	conditional_sti(regs);
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..e11d3b32efd4 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -3,6 +3,8 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
+typedef int (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int);
+
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
 {
@@ -14,10 +16,42 @@ ex_fixup_addr(const struct exception_table_entry *x)
 	return (unsigned long)&x->fixup + x->fixup;
 }
 
-int fixup_exception(struct pt_regs *regs)
+int ex_handler_default(const struct exception_table_entry *fixup,
+		       struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+
+int ex_handler_fault(const struct exception_table_entry *fixup,
+		     struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return 1;
+}
+int ex_handler_ext(const struct exception_table_entry *fixup,
+		   struct pt_regs *regs, int trapnr)
+{
+	/* Special hack for uaccess_err */
+	current_thread_info()->uaccess_err = 1;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+
+inline ex_handler_t ex_fixup_handler(const struct exception_table_entry *x)
+{
+	if (!x->handler)
+		return ex_handler_default;
+
+	return (ex_handler_t)&x->handler + x->handler;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
 {
-	const struct exception_table_entry *fixup;
+	const struct exception_table_entry *e;
 	unsigned long new_ip;
+	ex_handler_t handler;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +67,36 @@ int fixup_exception(struct pt_regs *regs)
 	}
 #endif
 
-	fixup = search_exception_tables(regs->ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* Special hack for uaccess_err */
-			current_thread_info()->uaccess_err = 1;
-			new_ip -= 0x7ffffff0;
-		}
-		regs->ip = new_ip;
-		return 1;
-	}
+	e = search_exception_tables(regs->ip);
+	if (!e)
+		return 0;
 
-	return 0;
+	new_ip  = ex_fixup_addr(e);
+	handler = ex_fixup_handler(e);
+
+	return handler(e, regs, trapnr);
 }
 
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
-	const struct exception_table_entry *fixup;
+	const struct exception_table_entry *e;
 	unsigned long new_ip;
+	ex_handler_t handler;
 
-	fixup = search_exception_tables(*ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
+	e = search_exception_tables(*ip);
+	if (!e)
+		return 0;
 
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* uaccess handling not supported during early boot */
-			return 0;
-		}
+	new_ip  = ex_fixup_addr(e);
+	handler = ex_fixup_handler(e);
 
-		*ip = new_ip;
-		return 1;
-	}
+	/* uaccess handling not supported during early boot */
+	if (handler == ex_handler_ext)
+		return 0;
 
-	return 0;
+	*ip = new_ip;
+	return 1;
 }
 
 /*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 546fbca9621d..def6af7aef89 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -670,7 +670,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs, X86_TRAP_PF)) {
 		/*
 		 * Any interrupt that takes a fault gets the fixup. This makes
 		 * the below recursive fault logic only apply to a faults from
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index c2423d913b46..b17b716959a4 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -209,6 +209,33 @@ static int compare_relative_table(const void *a, const void *b)
 	return 0;
 }
 
+static void x86_sort_relative_table(char *extab_image, int image_size)
+{
+	int i;
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) + i, loc);
+		w(r(loc + 1) + i + 4, loc + 1);
+
+		i += sizeof(uint32_t) * 3;
+	}
+
+	qsort(extab_image, image_size / 12, 12, compare_relative_table);
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) - i, loc);
+		w(r(loc + 1) - (i + 4), loc + 1);
+
+		i += sizeof(uint32_t) * 3;
+	}
+}
+
 static void sort_relative_table(char *extab_image, int image_size)
 {
 	int i;
@@ -281,6 +308,9 @@ do_file(char const *const fname)
 		break;
 	case EM_386:
 	case EM_X86_64:
+		custom_sort = x86_sort_relative_table;
+		break;
+
 	case EM_S390:
 		custom_sort = sort_relative_table;
 		break;


-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-07 12:11             ` Borislav Petkov
@ 2016-01-07 18:22               ` Luck, Tony
  2016-01-08  1:45               ` Luck, Tony
  2016-01-08  5:30               ` Luck, Tony
  2 siblings, 0 replies; 27+ messages in thread
From: Luck, Tony @ 2016-01-07 18:22 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Thu, Jan 07, 2016 at 01:11:31PM +0100, Borislav Petkov wrote:
>  #ifdef __ASSEMBLY__
>  # define _ASM_EXTABLE(from,to)					\
>  	.pushsection "__ex_table","a" ;				\
> -	.balign 8 ;						\
> +	.balign 4 ;						\
>  	.long (from) - . ;					\
>  	.long (to) - . ;					\
> +	.long 0;						\

Why not
	.long ex_handler_default - . ;

Then you wouldn't have to special case the zero in the lookup
(and in the sort, which you don't do now, but should)

> @@ -33,42 +67,36 @@ int fixup_exception(struct pt_regs *regs)
>  	}
>  #endif
>  
> -	fixup = search_exception_tables(regs->ip);
> -	if (fixup) {
> -		new_ip = ex_fixup_addr(fixup);
> -
> -		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
> -			/* Special hack for uaccess_err */
> -			current_thread_info()->uaccess_err = 1;
> -			new_ip -= 0x7ffffff0;
> -		}
> -		regs->ip = new_ip;
> -		return 1;
> -	}
> +	e = search_exception_tables(regs->ip);
> +	if (!e)
> +		return 0;
>  
> -	return 0;
> +	new_ip  = ex_fixup_addr(e);

Assigned, but not used - delete the declaration above too.

> +static void x86_sort_relative_table(char *extab_image, int image_size)
> +{
> +	int i;
> +
> +	i = 0;
> +	while (i < image_size) {
> +		uint32_t *loc = (uint32_t *)(extab_image + i);
> +
> +		w(r(loc) + i, loc);
> +		w(r(loc + 1) + i + 4, loc + 1);
Need to twiddle the 'handler' field too (unless it is 0). If you
give up on the magic zero and fill in the offset to ex_handler_default
then I *think* you need:
		w(r(loc + 2) + i + 8, loc + 2);
the special case *might* be:
		if (r(loc + 2))
			w(r(loc + 2) + i + 8, loc + 2);
> +
> +		i += sizeof(uint32_t) * 3;
> +	}
> +
> +	qsort(extab_image, image_size / 12, 12, compare_relative_table);
> +
> +	i = 0;
> +	while (i < image_size) {
> +		uint32_t *loc = (uint32_t *)(extab_image + i);
> +
> +		w(r(loc) - i, loc);
> +		w(r(loc + 1) - (i + 4), loc + 1);
ditto, untwiddle the handler (unless it was zero)
		w(r(loc + 2) - (i + 8), loc + 2);

There is also arch/x86/mm/extable.c:sort_extable() which will
be used on the main kernel exception table if for some reason
the build skipped using scripts/sortextable ... and is always
used for exception tables in modules.  It also needs to know
about the new field (and would be another place to special case
the zero fields for the default handler).

-Tony

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-07 12:11             ` Borislav Petkov
  2016-01-07 18:22               ` Luck, Tony
@ 2016-01-08  1:45               ` Luck, Tony
  2016-01-08 10:37                 ` Borislav Petkov
  2016-01-08  5:30               ` Luck, Tony
  2 siblings, 1 reply; 27+ messages in thread
From: Luck, Tony @ 2016-01-08  1:45 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Thu, Jan 07, 2016 at 01:11:31PM +0100, Borislav Petkov wrote:
> Anyway, here's what I have, it boots fine in a guest.
> 
> Btw, it seems I'm coming down with the cold and all that above could be
> hallucinations so please double-check me.

Hardly any hallucinations ... here's an update with the changes
I mentioned in earlier e-mail.  Boots on actual h/w.


diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..c286db8ddc58 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -46,16 +46,18 @@
 #ifdef __ASSEMBLY__
 # define _ASM_EXTABLE(from,to)					\
 	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
+	.balign 4 ;						\
 	.long (from) - . ;					\
 	.long (to) - . ;					\
+	.long ex_handler_default - . ;				\
 	.popsection
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
+	.balign 4 ;						\
 	.long (from) - . ;					\
-	.long (to) - . + 0x7ffffff0 ;				\
+	.long (to) - . ;					\
+	.long ex_handler_ext - . ;				\
 	.popsection
 
 # define _ASM_NOKPROBE(entry)					\
@@ -91,16 +93,18 @@
 #else
 # define _ASM_EXTABLE(from,to)					\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
+	" .balign 4\n"						\
 	" .long (" #from ") - .\n"				\
 	" .long (" #to ") - .\n"				\
+	" .long ex_handler_default - .\n"			\
 	" .popsection\n"
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
+	" .balign 4\n"						\
 	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - . + 0x7ffffff0\n"			\
+	" .long (" #to ") - .\n"				\
+	" .long ex_handler_ext - .\n"				\
 	" .popsection\n"
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a8df874f3e88..b8f6f7545679 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -104,13 +104,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  */
 
 struct exception_table_entry {
-	int insn, fixup;
+	int insn, fixup, handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..0f05deeff5ce 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, trapnr))
 			return 1;
 
 		/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 346eec73f7db..7f50e8f4a075 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 	}
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
@@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	tsk = current;
 	if (!user_mode(regs)) {
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, X86_TRAP_GP))
 			return;
 
 		tsk->thread.error_code = error_code;
@@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	conditional_sti(regs);
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..01098ad010dd 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -3,6 +3,8 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
+typedef int (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int);
+
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
 {
@@ -14,10 +16,39 @@ ex_fixup_addr(const struct exception_table_entry *x)
 	return (unsigned long)&x->fixup + x->fixup;
 }
 
-int fixup_exception(struct pt_regs *regs)
+int ex_handler_default(const struct exception_table_entry *fixup,
+		       struct pt_regs *regs, int trapnr)
 {
-	const struct exception_table_entry *fixup;
-	unsigned long new_ip;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+EXPORT_SYMBOL(ex_handler_default);
+
+int ex_handler_fault(const struct exception_table_entry *fixup,
+		     struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return 1;
+}
+int ex_handler_ext(const struct exception_table_entry *fixup,
+		   struct pt_regs *regs, int trapnr)
+{
+	/* Special hack for uaccess_err */
+	current_thread_info()->uaccess_err = 1;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+
+inline ex_handler_t ex_fixup_handler(const struct exception_table_entry *x)
+{
+	return (ex_handler_t)&x->handler + x->handler;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+	const struct exception_table_entry *e;
+	ex_handler_t handler;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +64,35 @@ int fixup_exception(struct pt_regs *regs)
 	}
 #endif
 
-	fixup = search_exception_tables(regs->ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* Special hack for uaccess_err */
-			current_thread_info()->uaccess_err = 1;
-			new_ip -= 0x7ffffff0;
-		}
-		regs->ip = new_ip;
-		return 1;
-	}
+	e = search_exception_tables(regs->ip);
+	if (!e)
+		return 0;
 
-	return 0;
+	handler = ex_fixup_handler(e);
+
+	return handler(e, regs, trapnr);
 }
 
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
-	const struct exception_table_entry *fixup;
+	const struct exception_table_entry *e;
 	unsigned long new_ip;
+	ex_handler_t handler;
 
-	fixup = search_exception_tables(*ip);
-	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
+	e = search_exception_tables(*ip);
+	if (!e)
+		return 0;
 
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-			/* uaccess handling not supported during early boot */
-			return 0;
-		}
+	new_ip  = ex_fixup_addr(e);
+	handler = ex_fixup_handler(e);
 
-		*ip = new_ip;
-		return 1;
-	}
+	/* uaccess handling not supported during early boot */
+	if (handler == ex_handler_ext)
+		return 0;
 
-	return 0;
+	*ip = new_ip;
+	return 1;
 }
 
 /*
@@ -133,6 +157,8 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup += i;
 		i += 4;
+		p->handler += i;
+		i += 4;
 	}
 
 	sort(start, finish - start, sizeof(struct exception_table_entry),
@@ -145,6 +171,8 @@ void sort_extable(struct exception_table_entry *start,
 		i += 4;
 		p->fixup -= i;
 		i += 4;
+		p->handler -= i;
+		i += 4;
 	}
 }
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index eef44d9a3f77..495946c3f9dd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs, X86_TRAP_PF)) {
 		/*
 		 * Any interrupt that takes a fault gets the fixup. This makes
 		 * the below recursive fault logic only apply to a faults from
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index c2423d913b46..7b29fb14f870 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -209,6 +209,35 @@ static int compare_relative_table(const void *a, const void *b)
 	return 0;
 }
 
+static void x86_sort_relative_table(char *extab_image, int image_size)
+{
+	int i;
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) + i, loc);
+		w(r(loc + 1) + i + 4, loc + 1);
+		w(r(loc + 2) + i + 8, loc + 2);
+
+		i += sizeof(uint32_t) * 3;
+	}
+
+	qsort(extab_image, image_size / 12, 12, compare_relative_table);
+
+	i = 0;
+	while (i < image_size) {
+		uint32_t *loc = (uint32_t *)(extab_image + i);
+
+		w(r(loc) - i, loc);
+		w(r(loc + 1) - (i + 4), loc + 1);
+		w(r(loc + 2) - (i + 8), loc + 2);
+
+		i += sizeof(uint32_t) * 3;
+	}
+}
+
 static void sort_relative_table(char *extab_image, int image_size)
 {
 	int i;
@@ -281,6 +310,9 @@ do_file(char const *const fname)
 		break;
 	case EM_386:
 	case EM_X86_64:
+		custom_sort = x86_sort_relative_table;
+		break;
+
 	case EM_S390:
 		custom_sort = sort_relative_table;
 		break;

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-07 12:11             ` Borislav Petkov
  2016-01-07 18:22               ` Luck, Tony
  2016-01-08  1:45               ` Luck, Tony
@ 2016-01-08  5:30               ` Luck, Tony
  2016-01-08 10:41                 ` Borislav Petkov
  2 siblings, 1 reply; 27+ messages in thread
From: Luck, Tony @ 2016-01-08  5:30 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

Also need some comment and Documentation/ changes:


diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt
index 32901aa36f0a..ae47b9f64b8a 100644
--- a/Documentation/x86/exception-tables.txt
+++ b/Documentation/x86/exception-tables.txt
@@ -290,3 +290,37 @@ Due to the way that the exception table is built and needs to be ordered,
 only use exceptions for code in the .text section.  Any other section
 will cause the exception table to not be sorted correctly, and the
 exceptions will fail.
+
+Things changed when 64-bit support was added to x86 Linux. Rather than
+double the size of the exception table by expanding the two entries
+from 32-bits to 64 bits, a clever trick was used to store addreesses
+as relative offsets from the table itself. The assembly code changed
+from:
+	.long 1b,3b
+to:
+        .long (from) - .
+        .long (to) - .
+and the C-code that uses these values converts back to absolute addresses
+like this:
+	ex_insn_addr(const struct exception_table_entry *x)
+	{
+		return (unsigned long)&x->insn + x->insn;
+	}
+
+In v4.5 the exception table entry was given a new field "handler".
+This is also 32-bits wide and contains a table entry relative address
+of a handler function that can perform specific operations in addition
+to re-writing the instruction pointer to jump to the fixup location.
+Initially there are three such functions:
+
+1) int ex_handler_default(const struct exception_table_entry *fixup,
+   This is legacy case that just jumps to the fixup code
+2) int ex_handler_fault(const struct exception_table_entry *fixup,
+   This case provides the fault number of the trap that occured at
+   entry->insn. It is used to distinguish page faults from machine
+   check.
+3) int ex_handler_ext(const struct exception_table_entry *fixup,
+   This case is used to for uaccess_err ... we need to set a flag
+   in the task structure. Before the handler functions existed this
+   case was handled by adding a large offset to the fixup to tag
+   it as special.
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index b8f6f7545679..563443870915 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -90,12 +90,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 	likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
- * The exception table consists of pairs of addresses relative to the
+ * The exception table consists of triples of addresses relative to the
  * exception table enty itself: the first is the address of an
- * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * instruction that is allowed to fault, the second is the address
+ * at which the program should continue, the last is the address of
+ * a handler function to deal with the fault referenced by the instruction
+ * in the first field.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f057718d8d15..195ff0144152 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -310,4 +310,3 @@ ENTRY(__mcsafe_copy)
 	_ASM_EXTABLE_FAULT(12b,38b)
 	_ASM_EXTABLE_FAULT(18b,39b)
 	_ASM_EXTABLE_FAULT(21b,40b)
-#endif

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-08  1:45               ` Luck, Tony
@ 2016-01-08 10:37                 ` Borislav Petkov
  2016-01-08 16:29                   ` Luck, Tony
  0 siblings, 1 reply; 27+ messages in thread
From: Borislav Petkov @ 2016-01-08 10:37 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Thu, Jan 07, 2016 at 05:45:26PM -0800, Luck, Tony wrote:
> On Thu, Jan 07, 2016 at 01:11:31PM +0100, Borislav Petkov wrote:
> > Anyway, here's what I have, it boots fine in a guest.
> > 
> > Btw, it seems I'm coming down with the cold and all that above could be
> > hallucinations so please double-check me.
> 
> Hardly any hallucinations ... here's an update with the changes
> I mentioned in earlier e-mail.  Boots on actual h/w.

Cool, thanks for fixing it up. Looks good to me. Feel free to make it
into a proper patch and add it to your series... unless you want me to
do it.

Just one small question below:

> diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
> index 903ec1e9c326..01098ad010dd 100644
> --- a/arch/x86/mm/extable.c
> +++ b/arch/x86/mm/extable.c
> @@ -3,6 +3,8 @@
>  #include <linux/sort.h>
>  #include <asm/uaccess.h>
>  
> +typedef int (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int);
> +
>  static inline unsigned long
>  ex_insn_addr(const struct exception_table_entry *x)
>  {
> @@ -14,10 +16,39 @@ ex_fixup_addr(const struct exception_table_entry *x)
>  	return (unsigned long)&x->fixup + x->fixup;
>  }
>  
> -int fixup_exception(struct pt_regs *regs)
> +int ex_handler_default(const struct exception_table_entry *fixup,
> +		       struct pt_regs *regs, int trapnr)
>  {
> -	const struct exception_table_entry *fixup;
> -	unsigned long new_ip;
> +	regs->ip = ex_fixup_addr(fixup);
> +	return 1;
> +}
> +EXPORT_SYMBOL(ex_handler_default);

Why not EXPORT_SYMBOL_GPL() ?

We do not care about external modules.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-08  5:30               ` Luck, Tony
@ 2016-01-08 10:41                 ` Borislav Petkov
  0 siblings, 0 replies; 27+ messages in thread
From: Borislav Petkov @ 2016-01-08 10:41 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Thu, Jan 07, 2016 at 09:30:29PM -0800, Luck, Tony wrote:
> Also need some comment and Documentation/ changes:
> 
> 
> diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt
> index 32901aa36f0a..ae47b9f64b8a 100644
> --- a/Documentation/x86/exception-tables.txt
> +++ b/Documentation/x86/exception-tables.txt
> @@ -290,3 +290,37 @@ Due to the way that the exception table is built and needs to be ordered,
>  only use exceptions for code in the .text section.  Any other section
>  will cause the exception table to not be sorted correctly, and the
>  exceptions will fail.
> +
> +Things changed when 64-bit support was added to x86 Linux. Rather than
> +double the size of the exception table by expanding the two entries
> +from 32-bits to 64 bits, a clever trick was used to store addreesses
> +as relative offsets from the table itself. The assembly code changed
> +from:
> +	.long 1b,3b
> +to:
> +        .long (from) - .
> +        .long (to) - .
> +and the C-code that uses these values converts back to absolute addresses
> +like this:
> +	ex_insn_addr(const struct exception_table_entry *x)
> +	{
> +		return (unsigned long)&x->insn + x->insn;
> +	}
> +
> +In v4.5 the exception table entry was given a new field "handler".
> +This is also 32-bits wide and contains a table entry relative address
> +of a handler function that can perform specific operations in addition
> +to re-writing the instruction pointer to jump to the fixup location.
> +Initially there are three such functions:
> +
> +1) int ex_handler_default(const struct exception_table_entry *fixup,
> +   This is legacy case that just jumps to the fixup code
> +2) int ex_handler_fault(const struct exception_table_entry *fixup,
> +   This case provides the fault number of the trap that occured at
> +   entry->insn. It is used to distinguish page faults from machine
> +   check.
> +3) int ex_handler_ext(const struct exception_table_entry *fixup,
> +   This case is used to for uaccess_err ... we need to set a flag
> +   in the task structure. Before the handler functions existed this
> +   case was handled by adding a large offset to the fixup to tag
> +   it as special.
> diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
> index b8f6f7545679..563443870915 100644
> --- a/arch/x86/include/asm/uaccess.h
> +++ b/arch/x86/include/asm/uaccess.h
> @@ -90,12 +90,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
>  	likely(!__range_not_ok(addr, size, user_addr_max()))
>  
>  /*
> - * The exception table consists of pairs of addresses relative to the
> + * The exception table consists of triples of addresses relative to the
>   * exception table enty itself: the first is the address of an
> - * instruction that is allowed to fault, and the second is the address
> - * at which the program should continue.  No registers are modified,
> - * so it is entirely up to the continuation code to figure out what to
> - * do.
> + * instruction that is allowed to fault, the second is the address
> + * at which the program should continue, the last is the address of
> + * a handler function to deal with the fault referenced by the instruction
> + * in the first field.
>   *
>   * All the routines below use bits of fixup code that are out of line
>   * with the main instruction path.  This means when everything is well,

Looks good. /me always likes patches adding more sensible documentation:

Acked-by: Borislav Petkov <bp@suse.de>

> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index f057718d8d15..195ff0144152 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -310,4 +310,3 @@ ENTRY(__mcsafe_copy)
>  	_ASM_EXTABLE_FAULT(12b,38b)
>  	_ASM_EXTABLE_FAULT(18b,39b)
>  	_ASM_EXTABLE_FAULT(21b,40b)
> -#endif

This looks like a stray change.

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-08 10:37                 ` Borislav Petkov
@ 2016-01-08 16:29                   ` Luck, Tony
  2016-01-08 17:20                     ` Borislav Petkov
  2016-01-08 22:29                     ` Brian Gerst
  0 siblings, 2 replies; 27+ messages in thread
From: Luck, Tony @ 2016-01-08 16:29 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Williams, Dan J, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

>> +EXPORT_SYMBOL(ex_handler_default);
>
> Why not EXPORT_SYMBOL_GPL() ?
>
> We do not care about external modules.

I thought the guideline was that new features are GPL, but changes
to existing features shouldn't break by adding new GPL requirements.

The point is moot though because  the shared hallucinations wore
off this morning and I realized that having the "handler" be a pointer
to a function can't work. We're storing the 32-bit signed offset from
the extable to the target address. This is fine if the table and the
address are close together. But for modules we have an exception
table wherever vmalloc() loaded the module, and a function back
in the base kernel.

So back to your ".long 0" for the default case.  And if we want to allow
modules to use any of the new handlers, then we can't use
relative function pointers for them either.

So I'm looking at making the new field just a simple integer and using
it to index an array of function pointers (like in v7).

Unless someone has a better idea?

-Tony

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-08 16:29                   ` Luck, Tony
@ 2016-01-08 17:20                     ` Borislav Petkov
  2016-01-08 22:29                     ` Brian Gerst
  1 sibling, 0 replies; 27+ messages in thread
From: Borislav Petkov @ 2016-01-08 17:20 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Williams, Dan J, Robert, linux-kernel, linux-mm, linux-nvdimm,
	X86 ML

On Fri, Jan 08, 2016 at 04:29:49PM +0000, Luck, Tony wrote:
> I thought the guideline was that new features are GPL, but changes
> to existing features shouldn't break by adding new GPL requirements.
> 
> The point is moot though because  the shared hallucinations wore
> off this morning and I realized that having the "handler" be a pointer
> to a function can't work. We're storing the 32-bit signed offset from
> the extable to the target address. This is fine if the table and the
> address are close together. But for modules we have an exception
> table wherever vmalloc() loaded the module, and a function back
> in the base kernel.

Whoops, true story.

> So back to your ".long 0" for the default case.  And if we want to allow
> modules to use any of the new handlers, then we can't use
> relative function pointers for them either.
> 
> So I'm looking at making the new field just a simple integer and using
> it to index an array of function pointers (like in v7).

Right, that sounds good too. I guess we can even split the integer into

[0 ... 7][8 ... 31]

where slice [0:7] is an index into the handlers array and the remaining
unused 24-bits could be used for other stuff later. Normal addition as a
way to OR values should work.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v7 1/3] x86: Add classes to exception tables
  2016-01-08 16:29                   ` Luck, Tony
  2016-01-08 17:20                     ` Borislav Petkov
@ 2016-01-08 22:29                     ` Brian Gerst
  1 sibling, 0 replies; 27+ messages in thread
From: Brian Gerst @ 2016-01-08 22:29 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Andy Lutomirski, Ingo Molnar, Andrew Morton,
	Andy Lutomirski, Williams, Dan J, Robert, linux-kernel, linux-mm,
	linux-nvdimm, X86 ML

On Fri, Jan 8, 2016 at 11:29 AM, Luck, Tony <tony.luck@intel.com> wrote:
>>> +EXPORT_SYMBOL(ex_handler_default);
>>
>> Why not EXPORT_SYMBOL_GPL() ?
>>
>> We do not care about external modules.
>
> I thought the guideline was that new features are GPL, but changes
> to existing features shouldn't break by adding new GPL requirements.
>
> The point is moot though because  the shared hallucinations wore
> off this morning and I realized that having the "handler" be a pointer
> to a function can't work. We're storing the 32-bit signed offset from
> the extable to the target address. This is fine if the table and the
> address are close together. But for modules we have an exception
> table wherever vmalloc() loaded the module, and a function back
> in the base kernel.
>
> So back to your ".long 0" for the default case.  And if we want to allow
> modules to use any of the new handlers, then we can't use
> relative function pointers for them either.
>
> So I'm looking at making the new field just a simple integer and using
> it to index an array of function pointers (like in v7).
>
> Unless someone has a better idea?

Aren't modules loaded in the top 2GB of address space like the main
kernel?  Otherwise rip-relative addressing wouldn't work and modules
would have to be compiled as PIC.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2016-01-08 22:29 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-05  0:05 [PATCH v7 0/3] Machine check recovery when kernel accesses poison Tony Luck
2015-12-30 17:59 ` [PATCH v7 1/3] x86: Add classes to exception tables Tony Luck
2016-01-06 12:33   ` Borislav Petkov
2016-01-06 17:35     ` Luck, Tony
2016-01-06 17:48       ` Linus Torvalds
2016-01-06 17:54     ` Andy Lutomirski
2016-01-06 17:59       ` Borislav Petkov
2016-01-06 18:07         ` Andy Lutomirski
2016-01-06 19:42           ` Borislav Petkov
2016-01-07 12:11             ` Borislav Petkov
2016-01-07 18:22               ` Luck, Tony
2016-01-08  1:45               ` Luck, Tony
2016-01-08 10:37                 ` Borislav Petkov
2016-01-08 16:29                   ` Luck, Tony
2016-01-08 17:20                     ` Borislav Petkov
2016-01-08 22:29                     ` Brian Gerst
2016-01-08  5:30               ` Luck, Tony
2016-01-08 10:41                 ` Borislav Petkov
2016-01-06 12:36   ` Borislav Petkov
2015-12-31 19:40 ` [PATCH v7 2/3] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
2015-12-31 19:43 ` [PATCH v7 3/3] x86, mce: Add __mcsafe_copy() Tony Luck
2016-01-06  4:42   ` Dan Williams
2016-01-06  7:06     ` Luck, Tony
2016-01-06  7:11       ` Dan Williams
2016-01-06 16:37         ` Dan Williams
2016-01-06 16:57           ` Luck, Tony
2016-01-06 17:05             ` Dan Williams

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).