* [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04  1:02 ` Tony Luck
@ 2015-12-30 17:59   ` Andy Lutomirski
  -1 siblings, 0 replies; 44+ messages in thread
From: Andy Lutomirski @ 2015-12-30 17:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

This adds two bits of fixup class information to a fixup entry,
generalizing the uaccess_err hack currently in place.

Forward-ported-from-3.9-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/include/asm/asm.h | 70 ++++++++++++++++++++++++++++++----------------
 arch/x86/mm/extable.c      | 21 ++++++++------
 2 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 189679aba703..b64121ffb2da 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -43,19 +43,47 @@
 #define _ASM_DI		__ASM_REG(di)
 
 /* Exception table entry */
-#ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)					\
-	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
-	.long (from) - . ;					\
-	.long (to) - . ;					\
-	.popsection
 
-# define _ASM_EXTABLE_EX(from,to)				\
-	.pushsection "__ex_table","a" ;				\
-	.balign 8 ;						\
-	.long (from) - . ;					\
-	.long (to) - . + 0x7ffffff0 ;				\
+/*
+ * An exception table entry is 64 bits.  The first 32 bits are the offset
+ * from that entry to the potentially faulting instruction.  sortextable
+ * relies on that exact encoding.  The second 32 bits encode the fault
+ * handler address.
+ *
+ * We want to stick two extra bits of handler class into the fault handler
+ * address.  All of these are generated by relocations, so we can only
+ * rely on addition.  We therefore emit:
+ *
+ * (target - here) + (class) + 0x20000000
+ *
+ * This has the property that the two high bits are the class and the
+ * rest is easy to decode.
+ */
+
+/* There are two bits of extable entry class, added to a signed offset. */
+#define _EXTABLE_CLASS_DEFAULT	0		/* standard uaccess fixup */
+#define _EXTABLE_CLASS_EX	0x80000000	/* uaccess + set uaccess_err */
+
+/*
+ * The biases are the class constants + 0x20000000, as signed integers.
+ * This can't use ordinary arithmetic -- the assembler isn't that smart.
+ */
+#define _EXTABLE_BIAS_DEFAULT	0x20000000
+#define _EXTABLE_BIAS_EX	0x20000000 - 0x80000000
+
+#define _ASM_EXTABLE(from,to)						\
+	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_DEFAULT)
+
+#define _ASM_EXTABLE_EX(from,to)					\
+	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_EX)
+
+#ifdef __ASSEMBLY__
+# define _EXPAND_EXTABLE_BIAS(x) x
+# define _ASM_EXTABLE_CLASS(from,to,bias)				\
+	.pushsection "__ex_table","a" ;					\
+	.balign 8 ;							\
+	.long (from) - . ;						\
+	.long (to) - . + _EXPAND_EXTABLE_BIAS(bias) ;			\
 	.popsection
 
 # define _ASM_NOKPROBE(entry)					\
@@ -89,18 +117,12 @@
 	.endm
 
 #else
-# define _ASM_EXTABLE(from,to)					\
-	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
-	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - .\n"				\
-	" .popsection\n"
-
-# define _ASM_EXTABLE_EX(from,to)				\
-	" .pushsection \"__ex_table\",\"a\"\n"			\
-	" .balign 8\n"						\
-	" .long (" #from ") - .\n"				\
-	" .long (" #to ") - . + 0x7ffffff0\n"			\
+# define _EXPAND_EXTABLE_BIAS(x) #x
+# define _ASM_EXTABLE_CLASS(from,to,bias)				\
+	" .pushsection \"__ex_table\",\"a\"\n"				\
+	" .balign 8\n"							\
+	" .long (" #from ") - .\n"					\
+	" .long (" #to ") - . + " _EXPAND_EXTABLE_BIAS(bias) "\n"	\
 	" .popsection\n"
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 903ec1e9c326..95e2ede71206 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -8,16 +8,24 @@ ex_insn_addr(const struct exception_table_entry *x)
 {
 	return (unsigned long)&x->insn + x->insn;
 }
+static inline unsigned int
+ex_class(const struct exception_table_entry *x)
+{
+	return (unsigned int)x->fixup & 0xC0000000;
+}
+
 static inline unsigned long
 ex_fixup_addr(const struct exception_table_entry *x)
 {
-	return (unsigned long)&x->fixup + x->fixup;
+	long offset = (long)((u32)x->fixup & 0x3fffffff) - (long)0x20000000;
+	return (unsigned long)&x->fixup + offset;
 }
 
 int fixup_exception(struct pt_regs *regs)
 {
 	const struct exception_table_entry *fixup;
 	unsigned long new_ip;
+	unsigned int class;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -35,12 +43,12 @@ int fixup_exception(struct pt_regs *regs)
 
 	fixup = search_exception_tables(regs->ip);
 	if (fixup) {
+		class = ex_class(fixup);
 		new_ip = ex_fixup_addr(fixup);
 
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+		if (class == _EXTABLE_CLASS_EX) {
 			/* Special hack for uaccess_err */
 			current_thread_info()->uaccess_err = 1;
-			new_ip -= 0x7ffffff0;
 		}
 		regs->ip = new_ip;
 		return 1;
@@ -53,18 +61,15 @@ int fixup_exception(struct pt_regs *regs)
 int __init early_fixup_exception(unsigned long *ip)
 {
 	const struct exception_table_entry *fixup;
-	unsigned long new_ip;
 
 	fixup = search_exception_tables(*ip);
 	if (fixup) {
-		new_ip = ex_fixup_addr(fixup);
-
-		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+		if (ex_class(fixup) == _EXTABLE_CLASS_EX) {
 			/* uaccess handling not supported during early boot */
 			return 0;
 		}
 
-		*ip = new_ip;
+		*ip = ex_fixup_addr(fixup);
 		return 1;
 	}
 
-- 
2.1.4
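
To make the encoding above concrete, here is a small standalone C sketch
(not part of the patch) of how the biased 32-bit fixup word is built and
then taken apart again. The masks and the 0x20000000 bias come from the
comment block in asm.h above; the helper names are illustrative only, and
the scheme only works while the fixup target stays within roughly ±512 MB
of the table entry, which the linker layout guarantees.

#include <stdint.h>
#include <stdio.h>

#define EXTABLE_BIAS	0x20000000u
#define CLASS_DEFAULT	0x00000000u	/* standard uaccess fixup          */
#define CLASS_EX	0x80000000u	/* uaccess fixup + set uaccess_err */

/* What the assembler effectively stores in the 32-bit fixup word. */
static uint32_t encode_fixup(int32_t rel_offset, uint32_t class_bits)
{
	return (uint32_t)rel_offset + class_bits + EXTABLE_BIAS;
}

/* Mirrors ex_class() in arch/x86/mm/extable.c: the top two bits are the class. */
static uint32_t decode_class(uint32_t fixup)
{
	return fixup & 0xC0000000u;
}

/* Mirrors ex_fixup_addr(): strip the class bits, then undo the bias. */
static int32_t decode_offset(uint32_t fixup)
{
	return (int32_t)(fixup & 0x3FFFFFFFu) - (int32_t)EXTABLE_BIAS;
}

int main(void)
{
	int32_t rel = -4096;	/* fixup code 4 KB before the table entry */
	uint32_t word = encode_fixup(rel, CLASS_EX);

	printf("class=%#010x offset=%d\n", decode_class(word), decode_offset(word));
	return 0;	/* prints class=0x80000000 offset=-4096 */
}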


* [PATCH v6 2/4] x86: Cleanup and add a new exception class
  2016-01-04  1:02 ` Tony Luck
@ 2015-12-30 18:56   ` Tony Luck
  -1 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2015-12-30 18:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Make per-class functions for exception table fixup. Add #defines
and general prettiness to make it clear how all the parts tie
together.

Add a new class that fills %rax with the fault number of the exception.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/asm.h     | 24 ++++++++++-----
 arch/x86/include/asm/uaccess.h | 17 ++++++++---
 arch/x86/kernel/kprobes/core.c |  2 +-
 arch/x86/kernel/traps.c        |  6 ++--
 arch/x86/mm/extable.c          | 67 +++++++++++++++++++++++++++---------------
 arch/x86/mm/fault.c            |  2 +-
 6 files changed, 79 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b64121ffb2da..1888278d0559 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -44,6 +44,7 @@
 
 /* Exception table entry */
 
+#define	_EXTABLE_BIAS	0x20000000
 /*
  * An exception table entry is 64 bits.  The first 32 bits are the offset
  * from that entry to the potentially faulting instruction.  sortextable
@@ -54,26 +55,35 @@
  * address.  All of these are generated by relocations, so we can only
  * rely on addition.  We therefore emit:
  *
- * (target - here) + (class) + 0x20000000
+ * (target - here) + (class) + _EXTABLE_BIAS
  *
  * This has the property that the two high bits are the class and the
  * rest is easy to decode.
  */
 
-/* There are two bits of extable entry class, added to a signed offset. */
-#define _EXTABLE_CLASS_DEFAULT	0		/* standard uaccess fixup */
-#define _EXTABLE_CLASS_EX	0x80000000	/* uaccess + set uaccess_err */
+/*
+ * There are two bits of extable entry class giving four classes
+ */
+#define EXTABLE_CLASS_DEFAULT	0	/* standard uaccess fixup */
+#define EXTABLE_CLASS_FAULT	1	/* provide fault number as well as fixup */
+#define EXTABLE_CLASS_EX	2	/* uaccess + set uaccess_err */
+#define EXTABLE_CLASS_UNUSED	3	/* available for something else */
 
 /*
- * The biases are the class constants + 0x20000000, as signed integers.
+ * The biases are the class constants + _EXTABLE_BIAS, as signed integers.
  * This can't use ordinary arithmetic -- the assembler isn't that smart.
  */
-#define _EXTABLE_BIAS_DEFAULT	0x20000000
-#define _EXTABLE_BIAS_EX	0x20000000 - 0x80000000
+#define _EXTABLE_BIAS_DEFAULT	_EXTABLE_BIAS
+#define _EXTABLE_BIAS_FAULT	_EXTABLE_BIAS + 0x40000000
+#define _EXTABLE_BIAS_EX	_EXTABLE_BIAS - 0x80000000
+#define _EXTABLE_BIAS_UNUSED	_EXTABLE_BIAS - 0x40000000
 
 #define _ASM_EXTABLE(from,to)						\
 	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_DEFAULT)
 
+#define _ASM_EXTABLE_FAULT(from,to)					\
+	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_FAULT)
+
 #define _ASM_EXTABLE_EX(from,to)					\
 	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_EX)
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a8df874f3e88..b023300cd6f0 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -93,9 +93,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  * The exception table consists of pairs of addresses relative to the
  * exception table enty itself: the first is the address of an
  * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * at which the program should continue.  The exception table is linked
+ * soon after the fixup section, so we don't need a full 32-bit offset
+ * for the fixup. We steal the two upper bits so we can tag exception
+ * table entries with different classes. In the default class no registers
+ * are modified, so it is entirely up to the continuation code to figure
+ * out what to do.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -110,7 +113,13 @@ struct exception_table_entry {
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+static inline unsigned int
+ex_class(const struct exception_table_entry *x)
+{
+	return (u32)x->fixup >> 30;
+}
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 1deffe6cc873..0f05deeff5ce 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, trapnr))
 			return 1;
 
 		/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 346eec73f7db..df25081e5970 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 	}
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, trapnr)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
@@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	tsk = current;
 	if (!user_mode(regs)) {
-		if (fixup_exception(regs))
+		if (fixup_exception(regs, X86_TRAP_GP))
 			return;
 
 		tsk->thread.error_code = error_code;
@@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	conditional_sti(regs);
 
 	if (!user_mode(regs)) {
-		if (!fixup_exception(regs)) {
+		if (!fixup_exception(regs, X86_TRAP_DE)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 95e2ede71206..7a592ec193d5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -8,24 +8,54 @@ ex_insn_addr(const struct exception_table_entry *x)
 {
 	return (unsigned long)&x->insn + x->insn;
 }
-static inline unsigned int
-ex_class(const struct exception_table_entry *x)
-{
-	return (unsigned int)x->fixup & 0xC0000000;
-}
-
 static inline unsigned long
 ex_fixup_addr(const struct exception_table_entry *x)
 {
-	long offset = (long)((u32)x->fixup & 0x3fffffff) - (long)0x20000000;
+	long offset = (long)((u32)x->fixup & 0x3fffffff) - (long)_EXTABLE_BIAS;
 	return (unsigned long)&x->fixup + offset;
 }
 
-int fixup_exception(struct pt_regs *regs)
+/* Fixup functions for each exception class */
+static int fix_class_default(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+static int fix_class_fault(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = trapnr;
+	return 1;
+}
+static int fix_class_ex(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	/* Special hack for uaccess_err */
+	current_thread_info()->uaccess_err = 1;
+	regs->ip = ex_fixup_addr(fixup);
+	return 1;
+}
+static int fix_class_unused(const struct exception_table_entry *fixup,
+		      struct pt_regs *regs, int trapnr)
+{
+	/* can't happen unless exception table was corrupted */
+	BUG_ON(1);
+	return 0;
+}
+
+static int (*allclasses[])(const struct exception_table_entry *,
+			   struct pt_regs *, int) = {
+	[EXTABLE_CLASS_DEFAULT] = fix_class_default,
+	[EXTABLE_CLASS_FAULT] = fix_class_fault,
+	[EXTABLE_CLASS_EX] = fix_class_ex,
+	[EXTABLE_CLASS_UNUSED] = fix_class_unused
+};
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
 {
 	const struct exception_table_entry *fixup;
-	unsigned long new_ip;
-	unsigned int class;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -42,17 +72,8 @@ int fixup_exception(struct pt_regs *regs)
 #endif
 
 	fixup = search_exception_tables(regs->ip);
-	if (fixup) {
-		class = ex_class(fixup);
-		new_ip = ex_fixup_addr(fixup);
-
-		if (class == _EXTABLE_CLASS_EX) {
-			/* Special hack for uaccess_err */
-			current_thread_info()->uaccess_err = 1;
-		}
-		regs->ip = new_ip;
-		return 1;
-	}
+	if (fixup)
+		return allclasses[ex_class(fixup)](fixup, regs, trapnr);
 
 	return 0;
 }
@@ -64,8 +85,8 @@ int __init early_fixup_exception(unsigned long *ip)
 
 	fixup = search_exception_tables(*ip);
 	if (fixup) {
-		if (ex_class(fixup) == _EXTABLE_CLASS_EX) {
-			/* uaccess handling not supported during early boot */
+		if (ex_class(fixup)) {
+			/* special handling not supported during early boot */
 			return 0;
 		}
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index eef44d9a3f77..495946c3f9dd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	int sig;
 
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs, X86_TRAP_PF)) {
 		/*
 		 * Any interrupt that takes a fault gets the fixup. This makes
 		 * the below recursive fault logic only apply to a faults from
-- 
2.1.4
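
As a usage illustration only (this helper is not in the series), the new
FAULT class could be consumed from C roughly as follows. The sketch assumes
the _ASM_EXTABLE_FAULT macro from the hunk above and relies on
fix_class_fault() loading the trap number into %rax before resuming at the
fixup label; the function name probe_read_u64 is hypothetical.

#include <linux/types.h>
#include <asm/asm.h>

/*
 * Hypothetical sketch: read one 64-bit word that may take a fault.
 * Returns 0 on success, otherwise the trap number (e.g. X86_TRAP_PF or
 * X86_TRAP_MC) that the FAULT-class fixup placed in %rax.
 */
static long probe_read_u64(const u64 *src, u64 *val)
{
	long trapnr = 0;	/* 0 = no fault (a plain load can't raise #DE) */
	u64 tmp = 0;

	asm volatile("1:	movq (%[src]), %[tmp]\n"
		     "2:\n"
		     "	.section .fixup,\"ax\"\n"
		     "3:	movq %%rax, %[trap]\n"	/* trapnr from fix_class_fault() */
		     "	jmp 2b\n"
		     "	.previous\n"
		     _ASM_EXTABLE_FAULT(1b, 3b)
		     : [tmp] "+r" (tmp), [trap] "+r" (trapnr)
		     : [src] "r" (src)
		     : "rax", "memory");

	if (!trapnr)
		*val = tmp;
	return trapnr;
}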


* [PATCH v6 3/4] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries
  2016-01-04  1:02 ` Tony Luck
@ 2015-12-31 19:40   ` Tony Luck
  -1 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2015-12-31 19:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Extend the severity checking code to add a new context, IN_KERNEL_RECOV,
which is used to indicate that the machine check was triggered by kernel
code that has an EXTABLE_CLASS_FAULT fixup entry.

Major rework of the tail code in do_machine_check() to make all this
readable/maintainable. One functional change: tolerant=3 no longer stops
recovery actions; it now only skips sending SIGBUS to the current
process.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 32 +++++++++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          | 71 ++++++++++++++++---------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 9c682c222071..b6eee8e9b2a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -13,7 +13,9 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/debugfs.h>
+#include <linux/module.h>
 #include <asm/mce.h>
+#include <asm/uaccess.h>
 
 #include "mce-internal.h"
 
@@ -29,7 +31,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +50,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL		.context = IN_KERNEL
 #define  USER		.context = IN_USER
+#define  KERNEL_RECOV	.context = IN_KERNEL_RECOV
 #define  SER		.ser = SER_REQUIRED
 #define  NOSER		.ser = NO_SER
 #define  EXCP		.excp = EXCP_CONTEXT
@@ -87,6 +90,10 @@ static struct severity {
 		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
 		),
 	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
 		DEFERRED, "Deferred error",
 		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
 		),
@@ -123,6 +130,11 @@ static struct severity {
 		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
 		),
 	MCESEV(
+		AR, "Action required: data load in error recoverable area of kernel",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		KERNEL_RECOV
+		),
+	MCESEV(
 		AR, "Action required: data load error in a user process",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
@@ -170,6 +182,18 @@ static struct severity {
 		)	/* always matches. keep at end */
 };
 
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+static inline bool mce_in_kernel_recov(unsigned long addr)
+{
+	const struct exception_table_entry *fixup;
+
+	fixup = search_exception_tables(addr);
+
+	return fixup && ex_class(fixup) == EXTABLE_CLASS_FAULT;
+}
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -183,7 +207,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+	if ((m->cs & 3) == 3)
+		return IN_USER;
+	if (mc_recoverable(m->mcgstatus) && mce_in_kernel_recov(m->ip))
+		return IN_KERNEL_RECOV;
+	return IN_KERNEL;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9d014b82a124..d69a69b38c54 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
 #include <linux/nmi.h>
@@ -958,6 +959,20 @@ static void mce_clear_state(unsigned long *toclear)
 	}
 }
 
+static int do_memory_failure(struct mce *m)
+{
+	int flags = MF_ACTION_REQUIRED;
+	int ret;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+	if (!(m->mcgstatus & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+	if (ret)
+		pr_err("Memory error not recovered");
+	return ret;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -995,8 +1010,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
-	u64 recover_paddr = ~0ull;
-	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
 	ist_enter(regs);
@@ -1123,22 +1136,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	}
 
 	/*
-	 * At insane "tolerant" levels we take no action. Otherwise
-	 * we only die if we have no other choice. For less serious
-	 * issues we try to recover, or limit damage to the current
-	 * process.
+	 * If tolerant is at an insane level we drop requests to kill
+	 * processes and continue even when there is no way out.
 	 */
-	if (cfg->tolerant < 3) {
-		if (no_way_out)
-			mce_panic("Fatal machine check on current CPU", &m, msg);
-		if (worst == MCE_AR_SEVERITY) {
-			recover_paddr = m.addr;
-			if (!(m.mcgstatus & MCG_STATUS_RIPV))
-				flags |= MF_MUST_KILL;
-		} else if (kill_it) {
-			force_sig(SIGBUS, current);
-		}
-	}
+	if (cfg->tolerant == 3)
+		kill_it = 0;
+	else if (no_way_out)
+		mce_panic("Fatal machine check on current CPU", &m, msg);
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1146,25 +1150,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 out:
 	sync_core();
 
-	if (recover_paddr == ~0ull)
-		goto done;
+	if (worst != MCE_AR_SEVERITY && !kill_it)
+		goto out_ist;
 
-	pr_err("Uncorrected hardware memory error in user-access at %llx",
-		 recover_paddr);
-	/*
-	 * We must call memory_failure() here even if the current process is
-	 * doomed. We still need to mark the page as poisoned and alert any
-	 * other users of the page.
-	 */
-	ist_begin_non_atomic(regs);
-	local_irq_enable();
-	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-		pr_err("Memory error not recovered");
-		force_sig(SIGBUS, current);
+	/* Fault was in user mode and we need to take some action */
+	if ((m.cs & 3) == 3) {
+		ist_begin_non_atomic(regs);
+		local_irq_enable();
+
+		if (kill_it || do_memory_failure(&m))
+			force_sig(SIGBUS, current);
+		local_irq_disable();
+		ist_end_non_atomic();
+	} else {
+		if (!fixup_exception(regs, X86_TRAP_MC))
+			mce_panic("Failed kernel mode recovery", &m, NULL);
 	}
-	local_irq_disable();
-	ist_end_non_atomic();
-done:
+
+out_ist:
 	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
-- 
2.1.4
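
The reworked tail of do_machine_check() above can be read as the following
self-contained decision sketch (a simplified restatement for readability,
not code from the patch; the enum and function name are made up):

#include <stdbool.h>

enum mce_action {
	MCE_PANIC_NOW,		/* no way out and tolerant < 3                  */
	MCE_NO_ACTION,		/* severity below AR and no kill requested      */
	MCE_USER_RECOVER,	/* user mode: memory_failure(), SIGBUS on error */
	MCE_KERNEL_RECOVER,	/* kernel mode: fixup_exception(X86_TRAP_MC)    */
};

static enum mce_action classify(int tolerant, bool no_way_out,
				bool action_required, bool kill_it,
				bool user_mode)
{
	if (tolerant == 3)
		kill_it = false;	/* insane tolerance: never send SIGBUS */
	else if (no_way_out)
		return MCE_PANIC_NOW;

	if (!action_required && !kill_it)
		return MCE_NO_ACTION;

	return user_mode ? MCE_USER_RECOVER : MCE_KERNEL_RECOVER;
}

The behavioural change the commit message calls out is visible in the first
branch: tolerant=3 now only clears kill_it instead of suppressing recovery
entirely.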


* [PATCH v6 4/4] x86, mce: Add __mcsafe_copy()
  2016-01-04  1:02 ` Tony Luck
@ 2015-12-31 19:43   ` Tony Luck
  -1 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2015-12-31 19:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Make use of the new EXTABLE_CLASS_FAULT exception table entries. This routine
returns a structure to indicate the result of the copy:

struct mcsafe_ret {
	u64 trapnr;
	u64 remain;
};

If the copy is successful, then both 'trapnr' and 'remain' are zero.

If we faulted during the copy, then 'trapnr' will say which type
of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
bytes were not copied.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/Kconfig                 |  10 +++
 arch/x86/include/asm/string_64.h |  10 +++
 arch/x86/kernel/x8664_ksyms_64.c |   4 ++
 arch/x86/lib/memcpy_64.S         | 136 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 160 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96d058a87100..42d26b4d1ec4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
 	  If you don't know what a machine check is and you don't do kernel
 	  QA it is safe to say n.
 
+config MCE_KERNEL_RECOVERY
+	bool "Recovery from machine checks in special kernel memory copy functions"
+	default n
+	depends on X86_MCE && X86_64
+	---help---
+	  This option provides a new memory copy function mcsafe_memcpy()
+	  that is annotated to allow the machine check handler to return
+	  to an alternate code path to return an error to the caller instead
+	  of crashing the system. Say yes if you have a driver that uses this.
+
 config X86_THERMAL_VECTOR
 	def_bool y
 	depends on X86_MCE_INTEL
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ff8b9a17dc4b..3887f304d8cd 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,16 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+struct mcsafe_ret {
+	u64 trapnr;
+	u64 remain;
+};
+
+struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, unsigned size);
+extern void __mcsafe_copy_end(void);
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..3d42d0ef3333 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,10 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+EXPORT_SYMBOL(__mcsafe_copy);
+#endif
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 16698bba87de..e5b1acad8b1e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -177,3 +177,139 @@ ENTRY(memcpy_orig)
 .Lend:
 	retq
 ENDPROC(memcpy_orig)
+
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+/*
+ * __mcsafe_copy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(__mcsafe_copy)
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorq %rax, %rax
+	xorq %rdx, %rdx
+	sfence
+	/* copy successful. return 0 */
+	ret
+
+	.section .fixup,"ax"
+	/* fixups for machine check */
+30:
+	add %ecx,%edx
+	jmp 100f
+31:
+	shl $6,%ecx
+	add %ecx,%edx
+	jmp 100f
+32:
+	shl $6,%ecx
+	lea -8(%ecx,%edx),%edx
+	jmp 100f
+33:
+	shl $6,%ecx
+	lea -16(%ecx,%edx),%edx
+	jmp 100f
+34:
+	shl $6,%ecx
+	lea -24(%ecx,%edx),%edx
+	jmp 100f
+35:
+	shl $6,%ecx
+	lea -32(%ecx,%edx),%edx
+	jmp 100f
+36:
+	shl $6,%ecx
+	lea -40(%ecx,%edx),%edx
+	jmp 100f
+37:
+	shl $6,%ecx
+	lea -48(%ecx,%edx),%edx
+	jmp 100f
+38:
+	shl $6,%ecx
+	lea -56(%ecx,%edx),%edx
+	jmp 100f
+39:
+	lea (%rdx,%rcx,8),%rdx
+	jmp 100f
+40:
+	mov %ecx,%edx
+100:
+	sfence
+
+	/* %rax set the fault number in fixup_exception() */
+	ret
+	.previous
+
+	_ASM_EXTABLE_FAULT(0b,30b)
+	_ASM_EXTABLE_FAULT(1b,31b)
+	_ASM_EXTABLE_FAULT(2b,32b)
+	_ASM_EXTABLE_FAULT(3b,33b)
+	_ASM_EXTABLE_FAULT(4b,34b)
+	_ASM_EXTABLE_FAULT(9b,35b)
+	_ASM_EXTABLE_FAULT(10b,36b)
+	_ASM_EXTABLE_FAULT(11b,37b)
+	_ASM_EXTABLE_FAULT(12b,38b)
+	_ASM_EXTABLE_FAULT(18b,39b)
+	_ASM_EXTABLE_FAULT(21b,40b)
+ENDPROC(__mcsafe_copy)
+#endif
-- 
2.1.4
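
For context, a hedged sketch of how a caller might use this: a
persistent-memory driver could turn a machine check into -EIO, which is what
the cover letter below asks for. Only __mcsafe_copy() and struct mcsafe_ret
come from the patch; the wrapper below and its name are hypothetical.

#include <linux/kernel.h>
#include <linux/errno.h>
#include <asm/string_64.h>	/* struct mcsafe_ret, __mcsafe_copy() */
#include <asm/traps.h>		/* X86_TRAP_MC */

/* Hypothetical wrapper: copy from a possibly-poisoned pmem mapping. */
static int pmem_copy_mcsafe(void *dst, const void *src, unsigned int len)
{
	struct mcsafe_ret ret;

	ret = __mcsafe_copy(dst, (__force const void __user *)src, len);
	if (!ret.trapnr)
		return 0;			/* whole buffer copied */

	pr_debug("mcsafe copy trapped: trap %llu, %llu bytes not copied\n",
		 ret.trapnr, ret.remain);

	return ret.trapnr == X86_TRAP_MC ? -EIO : -EFAULT;
}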


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v6 4/4] x86, mce: Add __mcsafe_copy()
@ 2015-12-31 19:43   ` Tony Luck
  0 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2015-12-31 19:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

Make use of the EXTABLE_FAULT exception table entries. This routine
returns a structure to indicate the result of the copy:

struct mcsafe_ret {
	u64 trapnr;
	u64 remain;
};

If the copy is successful, then both 'trapnr' and 'remain' are zero.

If we faulted during the copy, then 'trapnr' will say which type
of trap (X86_TRAP_PF or X86_TRAP_MC) and 'remain' says how many
bytes were not copied.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/Kconfig                 |  10 +++
 arch/x86/include/asm/string_64.h |  10 +++
 arch/x86/kernel/x8664_ksyms_64.c |   4 ++
 arch/x86/lib/memcpy_64.S         | 136 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 160 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96d058a87100..42d26b4d1ec4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1001,6 +1001,16 @@ config X86_MCE_INJECT
 	  If you don't know what a machine check is and you don't do kernel
 	  QA it is safe to say n.
 
+config MCE_KERNEL_RECOVERY
+	bool "Recovery from machine checks in special kernel memory copy functions"
+	default n
+	depends on X86_MCE && X86_64
+	---help---
+	  This option provides a new memory copy function mcsafe_memcpy()
+	  that is annotated to allow the machine check handler to return
+	  to an alternate code path to return an error to the caller instead
+	  of crashing the system. Say yes if you have a driver that uses this.
+
 config X86_THERMAL_VECTOR
 	def_bool y
 	depends on X86_MCE_INTEL
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ff8b9a17dc4b..3887f304d8cd 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,16 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+struct mcsafe_ret {
+	u64 trapnr;
+	u64 remain;
+};
+
+struct mcsafe_ret __mcsafe_copy(void *dst, const void __user *src, unsigned size);
+extern void __mcsafe_copy_end(void);
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be19864..3d42d0ef3333 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,10 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+EXPORT_SYMBOL(__mcsafe_copy);
+#endif
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 16698bba87de..e5b1acad8b1e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -177,3 +177,139 @@ ENTRY(memcpy_orig)
 .Lend:
 	retq
 ENDPROC(memcpy_orig)
+
+#ifdef CONFIG_MCE_KERNEL_RECOVERY
+/*
+ * __mcsafe_copy - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(__mcsafe_copy)
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+
+	/* check for bad alignment of source */
+	movl %esi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+0:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 0b
+102:
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+	mov %r8,(%rdi)
+	mov %r9,1*8(%rdi)
+	mov %r10,2*8(%rdi)
+	mov %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+	mov %r8,4*8(%rdi)
+	mov %r9,5*8(%rdi)
+	mov %r10,6*8(%rdi)
+	mov %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+	mov %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorq %rax, %rax
+	xorq %rdx, %rdx
+	sfence
+	/* copy successful. return 0 */
+	ret
+
+	.section .fixup,"ax"
+	/* fixups for machine check */
+30:
+	add %ecx,%edx
+	jmp 100f
+31:
+	shl $6,%ecx
+	add %ecx,%edx
+	jmp 100f
+32:
+	shl $6,%ecx
+	lea -8(%ecx,%edx),%edx
+	jmp 100f
+33:
+	shl $6,%ecx
+	lea -16(%ecx,%edx),%edx
+	jmp 100f
+34:
+	shl $6,%ecx
+	lea -24(%ecx,%edx),%edx
+	jmp 100f
+35:
+	shl $6,%ecx
+	lea -32(%ecx,%edx),%edx
+	jmp 100f
+36:
+	shl $6,%ecx
+	lea -40(%ecx,%edx),%edx
+	jmp 100f
+37:
+	shl $6,%ecx
+	lea -48(%ecx,%edx),%edx
+	jmp 100f
+38:
+	shl $6,%ecx
+	lea -56(%ecx,%edx),%edx
+	jmp 100f
+39:
+	lea (%rdx,%rcx,8),%rdx
+	jmp 100f
+40:
+	mov %ecx,%edx
+100:
+	sfence
+
+	/* %rax set the fault number in fixup_exception() */
+	ret
+	.previous
+
+	_ASM_EXTABLE_FAULT(0b,30b)
+	_ASM_EXTABLE_FAULT(1b,31b)
+	_ASM_EXTABLE_FAULT(2b,32b)
+	_ASM_EXTABLE_FAULT(3b,33b)
+	_ASM_EXTABLE_FAULT(4b,34b)
+	_ASM_EXTABLE_FAULT(9b,35b)
+	_ASM_EXTABLE_FAULT(10b,36b)
+	_ASM_EXTABLE_FAULT(11b,37b)
+	_ASM_EXTABLE_FAULT(12b,38b)
+	_ASM_EXTABLE_FAULT(18b,39b)
+	_ASM_EXTABLE_FAULT(21b,40b)
+ENDPROC(__mcsafe_copy)
+#endif
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v6 0/4] Machine check recovery when kernel accesses poison
@ 2016-01-04  1:02 ` Tony Luck
  0 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2016-01-04  1:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

This series is initially targeted at the folks doing filesystems
on top of NVDIMMs. They really want to be able to return -EIO
when there is a h/w error (just like spinning rust and SSDs do).

I plan to use the same infrastructure to write a machine check aware
"copy_from_user()" that will SIGBUS the calling application when a
syscall touches poison in user space (just like we do when the application
touches the poison itself).

Changes V5-V6
Andy:	Provoked massive re-write by providing what is now part1 of this
	patch series. This frees up two bits in the exception table
	fixup field that can be used to tag exception table entries
	as different "classes". This means we don't need my separate
	exception table for machine checks. Also avoids duplicating
	fixup actions for #PF and #MC cases that were in version 5.
Andy:	Use C99 array initializers to tie the various class fixup
	functions back to the definitions of each class. Also give the
	functions meaningful names (not fixup_class0() etc.).
Boris:	Cleaned up my lousy assembly code removing many spurious 'l'
	modifiers on instructions.
Boris:	Provided some helper functions for the machine check severity
	calculation that make the code more readable.
Boris:	Have __mcsafe_copy() return a structure with the 'remaining bytes'
	in a separate field from the fault indicator. Boris had suggested
	Linux -EFAULT/-EINVAL ... but I thought it made more sense to return
	the exception number (X86_TRAP_MC, etc.)  This finally kills off
	BIT(63) which has been controversial throughout all the early versions
	of this patch series.
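
As an illustration of the new calling convention (not part of the series):
only struct mcsafe_ret and __mcsafe_copy() below come from part 4, and
CONFIG_MCE_KERNEL_RECOVERY=y is assumed; the caller and its names are made up.

	#include <linux/errno.h>
	#include <linux/printk.h>
	#include <linux/string.h>	/* pulls in asm/string_64.h */

	static int copy_from_pmem(void *dst, const void __user *src,
				  unsigned size)
	{
		struct mcsafe_ret r = __mcsafe_copy(dst, src, size);

		if (r.remain) {
			/* r.trapnr is the trap number, e.g. X86_TRAP_MC */
			pr_debug("lost %llu bytes to trap %llu\n",
				 r.remain, r.trapnr);
			return -EIO;
		}
		return 0;
	}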

Changes V4-V5
Tony:	Extended __mcsafe_copy() to have fixup entries for both machine
	check and page fault.

Changes V3-V4:
Andy:   Simplify fixup_mcexception() by dropping used-once local variable
Andy:   "Reviewed-by" tag added to part1
Boris:  Moved new functions to memcpy_64.S and declaration to asm/string_64.h
Boris:  Changed name s/mcsafe_memcpy/__mcsafe_copy/ to make it clear that this
        is an internal function and that its return value doesn't follow memcpy() semantics.
Boris:  "Reviewed-by" tag added to parts 1&2

Changes V2-V3:

Andy:   Don't hack "regs->ax = BIT(63) | addr;" in the machine check
        handler.  Now have better fixup code that computes the number
        of remaining bytes (just like page-fault fixup).
Andy:   #define for BIT(63). Done, plus a couple of extra macros using it.
Boris:  Don't clutter up generic code (like mm/extable.c) with this.
        I moved everything under arch/x86 (the asm-generic change is
        a more generic #define).
Boris:  Dependencies for CONFIG_MCE_KERNEL_RECOVERY are too generic.
        I made it a real menu item with default "n". Dan Williams
        will use "select MCE_KERNEL_RECOVERY" from his persistent
        filesystem code.
Boris:  Simplify conditionals in mce.c by moving tolerant/kill_it
        checks earlier, with a skip to end if they aren't set.
Boris:  Miscellaneous grammar/punctuation. Fixed.
Boris:  Don't leak spurious __start_mcextable symbols into kernels
        that didn't configure MCE_KERNEL_RECOVERY. Done.
Tony:   New code doesn't belong in user_copy_64.S/uaccess*.h. Moved
        to new .S/.h files
Elliott:Caching behavior non-optimal. Could use movntdqa or vmovntdqa
        on source addresses. I didn't fix this yet. Think
        of the current mcsafe_memcpy() as the first of several functions.
        This one is useful for small copies (meta-data) where the overhead
        of saving SSE/AVX state isn't justified.

Changes V1->V2:

0-day:  Reported build errors and warnings on 32-bit systems. Fixed
0-day:  Reported bloat to tinyconfig. Fixed
Boris:  Suggestions to use extra macros to reduce code duplication in _ASM_*EXTABLE. Done
Boris:  Re-write "tolerant==3" check to reduce indentation level. See below.
Andy:   Check IP is valid before searching kernel exception tables. Done.
Andy:   Explain use of BIT(63) on return value from mcsafe_memcpy(). Done (added decode macros).
Andy:   Untangle mess of code in tail of do_machine_check() to make it
        clear what is going on (e.g. that we only enter the ist_begin_non_atomic()
        if we were called from user code, not from kernel!). Done.

Andy Lutomirski (1):
  x86: Clean up extable entry format (and free up a bit)

Tony Luck (3):
  x86: Cleanup and add a new exception class
  x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception
    table entries
  x86, mce: Add __mcsafe_copy()

 arch/x86/Kconfig                          |  10 +++
 arch/x86/include/asm/asm.h                |  80 ++++++++++++------
 arch/x86/include/asm/string_64.h          |  10 +++
 arch/x86/include/asm/uaccess.h            |  17 +++-
 arch/x86/kernel/cpu/mcheck/mce-severity.c |  32 ++++++-
 arch/x86/kernel/cpu/mcheck/mce.c          |  71 ++++++++--------
 arch/x86/kernel/kprobes/core.c            |   2 +-
 arch/x86/kernel/traps.c                   |   6 +-
 arch/x86/kernel/x8664_ksyms_64.c          |   4 +
 arch/x86/lib/memcpy_64.S                  | 136 ++++++++++++++++++++++++++++++
 arch/x86/mm/extable.c                     |  66 ++++++++++-----
 arch/x86/mm/fault.c                       |   2 +-
 12 files changed, 347 insertions(+), 89 deletions(-)

-- 
2.1.4


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2015-12-30 17:59   ` Andy Lutomirski
@ 2016-01-04  1:37     ` Tony Luck
  -1 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2016-01-04  1:37 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Ingo Molnar, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Wed, Dec 30, 2015 at 9:59 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> This adds two bits of fixup class information to a fixup entry,
> generalizing the uaccess_err hack currently in place.
>
> Forward-ported-from-3.9-by: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>

Crivens!  I messed up when "git cherrypick"ing this and "git
format-patch"ing it.

I didn't mean to forge Andy's From line when sending this out (just to have a
From: line to give him credit for the patch).

Big OOPs ... this is "From:" me ... not Andy!

-Tony

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04  1:37     ` Tony Luck
@ 2016-01-04  7:49       ` Ingo Molnar
  -1 siblings, 0 replies; 44+ messages in thread
From: Ingo Molnar @ 2016-01-04  7:49 UTC (permalink / raw)
  To: Tony Luck
  Cc: Andy Lutomirski, Borislav Petkov, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML


* Tony Luck <tony.luck@gmail.com> wrote:

> On Wed, Dec 30, 2015 at 9:59 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> > This adds two bits of fixup class information to a fixup entry,
> > generalizing the uaccess_err hack currently in place.
> >
> > Forward-ported-from-3.9-by: Tony Luck <tony.luck@intel.com>
> > Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> 
> Crivens!  I messed up when "git cherrypick"ing this and "git
> format-patch"ing it.
> 
> I didn't mean to forge Andy's From line when sending this out (just to have a
> From: line to give him credit.for the patch).
> 
> Big OOPs ... this is "From:" me ... not Andy!

But in any case it's missing your SOB line.

If Andy is still the primary author (much of his original patch survived, you
only resolved conflicts or made minor changes) then you can send this as:

 From: Tony Luck <tony.luck@intel.com>
 Subject: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)

 From: Andy Lutomirski <luto@amacapital.net>

 ... changelog ...

 Signed-off-by: Andy Lutomirski <luto@amacapital.net>
 [ Forward ported from a v3.9 version. ]
 Signed-off-by: Tony Luck <tony.luck@intel.com>

This carries all the information, has a proper SOB chain, and preserves 
authorship. Also, it's clear from the tags that you made material changes, so any 
resulting breakage is yours (and mine), not Andy's! ;-)

If the changes to the patch are major, so that your new work is larger than Andy's 
original work, you can still credit him via a non-standard tag, like:

 From: Tony Luck <tony.luck@intel.com>
 Subject: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)

 This patch is based on Andy Lutomirski's patch sent against v3.9:

 ... changelog ...

 Originally-from: Andy Lutomirski <luto@amacapital.net>
 Signed-off-by: Tony Luck <tony.luck@intel.com>

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2015-12-30 17:59   ` Andy Lutomirski
@ 2016-01-04 12:07     ` Borislav Petkov
  -1 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2016-01-04 12:07 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Dec 30, 2015 at 09:59:29AM -0800, Andy Lutomirski wrote:
> This adds two bits of fixup class information to a fixup entry,
> generalizing the uaccess_err hack currently in place.
> 
> Forward-ported-from-3.9-by: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
>  arch/x86/include/asm/asm.h | 70 ++++++++++++++++++++++++++++++----------------
>  arch/x86/mm/extable.c      | 21 ++++++++------
>  2 files changed, 59 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> index 189679aba703..b64121ffb2da 100644
> --- a/arch/x86/include/asm/asm.h
> +++ b/arch/x86/include/asm/asm.h
> @@ -43,19 +43,47 @@
>  #define _ASM_DI		__ASM_REG(di)
>  
>  /* Exception table entry */
> -#ifdef __ASSEMBLY__
> -# define _ASM_EXTABLE(from,to)					\
> -	.pushsection "__ex_table","a" ;				\
> -	.balign 8 ;						\
> -	.long (from) - . ;					\
> -	.long (to) - . ;					\
> -	.popsection
>  
> -# define _ASM_EXTABLE_EX(from,to)				\
> -	.pushsection "__ex_table","a" ;				\
> -	.balign 8 ;						\
> -	.long (from) - . ;					\
> -	.long (to) - . + 0x7ffffff0 ;				\
> +/*
> + * An exception table entry is 64 bits.  The first 32 bits are the offset

Two 32-bit ints, to be exact.

Also, there's text in arch/x86/include/asm/uaccess.h where the exception
table entry is defined so you probably should sync with it so that the
nomenclature is the same.

> + * from that entry to the potentially faulting instruction.  sortextable

								sortextable.c ?

> + * relies on that exact encoding.  The second 32 bits encode the fault
> + * handler address.
> + *
> + * We want to stick two extra bits of handler class into the fault handler
> + * address.  All of these are generated by relocations, so we can only
> + * rely on addition.  We therefore emit:
> + *
> + * (target - here) + (class) + 0x20000000

I still don't understand that bit 29 thing.

Because the offset is negative?

The exception table currently looks like this here:

insn offset: 0xff91a7c4, fixup offset: 0xffffd57a
insn offset: 0xff91bac3, fixup offset: 0xffffd57e
insn offset: 0xff91bac0, fixup offset: 0xffffd57d
insn offset: 0xff91baba, fixup offset: 0xffffd57c
insn offset: 0xff91bfca, fixup offset: 0xffffd57c
insn offset: 0xff91bfff, fixup offset: 0xffffd57e
insn offset: 0xff91c049, fixup offset: 0xffffd580
insn offset: 0xff91c141, fixup offset: 0xffffd57f
insn offset: 0xff91c24e, fixup offset: 0xffffd581
insn offset: 0xff91c262, fixup offset: 0xffffd580
insn offset: 0xff91c261, fixup offset: 0xffffd57f
...

It probably will dawn on me when I look at the rest of the patch...

> + * This has the property that the two high bits are the class and the
> + * rest is easy to decode.
> + */
> +
> +/* There are two bits of extable entry class, added to a signed offset. */
> +#define _EXTABLE_CLASS_DEFAULT	0		/* standard uaccess fixup */
> +#define _EXTABLE_CLASS_EX	0x80000000	/* uaccess + set uaccess_err */

				BIT(31) is more readable.

> +
> +/*
> + * The biases are the class constants + 0x20000000, as signed integers.
> + * This can't use ordinary arithmetic -- the assembler isn't that smart.
> + */
> +#define _EXTABLE_BIAS_DEFAULT	0x20000000
> +#define _EXTABLE_BIAS_EX	0x20000000 - 0x80000000

Ditto.

> +
> +#define _ASM_EXTABLE(from,to)						\
> +	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_DEFAULT)
> +
> +#define _ASM_EXTABLE_EX(from,to)					\
> +	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_EX)
> +
> +#ifdef __ASSEMBLY__
> +# define _EXPAND_EXTABLE_BIAS(x) x
> +# define _ASM_EXTABLE_CLASS(from,to,bias)				\
> +	.pushsection "__ex_table","a" ;					\
> +	.balign 8 ;							\
> +	.long (from) - . ;						\
> +	.long (to) - . + _EXPAND_EXTABLE_BIAS(bias) ;			\

Why not simply:

	.long (to) - . + (bias) ;

and

	" .long (" #to ") - . + "(" #bias ") "\n"

below and get rid of that _EXPAND_EXTABLE_BIAS()?

>  	.popsection
>  
>  # define _ASM_NOKPROBE(entry)					\
> @@ -89,18 +117,12 @@
>  	.endm
>  
>  #else
> -# define _ASM_EXTABLE(from,to)					\
> -	" .pushsection \"__ex_table\",\"a\"\n"			\
> -	" .balign 8\n"						\
> -	" .long (" #from ") - .\n"				\
> -	" .long (" #to ") - .\n"				\
> -	" .popsection\n"
> -
> -# define _ASM_EXTABLE_EX(from,to)				\
> -	" .pushsection \"__ex_table\",\"a\"\n"			\
> -	" .balign 8\n"						\
> -	" .long (" #from ") - .\n"				\
> -	" .long (" #to ") - . + 0x7ffffff0\n"			\
> +# define _EXPAND_EXTABLE_BIAS(x) #x
> +# define _ASM_EXTABLE_CLASS(from,to,bias)				\
> +	" .pushsection \"__ex_table\",\"a\"\n"				\
> +	" .balign 8\n"							\
> +	" .long (" #from ") - .\n"					\
> +	" .long (" #to ") - . + " _EXPAND_EXTABLE_BIAS(bias) "\n"	\
>  	" .popsection\n"
>  /* For C file, we already have NOKPROBE_SYMBOL macro */
>  #endif
> diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
> index 903ec1e9c326..95e2ede71206 100644
> --- a/arch/x86/mm/extable.c
> +++ b/arch/x86/mm/extable.c
> @@ -8,16 +8,24 @@ ex_insn_addr(const struct exception_table_entry *x)
>  {
>  	return (unsigned long)&x->insn + x->insn;
>  }
> +static inline unsigned int
> +ex_class(const struct exception_table_entry *x)
> +{
> +	return (unsigned int)x->fixup & 0xC0000000;
> +}
> +
>  static inline unsigned long
>  ex_fixup_addr(const struct exception_table_entry *x)
>  {
> -	return (unsigned long)&x->fixup + x->fixup;
> +	long offset = (long)((u32)x->fixup & 0x3fffffff) - (long)0x20000000;

So basically:

	x->fixup & 0x1fffffff

Why the explicit subtraction of bit 29?

IOW, I was expecting something simpler for the whole scheme like:

ex_class:

	return x->fixup & 0xC0000000;

ex_fixup_addr:

	return x->fixup | 0xC0000000;

Why can't it be done this way?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 2/4] x86: Cleanup and add a new exception class
  2015-12-30 18:56   ` Tony Luck
  (?)
@ 2016-01-04 14:22   ` Borislav Petkov
  2016-01-04 17:00       ` Luck, Tony
  -1 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2016-01-04 14:22 UTC (permalink / raw)
  To: Tony Luck
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Dan Williams,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

On Wed, Dec 30, 2015 at 10:56:41AM -0800, Tony Luck wrote:
> Make per-class functions for exception table fixup. Add #defines
> and general prettiness to make it clear how all the parts tie
> together.
> 
> Add a new class that fills %rax with the fault number of the exception.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/include/asm/asm.h     | 24 ++++++++++-----
>  arch/x86/include/asm/uaccess.h | 17 ++++++++---
>  arch/x86/kernel/kprobes/core.c |  2 +-
>  arch/x86/kernel/traps.c        |  6 ++--
>  arch/x86/mm/extable.c          | 67 +++++++++++++++++++++++++++---------------
>  arch/x86/mm/fault.c            |  2 +-
>  6 files changed, 79 insertions(+), 39 deletions(-)
> 
> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
> index b64121ffb2da..1888278d0559 100644
> --- a/arch/x86/include/asm/asm.h
> +++ b/arch/x86/include/asm/asm.h
> @@ -44,6 +44,7 @@
>  
>  /* Exception table entry */
>  
> +#define	_EXTABLE_BIAS	0x20000000
>  /*
>   * An exception table entry is 64 bits.  The first 32 bits are the offset
>   * from that entry to the potentially faulting instruction.  sortextable
> @@ -54,26 +55,35 @@
>   * address.  All of these are generated by relocations, so we can only
>   * rely on addition.  We therefore emit:
>   *
> - * (target - here) + (class) + 0x20000000
> + * (target - here) + (class) + _EXTABLE_BIAS
>   *
>   * This has the property that the two high bits are the class and the
>   * rest is easy to decode.
>   */
>  
> -/* There are two bits of extable entry class, added to a signed offset. */
> -#define _EXTABLE_CLASS_DEFAULT	0		/* standard uaccess fixup */
> -#define _EXTABLE_CLASS_EX	0x80000000	/* uaccess + set uaccess_err */
> +/*
> + * There are two bits of extable entry class giving four classes
> + */
> +#define EXTABLE_CLASS_DEFAULT	0	/* standard uaccess fixup */
> +#define EXTABLE_CLASS_FAULT	1	/* provide fault number as well as fixup */
> +#define EXTABLE_CLASS_EX	2	/* uaccess + set uaccess_err */
> +#define EXTABLE_CLASS_UNUSED	3	/* available for something else */
>  
>  /*
> - * The biases are the class constants + 0x20000000, as signed integers.
> + * The biases are the class constants + _EXTABLE_BIAS, as signed integers.
>   * This can't use ordinary arithmetic -- the assembler isn't that smart.
>   */
> -#define _EXTABLE_BIAS_DEFAULT	0x20000000
> -#define _EXTABLE_BIAS_EX	0x20000000 - 0x80000000
> +#define _EXTABLE_BIAS_DEFAULT	_EXTABLE_BIAS
> +#define _EXTABLE_BIAS_FAULT	_EXTABLE_BIAS + 0x40000000
> +#define _EXTABLE_BIAS_EX	_EXTABLE_BIAS - 0x80000000
> +#define _EXTABLE_BIAS_UNUSED	_EXTABLE_BIAS - 0x40000000
>  
>  #define _ASM_EXTABLE(from,to)						\
>  	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_DEFAULT)
>  
> +#define _ASM_EXTABLE_FAULT(from,to)					\
> +	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_FAULT)
> +
>  #define _ASM_EXTABLE_EX(from,to)					\
>  	_ASM_EXTABLE_CLASS(from, to, _EXTABLE_BIAS_EX)

So you're touching those again in patch 2. Why not add those defines to
patch 1 directly and diminish the churn?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [PATCH v6 2/4] x86: Cleanup and add a new exception class
  2016-01-04 14:22   ` Borislav Petkov
@ 2016-01-04 17:00       ` Luck, Tony
  0 siblings, 0 replies; 44+ messages in thread
From: Luck, Tony @ 2016-01-04 17:00 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Williams, Dan J,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

> So you're touching those again in patch 2. Why not add those defines to
> patch 1 directly and diminish the churn?

To preserve authorship. Andy did patch 1 (the clever part). Patch 2 is just syntactic
sugar on top of it.

-Tony

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 12:07     ` Borislav Petkov
@ 2016-01-04 17:26       ` Tony Luck
  -1 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2016-01-04 17:26 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andy Lutomirski, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 4, 2016 at 4:07 AM, Borislav Petkov <bp@alien8.de> wrote:
>> + * (target - here) + (class) + 0x20000000
>
> I still don't understand that bit 29 thing.
>
> Because the offset is negative?

I think so.  The .fixup section is placed at the end of .text, and the ex_table
itself is pretty much right after.  So all the "fixup" offsets will be small
negative numbers (the "insn" ones are also negative, but will be larger in
magnitude since they potentially need to reach all the way to the start of
.text).

Adding 0x20000000 makes everything positive (so our legacy exception
table entries have bit31==bit30==0) and perhaps makes it fractionally clearer
how we manipulate the top bits for the other classes ... but only slightly
(I got very confused by it too).

It is all made more complex because these values need to be something
that "ld" can relocate when vmlinux is put together from all the ".o" files.
So we can't just use "x | BIT(30)" etc.


>> +#define _EXTABLE_CLASS_EX    0x80000000      /* uaccess + set uaccess_err */
>
>                                 BIT(31) is more readable.

Not to the assembler :-(

> Why not simply:
>
>         .long (to) - . + (bias) ;
>
> and
>
>         " .long (" #to ") - . + "(" #bias ") "\n"
>
> below and get rid of that _EXPAND_EXTABLE_BIAS()?

Andy - this part is your code and I'm not sure what the trick is here.

>>  ex_fixup_addr(const struct exception_table_entry *x)
>>  {
>> -     return (unsigned long)&x->fixup + x->fixup;
>> +     long offset = (long)((u32)x->fixup & 0x3fffffff) - (long)0x20000000;
>
> So basically:
>
>         x->fixup & 0x1fffffff
>
> Why the explicit subtraction of bit 29?

We added it to begin with ... need to subtract to get back to the
original offset.

> IOW, I was expecting something simpler for the whole scheme like:
>
> ex_class:
>
>         return x->fixup & 0xC0000000;

ex_class (after part2) is just "(u32)x->fixup >> 30" (because I wanted
a result in [0..3])

> ex_fixup_addr:
>
>         return x->fixup | 0xC0000000;
>
> Why can't it be done this way?

Because relocations ... the linker can only add/subtract values when
making vmlinux ... it can't OR bits in.
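
Concretely, the decode side (with the >>30 class from part 2) boils down
to something like this -- a sketch along the lines of the snippets above,
not the literal patch text:

	#include <linux/types.h>
	#include <asm/uaccess.h>	/* struct exception_table_entry */

	static inline unsigned int
	ex_class(const struct exception_table_entry *x)
	{
		return (u32)x->fixup >> 30;	/* 0 .. 3 */
	}

	static inline unsigned long
	ex_fixup_addr(const struct exception_table_entry *x)
	{
		/* mask off the class bits, then undo the +0x20000000 bias */
		long offset = (long)((u32)x->fixup & 0x3fffffff) - 0x20000000L;

		return (unsigned long)&x->fixup + offset;
	}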

-Tony

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 17:26       ` Tony Luck
@ 2016-01-04 18:08         ` Andy Lutomirski
  -1 siblings, 0 replies; 44+ messages in thread
From: Andy Lutomirski @ 2016-01-04 18:08 UTC (permalink / raw)
  To: Tony Luck
  Cc: Borislav Petkov, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 4, 2016 at 9:26 AM, Tony Luck <tony.luck@gmail.com> wrote:
> On Mon, Jan 4, 2016 at 4:07 AM, Borislav Petkov <bp@alien8.de> wrote:
>>> + * (target - here) + (class) + 0x20000000
>>
>> I still don't understand that bit 29 thing.
>>
>> Because the offset is negative?
>
> I think so.  The .fixup section is placed in the end of .text, and the ex_table
> itself is pretty much right after.  So all the "fixup" offsets will be
> small negative
> numbers (the "insn" ones are also negative, but will be bigger since they
> potentially need to reach all the way to the start of .text).
>
> Adding 0x20000000 makes everything positive (so our legacy exception
> table entries have bit31==bit30==0) and perhaps makes it fractionally clearer
> how we manipulate the top bits for the other classes ... but only
> slightly. I got
> very confused by it too).
>
> It is all made more complex because these values need to be something
> that "ld" can relocate when vmlinux is put together from all the ".o" files.
> So we can't just use "x | BIT(30)" etc.

All of that's correct, including the part where it's confusing.  The
comments aren't the best.

How about adding a comment like:

----- begin comment -----

The offset to the fixup is signed, and we're trying to use the high
bits for a different purpose.  In C, we could just do:

u32 class_and_offset = ((target - here) & 0x3fffffff) | class;

Then, to decode it, we'd mask off the class and sign-extend to recover
the offset.

In asm, we can't do that, because this all gets laundered through the
linker, and there's no relocation type that supports this chicanery.
Instead we cheat a bit.  We first add a large number to the offset
(0x20000000).  The result is still nominally signed, but now it's
always positive, and the two high bits are always clear.  We can then
set high bits by ordinary addition or subtraction instead of using
bitwise operations.  As far as the linker is concerned, all we're
doing is adding a large constant to the difference between here (".")
and the target, and that's a valid relocation type.

In the C code, we just mask off the class bits and subtract 0x20000000
to get the offset.

----- end comment -----
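
To see the arithmetic work end to end, here is a tiny stand-alone check
(plain user-space C; the offset is arbitrary, and 0x40000000 is the stored
form of the new FAULT class from part 2):

	#include <assert.h>
	#include <stdint.h>

	#define EXTABLE_BIAS	0x20000000

	int main(void)
	{
		int32_t offset = -0x1000;	/* typical small negative fixup offset */
		uint32_t class = 0x40000000;	/* class 1 (FAULT) in the top two bits */

		/* what ".long (to) - . + bias" stores, with bias = class + EXTABLE_BIAS */
		uint32_t stored = (uint32_t)offset + EXTABLE_BIAS + class;

		/* decode: class from the top two bits ... */
		assert(stored >> 30 == 1);
		/* ... and the offset by masking and un-biasing */
		assert((int32_t)(stored & 0x3fffffff) - EXTABLE_BIAS == offset);

		return 0;
	}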

>
>
>>> +#define _EXTABLE_CLASS_EX    0x80000000      /* uaccess + set uaccess_err */
>>
>>                                 BIT(31) is more readable.
>
> Not to the assembler :-(
>
>> Why not simply:
>>
>>         .long (to) - . + (bias) ;
>>
>> and
>>
>>         " .long (" #to ") - . + "(" #bias ") "\n"
>>
>> below and get rid of that _EXPAND_EXTABLE_BIAS()?
>
> Andy - this part is your code and I'm not sure what the trick is here.

I don't remember.  I think it was just some preprocessor crud to force
all the macros to expand fully before the assembler sees it.  If it
builds without it, feel free to delete it.

>
>>>  ex_fixup_addr(const struct exception_table_entry *x)
>>>  {
>>> -     return (unsigned long)&x->fixup + x->fixup;
>>> +     long offset = (long)((u32)x->fixup & 0x3fffffff) - (long)0x20000000;
>>
>> So basically:
>>
>>         x->fixup & 0x1fffffff
>>
>> Why the explicit subtraction of bit 29?
>
> We added it to begin with ... need to subtract to get back to the
> original offset.

Hopefully it's clearer with the comment above.

>
>> IOW, I was expecting something simpler for the whole scheme like:
>>
>> ex_class:
>>
>>         return x->fixup & 0xC0000000;
>
> ex_class (after part2) is just "(u32)x->fixup >> 30" (because I wanted
> a result in [0..3])
>
>> ex_fixup_addr:
>>
>>         return x->fixup | 0xC0000000;
>>
>> Why can't it be done this way?
>
> Because relocations ... the linker can only add/subtract values when
> making vmlinux ... it can't OR bits in.

--Andy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 18:08         ` Andy Lutomirski
@ 2016-01-04 18:59           ` Tony Luck
  -1 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2016-01-04 18:59 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

> ----- begin comment -----
>
> The offset to the fixup is signed, and we're trying to use the high
> bits for a different purpose.  In C, we could just do:
>
> u32 class_and_offset = ((target - here) & 0x3fffffff) | class;
>
> Then, to decode it, we'd mask off the class and sign-extend to recover
> the offset.
>
> In asm, we can't do that, because this all gets laundered through the
> linker, and there's no relocation type that supports this chicanery.
> Instead we cheat a bit.  We first add a large number to the offset
> (0x20000000).  The result is still nominally signed, but now it's
> always positive, and the two high bits are always clear.  We can then
> set high bits by ordinary addition or subtraction instead of using
> bitwise operations.  As far as the linker is concerned, all we're
> doing is adding a large constant to the difference between here (".")
> and the target, and that's a valid relocation type.
>
> In the C code, we just mask off the class bits and subtract 0x20000000
> to get the offset.
>
> ----- end comment -----

But presumably those constants get folded together, so the linker
is dealing with only one offset.  It doesn't (I assume) know that our
source code added 0x20000000 and then added/subtracted some
more.

It looks like we could just use:
class0: +0x40000000
class1: +0x80000000 (or subtract ... whatever doesn't make the linker cranky)
class2: -0x40000000
class3: don't add/subtract anything

ex_class() stays the same (just looks at bit31/bit30)
ex_fixup_addr() has to use ex_class() to decide what to add/subtract
(if anything).

Would that work?  Would it be more or less confusing?

-Tony

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 18:59           ` Tony Luck
@ 2016-01-04 19:05             ` Andy Lutomirski
  -1 siblings, 0 replies; 44+ messages in thread
From: Andy Lutomirski @ 2016-01-04 19:05 UTC (permalink / raw)
  To: Tony Luck
  Cc: Borislav Petkov, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 4, 2016 at 10:59 AM, Tony Luck <tony.luck@gmail.com> wrote:
>> ----- begin comment -----
>>
>> The offset to the fixup is signed, and we're trying to use the high
>> bits for a different purpose.  In C, we could just do:
>>
>> u32 class_and_offset = ((target - here) & 0x3fffffff) | class;
>>
>> Then, to decode it, we'd mask off the class and sign-extend to recover
>> the offset.
>>
>> In asm, we can't do that, because this all gets laundered through the
>> linker, and there's no relocation type that supports this chicanery.
>> Instead we cheat a bit.  We first add a large number to the offset
>> (0x20000000).  The result is still nominally signed, but now it's
>> always positive, and the two high bits are always clear.  We can then
>> set high bits by ordinary addition or subtraction instead of using
>> bitwise operations.  As far as the linker is concerned, all we're
>> doing is adding a large constant to the difference between here (".")
>> and the target, and that's a valid relocation type.
>>
>> In the C code, we just mask off the class bits and subtract 0x20000000
>> to get the offset.
>>
>> ----- end comment -----
>
> But presumably those constants get folded together, so the linker
> is dealing with only one offset.  It doesn't (I assume) know that our
> source code added 0x20000000 and then added/subtracted some
> more.

Yes, indeed.

>
> It looks like we could just use:
> class0: +0x40000000
> class1: +0x80000000 (or subtract ... whatever doesn't make the linker cranky)
> class2: -0x40000000
> class3: don't add/subtract anything
>
> ex_class() stays the same (just looks at bit31/bit30)
> ex_fixup_addr() has to use ex_class() to decide what to add/subtract
> (if anything).
>
> Would that work?  Would it be more or less confusing?

That probably works, but to me, at least, it's a bit more confusing.
It also means that you need a table or some branches to compute the
offset, whereas the "mask top two bits and add a constant" approach is
straightforward, short, and fast.

Also, I'm not 100% convinced that the 0x80000000 case can ever work
reliably.  I don't know exactly what the condition that triggers the
warning is, but the logical one would be to warn if the actual offset
plus or minus the addend, as appropriate, overflows in a signed sense.
Whether it overflows depends on the sign of the offset, and *that*
depends on the actual layout of all the sections.

Mine avoids this issue by being shifted by 0x20000000, so nothing ends
up right on the edge.

--Andy



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 44+ messages in thread
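[To put numbers on the "edge" argument, the userspace snippet below prints the
encoded ranges, assuming, as the scheme does, that the fixup target stays
within +/-512MB of the extable entry, i.e. |target - here| < 0x20000000. The
program and its names are purely illustrative.]

#include <stdio.h>

int main(void)
{
	int lo = -0x1fffffff, hi = 0x1fffffff;	/* assumed offset range */

	/* one +0x20000000 bias for every class: bits 31:30 always end up clear */
	printf("+0x20000000: 0x%08x ... 0x%08x\n",
	       (unsigned int)(lo + 0x20000000), (unsigned int)(hi + 0x20000000));
	/* prints 0x00000001 ... 0x3fffffff */

	/* a flat +0x80000000 bias straddles the signed 32-bit boundary */
	printf("+0x80000000: 0x%08x ... 0x%08x\n",
	       (unsigned int)lo + 0x80000000u, (unsigned int)hi + 0x80000000u);
	/* prints 0x60000001 ... 0x9fffffff: positive offsets land above
	 * 0x7fffffff and negative ones below it, so whether the 32-bit
	 * signed value overflows really does depend on the sign of the
	 * offset, as noted above */

	return 0;
}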

* Re: [PATCH v6 2/4] x86: Cleanup and add a new exception class
  2016-01-04 17:00       ` Luck, Tony
  (?)
@ 2016-01-04 20:32       ` Borislav Petkov
  2016-01-04 22:23           ` Andy Lutomirski
  -1 siblings, 1 reply; 44+ messages in thread
From: Borislav Petkov @ 2016-01-04 20:32 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Ingo Molnar, Andrew Morton, Andy Lutomirski, Williams, Dan J,
	elliott, linux-kernel, linux-mm, linux-nvdimm, x86

On Mon, Jan 04, 2016 at 05:00:04PM +0000, Luck, Tony wrote:
> > So you're touching those again in patch 2. Why not add those defines to
> > patch 1 directly and diminish the churn?
> 
> To preserve authorship. Andy did patch 1 (the clever part). Patch 2 is just syntactic
> sugar on top of it.

That you can do in the way Ingo suggested.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 18:08         ` Andy Lutomirski
@ 2016-01-04 21:02           ` Borislav Petkov
  -1 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2016-01-04 21:02 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 04, 2016 at 10:08:43AM -0800, Andy Lutomirski wrote:
> All of that's correct, including the part where it's confusing.  The
> comments aren't the best.
> 
> How about adding a comment like:
> 
> ----- begin comment -----
> 
> The offset to the fixup is signed, and we're trying to use the high
> bits for a different purpose.  In C, we could just do:
> 
> u32 class_and_offset = ((target - here) & 0x3fffffff) | class;
> 
> Then, to decode it, we'd mask off the class and sign-extend to recover
> the offset.
> 
> In asm, we can't do that, because this all gets laundered through the
> linker, and there's no relocation type that supports this chicanery.
> Instead we cheat a bit.  We first add a large number to the offset
> (0x20000000).  The result is still nominally signed, but now it's
> always positive, and the two high bits are always clear.  We can then
> set high bits by ordinary addition or subtraction instead of using
> bitwise operations.  As far as the linker is concerned, all we're
> doing is adding a large constant to the difference between here (".")
> and the target, and that's a valid relocation type.
> 
> In the C code, we just mask off the class bits and subtract 0x20000000
> to get the offset.
> 
> ----- end comment -----

Yeah, that makes more sense, thanks.

That nasty "." current-position thing gets in the way of doing it cleanly. :-)

Anyway, ok, I see it now. It still feels a bit hacky to me. I probably
would've added a third int to the exception table entry instead. That
would've been much more straightforward and clean, and I'd gladly pay
the additional 6K of growth.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 44+ messages in thread
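[For comparison, the wider entry Boris alludes to might look something like
the sketch below; the third field's name is invented, and no exact layout was
posted in the thread.]

struct exception_table_entry {
	int insn;	/* offset from this field to the faulting instruction  */
	int fixup;	/* offset from this field to the fixup target          */
	int class;	/* handler class stored as a plain value, no bit games */
};

Each entry would grow from 8 to 12 bytes, which is presumably where the
"additional 6K" of __ex_table growth comes from.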

* Re: [PATCH v6 2/4] x86: Cleanup and add a new exception class
  2016-01-04 20:32       ` Borislav Petkov
@ 2016-01-04 22:23           ` Andy Lutomirski
  0 siblings, 0 replies; 44+ messages in thread
From: Andy Lutomirski @ 2016-01-04 22:23 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Luck, Tony, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Williams, Dan J, elliott, linux-kernel, linux-mm, linux-nvdimm,
	x86

On Mon, Jan 4, 2016 at 12:32 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jan 04, 2016 at 05:00:04PM +0000, Luck, Tony wrote:
>> > So you're touching those again in patch 2. Why not add those defines to
>> > patch 1 directly and diminish the churn?
>>
>> To preserve authorship. Andy did patch 1 (the clever part). Patch 2 is just syntactic
>> sugar on top of it.
>
> That you can do in the way Ingo suggested.

I also personally don't care that much.  You're welcome to modify my patch.

If you modify it so much that it's mostly your patch, then change the
From: string and credit me.  If not, leave the author as me and make a
note in the log message.

--Andy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 21:02           ` Borislav Petkov
@ 2016-01-04 22:29             ` Andy Lutomirski
  -1 siblings, 0 replies; 44+ messages in thread
From: Andy Lutomirski @ 2016-01-04 22:29 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 4, 2016 at 1:02 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jan 04, 2016 at 10:08:43AM -0800, Andy Lutomirski wrote:
>> All of that's correct, including the part where it's confusing.  The
>> comments aren't the best.
>>
>> How about adding a comment like:
>>
>> ----- begin comment -----
>>
>> The offset to the fixup is signed, and we're trying to use the high
>> bits for a different purpose.  In C, we could just do:
>>
>> u32 class_and_offset = ((target - here) & 0x3fffffff) | class;
>>
>> Then, to decode it, we'd mask off the class and sign-extend to recover
>> the offset.
>>
>> In asm, we can't do that, because this all gets laundered through the
>> linker, and there's no relocation type that supports this chicanery.
>> Instead we cheat a bit.  We first add a large number to the offset
>> (0x20000000).  The result is still nominally signed, but now it's
>> always positive, and the two high bits are always clear.  We can then
>> set high bits by ordinary addition or subtraction instead of using
>> bitwise operations.  As far as the linker is concerned, all we're
>> doing is adding a large constant to the difference between here (".")
>> and the target, and that's a valid relocation type.
>>
>> In the C code, we just mask off the class bits and subtract 0x20000000
>> to get the offset.
>>
>> ----- end comment -----
>
> Yeah, that makes more sense, thanks.
>
> That nasty "." current position thing stays in the way to do it cleanly. :-)
>
> Anyway, ok, I see it now. It still feels a bit hacky to me. I probably
> would've added the third int to the exception table instead. It would've
> been much more straightforward and clean this way and I'd gladly pay the
> additional 6K growth.

Josh will argue with you if he sees that :)

We could maybe come up with a way to compress the table and get that
space and more back, but maybe that should be a follow-up that someone
else can do if they're inspired.

--Andy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 22:29             ` Andy Lutomirski
@ 2016-01-04 23:02               ` Borislav Petkov
  -1 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2016-01-04 23:02 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 04, 2016 at 02:29:09PM -0800, Andy Lutomirski wrote:
> Josh will argue with you if he sees that :)

Except Josh doesn't need allyesconfigs. tinyconfig's __ex_table is 2K.

> We could maybe come up with a way to compress the table and get that
> space and more back, but maybe that should be a follow-up that someone
> else can do if they're inspired.

Yeah, one needs to look after one's TODO list. :-)

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 23:02               ` Borislav Petkov
@ 2016-01-04 23:04                 ` Borislav Petkov
  -1 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2016-01-04 23:04 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Tue, Jan 05, 2016 at 12:02:46AM +0100, Borislav Petkov wrote:
> Except Josh doesn't need allyesconfigs. tinyconfig's __ex_table is 2K.

Besides I just saved him 1.5K:

https://lkml.kernel.org/r/1449481182-27541-1-git-send-email-bp@alien8.de

:-)

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 18:08         ` Andy Lutomirski
@ 2016-01-04 23:11           ` Tony Luck
  -1 siblings, 0 replies; 44+ messages in thread
From: Tony Luck @ 2016-01-04 23:11 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 4, 2016 at 10:08 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Mon, Jan 4, 2016 at 9:26 AM, Tony Luck <tony.luck@gmail.com> wrote:
>> On Mon, Jan 4, 2016 at 4:07 AM, Borislav Petkov <bp@alien8.de> wrote:

>>> Why not simply:
>>>
>>>         .long (to) - . + (bias) ;
>>>
>>> and
>>>
>>>         " .long (" #to ") - . + "(" #bias ") "\n"
>>>
>>> below and get rid of that _EXPAND_EXTABLE_BIAS()?
>>
>> Andy - this part is your code and I'm not sure what the trick is here.
>
> I don't remember.  I think it was just some preprocessor crud to force
> all the macros to expand fully before the assembler sees it.  If it
> builds without it, feel free to delete it.

The trick is definitely needed in the case of

# define _EXPAND_EXTABLE_BIAS(x) #x

Trying to expand it inline and get rid of the macro led to a
horrible failure. The __ASSEMBLY__ case, where the macro does
nothing, isn't required ... but it does provide a certain amount
of symmetry when looking at the two versions of _ASM_EXTABLE_CLASS.

-Tony

^ permalink raw reply	[flat|nested] 44+ messages in thread
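[The failure mode is the usual one with the '#' operator: it stringizes its
argument before macro expansion, so stringizing the bias parameter directly
would presumably emit the literal token "_EXTABLE_BIAS_EX" into the asm string
instead of the number. A self-contained demo with made-up names; roughly,
STR_EXPAND plays the role of _ASM_EXTABLE_CLASS and STR_DIRECT the role of
_EXPAND_EXTABLE_BIAS.]

#include <stdio.h>

#define BIAS		0x20000000 - 0x80000000

#define STR_DIRECT(x)	#x		/* stringizes the argument as written    */
#define STR_EXPAND(x)	STR_DIRECT(x)	/* expands the argument, then stringizes */

int main(void)
{
	puts(STR_DIRECT(BIAS));	/* prints: BIAS                    */
	puts(STR_EXPAND(BIAS));	/* prints: 0x20000000 - 0x80000000 */
	return 0;
}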

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 23:02               ` Borislav Petkov
@ 2016-01-04 23:25                 ` Andy Lutomirski
  -1 siblings, 0 replies; 44+ messages in thread
From: Andy Lutomirski @ 2016-01-04 23:25 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 4, 2016 at 3:02 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jan 04, 2016 at 02:29:09PM -0800, Andy Lutomirski wrote:
>> Josh will argue with you if he sees that :)
>
> Except Josh doesn't need allyesconfigs. tinyconfig's __ex_table is 2K.

If we do the make-it-bigger approach, we get a really nice
simplification.  Screw the whole 'class' idea -- just store an offset
to a handler.

bool extable_handler_default(struct pt_regs *regs, unsigned int fault,
                             unsigned long error_code, unsigned long info)
{
    if (fault == X86_TRAP_MC)
        return false;

    ...
}

bool extable_handler_mc_copy(struct pt_regs *regs, unsigned int fault,
                             unsigned long error_code, unsigned long info);
bool extable_handler_getput_ex(struct pt_regs *regs, unsigned int fault,
                               unsigned long error_code, unsigned long info);

and then shove ".long extable_handler_whatever - ." into the extable entry.

Major bonus points to whoever can figure out how to make
extable_handler_iret work -- the current implementation of that is a
real turd.  (Hint: it's not clear to me that it's even possible
without preserving at least part of the asm special case.)

--Andy

^ permalink raw reply	[flat|nested] 44+ messages in thread
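[To sketch how the lookup side of that could work: the field, type and
function names below are hypothetical, whether the entry keeps a separate
fixup offset is left open, and none of this is code from the series.]

#include <stdbool.h>

struct pt_regs;		/* opaque here; the real one lives in the kernel */

typedef bool (*ex_handler_t)(struct pt_regs *regs, unsigned int fault,
			     unsigned long error_code, unsigned long info);

struct exception_table_entry {
	int insn;	/* offset from this field to the faulting instruction  */
	int handler;	/* offset from this field to an extable_handler_* func */
};

static bool dispatch_fixup(const struct exception_table_entry *e,
			   struct pt_regs *regs, unsigned int fault,
			   unsigned long error_code, unsigned long info)
{
	/* undo the ".long extable_handler_whatever - ." encoding */
	ex_handler_t fn = (ex_handler_t)((unsigned long)&e->handler + e->handler);

	return fn(regs, fault, error_code, info);
}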

* Re: [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit)
  2016-01-04 23:25                 ` Andy Lutomirski
@ 2016-01-05 11:20                   ` Borislav Petkov
  -1 siblings, 0 replies; 44+ messages in thread
From: Borislav Petkov @ 2016-01-05 11:20 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Tony Luck, Ingo Molnar, Andrew Morton, Andy Lutomirski,
	Dan Williams, Robert, Linux Kernel Mailing List, linux-mm,
	linux-nvdimm, X86-ML

On Mon, Jan 04, 2016 at 03:25:58PM -0800, Andy Lutomirski wrote:
> On Mon, Jan 4, 2016 at 3:02 PM, Borislav Petkov <bp@alien8.de> wrote:
> > On Mon, Jan 04, 2016 at 02:29:09PM -0800, Andy Lutomirski wrote:
> >> Josh will argue with you if he sees that :)
> >
> > Except Josh doesn't need allyesconfigs. tinyconfig's __ex_table is 2K.
> 
> If we do the make-it-bigger approach, we get a really nice
> simplification.  Screw the whole 'class' idea -- just store an offset
> to a handler.
> 
> bool extable_handler_default(struct pt_regs *regs, unsigned int fault,
>                              unsigned long error_code, unsigned long info)
> {
>     if (fault == X86_TRAP_MC)
>         return false;
>
>     ...
> }
>
> bool extable_handler_mc_copy(struct pt_regs *regs, unsigned int fault,
>                              unsigned long error_code, unsigned long info);
> bool extable_handler_getput_ex(struct pt_regs *regs, unsigned int fault,
>                                unsigned long error_code, unsigned long info);
> 
> and then shove ".long extable_handler_whatever - ." into the extable entry.

And to make it even cooler and more generic, you can make the exception
table entry look like this:

{ <offset to fault address>, <offset to handler>, <offset to an opaque pointer> }

and that opaque pointer would be a void * to some struct that we pass to
the handler, filled with whatever it needs. For starters, it would
contain the return address where the fixup wants us to go.

The struct will have to be statically allocated but ok, that's fine.

And this way you can do all the sophisticated handling you desire.

> Major bonus points to whoever can figure out how to make
> extable_handler_iret work -- the current implementation of that is a
> real turd.  (Hint: it's not clear to me that it's even possible
> without preserving at least part of the asm special case.)

What's extable_handler_iret? IRET-ting from an exception? Where do we do
that?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 44+ messages in thread
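[A sketch of that three-offset layout, with invented names; nothing here is
taken from posted code.]

struct extable_fixup_data {
	unsigned long fixup_ip;		/* where the fixup wants us to go */
	/* ... whatever else a particular handler needs ... */
};

struct exception_table_entry {
	int insn;	/* offset from this field to the faulting address        */
	int handler;	/* offset from this field to the handler function        */
	int data;	/* offset from this field to a static extable_fixup_data */
};

static inline struct extable_fixup_data *
ex_data(const struct exception_table_entry *e)
{
	/* same relative-offset trick as for the other two fields */
	return (struct extable_fixup_data *)((unsigned long)&e->data + e->data);
}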

end of thread, other threads:[~2016-01-05 11:20 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-04  1:02 [PATCH v6 0/4] Machine check recovery when kernel accesses poison Tony Luck
2016-01-04  1:02 ` Tony Luck
2015-12-30 17:59 ` [PATCH v6 1/4] x86: Clean up extable entry format (and free up a bit) Andy Lutomirski
2015-12-30 17:59   ` Andy Lutomirski
2016-01-04  1:37   ` Tony Luck
2016-01-04  1:37     ` Tony Luck
2016-01-04  7:49     ` Ingo Molnar
2016-01-04  7:49       ` Ingo Molnar
2016-01-04 12:07   ` Borislav Petkov
2016-01-04 12:07     ` Borislav Petkov
2016-01-04 17:26     ` Tony Luck
2016-01-04 17:26       ` Tony Luck
2016-01-04 18:08       ` Andy Lutomirski
2016-01-04 18:08         ` Andy Lutomirski
2016-01-04 18:59         ` Tony Luck
2016-01-04 18:59           ` Tony Luck
2016-01-04 19:05           ` Andy Lutomirski
2016-01-04 19:05             ` Andy Lutomirski
2016-01-04 21:02         ` Borislav Petkov
2016-01-04 21:02           ` Borislav Petkov
2016-01-04 22:29           ` Andy Lutomirski
2016-01-04 22:29             ` Andy Lutomirski
2016-01-04 23:02             ` Borislav Petkov
2016-01-04 23:02               ` Borislav Petkov
2016-01-04 23:04               ` Borislav Petkov
2016-01-04 23:04                 ` Borislav Petkov
2016-01-04 23:25               ` Andy Lutomirski
2016-01-04 23:25                 ` Andy Lutomirski
2016-01-05 11:20                 ` Borislav Petkov
2016-01-05 11:20                   ` Borislav Petkov
2016-01-04 23:11         ` Tony Luck
2016-01-04 23:11           ` Tony Luck
2015-12-30 18:56 ` [PATCH v6 2/4] x86: Cleanup and add a new exception class Tony Luck
2015-12-30 18:56   ` Tony Luck
2016-01-04 14:22   ` Borislav Petkov
2016-01-04 17:00     ` Luck, Tony
2016-01-04 17:00       ` Luck, Tony
2016-01-04 20:32       ` Borislav Petkov
2016-01-04 22:23         ` Andy Lutomirski
2016-01-04 22:23           ` Andy Lutomirski
2015-12-31 19:40 ` [PATCH v6 3/4] x86, mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Tony Luck
2015-12-31 19:40   ` Tony Luck
2015-12-31 19:43 ` [PATCH v6 4/4] x86, mce: Add __mcsafe_copy() Tony Luck
2015-12-31 19:43   ` Tony Luck
