* [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes
@ 2011-12-22  3:13 Steven Rostedt
  2011-12-22  3:13 ` [PATCH 1/6 v3] x86: Do not schedule while still in NMI context Steven Rostedt
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: Steven Rostedt @ 2011-12-22  3:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen



Ingo,

I updated the patches to reflect your comments and ran them through
all my tests again. Below is the full diff between v2 and v3.

I renamed nmi_preprocess() and nmi_postprocess() to
nmi_nesting_preprocess() and nmi_nesting_postprocess() to at least
note that this has to do with nmi nesting. I also commented the
fact that i386 may loop back to the preprocess.

I had to move the debug_usage functions to debugreg.h, because
smp.h includes processor.h where making the inc/dec inline
caused include header issues.

-- Steve

Please pull the latest tip/x86/core-3 tree, which can be found at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
tip/x86/core-3

Head SHA1: 42181186ad4db986fcaa40ca95c6e407e9e79372


Linus Torvalds (1):
      x86: Do not schedule while still in NMI context

Steven Rostedt (5):
      x86: Document the NMI handler about not using paranoid_exit
      x86: Add workaround to NMI iret woes
      x86: Keep current stack in NMI breakpoints
      x86: Allow NMIs to hit breakpoints in i386
      x86: Add counter when debug stack is used with interrupts enabled

----
 arch/x86/include/asm/debugreg.h |   22 ++++
 arch/x86/include/asm/desc.h     |   12 ++
 arch/x86/kernel/cpu/common.c    |   24 +++++
 arch/x86/kernel/entry_64.S      |  218 +++++++++++++++++++++++++++++++++------
 arch/x86/kernel/head_64.S       |    4 +
 arch/x86/kernel/nmi.c           |  102 ++++++++++++++++++
 arch/x86/kernel/traps.c         |   20 ++++
 7 files changed, 369 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0c..b903d5e 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
 
 extern void hw_breakpoint_restore(void);
 
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+	__get_cpu_var(debug_stack_usage)++;
+}
+static inline void debug_stack_usage_dec(void)
+{
+	__get_cpu_var(debug_stack_usage)--;
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+
 #endif	/* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2fef5ba..b650435 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,11 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
-void inc_debug_stack_usage(void);
-void dec_debug_stack_usage(void);
-int is_debug_stack(unsigned long addr);
-void zero_debug_stack(void);
-void reset_debug_stack(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -421,11 +416,6 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
-static inline int is_debug_stack(unsigned long addr) { return 0; }
-static inline void inc_debug_stack_usage(void) { }
-static inline void dec_debug_stack_usage(void) { }
-static inline void zero_debug_stack(void) { }
-static inline void reset_debug_stack(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f1ec612..266e464 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1093,17 +1093,7 @@ unsigned long kernel_eflags;
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
 static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
-static DEFINE_PER_CPU(int, debug_stack_usage);
-
-void inc_debug_stack_usage(void)
-{
-	__get_cpu_var(debug_stack_usage)++;
-}
-
-void dec_debug_stack_usage(void)
-{
-	__get_cpu_var(debug_stack_usage)--;
-}
+DEFINE_PER_CPU(int, debug_stack_usage);
 
 int is_debug_stack(unsigned long addr)
 {
@@ -1112,12 +1102,12 @@ int is_debug_stack(unsigned long addr)
 		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
 }
 
-void zero_debug_stack(void)
+void debug_stack_set_zero(void)
 {
 	load_idt((const struct desc_ptr *)&nmi_idt_descr);
 }
 
-void reset_debug_stack(void)
+void debug_stack_reset(void)
 {
 	load_idt((const struct desc_ptr *)&idt_descr);
 }
@@ -1240,7 +1230,7 @@ void __cpuinit cpu_init(void)
 			estacks += exception_stack_sizes[v];
 			oist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
-			if (v == DEBUG_STACK - 1)
+			if (v == DEBUG_STACK-1)
 				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
 		}
 	}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b2dea00..b62aa29 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1560,14 +1560,16 @@ nested_nmi:
 	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
 	leaq -6*8(%rsp), %rdx
 	movq %rdx, %rsp
-	pushq $__KERNEL_DS
-	pushq %rdx
-	pushfq
-	pushq $__KERNEL_CS
+	CFI_ADJUST_CFA_OFFSET 6*8
+	pushq_cfi $__KERNEL_DS
+	pushq_cfi %rdx
+	pushfq_cfi
+	pushq_cfi $__KERNEL_CS
 	pushq_cfi $repeat_nmi
 
 	/* Put stack back */
 	addq $(11*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -11*8
 
 nested_nmi_out:
 	popq_cfi %rdx
@@ -1578,7 +1580,7 @@ nested_nmi_out:
 first_nmi:
 	/*
 	 * Because nested NMIs will use the pushed location that we
-	 * stored rdx, we must keep that space available.
+	 * stored in rdx, we must keep that space available.
 	 * Here's what our stack frame will look like:
 	 * +-------------------------+
 	 * | original SS             |
@@ -1661,16 +1663,24 @@ nmi_restore:
 	CFI_ENDPROC
 END(nmi)
 
+	/*
+	 * If an NMI hit an iret because of an exception or breakpoint,
+	 * it can lose its NMI context, and a nested NMI may come in.
+	 * In that case, the nested NMI will change the preempted NMI's
+	 * stack to jump to here when it does the final iret.
+	 */
 repeat_nmi:
+	INTR_FRAME
 	/* Update the stack variable to say we are still in NMI */
 	movq $1, 5*8(%rsp)
 
 	/* copy the saved stack back to copy stack */
 	.rept 5
-	pushq 4*8(%rsp)
+	pushq_cfi 4*8(%rsp)
 	.endr
 
 	jmp restart_nmi
+	CFI_ENDPROC
 end_repeat_nmi:
 
 ENTRY(ignore_sysret)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 3cb659c..47acaf3 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -442,7 +442,7 @@ enum nmi_states {
 };
 static DEFINE_PER_CPU(enum nmi_states, nmi_state);
 
-#define nmi_preprocess(regs)						\
+#define nmi_nesting_preprocess(regs)					\
 	do {								\
 		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
 			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
@@ -452,7 +452,7 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state);
 		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
 	} while (0)
 
-#define nmi_postprocess()						\
+#define nmi_nesting_postprocess()					\
 	do {								\
 		if (cmpxchg(&__get_cpu_var(nmi_state),			\
 		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
@@ -481,7 +481,7 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state);
  */
 static DEFINE_PER_CPU(int, update_debug_stack);
 
-static inline void nmi_preprocess(struct pt_regs *regs)
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
 {
 	/*
 	 * If we interrupted a breakpoint, it is possible that
@@ -490,22 +490,22 @@ static inline void nmi_preprocess(struct pt_regs *regs)
 	 * continue to use the NMI stack.
 	 */
 	if (unlikely(is_debug_stack(regs->sp))) {
-		zero_debug_stack();
+		debug_stack_set_zero();
 		__get_cpu_var(update_debug_stack) = 1;
 	}
 }
 
-static inline void nmi_postprocess(void)
+static inline void nmi_nesting_postprocess(void)
 {
 	if (unlikely(__get_cpu_var(update_debug_stack)))
-		reset_debug_stack();
+		debug_stack_reset();
 }
 #endif
 
 dotraplinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
-	nmi_preprocess(regs);
+	nmi_nesting_preprocess(regs);
 
 	nmi_enter();
 
@@ -516,7 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 	nmi_exit();
 
-	nmi_postprocess();
+	/* On i386, may loop back to preprocess */
+	nmi_nesting_postprocess();
 }
 
 void stop_nmi(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d2510e7..0072b38 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -320,11 +320,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 	 * Let others (NMI) know that the debug stack is in use
 	 * as we may switch to the interrupt stack.
 	 */
-	inc_debug_stack_usage();
+	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
-	dec_debug_stack_usage();
+	debug_stack_usage_dec();
 }
 
 #ifdef CONFIG_X86_64
@@ -421,7 +421,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	 * Let others (NMI) know that the debug stack is in use
 	 * as we may switch to the interrupt stack.
 	 */
-	inc_debug_stack_usage();
+	debug_stack_usage_inc();
 
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
@@ -430,7 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 		handle_vm86_trap((struct kernel_vm86_regs *) regs,
 				error_code, 1);
 		preempt_conditional_cli(regs);
-		dec_debug_stack_usage();
+		debug_stack_usage_dec();
 		return;
 	}
 
@@ -450,7 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
-	dec_debug_stack_usage();
+	debug_stack_usage_dec();
 
 	return;
 }


* [PATCH 1/6 v3] x86: Do not schedule while still in NMI context
  2011-12-22  3:13 [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Steven Rostedt
@ 2011-12-22  3:13 ` Steven Rostedt
  2011-12-22  3:13 ` [PATCH 2/6 v3] x86: Document the NMI handler about not using paranoid_exit Steven Rostedt
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Steven Rostedt @ 2011-12-22  3:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen, Andi Kleen, Ingo Molnar,
	H. Peter Anvin, Paul Turner


From: Linus Torvalds <torvalds@linux-foundation.org>

The NMI handler uses the paranoid_exit routine that checks the
NEED_RESCHED flag, and if it is set and the return is for userspace,
then interrupts are enabled, the stack is swapped to the thread's stack,
and schedule is called. The problem with this is that we are still in an
NMI context until an iret is executed. This means that any new NMIs are
now starved until an interrupt or exception occurs and does the iret.

As NMIs cannot be masked and can interrupt any location, they are
treated as a special case. NEED_RESCHED should not be set in an NMI
handler. The interruption by the NMI should not disturb the work flow
for scheduling. Any IPI sent to a processor after setting NEED_RESCHED
would have to wait for the NMI anyway, and after the IPI finishes,
schedule will be called as required.

There is no reason to do anything special when leaving an NMI. Remove
the call to paranoid_exit and do a simple return. This not only fixes
the bug of starved NMIs, but it also cleans up the code.

Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com

Acked-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S |   32 --------------------------------
 1 files changed, 0 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e..3819ea9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1489,46 +1489,14 @@ ENTRY(nmi)
 	movq %rsp,%rdi
 	movq $-1,%rsi
 	call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* paranoidexit; without TRACE_IRQS_OFF */
-	/* ebx:	no swapgs flag */
-	DISABLE_INTERRUPTS(CLBR_NONE)
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
-	testl $3,CS(%rsp)
-	jnz nmi_userspace
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
 	jmp irq_return
-nmi_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz nmi_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz nmi_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl %esi,%esi 			/* arg2: oldset */
-	movq %rsp,%rdi 			/* arg1: &pt_regs */
-	call do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	jmp nmi_userspace
-nmi_schedule:
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	call schedule
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	jmp nmi_userspace
 	CFI_ENDPROC
-#else
-	jmp paranoid_exit
-	CFI_ENDPROC
-#endif
 END(nmi)
 
 ENTRY(ignore_sysret)
-- 
1.7.7.3




* [PATCH 2/6 v3] x86: Document the NMI handler about not using paranoid_exit
  2011-12-22  3:13 [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Steven Rostedt
  2011-12-22  3:13 ` [PATCH 1/6 v3] x86: Do not schedule while still in NMI context Steven Rostedt
@ 2011-12-22  3:13 ` Steven Rostedt
  2011-12-22  3:14 ` [PATCH 3/6 v3] x86: Add workaround to NMI iret woes Steven Rostedt
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Steven Rostedt @ 2011-12-22  3:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen


From: Steven Rostedt <srostedt@redhat.com>

Linus cleaned up the NMI handler but it still needs some comments to
explain why it uses save_paranoid but not paranoid_exit. Document why
paranoid_exit is not used, to keep others from adding it back in the
future.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3819ea9..d1d5434 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1480,9 +1480,16 @@ END(error_exit)
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1
+	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	/*
+	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+	 * as we should not be calling schedule in NMI context.
+	 * Even with normal interrupts enabled. An NMI should not be
+	 * setting NEED_RESCHED or anything that normal interrupts and
+	 * exceptions might do.
+	 */
 	call save_paranoid
 	DEFAULT_FRAME 0
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
-- 
1.7.7.3




* [PATCH 3/6 v3] x86: Add workaround to NMI iret woes
  2011-12-22  3:13 [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Steven Rostedt
  2011-12-22  3:13 ` [PATCH 1/6 v3] x86: Do not schedule while still in NMI context Steven Rostedt
  2011-12-22  3:13 ` [PATCH 2/6 v3] x86: Document the NMI handler about not using paranoid_exit Steven Rostedt
@ 2011-12-22  3:14 ` Steven Rostedt
  2011-12-22  3:14 ` [PATCH 4/6 v3] x86: Keep current stack in NMI breakpoints Steven Rostedt
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Steven Rostedt @ 2011-12-22  3:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen, H. Peter Anvin, Paul Turner


From: Steven Rostedt <srostedt@redhat.com>

In x86, when an NMI goes off, the CPU goes into an NMI context that
prevents other NMIs from triggering on that CPU. If an NMI is supposed
to trigger, it has to wait until the previous NMI leaves NMI context.
At that time, the next NMI can trigger (note, only one more NMI will
trigger, as only one can be latched at a time).

The way x86 gets out of NMI context is by calling iret. This becomes
a problem if the NMI handler triggers either an exception or a
breakpoint, since both the exception and the breakpoint handlers
finish with an iret. If that iret happens while still in NMI context,
the CPU leaves NMI context and a new NMI may come in. As NMI handlers
are not made to be re-entrant, this can cause havoc with the system,
not to mention that the nested NMI will write all over the previous
NMI's stack.

Linus Torvalds proposed the following workaround to this problem:

https://lkml.org/lkml/2010/7/14/264

"In fact, I wonder if we couldn't just do a software NMI disable
instead? Have a per-cpu variable (in the _core_ percpu areas that get
allocated statically) that points to the NMI stack frame, and just
make the NMI code itself do something like

 NMI entry:
 - load percpu NMI stack frame pointer
 - if non-zero we know we're nested, and should ignore this NMI:
    - we're returning to kernel mode, so return immediately by using
"popf/ret", which also keeps NMI's disabled in the hardware until the
"real" NMI iret happens.
    - before the popf/iret, use the NMI stack pointer to make the NMI
return stack be invalid and cause a fault
  - set the NMI stack pointer to the current stack pointer

 NMI exit (not the above "immediate exit because we nested"):
   clear the percpu NMI stack pointer
   Just do the iret.

Now, the thing is, now the "iret" is atomic. If we had a nested NMI,
we'll take a fault, and that re-does our "delayed" NMI - and NMI's
will stay masked.

And if we didn't have a nested NMI, that iret will now unmask NMI's,
and everything is happy."

I first tried to follow this advice but as I started implementing this
code, a few gotchas showed up.

One is accessing per-cpu variables in the NMI handler.

The problem is that per-cpu variables use the %gs register to get the
variable for the given CPU. But as the NMI may happen in userspace,
we must first perform a SWAPGS to get to it. The NMI handler already
does this later in the code, but by then it is too late: we have saved
off all the registers, and we don't want to do that work for a
disabled NMI.

Peter Zijlstra suggested keeping all variables on the stack. This
simplifies things greatly and has the added benefit of cache locality.

Two is faulting on the iret.

I really wanted to make this work, but it was becoming very hacky, and
I never got it to be stable. The iret already had a fault handler for
userspace faulting with bad segment registers, and getting the NMI to
trigger a fault and detect it was very tricky. But for strange reasons,
the system would usually take a double fault and crash. I never figured
out why and decided to go with a simple "jmp" approach. The new approach
I took also simplified things.

Finally, the last problem with Linus's approach was having the nested
NMI handler do a ret instead of an iret, to give the first NMI its
NMI context back.

The problem is that ret is much more limited than an iret. I couldn't
figure out how to get the stack back to where it belonged. I could have
copied the current stack and pushed the return address onto it, but my
fear here is that there may be some place that writes data below the
stack pointer. I know that is not something code should depend on, but
I don't want to chance it. I may add this feature later, but for now,
an NMI handler that loses NMI context will not get it back.

Here's what is done:

When an NMI comes in, the HW pushes the interrupt stack frame onto the
per cpu NMI stack that is selected by the IST.

A special location on the NMI stack holds a variable that is set when
the first NMI handler runs. If this variable is set then we know that
this is a nested NMI and we process the nested NMI code.

There is still a race window: the variable is cleared just before the
first NMI does its return, and a new NMI may come in during that
window. To handle this case, even if the variable is cleared, we also
check whether the interrupted stack is the NMI stack. If it is, then
we process the nested NMI code.

Why the two tests and not just test the interrupted stack?

Suppose the first NMI hits a breakpoint and loses NMI context, then
hits another breakpoint, and while processing that breakpoint we get a
nested NMI. When processing a breakpoint, the stack switches to the
breakpoint stack, so if another NMI comes in here we can't rely on the
interrupted stack being the NMI stack.

If the variable is not set and the interrupted task's stack is not the
NMI stack, then we know this is the first NMI and we can process things
normally. But in order to do so, we need to do a few things first.

1) Set the stack variable that tells us that we are in an NMI handler

2) Make two copies of the interrupt stack frame.
   One copy is used to return on iret
   The other is used to restore the first one if we have a nested NMI.

This is what the stack will look like:

	  +-------------------------+
	  | original SS             |
	  | original Return RSP     |
	  | original RFLAGS         |
	  | original CS             |
	  | original RIP            |
	  +-------------------------+
	  | temp storage for rdx    |
	  +-------------------------+
	  | NMI executing variable  |
	  +-------------------------+
	  | Saved SS                |
	  | Saved Return RSP        |
	  | Saved RFLAGS            |
	  | Saved CS                |
	  | Saved RIP               |
	  +-------------------------+
	  | copied SS               |
	  | copied Return RSP       |
	  | copied RFLAGS           |
	  | copied CS               |
	  | copied RIP              |
	  +-------------------------+
	  | pt_regs                 |
	  +-------------------------+

The original stack frame contains what the HW put in when we entered
the NMI.

We store %rdx as a temp variable to use. Both the original HW stack
frame and this %rdx storage will be clobbered by nested NMIs so we
can not rely on them later in the first NMI handler.

The next item is the special stack variable that is set when we execute
the rest of the NMI handler.

Then we have two copies of the interrupt stack. The second copy is
modified by any nested NMIs to let the first NMI know that we triggered
a second NMI (latched) and that we should repeat the NMI handler.

If the first NMI hits an exception or breakpoint that takes it out of
NMI context, and a second NMI comes in before the first one finishes,
the second NMI will update the copied interrupt stack to point to a
fixup location, to trigger another run of the NMI handler.

When the first NMI calls iret, it will instead jump to the fixup
location. This fixup location will copy the saved interrupt stack back
to the copy and execute the NMI handler again.

Note, the nested NMI knows enough to check if it preempted a previous
NMI handler while it is in the fixup location. If it has, it will not
modify the copied interrupt stack and will just leave as if nothing
happened. As the NMI handler is about to execute again, there's no
reason to latch now.
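
The flow that the new entry_64.S code implements can be summed up in a
rough pseudo-C sketch. None of the names below are real kernel symbols;
nmi_executing, saved_frame, copy_frame, stack_is_nmi_stack(),
in_repeat_nmi_fixup(), repeat_nmi_address and iret_from() only stand in
for stack slots, labels and instruction sequences in the assembly:

    struct frame { unsigned long rip, cs, rflags, rsp, ss; };

    /* These live on the per-CPU NMI stack itself, so no SWAPGS or
     * %gs access is needed to reach them. */
    long nmi_executing;
    struct frame saved_frame, copy_frame;

    void nmi_entry(struct frame *hw_frame)
    {
        if (nmi_executing || stack_is_nmi_stack(hw_frame->rsp)) {
            /*
             * Nested NMI: unless we interrupted the repeat_nmi
             * fixup itself, rewrite the copied frame so the first
             * NMI's final iret lands in repeat_nmi, then leave
             * without running the handler.
             */
            if (!in_repeat_nmi_fixup(hw_frame->rip))
                copy_frame.rip = repeat_nmi_address;
            return;                        /* iret */
        }

        /* First NMI */
        nmi_executing = 1;
        saved_frame = *hw_frame;           /* pristine, never modified */
        copy_frame  = saved_frame;         /* the frame we iret from   */
    restart_nmi:
        do_nmi();
        nmi_executing = 0;
        iret_from(&copy_frame);            /* lands in repeat_nmi if a
                                              nested NMI rewrote it    */
        return;

    repeat_nmi:                            /* reached via the rewritten
                                              copy, not by a C goto    */
        nmi_executing = 1;
        copy_frame = saved_frame;
        goto restart_nmi;
    }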

To test all this, I forced the NMI handler to call iret and take itself
out of NMI context. I also added assembly code to write to the serial
port to make sure that it hits the nested path as well as the fixup
path. Everything seems to be working fine.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S |  177 ++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 177 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d1d5434..b62aa29 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1475,11 +1475,166 @@ ENTRY(error_exit)
 	CFI_ENDPROC
 END(error_exit)
 
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+	.macro test_in_nmi reg stack nmi_ret normal_ret
+	cmpq %\reg, \stack
+	ja \normal_ret
+	subq $EXCEPTION_STKSZ, %\reg
+	cmpq %\reg, \stack
+	jb \normal_ret
+	jmp \nmi_ret
+	.endm
 
 	/* runs on exception stack */
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	/*
+	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
+	 * the iretq it performs will take us out of NMI context.
+	 * This means that we can have nested NMIs where the next
+	 * NMI is using the top of the stack of the previous NMI. We
+	 * can't let it execute because the nested NMI will corrupt the
+	 * stack of the previous NMI. NMI handlers are not re-entrant
+	 * anyway.
+	 *
+	 * To handle this case we do the following:
+	 *  Check the a special location on the stack that contains
+	 *  a variable that is set when NMIs are executing.
+	 *  The interrupted task's stack is also checked to see if it
+	 *  is an NMI stack.
+	 *  If the variable is not set and the stack is not the NMI
+	 *  stack then:
+	 *    o Set the special variable on the stack
+	 *    o Copy the interrupt frame into a "saved" location on the stack
+	 *    o Copy the interrupt frame into a "copy" location on the stack
+	 *    o Continue processing the NMI
+	 *  If the variable is set or the previous stack is the NMI stack:
+	 *    o Modify the "copy" location to jump to the repeate_nmi
+	 *    o return back to the first NMI
+	 *
+	 * Now on exit of the first NMI, we first clear the stack variable
+	 * The NMI stack will tell any nested NMIs at that point that it is
+	 * nested. Then we pop the stack normally with iret, and if there was
+	 * a nested NMI that updated the copy interrupt stack frame, a
+	 * jump will be made to the repeat_nmi code that will handle the second
+	 * NMI.
+	 */
+
+	/* Use %rdx as out temp variable throughout */
+	pushq_cfi %rdx
+
+	/*
+	 * Check the special variable on the stack to see if NMIs are
+	 * executing.
+	 */
+	cmp $1, -8(%rsp)
+	je nested_nmi
+
+	/*
+	 * Now test if the previous stack was an NMI stack.
+	 * We need the double check. We check the NMI stack to satisfy the
+	 * race when the first NMI clears the variable before returning.
+	 * We check the variable because the first NMI could be in a
+	 * breakpoint routine using a breakpoint stack.
+	 */
+	lea 6*8(%rsp), %rdx
+	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+	/*
+	 * Do nothing if we interrupted the fixup in repeat_nmi.
+	 * It's about to repeat the NMI handler, so we are fine
+	 * with ignoring this one.
+	 */
+	movq $repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja 1f
+	movq $end_repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja nested_nmi_out
+
+1:
+	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
+	leaq -6*8(%rsp), %rdx
+	movq %rdx, %rsp
+	CFI_ADJUST_CFA_OFFSET 6*8
+	pushq_cfi $__KERNEL_DS
+	pushq_cfi %rdx
+	pushfq_cfi
+	pushq_cfi $__KERNEL_CS
+	pushq_cfi $repeat_nmi
+
+	/* Put stack back */
+	addq $(11*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+	popq_cfi %rdx
+
+	/* No need to check faults here */
+	INTERRUPT_RETURN
+
+first_nmi:
+	/*
+	 * Because nested NMIs will use the pushed location that we
+	 * stored in rdx, we must keep that space available.
+	 * Here's what our stack frame will look like:
+	 * +-------------------------+
+	 * | original SS             |
+	 * | original Return RSP     |
+	 * | original RFLAGS         |
+	 * | original CS             |
+	 * | original RIP            |
+	 * +-------------------------+
+	 * | temp storage for rdx    |
+	 * +-------------------------+
+	 * | NMI executing variable  |
+	 * +-------------------------+
+	 * | Saved SS                |
+	 * | Saved Return RSP        |
+	 * | Saved RFLAGS            |
+	 * | Saved CS                |
+	 * | Saved RIP               |
+	 * +-------------------------+
+	 * | copied SS               |
+	 * | copied Return RSP       |
+	 * | copied RFLAGS           |
+	 * | copied CS               |
+	 * | copied RIP              |
+	 * +-------------------------+
+	 * | pt_regs                 |
+	 * +-------------------------+
+	 *
+	 * The saved RIP is used to fix up the copied RIP that a nested
+	 * NMI may zero out. The original stack frame and the temp storage
+	 * is also used by nested NMIs and can not be trusted on exit.
+	 */
+	/* Set the NMI executing variable on the stack. */
+	pushq_cfi $1
+
+	/* Copy the stack frame to the Saved frame */
+	.rept 5
+	pushq_cfi 6*8(%rsp)
+	.endr
+
+	/* Make another copy, this one may be modified by nested NMIs */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	/* Do not pop rdx, nested NMIs will corrupt it */
+	movq 11*8(%rsp), %rdx
+
+	/*
+	 * Everything below this point can be preempted by a nested
+	 * NMI if the first NMI took an exception. Repeated NMIs
+	 * caused by an exception and nested NMI will start here, and
+	 * can still be preempted by another NMI.
+	 */
+restart_nmi:
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1502,10 +1657,32 @@ nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
+	/* Clear the NMI executing stack variable */
+	movq $0, 10*8(%rsp)
 	jmp irq_return
 	CFI_ENDPROC
 END(nmi)
 
+	/*
+	 * If an NMI hit an iret because of an exception or breakpoint,
+	 * it can lose its NMI context, and a nested NMI may come in.
+	 * In that case, the nested NMI will change the preempted NMI's
+	 * stack to jump to here when it does the final iret.
+	 */
+repeat_nmi:
+	INTR_FRAME
+	/* Update the stack variable to say we are still in NMI */
+	movq $1, 5*8(%rsp)
+
+	/* copy the saved stack back to copy stack */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	jmp restart_nmi
+	CFI_ENDPROC
+end_repeat_nmi:
+
 ENTRY(ignore_sysret)
 	CFI_STARTPROC
 	mov $-ENOSYS,%eax
-- 
1.7.7.3




* [PATCH 4/6 v3] x86: Keep current stack in NMI breakpoints
  2011-12-22  3:13 [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Steven Rostedt
                   ` (2 preceding siblings ...)
  2011-12-22  3:14 ` [PATCH 3/6 v3] x86: Add workaround to NMI iret woes Steven Rostedt
@ 2011-12-22  3:14 ` Steven Rostedt
  2011-12-22  3:14 ` [PATCH 5/6 v3] x86: Allow NMIs to hit breakpoints in i386 Steven Rostedt
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Steven Rostedt @ 2011-12-22  3:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen


From: Steven Rostedt <srostedt@redhat.com>

We want to allow NMI handlers to have breakpoints, to be able to
remove stop_machine from ftrace, kprobes and jump_labels. But if
an NMI interrupts breakpoint processing that is in progress and
then triggers a breakpoint itself, it will switch to the breakpoint
stack and corrupt the data on it for the breakpoint processing that
it interrupted.

Instead, have the NMI check whether it interrupted breakpoint
processing by checking whether the stack currently in use is a
breakpoint stack. If it is, then load a special IDT that changes the
IST for the debug exception, so that the same stack is kept while in
kernel context. When the NMI is done, the original IDT is put back.

This way, if the NMI does trigger a breakpoint, it will keep
using the same stack and not stomp on the breakpoint data for
the breakpoint it interrupted.
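
The change is spread over several files; condensed into a rough
pseudo-C sketch (nothing here beyond what the hunks below add), the
runtime flow is:

    /* trap_init(): build a second IDT whose #DB and #BP gates have
     * IST = 0, i.e. they keep running on whatever stack they hit. */
    memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
    set_nmi_gate(1, &debug);
    set_nmi_gate(3, &int3);

    /* do_nmi(): if the NMI interrupted code on the debug stack,
     * switch to that IDT for the duration of the NMI, so a
     * breakpoint inside the NMI keeps the current stack. */
    if (unlikely(is_debug_stack(regs->sp))) {
        debug_stack_set_zero();      /* load_idt(&nmi_idt_descr) */
        update_debug_stack = 1;
    }
    nmi_enter();
    /* ... handle the NMI ... */
    nmi_exit();
    if (unlikely(update_debug_stack))
        debug_stack_reset();         /* load_idt(&idt_descr) */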

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/desc.h      |   12 ++++++++++++
 arch/x86/include/asm/processor.h |    6 ++++++
 arch/x86/kernel/cpu/common.c     |   22 ++++++++++++++++++++++
 arch/x86/kernel/head_64.S        |    4 ++++
 arch/x86/kernel/nmi.c            |   15 +++++++++++++++
 arch/x86/kernel/traps.c          |    6 ++++++
 6 files changed, 65 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fa..e95822d 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
 
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
 	desc->limit = (limit >> 16) & 0xf;
 }
 
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+	gate_desc s;
+
+	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+	write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
 static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b650435..4b39d6d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,6 +402,9 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -416,6 +419,9 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b1..caa4045 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+				    (unsigned long) nmi_idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE);
@@ -1090,6 +1092,24 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+
+int is_debug_stack(unsigned long addr)
+{
+	return addr <= __get_cpu_var(debug_stack_addr) &&
+		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+}
+
+void debug_stack_set_zero(void)
+{
+	load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+	load_idt((const struct desc_ptr *)&idt_descr);
+}
+
 #else	/* CONFIG_X86_64 */
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void)
 			estacks += exception_stack_sizes[v];
 			oist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
+			if (v == DEBUG_STACK-1)
+				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
 		}
 	}
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e394..40f4eb3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
 ENTRY(idt_table)
 	.skip IDT_ENTRIES * 16
 
+	.align L1_CACHE_BYTES
+ENTRY(nmi_idt_table)
+	.skip IDT_ENTRIES * 16
+
 	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b..de8d4b3 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 dotraplinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
+	int update_debug_stack = 0;
+
+	/*
+	 * If we interrupted a breakpoint, it is possible that
+	 * the nmi handler will have breakpoints too. We need to
+	 * change the IDT such that breakpoints that happen here
+	 * continue to use the NMI stack.
+	 */
+	if (unlikely(is_debug_stack(regs->sp))) {
+		debug_stack_set_zero();
+		update_debug_stack = 1;
+	}
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code)
 		default_do_nmi(regs);
 
 	nmi_exit();
+
+	if (unlikely(update_debug_stack))
+		debug_stack_reset();
 }
 
 void stop_nmi(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb8..a93c5ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -723,4 +723,10 @@ void __init trap_init(void)
 	cpu_init();
 
 	x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+	set_nmi_gate(1, &debug);
+	set_nmi_gate(3, &int3);
+#endif
 }
-- 
1.7.7.3




* [PATCH 5/6 v3] x86: Allow NMIs to hit breakpoints in i386
  2011-12-22  3:13 [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Steven Rostedt
                   ` (3 preceding siblings ...)
  2011-12-22  3:14 ` [PATCH 4/6 v3] x86: Keep current stack in NMI breakpoints Steven Rostedt
@ 2011-12-22  3:14 ` Steven Rostedt
  2011-12-22  3:14 ` [PATCH 6/6 v3] x86: Add counter when debug stack is used with interrupts enabled Steven Rostedt
  2012-01-08  8:48 ` [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Ingo Molnar
  6 siblings, 0 replies; 8+ messages in thread
From: Steven Rostedt @ 2011-12-22  3:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen, H. Peter Anvin, Paul Turner


From: Steven Rostedt <srostedt@redhat.com>

On i386, NMIs and breakpoints use the current stack and do not reset
the stack pointer to a fixed location that might corrupt a previous
NMI or breakpoint (as happens on x86_64). But NMIs are still not made
to be re-entrant, and we need to handle the case where an NMI hits a
breakpoint (which does an iret and thus re-enables NMIs), allowing
another NMI to come in before the first one has finished.

The fix is to let the NMI be in 3 different states:

1) not running
2) executing
3) latched

When no NMI is executing on a given CPU, the state is "not running".
When the first NMI comes in, the state is switched to "executing".
On exit of that NMI, a cmpxchg is performed to switch the state
back to "not running" and if that fails, the NMI is restarted.

If a breakpoint is hit and does an iret, which re-enables NMIs,
and another NMI comes in before the first NMI has finished, it will
see that the state is not "not running" and therefore that it is a
nested NMI. In this case, the state is switched to "latched" to let
the interrupted NMI know to restart the NMI handler, and the nested
NMI exits without doing anything.
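
In plain C, the state machine boils down to roughly the following
(the real code below is written as the nmi_nesting_preprocess()/
nmi_nesting_postprocess() macros so that do_nmi() stays common with
x86_64; nmi_handler() here is just an illustrative name):

    enum nmi_states { NMI_NOT_RUNNING, NMI_EXECUTING, NMI_LATCHED };
    static DEFINE_PER_CPU(enum nmi_states, nmi_state);

    void nmi_handler(struct pt_regs *regs)
    {
        if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {
            /* Nested: ask the first NMI to run again, then leave. */
            __get_cpu_var(nmi_state) = NMI_LATCHED;
            return;
        }

        do {
            __get_cpu_var(nmi_state) = NMI_EXECUTING;

            /* ... the usual NMI work ... */

            /*
             * If a nested NMI set NMI_LATCHED meanwhile, the cmpxchg
             * fails and we loop around to service it now.
             */
        } while (cmpxchg(&__get_cpu_var(nmi_state),
                         NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING);
    }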

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/nmi.c |  101 +++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index de8d4b3..47acaf3 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,11 +405,84 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		unknown_nmi_error(reason, regs);
 }
 
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	int update_debug_stack = 0;
+/*
+ * NMIs can hit breakpoints which will cause it to lose its
+ * NMI context with the CPU when the breakpoint does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C. Simply have 3 states
+ * the NMI can be in.
+ *
+ *  1) not running
+ *  2) executing
+ *  3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ *  when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the first NMI will perform
+ * an cmpxchg on the state, and if it doesn't successfully
+ * reset the state to "not running" it will restart the next
+ * NMI.
+ */
+enum nmi_states {
+	NMI_NOT_RUNNING,
+	NMI_EXECUTING,
+	NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+
+#define nmi_nesting_preprocess(regs)					\
+	do {								\
+		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
+			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
+			return;						\
+		}							\
+	nmi_restart:							\
+		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
+	} while (0)
+
+#define nmi_nesting_postprocess()					\
+	do {								\
+		if (cmpxchg(&__get_cpu_var(nmi_state),			\
+		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
+			goto nmi_restart;				\
+	} while (0)
+#else /* x86_64 */
+/*
+ * In x86_64 things are a bit more difficult. This has the same problem
+ * where an NMI hitting a breakpoint that calls iret will remove the
+ * NMI context, allowing a nested NMI to enter. What makes this more
+ * difficult is that both NMIs and breakpoints have their own stack.
+ * When a new NMI or breakpoint is executed, the stack is set to a fixed
+ * point. If an NMI is nested, it will have its stack set at that same
+ * fixed address that the first NMI had, and will start corrupting the
+ * stack. This is handled in entry_64.S, but the same problem exists with
+ * the breakpoint stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being used,
+ * if an NMI comes in and also hits a breakpoint, the stack pointer
+ * will be set to the same fixed address as the breakpoint that was
+ * interrupted, causing that stack to be corrupted. To handle this case,
+ * check if the stack that was interrupted is the debug stack, and if
+ * so, change the IDT so that new breakpoints will use the current stack
+ * and not switch to the fixed address. On return of the NMI, switch back
+ * to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
 
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+{
 	/*
 	 * If we interrupted a breakpoint, it is possible that
 	 * the nmi handler will have breakpoints too. We need to
@@ -418,8 +491,22 @@ do_nmi(struct pt_regs *regs, long error_code)
 	 */
 	if (unlikely(is_debug_stack(regs->sp))) {
 		debug_stack_set_zero();
-		update_debug_stack = 1;
+		__get_cpu_var(update_debug_stack) = 1;
 	}
+}
+
+static inline void nmi_nesting_postprocess(void)
+{
+	if (unlikely(__get_cpu_var(update_debug_stack)))
+		debug_stack_reset();
+}
+#endif
+
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+	nmi_nesting_preprocess(regs);
+
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -429,8 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 	nmi_exit();
 
-	if (unlikely(update_debug_stack))
-		debug_stack_reset();
+	/* On i386, may loop back to preprocess */
+	nmi_nesting_postprocess();
 }
 
 void stop_nmi(void)
-- 
1.7.7.3




* [PATCH 6/6 v3] x86: Add counter when debug stack is used with interrupts enabled
  2011-12-22  3:13 [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Steven Rostedt
                   ` (4 preceding siblings ...)
  2011-12-22  3:14 ` [PATCH 5/6 v3] x86: Allow NMIs to hit breakpoints in i386 Steven Rostedt
@ 2011-12-22  3:14 ` Steven Rostedt
  2012-01-08  8:48 ` [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Ingo Molnar
  6 siblings, 0 replies; 8+ messages in thread
From: Steven Rostedt @ 2011-12-22  3:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen, H. Peter Anvin, Paul Turner


From: Steven Rostedt <srostedt@redhat.com>

Mathieu Desnoyers pointed out a case that can cause issues with
NMIs running on the debug stack:

  int3 -> interrupt -> NMI -> int3

Because the interrupt changes the stack, the NMI will not see that
it preempted the debug stack. Looking deeper at this case,
interrupts only happen when the int3 is from userspace or in
a location listed in the exception table (fixup).

  userspace -> int3 -> interrupt -> NMI -> int3

All other int3s that happen in the kernel should be processed
without ever enabling interrupts, as the do_trap() call will
panic the kernel if it is called to process any other location
within the kernel.

Adding a counter around the sections that enable interrupts while
using the debug stack allows the NMI to also check that case.
If the NMI sees that it either interrupted a task using the debug
stack or that the debug counter is non-zero, then it will have to
change the IDT table so that the int3 does not change stacks (which
would corrupt the stack if it did).
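
Condensed, the two halves of the fix look roughly like this (taken
straight from the hunks below, not new code):

    /* do_int3()/do_debug(): mark the debug stack busy across the
     * window where interrupts may be enabled and the stack changed. */
    debug_stack_usage_inc();
    preempt_conditional_sti(regs);
    do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
    preempt_conditional_cli(regs);
    debug_stack_usage_dec();

    /* NMI side: a non-zero counter is treated the same as having
     * interrupted the debug stack when deciding to switch the IDT. */
    int is_debug_stack(unsigned long addr)
    {
        return __get_cpu_var(debug_stack_usage) ||
            (addr <= __get_cpu_var(debug_stack_addr) &&
             addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
    }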

Note, I had to move the debug_usage functions out of processor.h
and into debugreg.h because of the static inline functions that
inc and dec the debug_usage counter: __get_cpu_var() requires
smp.h, which includes processor.h, and the build would fail.

Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/debugreg.h  |   22 ++++++++++++++++++++++
 arch/x86/include/asm/processor.h |    6 ------
 arch/x86/kernel/cpu/common.c     |    6 ++++--
 arch/x86/kernel/traps.c          |   14 ++++++++++++++
 4 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0c..b903d5e 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
 
 extern void hw_breakpoint_restore(void);
 
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+	__get_cpu_var(debug_stack_usage)++;
+}
+static inline void debug_stack_usage_dec(void)
+{
+	__get_cpu_var(debug_stack_usage)--;
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+
 #endif	/* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4b39d6d..b650435 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,9 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
-int is_debug_stack(unsigned long addr);
-void debug_stack_set_zero(void);
-void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -419,9 +416,6 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
-static inline int is_debug_stack(unsigned long addr) { return 0; }
-static inline void debug_stack_set_zero(void) { }
-static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index caa4045..266e464 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1093,11 +1093,13 @@ unsigned long kernel_eflags;
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
 static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
 
 int is_debug_stack(unsigned long addr)
 {
-	return addr <= __get_cpu_var(debug_stack_addr) &&
-		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+	return __get_cpu_var(debug_stack_usage) ||
+		(addr <= __get_cpu_var(debug_stack_addr) &&
+		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
 }
 
 void debug_stack_set_zero(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a93c5ca..0072b38 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 }
 
 #ifdef CONFIG_X86_64
@@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 							SIGTRAP) == NOTIFY_STOP)
 		return;
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
+
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
@@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 		handle_vm86_trap((struct kernel_vm86_regs *) regs,
 				error_code, 1);
 		preempt_conditional_cli(regs);
+		debug_stack_usage_dec();
 		return;
 	}
 
@@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 
 	return;
 }
-- 
1.7.7.3




* Re: [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes
  2011-12-22  3:13 [PATCH 0/6 v3] [GIT PULL] x86: Workaround for NMI iret woes Steven Rostedt
                   ` (5 preceding siblings ...)
  2011-12-22  3:14 ` [PATCH 6/6 v3] x86: Add counter when debug stack is used with interrupts enabled Steven Rostedt
@ 2012-01-08  8:48 ` Ingo Molnar
  6 siblings, 0 replies; 8+ messages in thread
From: Ingo Molnar @ 2012-01-08  8:48 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, Andrew Morton, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Linus Torvalds, H. Peter Anvin,
	Mathieu Desnoyers, Andi Kleen


* Steven Rostedt <rostedt@goodmis.org> wrote:

> 
> Ingo,
> 
> I updated the patches to reflect your comments and ran them through
> all my tests again. Below is the full diff between v2 and v3.
> 
> I renamed nmi_preprocess() and nmi_postprocess() to
> nmi_nesting_preprocess() and nmi_nesting_postprocess() to at least
> note that this has to do with nmi nesting. I also commented the
> fact that i386 may loop back to the preprocess.
> 
> I had to move the debug_usage functions to debugreg.h, because
> smp.h includes processor.h where making the inc/dec inline
> caused include header issues.
> 
> -- Steve
> 
> Please pull the latest tip/x86/core-3 tree, which can be found at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
> tip/x86/core-3
> 
> Head SHA1: 42181186ad4db986fcaa40ca95c6e407e9e79372
> 
> 
> Linus Torvalds (1):
>       x86: Do not schedule while still in NMI context
> 
> Steven Rostedt (5):
>       x86: Document the NMI handler about not using paranoid_exit
>       x86: Add workaround to NMI iret woes
>       x86: Keep current stack in NMI breakpoints
>       x86: Allow NMIs to hit breakpoints in i386
>       x86: Add counter when debug stack is used with interrupts enabled
> 
> ----
>  arch/x86/include/asm/debugreg.h |   22 ++++
>  arch/x86/include/asm/desc.h     |   12 ++
>  arch/x86/kernel/cpu/common.c    |   24 +++++
>  arch/x86/kernel/entry_64.S      |  218 +++++++++++++++++++++++++++++++++------
>  arch/x86/kernel/head_64.S       |    4 +
>  arch/x86/kernel/nmi.c           |  102 ++++++++++++++++++
>  arch/x86/kernel/traps.c         |   20 ++++
>  7 files changed, 369 insertions(+), 33 deletions(-)

Pulled, thanks a lot Steve! I've run 500 randconfig iterations
and these bits seem fine.

Thanks,

	Ingo


