linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/7] Tracing vs CR2 (and cleanups)
@ 2019-07-04 19:55 Peter Zijlstra
  2019-07-04 19:55 ` [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE Peter Zijlstra
                   ` (6 more replies)
  0 siblings, 7 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:55 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

Hi,

Eiichi-san re-discovered the bug earlier found by He Zhe which we've failed to
fix due to getting distracted by discussing how to untangle entry_64.S.

These 3 patches are basically a completion of the initial approach I suggested
in that earlier thread:

  https://lkml.kernel.org/r/20190320221534.165ab87b@oasis.local.home

Since v1:

 - idtentry_part 'cleanup'
 - extra sanity check and comment
 - read_cr2=1 for do_double_fault
 - #BP vs IST cleanup
 - IDTENTRYx() C wrapper

The thing boots on x86_64 with lockdep on, survives function-trace,
selftests/x86 and perf-test.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE
  2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
@ 2019-07-04 19:55 ` Peter Zijlstra
  2019-07-04 21:49   ` Andy Lutomirski
  2019-07-10 19:53   ` Steven Rostedt
  2019-07-04 19:55 ` [PATCH v2 2/7] x86/entry/32: Simplify common_exception Peter Zijlstra
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:55 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

The one paravirt read_cr2() implementation (Xen) is actually quite
trivial and doesn't need to clobber anything other than the return
register. By making read_cr2() CALLEE_SAVE we avoid all the PUSH/POP
nonsense and allow more convenient use from assembly.

Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/calling.h              |    6 ++++++
 arch/x86/include/asm/paravirt.h       |   22 +++++++++++++---------
 arch/x86/include/asm/paravirt_types.h |    2 +-
 arch/x86/kernel/asm-offsets.c         |    1 +
 arch/x86/kernel/head_64.S             |    4 +---
 arch/x86/kernel/paravirt.c            |    2 +-
 arch/x86/xen/enlighten_pv.c           |    3 ++-
 arch/x86/xen/mmu_pv.c                 |   12 +-----------
 arch/x86/xen/xen-asm.S                |   17 +++++++++++++++++
 arch/x86/xen/xen-ops.h                |    3 +++
 10 files changed, 46 insertions(+), 26 deletions(-)

--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -383,3 +383,9 @@ For 32-bit we have the following convent
 .Lafter_call_\@:
 #endif
 .endm
+
+#ifdef CONFIG_PARAVIRT_XXL
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
+#else
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
+#endif
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -116,7 +116,7 @@ static inline void write_cr0(unsigned lo
 
 static inline unsigned long read_cr2(void)
 {
-	return PVOP_CALL0(unsigned long, mmu.read_cr2);
+	return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
 }
 
 static inline void write_cr2(unsigned long x)
@@ -909,13 +909,7 @@ extern void default_banner(void);
 		  ANNOTATE_RETPOLINE_SAFE;				\
 		  call PARA_INDIRECT(pv_ops+PV_CPU_swapgs);		\
 		 )
-#endif
-
-#define GET_CR2_INTO_RAX				\
-	ANNOTATE_RETPOLINE_SAFE;				\
-	call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);
 
-#ifdef CONFIG_PARAVIRT_XXL
 #define USERGS_SYSRET64							\
 	PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64),			\
 		  ANNOTATE_RETPOLINE_SAFE;				\
@@ -929,9 +923,19 @@ extern void default_banner(void);
 		  call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);	    \
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 #endif
-#endif
+#endif /* CONFIG_PARAVIRT_XXL */
+#endif	/* CONFIG_X86_64 */
+
+#ifdef CONFIG_PARAVIRT_XXL
+
+#define GET_CR2_INTO_AX							\
+	PARA_SITE(PARA_PATCH(PV_MMU_read_cr2),				\
+		  ANNOTATE_RETPOLINE_SAFE;				\
+		  call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);		\
+		 )
+
+#endif /* CONFIG_PARAVIRT_XXL */
 
-#endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
 #else  /* CONFIG_PARAVIRT */
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -220,7 +220,7 @@ struct pv_mmu_ops {
 	void (*exit_mmap)(struct mm_struct *mm);
 
 #ifdef CONFIG_PARAVIRT_XXL
-	unsigned long (*read_cr2)(void);
+	struct paravirt_callee_save read_cr2;
 	void (*write_cr2)(unsigned long);
 
 	unsigned long (*read_cr3)(void);
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -76,6 +76,7 @@ static void __used common(void)
 	BLANK();
 	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
 	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+	OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
 #endif
 
 	BLANK();
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -29,9 +29,7 @@
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/asm-offsets.h>
 #include <asm/paravirt.h>
-#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
 #else
-#define GET_CR2_INTO(reg) movq %cr2, reg
 #define INTERRUPT_RETURN iretq
 #endif
 
@@ -323,7 +321,7 @@ END(early_idt_handler_array)
 
 	cmpq $14,%rsi		/* Page fault? */
 	jnz 10f
-	GET_CR2_INTO(%rdi)	/* Can clobber any volatile register if pv */
+	GET_CR2_INTO(%rdi)	/* can clobber %rax if pv */
 	call early_make_pgtable
 	andl %eax,%eax
 	jz 20f			/* All good */
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -370,7 +370,7 @@ struct paravirt_patch_template pv_ops =
 	.mmu.exit_mmap		= paravirt_nop,
 
 #ifdef CONFIG_PARAVIRT_XXL
-	.mmu.read_cr2		= native_read_cr2,
+	.mmu.read_cr2		= __PV_IS_CALLEE_SAVE(native_read_cr2),
 	.mmu.write_cr2		= native_write_cr2,
 	.mmu.read_cr3		= __native_read_cr3,
 	.mmu.write_cr3		= native_write_cr3,
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -998,7 +998,8 @@ void __init xen_setup_vcpu_info_placemen
 			__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
 		pv_ops.irq.irq_enable =
 			__PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
-		pv_ops.mmu.read_cr2 = xen_read_cr2_direct;
+		pv_ops.mmu.read_cr2 =
+			__PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
 	}
 }
 
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1307,16 +1307,6 @@ static void xen_write_cr2(unsigned long
 	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
 }
 
-static unsigned long xen_read_cr2(void)
-{
-	return this_cpu_read(xen_vcpu)->arch.cr2;
-}
-
-unsigned long xen_read_cr2_direct(void)
-{
-	return this_cpu_read(xen_vcpu_info.arch.cr2);
-}
-
 static noinline void xen_flush_tlb(void)
 {
 	struct mmuext_op *op;
@@ -2397,7 +2387,7 @@ static void xen_leave_lazy_mmu(void)
 }
 
 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
-	.read_cr2 = xen_read_cr2,
+	.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
 	.write_cr2 = xen_write_cr2,
 
 	.read_cr3 = xen_read_cr3,
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -10,6 +10,7 @@
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/frame.h>
+#include <asm/asm.h>
 
 #include <linux/linkage.h>
 
@@ -135,3 +136,19 @@ ENTRY(check_events)
 	FRAME_END
 	ret
 ENDPROC(check_events)
+
+ENTRY(xen_read_cr2)
+	FRAME_BEGIN
+	_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
+	_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
+	FRAME_END
+	ret
+	ENDPROC(xen_read_cr2);
+
+ENTRY(xen_read_cr2_direct)
+	FRAME_BEGIN
+	_ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
+	FRAME_END
+	ret
+	ENDPROC(xen_read_cr2_direct);
+
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -134,6 +134,9 @@ __visible void xen_irq_disable_direct(vo
 __visible unsigned long xen_save_fl_direct(void);
 __visible void xen_restore_fl_direct(unsigned long);
 
+__visible unsigned long xen_read_cr2(void);
+__visible unsigned long xen_read_cr2_direct(void);
+
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
 __visible void xen_sysret32(void);



^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 2/7] x86/entry/32: Simplify common_exception
  2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
  2019-07-04 19:55 ` [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE Peter Zijlstra
@ 2019-07-04 19:55 ` Peter Zijlstra
  2019-07-04 21:51   ` Andy Lutomirski
  2019-07-10 20:11   ` Steven Rostedt
  2019-07-04 19:55 ` [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little Peter Zijlstra
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:55 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

By adding one more option to SAVE_ALL we can make use of it in
common_exception and simplify things. This saves duplication later
where page_fault will no longer use common_exception.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_32.S |   36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -294,9 +294,11 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
 	cld
+.if \skip_gs == 0
 	PUSH_GS
+.endif
 	FIXUP_FRAME
 	pushl	%fs
 	pushl	%es
@@ -313,13 +315,13 @@
 	movl	%edx, %es
 	movl	$(__KERNEL_PERCPU), %edx
 	movl	%edx, %fs
+.if \skip_gs == 0
 	SET_KERNEL_GS %edx
-
+.endif
 	/* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
 	SWITCH_TO_KERNEL_STACK
 .endif
-
 .endm
 
 .macro SAVE_ALL_NMI cr3_reg:req
@@ -1448,32 +1450,20 @@ END(page_fault)
 
 common_exception:
 	/* the function address is in %gs's slot on the stack */
-	FIXUP_FRAME
-	pushl	%fs
-	pushl	%es
-	pushl	%ds
-	pushl	%eax
-	movl	$(__USER_DS), %eax
-	movl	%eax, %ds
-	movl	%eax, %es
-	movl	$(__KERNEL_PERCPU), %eax
-	movl	%eax, %fs
-	pushl	%ebp
-	pushl	%edi
-	pushl	%esi
-	pushl	%edx
-	pushl	%ecx
-	pushl	%ebx
-	SWITCH_TO_KERNEL_STACK
+	SAVE_ALL switch_stacks=1 skip_gs=1
 	ENCODE_FRAME_POINTER
-	cld
 	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
 	GS_TO_REG %ecx
 	movl	PT_GS(%esp), %edi		# get the function address
-	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
-	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
 	REG_TO_PTGS %ecx
 	SET_KERNEL_GS %ecx
+
+	/* fixup orig %eax */
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
 	CALL_NOSPEC %edi



^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little
  2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
  2019-07-04 19:55 ` [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE Peter Zijlstra
  2019-07-04 19:55 ` [PATCH v2 2/7] x86/entry/32: Simplify common_exception Peter Zijlstra
@ 2019-07-04 19:55 ` Peter Zijlstra
  2019-07-04 21:54   ` Andy Lutomirski
  2019-07-10 20:23   ` Steven Rostedt
  2019-07-04 19:55 ` [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap Peter Zijlstra
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:55 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

There's a bunch of duplication in idtentry, namely the
.Lfrom_usermode_switch_stack is a paranoid=0 copy of the normal flow.

Make this explicit by creating a idtentry_part helper macro.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_64.S |  100 +++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 53 deletions(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -865,6 +865,51 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
+.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
+
+	.if \paranoid
+	call	paranoid_entry
+	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
+	.else
+	call	error_entry
+	.endif
+	UNWIND_HINT_REGS
+
+	.if \paranoid
+	.if \shift_ist != -1
+	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
+	.else
+	TRACE_IRQS_OFF
+	.endif
+	.endif
+
+	movq	%rsp, %rdi			/* pt_regs pointer */
+
+	.if \has_error_code
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
+	.else
+	xorl	%esi, %esi			/* no error code */
+	.endif
+
+	.if \shift_ist != -1
+	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
+	.endif
+
+	call	\do_sym
+
+	.if \shift_ist != -1
+	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
+	.endif
+
+	.if \paranoid
+	jmp	paranoid_exit
+	.else
+	jmp	error_exit
+	.endif
+
+.endm
+
 /**
  * idtentry - Generate an IDT entry stub
  * @sym:		Name of the generated entry point
@@ -935,46 +980,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
 	.endif
 
-	.if \paranoid
-	call	paranoid_entry
-	.else
-	call	error_entry
-	.endif
-	UNWIND_HINT_REGS
-	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
-
-	.if \paranoid
-	.if \shift_ist != -1
-	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
-	.else
-	TRACE_IRQS_OFF
-	.endif
-	.endif
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-
-	.if \has_error_code
-	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
-	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
-	.else
-	xorl	%esi, %esi			/* no error code */
-	.endif
-
-	.if \shift_ist != -1
-	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
-	.endif
-
-	call	\do_sym
-
-	.if \shift_ist != -1
-	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
-	.endif
-
-	.if \paranoid
-	jmp	paranoid_exit
-	.else
-	jmp	error_exit
-	.endif
+	idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
 
 	.if \paranoid == 1
 	/*
@@ -983,21 +989,9 @@ ENTRY(\sym)
 	 * run in real process context if user_mode(regs).
 	 */
 .Lfrom_usermode_switch_stack_\@:
-	call	error_entry
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-
-	.if \has_error_code
-	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
-	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
-	.else
-	xorl	%esi, %esi			/* no error code */
+	idtentry_part \do_sym, \has_error_code, 0
 	.endif
 
-	call	\do_sym
-
-	jmp	error_exit
-	.endif
 _ASM_NOKPROBE(\sym)
 END(\sym)
 .endm



^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap
  2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
                   ` (2 preceding siblings ...)
  2019-07-04 19:55 ` [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little Peter Zijlstra
@ 2019-07-04 19:55 ` Peter Zijlstra
  2019-07-04 21:55   ` Andy Lutomirski
  2019-07-10 20:24   ` Steven Rostedt
  2019-07-04 19:56 ` [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption Peter Zijlstra
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:55 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz



Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_64.S |   17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -913,15 +913,16 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 /**
  * idtentry - Generate an IDT entry stub
  * @sym:		Name of the generated entry point
- * @do_sym: 		C function to be called
- * @has_error_code: 	True if this IDT vector has an error code on the stack
- * @paranoid: 		non-zero means that this vector may be invoked from
+ * @do_sym:		C function to be called
+ * @has_error_code:	True if this IDT vector has an error code on the stack
+ * @paranoid:		non-zero means that this vector may be invoked from
  *			kernel mode with user GSBASE and/or user CR3.
  *			2 is special -- see below.
  * @shift_ist:		Set to an IST index if entries from kernel mode should
- *             		decrement the IST stack so that nested entries get a
+ *			decrement the IST stack so that nested entries get a
  *			fresh stack.  (This is for #DB, which has a nasty habit
- *             		of recursing.)
+ *			of recursing.)
+ * @create_gap:		create a 6-word stack gap when coming from kernel mode.
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym.  The stub has the following
@@ -951,10 +952,14 @@ ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
 	/* Sanity check */
-	.if \shift_ist != -1 && \paranoid == 0
+	.if \shift_ist != -1 && \paranoid != 1
 	.error "using shift_ist requires paranoid=1"
 	.endif
 
+	.if \create_gap && \paranoid
+	.error "using create_gap requires paranoid=0"
+	.endif
+
 	ASM_CLAC
 
 	.if \has_error_code == 0



^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
                   ` (3 preceding siblings ...)
  2019-07-04 19:55 ` [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap Peter Zijlstra
@ 2019-07-04 19:56 ` Peter Zijlstra
  2019-07-05  2:18   ` Linus Torvalds
  2019-07-07 15:10   ` Andy Lutomirski
  2019-07-04 19:56 ` [PATCH v2 6/7] x86/entry/64: Remove TRACE_IRQS_*_DEBUG Peter Zijlstra
  2019-07-04 19:56 ` [RFC][PATCH v2 7/7] x86/entry/64: Pull bits into C Peter Zijlstra
  6 siblings, 2 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:56 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

Despire the current efforts to read CR2 before tracing happens there
still exist a number of possible holes:

  idtentry page_fault             do_page_fault           has_error_code=1
    call error_entry
      TRACE_IRQS_OFF
        call trace_hardirqs_off*
          #PF // modifies CR2

      CALL_enter_from_user_mode
        __context_tracking_exit()
          trace_user_exit(0)
            #PF // modifies CR2

    call do_page_fault
      address = read_cr2(); /* whoopsie */

And similar for i386.

Fix it by pulling the CR2 read into the entry code, before any of that
stuff gets a chance to run and ruin things.

Reported-by: He Zhe <zhe.he@windriver.com>
Reported-by: Eiichi Tsukata <devel@etsukata.com>
Debugged-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_32.S       |   25 ++++++++++++++++++++++---
 arch/x86/entry/entry_64.S       |   35 ++++++++++++++++++-----------------
 arch/x86/include/asm/kvm_para.h |    2 +-
 arch/x86/include/asm/traps.h    |    4 ++--
 arch/x86/kernel/kvm.c           |    8 ++++----
 arch/x86/kernel/traps.c         |    6 +-----
 arch/x86/mm/fault.c             |   28 ++++++++++------------------
 7 files changed, 58 insertions(+), 50 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec
 
 ENTRY(page_fault)
 	ASM_CLAC
-	pushl	$do_page_fault
-	ALIGN
-	jmp common_exception
+	pushl	$0; /* %gs's slot on the stack */
+
+	SAVE_ALL switch_stacks=1 skip_gs=1
+
+	ENCODE_FRAME_POINTER
+	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
+	GS_TO_REG %ecx
+	REG_TO_PTGS %ecx
+	SET_KERNEL_GS %ecx
+
+	GET_CR2_INTO(%ecx)			# might clobber %eax
+
+	/* fixup orig %eax */
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+
+	TRACE_IRQS_OFF
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_page_fault
+	jmp	ret_from_exception
 END(page_fault)
 
 common_exception:
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -865,7 +865,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
-.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
 
 	.if \paranoid
 	call	paranoid_entry
@@ -875,12 +875,21 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	.endif
 	UNWIND_HINT_REGS
 
-	.if \paranoid
+	.if \read_cr2
+	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	.endif
+
 	.if \shift_ist != -1
 	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
 	.else
 	TRACE_IRQS_OFF
 	.endif
+
+	.if \paranoid == 0
+	testb	$3, CS(%rsp)
+	jz	.Lfrom_kernel_no_context_tracking_\@
+	CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
 	.endif
 
 	movq	%rsp, %rdi			/* pt_regs pointer */
@@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  *			fresh stack.  (This is for #DB, which has a nasty habit
  *			of recursing.)
  * @create_gap:		create a 6-word stack gap when coming from kernel mode.
+ * @read_cr2:		load CR2 into the 3rd argument; done before calling any C code
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym.  The stub has the following
@@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -985,7 +995,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
 	.endif
 
-	idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
+	idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
 
 	.if \paranoid == 1
 	/*
@@ -994,7 +1004,7 @@ ENTRY(\sym)
 	 * run in real process context if user_mode(regs).
 	 */
 .Lfrom_usermode_switch_stack_\@:
-	idtentry_part \do_sym, \has_error_code, 0
+	idtentry_part \do_sym, \has_error_code, \read_cr2, 0
 	.endif
 
 _ASM_NOKPROBE(\sym)
@@ -1006,7 +1016,7 @@ idtentry overflow			do_overflow			has_er
 idtentry bounds				do_bounds			has_error_code=0
 idtentry invalid_op			do_invalid_op			has_error_code=0
 idtentry device_not_available		do_device_not_available		has_error_code=0
-idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
+idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2 read_cr2=1
 idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
 idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
 idtentry segment_not_present		do_segment_not_present		has_error_code=1
@@ -1179,10 +1189,10 @@ idtentry xenint3		do_int3			has_error_co
 #endif
 
 idtentry general_protection	do_general_protection	has_error_code=1
-idtentry page_fault		do_page_fault		has_error_code=1
+idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
 #endif
 
 #ifdef CONFIG_X86_MCE
@@ -1337,18 +1347,9 @@ ENTRY(error_entry)
 	movq	%rax, %rsp			/* switch stack */
 	ENCODE_FRAME_POINTER
 	pushq	%r12
-
-	/*
-	 * We need to tell lockdep that IRQs are off.  We can't do this until
-	 * we fix gsbase, and we should do it before enter_from_user_mode
-	 * (which can take locks).
-	 */
-	TRACE_IRQS_OFF
-	CALL_enter_from_user_mode
 	ret
 
 .Lerror_entry_done:
-	TRACE_IRQS_OFF
 	ret
 
 	/*
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, i
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,14 +74,14 @@ dotraplinkage void do_invalid_TSS(struct
 dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code);
 #ifdef CONFIG_X86_64
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code);
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long address);
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
 asmlinkage __visible notrace
 struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
 dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
 	enum ctx_state prev_state;
 
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		do_page_fault(regs, error_code);
+		do_page_fault(regs, error_code, address);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
-		kvm_async_pf_task_wake((u32)read_cr2());
+		kvm_async_pf_task_wake((u32)address);
 		rcu_irq_exit();
 		break;
 	}
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_o
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
 {
 	static const char str[] = "double fault";
 	struct task_struct *tsk = current;
-#ifdef CONFIG_VMAP_STACK
-	unsigned long cr2;
-#endif
 
 #ifdef CONFIG_X86_ESPFIX64
 	extern unsigned char native_irq_return_iret[];
@@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struc
 	 * stack even if the actual trigger for the double fault was
 	 * something else.
 	 */
-	cr2 = read_cr2();
 	if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
 		handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
 #endif
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1513,7 +1513,7 @@ NOKPROBE_SYMBOL(do_user_addr_fault);
  * and the problem, and then passes it off to one of the appropriate
  * routines.
  */
-static noinline void
+static __always_inline void
 __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		unsigned long address)
 {
@@ -1528,35 +1528,27 @@ __do_page_fault(struct pt_regs *regs, un
 	else
 		do_user_addr_fault(regs, hw_error_code, address);
 }
-NOKPROBE_SYMBOL(__do_page_fault);
 
-static nokprobe_inline void
-trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
-			 unsigned long error_code)
+static __always_inline void
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
+			 unsigned long address)
 {
+	if (!trace_pagefault_enabled())
+		return;
+
 	if (user_mode(regs))
 		trace_page_fault_user(address, regs, error_code);
 	else
 		trace_page_fault_kernel(address, regs, error_code);
 }
 
-/*
- * We must have this function blacklisted from kprobes, tagged with notrace
- * and call read_cr2() before calling anything else. To avoid calling any
- * kind of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contains all sorts of tracepoints.
- */
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-	unsigned long address = read_cr2(); /* Get the faulting address */
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
-	if (trace_pagefault_enabled())
-		trace_page_fault_entries(address, regs, error_code);
-
+	trace_page_fault_entries(regs, error_code, address);
 	__do_page_fault(regs, error_code, address);
 	exception_exit(prev_state);
 }



^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH v2 6/7] x86/entry/64: Remove TRACE_IRQS_*_DEBUG
  2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
                   ` (4 preceding siblings ...)
  2019-07-04 19:56 ` [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption Peter Zijlstra
@ 2019-07-04 19:56 ` Peter Zijlstra
  2019-07-11  3:24   ` Steven Rostedt
  2019-07-04 19:56 ` [RFC][PATCH v2 7/7] x86/entry/64: Pull bits into C Peter Zijlstra
  6 siblings, 1 reply; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:56 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

Since INT3/#BP no longer runs on an IST, this workaround is no longer
required.

Tested by running lockdep+ftrace as described in the initial commit:

  5963e317b1e9 ("ftrace/x86: Do not change stacks in DEBUG when calling lockdep")

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_64.S |   46 ++--------------------------------------------
 1 file changed, 2 insertions(+), 44 deletions(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -68,44 +68,6 @@ END(native_usergs_sysret64)
 .endm
 
 /*
- * When dynamic function tracer is enabled it will add a breakpoint
- * to all locations that it is about to modify, sync CPUs, update
- * all the code, sync CPUs, then remove the breakpoints. In this time
- * if lockdep is enabled, it might jump back into the debug handler
- * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
- *
- * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
- * make sure the stack pointer does not get reset back to the top
- * of the debug stack, and instead just reuses the current stack.
- */
-#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
-
-.macro TRACE_IRQS_OFF_DEBUG
-	call	debug_stack_set_zero
-	TRACE_IRQS_OFF
-	call	debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_ON_DEBUG
-	call	debug_stack_set_zero
-	TRACE_IRQS_ON
-	call	debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_IRETQ_DEBUG
-	btl	$9, EFLAGS(%rsp)		/* interrupts off? */
-	jnc	1f
-	TRACE_IRQS_ON_DEBUG
-1:
-.endm
-
-#else
-# define TRACE_IRQS_OFF_DEBUG			TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG			TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG			TRACE_IRQS_IRETQ
-#endif
-
-/*
  * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
  * This is the only entry point used for 64-bit system calls.  The
@@ -879,11 +841,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	GET_CR2_INTO(%rdx);			/* can clobber %rax */
 	.endif
 
-	.if \shift_ist != -1
-	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
-	.else
 	TRACE_IRQS_OFF
-	.endif
 
 	.if \paranoid == 0
 	testb	$3, CS(%rsp)
@@ -1292,7 +1250,7 @@ END(paranoid_entry)
 ENTRY(paranoid_exit)
 	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF_DEBUG
+	TRACE_IRQS_OFF
 
 	/* Handle GS depending on FSGSBASE availability */
 	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE
@@ -1312,7 +1270,7 @@ ENTRY(paranoid_exit)
 	jmp	.Lparanoid_exit_restore
 
 .Lparanoid_exit_no_swapgs:
-	TRACE_IRQS_IRETQ_DEBUG
+	TRACE_IRQS_IRETQ
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 



^ permalink raw reply	[flat|nested] 42+ messages in thread

* [RFC][PATCH v2 7/7] x86/entry/64: Pull bits into C
  2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
                   ` (5 preceding siblings ...)
  2019-07-04 19:56 ` [PATCH v2 6/7] x86/entry/64: Remove TRACE_IRQS_*_DEBUG Peter Zijlstra
@ 2019-07-04 19:56 ` Peter Zijlstra
  6 siblings, 0 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-04 19:56 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

TODO: 32bit, Xen

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_64.S       |   13 +++-------
 arch/x86/include/asm/idtentry.h |   49 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/traps.h    |    1 
 arch/x86/kernel/cpu/mce/core.c  |    2 -
 arch/x86/kernel/kvm.c           |    4 ---
 arch/x86/kernel/traps.c         |   30 ++++++++----------------
 arch/x86/mm/fault.c             |    4 ---
 7 files changed, 68 insertions(+), 35 deletions(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -841,15 +841,6 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	GET_CR2_INTO(%rdx);			/* can clobber %rax */
 	.endif
 
-	TRACE_IRQS_OFF
-
-	.if \paranoid == 0
-	testb	$3, CS(%rsp)
-	jz	.Lfrom_kernel_no_context_tracking_\@
-	CALL_enter_from_user_mode
-.Lfrom_kernel_no_context_tracking_\@:
-	.endif
-
 	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
@@ -863,7 +854,11 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
 	.endif
 
+	.if \paranoid
+	call	\do_sym\()_paranoid
+	.else
 	call	\do_sym
+	.endif
 
 	.if \shift_ist != -1
 	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
--- /dev/null
+++ b/arch/x86/include/asm/idtentry.h
@@ -0,0 +1,49 @@
+#ifndef __ASM_IDTENTRY_H
+#define __ASM_IDTENTRY_H
+
+/* shamelessly stolen from linux/syscalls.h; XXX share */
+
+#define __IDT_MAP0(m,...)
+#define __IDT_MAP1(m,t,a,...) m(t,a)
+#define __IDT_MAP2(m,t,a,...) m(t,a), __IDT_MAP1(m,__VA_ARGS__)
+#define __IDT_MAP3(m,t,a,...) m(t,a), __IDT_MAP2(m,__VA_ARGS__)
+
+#define __IDT_MAP(n,...) __IDT_MAP##n(__VA_ARGS__)
+
+#define __IDT_DECL(t, a) t a
+#define __IDT_ARGS(t, a) a
+#define __IDT_TEST(t, a) (void)BUILD_BUG_ON_ZERO(sizeof(t) != sizeof(long))
+
+#ifdef CONFIG_CONTEXT_TRACKING
+#define CALL_enter_from_user_mode(_regs) \
+	if (static_branch_unlikely(&context_tracking_enabled) && user_mode(_regs)) \
+		enter_from_user_mode()
+#else
+#define CALL_enter_from_user_mode(_regs)
+#endif
+
+#define IDTENTRYx(n, name, ...)	\
+	static notrace void __idt_##name(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)); \
+	NOKPROBE_SYMBOL(__idt_##name); \
+	dotraplinkage notrace void name(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)) \
+	{ \
+		__IDT_MAP(n, __IDT_TEST, __VA_ARGS__); \
+		trace_hardirqs_off(); \
+		CALL_enter_from_user_mode(regs); \
+		__idt_##name(__IDT_MAP(n, __IDT_ARGS, __VA_ARGS__)); \
+	} \
+	NOKPROBE_SYMBOL(name); \
+	dotraplinkage notrace void name##_paranoid(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)) \
+	{ \
+		__IDT_MAP(n, __IDT_TEST, __VA_ARGS__); \
+		trace_hardirqs_off(); \
+		__idt_##name(__IDT_MAP(n, __IDT_ARGS, __VA_ARGS__)); \
+	} \
+	NOKPROBE_SYMBOL(name##_paranoid); \
+	static notrace void __idt_##name(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__))
+
+#define IDTENTRY1(name,...) IDTENTRYx(1, name, __VA_ARGS__)
+#define IDTENTRY2(name,...) IDTENTRYx(2, name, __VA_ARGS__)
+#define IDTENTRY3(name,...) IDTENTRYx(3, name, __VA_ARGS__)
+
+#endif /* __ASM_IDTENTRY_H */
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -7,6 +7,7 @@
 
 #include <asm/debugreg.h>
 #include <asm/siginfo.h>			/* TRAP_TRACE, ... */
+#include <asm/idtentry.h>
 
 #define dotraplinkage __visible
 
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1842,7 +1842,7 @@ static void unexpected_machine_check(str
 void (*machine_check_vector)(struct pt_regs *, long error_code) =
 						unexpected_machine_check;
 
-dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_mce, struct pt_regs *, regs, long, error_code)
 {
 	machine_check_vector(regs, error_code);
 }
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -241,8 +241,7 @@ u32 kvm_read_and_reset_pf_reason(void)
 EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
-dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+IDTENTRY3(do_async_page_fault, struct pt_regs *, regs, unsigned long, error_code, unsigned long, address)
 {
 	enum ctx_state prev_state;
 
@@ -263,7 +262,6 @@ do_async_page_fault(struct pt_regs *regs
 		break;
 	}
 }
-NOKPROBE_SYMBOL(do_async_page_fault);
 
 static void __init paravirt_ops_setup(void)
 {
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -281,7 +281,7 @@ static void do_error_trap(struct pt_regs
 
 #define IP ((void __user *)uprobe_get_trap_addr(regs))
 #define DO_ERROR(trapnr, signr, sicode, addr, str, name)		   \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	   \
+IDTENTRY2(do_##name, struct pt_regs *, regs, long, error_code)		   \
 {									   \
 	do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \
 }
@@ -313,7 +313,7 @@ __visible void __noreturn handle_stack_o
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
+IDTENTRY3(do_double_fault, struct pt_regs *, regs, long, error_code, unsigned long, cr2)
 {
 	static const char str[] = "double fault";
 	struct task_struct *tsk = current;
@@ -428,7 +428,7 @@ dotraplinkage void do_double_fault(struc
 }
 #endif
 
-dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_bounds, struct pt_regs *, regs, long, error_code)
 {
 	const struct mpx_bndcsr *bndcsr;
 
@@ -514,8 +514,7 @@ dotraplinkage void do_bounds(struct pt_r
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
 }
 
-dotraplinkage void
-do_general_protection(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_general_protection, struct pt_regs *, regs, long, error_code)
 {
 	const char *desc = "general protection fault";
 	struct task_struct *tsk;
@@ -564,9 +563,8 @@ do_general_protection(struct pt_regs *re
 
 	force_sig(SIGSEGV, tsk);
 }
-NOKPROBE_SYMBOL(do_general_protection);
 
-dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_int3, struct pt_regs *, regs, long, error_code)
 {
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/*
@@ -611,7 +609,6 @@ dotraplinkage void notrace do_int3(struc
 exit:
 	ist_exit(regs);
 }
-NOKPROBE_SYMBOL(do_int3);
 
 #ifdef CONFIG_X86_64
 /*
@@ -706,7 +703,7 @@ static bool is_sysenter_singlestep(struc
  *
  * May run on IST stack.
  */
-dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_debug, struct pt_regs *, regs, long, error_code)
 {
 	struct task_struct *tsk = current;
 	int user_icebp = 0;
@@ -808,7 +805,6 @@ dotraplinkage void do_debug(struct pt_re
 exit:
 	ist_exit(regs);
 }
-NOKPROBE_SYMBOL(do_debug);
 
 /*
  * Note that we play around with the 'TS' bit in an attempt to get
@@ -855,27 +851,24 @@ static void math_error(struct pt_regs *r
 			(void __user *)uprobe_get_trap_addr(regs), task);
 }
 
-dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_coprocessor_error, struct pt_regs *, regs, long, error_code)
 {
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_MF);
 }
 
-dotraplinkage void
-do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_simd_coprocessor_error, struct pt_regs *, regs, long, error_code)
 {
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_XF);
 }
 
-dotraplinkage void
-do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_spurious_interrupt_bug, struct pt_regs *, regs, long, error_code)
 {
 	cond_local_irq_enable(regs);
 }
 
-dotraplinkage void
-do_device_not_available(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_device_not_available, struct pt_regs *, regs, long,  error_code)
 {
 	unsigned long cr0 = read_cr0();
 
@@ -906,10 +899,9 @@ do_device_not_available(struct pt_regs *
 		die("unexpected #NM exception", regs, error_code);
 	}
 }
-NOKPROBE_SYMBOL(do_device_not_available);
 
 #ifdef CONFIG_X86_32
-dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+IDTENTRY2(do_iret_error, struct pt_regs *, regs, long, error_code)
 {
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	local_irq_enable();
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1542,8 +1542,7 @@ trace_page_fault_entries(struct pt_regs
 		trace_page_fault_kernel(address, regs, error_code);
 }
 
-dotraplinkage void
-do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+IDTENTRY3(do_page_fault, struct pt_regs *, regs, unsigned long, error_code, unsigned long, address)
 {
 	enum ctx_state prev_state;
 
@@ -1552,4 +1551,3 @@ do_page_fault(struct pt_regs *regs, unsi
 	__do_page_fault(regs, error_code, address);
 	exception_exit(prev_state);
 }
-NOKPROBE_SYMBOL(do_page_fault);



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE
  2019-07-04 19:55 ` [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE Peter Zijlstra
@ 2019-07-04 21:49   ` Andy Lutomirski
  2019-07-10 19:53   ` Steven Rostedt
  1 sibling, 0 replies; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-04 21:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> The one paravirt read_cr2() implementation (Xen) is actually quite
> trivial and doesn't need to clobber anything other than the return
> register. By making read_cr2() CALLEE_SAVE we avoid all the PUSH/POP
> nonsense and allow more convenient use from assembly.

Wow, this is incomprehensible! :)  I'll trust Juergen's review.

--Andy

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/7] x86/entry/32: Simplify common_exception
  2019-07-04 19:55 ` [PATCH v2 2/7] x86/entry/32: Simplify common_exception Peter Zijlstra
@ 2019-07-04 21:51   ` Andy Lutomirski
  2019-07-10 20:11   ` Steven Rostedt
  1 sibling, 0 replies; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-04 21:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> By adding one more option to SAVE_ALL we can make use of it in
> common_exception and simplify things. This saves duplication later
> where page_fault will no longer use common_exception.
>

Reviewed-by: Andy Lutomirski <luto@kernel.org>

Although I'm still looking forward to the far-off magical day when we
decide we can drop 32-bit support entirely!

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little
  2019-07-04 19:55 ` [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little Peter Zijlstra
@ 2019-07-04 21:54   ` Andy Lutomirski
  2019-07-10 20:23   ` Steven Rostedt
  1 sibling, 0 replies; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-04 21:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> There's a bunch of duplication in idtentry, namely the
> .Lfrom_usermode_switch_stack is a paranoid=0 copy of the normal flow.
>
> Make this explicit by creating a idtentry_part helper macro.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/entry/entry_64.S |  100 +++++++++++++++++++++-------------------------
>  1 file changed, 47 insertions(+), 53 deletions(-)
>
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -865,6 +865,51 @@ apicinterrupt IRQ_WORK_VECTOR                      irq_work
>   */
>  #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
>
> +.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
> +
> +       .if \paranoid
> +       call    paranoid_entry
> +       /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
> +       .else
> +       call    error_entry
> +       .endif
> +       UNWIND_HINT_REGS
> +
> +       .if \paranoid
> +       .if \shift_ist != -1
> +       TRACE_IRQS_OFF_DEBUG                    /* reload IDT in case of recursion */
> +       .else
> +       TRACE_IRQS_OFF
> +       .endif
> +       .endif
> +
> +       movq    %rsp, %rdi                      /* pt_regs pointer */
> +
> +       .if \has_error_code
> +       movq    ORIG_RAX(%rsp), %rsi            /* get error code */
> +       movq    $-1, ORIG_RAX(%rsp)             /* no syscall to restart */
> +       .else
> +       xorl    %esi, %esi                      /* no error code */
> +       .endif
> +
> +       .if \shift_ist != -1
> +       subq    $\ist_offset, CPU_TSS_IST(\shift_ist)
> +       .endif
> +
> +       call    \do_sym
> +
> +       .if \shift_ist != -1
> +       addq    $\ist_offset, CPU_TSS_IST(\shift_ist)
> +       .endif
> +
> +       .if \paranoid
> +       jmp     paranoid_exit
> +       .else
> +       jmp     error_exit
> +       .endif
> +
> +.endm
> +
>  /**
>   * idtentry - Generate an IDT entry stub
>   * @sym:               Name of the generated entry point
> @@ -935,46 +980,7 @@ ENTRY(\sym)
>  .Lfrom_usermode_no_gap_\@:
>         .endif
>
> -       .if \paranoid
> -       call    paranoid_entry
> -       .else
> -       call    error_entry
> -       .endif
> -       UNWIND_HINT_REGS
> -       /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
> -
> -       .if \paranoid
> -       .if \shift_ist != -1
> -       TRACE_IRQS_OFF_DEBUG                    /* reload IDT in case of recursion */
> -       .else
> -       TRACE_IRQS_OFF
> -       .endif
> -       .endif
> -
> -       movq    %rsp, %rdi                      /* pt_regs pointer */
> -
> -       .if \has_error_code
> -       movq    ORIG_RAX(%rsp), %rsi            /* get error code */
> -       movq    $-1, ORIG_RAX(%rsp)             /* no syscall to restart */
> -       .else
> -       xorl    %esi, %esi                      /* no error code */
> -       .endif
> -
> -       .if \shift_ist != -1
> -       subq    $\ist_offset, CPU_TSS_IST(\shift_ist)
> -       .endif
> -
> -       call    \do_sym
> -
> -       .if \shift_ist != -1
> -       addq    $\ist_offset, CPU_TSS_IST(\shift_ist)
> -       .endif
> -
> -       .if \paranoid
> -       jmp     paranoid_exit
> -       .else
> -       jmp     error_exit
> -       .endif
> +       idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
>
>         .if \paranoid == 1
>         /*
> @@ -983,21 +989,9 @@ ENTRY(\sym)
>          * run in real process context if user_mode(regs).
>          */
>  .Lfrom_usermode_switch_stack_\@:
> -       call    error_entry
> -
> -       movq    %rsp, %rdi                      /* pt_regs pointer */
> -
> -       .if \has_error_code
> -       movq    ORIG_RAX(%rsp), %rsi            /* get error code */
> -       movq    $-1, ORIG_RAX(%rsp)             /* no syscall to restart */
> -       .else
> -       xorl    %esi, %esi                      /* no error code */
> +       idtentry_part \do_sym, \has_error_code, 0

Nice!  You are adding an extra UNWIND_HINT_REGS that wasn't here
before, but I think that's fine.  However, can you pleace make it
paranoid=0 instead of just 0?  You could go all the way verbose and
say do_sym=\do_sym, etc, but that seems like overkill.

Other than that nitpick, Acked-by: Andy Lutomirski <luto@kernel.org>

--Andy

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap
  2019-07-04 19:55 ` [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap Peter Zijlstra
@ 2019-07-04 21:55   ` Andy Lutomirski
  2019-07-10 20:24   ` Steven Rostedt
  1 sibling, 0 replies; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-04 21:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
>

Acked-by: Andy Lutomirski <luto@kernel.org>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-04 19:56 ` [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption Peter Zijlstra
@ 2019-07-05  2:18   ` Linus Torvalds
  2019-07-05  3:16     ` Andy Lutomirski
                       ` (2 more replies)
  2019-07-07 15:10   ` Andy Lutomirski
  1 sibling, 3 replies; 42+ messages in thread
From: Linus Torvalds @ 2019-07-05  2:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Peter Anvin, Dave Hansen, Juergen Gross,
	Linux List Kernel Mailing, He Zhe, Joel Fernandes, devel

On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Despire the current efforts to read CR2 before tracing happens there
> still exist a number of possible holes:

So this whole series disturbs me for the simple reason that I thought
tracing was supposed to save/restore cr2 and make it unnecessary to
worry about this in non-tracing code.

That is very much what the NMI code explicitly does. Why shouldn't all
the other tracing code do the same thing in case they can take page
faults?

So I don't think the patches are wrong per se, but this seems to solve
it at the wrong level.

                 Linus

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-05  2:18   ` Linus Torvalds
@ 2019-07-05  3:16     ` Andy Lutomirski
  2019-07-05  3:27       ` Linus Torvalds
  2019-07-05 13:49     ` Peter Zijlstra
  2019-07-06 11:07     ` Eiichi Tsukata
  2 siblings, 1 reply; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-05  3:16 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes,
	devel



> On Jul 4, 2019, at 7:18 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
>> On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>> Despire the current efforts to read CR2 before tracing happens there
>> still exist a number of possible holes:
> 
> So this whole series disturbs me for the simple reason that I thought
> tracing was supposed to save/restore cr2 and make it unnecessary to
> worry about this in non-tracing code.
> 
> That is very much what the NMI code explicitly does. Why shouldn't all
> the other tracing code do the same thing in case they can take page
> faults?
> 

If nothing else, MOV to CR2 is architecturally serializing, so, unless there’s some fancy unwinding involved, this will be quite slow.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-05  3:16     ` Andy Lutomirski
@ 2019-07-05  3:27       ` Linus Torvalds
  0 siblings, 0 replies; 42+ messages in thread
From: Linus Torvalds @ 2019-07-05  3:27 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes,
	devel

On Fri, Jul 5, 2019 at 12:16 PM Andy Lutomirski <luto@amacapital.net> wrote:
>
> If nothing else, MOV to CR2 is architecturally serializing, so, unless there’s some fancy unwinding involved, this will be quite slow.

That's why the NMI code does this:

        if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
                write_cr2(this_cpu_read(nmi_cr2));

so that it normally only does a read. Only if you actually took a page
fault will it restore cr2 to the old value (and if you took a page
fault the performance issues will be _there_, not in the "restore cr2"
part)

                Linus

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-05  2:18   ` Linus Torvalds
  2019-07-05  3:16     ` Andy Lutomirski
@ 2019-07-05 13:49     ` Peter Zijlstra
  2019-07-06 21:41       ` Linus Torvalds
  2019-07-06 11:07     ` Eiichi Tsukata
  2 siblings, 1 reply; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-05 13:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Peter Anvin, Dave Hansen, Juergen Gross,
	Linux List Kernel Mailing, He Zhe, Joel Fernandes, devel

On Fri, Jul 05, 2019 at 11:18:51AM +0900, Linus Torvalds wrote:
> On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Despire the current efforts to read CR2 before tracing happens there
> > still exist a number of possible holes:
> 
> So this whole series disturbs me for the simple reason that I thought
> tracing was supposed to save/restore cr2 and make it unnecessary to
> worry about this in non-tracing code.
> 
> That is very much what the NMI code explicitly does. Why shouldn't all
> the other tracing code do the same thing in case they can take page
> faults?
> 
> So I don't think the patches are wrong per se, but this seems to solve
> it at the wrong level.

My thinking is that that results in far too many sites which we have to
fix and a possibly fragility of interface. Invariably we'll get multiple
interface for the same thing, one which preserves CR2 and one which
doesn't -- in the name of performance. And then someone uses the wrong
one, and we're back where we started.

Conversely, this way we get to fix it in one place.

Also; all previous attempts at fixing this have been about pushing the
read_cr2() earlier; notably:

  0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
  d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")

And I'm thinking that with exception of this patch, the rest are
worthwhile cleanups regardless.

Also; while looking at this, if we do continue with the C wrappers from
the very last patch, we can do horrible things like this on top and move
the read_cr2() back into C code.


--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -826,7 +826,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
-.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
 
 	.if \paranoid
 	call	paranoid_entry
@@ -836,10 +836,6 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
 	.endif
 	UNWIND_HINT_REGS
 
-	.if \read_cr2
-	GET_CR2_INTO(%rdx);			/* can clobber %rax */
-	.endif
-
 	.if \has_error_code
 	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
 	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
@@ -885,7 +881,6 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  *			fresh stack.  (This is for #DB, which has a nasty habit
  *			of recursing.)
  * @create_gap:		create a 6-word stack gap when coming from kernel mode.
- * @read_cr2:		load CR2 into the 3rd argument; done before calling any C code
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym.  The stub has the following
@@ -910,7 +905,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -948,7 +943,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
 	.endif
 
-	idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
+	idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
 
 	.if \paranoid == 1
 	/*
@@ -957,7 +952,7 @@ ENTRY(\sym)
 	 * run in real process context if user_mode(regs).
 	 */
 .Lfrom_usermode_switch_stack_\@:
-	idtentry_part \do_sym, \has_error_code, \read_cr2, 0
+	idtentry_part \do_sym, \has_error_code, paranoid=0
 	.endif
 
 _ASM_NOKPROBE(\sym)
@@ -969,7 +964,7 @@ idtentry overflow			do_overflow			has_er
 idtentry bounds				do_bounds			has_error_code=0
 idtentry invalid_op			do_invalid_op			has_error_code=0
 idtentry device_not_available		do_device_not_available		has_error_code=0
-idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2 read_cr2=1
+idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
 idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
 idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
 idtentry segment_not_present		do_segment_not_present		has_error_code=1
@@ -1142,10 +1137,10 @@ idtentry xenint3		do_int3			has_error_co
 #endif
 
 idtentry general_protection	do_general_protection	has_error_code=1
-idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
+idtentry page_fault		do_page_fault		has_error_code=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1
 #endif
 
 #ifdef CONFIG_X86_MCE
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -22,20 +22,34 @@
 #define CALL_enter_from_user_mode(_regs)
 #endif
 
+#define __IDT_NR1 1
+#define __IDT_NR2 2
+#define __IDT_NR3 2
+
+#define IDT_NR(n) __IDT_NR##n
+
+#define __IDT_TRAP1(t1,a1)
+#define __IDT_TRAP2(t1,a1,t2,a2)
+#define __IDT_TRAP3(t1,a1,t2,a2,t3,a3)	t3 a3 = read_cr2()
+
+#define IDT_TRAP(n,...) __IDT_TRAP##n(__VA_ARGS__)
+
 #define IDTENTRYx(n, name, ...)	\
 	static notrace void __idt_##name(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)); \
 	NOKPROBE_SYMBOL(__idt_##name); \
-	dotraplinkage notrace void name(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)) \
+	dotraplinkage notrace void name(__IDT_MAP(__IDT_NR(n), __IDT_DECL, __VA_ARGS__)) \
 	{ \
 		__IDT_MAP(n, __IDT_TEST, __VA_ARGS__); \
+		__IDT_TRAP(n, __VA_ARGS__); \
 		trace_hardirqs_off(); \
 		CALL_enter_from_user_mode(regs); \
 		__idt_##name(__IDT_MAP(n, __IDT_ARGS, __VA_ARGS__)); \
 	} \
 	NOKPROBE_SYMBOL(name); \
-	dotraplinkage notrace void name##_paranoid(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)) \
+	dotraplinkage notrace void name##_paranoid(__IDT_MAP(__IDT_NR(n), __IDT_DECL, __VA_ARGS__)) \
 	{ \
 		__IDT_MAP(n, __IDT_TEST, __VA_ARGS__); \
+		__IDT_TRAP(n, __VA_ARGS__); \
 		trace_hardirqs_off(); \
 		__idt_##name(__IDT_MAP(n, __IDT_ARGS, __VA_ARGS__)); \
 	} \

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-05  2:18   ` Linus Torvalds
  2019-07-05  3:16     ` Andy Lutomirski
  2019-07-05 13:49     ` Peter Zijlstra
@ 2019-07-06 11:07     ` Eiichi Tsukata
  2019-07-08  7:48       ` Peter Zijlstra
  2 siblings, 1 reply; 42+ messages in thread
From: Eiichi Tsukata @ 2019-07-06 11:07 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Peter Anvin, Dave Hansen, Juergen Gross,
	Linux List Kernel Mailing, He Zhe, Joel Fernandes



On 2019/07/05 11:18, Linus Torvalds wrote:
> On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> Despire the current efforts to read CR2 before tracing happens there
>> still exist a number of possible holes:
> 
> So this whole series disturbs me for the simple reason that I thought
> tracing was supposed to save/restore cr2 and make it unnecessary to
> worry about this in non-tracing code.
> 
> That is very much what the NMI code explicitly does. Why shouldn't all
> the other tracing code do the same thing in case they can take page
> faults?
> 
> So I don't think the patches are wrong per se, but this seems to solve
> it at the wrong level.
> 
>                  Linus
> 

Steven previously tried to fix it by saving CR2 in TRACE_IRQS_OFF:
https://lore.kernel.org/lkml/20190320221534.165ab87b@oasis.local.home/

But hit the following WARNING:
https://lore.kernel.org/lkml/20190321095502.47b51356@gandalf.local.home/

I tried to find out the root cause of the WARNING, and found that it is
caused by touching trace point(TRACE_IRQS_OFF) before search_binary_handler()
at exeve.

To prevent userstack trace code from reading user stack before it becomes ready,
checking current->in_execve in stack_trace_save_user() can help Steven's approach,
though trace_sched_process_exec() is called before current->in_execve = 0 so it changes
current behavior.

The PoC code is as follows:

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 2abf27d7df6b..30fa6e1b7a87 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -116,10 +116,12 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
                          const struct pt_regs *regs)
 {
        const void __user *fp = (const void __user *)regs->bp;
+       unsigned long address;
 
        if (!consume_entry(cookie, regs->ip, false))
                return;
 
+       address = read_cr2();
        while (1) {
                struct stack_frame_user frame;
 
@@ -131,11 +133,14 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
                        break;
                if (frame.ret_addr) {
                        if (!consume_entry(cookie, frame.ret_addr, false))
-                               return;
+                               break;
                }
                if (fp == frame.next_fp)
                        break;
                fp = frame.next_fp;
        }
+
+       if (address != read_cr2())
+               write_cr2(address);
 }
 
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 36139de0a3c4..489d33bb5d28 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -230,6 +230,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
        /* Trace user stack if not a kernel thread */
        if (!current->mm)
                return 0;
+       /* current can reach some trace points before its stack is ready */
+       if (current->in_execve)
+               return 0;
 
        arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
        return c.len;
  




^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-05 13:49     ` Peter Zijlstra
@ 2019-07-06 21:41       ` Linus Torvalds
  2019-07-06 22:27         ` Steven Rostedt
  0 siblings, 1 reply; 42+ messages in thread
From: Linus Torvalds @ 2019-07-06 21:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Peter Anvin, Dave Hansen, Juergen Gross,
	Linux List Kernel Mailing, He Zhe, Joel Fernandes, devel

On Fri, Jul 5, 2019 at 6:50 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Also; all previous attempts at fixing this have been about pushing the
> read_cr2() earlier; notably:
>
>   0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
>   d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")

I think both of those are because people - again - felt like tracing
can validly corrupt CPU state, and then they fix up the symptoms
rather than the cause.

Which I disagree with.

> And I'm thinking that with exception of this patch, the rest are
> worthwhile cleanups regardless.

I don't have any issues with the patches themselves, I agree that they
are probably good on their own.

I *do* have issues with the "tracing can change CPU state so we need
to deal with it" model, though. I think that mode of thinking is
wrong. I don't believe tracing should ever change core CPU state and
that be considered ok.

> Also; while looking at this, if we do continue with the C wrappers from
> the very last patch, we can do horrible things like this on top and move
> the read_cr2() back into C code.

Again, I don't think that is the problem. I think it's a much more
fundamental problem in thinking that core code should be changed
because tracing is broken garbage and didn't do things right.

I see Eiichi's patch, and it makes me go "that looks better" - simply
because it fixes the fundamental issue, rather than working around the
symptoms.

               Linus

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-06 21:41       ` Linus Torvalds
@ 2019-07-06 22:27         ` Steven Rostedt
  2019-07-06 22:41           ` Linus Torvalds
                             ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: Steven Rostedt @ 2019-07-06 22:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Andrew Lutomirski, Peter Anvin, Dave Hansen, Juergen Gross,
	Linux List Kernel Mailing, He Zhe, Joel Fernandes, devel

On Sat, 6 Jul 2019 14:41:22 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Fri, Jul 5, 2019 at 6:50 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Also; all previous attempts at fixing this have been about pushing the
> > read_cr2() earlier; notably:
> >
> >   0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
> >   d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")  
> 
> I think both of those are because people - again - felt like tracing
> can validly corrupt CPU state, and then they fix up the symptoms
> rather than the cause.
> 
> Which I disagree with.
> 
> > And I'm thinking that with exception of this patch, the rest are
> > worthwhile cleanups regardless.  
> 
> I don't have any issues with the patches themselves, I agree that they
> are probably good on their own.
> 
> I *do* have issues with the "tracing can change CPU state so we need
> to deal with it" model, though. I think that mode of thinking is
> wrong. I don't believe tracing should ever change core CPU state and
> that be considered ok.
> 
> > Also; while looking at this, if we do continue with the C wrappers from
> > the very last patch, we can do horrible things like this on top and move
> > the read_cr2() back into C code.  
> 
> Again, I don't think that is the problem. I think it's a much more
> fundamental problem in thinking that core code should be changed
> because tracing is broken garbage and didn't do things right.
> 
> I see Eiichi's patch, and it makes me go "that looks better" - simply
> because it fixes the fundamental issue, rather than working around the
> symptoms.
>

We also have to deal with reading vmalloc'd data as that can fault too.
The perf ring buffer IIUC is vmalloc, so if perf records in one of
these locations, then the reading of the vmalloc area has a potential
to fault corrupting the CR2 register as well. Or have we changed
vmalloc to no longer fault?

-- Steve

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-06 22:27         ` Steven Rostedt
@ 2019-07-06 22:41           ` Linus Torvalds
  2019-07-07  0:08             ` Linus Torvalds
  2019-07-06 23:50           ` Andy Lutomirski
  2019-07-07  3:44           ` Eiichi Tsukata
  2 siblings, 1 reply; 42+ messages in thread
From: Linus Torvalds @ 2019-07-06 22:41 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Andrew Lutomirski, Peter Anvin, Dave Hansen, Juergen Gross,
	Linux List Kernel Mailing, He Zhe, Joel Fernandes, devel

On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> We also have to deal with reading vmalloc'd data as that can fault too.

Ahh, that may be a better reason for PeterZ's patches and reading cr2
very early from asm code than the stack trace case. It's why the page
fault handler delayed enabling interrupts, after all (which is one big
reason NMI was special).

               Linus

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-06 22:27         ` Steven Rostedt
  2019-07-06 22:41           ` Linus Torvalds
@ 2019-07-06 23:50           ` Andy Lutomirski
  2019-07-07  3:44           ` Eiichi Tsukata
  2 siblings, 0 replies; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-06 23:50 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
	Ingo Molnar, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes,
	devel

On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Sat, 6 Jul 2019 14:41:22 -0700
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> > On Fri, Jul 5, 2019 at 6:50 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Also; all previous attempts at fixing this have been about pushing the
> > > read_cr2() earlier; notably:
> > >
> > >   0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
> > >   d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")
> >
> > I think both of those are because people - again - felt like tracing
> > can validly corrupt CPU state, and then they fix up the symptoms
> > rather than the cause.
> >
> > Which I disagree with.
> >
> > > And I'm thinking that with exception of this patch, the rest are
> > > worthwhile cleanups regardless.
> >
> > I don't have any issues with the patches themselves, I agree that they
> > are probably good on their own.
> >
> > I *do* have issues with the "tracing can change CPU state so we need
> > to deal with it" model, though. I think that mode of thinking is
> > wrong. I don't believe tracing should ever change core CPU state and
> > that be considered ok.
> >
> > > Also; while looking at this, if we do continue with the C wrappers from
> > > the very last patch, we can do horrible things like this on top and move
> > > the read_cr2() back into C code.
> >
> > Again, I don't think that is the problem. I think it's a much more
> > fundamental problem in thinking that core code should be changed
> > because tracing is broken garbage and didn't do things right.
> >
> > I see Eiichi's patch, and it makes me go "that looks better" - simply
> > because it fixes the fundamental issue, rather than working around the
> > symptoms.
> >
>
> We also have to deal with reading vmalloc'd data as that can fault too.
> The perf ring buffer IIUC is vmalloc, so if perf records in one of
> these locations, then the reading of the vmalloc area has a potential
> to fault corrupting the CR2 register as well. Or have we changed
> vmalloc to no longer fault?
>

What context is that happening in?  If the tracepoint-saves-CR2 patch
is in, then it should be fine if it nests inside of tracing, right?
And NMI needs to save and restore CR2 no matter what, since it can
happen before we can possibly save CR2.  For that matter, MCE should
save and restore CR2 as well.

All that being said, I do like the idea of moving all this crud (IRQ
tracing, CR2 handling, and context tracking) into C.  I haven't had
time to review that part of Peter's series yet, but I should get to it
soon.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-06 22:41           ` Linus Torvalds
@ 2019-07-07  0:08             ` Linus Torvalds
  2019-07-07  0:36               ` Andy Lutomirski
  0 siblings, 1 reply; 42+ messages in thread
From: Linus Torvalds @ 2019-07-07  0:08 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Andrew Lutomirski, Peter Anvin, Dave Hansen, Juergen Gross,
	Linux List Kernel Mailing, He Zhe, Joel Fernandes, devel

[-- Attachment #1: Type: text/plain, Size: 1937 bytes --]

On Sat, Jul 6, 2019 at 3:41 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > We also have to deal with reading vmalloc'd data as that can fault too.
>
> Ahh, that may be a better reason for PeterZ's patches and reading cr2
> very early from asm code than the stack trace case.

Hmm. Another alternative might be to simply just make our vmalloc page
fault handling more robust.

Right now, if we take a vmalloc page fault in an inconvenient spot, it
is fatal because it corrupts the cr2 in the outer context.

However, it doesn't *need* to be fatal. Who cares if the outer context
cr2 gets corrupted? We probably *shouldn't* care - it's an odd and
unusual case, and the outer context could just handle the wrong
vmalloc-address cr2 fine (it's going to be a no-op, since the inner
page fault will have handled it already), return, and then re-fault.

The only reason it's fatal right now is that we care much too deeply about

 (a) the error code
 (b) the pt_regs state

when we handle vmalloc faults.

So one option is that we simply handle the vmalloc faults _without_
caring about the error code and the pt_regs state, because even if the
error code or the pt_regs implies that the fault comes from user
space, the cr2 value might be due to a vmalloc fault from the inner
kernel page fault that corrupted cr2.

Right now vmalloc faults are already special and need to be handled
without holding any locks etc. We'd just make them even more special,
and say that we might have a vmalloc area fault from any context.

IOW, somethinig like the attached patch would make nesting vmalloc
faults harmless. Sure, we'll do the "vmalloc fault" twice, and return
and re-do the original page fault, but this is a very unusual case, so
from a performance angle we don't really care.

But I guess the "read cr2 early" is fine too..

               Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1957 bytes --]

 arch/x86/mm/fault.c | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 46df4c6aae46..3a03504bc624 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1245,6 +1245,15 @@ static void
 do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		   unsigned long address)
 {
+	/*
+	 * The kernel vmalloc area can fault in at any time, and
+	 * we should not check the hw error code, since the cr2 value
+	 * could be a stale one from a nested vmalloc fault, but the
+	 * error code got pushed by hardware.
+	 */
+	if (vmalloc_fault(address) >= 0)
+		return;
+
 	/*
 	 * Protection keys exceptions only happen on user pages.  We
 	 * have no user pages in the kernel portion of the address
@@ -1252,29 +1261,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 	 */
 	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
-	/*
-	 * We can fault-in kernel-space virtual memory on-demand. The
-	 * 'reference' page table is init_mm.pgd.
-	 *
-	 * NOTE! We MUST NOT take any locks for this case. We may
-	 * be in an interrupt or a critical region, and should
-	 * only copy the information from the master page table,
-	 * nothing more.
-	 *
-	 * Before doing this on-demand faulting, ensure that the
-	 * fault is not any of the following:
-	 * 1. A fault on a PTE with a reserved bit set.
-	 * 2. A fault caused by a user-mode access.  (Do not demand-
-	 *    fault kernel memory due to user-mode accesses).
-	 * 3. A fault caused by a page-level protection violation.
-	 *    (A demand fault would be on a non-present page which
-	 *     would have X86_PF_PROT==0).
-	 */
-	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-		if (vmalloc_fault(address) >= 0)
-			return;
-	}
-
 	/* Was the fault spurious, caused by lazy TLB invalidation? */
 	if (spurious_kernel_fault(hw_error_code, address))
 		return;

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-07  0:08             ` Linus Torvalds
@ 2019-07-07  0:36               ` Andy Lutomirski
  0 siblings, 0 replies; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-07  0:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Steven Rostedt, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
	Ingo Molnar, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes,
	devel



> On Jul 6, 2019, at 6:08 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> On Sat, Jul 6, 2019 at 3:41 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>> 
>>> On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>>> 
>>> We also have to deal with reading vmalloc'd data as that can fault too.
>> 
>> Ahh, that may be a better reason for PeterZ's patches and reading cr2
>> very early from asm code than the stack trace case.
> 
> Hmm. Another alternative might be to simply just make our vmalloc page
> fault handling more robust.
> 
> Right now, if we take a vmalloc page fault in an inconvenient spot, it
> is fatal because it corrupts the cr2 in the outer context.
> 
> However, it doesn't *need* to be fatal. Who cares if the outer context
> cr2 gets corrupted? We probably *shouldn't* care - it's an odd and
> unusual case, and the outer context could just handle the wrong
> vmalloc-address cr2 fine (it's going to be a no-op, since the inner
> page fault will have handled it already), return, and then re-fault.
> 
> The only reason it's fatal right now is that we care much too deeply about
> 
> (a) the error code
> (b) the pt_regs state
> 
> when we handle vmalloc faults.
> 
> So one option is that we simply handle the vmalloc faults _without_
> caring about the error code and the pt_regs state, because even if the
> error code or the pt_regs implies that the fault comes from user
> space, the cr2 value might be due to a vmalloc fault from the inner
> kernel page fault that corrupted cr2.
> 
> Right now vmalloc faults are already special and need to be handled
> without holding any locks etc. We'd just make them even more special,
> and say that we might have a vmalloc area fault from any context.
> 
> IOW, somethinig like the attached patch would make nesting vmalloc
> faults harmless. Sure, we'll do the "vmalloc fault" twice, and return
> and re-do the original page fault, but this is a very unusual case, so
> from a performance angle we don't really care.

Eww. I would like to be able to rely on fault into being correct.  Also, your patch won’t do so well if the fault is from user mode, I think.


> 
> But I guess the "read cr2 early" is fine too..
> 
>               Linus
> <patch.diff>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-06 22:27         ` Steven Rostedt
  2019-07-06 22:41           ` Linus Torvalds
  2019-07-06 23:50           ` Andy Lutomirski
@ 2019-07-07  3:44           ` Eiichi Tsukata
  2 siblings, 0 replies; 42+ messages in thread
From: Eiichi Tsukata @ 2019-07-07  3:44 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Peter Zijlstra, Thomas Gleixner, Borislav Petkov,
	Ingo Molnar, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes



On 2019/07/07 7:27, Steven Rostedt wrote:

> 
> We also have to deal with reading vmalloc'd data as that can fault too.
> The perf ring buffer IIUC is vmalloc, so if perf records in one of
> these locations, then the reading of the vmalloc area has a potential
> to fault corrupting the CR2 register as well. Or have we changed
> vmalloc to no longer fault?
> 
> -- Steve
> 

It seems that perf ring buffer does not normally use vmalloc.
It depends on CONFIG_PERF_USE_VMALLOC introduced by the following commit:

commit 906010b2134e14a2e377decbadd357b3d0ab9c6a
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date:   Mon Sep 21 16:08:49 2009 +0200

    perf_event: Provide vmalloc() based mmap() backing
    
    Some architectures such as Sparc, ARM and MIPS (basically
    everything with flush_dcache_page()) need to deal with dcache
    aliases by carefully placing pages in both kernel and user maps.
    
    These architectures typically have to use vmalloc_user() for this.
    
    However, on other architectures, vmalloc() is not needed and has
    the downsides of being more restricted and slower than regular
    allocations.
    
    Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Acked-by: David Miller <davem@davemloft.net>
    Cc: Andrew Morton <akpm@linux-foundation.org>
    Cc: Jens Axboe <jens.axboe@oracle.com>
    Cc: Paul Mackerras <paulus@samba.org>
    LKML-Reference: <1254830228.21044.272.camel@laptop>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-04 19:56 ` [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption Peter Zijlstra
  2019-07-05  2:18   ` Linus Torvalds
@ 2019-07-07 15:10   ` Andy Lutomirski
  2019-07-07 15:11     ` Andy Lutomirski
  1 sibling, 1 reply; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-07 15:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Despire the current efforts to read CR2 before tracing happens there
> still exist a number of possible holes:
>
>   idtentry page_fault             do_page_fault           has_error_code=1
>     call error_entry
>       TRACE_IRQS_OFF
>         call trace_hardirqs_off*
>           #PF // modifies CR2
>
>       CALL_enter_from_user_mode
>         __context_tracking_exit()
>           trace_user_exit(0)
>             #PF // modifies CR2
>
>     call do_page_fault
>       address = read_cr2(); /* whoopsie */
>
> And similar for i386.
>
> Fix it by pulling the CR2 read into the entry code, before any of that
> stuff gets a chance to run and ruin things.

Reviewed-by: Andy Lutomirski <luto@kernel.org>

Subject to the discussion as to whether this is the right approach at all.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-07 15:10   ` Andy Lutomirski
@ 2019-07-07 15:11     ` Andy Lutomirski
  2019-07-07 18:17       ` Linus Torvalds
  0 siblings, 1 reply; 42+ messages in thread
From: Andy Lutomirski @ 2019-07-07 15:11 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Sun, Jul 7, 2019 at 8:10 AM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Despire the current efforts to read CR2 before tracing happens there
> > still exist a number of possible holes:
> >
> >   idtentry page_fault             do_page_fault           has_error_code=1
> >     call error_entry
> >       TRACE_IRQS_OFF
> >         call trace_hardirqs_off*
> >           #PF // modifies CR2
> >
> >       CALL_enter_from_user_mode
> >         __context_tracking_exit()
> >           trace_user_exit(0)
> >             #PF // modifies CR2
> >
> >     call do_page_fault
> >       address = read_cr2(); /* whoopsie */
> >
> > And similar for i386.
> >
> > Fix it by pulling the CR2 read into the entry code, before any of that
> > stuff gets a chance to run and ruin things.
>
> Reviewed-by: Andy Lutomirski <luto@kernel.org>
>
> Subject to the discussion as to whether this is the right approach at all.

FWIW, I'm leaning toward suggesting that we apply the trivial tracing
fix and backport *that*.  Then, in -tip, we could revert it and apply
this patch instead.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-07 15:11     ` Andy Lutomirski
@ 2019-07-07 18:17       ` Linus Torvalds
  2019-07-10 20:27         ` Steven Rostedt
  0 siblings, 1 reply; 42+ messages in thread
From: Linus Torvalds @ 2019-07-07 18:17 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, H. Peter Anvin, Dave Hansen, Juergen Gross, LKML,
	He Zhe, Joel Fernandes, devel

On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <luto@kernel.org> wrote:
>
> FWIW, I'm leaning toward suggesting that we apply the trivial tracing
> fix and backport *that*.  Then, in -tip, we could revert it and apply
> this patch instead.

You don't have to have the same fix in stable as in -tip.

It's fine to send something to stable that says "Fixed differently by
commit XYZ upstream". The main thing is to make sure that stable
doesn't have fixes that then get lost upstream (which we used to have
long long ago).

                  Linus

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-06 11:07     ` Eiichi Tsukata
@ 2019-07-08  7:48       ` Peter Zijlstra
  2019-07-08  8:58         ` Eiichi Tsukata
  0 siblings, 1 reply; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-08  7:48 UTC (permalink / raw)
  To: Eiichi Tsukata
  Cc: Linus Torvalds, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes

On Sat, Jul 06, 2019 at 08:07:22PM +0900, Eiichi Tsukata wrote:
> 
> 
> On 2019/07/05 11:18, Linus Torvalds wrote:
> > On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >>
> >> Despire the current efforts to read CR2 before tracing happens there
> >> still exist a number of possible holes:
> > 
> > So this whole series disturbs me for the simple reason that I thought
> > tracing was supposed to save/restore cr2 and make it unnecessary to
> > worry about this in non-tracing code.
> > 
> > That is very much what the NMI code explicitly does. Why shouldn't all
> > the other tracing code do the same thing in case they can take page
> > faults?
> > 
> > So I don't think the patches are wrong per se, but this seems to solve
> > it at the wrong level.

This solves it all at one site. If we're going to sprinkle CR2
save/restore over 'all' sites we're bound to miss some and we'll be
playing catch-up forever :/

> Steven previously tried to fix it by saving CR2 in TRACE_IRQS_OFF:
> https://lore.kernel.org/lkml/20190320221534.165ab87b@oasis.local.home/

That misses the context tracking tracepoint, which is also before we
load CR2.

> To prevent userstack trace code from reading user stack before it becomes ready,
> checking current->in_execve in stack_trace_save_user() can help Steven's approach,
> though trace_sched_process_exec() is called before current->in_execve = 0 so it changes
> current behavior.
> 
> The PoC code is as follows:
> 
> diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
> index 2abf27d7df6b..30fa6e1b7a87 100644
> --- a/arch/x86/kernel/stacktrace.c
> +++ b/arch/x86/kernel/stacktrace.c
> @@ -116,10 +116,12 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
>                           const struct pt_regs *regs)
>  {
>         const void __user *fp = (const void __user *)regs->bp;
> +       unsigned long address;
>  
>         if (!consume_entry(cookie, regs->ip, false))
>                 return;
>  
> +       address = read_cr2();
>         while (1) {
>                 struct stack_frame_user frame;
>  
> @@ -131,11 +133,14 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
>                         break;
>                 if (frame.ret_addr) {
>                         if (!consume_entry(cookie, frame.ret_addr, false))
> -                               return;
> +                               break;
>                 }
>                 if (fp == frame.next_fp)
>                         break;
>                 fp = frame.next_fp;
>         }
> +
> +       if (address != read_cr2())
> +               write_cr2(address);
>  }
>  

And this misses the perf unwinder, which we can reach if we attach perf
to the tracepoint. We can most likely also do user accesses from
kprobes, which we can similarly attach to tracepoints, and there's eBPF,
and who knows what else...

Do we really want to go put CR2 save/restore around every single thing
that can cause a fault (any fault) and is reachable from a tracepoint?
That's going to be a long list of things ... :/

Or are we going to put the CR2 save/restore on every single tracepoint?
But then we also need it on the mcount/fentry stubs and we again have
multiple places.

Whereas if we stick it in the entry path, like I proposed, we fix it in
one location and we're done.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-08  7:48       ` Peter Zijlstra
@ 2019-07-08  8:58         ` Eiichi Tsukata
  2019-07-08  9:42           ` Eiichi Tsukata
  0 siblings, 1 reply; 42+ messages in thread
From: Eiichi Tsukata @ 2019-07-08  8:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes



On 2019/07/08 16:48, Peter Zijlstra wrote:
...

> 
> Or are we going to put the CR2 save/restore on every single tracepoint?
> But then we also need it on the mcount/fentry stubs and we again have
> multiple places.
> 
> Whereas if we stick it in the entry path, like I proposed, we fix it in
> one location and we're done.
> 

Thanks to your detailed comments and the discussion, I've come to realize
that your solution "save CR2 early in the entry" is the simplest and reasonable.

By the way, is there possibility that the WARNING(#GP in execve(2)) which Steven
previously hit? : https://lore.kernel.org/lkml/20190321095502.47b51356@gandalf.local.home/

Even if there were, it will *Not* be the bug introduced by this patch series,
but the bug revealed by this series.

Thanks,

Eiichi

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-08  8:58         ` Eiichi Tsukata
@ 2019-07-08  9:42           ` Eiichi Tsukata
  2019-07-09  5:17             ` Eiichi Tsukata
  0 siblings, 1 reply; 42+ messages in thread
From: Eiichi Tsukata @ 2019-07-08  9:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes



On 2019/07/08 17:58, Eiichi Tsukata wrote:

> 
> By the way, is there possibility that the WARNING(#GP in execve(2)) which Steven
> previously hit? : https://lore.kernel.org/lkml/20190321095502.47b51356@gandalf.local.home/
> 
> Even if there were, it will *Not* be the bug introduced by this patch series,
> but the bug revealed by this series.
> 

I reproduced with the kernel:
  - https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=x86/core&id=057b48d544b384c47ed921f616128b802ab1fdc3

Reproducer:

  # echo 1 > events/preemptirq/irq_disable/enable
  # echo userstacktrace > trace_options
  # service sshd restart

[   20.338200] ------------[ cut here ]------------
[   20.339471] General protection fault in user access. Non-canonical address?
[   20.339488] WARNING: CPU: 1 PID: 1957 at arch/x86/mm/extable.c:126 ex_handler_uaccess+0x52/0x60
[   20.342550] Modules linked in:
[   20.343209] CPU: 1 PID: 1957 Comm: systemctl Tainted: G        W         5.2.0-rc7+ #48
[   20.344935] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[   20.346688] RIP: 0010:ex_handler_uaccess+0x52/0x60
[   20.347667] Code: c4 08 b8 01 00 00 00 5b 5d c3 80 3d b6 8c 94 01 00 75 db 48 c7 c7 b8 e4 6f aa 48 89 75 f0 c6 05 ad
[   20.351508] RSP: 0018:ffffb28940453688 EFLAGS: 00010082
[   20.352707] RAX: 0000000000000000 RBX: ffffffffaa2023c0 RCX: 0000000000000001
[   20.354478] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffaa7dab8d
[   20.355706] RBP: ffffb28940453698 R08: 0000000000000000 R09: 0000000000000001
[   20.356665] R10: 0000000000000000 R11: 0000000000000000 R12: ffffb28940453728
[   20.357594] R13: 0000000000000000 R14: 000000000000000d R15: 0000000000000000
[   20.358876] FS:  00007fec9fa248c0(0000) GS:ffff921d3d800000(0000) knlGS:0000000000000000
[   20.360573] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   20.361792] CR2: 0000000000000004 CR3: 0000000074d4e000 CR4: 00000000000006e0
[   20.363300] Call Trace:
[   20.363830]  fixup_exception+0x4a/0x61
[   20.364637]  __idt_do_general_protection+0x65/0x1d0
[   20.365684]  do_general_protection+0x1e/0x30
[   20.366654]  general_protection+0x1e/0x30
[   20.367513] RIP: 0010:arch_stack_walk_user+0x7a/0x100
[   20.368583] Code: 00 01 1f 00 0f 85 8d 00 00 00 49 8b 87 98 16 00 00 48 83 e8 10 49 39 c6 77 32 41 83 87 d0 15 00 04
[   20.372797] RSP: 0018:ffffb289404537d0 EFLAGS: 00010046
[   20.374027] RAX: 0000000000000000 RBX: ffff921d383bcb00 RCX: 0074726174736572
[   20.375674] RDX: ffff921d38393b84 RSI: 7265732e64687373 RDI: ffffb28940453818
[   20.377179] RBP: ffffb28940453808 R08: 0000000000000001 R09: ffff921d3d169c00
[   20.378681] R10: 0000000000000b60 R11: ffff921d38393b70 R12: ffffb28940453818
[   20.380195] R13: ffffb28940453f58 R14: 0074726174736572 R15: ffff921d383bcb00
[   20.381703]  ? profile_setup.cold+0x9b/0x9b
[   20.382604]  stack_trace_save_user+0x60/0x7d
[   20.383520]  trace_buffer_unlock_commit_regs+0x17b/0x220
[   20.384686]  trace_event_buffer_commit+0x6b/0x200
[   20.385741]  trace_event_raw_event_preemptirq_template+0x7b/0xc0
[   20.387067]  ? get_page_from_freelist+0x10a/0x13b0
[   20.388129]  ? __alloc_pages_nodemask+0x178/0x380
[   20.389132]  trace_hardirqs_off+0xc6/0x100
[   20.390006]  get_page_from_freelist+0x10a/0x13b0
[   20.390997]  ? kvm_clock_read+0x18/0x30
[   20.391813]  ? sched_clock+0x9/0x10
[   20.392647]  __alloc_pages_nodemask+0x178/0x380
[   20.393706]  alloc_pages_current+0x87/0xe0
[   20.394609]  __page_cache_alloc+0xcd/0x110
[   20.395491]  __do_page_cache_readahead+0xa1/0x1a0
[   20.396547]  ondemand_readahead+0x220/0x540
[   20.397486]  page_cache_sync_readahead+0x35/0x50
[   20.398511]  generic_file_read_iter+0x8d8/0xbd0
[   20.399335]  ? __might_sleep+0x4b/0x80
[   20.400110]  ext4_file_read_iter+0x35/0x40
[   20.400937]  new_sync_read+0x10f/0x1a0
[   20.401833]  __vfs_read+0x29/0x40
[   20.402608]  vfs_read+0xc0/0x170
[   20.403319]  kernel_read+0x31/0x50
[   20.404128]  prepare_binprm+0x150/0x190
[   20.404766]  __do_execve_file.isra.0+0x4c0/0xaa0
[   20.405559]  __x64_sys_execve+0x39/0x50
[   20.406173]  do_syscall_64+0x55/0x1b0
[   20.406762]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   20.407703] RIP: 0033:0x7fec9f0ee647
[   20.408441] Code: ff 76 e0 f7 d8 64 41 89 01 eb d8 0f 1f 84 00 00 00 00 00 f7 d8 64 41 89 01 eb d7 0f 1f 84 00 00 08
[   20.411636] RSP: 002b:00007ffe7f316228 EFLAGS: 00000202 ORIG_RAX: 000000000000003b
[   20.412686] RAX: ffffffffffffffda RBX: 00007fec9f8105a4 RCX: 00007fec9f0ee647
[   20.414047] RDX: 00007ffe7f3167d8 RSI: 00007ffe7f316230 RDI: 00007fec9f73cea0
[   20.415048] RBP: 00007ffe7f316480 R08: 0000000000000030 R09: 0000000000000001
[   20.416076] R10: 00007ffe7f316498 R11: 0000000000000202 R12: 00007fec9f73cea0
[   20.417556] R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
[   20.419032] irq event stamp: 832
[   20.419493] hardirqs last  enabled at (831): [<ffffffffa94b9f2e>] __find_get_block+0xde/0x360
[   20.420649] hardirqs last disabled at (832): [<ffffffffa9305ad2>] rcu_irq_enter_irqson+0x12/0x30
[   20.422179] softirqs last  enabled at (436): [<ffffffffaa20037f>] __do_softirq+0x37f/0x472
[   20.423286] softirqs last disabled at (291): [<ffffffffa927bce3>] irq_exit+0xb3/0xd0
[   20.424296] ---[ end trace cdb84a67edcdc420 ]---

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-08  9:42           ` Eiichi Tsukata
@ 2019-07-09  5:17             ` Eiichi Tsukata
  0 siblings, 0 replies; 42+ messages in thread
From: Eiichi Tsukata @ 2019-07-09  5:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Andrew Lutomirski, Peter Anvin, Dave Hansen,
	Juergen Gross, Linux List Kernel Mailing, He Zhe, Joel Fernandes



On 2019/07/08 18:42, Eiichi Tsukata wrote:
> 
> 
> On 2019/07/08 17:58, Eiichi Tsukata wrote:
> 
>>
>> By the way, is there possibility that the WARNING(#GP in execve(2)) which Steven
>> previously hit? : https://lore.kernel.org/lkml/20190321095502.47b51356@gandalf.local.home/
>>
>> Even if there were, it will *Not* be the bug introduced by this patch series,
>> but the bug revealed by this series.
>>
> 
> I reproduced with the kernel:
>   - https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=x86/core&id=057b48d544b384c47ed921f616128b802ab1fdc3
> 
> Reproducer:
> 
>   # echo 1 > events/preemptirq/irq_disable/enable
>   # echo userstacktrace > trace_options
>   # service sshd restart
> 
> [   20.338200] ------------[ cut here ]------------
> [   20.339471] General protection fault in user access. Non-canonical address?
> [   20.339488] WARNING: CPU: 1 PID: 1957 at arch/x86/mm/extable.c:126 ex_handler_uaccess+0x52/0x60
> [   20.342550] Modules linked in:
> [   20.343209] CPU: 1 PID: 1957 Comm: systemctl Tainted: G        W         5.2.0-rc7+ #48
> [   20.344935] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
> [   20.346688] RIP: 0010:ex_handler_uaccess+0x52/0x60
> [   20.347667] Code: c4 08 b8 01 00 00 00 5b 5d c3 80 3d b6 8c 94 01 00 75 db 48 c7 c7 b8 e4 6f aa 48 89 75 f0 c6 05 ad
> [   20.351508] RSP: 0018:ffffb28940453688 EFLAGS: 00010082
> [   20.352707] RAX: 0000000000000000 RBX: ffffffffaa2023c0 RCX: 0000000000000001
> [   20.354478] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffaa7dab8d
> [   20.355706] RBP: ffffb28940453698 R08: 0000000000000000 R09: 0000000000000001
> [   20.356665] R10: 0000000000000000 R11: 0000000000000000 R12: ffffb28940453728
> [   20.357594] R13: 0000000000000000 R14: 000000000000000d R15: 0000000000000000
> [   20.358876] FS:  00007fec9fa248c0(0000) GS:ffff921d3d800000(0000) knlGS:0000000000000000
> [   20.360573] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   20.361792] CR2: 0000000000000004 CR3: 0000000074d4e000 CR4: 00000000000006e0
> [   20.363300] Call Trace:
> [   20.363830]  fixup_exception+0x4a/0x61
> [   20.364637]  __idt_do_general_protection+0x65/0x1d0
> [   20.365684]  do_general_protection+0x1e/0x30
> [   20.366654]  general_protection+0x1e/0x30
> [   20.367513] RIP: 0010:arch_stack_walk_user+0x7a/0x100
> [   20.368583] Code: 00 01 1f 00 0f 85 8d 00 00 00 49 8b 87 98 16 00 00 48 83 e8 10 49 39 c6 77 32 41 83 87 d0 15 00 04
> [   20.372797] RSP: 0018:ffffb289404537d0 EFLAGS: 00010046
> [   20.374027] RAX: 0000000000000000 RBX: ffff921d383bcb00 RCX: 0074726174736572
> [   20.375674] RDX: ffff921d38393b84 RSI: 7265732e64687373 RDI: ffffb28940453818
> [   20.377179] RBP: ffffb28940453808 R08: 0000000000000001 R09: ffff921d3d169c00
> [   20.378681] R10: 0000000000000b60 R11: ffff921d38393b70 R12: ffffb28940453818
> [   20.380195] R13: ffffb28940453f58 R14: 0074726174736572 R15: ffff921d383bcb00
> [   20.381703]  ? profile_setup.cold+0x9b/0x9b
> [   20.382604]  stack_trace_save_user+0x60/0x7d
> [   20.383520]  trace_buffer_unlock_commit_regs+0x17b/0x220
> [   20.384686]  trace_event_buffer_commit+0x6b/0x200
> [   20.385741]  trace_event_raw_event_preemptirq_template+0x7b/0xc0
> [   20.387067]  ? get_page_from_freelist+0x10a/0x13b0
> [   20.388129]  ? __alloc_pages_nodemask+0x178/0x380
> [   20.389132]  trace_hardirqs_off+0xc6/0x100
> [   20.390006]  get_page_from_freelist+0x10a/0x13b0
> [   20.390997]  ? kvm_clock_read+0x18/0x30
> [   20.391813]  ? sched_clock+0x9/0x10
> [   20.392647]  __alloc_pages_nodemask+0x178/0x380
> [   20.393706]  alloc_pages_current+0x87/0xe0
> [   20.394609]  __page_cache_alloc+0xcd/0x110
> [   20.395491]  __do_page_cache_readahead+0xa1/0x1a0
> [   20.396547]  ondemand_readahead+0x220/0x540
> [   20.397486]  page_cache_sync_readahead+0x35/0x50
> [   20.398511]  generic_file_read_iter+0x8d8/0xbd0
> [   20.399335]  ? __might_sleep+0x4b/0x80
> [   20.400110]  ext4_file_read_iter+0x35/0x40
> [   20.400937]  new_sync_read+0x10f/0x1a0
> [   20.401833]  __vfs_read+0x29/0x40
> [   20.402608]  vfs_read+0xc0/0x170
> [   20.403319]  kernel_read+0x31/0x50

The cause was obvious as Linus already said in:
  https://lore.kernel.org/lkml/CAHk-=whvxJjBBOmQSsrB-xHkc6xm8zGHsBRgpxh14UsEY_g+nw@mail.gmail.com/

stack_trace_save_user() is called after set_fs(KERNEL_DS).
I don't know why systemctl alyways hit this, but user space process can make up stack
as it like, so we have to check it by ourselves?


ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
    mm_segment_t old_fs;
    ssize_t result;

    old_fs = get_fs();
    set_fs(KERNEL_DS);
    /* The cast to a user pointer is valid due to the set_fs() */
    result = vfs_read(file, (void __user *)buf, count, pos);
    set_fs(old_fs);
    return result;
}
EXPORT_SYMBOL(kernel_read);



> [   20.404128]  prepare_binprm+0x150/0x190
> [   20.404766]  __do_execve_file.isra.0+0x4c0/0xaa0
> [   20.405559]  __x64_sys_execve+0x39/0x50
> [   20.406173]  do_syscall_64+0x55/0x1b0
> [   20.406762]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   20.407703] RIP: 0033:0x7fec9f0ee647
> [   20.408441] Code: ff 76 e0 f7 d8 64 41 89 01 eb d8 0f 1f 84 00 00 00 00 00 f7 d8 64 41 89 01 eb d7 0f 1f 84 00 00 08
> [   20.411636] RSP: 002b:00007ffe7f316228 EFLAGS: 00000202 ORIG_RAX: 000000000000003b
> [   20.412686] RAX: ffffffffffffffda RBX: 00007fec9f8105a4 RCX: 00007fec9f0ee647
> [   20.414047] RDX: 00007ffe7f3167d8 RSI: 00007ffe7f316230 RDI: 00007fec9f73cea0
> [   20.415048] RBP: 00007ffe7f316480 R08: 0000000000000030 R09: 0000000000000001
> [   20.416076] R10: 00007ffe7f316498 R11: 0000000000000202 R12: 00007fec9f73cea0
> [   20.417556] R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
> [   20.419032] irq event stamp: 832
> [   20.419493] hardirqs last  enabled at (831): [<ffffffffa94b9f2e>] __find_get_block+0xde/0x360
> [   20.420649] hardirqs last disabled at (832): [<ffffffffa9305ad2>] rcu_irq_enter_irqson+0x12/0x30
> [   20.422179] softirqs last  enabled at (436): [<ffffffffaa20037f>] __do_softirq+0x37f/0x472
> [   20.423286] softirqs last disabled at (291): [<ffffffffa927bce3>] irq_exit+0xb3/0xd0
> [   20.424296] ---[ end trace cdb84a67edcdc420 ]---
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE
  2019-07-04 19:55 ` [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE Peter Zijlstra
  2019-07-04 21:49   ` Andy Lutomirski
@ 2019-07-10 19:53   ` Steven Rostedt
  1 sibling, 0 replies; 42+ messages in thread
From: Steven Rostedt @ 2019-07-10 19:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Thu, 04 Jul 2019 21:55:56 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> --- a/arch/x86/xen/xen-asm.S
> +++ b/arch/x86/xen/xen-asm.S
> @@ -10,6 +10,7 @@
>  #include <asm/percpu.h>
>  #include <asm/processor-flags.h>
>  #include <asm/frame.h>
> +#include <asm/asm.h>
>  
>  #include <linux/linkage.h>
>  
> @@ -135,3 +136,19 @@ ENTRY(check_events)
>  	FRAME_END
>  	ret
>  ENDPROC(check_events)
> +
> +ENTRY(xen_read_cr2)
> +	FRAME_BEGIN
> +	_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
> +	_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
> +	FRAME_END
> +	ret
> +	ENDPROC(xen_read_cr2);
> +
> +ENTRY(xen_read_cr2_direct)
> +	FRAME_BEGIN
> +	_ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
> +	FRAME_END
> +	ret
> +	ENDPROC(xen_read_cr2_direct);
> +

When I applied this locally, git complained about this extra line at the
end of the file.

-- Steve

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/7] x86/entry/32: Simplify common_exception
  2019-07-04 19:55 ` [PATCH v2 2/7] x86/entry/32: Simplify common_exception Peter Zijlstra
  2019-07-04 21:51   ` Andy Lutomirski
@ 2019-07-10 20:11   ` Steven Rostedt
  2019-07-10 20:14     ` Peter Zijlstra
  1 sibling, 1 reply; 42+ messages in thread
From: Steven Rostedt @ 2019-07-10 20:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Thu, 04 Jul 2019 21:55:57 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -294,9 +294,11 @@
>  .Lfinished_frame_\@:
>  .endm
>  
> -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
> +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
>  	cld
> +.if \skip_gs == 0
>  	PUSH_GS
> +.endif
>  	FIXUP_FRAME
>  	pushl	%fs
>  	pushl	%es
> @@ -313,13 +315,13 @@
>  	movl	%edx, %es
>  	movl	$(__KERNEL_PERCPU), %edx
>  	movl	%edx, %fs
> +.if \skip_gs == 0
>  	SET_KERNEL_GS %edx
> -
> +.endif
>  	/* Switch to kernel stack if necessary */
>  .if \switch_stacks > 0
>  	SWITCH_TO_KERNEL_STACK
>  .endif
> -
>  .endm
>  
>  .macro SAVE_ALL_NMI cr3_reg:req
> @@ -1448,32 +1450,20 @@ END(page_fault)
>  
>  common_exception:
>  	/* the function address is in %gs's slot on the stack */
> -	FIXUP_FRAME
> -	pushl	%fs
> -	pushl	%es
> -	pushl	%ds
> -	pushl	%eax
> -	movl	$(__USER_DS), %eax
> -	movl	%eax, %ds
> -	movl	%eax, %es
> -	movl	$(__KERNEL_PERCPU), %eax
> -	movl	%eax, %fs
> -	pushl	%ebp
> -	pushl	%edi
> -	pushl	%esi
> -	pushl	%edx
> -	pushl	%ecx
> -	pushl	%ebx
> -	SWITCH_TO_KERNEL_STACK
> +	SAVE_ALL switch_stacks=1 skip_gs=1
>  	ENCODE_FRAME_POINTER
> -	cld

The only code change of this is that cld moved from the end to the
beginning. As this appears to match other SAVE_ALL users with respect
to ENCODE_FRAME_POINTER, this shouldn't be an issue.

Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

-- Steve



>  	UNWIND_ESPFIX_STACK
> +

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/7] x86/entry/32: Simplify common_exception
  2019-07-10 20:11   ` Steven Rostedt
@ 2019-07-10 20:14     ` Peter Zijlstra
  0 siblings, 0 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-10 20:14 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Wed, Jul 10, 2019 at 04:11:37PM -0400, Steven Rostedt wrote:
> On Thu, 04 Jul 2019 21:55:57 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:

> > -	cld
> 
> The only code change of this is that cld moved from the end to the
> beginning. As this appears to match other SAVE_ALL users with respect
> to ENCODE_FRAME_POINTER, this shouldn't be an issue.

On that, getting CLD and CLAC together is somewhere on the TODO list. It
is stupid they're not.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little
  2019-07-04 19:55 ` [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little Peter Zijlstra
  2019-07-04 21:54   ` Andy Lutomirski
@ 2019-07-10 20:23   ` Steven Rostedt
  1 sibling, 0 replies; 42+ messages in thread
From: Steven Rostedt @ 2019-07-10 20:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Thu, 04 Jul 2019 21:55:58 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> There's a bunch of duplication in idtentry, namely the
> .Lfrom_usermode_switch_stack is a paranoid=0 copy of the normal flow.
> 
> Make this explicit by creating a idtentry_part helper macro.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

-- Steve

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap
  2019-07-04 19:55 ` [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap Peter Zijlstra
  2019-07-04 21:55   ` Andy Lutomirski
@ 2019-07-10 20:24   ` Steven Rostedt
  1 sibling, 0 replies; 42+ messages in thread
From: Steven Rostedt @ 2019-07-10 20:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Thu, 04 Jul 2019 21:55:59 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

Very descriptive change log ;-)

Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

-- Steve


> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-07 18:17       ` Linus Torvalds
@ 2019-07-10 20:27         ` Steven Rostedt
  2019-07-11  6:45           ` Greg Kroah-Hartman
                             ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: Steven Rostedt @ 2019-07-10 20:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andy Lutomirski, Peter Zijlstra, Thomas Gleixner,
	Borislav Petkov, Ingo Molnar, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel, stable,
	Greg Kroah-Hartman, Sasha Levin


[ added stable folks ]

On Sun, 7 Jul 2019 11:17:09 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > FWIW, I'm leaning toward suggesting that we apply the trivial tracing
> > fix and backport *that*.  Then, in -tip, we could revert it and apply
> > this patch instead.  
> 
> You don't have to have the same fix in stable as in -tip.
> 
> It's fine to send something to stable that says "Fixed differently by
> commit XYZ upstream". The main thing is to make sure that stable
> doesn't have fixes that then get lost upstream (which we used to have
> long long ago).
> 

But isn't it easier for them to just pull the quick fix in, if it is in
your tree? That is, it shouldn't be too hard to make the "quick fix"
that gets backported on your tree (and probably better testing), and
then add the proper fix on top of it. The stable folks will then just
use the commit sha to know what to take, and feel more confident about
taking it.

-- Steve

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 6/7] x86/entry/64: Remove TRACE_IRQS_*_DEBUG
  2019-07-04 19:56 ` [PATCH v2 6/7] x86/entry/64: Remove TRACE_IRQS_*_DEBUG Peter Zijlstra
@ 2019-07-11  3:24   ` Steven Rostedt
  2019-07-11  8:05     ` Peter Zijlstra
  0 siblings, 1 reply; 42+ messages in thread
From: Steven Rostedt @ 2019-07-11  3:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Thu, 04 Jul 2019 21:56:01 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Since INT3/#BP no longer runs on an IST, this workaround is no longer
> required.
> 
> Tested by running lockdep+ftrace as described in the initial commit:
> 
>   5963e317b1e9 ("ftrace/x86: Do not change stacks in DEBUG when calling lockdep")

It looks like a clean revert, and it passed my ftrace smoke tests with
lockdep enabled (although I triggered a locked warning unrelated to
this, with the text_mutex and module_mutex, but I'm hoping my tree has
the fixes for that).

Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

Hmm, does this mean we can remove the IDT switching in the NMI handler
as well?

-- Steve


> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/entry/entry_64.S |   46 ++--------------------------------------------
>  1 file changed, 2 insertions(+), 44 deletions(-)
> 
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -68,44 +68,6 @@ END(native_usergs_sysret64)
>  .endm
>  
>  /*
> - * When dynamic function tracer is enabled it will add a breakpoint
> - * to all locations that it is about to modify, sync CPUs, update
> - * all the code, sync CPUs, then remove the breakpoints. In this time
> - * if lockdep is enabled, it might jump back into the debug handler
> - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
> - *
> - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
> - * make sure the stack pointer does not get reset back to the top
> - * of the debug stack, and instead just reuses the current stack.
> - */
> -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
> -
> -.macro TRACE_IRQS_OFF_DEBUG
> -	call	debug_stack_set_zero
> -	TRACE_IRQS_OFF
> -	call	debug_stack_reset
> -.endm
> -
> -.macro TRACE_IRQS_ON_DEBUG
> -	call	debug_stack_set_zero
> -	TRACE_IRQS_ON
> -	call	debug_stack_reset
> -.endm
> -
> -.macro TRACE_IRQS_IRETQ_DEBUG
> -	btl	$9, EFLAGS(%rsp)		/* interrupts off? */
> -	jnc	1f
> -	TRACE_IRQS_ON_DEBUG
> -1:
> -.endm
> -
> -#else
> -# define TRACE_IRQS_OFF_DEBUG			TRACE_IRQS_OFF
> -# define TRACE_IRQS_ON_DEBUG			TRACE_IRQS_ON
> -# define TRACE_IRQS_IRETQ_DEBUG			TRACE_IRQS_IRETQ
> -#endif
> -
> -/*
>   * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
>   *
>   * This is the only entry point used for 64-bit system calls.  The
> @@ -879,11 +841,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
>  	GET_CR2_INTO(%rdx);			/* can clobber %rax */
>  	.endif
>  
> -	.if \shift_ist != -1
> -	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
> -	.else
>  	TRACE_IRQS_OFF
> -	.endif
>  
>  	.if \paranoid == 0
>  	testb	$3, CS(%rsp)
> @@ -1292,7 +1250,7 @@ END(paranoid_entry)
>  ENTRY(paranoid_exit)
>  	UNWIND_HINT_REGS
>  	DISABLE_INTERRUPTS(CLBR_ANY)
> -	TRACE_IRQS_OFF_DEBUG
> +	TRACE_IRQS_OFF
>  
>  	/* Handle GS depending on FSGSBASE availability */
>  	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE
> @@ -1312,7 +1270,7 @@ ENTRY(paranoid_exit)
>  	jmp	.Lparanoid_exit_restore
>  
>  .Lparanoid_exit_no_swapgs:
> -	TRACE_IRQS_IRETQ_DEBUG
> +	TRACE_IRQS_IRETQ
>  	/* Always restore stashed CR3 value (see paranoid_entry) */
>  	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
>  
> 


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-10 20:27         ` Steven Rostedt
@ 2019-07-11  6:45           ` Greg Kroah-Hartman
  2019-07-11 12:12           ` Sasha Levin
  2019-07-11 12:21           ` Peter Zijlstra
  2 siblings, 0 replies; 42+ messages in thread
From: Greg Kroah-Hartman @ 2019-07-11  6:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Andy Lutomirski, Peter Zijlstra, Thomas Gleixner,
	Borislav Petkov, Ingo Molnar, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel, stable,
	Sasha Levin

On Wed, Jul 10, 2019 at 04:27:09PM -0400, Steven Rostedt wrote:
> 
> [ added stable folks ]
> 
> On Sun, 7 Jul 2019 11:17:09 -0700
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> > On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <luto@kernel.org> wrote:
> > >
> > > FWIW, I'm leaning toward suggesting that we apply the trivial tracing
> > > fix and backport *that*.  Then, in -tip, we could revert it and apply
> > > this patch instead.  
> > 
> > You don't have to have the same fix in stable as in -tip.
> > 
> > It's fine to send something to stable that says "Fixed differently by
> > commit XYZ upstream". The main thing is to make sure that stable
> > doesn't have fixes that then get lost upstream (which we used to have
> > long long ago).
> > 
> 
> But isn't it easier for them to just pull the quick fix in, if it is in
> your tree? That is, it shouldn't be too hard to make the "quick fix"
> that gets backported on your tree (and probably better testing), and
> then add the proper fix on top of it. The stable folks will then just
> use the commit sha to know what to take, and feel more confident about
> taking it.

It all depends on what the "quick fix" is.  The reason I want to take
the exact same patch that is in Linus's tree is that 95% of the time
that we do a "one off" patch for stable only, it's wrong.  We _ALWAYS_
get it wrong somehow, it's crazy how bad we are at this.  I don't know
why this is, but we have the stats to prove it.

Because of this, I now require the "one off" stable only fixes to get a
bunch of people reviewing it and write up a bunch of explaination as to
why this is the way it is and why we can't just take whatever is in
mainline.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 6/7] x86/entry/64: Remove TRACE_IRQS_*_DEBUG
  2019-07-11  3:24   ` Steven Rostedt
@ 2019-07-11  8:05     ` Peter Zijlstra
  0 siblings, 0 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-11  8:05 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Wed, Jul 10, 2019 at 11:24:56PM -0400, Steven Rostedt wrote:
> On Thu, 04 Jul 2019 21:56:01 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > Since INT3/#BP no longer runs on an IST, this workaround is no longer
> > required.
> > 
> > Tested by running lockdep+ftrace as described in the initial commit:
> > 
> >   5963e317b1e9 ("ftrace/x86: Do not change stacks in DEBUG when calling lockdep")
> 
> It looks like a clean revert, and it passed my ftrace smoke tests with
> lockdep enabled (although I triggered a locked warning unrelated to
> this, with the text_mutex and module_mutex, but I'm hoping my tree has
> the fixes for that).
> 
> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

Thanks!

> Hmm, does this mean we can remove the IDT switching in the NMI handler
> as well?

I'll have to go look at that; there still are ISTs and NMIs can still
hit those.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-10 20:27         ` Steven Rostedt
  2019-07-11  6:45           ` Greg Kroah-Hartman
@ 2019-07-11 12:12           ` Sasha Levin
  2019-07-11 12:21           ` Peter Zijlstra
  2 siblings, 0 replies; 42+ messages in thread
From: Sasha Levin @ 2019-07-11 12:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Andy Lutomirski, Peter Zijlstra, Thomas Gleixner,
	Borislav Petkov, Ingo Molnar, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel, stable,
	Greg Kroah-Hartman

On Wed, Jul 10, 2019 at 04:27:09PM -0400, Steven Rostedt wrote:
>
>[ added stable folks ]
>
>On Sun, 7 Jul 2019 11:17:09 -0700
>Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <luto@kernel.org> wrote:
>> >
>> > FWIW, I'm leaning toward suggesting that we apply the trivial tracing
>> > fix and backport *that*.  Then, in -tip, we could revert it and apply
>> > this patch instead.
>>
>> You don't have to have the same fix in stable as in -tip.
>>
>> It's fine to send something to stable that says "Fixed differently by
>> commit XYZ upstream". The main thing is to make sure that stable
>> doesn't have fixes that then get lost upstream (which we used to have
>> long long ago).
>>
>
>But isn't it easier for them to just pull the quick fix in, if it is in
>your tree? That is, it shouldn't be too hard to make the "quick fix"
>that gets backported on your tree (and probably better testing), and
>then add the proper fix on top of it. The stable folks will then just
>use the commit sha to know what to take, and feel more confident about
>taking it.

I'd say that if the "final" fix is significantly different than what
we'll end up with upstream then just do as Linus said and send us a
separate backport.

If we try doing the apply fix/revert etc games it'll just be more
difficult later on to parse what has happened. On the other hand, if we
have a clear explanation in the backported commit as to how it's
different from upstream and the reasons for doing so it'll make future
us happy when we try to apply fixes on top of it.

--
Thanks,
Sasha

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption
  2019-07-10 20:27         ` Steven Rostedt
  2019-07-11  6:45           ` Greg Kroah-Hartman
  2019-07-11 12:12           ` Sasha Levin
@ 2019-07-11 12:21           ` Peter Zijlstra
  2 siblings, 0 replies; 42+ messages in thread
From: Peter Zijlstra @ 2019-07-11 12:21 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Andy Lutomirski, Thomas Gleixner,
	Borislav Petkov, Ingo Molnar, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel, stable,
	Greg Kroah-Hartman, Sasha Levin

On Wed, Jul 10, 2019 at 04:27:09PM -0400, Steven Rostedt wrote:

> But isn't it easier for them to just pull the quick fix in, if it is in

Steve, I've not yet seen a quick fix that actually fixes all the
problems.

Your initial one only fixes the IRQ tracing one, but leaves the context
tracking one wide open.

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2019-07-11 12:21 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-04 19:55 [PATCH v2 0/7] Tracing vs CR2 (and cleanups) Peter Zijlstra
2019-07-04 19:55 ` [PATCH v2 1/7] x86/paravirt: Make read_cr2() CALLEE_SAVE Peter Zijlstra
2019-07-04 21:49   ` Andy Lutomirski
2019-07-10 19:53   ` Steven Rostedt
2019-07-04 19:55 ` [PATCH v2 2/7] x86/entry/32: Simplify common_exception Peter Zijlstra
2019-07-04 21:51   ` Andy Lutomirski
2019-07-10 20:11   ` Steven Rostedt
2019-07-10 20:14     ` Peter Zijlstra
2019-07-04 19:55 ` [PATCH v2 3/7] x86/entry/64: Simplify idtentry a little Peter Zijlstra
2019-07-04 21:54   ` Andy Lutomirski
2019-07-10 20:23   ` Steven Rostedt
2019-07-04 19:55 ` [PATCH v2 4/7] x86/entry/64: Update comments and sanity tests for create_gap Peter Zijlstra
2019-07-04 21:55   ` Andy Lutomirski
2019-07-10 20:24   ` Steven Rostedt
2019-07-04 19:56 ` [PATCH v2 5/7] x86/mm, tracing: Fix CR2 corruption Peter Zijlstra
2019-07-05  2:18   ` Linus Torvalds
2019-07-05  3:16     ` Andy Lutomirski
2019-07-05  3:27       ` Linus Torvalds
2019-07-05 13:49     ` Peter Zijlstra
2019-07-06 21:41       ` Linus Torvalds
2019-07-06 22:27         ` Steven Rostedt
2019-07-06 22:41           ` Linus Torvalds
2019-07-07  0:08             ` Linus Torvalds
2019-07-07  0:36               ` Andy Lutomirski
2019-07-06 23:50           ` Andy Lutomirski
2019-07-07  3:44           ` Eiichi Tsukata
2019-07-06 11:07     ` Eiichi Tsukata
2019-07-08  7:48       ` Peter Zijlstra
2019-07-08  8:58         ` Eiichi Tsukata
2019-07-08  9:42           ` Eiichi Tsukata
2019-07-09  5:17             ` Eiichi Tsukata
2019-07-07 15:10   ` Andy Lutomirski
2019-07-07 15:11     ` Andy Lutomirski
2019-07-07 18:17       ` Linus Torvalds
2019-07-10 20:27         ` Steven Rostedt
2019-07-11  6:45           ` Greg Kroah-Hartman
2019-07-11 12:12           ` Sasha Levin
2019-07-11 12:21           ` Peter Zijlstra
2019-07-04 19:56 ` [PATCH v2 6/7] x86/entry/64: Remove TRACE_IRQS_*_DEBUG Peter Zijlstra
2019-07-11  3:24   ` Steven Rostedt
2019-07-11  8:05     ` Peter Zijlstra
2019-07-04 19:56 ` [RFC][PATCH v2 7/7] x86/entry/64: Pull bits into C Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).