All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] tracing vs CR2
@ 2019-07-03 10:27 root
  2019-07-03 10:27 ` [PATCH 1/3] x86/paravirt: Make read_cr2() CALLEE_SAVE root
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: root @ 2019-07-03 10:27 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

Hi,

Eiichi-san re-discovered the bug earlier found by He Zhe which we've failed to
fix due to getting distracted by discussing how to untangle entry_64.S.

These 3 patches are basically a completion of the initial approach I suggested
in that earlier thread:

  https://lkml.kernel.org/r/20190320221534.165ab87b@oasis.local.home

Yes, idtentry is a mess, and this doesn't help, but let's fix this now before
someone else trips over it.

This boots on x86_64 and builds on i386 so it must be perfect. No Xen testing
whatsoever, because I wouldn't know where to begin.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/3] x86/paravirt: Make read_cr2() CALLEE_SAVE
  2019-07-03 10:27 [PATCH 0/3] tracing vs CR2 root
@ 2019-07-03 10:27 ` root
  2019-07-03 14:12   ` Juergen Gross
  2019-07-03 10:27 ` [PATCH 2/3] x86/entry/32: Simplify common_exception root
  2019-07-03 10:27 ` [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption root
  2 siblings, 1 reply; 15+ messages in thread
From: root @ 2019-07-03 10:27 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

The one paravirt read_cr2() implementation (Xen) is actually quite
trivial and doesn't need to clobber anything other than the return
register. By making read_cr2() CALLEE_SAVE we avoid all the PUSH/POP
nonsense and allow more convenient use from assembly.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/calling.h              |    6 ++++++
 arch/x86/include/asm/paravirt.h       |   22 +++++++++++++---------
 arch/x86/include/asm/paravirt_types.h |    2 +-
 arch/x86/kernel/asm-offsets.c         |    1 +
 arch/x86/kernel/head_64.S             |    4 +---
 arch/x86/kernel/paravirt.c            |    2 +-
 arch/x86/xen/enlighten_pv.c           |    3 ++-
 arch/x86/xen/mmu_pv.c                 |   12 +-----------
 arch/x86/xen/xen-asm.S                |   17 +++++++++++++++++
 arch/x86/xen/xen-ops.h                |    3 +++
 10 files changed, 46 insertions(+), 26 deletions(-)

--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -383,3 +383,9 @@ For 32-bit we have the following convent
 .Lafter_call_\@:
 #endif
 .endm
+
+#ifdef CONFIG_PARAVIRT_XXL
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
+#else
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
+#endif
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -116,7 +116,7 @@ static inline void write_cr0(unsigned lo
 
 static inline unsigned long read_cr2(void)
 {
-	return PVOP_CALL0(unsigned long, mmu.read_cr2);
+	return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
 }
 
 static inline void write_cr2(unsigned long x)
@@ -909,13 +909,7 @@ extern void default_banner(void);
 		  ANNOTATE_RETPOLINE_SAFE;				\
 		  call PARA_INDIRECT(pv_ops+PV_CPU_swapgs);		\
 		 )
-#endif
-
-#define GET_CR2_INTO_RAX				\
-	ANNOTATE_RETPOLINE_SAFE;				\
-	call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);
 
-#ifdef CONFIG_PARAVIRT_XXL
 #define USERGS_SYSRET64							\
 	PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64),			\
 		  ANNOTATE_RETPOLINE_SAFE;				\
@@ -929,9 +923,19 @@ extern void default_banner(void);
 		  call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);	    \
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 #endif
-#endif
+#endif /* CONFIG_PARAVIRT_XXL */
+#endif	/* CONFIG_X86_64 */
+
+#ifdef CONFIG_PARAVIRT_XXL
+
+#define GET_CR2_INTO_AX							\
+	PARA_SITE(PARA_PATCH(PV_MMU_read_cr2),				\
+		  ANNOTATE_RETPOLINE_SAFE;				\
+		  call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);		\
+		 )
+
+#endif /* CONFIG_PARAVIRT_XXL */
 
-#endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
 #else  /* CONFIG_PARAVIRT */
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -220,7 +220,7 @@ struct pv_mmu_ops {
 	void (*exit_mmap)(struct mm_struct *mm);
 
 #ifdef CONFIG_PARAVIRT_XXL
-	unsigned long (*read_cr2)(void);
+	struct paravirt_callee_save read_cr2;
 	void (*write_cr2)(unsigned long);
 
 	unsigned long (*read_cr3)(void);
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -76,6 +76,7 @@ static void __used common(void)
 	BLANK();
 	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
 	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+	OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
 #endif
 
 	BLANK();
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -29,9 +29,7 @@
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/asm-offsets.h>
 #include <asm/paravirt.h>
-#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
 #else
-#define GET_CR2_INTO(reg) movq %cr2, reg
 #define INTERRUPT_RETURN iretq
 #endif
 
@@ -323,7 +321,7 @@ END(early_idt_handler_array)
 
 	cmpq $14,%rsi		/* Page fault? */
 	jnz 10f
-	GET_CR2_INTO(%rdi)	/* Can clobber any volatile register if pv */
+	GET_CR2_INTO(%rdi)	/* can clobber %rax if pv */
 	call early_make_pgtable
 	andl %eax,%eax
 	jz 20f			/* All good */
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -370,7 +370,7 @@ struct paravirt_patch_template pv_ops =
 	.mmu.exit_mmap		= paravirt_nop,
 
 #ifdef CONFIG_PARAVIRT_XXL
-	.mmu.read_cr2		= native_read_cr2,
+	.mmu.read_cr2		= __PV_IS_CALLEE_SAVE(native_read_cr2),
 	.mmu.write_cr2		= native_write_cr2,
 	.mmu.read_cr3		= __native_read_cr3,
 	.mmu.write_cr3		= native_write_cr3,
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -998,7 +998,8 @@ void __init xen_setup_vcpu_info_placemen
 			__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
 		pv_ops.irq.irq_enable =
 			__PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
-		pv_ops.mmu.read_cr2 = xen_read_cr2_direct;
+		pv_ops.mmu.read_cr2 =
+			__PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
 	}
 }
 
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1307,16 +1307,6 @@ static void xen_write_cr2(unsigned long
 	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
 }
 
-static unsigned long xen_read_cr2(void)
-{
-	return this_cpu_read(xen_vcpu)->arch.cr2;
-}
-
-unsigned long xen_read_cr2_direct(void)
-{
-	return this_cpu_read(xen_vcpu_info.arch.cr2);
-}
-
 static noinline void xen_flush_tlb(void)
 {
 	struct mmuext_op *op;
@@ -2397,7 +2387,7 @@ static void xen_leave_lazy_mmu(void)
 }
 
 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
-	.read_cr2 = xen_read_cr2,
+	.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
 	.write_cr2 = xen_write_cr2,
 
 	.read_cr3 = xen_read_cr3,
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -10,6 +10,7 @@
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/frame.h>
+#include <asm/asm.h>
 
 #include <linux/linkage.h>
 
@@ -135,3 +136,19 @@ ENTRY(check_events)
 	FRAME_END
 	ret
 ENDPROC(check_events)
+
+ENTRY(xen_read_cr2)
+	FRAME_BEGIN
+	_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
+	_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
+	FRAME_END
+	ret
+	ENDPROC(xen_read_cr2);
+
+ENTRY(xen_read_cr2_direct)
+	FRAME_BEGIN
+	_ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
+	FRAME_END
+	ret
+	ENDPROC(xen_read_cr2_direct);
+
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -134,6 +134,9 @@ __visible void xen_irq_disable_direct(vo
 __visible unsigned long xen_save_fl_direct(void);
 __visible void xen_restore_fl_direct(unsigned long);
 
+__visible unsigned long xen_read_cr2(void);
+__visible unsigned long xen_read_cr2_direct(void);
+
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
 __visible void xen_sysret32(void);



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 2/3] x86/entry/32: Simplify common_exception
  2019-07-03 10:27 [PATCH 0/3] tracing vs CR2 root
  2019-07-03 10:27 ` [PATCH 1/3] x86/paravirt: Make read_cr2() CALLEE_SAVE root
@ 2019-07-03 10:27 ` root
  2019-07-03 10:27 ` [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption root
  2 siblings, 0 replies; 15+ messages in thread
From: root @ 2019-07-03 10:27 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

By adding one more option to SAVE_ALL we can make use of it in
common_exception and simplify things. This saves duplication later
where page_fault will no longer use common_exception.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_32.S |   36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -294,9 +294,11 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
 	cld
+.if \skip_gs == 0
 	PUSH_GS
+.endif
 	FIXUP_FRAME
 	pushl	%fs
 	pushl	%es
@@ -313,13 +315,13 @@
 	movl	%edx, %es
 	movl	$(__KERNEL_PERCPU), %edx
 	movl	%edx, %fs
+.if \skip_gs == 0
 	SET_KERNEL_GS %edx
-
+.endif
 	/* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
 	SWITCH_TO_KERNEL_STACK
 .endif
-
 .endm
 
 .macro SAVE_ALL_NMI cr3_reg:req
@@ -1448,32 +1450,20 @@ END(page_fault)
 
 common_exception:
 	/* the function address is in %gs's slot on the stack */
-	FIXUP_FRAME
-	pushl	%fs
-	pushl	%es
-	pushl	%ds
-	pushl	%eax
-	movl	$(__USER_DS), %eax
-	movl	%eax, %ds
-	movl	%eax, %es
-	movl	$(__KERNEL_PERCPU), %eax
-	movl	%eax, %fs
-	pushl	%ebp
-	pushl	%edi
-	pushl	%esi
-	pushl	%edx
-	pushl	%ecx
-	pushl	%ebx
-	SWITCH_TO_KERNEL_STACK
+	SAVE_ALL switch_stacks=1 skip_gs=1
 	ENCODE_FRAME_POINTER
-	cld
 	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
 	GS_TO_REG %ecx
 	movl	PT_GS(%esp), %edi		# get the function address
-	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
-	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
 	REG_TO_PTGS %ecx
 	SET_KERNEL_GS %ecx
+
+	/* fixup orig %eax */
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
 	CALL_NOSPEC %edi



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 10:27 [PATCH 0/3] tracing vs CR2 root
  2019-07-03 10:27 ` [PATCH 1/3] x86/paravirt: Make read_cr2() CALLEE_SAVE root
  2019-07-03 10:27 ` [PATCH 2/3] x86/entry/32: Simplify common_exception root
@ 2019-07-03 10:27 ` root
  2019-07-03 20:22   ` Peter Zijlstra
  2019-07-03 20:27   ` Andy Lutomirski
  2 siblings, 2 replies; 15+ messages in thread
From: root @ 2019-07-03 10:27 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel, peterz

Despite the current efforts to read CR2 before tracing happens there
still exist a number of possible holes:

  idtentry page_fault             do_page_fault           has_error_code=1
    call error_entry
      TRACE_IRQS_OFF
        call trace_hardirqs_off*
          #PF // modifies CR2

      CALL_enter_from_user_mode
        __context_tracking_exit()
          trace_user_exit(0)
            #PF // modifies CR2

    call do_page_fault
      address = read_cr2(); /* whoopsie */

And similar for i386.

Fix it by pulling the CR2 read into the entry code, before any of that
stuff gets a chance to run and ruin things.

Ideally we'll clean up the entry code by moving this tracing and
context tracking nonsense into C some day, but let's not delay fixing
this any longer.

Reported-by: He Zhe <zhe.he@windriver.com>
Reported-by: Eiichi Tsukata <devel@etsukata.com>
Debugged-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_32.S       |   25 ++++++++++++++++++++++---
 arch/x86/entry/entry_64.S       |   28 ++++++++++++++--------------
 arch/x86/include/asm/kvm_para.h |    2 +-
 arch/x86/include/asm/traps.h    |    2 +-
 arch/x86/kernel/kvm.c           |    8 ++++----
 arch/x86/mm/fault.c             |   28 ++++++++++------------------
 6 files changed, 52 insertions(+), 41 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec
 
 ENTRY(page_fault)
 	ASM_CLAC
-	pushl	$do_page_fault
-	ALIGN
-	jmp common_exception
+	pushl	$0; /* %gs's slot on the stack */
+
+	SAVE_ALL switch_stacks=1 skip_gs=1
+
+	ENCODE_FRAME_POINTER
+	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
+	GS_TO_REG %ecx
+	REG_TO_PTGS %ecx
+	SET_KERNEL_GS %ecx
+
+	GET_CR2_INTO(%ecx)			# might clobber %eax
+
+	/* fixup orig %eax */
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+
+	TRACE_IRQS_OFF
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_page_fault
+	jmp	ret_from_exception
 END(page_fault)
 
 common_exception:
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -901,7 +901,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -937,18 +937,27 @@ ENTRY(\sym)
 
 	.if \paranoid
 	call	paranoid_entry
+	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 	.else
 	call	error_entry
 	.endif
 	UNWIND_HINT_REGS
-	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
-	.if \paranoid
+	.if \read_cr2
+	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	.endif
+
 	.if \shift_ist != -1
 	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
 	.else
 	TRACE_IRQS_OFF
 	.endif
+
+	.if \paranoid == 0
+	testb	$3, CS(%rsp)
+	jz	.Lfrom_kernel_no_context_tracking_\@
+	CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
 	.endif
 
 	movq	%rsp, %rdi			/* pt_regs pointer */
@@ -1180,10 +1189,10 @@ idtentry xenint3		do_int3			has_error_co
 #endif
 
 idtentry general_protection	do_general_protection	has_error_code=1
-idtentry page_fault		do_page_fault		has_error_code=1
+idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
 #endif
 
 #ifdef CONFIG_X86_MCE
@@ -1338,18 +1347,9 @@ ENTRY(error_entry)
 	movq	%rax, %rsp			/* switch stack */
 	ENCODE_FRAME_POINTER
 	pushq	%r12
-
-	/*
-	 * We need to tell lockdep that IRQs are off.  We can't do this until
-	 * we fix gsbase, and we should do it before enter_from_user_mode
-	 * (which can take locks).
-	 */
-	TRACE_IRQS_OFF
-	CALL_enter_from_user_mode
 	ret
 
 .Lerror_entry_done:
-	TRACE_IRQS_OFF
 	ret
 
 	/*
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, i
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -81,7 +81,7 @@ struct bad_iret_stack *fixup_bad_iret(st
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
 dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
 	enum ctx_state prev_state;
 
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		do_page_fault(regs, error_code);
+		do_page_fault(regs, error_code, address);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
-		kvm_async_pf_task_wake((u32)read_cr2());
+		kvm_async_pf_task_wake((u32)address);
 		rcu_irq_exit();
 		break;
 	}
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1513,7 +1513,7 @@ NOKPROBE_SYMBOL(do_user_addr_fault);
  * and the problem, and then passes it off to one of the appropriate
  * routines.
  */
-static noinline void
+static __always_inline void
 __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		unsigned long address)
 {
@@ -1528,35 +1528,27 @@ __do_page_fault(struct pt_regs *regs, un
 	else
 		do_user_addr_fault(regs, hw_error_code, address);
 }
-NOKPROBE_SYMBOL(__do_page_fault);
 
-static nokprobe_inline void
-trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
-			 unsigned long error_code)
+static __always_inline void
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
+			 unsigned long address)
 {
+	if (!trace_pagefault_enabled())
+		return;
+
 	if (user_mode(regs))
 		trace_page_fault_user(address, regs, error_code);
 	else
 		trace_page_fault_kernel(address, regs, error_code);
 }
 
-/*
- * We must have this function blacklisted from kprobes, tagged with notrace
- * and call read_cr2() before calling anything else. To avoid calling any
- * kind of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contains all sorts of tracepoints.
- */
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-	unsigned long address = read_cr2(); /* Get the faulting address */
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
-	if (trace_pagefault_enabled())
-		trace_page_fault_entries(address, regs, error_code);
-
+	trace_page_fault_entries(regs, error_code, address);
 	__do_page_fault(regs, error_code, address);
 	exception_exit(prev_state);
 }



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] x86/paravirt: Make read_cr2() CALLEE_SAVE
  2019-07-03 10:27 ` [PATCH 1/3] x86/paravirt: Make read_cr2() CALLEE_SAVE root
@ 2019-07-03 14:12   ` Juergen Gross
  0 siblings, 0 replies; 15+ messages in thread
From: Juergen Gross @ 2019-07-03 14:12 UTC (permalink / raw)
  To: root, bp, rostedt, luto, mingo, tglx, torvalds
  Cc: devel, joel, dave.hansen, linux-kernel, zhe.he, hpa

On 03.07.19 12:27, root wrote:
> The one paravirt read_cr2() implementation (Xen) is actually quite
> trivial and doesn't need to clobber anything other than the return
> register. By making read_cr2() CALLEE_SAVE we avoid all the PUSH/POP
> nonsense and allow more convenient use from assembly.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Reviewed-by: Juergen Gross <jgross@suse.com>


Juergen

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 10:27 ` [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption root
@ 2019-07-03 20:22   ` Peter Zijlstra
  2019-07-03 20:29     ` Steven Rostedt
  2019-07-03 20:27   ` Andy Lutomirski
  1 sibling, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2019-07-03 20:22 UTC (permalink / raw)
  To: tglx, bp, mingo, rostedt, luto, torvalds
  Cc: hpa, dave.hansen, jgross, linux-kernel, zhe.he, joel, devel

On Wed, Jul 03, 2019 at 12:27:34PM +0200, root wrote:
> Despire the current efforts to read CR2 before tracing happens there
> still exist a number of possible holes:
> 
>   idtentry page_fault             do_page_fault           has_error_code=1
>     call error_entry
>       TRACE_IRQS_OFF
>         call trace_hardirqs_off*
>           #PF // modifies CR2
> 
>       CALL_enter_from_user_mode
>         __context_tracking_exit()
>           trace_user_exit(0)
>             #PF // modifies CR2
> 
>     call do_page_fault
>       address = read_cr2(); /* whoopsie */
> 
> And similar for i386.
> 
> Fix it by pulling the CR2 read into the entry code, before any of that
> stuff gets a chance to run and ruin things.
> 
> Ideally we'll clean up the entry code by moving this tracing and
> context tracking nonsense into C some day, but let's not delay fixing
> this longer.
> 

> @@ -1180,10 +1189,10 @@ idtentry xenint3		do_int3			has_error_co
>  #endif
>  
>  idtentry general_protection	do_general_protection	has_error_code=1
> -idtentry page_fault		do_page_fault		has_error_code=1
> +idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
>  
>  #ifdef CONFIG_KVM_GUEST
> -idtentry async_page_fault	do_async_page_fault	has_error_code=1
> +idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
>  #endif

While going over the various idt handlers, I found that we probably also
need read_cr2 on do_double_fault(), otherwise it is susceptible to the
same problem.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 10:27 ` [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption root
  2019-07-03 20:22   ` Peter Zijlstra
@ 2019-07-03 20:27   ` Andy Lutomirski
  2019-07-03 20:47     ` Steven Rostedt
  2019-07-03 22:00     ` Peter Zijlstra
  1 sibling, 2 replies; 15+ messages in thread
From: Andy Lutomirski @ 2019-07-03 20:27 UTC (permalink / raw)
  To: root
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Andrew Lutomirski, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Wed, Jul 3, 2019 at 3:28 AM root <peterz@infradead.org> wrote:
>
> Despire the current efforts to read CR2 before tracing happens there
> still exist a number of possible holes:
>
>   idtentry page_fault             do_page_fault           has_error_code=1
>     call error_entry
>       TRACE_IRQS_OFF
>         call trace_hardirqs_off*
>           #PF // modifies CR2
>
>       CALL_enter_from_user_mode
>         __context_tracking_exit()
>           trace_user_exit(0)
>             #PF // modifies CR2
>
>     call do_page_fault
>       address = read_cr2(); /* whoopsie */
>
> And similar for i386.
>
> Fix it by pulling the CR2 read into the entry code, before any of that
> stuff gets a chance to run and ruin things.
>
> Ideally we'll clean up the entry code by moving this tracing and
> context tracking nonsense into C some day, but let's not delay fixing
> this longer.
>
> Reported-by: He Zhe <zhe.he@windriver.com>
> Reported-by: Eiichi Tsukata <devel@etsukata.com>
> Debugged-by: Steven Rostedt <rostedt@goodmis.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/entry/entry_32.S       |   25 ++++++++++++++++++++++---
>  arch/x86/entry/entry_64.S       |   28 ++++++++++++++--------------
>  arch/x86/include/asm/kvm_para.h |    2 +-
>  arch/x86/include/asm/traps.h    |    2 +-
>  arch/x86/kernel/kvm.c           |    8 ++++----
>  arch/x86/mm/fault.c             |   28 ++++++++++------------------
>  6 files changed, 52 insertions(+), 41 deletions(-)
>
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec
>
>  ENTRY(page_fault)
>         ASM_CLAC
> -       pushl   $do_page_fault
> -       ALIGN
> -       jmp common_exception
> +       pushl   $0; /* %gs's slot on the stack */
> +
> +       SAVE_ALL switch_stacks=1 skip_gs=1
> +
> +       ENCODE_FRAME_POINTER
> +       UNWIND_ESPFIX_STACK
> +
> +       /* fixup %gs */
> +       GS_TO_REG %ecx
> +       REG_TO_PTGS %ecx
> +       SET_KERNEL_GS %ecx
> +
> +       GET_CR2_INTO(%ecx)                      # might clobber %eax
> +
> +       /* fixup orig %eax */
> +       movl    PT_ORIG_EAX(%esp), %edx         # get the error code
> +       movl    $-1, PT_ORIG_EAX(%esp)          # no syscall to restart
> +
> +       TRACE_IRQS_OFF
> +       movl    %esp, %eax                      # pt_regs pointer
> +       call    do_page_fault
> +       jmp     ret_from_exception
>  END(page_fault)
>
>  common_exception:
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -901,7 +901,7 @@ apicinterrupt IRQ_WORK_VECTOR                       irq_work
>   * @paranoid == 2 is special: the stub will never switch stacks.  This is for
>   * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
>   */
> -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
> +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
>  ENTRY(\sym)
>         UNWIND_HINT_IRET_REGS offset=\has_error_code*8
>
> @@ -937,18 +937,27 @@ ENTRY(\sym)
>
>         .if \paranoid
>         call    paranoid_entry
> +       /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
>         .else
>         call    error_entry
>         .endif
>         UNWIND_HINT_REGS
> -       /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
>
> -       .if \paranoid
> +       .if \read_cr2
> +       GET_CR2_INTO(%rdx);                     /* can clobber %rax */
> +       .endif
> +
>         .if \shift_ist != -1
>         TRACE_IRQS_OFF_DEBUG                    /* reload IDT in case of recursion */
>         .else
>         TRACE_IRQS_OFF
>         .endif
> +
> +       .if \paranoid == 0
> +       testb   $3, CS(%rsp)
> +       jz      .Lfrom_kernel_no_context_tracking_\@
> +       CALL_enter_from_user_mode
> +.Lfrom_kernel_no_context_tracking_\@:
>         .endif
>
>         movq    %rsp, %rdi                      /* pt_regs pointer */
> @@ -1180,10 +1189,10 @@ idtentry xenint3                do_int3                 has_error_co
>  #endif
>
>  idtentry general_protection    do_general_protection   has_error_code=1
> -idtentry page_fault            do_page_fault           has_error_code=1
> +idtentry page_fault            do_page_fault           has_error_code=1        read_cr2=1
>
>  #ifdef CONFIG_KVM_GUEST
> -idtentry async_page_fault      do_async_page_fault     has_error_code=1
> +idtentry async_page_fault      do_async_page_fault     has_error_code=1        read_cr2=1
>  #endif
>
>  #ifdef CONFIG_X86_MCE
> @@ -1338,18 +1347,9 @@ ENTRY(error_entry)
>         movq    %rax, %rsp                      /* switch stack */
>         ENCODE_FRAME_POINTER
>         pushq   %r12
> -
> -       /*
> -        * We need to tell lockdep that IRQs are off.  We can't do this until
> -        * we fix gsbase, and we should do it before enter_from_user_mode
> -        * (which can take locks).
> -        */
> -       TRACE_IRQS_OFF

This hunk looks wrong.  Am I missing some other place that handles the
case where we enter from kernel mode and IRQs were on?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 20:22   ` Peter Zijlstra
@ 2019-07-03 20:29     ` Steven Rostedt
  2019-07-03 21:51       ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Steven Rostedt @ 2019-07-03 20:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Wed, 3 Jul 2019 22:22:31 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Jul 03, 2019 at 12:27:34PM +0200, root wrote:
> > Despire the current efforts to read CR2 before tracing happens there
> > still exist a number of possible holes:
> > 
> >   idtentry page_fault             do_page_fault           has_error_code=1
> >     call error_entry
> >       TRACE_IRQS_OFF
> >         call trace_hardirqs_off*
> >           #PF // modifies CR2
> > 
> >       CALL_enter_from_user_mode
> >         __context_tracking_exit()
> >           trace_user_exit(0)
> >             #PF // modifies CR2
> > 
> >     call do_page_fault
> >       address = read_cr2(); /* whoopsie */
> > 
> > And similar for i386.
> > 
> > Fix it by pulling the CR2 read into the entry code, before any of that
> > stuff gets a chance to run and ruin things.
> > 
> > Ideally we'll clean up the entry code by moving this tracing and
> > context tracking nonsense into C some day, but let's not delay fixing
> > this longer.
> >   
> 
> > @@ -1180,10 +1189,10 @@ idtentry xenint3		do_int3			has_error_co
> >  #endif
> >  
> >  idtentry general_protection	do_general_protection	has_error_code=1
> > -idtentry page_fault		do_page_fault		has_error_code=1
> > +idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
> >  
> >  #ifdef CONFIG_KVM_GUEST
> > -idtentry async_page_fault	do_async_page_fault	has_error_code=1
> > +idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
> >  #endif  
> 
> While going over the various idt handlers, I found that we probably also
> need read_cr2 on do_double_fault(), otherwise it is susceptible to the
> same problem.
> 

BTW, do you plan on making this for stable? Even though it's rather
invasive. Or should we just apply the band-aids first, have them
backported to stable, and then put this change on top of them for
upstream?

-- Steve

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 20:27   ` Andy Lutomirski
@ 2019-07-03 20:47     ` Steven Rostedt
  2019-07-03 22:05       ` Peter Zijlstra
  2019-07-03 22:00     ` Peter Zijlstra
  1 sibling, 1 reply; 15+ messages in thread
From: Steven Rostedt @ 2019-07-03 20:47 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: root, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Linus Torvalds, H. Peter Anvin, Dave Hansen, Juergen Gross, LKML,
	He Zhe, Joel Fernandes, devel

On Wed, 3 Jul 2019 13:27:09 -0700
Andy Lutomirski <luto@kernel.org> wrote:


> > @@ -1180,10 +1189,10 @@ idtentry xenint3                do_int3                 has_error_co
> >  #endif
> >
> >  idtentry general_protection    do_general_protection   has_error_code=1
> > -idtentry page_fault            do_page_fault           has_error_code=1
> > +idtentry page_fault            do_page_fault           has_error_code=1        read_cr2=1
> >
> >  #ifdef CONFIG_KVM_GUEST
> > -idtentry async_page_fault      do_async_page_fault     has_error_code=1
> > +idtentry async_page_fault      do_async_page_fault     has_error_code=1        read_cr2=1
> >  #endif
> >
> >  #ifdef CONFIG_X86_MCE
> > @@ -1338,18 +1347,9 @@ ENTRY(error_entry)
> >         movq    %rax, %rsp                      /* switch stack */
> >         ENCODE_FRAME_POINTER
> >         pushq   %r12
> > -
> > -       /*
> > -        * We need to tell lockdep that IRQs are off.  We can't do this until
> > -        * we fix gsbase, and we should do it before enter_from_user_mode
> > -        * (which can take locks).
> > -        */
> > -       TRACE_IRQS_OFF  
> 
> This hunk looks wrong.  Am I missing some other place that handles the
> case where we enter from kernel mode and IRQs were on?

Yeah, looks like we might be missing a TRACE_IRQS_OFF from the
from_usermode_stack_switch path.

-- Steve

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 20:29     ` Steven Rostedt
@ 2019-07-03 21:51       ` Peter Zijlstra
  0 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2019-07-03 21:51 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: tglx, bp, mingo, luto, torvalds, hpa, dave.hansen, jgross,
	linux-kernel, zhe.he, joel, devel

On Wed, Jul 03, 2019 at 04:29:42PM -0400, Steven Rostedt wrote:
> On Wed, 3 Jul 2019 22:22:31 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Wed, Jul 03, 2019 at 12:27:34PM +0200, root wrote:
> > > Despite the current efforts to read CR2 before tracing happens there
> > > still exist a number of possible holes:
> > > 
> > >   idtentry page_fault             do_page_fault           has_error_code=1
> > >     call error_entry
> > >       TRACE_IRQS_OFF
> > >         call trace_hardirqs_off*
> > >           #PF // modifies CR2
> > > 
> > >       CALL_enter_from_user_mode
> > >         __context_tracking_exit()
> > >           trace_user_exit(0)
> > >             #PF // modifies CR2
> > > 
> > >     call do_page_fault
> > >       address = read_cr2(); /* whoopsie */
> > > 
> > > And similar for i386.
> > > 
> > > Fix it by pulling the CR2 read into the entry code, before any of that
> > > stuff gets a chance to run and ruin things.
> > > 
> > > Ideally we'll clean up the entry code by moving this tracing and
> > > context tracking nonsense into C some day, but let's not delay fixing
> > > this longer.
> > >   
> > 
> > > @@ -1180,10 +1189,10 @@ idtentry xenint3		do_int3			has_error_co
> > >  #endif
> > >  
> > >  idtentry general_protection	do_general_protection	has_error_code=1
> > > -idtentry page_fault		do_page_fault		has_error_code=1
> > > +idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
> > >  
> > >  #ifdef CONFIG_KVM_GUEST
> > > -idtentry async_page_fault	do_async_page_fault	has_error_code=1
> > > +idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
> > >  #endif  
> > 
> > While going over the various idt handlers, I found that we probably also
> > need read_cr2 on do_double_fault(), otherwise it is susceptible to the
> > same problem.
> > 
> 
> BTW, do you plan on making this for stable? Even though it's rather
> invasive. Or should we just apply the band-aids first, have them
> backported to stable, and then put this change on top of them for
> upstream?

So I don't particularly care about stable; and the band-aids
(trace_irqs_off_cr2) are known broken so I really don't see the point.

That said, these patches should apply to most recent kernels (post PTI)
without too many rejects.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 20:27   ` Andy Lutomirski
  2019-07-03 20:47     ` Steven Rostedt
@ 2019-07-03 22:00     ` Peter Zijlstra
  2019-07-03 22:26       ` Andy Lutomirski
  2019-07-04  9:13       ` Peter Zijlstra
  1 sibling, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2019-07-03 22:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Linus Torvalds, H. Peter Anvin, Dave Hansen, Juergen Gross, LKML,
	He Zhe, Joel Fernandes, devel

On Wed, Jul 03, 2019 at 01:27:09PM -0700, Andy Lutomirski wrote:
> On Wed, Jul 3, 2019 at 3:28 AM root <peterz@infradead.org> wrote:

> > @@ -1338,18 +1347,9 @@ ENTRY(error_entry)
> >         movq    %rax, %rsp                      /* switch stack */
> >         ENCODE_FRAME_POINTER
> >         pushq   %r12
> > -
> > -       /*
> > -        * We need to tell lockdep that IRQs are off.  We can't do this until
> > -        * we fix gsbase, and we should do it before enter_from_user_mode
> > -        * (which can take locks).
> > -        */
> > -       TRACE_IRQS_OFF
> 
> This hunk looks wrong.  Am I missing some other place that handles the
> case where we enter from kernel mode and IRQs were on?

> > -	CALL_enter_from_user_mode
> >  	ret
> >  
> >  .Lerror_entry_done:
> > -	TRACE_IRQS_OFF
> >  	ret
> >  
> >  	/*

Did you perchance mean to complain about the .Lerror_entry_done one?

Because I'm not seeing how the one before CALL_enter_from_user_mode can
ever be from-kernel.

But yes, that .Lerror_entry_done one looks fishy.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 20:47     ` Steven Rostedt
@ 2019-07-03 22:05       ` Peter Zijlstra
  2019-07-04  9:19         ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2019-07-03 22:05 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andy Lutomirski, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Linus Torvalds, H. Peter Anvin, Dave Hansen, Juergen Gross, LKML,
	He Zhe, Joel Fernandes, devel

On Wed, Jul 03, 2019 at 04:47:01PM -0400, Steven Rostedt wrote:
> On Wed, 3 Jul 2019 13:27:09 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
> 
> 
> > > @@ -1180,10 +1189,10 @@ idtentry xenint3                do_int3                 has_error_co
> > >  #endif
> > >
> > >  idtentry general_protection    do_general_protection   has_error_code=1
> > > -idtentry page_fault            do_page_fault           has_error_code=1
> > > +idtentry page_fault            do_page_fault           has_error_code=1        read_cr2=1
> > >
> > >  #ifdef CONFIG_KVM_GUEST
> > > -idtentry async_page_fault      do_async_page_fault     has_error_code=1
> > > +idtentry async_page_fault      do_async_page_fault     has_error_code=1        read_cr2=1
> > >  #endif
> > >
> > >  #ifdef CONFIG_X86_MCE
> > > @@ -1338,18 +1347,9 @@ ENTRY(error_entry)
> > >         movq    %rax, %rsp                      /* switch stack */
> > >         ENCODE_FRAME_POINTER
> > >         pushq   %r12
> > > -
> > > -       /*
> > > -        * We need to tell lockdep that IRQs are off.  We can't do this until
> > > -        * we fix gsbase, and we should do it before enter_from_user_mode
> > > -        * (which can take locks).
> > > -        */
> > > -       TRACE_IRQS_OFF  
> > 
> > This hunk looks wrong.  Am I missing some other place that handles the
> > case where we enter from kernel mode and IRQs were on?
> 
> Yeah, looks like we might be missing a TRACE_IRQS_OFF from the
> from_usermode_stack_switch path.

Oh bugger, there's a second error_entry call.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 22:00     ` Peter Zijlstra
@ 2019-07-03 22:26       ` Andy Lutomirski
  2019-07-04  9:13       ` Peter Zijlstra
  1 sibling, 0 replies; 15+ messages in thread
From: Andy Lutomirski @ 2019-07-03 22:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Steven Rostedt, Linus Torvalds, H. Peter Anvin, Dave Hansen,
	Juergen Gross, LKML, He Zhe, Joel Fernandes, devel

On Wed, Jul 3, 2019 at 3:01 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jul 03, 2019 at 01:27:09PM -0700, Andy Lutomirski wrote:
> > On Wed, Jul 3, 2019 at 3:28 AM root <peterz@infradead.org> wrote:
>
> > > @@ -1338,18 +1347,9 @@ ENTRY(error_entry)
> > >         movq    %rax, %rsp                      /* switch stack */
> > >         ENCODE_FRAME_POINTER
> > >         pushq   %r12
> > > -
> > > -       /*
> > > -        * We need to tell lockdep that IRQs are off.  We can't do this until
> > > -        * we fix gsbase, and we should do it before enter_from_user_mode
> > > -        * (which can take locks).
> > > -        */
> > > -       TRACE_IRQS_OFF
> >
> > This hunk looks wrong.  Am I missing some other place that handles the
> > case where we enter from kernel mode and IRQs were on?
>
> > > -   CALL_enter_from_user_mode
> > >     ret
> > >
> > >  .Lerror_entry_done:
> > > -   TRACE_IRQS_OFF
> > >     ret
> > >
> > >     /*
>
> Did you perchance mean to complain about the .Lerror_entry_done one?

Yes, duh.

I would suggest compiling with DEBUG_LOCKDEP and running perf and the
x86 selftests.  IIRC that's what catches most of the IRQ flag tracing
errors.

>
> Because I'm not seeing how the one before CALL_enter_from_user_mode can
> ever be from-kernel.
>
> But yes, that .Lerror_entry_done one looks fishy.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 22:00     ` Peter Zijlstra
  2019-07-03 22:26       ` Andy Lutomirski
@ 2019-07-04  9:13       ` Peter Zijlstra
  1 sibling, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2019-07-04  9:13 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, Borislav Petkov, Ingo Molnar, Steven Rostedt,
	Linus Torvalds, H. Peter Anvin, Dave Hansen, Juergen Gross, LKML,
	He Zhe, Joel Fernandes, devel

On Thu, Jul 04, 2019 at 12:00:57AM +0200, Peter Zijlstra wrote:
> On Wed, Jul 03, 2019 at 01:27:09PM -0700, Andy Lutomirski wrote:
> > On Wed, Jul 3, 2019 at 3:28 AM root <peterz@infradead.org> wrote:
> 
> > > @@ -1338,18 +1347,9 @@ ENTRY(error_entry)
> > >         movq    %rax, %rsp                      /* switch stack */
> > >         ENCODE_FRAME_POINTER
> > >         pushq   %r12
> > > -
> > > -       /*
> > > -        * We need to tell lockdep that IRQs are off.  We can't do this until
> > > -        * we fix gsbase, and we should do it before enter_from_user_mode
> > > -        * (which can take locks).
> > > -        */
> > > -       TRACE_IRQS_OFF
> > 
> > This hunk looks wrong.  Am I missing some other place that handles the
> > case where we enter from kernel mode and IRQs were on?
> 
> > > -	CALL_enter_from_user_mode
> > >  	ret
> > >  
> > >  .Lerror_entry_done:
> > > -	TRACE_IRQS_OFF
> > >  	ret
> > >  
> > >  	/*
> 
> Did you perchance mean to complain about the .Lerror_entry_done one?
> 
> Because I'm not seeing how the one before CALL_enter_from_user_mode can
> ever be from-kernel.
> 
> But yes, that .Lerror_entry_done one looks fishy.

That works because idtentry will now do an unconditional TRACE_IRQS_OFF,
note how the patch removes the .if \paranoid around that.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption
  2019-07-03 22:05       ` Peter Zijlstra
@ 2019-07-04  9:19         ` Peter Zijlstra
  0 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2019-07-04  9:19 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andy Lutomirski, Thomas Gleixner, Borislav Petkov, Ingo Molnar,
	Linus Torvalds, H. Peter Anvin, Dave Hansen, Juergen Gross, LKML,
	He Zhe, Joel Fernandes, devel

On Thu, Jul 04, 2019 at 12:05:22AM +0200, Peter Zijlstra wrote:
> On Wed, Jul 03, 2019 at 04:47:01PM -0400, Steven Rostedt wrote:

> > Yeah, looks like we might be missing a TRACE_IRQS_OFF from the
> > from_usermode_stack_switch path.
> 
> Oh bugger, there's a second error_entry call.

---
Subject: x86/entry/64: Simplify idtentry a little
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jul 4 10:55:11 CEST 2019

There's a bunch of duplication in idtentry, namely the
.Lfrom_usermode_switch_stack is an explicit paranoid=0 copy of the
normal flow.

Make this explicit by creating a (idtentry_part) helper macro.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_64.S |  100 +++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 53 deletions(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -865,6 +865,51 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
+.macro idtentry_part has_error_code:req paranoid:req shift_ist:-1 ist_offset=0
+
+	.if \paranoid
+	call	paranoid_entry
+	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
+	.else
+	call	error_entry
+	.endif
+	UNWIND_HINT_REGS
+
+	.if \paranoid
+	.if \shift_ist != -1
+	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
+	.else
+	TRACE_IRQS_OFF
+	.endif
+	.endif
+
+	movq	%rsp, %rdi			/* pt_regs pointer */
+
+	.if \has_error_code
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
+	.else
+	xorl	%esi, %esi			/* no error code */
+	.endif
+
+	.if \shift_ist != -1
+	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
+	.endif
+
+	call	\do_sym
+
+	.if \shift_ist != -1
+	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
+	.endif
+
+	.if \paranoid
+	jmp	paranoid_exit
+	.else
+	jmp	error_exit
+	.endif
+
+.endm
+
 /**
  * idtentry - Generate an IDT entry stub
  * @sym:		Name of the generated entry point
@@ -935,46 +980,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
 	.endif
 
-	.if \paranoid
-	call	paranoid_entry
-	.else
-	call	error_entry
-	.endif
-	UNWIND_HINT_REGS
-	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
-
-	.if \paranoid
-	.if \shift_ist != -1
-	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
-	.else
-	TRACE_IRQS_OFF
-	.endif
-	.endif
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-
-	.if \has_error_code
-	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
-	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
-	.else
-	xorl	%esi, %esi			/* no error code */
-	.endif
-
-	.if \shift_ist != -1
-	subq	$\ist_offset, CPU_TSS_IST(\shift_ist)
-	.endif
-
-	call	\do_sym
-
-	.if \shift_ist != -1
-	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
-	.endif
-
-	.if \paranoid
-	jmp	paranoid_exit
-	.else
-	jmp	error_exit
-	.endif
+	idtentry_part has_error_code=\has_error_code paranoid=\paranoid shift_ist=\shift_ist ist_offset=\ist_offset
 
 	.if \paranoid == 1
 	/*
@@ -983,21 +989,9 @@ ENTRY(\sym)
 	 * run in real process context if user_mode(regs).
 	 */
 .Lfrom_usermode_switch_stack_\@:
-	call	error_entry
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-
-	.if \has_error_code
-	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
-	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
-	.else
-	xorl	%esi, %esi			/* no error code */
+	idtentry_part has_error_code=\has_error_code paranoid=0
 	.endif
 
-	call	\do_sym
-
-	jmp	error_exit
-	.endif
 _ASM_NOKPROBE(\sym)
 END(\sym)
 .endm

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2019-07-04  9:19 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-03 10:27 [PATCH 0/3] tracing vs CR2 root
2019-07-03 10:27 ` [PATCH 1/3] x86/paravirt: Make read_cr2() CALLEE_SAVE root
2019-07-03 14:12   ` Juergen Gross
2019-07-03 10:27 ` [PATCH 2/3] x86/entry/32: Simplify common_exception root
2019-07-03 10:27 ` [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption root
2019-07-03 20:22   ` Peter Zijlstra
2019-07-03 20:29     ` Steven Rostedt
2019-07-03 21:51       ` Peter Zijlstra
2019-07-03 20:27   ` Andy Lutomirski
2019-07-03 20:47     ` Steven Rostedt
2019-07-03 22:05       ` Peter Zijlstra
2019-07-04  9:19         ` Peter Zijlstra
2019-07-03 22:00     ` Peter Zijlstra
2019-07-03 22:26       ` Andy Lutomirski
2019-07-04  9:13       ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.