* [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
@ 2015-08-18 19:11 Andy Lutomirski
  2015-08-18 22:16 ` Frederic Weisbecker
  2015-08-19  0:30 ` Andy Lutomirski
  0 siblings, 2 replies; 10+ messages in thread
From: Andy Lutomirski @ 2015-08-18 19:11 UTC (permalink / raw)
  To: X86 ML
  Cc: Sasha Levin, Brian Gerst, Frédéric Weisbecker,
	Denys Vlasenko, linux-kernel, Oleg Nesterov, Borislav Petkov,
	Andy Lutomirski

This fixes a couple minor holes if we took an IRQ very early in syscall
processing:

 - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
   was fine), but we could warn if all the debugging options were
   set.

 - We could have the IRQ regs overlap task_pt_regs.  I'm not aware
   of anything important that would break, but some of the /proc
   stuff could plausibly have gotten confused.

Fix it the straightforward way: finish filling in pt_regs and call
enter_from_user_mode before enabling interrupts if _TIF_NOHZ is set.

This should be the last piece of the puzzle needed to get rid of most
remaining exception_enter calls.  (vmalloc faults are still tricky,
but they're mostly fatal in the syscall prologue already.)

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---

This is the last significant functionality change I'll send for 4.3, I
hope.  With this applied, context tracking for all non-NMI, non-debug
entries should be exact.

There's probably some (minor) performance regression on
CONFIG_CONTEXT_TRACKING=y kernels that aren't using nohz.  If so
(I'll benchmark it later this week), I'll try to rig up a simple
patch to NOP out the hooks if nohz is off.

Sasha, this should fix the intermittent DEBUG_LOCKS splat you're
seeing.

I don't intend to send v2 of the #BP stuff for 4.3.  The pile is plenty
big already.
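
For reference, the enter_from_user_mode() helper this patch calls from the
asm was added earlier in this series in arch/x86/entry/common.c; modulo my
memory, it's just:

#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible void enter_from_user_mode(void)
{
        CT_WARN_ON(ct_state() != CONTEXT_USER);  /* assert exact tracking */
        user_exit();                     /* CONTEXT_USER -> CONTEXT_KERNEL */
}
#endif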

 arch/x86/entry/common.c            | 12 +-------
 arch/x86/entry/entry_64.S          | 32 ++++++++++++++------
 arch/x86/entry/entry_64_compat.S   | 60 +++++++++++++++++++++++++++++---------
 arch/x86/include/asm/thread_info.h |  3 +-
 4 files changed, 71 insertions(+), 36 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 80dcc9261ca3..b570cea2f469 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -70,21 +70,11 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 	u32 work;
 
 	BUG_ON(regs != task_pt_regs(current));
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 
 	work = ACCESS_ONCE(current_thread_info()->flags) &
 		_TIF_WORK_SYSCALL_ENTRY;
 
-#ifdef CONFIG_CONTEXT_TRACKING
-	/*
-	 * If TIF_NOHZ is set, we are required to call user_exit() before
-	 * doing anything that could touch RCU.
-	 */
-	if (work & _TIF_NOHZ) {
-		enter_from_user_mode();
-		work &= ~_TIF_NOHZ;
-	}
-#endif
-
 #ifdef CONFIG_SECCOMP
 	/*
 	 * Do seccomp first -- it should minimize exposure of other
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e2d078c9dfe4..6bf0c7ecf399 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,20 +142,16 @@ ENTRY(entry_SYSCALL_64)
 	 */
 GLOBAL(entry_SYSCALL_64_after_swapgs)
 
+	/*
+	 * IRQs must be off while we use rsp_scratch to keep it from
+	 * being clobbered by a different task.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
 	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
-	/*
-	 * Re-enable interrupts.
-	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-	 * must execute atomically in the face of possible interrupt-driven
-	 * task preemption. We must enable interrupts only after we're done
-	 * with using rsp_scratch:
-	 */
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq	%r11				/* pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
@@ -171,8 +167,17 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r11				/* pt_regs->r11 */
 	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
 
-	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	tracesys
+
+	/*
+	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+	 * on (since we treat user mode as having IRQs on), and the
+	 * prologue above is too short for it to be worth adding a
+	 * tracing round trip.
+	 */
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 entry_SYSCALL_64_fastpath:
 #if __SYSCALL_MASK == ~0
 	cmpq	$__NR_syscall_max, %rax
@@ -235,6 +240,15 @@ GLOBAL(int_ret_from_sys_call_irqs_off)
 
 	/* Do syscall entry tracing */
 tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	movq	%rsp, %rdi
 	movl	$AUDIT_ARCH_X86_64, %esi
 	call	syscall_trace_enter_phase1
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index ff32a289b5d1..099ec1174ff9 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -103,11 +103,19 @@ ENTRY(entry_SYSENTER_compat)
 	jnz	sysenter_fix_flags
 sysenter_flags_fixed:
 
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
 	/*
 	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
 	 * on (since we treat user mode as having IRQs on), and the
 	 * prologue above is too short for it to be worth adding a
-	 * tracing round trip.
+	 * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+	 * case.
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
@@ -318,15 +326,10 @@ ENDPROC(entry_SYSENTER_compat)
  * with the int 0x80 path.
  */
 ENTRY(entry_SYSCALL_compat)
-	/*
-	 * Interrupts are off on entry.
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
+	/* Interrupts are off on entry. */
 	SWAPGS_UNSAFE_STACK
 	movl	%esp, %r8d
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%eax, %eax
@@ -346,6 +349,22 @@ ENTRY(entry_SYSCALL_compat)
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */
 
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
+	/*
+	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+	 * on (since we treat user mode as having IRQs on), and the
+	 * prologue above is too short for it to be worth adding a
+	 * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+	 * case.
+	 */
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	/*
 	 * No need to do an access_ok check here because r8 has been
 	 * 32-bit zero extended:
@@ -354,6 +373,7 @@ ENTRY(entry_SYSCALL_compat)
 1:	movl	(%r8), %r9d
 	_ASM_EXTABLE(1b, ia32_badarg)
 	ASM_CLAC
+
 	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	cstar_tracesys
@@ -518,14 +538,9 @@ ia32_ret_from_sys_call:
  */
 
 ENTRY(entry_INT80_compat)
-	/*
-	 * Interrupts are off on entry.
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
+	/* Interrupts are off on entry. */
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	SWAPGS
-	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%eax, %eax
@@ -545,9 +560,17 @@ ENTRY(entry_INT80_compat)
 	sub	$(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
 
 	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	ia32_tracesys
 
+	/*
+	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+	 * on (since we treat user mode as having IRQs on), and the
+	 * prologue above is too short for it to be worth adding a
+	 * tracing round trip.
+	 */
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 ia32_do_call:
 	/* 32-bit syscall -> 64-bit C ABI argument conversion */
 	movl	%edi, %r8d		/* arg5 */
@@ -564,6 +587,15 @@ ia32_do_call:
 	jmp	int_ret_from_sys_call
 
 ia32_tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi			/* &pt_regs -> arg1 */
 	call	syscall_trace_enter
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..3c5a96815dec 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -140,8 +140,7 @@ struct thread_info {
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
-	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
-	 _TIF_NOHZ)
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
 
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK						\
-- 
2.4.3


* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-18 19:11 [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts Andy Lutomirski
@ 2015-08-18 22:16 ` Frederic Weisbecker
  2015-08-18 22:35   ` Andy Lutomirski
  2015-08-19  0:30 ` Andy Lutomirski
  1 sibling, 1 reply; 10+ messages in thread
From: Frederic Weisbecker @ 2015-08-18 22:16 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, Sasha Levin, Brian Gerst, Denys Vlasenko, linux-kernel,
	Oleg Nesterov, Borislav Petkov

On Tue, Aug 18, 2015 at 12:11:59PM -0700, Andy Lutomirski wrote:
> This fixes a couple minor holes if we took an IRQ very early in syscall
> processing:
> 
>  - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
>    was fine), but we could warn if all the debugging options were
>    set.

So this is fixing issues after your changes that call user_exit() from
IRQs, right?

But the IRQs aren't supposed to call user_exit(), they have their own hooks.
That's where the real issue is.

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-18 22:16 ` Frederic Weisbecker
@ 2015-08-18 22:35   ` Andy Lutomirski
  2015-08-18 23:02     ` Frederic Weisbecker
  0 siblings, 1 reply; 10+ messages in thread
From: Andy Lutomirski @ 2015-08-18 22:35 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andy Lutomirski, X86 ML, Sasha Levin, Brian Gerst,
	Denys Vlasenko, linux-kernel, Oleg Nesterov, Borislav Petkov,
	Rik van Riel

On Tue, Aug 18, 2015 at 3:16 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> On Tue, Aug 18, 2015 at 12:11:59PM -0700, Andy Lutomirski wrote:
>> This fixes a couple minor holes if we took an IRQ very early in syscall
>> processing:
>>
>>  - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
>>    was fine), but we could warn if all the debugging options were
>>    set.
>
> So this is fixing issues after your changes that call user_exit() from
> IRQs, right?

Yes.  Here's an example splat, courtesy of Sasha:

https://gist.github.com/sashalevin/a006a44989312f6835e7

>
> But the IRQs aren't supposed to call user_exit(), they have their own hooks.
> That's where the real issue is.

In -tip, the assumption is that we *always* switch to CONTEXT_KERNEL
when entering the kernel for a non-NMI reason.  That means that we can
avoid all of the (expensive!) checks for what context we're in.  It
also means that (other than IRQs, which need further cleanup), we only
switch once per user/kernel switch.

The cost for doing so should be essentially zero, modulo artifacts from
poor inlining.  IMO the code is much more straightforward than it used
to be, and it has the potential to be quite fast.  For one thing, we
never invoke context tracking with IRQs on, and Rik had some profiles
suggesting that a bunch of the overhead involved dealing with repeated
irq flag manipulation.

One way or another, IRQs need to switch from RCU-not-watching to
RCU-watching, and I don't see what's wrong with user_exit for this
purpose.  Of course, if user_exit is slow, we should fix that.
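
To be concrete about the cost: user_exit() boils down to roughly this
(simplified sketch of kernel/context_tracking.c, details from memory):

void context_tracking_exit(enum ctx_state state)
{
        unsigned long flags;

        if (!context_tracking_is_enabled())
                return;         /* patched out via static key when off */

        local_irq_save(flags);
        if (__this_cpu_read(context_tracking.state) == state) {
                if (__this_cpu_read(context_tracking.active)) {
                        rcu_user_exit();           /* the expensive part */
                        vtime_user_exit(current);
                }
                __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
        }
        local_irq_restore(flags);
}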

Also, this isn't really related to IRQs calling user_exit.  It's that
IRQs can recurse into other entries (#GP in Sasha's case) which also
validate the context.

None of the speedups that will be enabled are written yet, but I
strongly suspect they will be soon :)

In my book, the fact that we now have context tracking assertions all
over the place is a good thing.  It means we're much less likely to
break it.
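
And the assertions themselves are nearly free -- ct_state() is a
static-key-guarded per-cpu read, roughly (sketch, not verbatim):

static inline enum ctx_state ct_state(void)
{
        return context_tracking_is_enabled() ?
                this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
}

#define CT_WARN_ON(cond)  WARN_ON(context_tracking_is_enabled() && (cond))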

--Andy


-- 
Andy Lutomirski
AMA Capital Management, LLC

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-18 22:35   ` Andy Lutomirski
@ 2015-08-18 23:02     ` Frederic Weisbecker
  2015-08-18 23:07       ` Andy Lutomirski
  0 siblings, 1 reply; 10+ messages in thread
From: Frederic Weisbecker @ 2015-08-18 23:02 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, X86 ML, Sasha Levin, Brian Gerst,
	Denys Vlasenko, linux-kernel, Oleg Nesterov, Borislav Petkov,
	Rik van Riel

On Tue, Aug 18, 2015 at 03:35:30PM -0700, Andy Lutomirski wrote:
> On Tue, Aug 18, 2015 at 3:16 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > On Tue, Aug 18, 2015 at 12:11:59PM -0700, Andy Lutomirski wrote:
> >> This fixes a couple minor holes if we took an IRQ very early in syscall
> >> processing:
> >>
> >>  - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
> >>    was fine), but we could warn if all the debugging options were
> >>    set.
> >
> > So this is fixing issues after your changes that call user_exit() from
> > IRQs, right?
> 
> Yes.  Here's an example splat, courtesy of Sasha:
> 
> https://gist.github.com/sashalevin/a006a44989312f6835e7
> 
> >
> > But the IRQs aren't supposed to call user_exit(), they have their own hooks.
> > That's where the real issue is.
> 
> In -tip, the assumption is that we *always* switch to CONTEXT_KERNEL
> when entering the kernel for a non-NMI reason.

Why? IRQs don't need that! We already have irq_enter()/irq_exit().

And we don't want to call rcu_user_*() pairs on IRQs, you're
introducing a serious performance regression here! And I'm talking about
the code that's currently in -tip.

> That means that we can
> avoid all of the (expensive!) checks for what context we're in.

If you're referring to context tracking, the context check is a per-cpu
read. Not something that's usually considered expensive.

> It also means that (other than IRQs, which need further cleanup), we only
> switch once per user/kernel switch.

???

> 
> The cost for doing so should be essentially zero, modulo artifacts from
> poor inlining.

And modulo rcu_user_*() that do multiple costly atomic_add_return() operations
implying full memory barriers. Plus the unnecessary vtime accounting that doubles
the existing one in irq_enter/exit() (those even imply a lock currently, which will
probably be turned to seqcount, but still, full memory barriers...).
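
Concretely, each rcu_user_exit()/rcu_user_enter() does something like this
to the dynticks counter (simplified sketch of kernel/rcu/tree.c):

static void rcu_eqs_exit_common(long long oldval, int user)
{
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

        smp_mb__before_atomic();     /* order against the prior EQS */
        atomic_inc(&rdtp->dynticks); /* odd => RCU is watching this CPU */
        smp_mb__after_atomic();      /* order against later read sides */
        WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}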

I'm sorry but I'm going to NACK any code that does that in IRQs (and again that
concerns current tip:x86/asm).

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-18 23:02     ` Frederic Weisbecker
@ 2015-08-18 23:07       ` Andy Lutomirski
  2015-08-19 17:10         ` Frederic Weisbecker
  0 siblings, 1 reply; 10+ messages in thread
From: Andy Lutomirski @ 2015-08-18 23:07 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andy Lutomirski, X86 ML, Sasha Levin, Brian Gerst,
	Denys Vlasenko, linux-kernel, Oleg Nesterov, Borislav Petkov,
	Rik van Riel

On Tue, Aug 18, 2015 at 4:02 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> On Tue, Aug 18, 2015 at 03:35:30PM -0700, Andy Lutomirski wrote:
>> On Tue, Aug 18, 2015 at 3:16 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>> > On Tue, Aug 18, 2015 at 12:11:59PM -0700, Andy Lutomirski wrote:
>> >> This fixes a couple minor holes if we took an IRQ very early in syscall
>> >> processing:
>> >>
>> >>  - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
>> >>    was fine), but we could warn if all the debugging options were
>> >>    set.
>> >
>> > So this is fixing issues after your changes that call user_exit() from
>> > IRQs, right?
>>
>> Yes.  Here's an example splat, courtesy of Sasha:
>>
>> https://gist.github.com/sashalevin/a006a44989312f6835e7
>>
>> >
>> > But the IRQs aren't supposed to call user_exit(), they have their own hooks.
>> > That's where the real issue is.
>>
>> In -tip, the assumption is that we *always* switch to CONTEXT_KERNEL
>> when entering the kernel for a non-NMI reason.
>
> Why? IRQs don't need that! We already have irq_enter()/irq_exit().
>

Those are certainly redundant.  I want to have a real hook to call
that says "switch to IRQ context from CONTEXT_USER" or "switch to IRQ
context from CONTEXT_KERNEL" (aka noop), but that doesn't currently
exist.

> And we don't want to call rcu_user_*() pairs on IRQs, you're
> introducing a serious performance regression here! And I'm talking about
> the code that's currently in -tip.

Is there an easy way to fix it?  For example, could we figure out what
makes it take so long and make it faster?  If we need to, we could
back out the IRQ bit and change the assertions for 4.3, but I'd rather
keep the exact context tracking if at all possible.

>
>> That means that we can
>> avoid all of the (expensive!) checks for what context we're in.
>
> If you're referring to context tracking, the context check is a per-cpu
> read. Not something that's usually considered expensive.

In -tip, there aren't even extra branches, except those imposed by the
user_exit implementation.

>
>> It also means that (other than IRQs, which need further cleanup), we only
>> switch once per user/kernel switch.
>
> ???

In 4.2 and before, we can switch multiple times on the way out of the
kernel, via SCHEDULE_USER, do_notify_resume, etc.  In -tip, we do it
exactly once no matter what.
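
i.e. the -tip exit path is structured like this (sketch from memory; the
flag mask and the handle_work() helper are stand-ins, not the real names):

__visible void prepare_exit_to_usermode(struct pt_regs *regs)
{
        local_irq_disable();

        while (READ_ONCE(current_thread_info()->flags) &
               (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_RESUME)) {
                local_irq_enable();
                handle_work(regs);  /* schedule(), do_signal(), etc. */
                local_irq_disable();
        }

        user_enter();  /* the single CONTEXT_KERNEL -> CONTEXT_USER switch */
}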

>
>>
>> The cost for doing so should be essentially zero, modulo artifacts from
>> poor inlining.
>
> And modulo rcu_user_*() that do multiple costly atomic_add_return() operations
> implying full memory barriers. Plus the unnecessary vtime accounting that doubles
> the existing one in irq_enter/exit() (those even imply a lock currently, which will
> probably be turned to seqcount, but still, full memory barriers...).
>
> I'm sorry but I'm going to NACK any code that does that in IRQs (and again that
> concerns current tip:x86/asm).

Why do we need these heavyweight barriers?

If there's actually a measurable performance hit in IRQs in -tip, then
can we come up with a better fix?  For example, we could change all
the new CT_WARN_ON calls to check "are we in CONTEXT_KERNEL or in IRQ
context" and make the IRQ entry do a lighter weight context tracking
operation.
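
Something like this, say (hypothetical, just to illustrate the idea):

        /* Relaxed assertion: CONTEXT_KERNEL, or any context in an IRQ */
        CT_WARN_ON(ct_state() != CONTEXT_KERNEL && !in_irq());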

But I think I'm still missing something fundamental about the
performance: why is irq_enter() any faster than user_exit()?

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-18 19:11 [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts Andy Lutomirski
  2015-08-18 22:16 ` Frederic Weisbecker
@ 2015-08-19  0:30 ` Andy Lutomirski
  2015-08-19 15:54   ` Andy Lutomirski
  1 sibling, 1 reply; 10+ messages in thread
From: Andy Lutomirski @ 2015-08-19  0:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, Sasha Levin, Brian Gerst, Frédéric Weisbecker,
	Denys Vlasenko, linux-kernel, Oleg Nesterov, Borislav Petkov

On Tue, Aug 18, 2015 at 12:11 PM, Andy Lutomirski <luto@kernel.org> wrote:
> This fixes a couple minor holes if we took an IRQ very early in syscall
> processing:
>

The patch is buggy.  v2 coming soon, hopefully.

--Andy

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-19  0:30 ` Andy Lutomirski
@ 2015-08-19 15:54   ` Andy Lutomirski
  0 siblings, 0 replies; 10+ messages in thread
From: Andy Lutomirski @ 2015-08-19 15:54 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, Sasha Levin, Brian Gerst, Frédéric Weisbecker,
	Denys Vlasenko, linux-kernel, Oleg Nesterov, Borislav Petkov

On Tue, Aug 18, 2015 at 5:30 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Tue, Aug 18, 2015 at 12:11 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> This fixes a couple minor holes if we took an IRQ very early in syscall
>> processing:
>>
>
> The patch is buggy.  v2 coming soon, hopefully.
>

No, let's drop this for now.  It would be straightforward but quite
messy to fix it, and I don't want to introduce further
incomprehensibility into the asm.  I'll send the warning change
instead, and we can fix it better for 4.4.

--Andy

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-18 23:07       ` Andy Lutomirski
@ 2015-08-19 17:10         ` Frederic Weisbecker
  2015-08-21  7:50           ` Ingo Molnar
  0 siblings, 1 reply; 10+ messages in thread
From: Frederic Weisbecker @ 2015-08-19 17:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, X86 ML, Sasha Levin, Brian Gerst,
	Denys Vlasenko, linux-kernel, Oleg Nesterov, Borislav Petkov,
	Rik van Riel

On Tue, Aug 18, 2015 at 04:07:51PM -0700, Andy Lutomirski wrote:
> On Tue, Aug 18, 2015 at 4:02 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > On Tue, Aug 18, 2015 at 03:35:30PM -0700, Andy Lutomirski wrote:
> >> On Tue, Aug 18, 2015 at 3:16 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> >> > On Tue, Aug 18, 2015 at 12:11:59PM -0700, Andy Lutomirski wrote:
> >> >> This fixes a couple minor holes if we took an IRQ very early in syscall
> >> >> processing:
> >> >>
> >> >>  - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
> >> >>    was fine), but we could warn if all the debugging options were
> >> >>    set.
> >> >
> >> > So this is fixing issues after your changes that call user_exit() from
> >> > IRQs, right?
> >>
> >> Yes.  Here's an example splat, courtesy of Sasha:
> >>
> >> https://gist.github.com/sashalevin/a006a44989312f6835e7
> >>
> >> >
> >> > But the IRQs aren't supposed to call user_exit(), they have their own hooks.
> >> > That's where the real issue is.
> >>
> >> In -tip, the assumption is that we *always* switch to CONTEXT_KERNEL
> >> when entering the kernel for a non-NMI reason.
> >
> > Why? IRQs don't need that! We already have irq_enter()/irq_exit().
> >
> 
> Those are certainly redundant.

So? What's the point in duplicating a hook in arch code that core code already
has?

> I want to have a real hook to call
> that says "switch to IRQ context from CONTEXT_USER" or "switch to IRQ
> context from CONTEXT_KERNEL" (aka noop), but that doesn't currently
> exist.

You're not answering _why_ you want that.

> 
> > And we don't want to call rcu_user_*() pairs on IRQs, you're
> > introducing a serious performance regression here! And I'm talking about
> > the code that's currently in -tip.
> 
> Is there an easy way to fix it?  For example, could we figure out what
> makes it take so long and make it faster?

Sure, just remove your arch IRQ hook.

> If we need to, we could
> back out the IRQ bit and change the assertions for 4.3, but I'd rather
> keep the exact context tracking if at all possible.

I have no idea what you mean by exact context tracking here.

But if we ever want to call irq_enter() using arch hooks, and I have no idea why
we would ever want to do that since that involves complexifying the code
by $NR_ARCHS and moving C code to ASM, we need serious reasons! And that's
certainly not something we are going to plan now for the next week's merge window.

> >> That means that we can
> >> avoid all of the (expensive!) checks for what context we're in.
> >
> > If you're referring to context tracking, the context check is a per-cpu
> > read. Not something that's usually considered expensive.
> 
> In -tip, there aren't even extra branches, except those imposed by the
> user_exit implementation.

No, there is the "call enter_from_user_mode" in the IRQ fast path.

> 
> >
> >> It also means that (other than IRQs, which need further cleanup), we only
> >> switch once per user/kernel switch.
> >
> > ???
> 
> In 4.2 and before, we can switch multiple times on the way out of the
> kernel, via SCHEDULE_USER, do_notify_resume, etc.  In -tip, we do it
> exactly once no matter what.

That's what we want for syscalls but not for IRQs.

> 
> >
> >>
> >> The cost for doing so should be essentially zero, modulo artifacts from
> >> poor inlining.
> >
> > And modulo rcu_user_*() that do multiple costly atomic_add_return() operations
> > implying full memory barriers. Plus the unnecessary vtime accounting that doubles
> > the existing one in irq_enter/exit() (those even imply a lock currently, which will
> > probably be turned to seqcount, but still, full memory barriers...).
> >
> > I'm sorry but I'm going to NACK any code that does that in IRQs (and again that
> > concerns current tip:x86/asm).
> 
> Why do we need these heavyweight barriers?

Actually it's not full barriers but atomic ones (smp_mb__after_atomic_stuff()).
I suspect we can't do much better given RCU requirements.

Still we don't need to call it twice.

> 
> If there's actually a measurable performance hit in IRQs in -tip, then
> can we come up with a better fix?

I'm sure it's very easily measurable.

> For example, we could change all
> the new CT_WARN_ON calls to check "are we in CONTEXT_KERNEL or in IRQ
> context" and make the IRQ entry do a lighter weight context tracking
> operation.

I don't see what we need to check actually. Context tracking can be in any
state while in IRQ.

> 
> But I think I'm still missing something fundamental about the
> performance: why is irq_enter() any faster than user_exit()?

It's slightly faster at least because it takes care of nesting IRQs which
is likely with softirqs that get interrupted.

Now of course we wouldn't call user_exit() in this case, but the hook is there
in generic code, no need for anything from the arch.
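
i.e. the generic hook is nesting-aware and takes the cheap path on all but
the outermost IRQ (simplified sketch of rcu_irq_enter(); the real code also
saves and restores IRQ flags):

void rcu_irq_enter(void)
{
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
        long long oldval = rdtp->dynticks_nesting;

        rdtp->dynticks_nesting++;
        if (oldval)
                return;                  /* nested: per-cpu increment only */
        rcu_eqs_exit_common(oldval, 1);  /* outermost: pay the barriers */
}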

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-19 17:10         ` Frederic Weisbecker
@ 2015-08-21  7:50           ` Ingo Molnar
  2015-08-21 13:14             ` Frederic Weisbecker
  0 siblings, 1 reply; 10+ messages in thread
From: Ingo Molnar @ 2015-08-21  7:50 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andy Lutomirski, Andy Lutomirski, X86 ML, Sasha Levin,
	Brian Gerst, Denys Vlasenko, linux-kernel, Oleg Nesterov,
	Borislav Petkov, Rik van Riel


* Frederic Weisbecker <fweisbec@gmail.com> wrote:

> > I want to have a real hook to call that says "switch to IRQ context from 
> > CONTEXT_USER" or "switch to IRQ context from CONTEXT_KERNEL" (aka noop), but 
> > that doesn't currently exist.
> 
> You're not answering _why_ you want that.

So we'd have a comprehensive, 100% coverage, self-sufficient set of callbacks that 
track the kernel's current context state at the points where the context switches 
actually occur - not just something cobbled together heterogeneously. The low level 
x86 asm code was rather messy in this area, better organization would be welcome, 
I don't think we can overdo it.

( I'm assuming here that it can all be done for zero or negative cost, and that
  the result will be correct and won't hurt existing users in any fashion. )

Thanks,

	Ingo

* Re: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
  2015-08-21  7:50           ` Ingo Molnar
@ 2015-08-21 13:14             ` Frederic Weisbecker
  0 siblings, 0 replies; 10+ messages in thread
From: Frederic Weisbecker @ 2015-08-21 13:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andy Lutomirski, Andy Lutomirski, X86 ML, Sasha Levin,
	Brian Gerst, Denys Vlasenko, linux-kernel, Oleg Nesterov,
	Borislav Petkov, Rik van Riel

On Fri, Aug 21, 2015 at 09:50:35AM +0200, Ingo Molnar wrote:
> 
> * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 
> > > I want to have a real hook to call that says "switch to IRQ context from 
> > > CONTEXT_USER" or "switch to IRQ context from CONTEXT_KERNEL" (aka noop), but 
> > > that doesn't currently exist.
> > 
> > You're not answering _why_ you want that.
> 
> So we'd have a comprehensive, 100% coverage, self-sufficient set of callbacks that 
> track the kernel's current context state at the points where the context switches 
> > actually occur - not just something cobbled together heterogeneously. The low level 
> x86 asm code was rather messy in this area, better organization would be welcome, 
> I don't think we can overdo it.
> 
> ( I'm assuming here that it can all be done for zero or negative cost, and that
>   the result will be correct and won't hurt existing users in any fashion. )

Sure, if x86 needs such a thing for internal organization it's ok, but don't
do context tracking calls there. irq_enter() and irq_exit() already take care
of RCU and time accounting. What Andy is doing is roughly like calling these
callbacks twice for no reason. It's horrible for IRQ performance.

The merge window is like tomorrow and he keeps arguing against that obvious
regression instead of fixing it.

Thanks.

