All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andy Lutomirski <luto@kernel.org>
To: x86@kernel.org
Cc: linux-kernel@vger.kernel.org, "Brian Gerst" <brgerst@gmail.com>,
	"Borislav Petkov" <bp@alien8.de>,
	"Frédéric Weisbecker" <fweisbec@gmail.com>,
	"Denys Vlasenko" <dvlasenk@redhat.com>,
	"Linus Torvalds" <torvalds@linux-foundation.org>,
	"Andy Lutomirski" <luto@kernel.org>
Subject: [PATCH 10/12] x86/entry/64: Migrate the 64-bit syscall slow path to C
Date: Mon,  7 Dec 2015 13:51:35 -0800	[thread overview]
Message-ID: <e1e7884bbff6496b39b4da51d8b37a1dfd149f5e.1449522077.git.luto@kernel.org> (raw)
In-Reply-To: <cover.1449522077.git.luto@kernel.org>

This is more complicated than the 32-bit and compat cases because it
preserves an asm fast path for the case where the callee-saved regs
aren't needed in pt_regs and no entry or exit work needs to be done.

This appears to slow down fastpath syscalls by no more than one cycle
on my Skylake laptop.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/entry/common.c   |  26 ++++++++++
 arch/x86/entry/entry_64.S | 124 +++++++++++++++-------------------------------
 2 files changed, 67 insertions(+), 83 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc1f0be..d45119e770ef 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
 	prepare_exit_to_usermode(regs);
 }
 
+#ifdef CONFIG_X86_64
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+	struct thread_info *ti = pt_regs_to_thread_info(regs);
+	unsigned long nr = regs->orig_ax;
+
+	local_irq_enable();
+
+	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
+		nr = syscall_trace_enter(regs);
+
+	/*
+	 * NB: Native and x32 syscalls are dispatched from the same
+	 * table.  The only functional difference is the x32 bit in
+	 * regs->orig_ax, which changes the behavior of some syscalls.
+	 */
+	if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
+		regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+			regs->di, regs->si, regs->dx,
+			regs->r10, regs->r8, regs->r9);
+	}
+
+	syscall_return_slowpath(regs);
+}
+#endif
+
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 /*
  * Does a 32-bit syscall.  Called with IRQs on and does all entry and
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 81b0944708c5..1ab5362f241d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
+	TRACE_IRQS_OFF
+
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
 	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
-	/*
-	 * Re-enable interrupts.
-	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-	 * must execute atomically in the face of possible interrupt-driven
-	 * task preemption. We must enable interrupts only after we're done
-	 * with using rsp_scratch:
-	 */
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq	%r11				/* pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
@@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r11				/* pt_regs->r11 */
 	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
 
-	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	tracesys
+	/*
+	 * If we need to do entry work or if we guess we'll need to do
+	 * exit work, go straight to the slow path.
+	 */
+	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	entry_SYSCALL64_slow_path
+
 entry_SYSCALL_64_fastpath:
+	/*
+	 * Easy case: enable interrupts and issue the syscall.  If the syscall
+	 * needs pt_regs, we'll call a stub that disables interrupts again
+	 * and jumps to the slow path.
+	 */
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
 #if __SYSCALL_MASK == ~0
 	cmpq	$__NR_syscall_max, %rax
 #else
@@ -185,93 +191,43 @@ entry_SYSCALL_64_fastpath:
 	call	*sys_call_table_fastpath_64(, %rax, 8)
 	movq	%rax, RAX(%rsp)
 1:
-/*
- * Syscall return path ending with SYSRET (fast path).
- * Has incompletely filled pt_regs.
- */
-	LOCKDEP_SYS_EXIT
-	/*
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
-	DISABLE_INTERRUPTS(CLBR_NONE)
 
 	/*
-	 * We must check ti flags with interrupts (or at least preemption)
-	 * off because we must *never* return to userspace without
-	 * processing exit work that is enqueued if we're preempted here.
-	 * In particular, returning to userspace with any of the one-shot
-	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
-	 * very bad.
+	 * If we get here, then we know that pt_regs is clean for SYSRET64.
+	 * If we see that no exit work is required (which we are required
+	 * to check with IRQs off), then we can go straight to SYSRET64.
 	 */
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
 	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
+	jnz	1f
 
-	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RIP(%rsp), %rcx
-	movq	EFLAGS(%rsp), %r11
+	LOCKDEP_SYS_EXIT
+	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
+	RESTORE_C_REGS
 	movq	RSP(%rsp), %rsp
-	/*
-	 * 64-bit SYSRET restores rip from rcx,
-	 * rflags from r11 (but RF and VM bits are forced to 0),
-	 * cs and ss are loaded from MSRs.
-	 * Restoration of rflags re-enables interrupts.
-	 *
-	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
-	 * descriptor is not reinitialized.  This means that we should
-	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
-	 * exit the kernel, and re-enter using an interrupt vector.  (All
-	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
-	 * from happening by reloading SS in __switch_to.  (Actually
-	 * detecting the failure in 64-bit userspace is tricky but can be
-	 * done.)
-	 */
 	USERGS_SYSRET64
 
-GLOBAL(int_ret_from_sys_call_irqs_off)
+1:
+	/*
+	 * The fast path looked good when we started, but something changed
+	 * along the way and we need to switch to the slow path.  Calling
+	 * raise(3) will trigger this, for example.  IRQs are off.
+	 */
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	jmp int_ret_from_sys_call
-
-	/* Do syscall entry tracing */
-tracesys:
-	movq	%rsp, %rdi
-	movl	$AUDIT_ARCH_X86_64, %esi
-	call	syscall_trace_enter_phase1
 	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
-	movl	$AUDIT_ARCH_X86_64, %esi
-	movq	%rax, %rdx
-	call	syscall_trace_enter_phase2
-
-	/*
-	 * Reload registers from stack in case ptrace changed them.
-	 * We don't reload %rax because syscall_trace_entry_phase2() returned
-	 * the value it wants us to use in the table lookup.
-	 */
-	RESTORE_C_REGS_EXCEPT_RAX
-#if __SYSCALL_MASK == ~0
-	cmpq	$__NR_syscall_max, %rax
-#else
-	andl	$__SYSCALL_MASK, %eax
-	cmpl	$__NR_syscall_max, %eax
-#endif
-	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
-	movq	%r10, %rcx			/* fixup for C */
-	call	*sys_call_table(, %rax, 8)
-	movq	%rax, RAX(%rsp)
-	RESTORE_EXTRA_REGS
-1:
-	/* Use IRET because user could have changed pt_regs->foo */
+	call	syscall_return_slowpath	/* returns with IRQs disabled */
+	jmp	return_from_SYSCALL_64
 
-/*
- * Syscall return path ending with IRET.
- * Has correct iret frame.
- */
-GLOBAL(int_ret_from_sys_call)
+entry_SYSCALL64_slow_path:
+	/* IRQs are off. */
 	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
-	call	syscall_return_slowpath	/* returns with IRQs disabled */
+	call	do_syscall_64		/* returns with IRQs disabled */
+
+return_from_SYSCALL_64:
 	RESTORE_EXTRA_REGS
 	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
@@ -353,8 +309,10 @@ ENTRY(stub_ptregs_64)
 	 * Syscalls marked as needing ptregs that go through the fast path
 	 * land here.  We transfer to the slow path.
 	 */
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
 	addq	$8, %rsp
-	jmp	tracesys
+	jmp	entry_SYSCALL64_slow_path
 END(stub_ptregs_64)
 
 /*
-- 
2.5.0


  parent reply	other threads:[~2015-12-07 21:52 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-12-07 21:51 [PATCH 00/12] x86: Rewrite 64-bit syscall code Andy Lutomirski
2015-12-07 21:51 ` [PATCH 01/12] selftests/x86: Extend Makefile to allow 64-bit only tests Andy Lutomirski
2015-12-08  9:34   ` Borislav Petkov
2015-12-09 18:55     ` Andy Lutomirski
2015-12-09 19:11   ` Shuah Khan
2015-12-09 19:22     ` Andy Lutomirski
2015-12-09 19:58       ` Shuah Khan
2015-12-07 21:51 ` [PATCH 02/12] selftests/x86: Add check_initial_reg_state Andy Lutomirski
2015-12-08  9:54   ` Borislav Petkov
2015-12-09 18:56     ` Andy Lutomirski
2015-12-09 19:09       ` Borislav Petkov
2015-12-09 19:20         ` Andy Lutomirski
2015-12-09 19:28           ` Borislav Petkov
2015-12-07 21:51 ` [PATCH 03/12] x86/syscalls: Refactor syscalltbl.sh Andy Lutomirski
2015-12-07 21:51 ` [PATCH 04/12] x86/syscalls: Remove __SYSCALL_COMMON and __SYSCALL_X32 Andy Lutomirski
2015-12-07 21:51 ` [PATCH 05/12] x86/syscalls: Move compat syscall entry handling into syscalltbl.sh Andy Lutomirski
2015-12-07 21:51 ` [PATCH 06/12] x86/syscalls: Add syscall entry qualifiers Andy Lutomirski
2015-12-07 21:51 ` [PATCH 07/12] x86/entry/64: Always run ptregs-using syscalls on the slow path Andy Lutomirski
2015-12-08  0:50   ` Brian Gerst
2015-12-08  0:54     ` Brian Gerst
2015-12-08  1:12       ` Andy Lutomirski
2015-12-08 13:07         ` Brian Gerst
2015-12-08 18:56           ` Ingo Molnar
2015-12-08 21:51             ` Andy Lutomirski
2015-12-09  4:43   ` Brian Gerst
2015-12-09  5:45     ` Andy Lutomirski
2015-12-09  6:21       ` Andy Lutomirski
2015-12-09 12:52         ` Brian Gerst
2015-12-09 13:02         ` [PATCH] x86/entry/64: Remove duplicate syscall table for fast path Brian Gerst
2015-12-09 18:53           ` Andy Lutomirski
2015-12-09 21:08             ` Brian Gerst
2015-12-09 21:15               ` Andy Lutomirski
2015-12-09 23:50                 ` Andy Lutomirski
2015-12-10  5:42                   ` Brian Gerst
2015-12-10  5:54                     ` Andy Lutomirski
2015-12-09 19:30           ` Andy Lutomirski
2015-12-07 21:51 ` [PATCH 08/12] x86/entry/64: Call all native slow-path syscalls with full pt-regs Andy Lutomirski
2015-12-07 21:51 ` [PATCH 09/12] x86/entry/64: Stop using int_ret_from_sys_call in ret_from_fork Andy Lutomirski
2015-12-07 21:51 ` Andy Lutomirski [this message]
2015-12-07 21:51 ` [PATCH 11/12] x86/entry/32: Change INT80 to be an interrupt gate Andy Lutomirski
2016-04-01  1:45   ` Rusty Russell
2016-04-01  7:40     ` [tip:x86/urgent] lguest, x86/entry/32: Fix handling of guest syscalls using interrupt gates tip-bot for Rusty Russell
2015-12-07 21:51 ` [PATCH 12/12] x86/entry: Do enter_from_user_mode with IRQs off Andy Lutomirski
2015-12-07 22:55 ` [PATCH 00/12] x86: Rewrite 64-bit syscall code Andy Lutomirski
2015-12-08  4:42   ` Ingo Molnar
2015-12-08  5:42     ` Andy Lutomirski
2015-12-08  7:00       ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e1e7884bbff6496b39b4da51d8b37a1dfd149f5e.1449522077.git.luto@kernel.org \
    --to=luto@kernel.org \
    --cc=bp@alien8.de \
    --cc=brgerst@gmail.com \
    --cc=dvlasenk@redhat.com \
    --cc=fweisbec@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.