[RFC] de-asmify the x86-64 system call slowpath

* [RFC] de-asmify the x86-64 system call slowpath
@ 2014-01-26 22:28 Linus Torvalds
  2014-01-27  0:22 ` Al Viro
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Linus Torvalds @ 2014-01-26 22:28 UTC (permalink / raw)
  To: Peter Anvin, Ingo Molnar, Thomas Gleixner, Peter Zijlstra
  Cc: the arch/x86 maintainers, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 2092 bytes --]

The x86-64 (and 32-bit, for that matter) system call slowpaths are all
in C, but the *selection* of which slow-path to take is a mixture of
complicated assembler ("sysret_check -> sysret_careful ->
sysret_signal ->sysret_audit -> int_check_syscall_exit_work" etc), and
oddly named and placed C code ("schedule_user" vs
"__audit_syscall_exit" vs "do_notify_resume").

This attached patch tries to take the "do_notify_resume()" approach,
and renaming it to something sane ("syscall_exit_slowpath") and call
out to *all* the different slow cases from that one place, instead of
having some cases hardcoded in asm, and some in C. And instead of
hardcoding which cases result in a "iretq" and which cases result in a
faster sysret case, it's now simply a return value from that
syscall_exit_slowpath() function, so it's very natural and easy to say
"taking a signal will force us to do the slow iretq case, but we can
do the task exit work and still do the sysret".

I've marked this as an RFC, because I didn't bother trying to clean up
the 32-bit code similarly (no test-cases, and trust me, if you get
this wrong, it will fail spectacularly but in very subtle and
hard-to-debug ways), and I also didn't bother with the slow cases in
the "iretq" path, so that path still has the odd asm cases and calls
the old (now legacy) do_notify_resume() path.

But this is actually tested, and seems to work (including limited
testing with strace, gdb etc), and while it adds a few more lines than
it removes, the removed lines are mostly asm, and added lines are C
(and part of them are the temporary still-extant do_notify_resume()
wrapper). In particular, it should be fairly straightforward to take
this as a starting point, removing the extant
do_notify_resume/schedule/etc cases one by one, and get rid of more
asm code and finally the wrapper.

Comments? This was obviously brought on by my frustration with the
currently nasty do_notify_resume() always returning to iret for the
task_work case, and PeterZ's patch that fixed that, but made the asm
mess even *worse*.

                        Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/plain, Size: 4573 bytes --]

 arch/x86/kernel/entry_64.S | 51 ++++++++++++++--------------------------------
 arch/x86/kernel/signal.c   | 37 ++++++++++++++++++++++++++++-----
 2 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..15b5953da958 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -658,30 +658,24 @@ sysret_check:
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */
 sysret_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
-	SCHEDULE_USER
-	popq_cfi %rdi
-	jmp sysret_check
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	movq %rsp,%rdi
+	movl %edx,%esi
+	call syscall_exit_slowpath
+	movl $_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT,%edi
+	testl %eax,%eax
+	jne sysret_using_iret
+	addq $REST_SKIP,%rsp
+	jmp sysret_check;
 
-	/* Handle a signal */
-sysret_signal:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
-	bt $TIF_SYSCALL_AUDIT,%edx
-	jc sysret_audit
-#endif
-	/*
-	 * We have a signal, or exit tracing or single-step.
-	 * These all wind up with the iret return path anyway,
-	 * so just join that path right now.
-	 */
-	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-	jmp int_check_syscall_exit_work
+sysret_using_iret:
+	RESTORE_REST
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+	jmp int_with_check
 
 badsys:
 	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -703,20 +697,6 @@ auditsys:
 	call __audit_syscall_entry
 	LOAD_ARGS 0		/* reload call-clobbered registers */
 	jmp system_call_fastpath
-
-	/*
-	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
-	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
-	 * masked off.
-	 */
-sysret_audit:
-	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
-	setbe %al		/* 1 if so, 0 if not */
-	movzbl %al,%edi		/* zero-extend that into %edi */
-	call __audit_syscall_exit
-	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-	jmp sysret_check
 #endif	/* CONFIG_AUDITSYSCALL */
 
 	/* Do syscall tracing */
@@ -786,7 +766,6 @@ int_careful:
 int_very_careful:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
 	SAVE_REST
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9e5de6813e1f..25d02f6a65f1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -725,13 +725,30 @@ static void do_signal(struct pt_regs *regs)
 }
 
 /*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
+ * This is the system call exit slowpath, called if any of
+ * the thread info flags are set that cause us to not just
+ * return using a 'sysret' instruction.
+ *
+ * Return zero if the user register state hasn't changed,
+ * and a 'sysret' is still ok. Return nonzero if we need
+ * to go to the slow 'iret' path due to possible register
+ * changes (signals, ptrace, whatever).
  */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+__visible int syscall_exit_slowpath(struct pt_regs *regs, u32 thread_info_flags)
 {
+	int retval = 0;
+
 	user_exit();
+	if (thread_info_flags & _TIF_NEED_RESCHED)
+		schedule();
+
+	if (thread_info_flags & _TIF_WORK_SYSCALL_EXIT)
+		syscall_trace_leave(regs);
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (thread_info_flags & _TIF_SYSCALL_AUDIT)
+		__audit_syscall_exit(regs->ax, regs->ax < -MAX_ERRNO);
+#endif
 
 #ifdef CONFIG_X86_MCE
 	/* notify userspace of pending MCEs */
@@ -743,8 +760,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		uprobe_notify_resume(regs);
 
 	/* deal with pending signal delivery */
-	if (thread_info_flags & _TIF_SIGPENDING)
+	if (thread_info_flags & _TIF_SIGPENDING) {
 		do_signal(regs);
+		retval = 1;
+	}
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
@@ -754,8 +773,16 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		fire_user_return_notifiers();
 
 	user_enter();
+	return retval;
 }
 
+/* Get rid of this old interface some day.. */
+__visible void do_notify_resume(struct pt_regs *regs, void *unused, u32 thread_info_flags)
+{
+	syscall_exit_slowpath(regs, thread_info_flags & ~(_TIF_NEED_RESCHED|_TIF_SYSCALL_AUDIT));
+}
+
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;

^ permalink raw reply related	[flat|nested] 32+ messages in thread