* [kernel-hardening] [PATCH 0/3] x86/pti-ish syscall cleanups
From: Andy Lutomirski @ 2018-01-28 18:38 UTC
  To: x86, LKML
  Cc: Linus Torvalds, Kernel Hardening, Borislav Petkov, Andy Lutomirski

Three changes in here:
 - Get rid of the SYSCALL64 fast path as suggested by Linus.

 - Move TS_COMPAT into the same cacheline as thread_info::flags, also
   suggested by Linus.

 - Document SYSCALL_DEFINE a bit better.

Andy Lutomirski (3):
  x86/entry/64: Remove the SYSCALL64 fast path
  x86/asm: Move 'status' from thread_struct to thread_info
  syscalls: Add a bit of documentation to __SYSCALL_DEFINE

 arch/x86/entry/common.c            |   4 +-
 arch/x86/entry/entry_64.S          | 127 ++-----------------------------------
 arch/x86/entry/syscall_64.c        |   7 +-
 arch/x86/include/asm/processor.h   |   2 -
 arch/x86/include/asm/syscall.h     |   6 +-
 arch/x86/include/asm/thread_info.h |   3 +-
 arch/x86/kernel/process_64.c       |   4 +-
 arch/x86/kernel/ptrace.c           |   2 +-
 arch/x86/kernel/signal.c           |   2 +-
 include/linux/syscalls.h           |  10 +++
 10 files changed, 30 insertions(+), 137 deletions(-)

-- 
2.14.3

* [kernel-hardening] [PATCH 1/3] x86/entry/64: Remove the SYSCALL64 fast path
From: Andy Lutomirski @ 2018-01-28 18:38 UTC
  To: x86, LKML
  Cc: Linus Torvalds, Kernel Hardening, Borislav Petkov, Andy Lutomirski

The SYSCALL64 fast path was a nice, if small, optimization back in
the good old days when syscalls were actually reasonably fast.  Now
we have PTI to slow everything down, and indirect branches are
verboten, making everything messier.  The retpoline code in the fast
path was particularly nasty.

Just get rid of the fast path.  The slow path is barely slower.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---

This isn't quite identical to Linus' patch.  I cleaned up the
SYSCALL64 entry code to use all pushes rather than pushing all but 6
regs and moving the rest.

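For context, the surviving slow path is the C entry point do_syscall_64().
A simplified sketch of its shape at the time (from arch/x86/entry/common.c;
details elided, not part of this patch):

__visible void do_syscall_64(struct pt_regs *regs)
{
        struct thread_info *ti = current_thread_info();
        unsigned long nr = regs->orig_ax;

        enter_from_user_mode();
        local_irq_enable();

        /* do entry work (ptrace, seccomp, tracing) if any flag asks for it */
        if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
                nr = syscall_trace_enter(regs);

        /* table dispatch; all six argument registers are passed through */
        if (likely((nr & __SYSCALL_MASK) < NR_syscalls))
                regs->ax = sys_call_table[nr & __SYSCALL_MASK](
                                regs->di, regs->si, regs->dx,
                                regs->r10, regs->r8, regs->r9);

        syscall_return_slowpath(regs);
}
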
 arch/x86/entry/entry_64.S   | 127 +++-----------------------------------------
 arch/x86/entry/syscall_64.c |   7 +--
 2 files changed, 9 insertions(+), 125 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4f8e1d35a97c..7fd98c8b5907 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -236,91 +236,20 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%r9				/* pt_regs->r9 */
 	pushq	%r10				/* pt_regs->r10 */
 	pushq	%r11				/* pt_regs->r11 */
-	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
-	UNWIND_HINT_REGS extra=0
-
-	TRACE_IRQS_OFF
-
-	/*
-	 * If we need to do entry work or if we guess we'll need to do
-	 * exit work, go straight to the slow path.
-	 */
-	movq	PER_CPU_VAR(current_task), %r11
-	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
-	jnz	entry_SYSCALL64_slow_path
-
-entry_SYSCALL_64_fastpath:
-	/*
-	 * Easy case: enable interrupts and issue the syscall.  If the syscall
-	 * needs pt_regs, we'll call a stub that disables interrupts again
-	 * and jumps to the slow path.
-	 */
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-#if __SYSCALL_MASK == ~0
-	cmpq	$__NR_syscall_max, %rax
-#else
-	andl	$__SYSCALL_MASK, %eax
-	cmpl	$__NR_syscall_max, %eax
-#endif
-	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
-	movq	%r10, %rcx
-
-	/*
-	 * This call instruction is handled specially in stub_ptregs_64.
-	 * It might end up jumping to the slow path.  If it jumps, RAX
-	 * and all argument registers are clobbered.
-	 */
-#ifdef CONFIG_RETPOLINE
-	movq	sys_call_table(, %rax, 8), %rax
-	call	__x86_indirect_thunk_rax
-#else
-	call	*sys_call_table(, %rax, 8)
-#endif
-.Lentry_SYSCALL_64_after_fastpath_call:
-
-	movq	%rax, RAX(%rsp)
-1:
+	pushq	%rbx				/* pt_regs->rbx */
+	pushq	%rbp				/* pt_regs->rbp */
+	pushq	%r12				/* pt_regs->r12 */
+	pushq	%r13				/* pt_regs->r13 */
+	pushq	%r14				/* pt_regs->r14 */
+	pushq	%r15				/* pt_regs->r15 */
+	UNWIND_HINT_REGS
 
-	/*
-	 * If we get here, then we know that pt_regs is clean for SYSRET64.
-	 * If we see that no exit work is required (which we are required
-	 * to check with IRQs off), then we can go straight to SYSRET64.
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	movq	PER_CPU_VAR(current_task), %r11
-	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
-	jnz	1f
 
-	LOCKDEP_SYS_EXIT
-	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
-	movq	RIP(%rsp), %rcx
-	movq	EFLAGS(%rsp), %r11
-	addq	$6*8, %rsp	/* skip extra regs -- they were preserved */
-	UNWIND_HINT_EMPTY
-	jmp	.Lpop_c_regs_except_rcx_r11_and_sysret
-
-1:
-	/*
-	 * The fast path looked good when we started, but something changed
-	 * along the way and we need to switch to the slow path.  Calling
-	 * raise(3) will trigger this, for example.  IRQs are off.
-	 */
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	SAVE_EXTRA_REGS
-	movq	%rsp, %rdi
-	call	syscall_return_slowpath	/* returns with IRQs disabled */
-	jmp	return_from_SYSCALL_64
-
-entry_SYSCALL64_slow_path:
 	/* IRQs are off. */
-	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
 	call	do_syscall_64		/* returns with IRQs disabled */
 
-return_from_SYSCALL_64:
 	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
 	/*
@@ -393,7 +322,6 @@ syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	UNWIND_HINT_EMPTY
 	POP_EXTRA_REGS
-.Lpop_c_regs_except_rcx_r11_and_sysret:
 	popq	%rsi	/* skip r11 */
 	popq	%r10
 	popq	%r9
@@ -424,47 +352,6 @@ syscall_return_via_sysret:
 	USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
-ENTRY(stub_ptregs_64)
-	/*
-	 * Syscalls marked as needing ptregs land here.
-	 * If we are on the fast path, we need to save the extra regs,
-	 * which we achieve by trying again on the slow path.  If we are on
-	 * the slow path, the extra regs are already saved.
-	 *
-	 * RAX stores a pointer to the C function implementing the syscall.
-	 * IRQs are on.
-	 */
-	cmpq	$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
-	jne	1f
-
-	/*
-	 * Called from fast path -- disable IRQs again, pop return address
-	 * and jump to slow path
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	popq	%rax
-	UNWIND_HINT_REGS extra=0
-	jmp	entry_SYSCALL64_slow_path
-
-1:
-	JMP_NOSPEC %rax				/* Called from C */
-END(stub_ptregs_64)
-
-.macro ptregs_stub func
-ENTRY(ptregs_\func)
-	UNWIND_HINT_FUNC
-	leaq	\func(%rip), %rax
-	jmp	stub_ptregs_64
-END(ptregs_\func)
-.endm
-
-/* Instantiate ptregs_stub for each ptregs-using syscall */
-#define __SYSCALL_64_QUAL_(sym)
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
-#include <asm/syscalls_64.h>
-
 /*
  * %rdi: prev task
  * %rsi: next task
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 9c09775e589d..c176d2fab1da 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -7,14 +7,11 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#define __SYSCALL_64_QUAL_(sym) sym
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
-
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
 
-#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
+#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
 
 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
-- 
2.14.3

* [kernel-hardening] [PATCH 2/3] x86/asm: Move 'status' from thread_struct to thread_info
From: Andy Lutomirski @ 2018-01-28 18:38 UTC
  To: x86, LKML
  Cc: Linus Torvalds, Kernel Hardening, Borislav Petkov, Andy Lutomirski

The TS_COMPAT bit is very hot and is accessed from code paths that
mostly also touch thread_info::flags.  Move it into struct
thread_info to improve cache locality.

The only reason it was in thread_struct is that there was a brief
period during which we didn't allow arch-specific fields in struct
thread_info.

Linus suggested further changing:

  ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);

to:

  if (unlikely(ti->status & (TS_COMPAT|TS_I386_REGS_POKED)))
          ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);

on the theory that frequently dirtying the cacheline even in pure
64-bit code that never needs to modify status hurts performance.
That could be a reasonable followup patch, but I suspect it matters
less on top of this patch.

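For illustration, a sketch of how that suggested followup would sit in
prepare_exit_to_usermode() (context reconstructed around the hunk below;
not part of this patch):

__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
        struct thread_info *ti = current_thread_info();
        /* ... */
#ifdef CONFIG_IA32_EMULATION
        /* test first so pure 64-bit tasks never dirty the cacheline */
        if (unlikely(ti->status & (TS_COMPAT|TS_I386_REGS_POKED)))
                ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
        /* ... */
}
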
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/entry/common.c            | 4 ++--
 arch/x86/include/asm/processor.h   | 2 --
 arch/x86/include/asm/syscall.h     | 6 +++---
 arch/x86/include/asm/thread_info.h | 3 ++-
 arch/x86/kernel/process_64.c       | 4 ++--
 arch/x86/kernel/ptrace.c           | 2 +-
 arch/x86/kernel/signal.c           | 2 +-
 7 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d7d3cc24baf4..99081340d19a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -206,7 +206,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	 * special case only applies after poking regs and before the
 	 * very next return to user mode.
 	 */
-	current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
+	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
 #endif
 
 	user_enter_irqoff();
@@ -304,7 +304,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
 	unsigned int nr = (unsigned int)regs->orig_ax;
 
 #ifdef CONFIG_IA32_EMULATION
-	current->thread.status |= TS_COMPAT;
+	ti->status |= TS_COMPAT;
 #endif
 
 	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d3a67fba200a..99799fbd0f7e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -460,8 +460,6 @@ struct thread_struct {
 	unsigned short		gsindex;
 #endif
 
-	u32			status;		/* thread synchronous flags */
-
 #ifdef CONFIG_X86_64
 	unsigned long		fsbase;
 	unsigned long		gsbase;
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index e3c95e8e61c5..03eedc21246d 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
 	 * TS_COMPAT is set for 32-bit syscall entries and then
 	 * remains set until we return to user mode.
 	 */
-	if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
+	if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
 		/*
 		 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
 		 * and will match correctly in comparisons.
@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
 					 unsigned long *args)
 {
 # ifdef CONFIG_IA32_EMULATION
-	if (task->thread.status & TS_COMPAT)
+	if (task->thread_info.status & TS_COMPAT)
 		switch (i) {
 		case 0:
 			if (!n--) break;
@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
 					 const unsigned long *args)
 {
 # ifdef CONFIG_IA32_EMULATION
-	if (task->thread.status & TS_COMPAT)
+	if (task->thread_info.status & TS_COMPAT)
 		switch (i) {
 		case 0:
 			if (!n--) break;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 00223333821a..eda3b6823ca4 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -55,6 +55,7 @@ struct task_struct;
 
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
+	u32			status;		/* thread synchronous flags */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
@@ -221,7 +222,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #define in_ia32_syscall() true
 #else
 #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
-			   current->thread.status & TS_COMPAT)
+			   current_thread_info()->status & TS_COMPAT)
 #endif
 
 /*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c75466232016..9eb448c7859d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -557,7 +557,7 @@ static void __set_personality_x32(void)
 	 * Pretend to come from a x32 execve.
 	 */
 	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
-	current->thread.status &= ~TS_COMPAT;
+	current_thread_info()->status &= ~TS_COMPAT;
 #endif
 }
 
@@ -571,7 +571,7 @@ static void __set_personality_ia32(void)
 	current->personality |= force_personality32;
 	/* Prepare the first "return" to user space */
 	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
-	current->thread.status |= TS_COMPAT;
+	current_thread_info()->status |= TS_COMPAT;
 #endif
 }
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f37d18124648..ed5c4cdf0a34 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
 		 */
 		regs->orig_ax = value;
 		if (syscall_get_nr(child, regs) >= 0)
-			child->thread.status |= TS_I386_REGS_POKED;
+			child->thread_info.status |= TS_I386_REGS_POKED;
 		break;
 
 	case offsetof(struct user32, regs.eflags):
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b9e00e8f1c9b..4cdc0b27ec82 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
 	 * than the tracee.
 	 */
 #ifdef CONFIG_IA32_EMULATION
-	if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
+	if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
 		return __NR_ia32_restart_syscall;
 #endif
 #ifdef CONFIG_X86_X32_ABI
-- 
2.14.3

* [kernel-hardening] [PATCH 3/3] syscalls: Add a bit of documentation to __SYSCALL_DEFINE
From: Andy Lutomirski @ 2018-01-28 18:38 UTC
  To: x86, LKML
  Cc: Linus Torvalds, Kernel Hardening, Borislav Petkov, Andy Lutomirski

__SYSCALL_DEFINE is rather magical.  Add a bit of documentation.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 include/linux/syscalls.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a78186d826d7..d3f244a447c5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -207,6 +207,16 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
 	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 
 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
+
+/*
+ * For a syscall like long foo(void *a, long long b), this defines:
+ *
+ * static inline long SYSC_foo(void *a, long long b): the actual code
+ *
+ * asmlinkage long SyS_foo(long a, long long b): wrapper that calls SYSC_foo
+ *
+ * asmlinkage long sys_foo(void *a, long long b): alias of SyS_foo
+ */
 #define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
 		__attribute__((alias(__stringify(SyS##name))));		\
-- 
2.14.3

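To make the new comment concrete, here is a hedged sketch of what
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
expands to under these macros (simplified; the __MAP/__SC_* plumbing
and __PROTECT are elided):

/* simplified expansion sketch -- not literal preprocessor output */
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
        __attribute__((alias("SyS_ftruncate")));

static inline long SYSC_ftruncate(unsigned int fd, unsigned long length);

/* every argument arrives widened to a register-sized long ... */
asmlinkage long SyS_ftruncate(long fd, long length)
{
        /* ... and is cast back to its declared type for the real body */
        return SYSC_ftruncate((unsigned int)fd, (unsigned long)length);
}

static inline long SYSC_ftruncate(unsigned int fd, unsigned long length)
{
        /* actual syscall body goes here */
}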

* [kernel-hardening] Re: [PATCH 1/3] x86/entry/64: Remove the SYSCALL64 fast path
From: Ingo Molnar @ 2018-01-28 19:00 UTC
  To: Andy Lutomirski
  Cc: x86, LKML, Linus Torvalds, Kernel Hardening, Borislav Petkov


* Andy Lutomirski <luto@kernel.org> wrote:

> The SYSCALL64 fast path was a nice, if small, optimization back in
> the good old days when syscalls were actually reasonably fast.  Now
> we have PTI to slow everything down, and indirect branches are
> verboten, making everything messier.  The retpoline code in the fast
> path was particularly nasty.
> 
> Just get rid of the fast path.  The slow path is barely slower.
> 
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
> 
> This isn't quite identical to Linus' patch.  I cleaned up the
> SYSCALL64 entry code to use all pushes rather than pushing all but 6
> regs and moving the rest.

Hm, could we please have this in two parts, out of general paranoia?

One patch doing the easy fast path removal, the other doing the mov/push 
conversion?

Bisectability, reviewability and all that.

Exact same patch result.

Thanks,

	Ingo

* [kernel-hardening] Re: [PATCH 2/3] x86/asm: Move 'status' from thread_struct to thread_info
From: Ingo Molnar @ 2018-01-28 19:02 UTC
  To: Andy Lutomirski
  Cc: x86, LKML, Linus Torvalds, Kernel Hardening, Borislav Petkov


* Andy Lutomirski <luto@kernel.org> wrote:

> The TS_COMPAT bit is very hot and is accessed from code paths that
> mostly also touch thread_info::flags.  Move it into struct
> thread_info to improve cache locality.
> 
> The only reason it was in thread_struct is that there was a brief
> period during which we didn't allow arch-specific fields in struct
> thread_info.
> 
> Linus suggested further changing:
> 
>   ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
> 
> to:
> 
>   if (unlikely(ti->status & (TS_COMPAT|TS_I386_REGS_POKED)))
>           ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
> 
> on the theory that frequently dirtying the cacheline even in pure
> 64-bit code that never needs to modify status hurts performance.
> That could be a reasonable followup patch, but I suspect it matters
> less on top of this patch.
> 
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  arch/x86/entry/common.c            | 4 ++--
>  arch/x86/include/asm/processor.h   | 2 --
>  arch/x86/include/asm/syscall.h     | 6 +++---
>  arch/x86/include/asm/thread_info.h | 3 ++-
>  arch/x86/kernel/process_64.c       | 4 ++--
>  arch/x86/kernel/ptrace.c           | 2 +-
>  arch/x86/kernel/signal.c           | 2 +-
>  7 files changed, 11 insertions(+), 12 deletions(-)

Reviewed-by: Ingo Molnar <mingo@kernel.org>

Thanks,

	Ingo

* [kernel-hardening] Re: [PATCH 3/3] syscalls: Add a bit of documentation to __SYSCALL_DEFINE
From: Linus Torvalds @ 2018-01-28 19:15 UTC
  To: Andy Lutomirski
  Cc: the arch/x86 maintainers, LKML, Kernel Hardening, Borislav Petkov

On Sun, Jan 28, 2018 at 10:38 AM, Andy Lutomirski <luto@kernel.org> wrote:
> __SYSCALL_DEFINE is rather magical.  Add a bit of documentation.

Ack.

Is that "long long" part of the example on purpose? Because that's
likely the only really nasty part about any ptregs wrapper: some
arguments aren't _one_ register, they are two. And "long long" is the
simplest example, even though in practice the type is most often
"loff_t".

You won't see this on 64-bit architectures, but it's visible on 32-bit ones.

We may have to do wrappers for those, and error out for 'long long'.
We already do that for some cases, for compat system calls. See for
example

COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
                const struct compat_iovec __user *,vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        loff_t pos = ((loff_t)pos_high << 32) | pos_low;

        return do_compat_preadv64(fd, vec, vlen, pos, 0);
}

where we have the issue of a 64-bit value being split over two
registers even on 64-bit, due to it being a compat interface for 32
bit.

But if we pick up the values by hand from ptregs in a wrapper, we'll
have this issue even for native calls on 32-bit.

             Linus

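A minimal sketch of the native 32-bit case described above, assuming
hypothetical argN() accessors and a do_pread64() helper (none of these
names are real kernel API):

/*
 * 32-bit ptregs wrapper for pread64(fd, buf, count, pos):
 * the loff_t pos spans two 32-bit registers.
 */
asmlinkage long ptregs_sys_pread64(struct pt_regs *regs)
{
        unsigned int fd  = arg1(regs);
        char __user *buf = (char __user *)arg2(regs);
        size_t count     = arg3(regs);
        /* reassemble the 64-bit value from a lo/hi register pair */
        loff_t pos = ((loff_t)arg5(regs) << 32) | arg4(regs);

        return do_pread64(fd, buf, count, pos);
}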

* [kernel-hardening] Re: [PATCH 2/3] x86/asm: Move 'status' from thread_struct to thread_info
From: Linus Torvalds @ 2018-01-28 19:19 UTC
  To: Andy Lutomirski
  Cc: the arch/x86 maintainers, LKML, Kernel Hardening, Borislav Petkov

On Sun, Jan 28, 2018 at 10:38 AM, Andy Lutomirski <luto@kernel.org> wrote:
>
> Linus suggested further changing:
>
>   ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
>
> to:
>
>   if (unlikely(ti->status & (TS_COMPAT|TS_I386_REGS_POKED)))
>           ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
>
> on the theory that frequently dirtying the cacheline even in pure
> 64-bit code that never needs to modify status hurts performance.
> That could be a reasonable followup patch, but I suspect it matters
> less on top of this patch.

Ack, that should be done separately from the movement anyway.

And yes, it's possible that once it's in the same cacheline with the
thread flags, you can't even see the issue anyway. Although I *think*
all those early fields are normally mostly read-only, so that "read
before clear" may end up being a good idea regardless.

                Linus

* [kernel-hardening] Re: [PATCH 3/3] syscalls: Add a bit of documentation to __SYSCALL_DEFINE
From: Ingo Molnar @ 2018-01-28 19:21 UTC
  To: Andy Lutomirski
  Cc: x86, LKML, Linus Torvalds, Kernel Hardening, Borislav Petkov


* Andy Lutomirski <luto@kernel.org> wrote:

> __SYSCALL_DEFINE is rather magical.  Add a bit of documentation.
> 
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  include/linux/syscalls.h | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a78186d826d7..d3f244a447c5 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -207,6 +207,16 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
>  	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>  
>  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
> +
> +/*
> + * For a syscall like long foo(void *a, long long b), this defines:
> + *
> + * static inline long SYSC_foo(void *a, long long b): the actual code
> + *
> + * asmlinkage long SyS_foo(long a, long long b): wrapper that calls SYSC_foo
> + *
> + * asmlinkage long sys_foo(void *a, long long b): alias of SyS_foo
> + */

Nit, would it be more readable to write this as pseudo-code, i.e. something like:

/*
 * For a syscall like long foo(void *a, long long b), this defines:
 *
 *  static inline long SYSC_foo(void *a, long long b) { /* the actual code following */ }
 *
 *  asmlinkage long SyS_foo(long a, long long b) { /* wrapper that calls SYSC_foo() */ }
 *  asmlinkage long sys_foo(void *a, long long b); /* as GCC alias of SyS_foo() */
 */

Also, I wanted to suggest also documenting that in practice the three methods map 
to the very same function in the end, with the SYSC_ variant being eliminated due 
to inlining - but when double checking an x86-64 defconfig that does not appear to 
be so for all system calls:

 triton:~/tip> grep accept4 System.map 
 ffffffff8170d710 t SYSC_accept4
 ffffffff8170f940 T SyS_accept4
 ffffffff8170f940 T sys_accept4

While for others there's just 2:

 triton:~/tip> grep sched_getattr System.map 
 ffffffff8107b9a0 T SyS_sched_getattr
 ffffffff8107b9a0 T sys_sched_getattr

The only difference appears to be that accept4() is called internally within the 
kernel, for example by accept():

 SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
                 int __user *, upeer_addrlen)
 {
         return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
 }

But why does that result in SYSC_accept4() being a different symbol?

The difference between the two appears to be rather dumb as well:

 ffffffff8170f940 <SyS_accept4>:
 ffffffff8170f940:       e9 cb dd ff ff          jmpq   ffffffff8170d710 <SYSC_accept4>

Using GCC 7.2.0.

What am I missing?

Thanks,

	Ingo

* [kernel-hardening] Re: [PATCH 3/3] syscalls: Add a bit of documentation to __SYSCALL_DEFINE
From: Al Viro @ 2018-01-28 20:21 UTC
  To: Linus Torvalds
  Cc: Andy Lutomirski, the arch/x86 maintainers, LKML,
	Kernel Hardening, Borislav Petkov

On Sun, Jan 28, 2018 at 11:15:15AM -0800, Linus Torvalds wrote:
> Is that "long long" part of the example on purpose? Because that's
> likely the only really nasty part about any ptregs wrapper: some
> arguments aren't _one_ register, they are two. And "long long" is the
> simplest example, even though in practice the type is most often
> "loff_t".
> 
> You won't see this on 64-bit architectures, but it's visible on 32-bit ones.
> 
> We may have to do wrappers for those, and error out for 'long long'.
> We already do that for some cases, for compat system calls. See for
> example
> 
> COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
>                 const struct compat_iovec __user *,vec,
>                 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
> {
>         loff_t pos = ((loff_t)pos_high << 32) | pos_low;
> 
>         return do_compat_preadv64(fd, vec, vlen, pos, 0);
> }
> 
> where we have the issue of a 64-bit value being split over two
> registers even on 64-bit, due to it being a compat interface for 32
> bit.
> 
> But if we pick up the values by hand from ptregs in a wrapper, we'll
> have this issue even for native calls on 32-bit.

... and have more of that logics arch-dependent than one might expect;
it's *not* just "split each 64bit argument into a pair of 32bit ones,
combine in the body".  I tried to do something to reduce the amount of
remaining wrappers several years ago.  Trying to automate that gets
very ugly very fast - there are architectures like mips where you get
alignment requirements ($6/$7 can hold that, $5/$6 can't), creating
padding arguments, etc.

FWIW, that affects
	lookup_dcookie() (64,32,32)
	truncate64(), ftruncate64() (32,64)
	fadvise64_64(), sync_file_range() (32,64,64,32)
	readahead() (32,64,32)
	fadvise64() (32,64,32,32)
	fallocate(), sync_file_range2() (32,32,64,64)
	fanotify_mark() (32,32,64,32,32)
	pread64(), pwrite64() (32,32,32,64)

Giving each of those a compat wrapper of its own is already annoying
(looking at that again, we should at least unify pread64() and pwrite64()
compat wrappers - grep for sys_pread64 and you'll see), but giving
each an explicit wrapper for native ones?  Ouch.

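A hedged sketch of the per-syscall wrapper pattern being complained
about, using truncate64 (32,64) as the example; the helper name and the
lo/hi order are illustrative, and the order is exactly the part that
varies per architecture:

COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
                       u32, len_lo, u32, len_hi)
{
        /* glue the split 64-bit length back together */
        return do_sys_truncate(pathname, ((loff_t)len_hi << 32) | len_lo);
}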

* [kernel-hardening] Re: [PATCH 3/3] syscalls: Add a bit of documentation to __SYSCALL_DEFINE
From: Linus Torvalds @ 2018-01-28 20:42 UTC
  To: Al Viro
  Cc: Andy Lutomirski, the arch/x86 maintainers, LKML,
	Kernel Hardening, Borislav Petkov

On Sun, Jan 28, 2018 at 12:21 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> ... and have more of that logics arch-dependent than one might expect;
> it's *not* just "split each 64bit argument into a pair of 32bit ones,
> combine in the body".

Right. The paired registers tend to have special arch-specific rules,
making this particularly painful.

I suspect that the best solution (and here "best" doesn't mean
"pretty", but doable) is to rely on some of the limits we have:

In particular, we don't want to convert all architectures to the
ptregs approach _anyway_, so we need to have a model where the
architecture sets up some config option.

End result: we don't have to handle the generic case, we just have to
have a way for an architecture to set up for this. And for a _first_
implementation, we can simply just limit things to x86-64 (and then
maybe arm64) which don't even have this issue.

Longer-term, the other thing that helps us is that we have already
limited our calling convention to a maximum of six registers (NOTE!
Not six _arguments_).

So even when we want to handle the 32-bit case, the combinatorial
space is at least fairly limited, and we can make the compiler notice
them (ie the ptregs accessor case can simply be made to error out if
you try to access a 64-bit type from one single register).

End result: we will need the per-architecture macros that access
arguments from the 'struct ptregs', so we'll have some macro for
"argument1(ptregs)".

The 64-bit argument for 32-bit case would end up having to have a few
more of those architecture-specific oddities. So not just
"argument1(ptregs)", but "argument2_32_64(ptregs)" or something that
says "get me argument 2, when the first argument is 32-bit and I want
a 64-bit one".

So _if_ a 32-bit architecture wants this, such an architecture would
probably have to implement 10-20 of those odd special cases. They'll
be ugly and nasty and have little sanity to them, but they'd be simple
one-liners.

So hacky. But not necessarily even needed (I really don't think the
kind of people who care about paranoid security are going to run
32-bit kernels), and if people want to, not _complicated_, just
annoying.

                    Linus

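As an illustration of the accessors being sketched here -- all names
invented, x86-64 flavor first:

/* hypothetical per-arch accessors; x86-64 SYSCALL argument registers
 * (r10 stands in for rcx, which the SYSCALL instruction clobbers) */
#define argument1(regs) ((regs)->di)
#define argument2(regs) ((regs)->si)
#define argument3(regs) ((regs)->dx)
#define argument4(regs) ((regs)->r10)
#define argument5(regs) ((regs)->r8)
#define argument6(regs) ((regs)->r9)

/* 32-bit arch: 64-bit argument 2 following a 32-bit argument 1, built
 * from two adjacent 32-bit slots; a register-pairing arch would skip a
 * slot first, which is exactly the per-arch ugliness described above */
#define argument2_32_64(regs) \
        (((u64)argument3(regs) << 32) | (u32)argument2(regs))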

* [kernel-hardening] Re: [PATCH 3/3] syscalls: Add a bit of documentation to __SYSCALL_DEFINE
From: Al Viro @ 2018-01-28 22:50 UTC
  To: Linus Torvalds
  Cc: Andy Lutomirski, the arch/x86 maintainers, LKML,
	Kernel Hardening, Borislav Petkov

On Sun, Jan 28, 2018 at 12:42:24PM -0800, Linus Torvalds wrote:

> The 64-bit argument for 32-bit case would end up having to have a few
> more of those architecture-specific oddities. So not just
> "argument1(ptregs)", but "argument2_32_64(ptregs)" or something that
> says "get me argument 2, when the first argument is 32-bit and I want
> a 64-bit one".

Yeah, but...  You either get to give SYSCALL_DEFINE more than just
the number of arguments (SYSCALL_DEFINE_WDDW) or you need to go
for rather grotty macros to pick that information.  That's pretty
much what I'd tried; it hadn't been fun at all...

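For flavor only, one hypothetical reading of that SYSCALL_DEFINE_WDDW
naming (W = 32-bit word, D = 64-bit doubleword), sketched for a 32-bit
arch with no register-pairing constraints; nothing here is real kernel
API:

#define SYSCALL_DEFINE_WDDW(name, t1, a1, t2, a2, t3, a3, t4, a4)    \
asmlinkage long sys_##name(u32 a1, u32 a2##_lo, u32 a2##_hi,         \
                           u32 a3##_lo, u32 a3##_hi, u32 a4)         \
{                                                                    \
        /* glue the two halves of each 64-bit argument back */       \
        t2 a2 = ((t2)a2##_hi << 32) | a2##_lo;                       \
        t3 a3 = ((t3)a3##_hi << 32) | a3##_lo;                       \
        /* SYSC_##name is the typed body, as in __SYSCALL_DEFINEx */ \
        return SYSC_##name((t1)a1, a2, a3, (t4)a4);                  \
}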

* [kernel-hardening] Re: [PATCH 3/3] syscalls: Add a bit of documentation to __SYSCALL_DEFINE
From: Al Viro @ 2018-01-29  6:22 UTC
  To: Linus Torvalds
  Cc: Andy Lutomirski, the arch/x86 maintainers, LKML,
	Kernel Hardening, Borislav Petkov

On Sun, Jan 28, 2018 at 10:50:31PM +0000, Al Viro wrote:
> On Sun, Jan 28, 2018 at 12:42:24PM -0800, Linus Torvalds wrote:
> 
> > The 64-bit argument for 32-bit case would end up having to have a few
> > more of those architecture-specific oddities. So not just
> > "argument1(ptregs)", but "argument2_32_64(ptregs)" or something that
> > says "get me argument 2, when the first argument is 32-bit and I want
> > a 64-bit one".
> 
> Yeah, but...  You either get to give SYSCALL_DEFINE more than just
> the number of arguments (SYSCALL_DEFINE_WDDW) or you need to go
> for rather grotty macros to pick that information.  That's pretty
> much what I'd tried; it hadn't been fun at all...

FWIW, going through the notes from back then (with some personal
comments censored - parts of that were definitely in the actionable
territory):

------

* s390 aside, the headache comes from the combination of calling conventions
for 64bit arguments on 32bit and [speculation regarding libc maintainers
qualities]

* All architectures in question[1] treat syscall arguments as either
32bit (arith types <= 32bit, pointers) or 64bit.  Prototypical case
is f(int a1, int a2, ....); let L1, L2, ... be the sequence of locations
used to pass them (might be GPR, might be stack locations).  Anything
that doesn't involve long long uses the same sequence; in theory, we could
have different sets of registers for e.g. pointers and integers, but nothing
we care about seems to be doing that.

* wrt 64bit ints there are two variants: one is to simply treat them as a pair
of 32bit ones (i.e. take the next two elements of the sequence), another is
to skip an element and then use the next two.  Some architectures always
go for the first variant; x86, s390 and, surprisingly, sparc are that way.
arm, mips, parisc, ppc and tile go for the second variant when odd number
of 32bit values had been passed so far.

* argument passing for syscalls is something almost, but not entirely different.
First of all, we don't want to pass them on stack (no surprise); mips o32
ABI is trouble in that respect, everything else manages to use registers
(so do other mips ABI variants).  Next, we are generally limited to 6 words.
No worries, right?  We don't have syscalls with more than 6 arguments, and
ones with 64bit args still fit into 32*6 bits.  Too fucking bad - akpm has
introduced int sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
and then topped it with
long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
Note that this sucker already has 32*6 bits total *AND* 64bit argument in odd
position.  arm, mips and ppc folks were not amused (IIRC, rmk got downright
sarcastic at the time; not quite Patrician-level, what with the lack of
resources, but...)
That had been compounded by sync_file_range(2), with identical braind^WAPI.
The latter got a saner variant (sync_file_range2(2)) and newer architectures
should take that.  fadvise64_64(2) has not.  BTW, that had been a headache
for other 32bit architectures as well - at least c6x and metag are in the
same boat.  Different solutions with that one - some split those 64bit into
32bit on C level and repackage them into 64bit in stubs, some reorder the
arguments so that 64bit ones are at good offsets.

* for syscalls like pread64/pwrite64, the situation is less dire.  Some still
pass the misaligned 64bit arg as a pair of C-level 32bit ones, some accept
the padding.

* to make things even more interesting, libc (all of them) pass a bunch of
64bit args as explicit pairs.  Which creates no end of amusing situations -
will this argument of this syscall end up with LSB first?  Last?  According
to endianness?  Opposite?  Different rules for different architectures?
Onna stick.  Inna bun.  And that's cuttin' me own throat... [speculations
regarding various habits of libc maintainers]

* it might be possible to teach COMPAT_SYSCALL_DEFINE to insert padding and
combine the halves.  Cost: collecting the information about the number of
words passed so far; messy, but maybe I'm missing some clever solution.
However, that doesn't do a damn thing for libc-inflicted idiocies, so it
might or might not be worth it.  In some cases the mapping from what
libc is feeding to the kernel to actual long long passed from the glue
into C side of things is just too fucking irregular[2].

------

[1] i.e. 32bit side of biarch ones; I was interested in COMPAT_SYSCALL_DEFINE
back then.
[2] looking at that now, parisc has fanotify_mark() passing halves of 64bit
arg in the order opposite to that for truncate64().  On the same parisc.
From the same libc.  And then there's lookup_dcookie(), which might or
might not be correct there.

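For a concrete instance of the argument-reordering trick mentioned in
the notes above: arm, if memory serves, sidesteps the fadvise64_64
padding problem with a wrapper of roughly this shape (simplified):

/* moving the lone 32-bit 'advice' up front leaves both loff_t
 * arguments on even/odd register pairs, so no padding is needed */
asmlinkage long sys_arm_fadvise64_64(int fd, int advice,
                                     loff_t offset, loff_t len)
{
        return sys_fadvise64_64(fd, offset, len, advice);
}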

* [tip:x86/pti] x86/entry/64: Remove the SYSCALL64 fast path
From: tip-bot for Andy Lutomirski @ 2018-01-30 14:49 UTC
  To: linux-tip-commits
  Cc: mingo, luto, bp, hpa, torvalds, tglx, kernel-hardening, linux-kernel

Commit-ID:  21d375b6b34ff511a507de27bf316b3dde6938d9
Gitweb:     https://git.kernel.org/tip/21d375b6b34ff511a507de27bf316b3dde6938d9
Author:     Andy Lutomirski <luto@kernel.org>
AuthorDate: Sun, 28 Jan 2018 10:38:49 -0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Tue, 30 Jan 2018 15:30:36 +0100

x86/entry/64: Remove the SYSCALL64 fast path

The SYSCALL64 fast path was a nice, if small, optimization back in the good
old days when syscalls were actually reasonably fast.  Now there is PTI to
slow everything down, and indirect branches are verboten, making everything
messier.  The retpoline code in the fast path is particularly nasty.

Just get rid of the fast path. The slow path is barely slower.

[ tglx: Split out the 'push all extra regs' part ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Kernel Hardening <kernel-hardening@lists.openwall.com>
Link: https://lkml.kernel.org/r/462dff8d4d64dfbfc851fbf3130641809d980ecd.1517164461.git.luto@kernel.org

---
 arch/x86/entry/entry_64.S   | 117 --------------------------------------------
 arch/x86/entry/syscall_64.c |   7 +--
 2 files changed, 2 insertions(+), 122 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a835704..c46c755 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -241,86 +241,11 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 
 	TRACE_IRQS_OFF
 
-	/*
-	 * If we need to do entry work or if we guess we'll need to do
-	 * exit work, go straight to the slow path.
-	 */
-	movq	PER_CPU_VAR(current_task), %r11
-	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
-	jnz	entry_SYSCALL64_slow_path
-
-entry_SYSCALL_64_fastpath:
-	/*
-	 * Easy case: enable interrupts and issue the syscall.  If the syscall
-	 * needs pt_regs, we'll call a stub that disables interrupts again
-	 * and jumps to the slow path.
-	 */
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-#if __SYSCALL_MASK == ~0
-	cmpq	$__NR_syscall_max, %rax
-#else
-	andl	$__SYSCALL_MASK, %eax
-	cmpl	$__NR_syscall_max, %eax
-#endif
-	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
-	movq	%r10, %rcx
-
-	/*
-	 * This call instruction is handled specially in stub_ptregs_64.
-	 * It might end up jumping to the slow path.  If it jumps, RAX
-	 * and all argument registers are clobbered.
-	 */
-#ifdef CONFIG_RETPOLINE
-	movq	sys_call_table(, %rax, 8), %rax
-	call	__x86_indirect_thunk_rax
-#else
-	call	*sys_call_table(, %rax, 8)
-#endif
-.Lentry_SYSCALL_64_after_fastpath_call:
-
-	movq	%rax, RAX(%rsp)
-1:
-
-	/*
-	 * If we get here, then we know that pt_regs is clean for SYSRET64.
-	 * If we see that no exit work is required (which we are required
-	 * to check with IRQs off), then we can go straight to SYSRET64.
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	movq	PER_CPU_VAR(current_task), %r11
-	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
-	jnz	1f
-
-	LOCKDEP_SYS_EXIT
-	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
-	movq	RIP(%rsp), %rcx
-	movq	EFLAGS(%rsp), %r11
-	addq	$6*8, %rsp	/* skip extra regs -- they were preserved */
-	UNWIND_HINT_EMPTY
-	jmp	.Lpop_c_regs_except_rcx_r11_and_sysret
-
-1:
-	/*
-	 * The fast path looked good when we started, but something changed
-	 * along the way and we need to switch to the slow path.  Calling
-	 * raise(3) will trigger this, for example.  IRQs are off.
-	 */
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	SAVE_EXTRA_REGS
-	movq	%rsp, %rdi
-	call	syscall_return_slowpath	/* returns with IRQs disabled */
-	jmp	return_from_SYSCALL_64
-
-entry_SYSCALL64_slow_path:
 	/* IRQs are off. */
 	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
 	call	do_syscall_64		/* returns with IRQs disabled */
 
-return_from_SYSCALL_64:
 	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
 	/*
@@ -393,7 +318,6 @@ syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	UNWIND_HINT_EMPTY
 	POP_EXTRA_REGS
-.Lpop_c_regs_except_rcx_r11_and_sysret:
 	popq	%rsi	/* skip r11 */
 	popq	%r10
 	popq	%r9
@@ -424,47 +348,6 @@ syscall_return_via_sysret:
 	USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
-ENTRY(stub_ptregs_64)
-	/*
-	 * Syscalls marked as needing ptregs land here.
-	 * If we are on the fast path, we need to save the extra regs,
-	 * which we achieve by trying again on the slow path.  If we are on
-	 * the slow path, the extra regs are already saved.
-	 *
-	 * RAX stores a pointer to the C function implementing the syscall.
-	 * IRQs are on.
-	 */
-	cmpq	$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
-	jne	1f
-
-	/*
-	 * Called from fast path -- disable IRQs again, pop return address
-	 * and jump to slow path
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	popq	%rax
-	UNWIND_HINT_REGS extra=0
-	jmp	entry_SYSCALL64_slow_path
-
-1:
-	JMP_NOSPEC %rax				/* Called from C */
-END(stub_ptregs_64)
-
-.macro ptregs_stub func
-ENTRY(ptregs_\func)
-	UNWIND_HINT_FUNC
-	leaq	\func(%rip), %rax
-	jmp	stub_ptregs_64
-END(ptregs_\func)
-.endm
-
-/* Instantiate ptregs_stub for each ptregs-using syscall */
-#define __SYSCALL_64_QUAL_(sym)
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
-#include <asm/syscalls_64.h>
-
 /*
  * %rdi: prev task
  * %rsi: next task
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 9c09775..c176d2f 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -7,14 +7,11 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#define __SYSCALL_64_QUAL_(sym) sym
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
-
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
 
-#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
+#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
 
 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 

* [tip:x86/pti] x86/entry/64: Push extra regs right away
From: tip-bot for Andy Lutomirski @ 2018-01-30 14:50 UTC
  To: linux-tip-commits
  Cc: luto, hpa, torvalds, tglx, kernel-hardening, bp, mingo, linux-kernel

Commit-ID:  d1f7732009e0549eedf8ea1db948dc37be77fd46
Gitweb:     https://git.kernel.org/tip/d1f7732009e0549eedf8ea1db948dc37be77fd46
Author:     Andy Lutomirski <luto@kernel.org>
AuthorDate: Sun, 28 Jan 2018 10:38:49 -0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Tue, 30 Jan 2018 15:30:36 +0100

x86/entry/64: Push extra regs right away

With the fast path removed there is no point in splitting the push of the
normal and the extra register set. Just push the extra regs right away.

[ tglx: Split out from 'x86/entry/64: Remove the SYSCALL64 fast path' ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Kernel Hardening <kernel-hardening@lists.openwall.com>
Link: https://lkml.kernel.org/r/462dff8d4d64dfbfc851fbf3130641809d980ecd.1517164461.git.luto@kernel.org

---
 arch/x86/entry/entry_64.S | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index c46c755..c752abe 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -236,13 +236,17 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%r9				/* pt_regs->r9 */
 	pushq	%r10				/* pt_regs->r10 */
 	pushq	%r11				/* pt_regs->r11 */
-	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
-	UNWIND_HINT_REGS extra=0
+	pushq	%rbx				/* pt_regs->rbx */
+	pushq	%rbp				/* pt_regs->rbp */
+	pushq	%r12				/* pt_regs->r12 */
+	pushq	%r13				/* pt_regs->r13 */
+	pushq	%r14				/* pt_regs->r14 */
+	pushq	%r15				/* pt_regs->r15 */
+	UNWIND_HINT_REGS
 
 	TRACE_IRQS_OFF
 
 	/* IRQs are off. */
-	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
 	call	do_syscall_64		/* returns with IRQs disabled */
 

* [tip:x86/pti] x86/asm: Move 'status' from thread_struct to thread_info
From: tip-bot for Andy Lutomirski @ 2018-01-30 14:50 UTC
  To: linux-tip-commits
  Cc: mingo, hpa, linux-kernel, kernel-hardening, torvalds, tglx, bp, luto

Commit-ID:  37a8f7c38339b22b69876d6f5a0ab851565284e3
Gitweb:     https://git.kernel.org/tip/37a8f7c38339b22b69876d6f5a0ab851565284e3
Author:     Andy Lutomirski <luto@kernel.org>
AuthorDate: Sun, 28 Jan 2018 10:38:50 -0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Tue, 30 Jan 2018 15:30:36 +0100

x86/asm: Move 'status' from thread_struct to thread_info

The TS_COMPAT bit is very hot and is accessed from code paths that mostly
also touch thread_info::flags.  Move it into struct thread_info to improve
cache locality.

The only reason it was in thread_struct is that there was a brief period
during which arch-specific fields were not allowed in struct thread_info.

Linus suggested further changing:

  ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);

to:

  if (unlikely(ti->status & (TS_COMPAT|TS_I386_REGS_POKED)))
          ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);

on the theory that frequently dirtying the cacheline even in pure 64-bit
code that never needs to modify status hurts performance.  That could be a
reasonable followup patch, but I suspect it matters less on top of this
patch.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Kernel Hardening <kernel-hardening@lists.openwall.com>
Link: https://lkml.kernel.org/r/03148bcc1b217100e6e8ecf6a5468c45cf4304b6.1517164461.git.luto@kernel.org

---
 arch/x86/entry/common.c            | 4 ++--
 arch/x86/include/asm/processor.h   | 2 --
 arch/x86/include/asm/syscall.h     | 6 +++---
 arch/x86/include/asm/thread_info.h | 3 ++-
 arch/x86/kernel/process_64.c       | 4 ++--
 arch/x86/kernel/ptrace.c           | 2 +-
 arch/x86/kernel/signal.c           | 2 +-
 7 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d7d3cc2..9908134 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -206,7 +206,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	 * special case only applies after poking regs and before the
 	 * very next return to user mode.
 	 */
-	current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
+	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
 #endif
 
 	user_enter_irqoff();
@@ -304,7 +304,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
 	unsigned int nr = (unsigned int)regs->orig_ax;
 
 #ifdef CONFIG_IA32_EMULATION
-	current->thread.status |= TS_COMPAT;
+	ti->status |= TS_COMPAT;
 #endif
 
 	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index efbde08..513f960 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -460,8 +460,6 @@ struct thread_struct {
 	unsigned short		gsindex;
 #endif
 
-	u32			status;		/* thread synchronous flags */
-
 #ifdef CONFIG_X86_64
 	unsigned long		fsbase;
 	unsigned long		gsbase;
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index e3c95e8..03eedc2 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
 	 * TS_COMPAT is set for 32-bit syscall entries and then
 	 * remains set until we return to user mode.
 	 */
-	if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
+	if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
 		/*
 		 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
 		 * and will match correctly in comparisons.
@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
 					 unsigned long *args)
 {
 # ifdef CONFIG_IA32_EMULATION
-	if (task->thread.status & TS_COMPAT)
+	if (task->thread_info.status & TS_COMPAT)
 		switch (i) {
 		case 0:
 			if (!n--) break;
@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
 					 const unsigned long *args)
 {
 # ifdef CONFIG_IA32_EMULATION
-	if (task->thread.status & TS_COMPAT)
+	if (task->thread_info.status & TS_COMPAT)
 		switch (i) {
 		case 0:
 			if (!n--) break;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0022333..eda3b68 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -55,6 +55,7 @@ struct task_struct;
 
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
+	u32			status;		/* thread synchronous flags */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
@@ -221,7 +222,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #define in_ia32_syscall() true
 #else
 #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
-			   current->thread.status & TS_COMPAT)
+			   current_thread_info()->status & TS_COMPAT)
 #endif
 
 /*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c754662..9eb448c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -557,7 +557,7 @@ static void __set_personality_x32(void)
 	 * Pretend to come from a x32 execve.
 	 */
 	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
-	current->thread.status &= ~TS_COMPAT;
+	current_thread_info()->status &= ~TS_COMPAT;
 #endif
 }
 
@@ -571,7 +571,7 @@ static void __set_personality_ia32(void)
 	current->personality |= force_personality32;
 	/* Prepare the first "return" to user space */
 	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
-	current->thread.status |= TS_COMPAT;
+	current_thread_info()->status |= TS_COMPAT;
 #endif
 }
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f37d181..ed5c4cd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
 		 */
 		regs->orig_ax = value;
 		if (syscall_get_nr(child, regs) >= 0)
-			child->thread.status |= TS_I386_REGS_POKED;
+			child->thread_info.status |= TS_I386_REGS_POKED;
 		break;
 
 	case offsetof(struct user32, regs.eflags):
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b9e00e8..4cdc0b2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
 	 * than the tracee.
 	 */
 #ifdef CONFIG_IA32_EMULATION
-	if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
+	if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
 		return __NR_ia32_restart_syscall;
 #endif
 #ifdef CONFIG_X86_X32_ABI
