linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* x86/clearregs: Register sanitizing at kernel entry for speculation hygiene
@ 2018-01-10  1:03 Andi Kleen
  2018-01-10  1:03 ` [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call Andi Kleen
                   ` (8 more replies)
  0 siblings, 9 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos

This patch kit implements clearing of all unused registers on kernel entries,
including system calls and all exceptions and interrupts.

This doesn't fix any known issue, but will make it harder in general
to exploit the kernel with speculation because it will be harder
to get user controlled values into kernel code.

The patchkit is a bit more complicated because it attempts to clear
unused argument registers, which on 64bit requires knowing how
many arguments each system call has. I used some scripting
to derive the number of arguments of each system call from the
SYSCALL_DEFINE*s and add it to the x86 system call tables.

Everything else is relatively simple and straightforward,
and could be used independently.

I assume this mostly isn't 4.15 material, but should be considered for 4.16
Possibly some of the simpler patches could be considered for 4.15

Original patches were from Tim Chen, but changed significantly
by AK.

git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc.git spec/clearregs-1

v1: Initial post

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-10  2:46   ` Brian Gerst
                     ` (2 more replies)
  2018-01-10  1:03 ` [PATCH v1 2/8] x86/entry/clearregs: Add infrastructure to clear registers Andi Kleen
                   ` (7 subsequent siblings)
  8 siblings, 3 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Remove the partial stack frame in the 64bit syscall fast path.
In the next patch we want to clear the extra registers, which requires
always saving all registers. So remove the partial stack frame
in the syscall fast path and always save everything.

This actually simplifies the code because the ptregs stubs
are not needed anymore.

arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
arch/x86/entry/syscall_64.c |  2 +-

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
 arch/x86/entry/syscall_64.c |  2 +-
 2 files changed, 5 insertions(+), 54 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 58dbf7a12a05..bbdfbdd817d6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%r9				/* pt_regs->r9 */
 	pushq	%r10				/* pt_regs->r10 */
 	pushq	%r11				/* pt_regs->r11 */
-	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
+	sub	$(6*8), %rsp
+	SAVE_EXTRA_REGS
+
 	UNWIND_HINT_REGS extra=0
 
 	/*
@@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
 	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
 	movq	%r10, %rcx
 
-	/*
-	 * This call instruction is handled specially in stub_ptregs_64.
-	 * It might end up jumping to the slow path.  If it jumps, RAX
-	 * and all argument registers are clobbered.
-	 */
 #ifdef CONFIG_RETPOLINE
 	movq	sys_call_table(, %rax, 8), %rax
 	call	__x86_indirect_thunk_rax
@@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
 	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
-	addq	$6*8, %rsp	/* skip extra regs -- they were preserved */
-	UNWIND_HINT_EMPTY
-	jmp	.Lpop_c_regs_except_rcx_r11_and_sysret
+	jmp	syscall_return_via_sysret
 
 1:
 	/*
@@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
 	 */
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
-	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	jmp	return_from_SYSCALL_64
 
 entry_SYSCALL64_slow_path:
 	/* IRQs are off. */
-	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
 	call	do_syscall_64		/* returns with IRQs disabled */
 
@@ -389,7 +382,6 @@ syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	UNWIND_HINT_EMPTY
 	POP_EXTRA_REGS
-.Lpop_c_regs_except_rcx_r11_and_sysret:
 	popq	%rsi	/* skip r11 */
 	popq	%r10
 	popq	%r9
@@ -420,47 +412,6 @@ syscall_return_via_sysret:
 	USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
-ENTRY(stub_ptregs_64)
-	/*
-	 * Syscalls marked as needing ptregs land here.
-	 * If we are on the fast path, we need to save the extra regs,
-	 * which we achieve by trying again on the slow path.  If we are on
-	 * the slow path, the extra regs are already saved.
-	 *
-	 * RAX stores a pointer to the C function implementing the syscall.
-	 * IRQs are on.
-	 */
-	cmpq	$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
-	jne	1f
-
-	/*
-	 * Called from fast path -- disable IRQs again, pop return address
-	 * and jump to slow path
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	popq	%rax
-	UNWIND_HINT_REGS extra=0
-	jmp	entry_SYSCALL64_slow_path
-
-1:
-	JMP_NOSPEC %rax				/* Called from C */
-END(stub_ptregs_64)
-
-.macro ptregs_stub func
-ENTRY(ptregs_\func)
-	UNWIND_HINT_FUNC
-	leaq	\func(%rip), %rax
-	jmp	stub_ptregs_64
-END(ptregs_\func)
-.endm
-
-/* Instantiate ptregs_stub for each ptregs-using syscall */
-#define __SYSCALL_64_QUAL_(sym)
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
-#include <asm/syscalls_64.h>
-
 /*
  * %rdi: prev task
  * %rsi: next task
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 9c09775e589d..ad1ae014f943 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -8,7 +8,7 @@
 #include <asm/syscall.h>
 
 #define __SYSCALL_64_QUAL_(sym) sym
-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
+#define __SYSCALL_64_QUAL_ptregs(sym) sym
 
 #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 #include <asm/syscalls_64.h>
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v1 2/8] x86/entry/clearregs: Add infrastructure to clear registers
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
  2018-01-10  1:03 ` [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-11 19:58   ` Konrad Rzeszutek Wilk
  2018-01-10  1:03 ` [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL Andi Kleen
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Add 64bit assembler macros to clear registers on kernel entry.
They are used in follow-on patches.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/calling.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e00a6af..9444e7623185 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -172,6 +172,34 @@ For 32-bit we have the following conventions - kernel is built with
 	.byte 0xf1
 	.endm
 
+	.macro CLEAR_R11_TO_R15
+	xorq %r15, %r15
+	xorq %r14, %r14
+	xorq %r13, %r13
+	xorq %r12, %r12
+	xorq %r11, %r11
+	.endm
+
+	.macro CLEAR_R8_TO_R15
+	CLEAR_R11_TO_R15
+	xorq %r10, %r10
+	xorq %r9, %r9
+	xorq %r8, %r8
+	.endm
+
+	.macro CLEAR_ALL_REGS
+	CLEAR_R8_TO_R15
+	xorl %eax, %eax
+	xorl %ebx, %ebx
+	xorl %ecx, %ecx
+	xorl %edx, %edx
+	xorl %esi, %esi
+	xorl %edi, %edi
+#ifndef CONFIG_FRAME_POINTER
+	xorl %ebp, %ebp
+#endif
+	.endm
+
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
  * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
  2018-01-10  1:03 ` [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call Andi Kleen
  2018-01-10  1:03 ` [PATCH v1 2/8] x86/entry/clearregs: Add infrastructure to clear registers Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-11  3:35   ` Brian Gerst
  2018-01-12  3:45   ` Josh Poimboeuf
  2018-01-10  1:03 ` [PATCH v1 4/8] x86/entry/retpoline: Clear extra registers for compat syscalls Andi Kleen
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

We clear all the non argument registers for 64bit SYSCALLs
to minimize any risk of bad speculation using user values.

For now, unused argument registers still leak. This will be addressed
in later patches.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/entry_64.S | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bbdfbdd817d6..632081fd7086 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -236,6 +236,14 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%r11				/* pt_regs->r11 */
 	sub	$(6*8), %rsp
 	SAVE_EXTRA_REGS
+	/* Sanitize registers against speculation attacks */
+	/* r10 is cleared later, arguments are handled in san_args* */
+	CLEAR_R11_TO_R15
+#ifndef CONFIG_FRAME_POINTER
+	xor	%ebp, %ebp
+#endif
+	xor	%ebx, %ebx
+	xor	%ecx, %ecx
 
 	UNWIND_HINT_REGS extra=0
 
@@ -263,6 +271,7 @@ entry_SYSCALL_64_fastpath:
 #endif
 	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
 	movq	%r10, %rcx
+	xor	%r10, %r10
 
 #ifdef CONFIG_RETPOLINE
 	movq	sys_call_table(, %rax, 8), %rax
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v1 4/8] x86/entry/retpoline: Clear extra registers for compat syscalls
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
                   ` (2 preceding siblings ...)
  2018-01-10  1:03 ` [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-10  1:03 ` [PATCH v1 5/8] x86/entry/clearregs: Clear registers for 64bit exceptions/interrupts Andi Kleen
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Clear all registers for compat calls on 64bit kernels. All arguments
are initially passed through the stack, so this is fairly simple
without additional stubs.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/entry_64_compat.S | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 98d5358e4041..16fd2643a77f 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -95,6 +95,8 @@ ENTRY(entry_SYSENTER_compat)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 	cld
+	/* Can clear all because arguments are passed through the stack */
+	CLEAR_ALL_REGS
 
 	/*
 	 * SYSENTER doesn't filter flags, so we need to clear NT and AC
@@ -223,6 +225,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r13 = 0 */
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
+	/* Can clear all because arguments are passed through the stack */
+	CLEAR_ALL_REGS
 
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
@@ -348,6 +352,8 @@ ENTRY(entry_INT80_compat)
 	pushq   %r14                    /* pt_regs->r14 */
 	pushq   %r15                    /* pt_regs->r15 */
 	cld
+	/* Can clear all because arguments are passed through the stack */
+	CLEAR_ALL_REGS
 
 	/*
 	 * User mode is traced as though IRQs are on, and the interrupt
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v1 5/8] x86/entry/clearregs: Clear registers for 64bit exceptions/interrupts
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
                   ` (3 preceding siblings ...)
  2018-01-10  1:03 ` [PATCH v1 4/8] x86/entry/retpoline: Clear extra registers for compat syscalls Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-10  1:23   ` Andy Lutomirski
  2018-01-10  1:03 ` [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables Andi Kleen
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Clear all registers on entering the 64bit kernel for exceptions and
interrupts.

Since there are no arguments this is fairly simple.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/entry_64.S | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 632081fd7086..6ab4c2aaeabb 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -636,6 +636,7 @@ END(irq_entries_start)
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 
 	testb	$3, CS(%rsp)
@@ -1192,6 +1193,7 @@ ENTRY(xen_failsafe_callback)
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
 END(xen_failsafe_callback)
@@ -1237,6 +1239,7 @@ ENTRY(paranoid_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
@@ -1289,6 +1292,7 @@ ENTRY(error_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER 8
 	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
@@ -1487,6 +1491,7 @@ ENTRY(nmi)
 	pushq	%r14		/* pt_regs->r14 */
 	pushq	%r15		/* pt_regs->r15 */
 	UNWIND_HINT_REGS
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 
 	/*
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
                   ` (4 preceding siblings ...)
  2018-01-10  1:03 ` [PATCH v1 5/8] x86/entry/clearregs: Clear registers for 64bit exceptions/interrupts Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-10  1:26   ` Andy Lutomirski
  2018-01-10  1:03 ` [PATCH v1 7/8] x86/entry/clearregs: Add 64bit stubs to clear unused arguments regs Andi Kleen
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

In order to sanitize the system call arguments properly
we need to know the number of syscall arguments for each
syscall. Add a new column to the 32bit and 64bit syscall
tables to list the number of arguments.

Also fix the generation script to not confuse the number
with a compat entry.

Generated with some scripting and quick review (but more eyeballs
would be appreciated)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl | 726 ++++++++++++++++-----------------
 arch/x86/entry/syscalls/syscall_64.tbl | 708 ++++++++++++++++----------------
 arch/x86/entry/syscalls/syscalltbl.sh  |   7 +-
 3 files changed, 723 insertions(+), 718 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..c3a4480365dd 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -6,388 +6,388 @@
 #
 # The abi is always "i386" for this file.
 #
-0	i386	restart_syscall		sys_restart_syscall
-1	i386	exit			sys_exit
-2	i386	fork			sys_fork			sys_fork
-3	i386	read			sys_read
-4	i386	write			sys_write
-5	i386	open			sys_open			compat_sys_open
-6	i386	close			sys_close
-7	i386	waitpid			sys_waitpid			sys32_waitpid
-8	i386	creat			sys_creat
-9	i386	link			sys_link
-10	i386	unlink			sys_unlink
-11	i386	execve			sys_execve			compat_sys_execve
-12	i386	chdir			sys_chdir
-13	i386	time			sys_time			compat_sys_time
-14	i386	mknod			sys_mknod
-15	i386	chmod			sys_chmod
-16	i386	lchown			sys_lchown16
+0	i386	restart_syscall		sys_restart_syscall	0
+1	i386	exit			sys_exit	1
+2	i386	fork			sys_fork			sys_fork	0
+3	i386	read			sys_read	3
+4	i386	write			sys_write	3
+5	i386	open			sys_open			compat_sys_open	3
+6	i386	close			sys_close	1
+7	i386	waitpid			sys_waitpid			sys32_waitpid	3
+8	i386	creat			sys_creat	2
+9	i386	link			sys_link	2
+10	i386	unlink			sys_unlink	1
+11	i386	execve			sys_execve			compat_sys_execve	3
+12	i386	chdir			sys_chdir	1
+13	i386	time			sys_time			compat_sys_time	1
+14	i386	mknod			sys_mknod	3
+15	i386	chmod			sys_chmod	2
+16	i386	lchown			sys_lchown16	3
 17	i386	break
-18	i386	oldstat			sys_stat
-19	i386	lseek			sys_lseek			compat_sys_lseek
-20	i386	getpid			sys_getpid
-21	i386	mount			sys_mount			compat_sys_mount
-22	i386	umount			sys_oldumount
-23	i386	setuid			sys_setuid16
-24	i386	getuid			sys_getuid16
-25	i386	stime			sys_stime			compat_sys_stime
-26	i386	ptrace			sys_ptrace			compat_sys_ptrace
-27	i386	alarm			sys_alarm
-28	i386	oldfstat		sys_fstat
-29	i386	pause			sys_pause
-30	i386	utime			sys_utime			compat_sys_utime
+18	i386	oldstat			sys_stat	2
+19	i386	lseek			sys_lseek			compat_sys_lseek	3
+20	i386	getpid			sys_getpid	0
+21	i386	mount			sys_mount			compat_sys_mount	5
+22	i386	umount			sys_oldumount	2
+23	i386	setuid			sys_setuid16	1
+24	i386	getuid			sys_getuid16	0
+25	i386	stime			sys_stime			compat_sys_stime	1
+26	i386	ptrace			sys_ptrace			compat_sys_ptrace	4
+27	i386	alarm			sys_alarm	1
+28	i386	oldfstat		sys_fstat	2
+29	i386	pause			sys_pause	0
+30	i386	utime			sys_utime			compat_sys_utime	2
 31	i386	stty
 32	i386	gtty
-33	i386	access			sys_access
-34	i386	nice			sys_nice
+33	i386	access			sys_access	2
+34	i386	nice			sys_nice	1
 35	i386	ftime
-36	i386	sync			sys_sync
-37	i386	kill			sys_kill
-38	i386	rename			sys_rename
-39	i386	mkdir			sys_mkdir
-40	i386	rmdir			sys_rmdir
-41	i386	dup			sys_dup
-42	i386	pipe			sys_pipe
-43	i386	times			sys_times			compat_sys_times
+36	i386	sync			sys_sync	0
+37	i386	kill			sys_kill	2
+38	i386	rename			sys_rename	2
+39	i386	mkdir			sys_mkdir	2
+40	i386	rmdir			sys_rmdir	1
+41	i386	dup			sys_dup		1
+42	i386	pipe			sys_pipe	1
+43	i386	times			sys_times			compat_sys_times	1
 44	i386	prof
-45	i386	brk			sys_brk
-46	i386	setgid			sys_setgid16
-47	i386	getgid			sys_getgid16
-48	i386	signal			sys_signal
-49	i386	geteuid			sys_geteuid16
-50	i386	getegid			sys_getegid16
-51	i386	acct			sys_acct
-52	i386	umount2			sys_umount
+45	i386	brk			sys_brk	1
+46	i386	setgid			sys_setgid16	1
+47	i386	getgid			sys_getgid16	0
+48	i386	signal			sys_signal	2
+49	i386	geteuid			sys_geteuid16	0
+50	i386	getegid			sys_getegid16	0
+51	i386	acct			sys_acct	1
+52	i386	umount2			sys_umount	2
 53	i386	lock
-54	i386	ioctl			sys_ioctl			compat_sys_ioctl
-55	i386	fcntl			sys_fcntl			compat_sys_fcntl64
+54	i386	ioctl			sys_ioctl			compat_sys_ioctl	3
+55	i386	fcntl			sys_fcntl			compat_sys_fcntl64	3
 56	i386	mpx
-57	i386	setpgid			sys_setpgid
+57	i386	setpgid			sys_setpgid	2
 58	i386	ulimit
-59	i386	oldolduname		sys_olduname
-60	i386	umask			sys_umask
-61	i386	chroot			sys_chroot
-62	i386	ustat			sys_ustat			compat_sys_ustat
-63	i386	dup2			sys_dup2
-64	i386	getppid			sys_getppid
-65	i386	getpgrp			sys_getpgrp
-66	i386	setsid			sys_setsid
-67	i386	sigaction		sys_sigaction			compat_sys_sigaction
-68	i386	sgetmask		sys_sgetmask
-69	i386	ssetmask		sys_ssetmask
-70	i386	setreuid		sys_setreuid16
-71	i386	setregid		sys_setregid16
-72	i386	sigsuspend		sys_sigsuspend			sys_sigsuspend
-73	i386	sigpending		sys_sigpending			compat_sys_sigpending
-74	i386	sethostname		sys_sethostname
-75	i386	setrlimit		sys_setrlimit			compat_sys_setrlimit
-76	i386	getrlimit		sys_old_getrlimit		compat_sys_old_getrlimit
-77	i386	getrusage		sys_getrusage			compat_sys_getrusage
-78	i386	gettimeofday		sys_gettimeofday		compat_sys_gettimeofday
-79	i386	settimeofday		sys_settimeofday		compat_sys_settimeofday
-80	i386	getgroups		sys_getgroups16
-81	i386	setgroups		sys_setgroups16
-82	i386	select			sys_old_select			compat_sys_old_select
-83	i386	symlink			sys_symlink
-84	i386	oldlstat		sys_lstat
-85	i386	readlink		sys_readlink
-86	i386	uselib			sys_uselib
-87	i386	swapon			sys_swapon
-88	i386	reboot			sys_reboot
-89	i386	readdir			sys_old_readdir			compat_sys_old_readdir
-90	i386	mmap			sys_old_mmap			sys32_mmap
-91	i386	munmap			sys_munmap
-92	i386	truncate		sys_truncate			compat_sys_truncate
-93	i386	ftruncate		sys_ftruncate			compat_sys_ftruncate
-94	i386	fchmod			sys_fchmod
-95	i386	fchown			sys_fchown16
-96	i386	getpriority		sys_getpriority
-97	i386	setpriority		sys_setpriority
+59	i386	oldolduname		sys_olduname	1
+60	i386	umask			sys_umask	1
+61	i386	chroot			sys_chroot	1
+62	i386	ustat			sys_ustat			compat_sys_ustat	2
+63	i386	dup2			sys_dup2	2
+64	i386	getppid			sys_getppid	0
+65	i386	getpgrp			sys_getpgrp	0
+66	i386	setsid			sys_setsid	0
+67	i386	sigaction		sys_sigaction			compat_sys_sigaction	3
+68	i386	sgetmask		sys_sgetmask	0
+69	i386	ssetmask		sys_ssetmask	1
+70	i386	setreuid		sys_setreuid16	2
+71	i386	setregid		sys_setregid16	2
+72	i386	sigsuspend		sys_sigsuspend			sys_sigsuspend	3
+73	i386	sigpending		sys_sigpending			compat_sys_sigpending	1
+74	i386	sethostname		sys_sethostname	2
+75	i386	setrlimit		sys_setrlimit			compat_sys_setrlimit	2
+76	i386	getrlimit		sys_old_getrlimit		compat_sys_old_getrlimit	2
+77	i386	getrusage		sys_getrusage			compat_sys_getrusage	2
+78	i386	gettimeofday		sys_gettimeofday		compat_sys_gettimeofday	2
+79	i386	settimeofday		sys_settimeofday		compat_sys_settimeofday	2
+80	i386	getgroups		sys_getgroups16	2
+81	i386	setgroups		sys_setgroups16	2
+82	i386	select			sys_old_select			compat_sys_old_select	5
+83	i386	symlink			sys_symlink	2
+84	i386	oldlstat		sys_lstat	2
+85	i386	readlink		sys_readlink	3
+86	i386	uselib			sys_uselib	1
+87	i386	swapon			sys_swapon	2
+88	i386	reboot			sys_reboot	4
+89	i386	readdir			sys_old_readdir			compat_sys_old_readdir	3
+90	i386	mmap			sys_old_mmap			sys32_mmap	6
+91	i386	munmap			sys_munmap	2
+92	i386	truncate		sys_truncate			compat_sys_truncate	2
+93	i386	ftruncate		sys_ftruncate			compat_sys_ftruncate	2
+94	i386	fchmod			sys_fchmod	2
+95	i386	fchown			sys_fchown16	3
+96	i386	getpriority		sys_getpriority	2
+97	i386	setpriority		sys_setpriority	3
 98	i386	profil
-99	i386	statfs			sys_statfs			compat_sys_statfs
-100	i386	fstatfs			sys_fstatfs			compat_sys_fstatfs
-101	i386	ioperm			sys_ioperm
-102	i386	socketcall		sys_socketcall			compat_sys_socketcall
-103	i386	syslog			sys_syslog
-104	i386	setitimer		sys_setitimer			compat_sys_setitimer
-105	i386	getitimer		sys_getitimer			compat_sys_getitimer
-106	i386	stat			sys_newstat			compat_sys_newstat
-107	i386	lstat			sys_newlstat			compat_sys_newlstat
-108	i386	fstat			sys_newfstat			compat_sys_newfstat
-109	i386	olduname		sys_uname
-110	i386	iopl			sys_iopl
-111	i386	vhangup			sys_vhangup
+99	i386	statfs			sys_statfs			compat_sys_statfs	2
+100	i386	fstatfs			sys_fstatfs			compat_sys_fstatfs	2
+101	i386	ioperm			sys_ioperm	3
+102	i386	socketcall		sys_socketcall			compat_sys_socketcall	2
+103	i386	syslog			sys_syslog	3
+104	i386	setitimer		sys_setitimer			compat_sys_setitimer	3
+105	i386	getitimer		sys_getitimer			compat_sys_getitimer	2
+106	i386	stat			sys_newstat			compat_sys_newstat	2
+107	i386	lstat			sys_newlstat			compat_sys_newlstat	2
+108	i386	fstat			sys_newfstat			compat_sys_newfstat	2
+109	i386	olduname		sys_uname	1
+110	i386	iopl			sys_iopl	1
+111	i386	vhangup			sys_vhangup	0
 112	i386	idle
-113	i386	vm86old			sys_vm86old			sys_ni_syscall
-114	i386	wait4			sys_wait4			compat_sys_wait4
-115	i386	swapoff			sys_swapoff
-116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
-117	i386	ipc			sys_ipc				compat_sys_ipc
-118	i386	fsync			sys_fsync
-119	i386	sigreturn		sys_sigreturn			sys32_sigreturn
-120	i386	clone			sys_clone			stub32_clone
-121	i386	setdomainname		sys_setdomainname
-122	i386	uname			sys_newuname
-123	i386	modify_ldt		sys_modify_ldt
-124	i386	adjtimex		sys_adjtimex			compat_sys_adjtimex
-125	i386	mprotect		sys_mprotect
-126	i386	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
+113	i386	vm86old			sys_vm86old			sys_ni_syscall		1
+114	i386	wait4			sys_wait4			compat_sys_wait4	4
+115	i386	swapoff			sys_swapoff	1
+116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo	1
+117	i386	ipc			sys_ipc				compat_sys_ipc	6
+118	i386	fsync			sys_fsync	1
+119	i386	sigreturn		sys_sigreturn			sys32_sigreturn	0
+120	i386	clone			sys_clone			stub32_clone	6
+121	i386	setdomainname		sys_setdomainname	2
+122	i386	uname			sys_newuname	1
+123	i386	modify_ldt		sys_modify_ldt	3
+124	i386	adjtimex		sys_adjtimex			compat_sys_adjtimex	1
+125	i386	mprotect		sys_mprotect	3
+126	i386	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask	3
 127	i386	create_module
-128	i386	init_module		sys_init_module
-129	i386	delete_module		sys_delete_module
+128	i386	init_module		sys_init_module	3
+129	i386	delete_module		sys_delete_module	2
 130	i386	get_kernel_syms
-131	i386	quotactl		sys_quotactl			sys32_quotactl
-132	i386	getpgid			sys_getpgid
-133	i386	fchdir			sys_fchdir
-134	i386	bdflush			sys_bdflush
-135	i386	sysfs			sys_sysfs
-136	i386	personality		sys_personality
+131	i386	quotactl		sys_quotactl			sys32_quotactl	4
+132	i386	getpgid			sys_getpgid	1
+133	i386	fchdir			sys_fchdir	1
+134	i386	bdflush			sys_bdflush	2
+135	i386	sysfs			sys_sysfs	3
+136	i386	personality		sys_personality	1
 137	i386	afs_syscall
-138	i386	setfsuid		sys_setfsuid16
-139	i386	setfsgid		sys_setfsgid16
-140	i386	_llseek			sys_llseek
-141	i386	getdents		sys_getdents			compat_sys_getdents
-142	i386	_newselect		sys_select			compat_sys_select
-143	i386	flock			sys_flock
-144	i386	msync			sys_msync
-145	i386	readv			sys_readv			compat_sys_readv
-146	i386	writev			sys_writev			compat_sys_writev
-147	i386	getsid			sys_getsid
-148	i386	fdatasync		sys_fdatasync
-149	i386	_sysctl			sys_sysctl			compat_sys_sysctl
-150	i386	mlock			sys_mlock
-151	i386	munlock			sys_munlock
-152	i386	mlockall		sys_mlockall
-153	i386	munlockall		sys_munlockall
-154	i386	sched_setparam		sys_sched_setparam
-155	i386	sched_getparam		sys_sched_getparam
-156	i386	sched_setscheduler	sys_sched_setscheduler
-157	i386	sched_getscheduler	sys_sched_getscheduler
-158	i386	sched_yield		sys_sched_yield
-159	i386	sched_get_priority_max	sys_sched_get_priority_max
-160	i386	sched_get_priority_min	sys_sched_get_priority_min
-161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
-162	i386	nanosleep		sys_nanosleep			compat_sys_nanosleep
-163	i386	mremap			sys_mremap
-164	i386	setresuid		sys_setresuid16
-165	i386	getresuid		sys_getresuid16
-166	i386	vm86			sys_vm86			sys_ni_syscall
+138	i386	setfsuid		sys_setfsuid16	1
+139	i386	setfsgid		sys_setfsgid16	1
+140	i386	_llseek			sys_llseek	5
+141	i386	getdents		sys_getdents			compat_sys_getdents	3
+142	i386	_newselect		sys_select			compat_sys_select	5
+143	i386	flock			sys_flock	2
+144	i386	msync			sys_msync	3
+145	i386	readv			sys_readv			compat_sys_readv	3
+146	i386	writev			sys_writev			compat_sys_writev	3
+147	i386	getsid			sys_getsid	1
+148	i386	fdatasync		sys_fdatasync	1
+149	i386	_sysctl			sys_sysctl			compat_sys_sysctl	1
+150	i386	mlock			sys_mlock	2
+151	i386	munlock			sys_munlock	2
+152	i386	mlockall		sys_mlockall	1
+153	i386	munlockall		sys_munlockall	0
+154	i386	sched_setparam		sys_sched_setparam	2
+155	i386	sched_getparam		sys_sched_getparam	2
+156	i386	sched_setscheduler	sys_sched_setscheduler	3
+157	i386	sched_getscheduler	sys_sched_getscheduler	1
+158	i386	sched_yield		sys_sched_yield		0
+159	i386	sched_get_priority_max	sys_sched_get_priority_max	1
+160	i386	sched_get_priority_min	sys_sched_get_priority_min	1
+161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval	2
+162	i386	nanosleep		sys_nanosleep			compat_sys_nanosleep	2
+163	i386	mremap			sys_mremap	5
+164	i386	setresuid		sys_setresuid16	3
+165	i386	getresuid		sys_getresuid16	3
+166	i386	vm86			sys_vm86			sys_ni_syscall		2
 167	i386	query_module
-168	i386	poll			sys_poll
+168	i386	poll			sys_poll	3
 169	i386	nfsservctl
-170	i386	setresgid		sys_setresgid16
-171	i386	getresgid		sys_getresgid16
-172	i386	prctl			sys_prctl
-173	i386	rt_sigreturn		sys_rt_sigreturn		sys32_rt_sigreturn
-174	i386	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction
-175	i386	rt_sigprocmask		sys_rt_sigprocmask
-176	i386	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending
-177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
-178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
-179	i386	rt_sigsuspend		sys_rt_sigsuspend
-180	i386	pread64			sys_pread64			sys32_pread
-181	i386	pwrite64		sys_pwrite64			sys32_pwrite
-182	i386	chown			sys_chown16
-183	i386	getcwd			sys_getcwd
-184	i386	capget			sys_capget
-185	i386	capset			sys_capset
-186	i386	sigaltstack		sys_sigaltstack			compat_sys_sigaltstack
-187	i386	sendfile		sys_sendfile			compat_sys_sendfile
+170	i386	setresgid		sys_setresgid16	3
+171	i386	getresgid		sys_getresgid16	3
+172	i386	prctl			sys_prctl	5
+173	i386	rt_sigreturn		sys_rt_sigreturn		sys32_rt_sigreturn	0
+174	i386	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction	5
+175	i386	rt_sigprocmask		sys_rt_sigprocmask	4
+176	i386	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending	2
+177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait	4
+178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo	3
+179	i386	rt_sigsuspend		sys_rt_sigsuspend	2
+180	i386	pread64			sys_pread64			sys32_pread	4
+181	i386	pwrite64		sys_pwrite64			sys32_pwrite	4
+182	i386	chown			sys_chown16	3
+183	i386	getcwd			sys_getcwd	2
+184	i386	capget			sys_capget	2
+185	i386	capset			sys_capset	2
+186	i386	sigaltstack		sys_sigaltstack			compat_sys_sigaltstack	2
+187	i386	sendfile		sys_sendfile			compat_sys_sendfile	4
 188	i386	getpmsg
 189	i386	putpmsg
-190	i386	vfork			sys_vfork			sys_vfork
-191	i386	ugetrlimit		sys_getrlimit			compat_sys_getrlimit
-192	i386	mmap2			sys_mmap_pgoff
-193	i386	truncate64		sys_truncate64			sys32_truncate64
-194	i386	ftruncate64		sys_ftruncate64			sys32_ftruncate64
-195	i386	stat64			sys_stat64			sys32_stat64
-196	i386	lstat64			sys_lstat64			sys32_lstat64
-197	i386	fstat64			sys_fstat64			sys32_fstat64
-198	i386	lchown32		sys_lchown
-199	i386	getuid32		sys_getuid
-200	i386	getgid32		sys_getgid
-201	i386	geteuid32		sys_geteuid
-202	i386	getegid32		sys_getegid
-203	i386	setreuid32		sys_setreuid
-204	i386	setregid32		sys_setregid
-205	i386	getgroups32		sys_getgroups
-206	i386	setgroups32		sys_setgroups
-207	i386	fchown32		sys_fchown
-208	i386	setresuid32		sys_setresuid
-209	i386	getresuid32		sys_getresuid
-210	i386	setresgid32		sys_setresgid
-211	i386	getresgid32		sys_getresgid
-212	i386	chown32			sys_chown
-213	i386	setuid32		sys_setuid
-214	i386	setgid32		sys_setgid
-215	i386	setfsuid32		sys_setfsuid
-216	i386	setfsgid32		sys_setfsgid
-217	i386	pivot_root		sys_pivot_root
-218	i386	mincore			sys_mincore
-219	i386	madvise			sys_madvise
-220	i386	getdents64		sys_getdents64
-221	i386	fcntl64			sys_fcntl64			compat_sys_fcntl64
+190	i386	vfork			sys_vfork			sys_vfork	0
+191	i386	ugetrlimit		sys_getrlimit			compat_sys_getrlimit	2
+192	i386	mmap2			sys_mmap_pgoff	6
+193	i386	truncate64		sys_truncate64			sys32_truncate64	2
+194	i386	ftruncate64		sys_ftruncate64			sys32_ftruncate64	2
+195	i386	stat64			sys_stat64			sys32_stat64	2
+196	i386	lstat64			sys_lstat64			sys32_lstat64	2
+197	i386	fstat64			sys_fstat64			sys32_fstat64	2
+198	i386	lchown32		sys_lchown	3
+199	i386	getuid32		sys_getuid	0
+200	i386	getgid32		sys_getgid	0
+201	i386	geteuid32		sys_geteuid	0
+202	i386	getegid32		sys_getegid	0
+203	i386	setreuid32		sys_setreuid	2
+204	i386	setregid32		sys_setregid	2
+205	i386	getgroups32		sys_getgroups	2
+206	i386	setgroups32		sys_setgroups	2
+207	i386	fchown32		sys_fchown	3
+208	i386	setresuid32		sys_setresuid	3
+209	i386	getresuid32		sys_getresuid	3
+210	i386	setresgid32		sys_setresgid	3
+211	i386	getresgid32		sys_getresgid	3
+212	i386	chown32			sys_chown	3
+213	i386	setuid32		sys_setuid	1
+214	i386	setgid32		sys_setgid	1
+215	i386	setfsuid32		sys_setfsuid	1
+216	i386	setfsgid32		sys_setfsgid	1
+217	i386	pivot_root		sys_pivot_root	2
+218	i386	mincore			sys_mincore	3
+219	i386	madvise			sys_madvise	3
+220	i386	getdents64		sys_getdents64	3
+221	i386	fcntl64			sys_fcntl64			compat_sys_fcntl64	3
 # 222 is unused
 # 223 is unused
-224	i386	gettid			sys_gettid
-225	i386	readahead		sys_readahead			sys32_readahead
-226	i386	setxattr		sys_setxattr
-227	i386	lsetxattr		sys_lsetxattr
-228	i386	fsetxattr		sys_fsetxattr
-229	i386	getxattr		sys_getxattr
-230	i386	lgetxattr		sys_lgetxattr
-231	i386	fgetxattr		sys_fgetxattr
-232	i386	listxattr		sys_listxattr
-233	i386	llistxattr		sys_llistxattr
-234	i386	flistxattr		sys_flistxattr
-235	i386	removexattr		sys_removexattr
-236	i386	lremovexattr		sys_lremovexattr
-237	i386	fremovexattr		sys_fremovexattr
-238	i386	tkill			sys_tkill
-239	i386	sendfile64		sys_sendfile64
-240	i386	futex			sys_futex			compat_sys_futex
-241	i386	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
-242	i386	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
-243	i386	set_thread_area		sys_set_thread_area
-244	i386	get_thread_area		sys_get_thread_area
-245	i386	io_setup		sys_io_setup			compat_sys_io_setup
-246	i386	io_destroy		sys_io_destroy
-247	i386	io_getevents		sys_io_getevents		compat_sys_io_getevents
-248	i386	io_submit		sys_io_submit			compat_sys_io_submit
-249	i386	io_cancel		sys_io_cancel
-250	i386	fadvise64		sys_fadvise64			sys32_fadvise64
+224	i386	gettid			sys_gettid	0
+225	i386	readahead		sys_readahead			sys32_readahead	3
+226	i386	setxattr		sys_setxattr	5
+227	i386	lsetxattr		sys_lsetxattr	5
+228	i386	fsetxattr		sys_fsetxattr	5
+229	i386	getxattr		sys_getxattr	4
+230	i386	lgetxattr		sys_lgetxattr	4
+231	i386	fgetxattr		sys_fgetxattr	4
+232	i386	listxattr		sys_listxattr	3
+233	i386	llistxattr		sys_llistxattr	3
+234	i386	flistxattr		sys_flistxattr	3
+235	i386	removexattr		sys_removexattr	2
+236	i386	lremovexattr		sys_lremovexattr	2
+237	i386	fremovexattr		sys_fremovexattr	2
+238	i386	tkill			sys_tkill	2
+239	i386	sendfile64		sys_sendfile64	4
+240	i386	futex			sys_futex			compat_sys_futex	6
+241	i386	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity	3
+242	i386	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity	3
+243	i386	set_thread_area		sys_set_thread_area	1
+244	i386	get_thread_area		sys_get_thread_area	1
+245	i386	io_setup		sys_io_setup			compat_sys_io_setup	2
+246	i386	io_destroy		sys_io_destroy	1
+247	i386	io_getevents		sys_io_getevents		compat_sys_io_getevents	5
+248	i386	io_submit		sys_io_submit			compat_sys_io_submit	3
+249	i386	io_cancel		sys_io_cancel	3
+250	i386	fadvise64		sys_fadvise64			sys32_fadvise64	4
 # 251 is available for reuse (was briefly sys_set_zone_reclaim)
-252	i386	exit_group		sys_exit_group
-253	i386	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
-254	i386	epoll_create		sys_epoll_create
-255	i386	epoll_ctl		sys_epoll_ctl
-256	i386	epoll_wait		sys_epoll_wait
-257	i386	remap_file_pages	sys_remap_file_pages
-258	i386	set_tid_address		sys_set_tid_address
-259	i386	timer_create		sys_timer_create		compat_sys_timer_create
-260	i386	timer_settime		sys_timer_settime		compat_sys_timer_settime
-261	i386	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
-262	i386	timer_getoverrun	sys_timer_getoverrun
-263	i386	timer_delete		sys_timer_delete
-264	i386	clock_settime		sys_clock_settime		compat_sys_clock_settime
-265	i386	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
-266	i386	clock_getres		sys_clock_getres		compat_sys_clock_getres
-267	i386	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
-268	i386	statfs64		sys_statfs64			compat_sys_statfs64
-269	i386	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
-270	i386	tgkill			sys_tgkill
-271	i386	utimes			sys_utimes			compat_sys_utimes
-272	i386	fadvise64_64		sys_fadvise64_64		sys32_fadvise64_64
+252	i386	exit_group		sys_exit_group	1
+253	i386	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie	3
+254	i386	epoll_create		sys_epoll_create	1
+255	i386	epoll_ctl		sys_epoll_ctl	4
+256	i386	epoll_wait		sys_epoll_wait	4
+257	i386	remap_file_pages	sys_remap_file_pages	5
+258	i386	set_tid_address		sys_set_tid_address	1
+259	i386	timer_create		sys_timer_create		compat_sys_timer_create	3
+260	i386	timer_settime		sys_timer_settime		compat_sys_timer_settime	4
+261	i386	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime	2
+262	i386	timer_getoverrun	sys_timer_getoverrun	1
+263	i386	timer_delete		sys_timer_delete	1
+264	i386	clock_settime		sys_clock_settime		compat_sys_clock_settime	2
+265	i386	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime	2
+266	i386	clock_getres		sys_clock_getres		compat_sys_clock_getres	2
+267	i386	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep	4
+268	i386	statfs64		sys_statfs64			compat_sys_statfs64	3
+269	i386	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64	3
+270	i386	tgkill			sys_tgkill	3
+271	i386	utimes			sys_utimes			compat_sys_utimes	2
+272	i386	fadvise64_64		sys_fadvise64_64		sys32_fadvise64_64	4
 273	i386	vserver
-274	i386	mbind			sys_mbind
-275	i386	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
-276	i386	set_mempolicy		sys_set_mempolicy
-277	i386	mq_open			sys_mq_open			compat_sys_mq_open
-278	i386	mq_unlink		sys_mq_unlink
-279	i386	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
-280	i386	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
-281	i386	mq_notify		sys_mq_notify			compat_sys_mq_notify
-282	i386	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
-283	i386	kexec_load		sys_kexec_load			compat_sys_kexec_load
-284	i386	waitid			sys_waitid			compat_sys_waitid
+274	i386	mbind			sys_mbind	6
+275	i386	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy	5
+276	i386	set_mempolicy		sys_set_mempolicy	3
+277	i386	mq_open			sys_mq_open			compat_sys_mq_open	4
+278	i386	mq_unlink		sys_mq_unlink	1
+279	i386	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend	5
+280	i386	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive	5
+281	i386	mq_notify		sys_mq_notify			compat_sys_mq_notify	2
+282	i386	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr	3
+283	i386	kexec_load		sys_kexec_load			compat_sys_kexec_load	4
+284	i386	waitid			sys_waitid			compat_sys_waitid	5
 # 285 sys_setaltroot
-286	i386	add_key			sys_add_key
-287	i386	request_key		sys_request_key
-288	i386	keyctl			sys_keyctl			compat_sys_keyctl
-289	i386	ioprio_set		sys_ioprio_set
-290	i386	ioprio_get		sys_ioprio_get
-291	i386	inotify_init		sys_inotify_init
-292	i386	inotify_add_watch	sys_inotify_add_watch
-293	i386	inotify_rm_watch	sys_inotify_rm_watch
-294	i386	migrate_pages		sys_migrate_pages
-295	i386	openat			sys_openat			compat_sys_openat
-296	i386	mkdirat			sys_mkdirat
-297	i386	mknodat			sys_mknodat
-298	i386	fchownat		sys_fchownat
-299	i386	futimesat		sys_futimesat			compat_sys_futimesat
-300	i386	fstatat64		sys_fstatat64			sys32_fstatat
-301	i386	unlinkat		sys_unlinkat
-302	i386	renameat		sys_renameat
-303	i386	linkat			sys_linkat
-304	i386	symlinkat		sys_symlinkat
-305	i386	readlinkat		sys_readlinkat
-306	i386	fchmodat		sys_fchmodat
-307	i386	faccessat		sys_faccessat
-308	i386	pselect6		sys_pselect6			compat_sys_pselect6
-309	i386	ppoll			sys_ppoll			compat_sys_ppoll
-310	i386	unshare			sys_unshare
-311	i386	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
-312	i386	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
-313	i386	splice			sys_splice
-314	i386	sync_file_range		sys_sync_file_range		sys32_sync_file_range
-315	i386	tee			sys_tee
-316	i386	vmsplice		sys_vmsplice			compat_sys_vmsplice
-317	i386	move_pages		sys_move_pages			compat_sys_move_pages
-318	i386	getcpu			sys_getcpu
-319	i386	epoll_pwait		sys_epoll_pwait
-320	i386	utimensat		sys_utimensat			compat_sys_utimensat
-321	i386	signalfd		sys_signalfd			compat_sys_signalfd
-322	i386	timerfd_create		sys_timerfd_create
-323	i386	eventfd			sys_eventfd
-324	i386	fallocate		sys_fallocate			sys32_fallocate
-325	i386	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
-326	i386	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
-327	i386	signalfd4		sys_signalfd4			compat_sys_signalfd4
-328	i386	eventfd2		sys_eventfd2
-329	i386	epoll_create1		sys_epoll_create1
-330	i386	dup3			sys_dup3
-331	i386	pipe2			sys_pipe2
-332	i386	inotify_init1		sys_inotify_init1
-333	i386	preadv			sys_preadv			compat_sys_preadv
-334	i386	pwritev			sys_pwritev			compat_sys_pwritev
-335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
-336	i386	perf_event_open		sys_perf_event_open
-337	i386	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
-338	i386	fanotify_init		sys_fanotify_init
-339	i386	fanotify_mark		sys_fanotify_mark		compat_sys_fanotify_mark
-340	i386	prlimit64		sys_prlimit64
-341	i386	name_to_handle_at	sys_name_to_handle_at
-342	i386	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
-343	i386	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
-344	i386	syncfs			sys_syncfs
-345	i386	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
-346	i386	setns			sys_setns
-347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
-348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
-349	i386	kcmp			sys_kcmp
-350	i386	finit_module		sys_finit_module
-351	i386	sched_setattr		sys_sched_setattr
-352	i386	sched_getattr		sys_sched_getattr
-353	i386	renameat2		sys_renameat2
-354	i386	seccomp			sys_seccomp
-355	i386	getrandom		sys_getrandom
-356	i386	memfd_create		sys_memfd_create
-357	i386	bpf			sys_bpf
-358	i386	execveat		sys_execveat			compat_sys_execveat
-359	i386	socket			sys_socket
-360	i386	socketpair		sys_socketpair
-361	i386	bind			sys_bind
-362	i386	connect			sys_connect
-363	i386	listen			sys_listen
-364	i386	accept4			sys_accept4
-365	i386	getsockopt		sys_getsockopt			compat_sys_getsockopt
-366	i386	setsockopt		sys_setsockopt			compat_sys_setsockopt
-367	i386	getsockname		sys_getsockname
-368	i386	getpeername		sys_getpeername
-369	i386	sendto			sys_sendto
-370	i386	sendmsg			sys_sendmsg			compat_sys_sendmsg
-371	i386	recvfrom		sys_recvfrom			compat_sys_recvfrom
-372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg
-373	i386	shutdown		sys_shutdown
-374	i386	userfaultfd		sys_userfaultfd
-375	i386	membarrier		sys_membarrier
-376	i386	mlock2			sys_mlock2
-377	i386	copy_file_range		sys_copy_file_range
-378	i386	preadv2			sys_preadv2			compat_sys_preadv2
-379	i386	pwritev2		sys_pwritev2			compat_sys_pwritev2
-380	i386	pkey_mprotect		sys_pkey_mprotect
-381	i386	pkey_alloc		sys_pkey_alloc
-382	i386	pkey_free		sys_pkey_free
-383	i386	statx			sys_statx
-384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
+286	i386	add_key			sys_add_key	5
+287	i386	request_key		sys_request_key	4
+288	i386	keyctl			sys_keyctl			compat_sys_keyctl	5
+289	i386	ioprio_set		sys_ioprio_set	3
+290	i386	ioprio_get		sys_ioprio_get	2
+291	i386	inotify_init		sys_inotify_init	0
+292	i386	inotify_add_watch	sys_inotify_add_watch	3
+293	i386	inotify_rm_watch	sys_inotify_rm_watch	2
+294	i386	migrate_pages		sys_migrate_pages	4
+295	i386	openat			sys_openat			compat_sys_openat	4
+296	i386	mkdirat			sys_mkdirat	3
+297	i386	mknodat			sys_mknodat	4
+298	i386	fchownat		sys_fchownat	5
+299	i386	futimesat		sys_futimesat			compat_sys_futimesat	3
+300	i386	fstatat64		sys_fstatat64			sys32_fstatat	4
+301	i386	unlinkat		sys_unlinkat	3
+302	i386	renameat		sys_renameat	4
+303	i386	linkat			sys_linkat	5
+304	i386	symlinkat		sys_symlinkat	3
+305	i386	readlinkat		sys_readlinkat	4
+306	i386	fchmodat		sys_fchmodat	3
+307	i386	faccessat		sys_faccessat	3
+308	i386	pselect6		sys_pselect6			compat_sys_pselect6	6
+309	i386	ppoll			sys_ppoll			compat_sys_ppoll	5
+310	i386	unshare			sys_unshare	1
+311	i386	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list	2
+312	i386	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list	3
+313	i386	splice			sys_splice	6
+314	i386	sync_file_range		sys_sync_file_range		sys32_sync_file_range	4
+315	i386	tee			sys_tee	4
+316	i386	vmsplice		sys_vmsplice			compat_sys_vmsplice	4
+317	i386	move_pages		sys_move_pages			compat_sys_move_pages	6
+318	i386	getcpu			sys_getcpu	3
+319	i386	epoll_pwait		sys_epoll_pwait	6
+320	i386	utimensat		sys_utimensat			compat_sys_utimensat	4
+321	i386	signalfd		sys_signalfd			compat_sys_signalfd	3
+322	i386	timerfd_create		sys_timerfd_create	2
+323	i386	eventfd			sys_eventfd	1
+324	i386	fallocate		sys_fallocate			sys32_fallocate	4
+325	i386	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime	4
+326	i386	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime	2
+327	i386	signalfd4		sys_signalfd4			compat_sys_signalfd4	4
+328	i386	eventfd2		sys_eventfd2	2
+329	i386	epoll_create1		sys_epoll_create1	1
+330	i386	dup3			sys_dup3	3
+331	i386	pipe2			sys_pipe2	2
+332	i386	inotify_init1		sys_inotify_init1	1
+333	i386	preadv			sys_preadv			compat_sys_preadv	5
+334	i386	pwritev			sys_pwritev			compat_sys_pwritev	5
+335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo	4
+336	i386	perf_event_open		sys_perf_event_open	5
+337	i386	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg	5
+338	i386	fanotify_init		sys_fanotify_init	2
+339	i386	fanotify_mark		sys_fanotify_mark		compat_sys_fanotify_mark	5
+340	i386	prlimit64		sys_prlimit64	4
+341	i386	name_to_handle_at	sys_name_to_handle_at	5
+342	i386	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at	3
+343	i386	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime	2
+344	i386	syncfs			sys_syncfs	1
+345	i386	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg	4
+346	i386	setns			sys_setns	2
+347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv	6
+348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev	6
+349	i386	kcmp			sys_kcmp	5
+350	i386	finit_module		sys_finit_module	3
+351	i386	sched_setattr		sys_sched_setattr	3
+352	i386	sched_getattr		sys_sched_getattr	4
+353	i386	renameat2		sys_renameat2	5
+354	i386	seccomp			sys_seccomp	3
+355	i386	getrandom		sys_getrandom	3
+356	i386	memfd_create		sys_memfd_create	2
+357	i386	bpf			sys_bpf	3
+358	i386	execveat		sys_execveat			compat_sys_execveat	5
+359	i386	socket			sys_socket	3
+360	i386	socketpair		sys_socketpair	4
+361	i386	bind			sys_bind	3
+362	i386	connect			sys_connect	3
+363	i386	listen			sys_listen	2
+364	i386	accept4			sys_accept4	4
+365	i386	getsockopt		sys_getsockopt			compat_sys_getsockopt	5
+366	i386	setsockopt		sys_setsockopt			compat_sys_setsockopt	5
+367	i386	getsockname		sys_getsockname	3
+368	i386	getpeername		sys_getpeername	3
+369	i386	sendto			sys_sendto	6
+370	i386	sendmsg			sys_sendmsg			compat_sys_sendmsg	3
+371	i386	recvfrom		sys_recvfrom			compat_sys_recvfrom	6
+372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg	3
+373	i386	shutdown		sys_shutdown	2
+374	i386	userfaultfd		sys_userfaultfd	1
+375	i386	membarrier		sys_membarrier	2
+376	i386	mlock2			sys_mlock2	3
+377	i386	copy_file_range		sys_copy_file_range	6
+378	i386	preadv2			sys_preadv2			compat_sys_preadv2	6
+379	i386	pwritev2		sys_pwritev2			compat_sys_pwritev2	6
+380	i386	pkey_mprotect		sys_pkey_mprotect	4
+381	i386	pkey_alloc		sys_pkey_alloc	2
+382	i386	pkey_free		sys_pkey_free	1
+383	i386	statx			sys_statx	5
+384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl	2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..4783ba204b8f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -6,377 +6,377 @@
 #
 # The abi is "common", "64" or "x32" for this file.
 #
-0	common	read			sys_read
-1	common	write			sys_write
-2	common	open			sys_open
-3	common	close			sys_close
-4	common	stat			sys_newstat
-5	common	fstat			sys_newfstat
-6	common	lstat			sys_newlstat
-7	common	poll			sys_poll
-8	common	lseek			sys_lseek
-9	common	mmap			sys_mmap
-10	common	mprotect		sys_mprotect
-11	common	munmap			sys_munmap
-12	common	brk			sys_brk
-13	64	rt_sigaction		sys_rt_sigaction
-14	common	rt_sigprocmask		sys_rt_sigprocmask
-15	64	rt_sigreturn		sys_rt_sigreturn/ptregs
-16	64	ioctl			sys_ioctl
-17	common	pread64			sys_pread64
-18	common	pwrite64		sys_pwrite64
-19	64	readv			sys_readv
-20	64	writev			sys_writev
-21	common	access			sys_access
-22	common	pipe			sys_pipe
-23	common	select			sys_select
-24	common	sched_yield		sys_sched_yield
-25	common	mremap			sys_mremap
-26	common	msync			sys_msync
-27	common	mincore			sys_mincore
-28	common	madvise			sys_madvise
-29	common	shmget			sys_shmget
-30	common	shmat			sys_shmat
-31	common	shmctl			sys_shmctl
-32	common	dup			sys_dup
-33	common	dup2			sys_dup2
-34	common	pause			sys_pause
-35	common	nanosleep		sys_nanosleep
-36	common	getitimer		sys_getitimer
-37	common	alarm			sys_alarm
-38	common	setitimer		sys_setitimer
-39	common	getpid			sys_getpid
-40	common	sendfile		sys_sendfile64
-41	common	socket			sys_socket
-42	common	connect			sys_connect
-43	common	accept			sys_accept
-44	common	sendto			sys_sendto
-45	64	recvfrom		sys_recvfrom
-46	64	sendmsg			sys_sendmsg
-47	64	recvmsg			sys_recvmsg
-48	common	shutdown		sys_shutdown
-49	common	bind			sys_bind
-50	common	listen			sys_listen
-51	common	getsockname		sys_getsockname
-52	common	getpeername		sys_getpeername
-53	common	socketpair		sys_socketpair
-54	64	setsockopt		sys_setsockopt
-55	64	getsockopt		sys_getsockopt
-56	common	clone			sys_clone/ptregs
-57	common	fork			sys_fork/ptregs
-58	common	vfork			sys_vfork/ptregs
-59	64	execve			sys_execve/ptregs
-60	common	exit			sys_exit
-61	common	wait4			sys_wait4
-62	common	kill			sys_kill
-63	common	uname			sys_newuname
-64	common	semget			sys_semget
-65	common	semop			sys_semop
-66	common	semctl			sys_semctl
-67	common	shmdt			sys_shmdt
-68	common	msgget			sys_msgget
-69	common	msgsnd			sys_msgsnd
-70	common	msgrcv			sys_msgrcv
-71	common	msgctl			sys_msgctl
-72	common	fcntl			sys_fcntl
-73	common	flock			sys_flock
-74	common	fsync			sys_fsync
-75	common	fdatasync		sys_fdatasync
-76	common	truncate		sys_truncate
-77	common	ftruncate		sys_ftruncate
-78	common	getdents		sys_getdents
-79	common	getcwd			sys_getcwd
-80	common	chdir			sys_chdir
-81	common	fchdir			sys_fchdir
-82	common	rename			sys_rename
-83	common	mkdir			sys_mkdir
-84	common	rmdir			sys_rmdir
-85	common	creat			sys_creat
-86	common	link			sys_link
-87	common	unlink			sys_unlink
-88	common	symlink			sys_symlink
-89	common	readlink		sys_readlink
-90	common	chmod			sys_chmod
-91	common	fchmod			sys_fchmod
-92	common	chown			sys_chown
-93	common	fchown			sys_fchown
-94	common	lchown			sys_lchown
-95	common	umask			sys_umask
-96	common	gettimeofday		sys_gettimeofday
-97	common	getrlimit		sys_getrlimit
-98	common	getrusage		sys_getrusage
-99	common	sysinfo			sys_sysinfo
-100	common	times			sys_times
-101	64	ptrace			sys_ptrace
-102	common	getuid			sys_getuid
-103	common	syslog			sys_syslog
-104	common	getgid			sys_getgid
-105	common	setuid			sys_setuid
-106	common	setgid			sys_setgid
-107	common	geteuid			sys_geteuid
-108	common	getegid			sys_getegid
-109	common	setpgid			sys_setpgid
-110	common	getppid			sys_getppid
-111	common	getpgrp			sys_getpgrp
-112	common	setsid			sys_setsid
-113	common	setreuid		sys_setreuid
-114	common	setregid		sys_setregid
-115	common	getgroups		sys_getgroups
-116	common	setgroups		sys_setgroups
-117	common	setresuid		sys_setresuid
-118	common	getresuid		sys_getresuid
-119	common	setresgid		sys_setresgid
-120	common	getresgid		sys_getresgid
-121	common	getpgid			sys_getpgid
-122	common	setfsuid		sys_setfsuid
-123	common	setfsgid		sys_setfsgid
-124	common	getsid			sys_getsid
-125	common	capget			sys_capget
-126	common	capset			sys_capset
-127	64	rt_sigpending		sys_rt_sigpending
-128	64	rt_sigtimedwait		sys_rt_sigtimedwait
-129	64	rt_sigqueueinfo		sys_rt_sigqueueinfo
-130	common	rt_sigsuspend		sys_rt_sigsuspend
-131	64	sigaltstack		sys_sigaltstack
-132	common	utime			sys_utime
-133	common	mknod			sys_mknod
+0	common	read			sys_read	3
+1	common	write			sys_write	3
+2	common	open			sys_open	3
+3	common	close			sys_close	1
+4	common	stat			sys_newstat	2
+5	common	fstat			sys_newfstat	2
+6	common	lstat			sys_newlstat	2
+7	common	poll			sys_poll	3
+8	common	lseek			sys_lseek	3
+9	common	mmap			sys_mmap	6
+10	common	mprotect		sys_mprotect	3
+11	common	munmap			sys_munmap	2
+12	common	brk			sys_brk		1
+13	64	rt_sigaction		sys_rt_sigaction	5
+14	common	rt_sigprocmask		sys_rt_sigprocmask	4
+15	64	rt_sigreturn		sys_rt_sigreturn/ptregs	0
+16	64	ioctl			sys_ioctl	3
+17	common	pread64			sys_pread64	4
+18	common	pwrite64		sys_pwrite64	4
+19	64	readv			sys_readv	3
+20	64	writev			sys_writev	3
+21	common	access			sys_access	2
+22	common	pipe			sys_pipe	1
+23	common	select			sys_select	5
+24	common	sched_yield		sys_sched_yield	0
+25	common	mremap			sys_mremap	5
+26	common	msync			sys_msync	3
+27	common	mincore			sys_mincore	3
+28	common	madvise			sys_madvise	3
+29	common	shmget			sys_shmget	3
+30	common	shmat			sys_shmat	3
+31	common	shmctl			sys_shmctl	3
+32	common	dup			sys_dup		1
+33	common	dup2			sys_dup2	2
+34	common	pause			sys_pause	0
+35	common	nanosleep		sys_nanosleep	2
+36	common	getitimer		sys_getitimer	2
+37	common	alarm			sys_alarm	1
+38	common	setitimer		sys_setitimer	3
+39	common	getpid			sys_getpid	0
+40	common	sendfile		sys_sendfile64	4
+41	common	socket			sys_socket	3
+42	common	connect			sys_connect	3
+43	common	accept			sys_accept	3
+44	common	sendto			sys_sendto	6
+45	64	recvfrom		sys_recvfrom	6
+46	64	sendmsg			sys_sendmsg	3
+47	64	recvmsg			sys_recvmsg	3
+48	common	shutdown		sys_shutdown	2
+49	common	bind			sys_bind	3
+50	common	listen			sys_listen	2
+51	common	getsockname		sys_getsockname	3
+52	common	getpeername		sys_getpeername	3
+53	common	socketpair		sys_socketpair	4
+54	64	setsockopt		sys_setsockopt	5
+55	64	getsockopt		sys_getsockopt	5
+56	common	clone			sys_clone/ptregs	6
+57	common	fork			sys_fork/ptregs		0
+58	common	vfork			sys_vfork/ptregs	0
+59	64	execve			sys_execve/ptregs	3
+60	common	exit			sys_exit	1
+61	common	wait4			sys_wait4	4
+62	common	kill			sys_kill	2
+63	common	uname			sys_newuname	1
+64	common	semget			sys_semget	3
+65	common	semop			sys_semop	3
+66	common	semctl			sys_semctl	4
+67	common	shmdt			sys_shmdt	1
+68	common	msgget			sys_msgget	2
+69	common	msgsnd			sys_msgsnd	4
+70	common	msgrcv			sys_msgrcv	5
+71	common	msgctl			sys_msgctl	3
+72	common	fcntl			sys_fcntl	3
+73	common	flock			sys_flock	2
+74	common	fsync			sys_fsync	1
+75	common	fdatasync		sys_fdatasync	1
+76	common	truncate		sys_truncate	2
+77	common	ftruncate		sys_ftruncate	2
+78	common	getdents		sys_getdents	3
+79	common	getcwd			sys_getcwd	2
+80	common	chdir			sys_chdir	1
+81	common	fchdir			sys_fchdir	1
+82	common	rename			sys_rename	2
+83	common	mkdir			sys_mkdir	2
+84	common	rmdir			sys_rmdir	1
+85	common	creat			sys_creat	2
+86	common	link			sys_link	2
+87	common	unlink			sys_unlink	1
+88	common	symlink			sys_symlink	2
+89	common	readlink		sys_readlink	3
+90	common	chmod			sys_chmod	2
+91	common	fchmod			sys_fchmod	2
+92	common	chown			sys_chown	3
+93	common	fchown			sys_fchown	3
+94	common	lchown			sys_lchown	3
+95	common	umask			sys_umask	1
+96	common	gettimeofday		sys_gettimeofday	2
+97	common	getrlimit		sys_getrlimit	2
+98	common	getrusage		sys_getrusage	2
+99	common	sysinfo			sys_sysinfo	1
+100	common	times			sys_times	1
+101	64	ptrace			sys_ptrace	4
+102	common	getuid			sys_getuid	0
+103	common	syslog			sys_syslog	3
+104	common	getgid			sys_getgid	0
+105	common	setuid			sys_setuid	1
+106	common	setgid			sys_setgid	1
+107	common	geteuid			sys_geteuid	0
+108	common	getegid			sys_getegid	0
+109	common	setpgid			sys_setpgid	2
+110	common	getppid			sys_getppid	0
+111	common	getpgrp			sys_getpgrp	0
+112	common	setsid			sys_setsid	0
+113	common	setreuid		sys_setreuid	2
+114	common	setregid		sys_setregid	2
+115	common	getgroups		sys_getgroups	2
+116	common	setgroups		sys_setgroups	2
+117	common	setresuid		sys_setresuid	3
+118	common	getresuid		sys_getresuid	3
+119	common	setresgid		sys_setresgid	3
+120	common	getresgid		sys_getresgid	3
+121	common	getpgid			sys_getpgid	1
+122	common	setfsuid		sys_setfsuid	1
+123	common	setfsgid		sys_setfsgid	1
+124	common	getsid			sys_getsid	1
+125	common	capget			sys_capget	2
+126	common	capset			sys_capset	2
+127	64	rt_sigpending		sys_rt_sigpending	2
+128	64	rt_sigtimedwait		sys_rt_sigtimedwait	4
+129	64	rt_sigqueueinfo		sys_rt_sigqueueinfo	3
+130	common	rt_sigsuspend		sys_rt_sigsuspend	2
+131	64	sigaltstack		sys_sigaltstack	2
+132	common	utime			sys_utime	2
+133	common	mknod			sys_mknod	3
 134	64	uselib
-135	common	personality		sys_personality
-136	common	ustat			sys_ustat
-137	common	statfs			sys_statfs
-138	common	fstatfs			sys_fstatfs
-139	common	sysfs			sys_sysfs
-140	common	getpriority		sys_getpriority
-141	common	setpriority		sys_setpriority
-142	common	sched_setparam		sys_sched_setparam
-143	common	sched_getparam		sys_sched_getparam
-144	common	sched_setscheduler	sys_sched_setscheduler
-145	common	sched_getscheduler	sys_sched_getscheduler
-146	common	sched_get_priority_max	sys_sched_get_priority_max
-147	common	sched_get_priority_min	sys_sched_get_priority_min
-148	common	sched_rr_get_interval	sys_sched_rr_get_interval
-149	common	mlock			sys_mlock
-150	common	munlock			sys_munlock
-151	common	mlockall		sys_mlockall
-152	common	munlockall		sys_munlockall
-153	common	vhangup			sys_vhangup
-154	common	modify_ldt		sys_modify_ldt
-155	common	pivot_root		sys_pivot_root
-156	64	_sysctl			sys_sysctl
-157	common	prctl			sys_prctl
-158	common	arch_prctl		sys_arch_prctl
-159	common	adjtimex		sys_adjtimex
-160	common	setrlimit		sys_setrlimit
-161	common	chroot			sys_chroot
-162	common	sync			sys_sync
-163	common	acct			sys_acct
-164	common	settimeofday		sys_settimeofday
-165	common	mount			sys_mount
-166	common	umount2			sys_umount
-167	common	swapon			sys_swapon
-168	common	swapoff			sys_swapoff
-169	common	reboot			sys_reboot
-170	common	sethostname		sys_sethostname
-171	common	setdomainname		sys_setdomainname
-172	common	iopl			sys_iopl/ptregs
-173	common	ioperm			sys_ioperm
+135	common	personality		sys_personality	1
+136	common	ustat			sys_ustat	2
+137	common	statfs			sys_statfs	2
+138	common	fstatfs			sys_fstatfs	2
+139	common	sysfs			sys_sysfs	3
+140	common	getpriority		sys_getpriority	2
+141	common	setpriority		sys_setpriority	3
+142	common	sched_setparam		sys_sched_setparam	2
+143	common	sched_getparam		sys_sched_getparam	2
+144	common	sched_setscheduler	sys_sched_setscheduler	3
+145	common	sched_getscheduler	sys_sched_getscheduler	1
+146	common	sched_get_priority_max	sys_sched_get_priority_max	1
+147	common	sched_get_priority_min	sys_sched_get_priority_min	1
+148	common	sched_rr_get_interval	sys_sched_rr_get_interval	2
+149	common	mlock			sys_mlock	2
+150	common	munlock			sys_munlock	2
+151	common	mlockall		sys_mlockall	1
+152	common	munlockall		sys_munlockall	0
+153	common	vhangup			sys_vhangup	0
+154	common	modify_ldt		sys_modify_ldt	3
+155	common	pivot_root		sys_pivot_root	2
+156	64	_sysctl			sys_sysctl	1
+157	common	prctl			sys_prctl	5
+158	common	arch_prctl		sys_arch_prctl	2
+159	common	adjtimex		sys_adjtimex	1
+160	common	setrlimit		sys_setrlimit	2
+161	common	chroot			sys_chroot	1
+162	common	sync			sys_sync	0
+163	common	acct			sys_acct	1
+164	common	settimeofday		sys_settimeofday	2
+165	common	mount			sys_mount	5
+166	common	umount2			sys_umount	2
+167	common	swapon			sys_swapon	2
+168	common	swapoff			sys_swapoff	1
+169	common	reboot			sys_reboot	4
+170	common	sethostname		sys_sethostname	2
+171	common	setdomainname		sys_setdomainname	2
+172	common	iopl			sys_iopl/ptregs	1
+173	common	ioperm			sys_ioperm	3
 174	64	create_module
-175	common	init_module		sys_init_module
-176	common	delete_module		sys_delete_module
+175	common	init_module		sys_init_module	3
+176	common	delete_module		sys_delete_module	2
 177	64	get_kernel_syms
 178	64	query_module
-179	common	quotactl		sys_quotactl
+179	common	quotactl		sys_quotactl	4
 180	64	nfsservctl
 181	common	getpmsg
 182	common	putpmsg
 183	common	afs_syscall
 184	common	tuxcall
 185	common	security
-186	common	gettid			sys_gettid
-187	common	readahead		sys_readahead
-188	common	setxattr		sys_setxattr
-189	common	lsetxattr		sys_lsetxattr
-190	common	fsetxattr		sys_fsetxattr
-191	common	getxattr		sys_getxattr
-192	common	lgetxattr		sys_lgetxattr
-193	common	fgetxattr		sys_fgetxattr
-194	common	listxattr		sys_listxattr
-195	common	llistxattr		sys_llistxattr
-196	common	flistxattr		sys_flistxattr
-197	common	removexattr		sys_removexattr
-198	common	lremovexattr		sys_lremovexattr
-199	common	fremovexattr		sys_fremovexattr
-200	common	tkill			sys_tkill
-201	common	time			sys_time
-202	common	futex			sys_futex
-203	common	sched_setaffinity	sys_sched_setaffinity
-204	common	sched_getaffinity	sys_sched_getaffinity
+186	common	gettid			sys_gettid	0
+187	common	readahead		sys_readahead	3
+188	common	setxattr		sys_setxattr	5
+189	common	lsetxattr		sys_lsetxattr	5
+190	common	fsetxattr		sys_fsetxattr	5
+191	common	getxattr		sys_getxattr	4
+192	common	lgetxattr		sys_lgetxattr	4
+193	common	fgetxattr		sys_fgetxattr	4
+194	common	listxattr		sys_listxattr	3
+195	common	llistxattr		sys_llistxattr	3
+196	common	flistxattr		sys_flistxattr	3
+197	common	removexattr		sys_removexattr	2
+198	common	lremovexattr		sys_lremovexattr	2
+199	common	fremovexattr		sys_fremovexattr	2
+200	common	tkill			sys_tkill	2
+201	common	time			sys_time	1
+202	common	futex			sys_futex	6
+203	common	sched_setaffinity	sys_sched_setaffinity	3
+204	common	sched_getaffinity	sys_sched_getaffinity	3
 205	64	set_thread_area
-206	64	io_setup		sys_io_setup
-207	common	io_destroy		sys_io_destroy
-208	common	io_getevents		sys_io_getevents
-209	64	io_submit		sys_io_submit
-210	common	io_cancel		sys_io_cancel
+206	64	io_setup		sys_io_setup	2
+207	common	io_destroy		sys_io_destroy	1
+208	common	io_getevents		sys_io_getevents	5
+209	64	io_submit		sys_io_submit	3
+210	common	io_cancel		sys_io_cancel	3
 211	64	get_thread_area
-212	common	lookup_dcookie		sys_lookup_dcookie
-213	common	epoll_create		sys_epoll_create
+212	common	lookup_dcookie		sys_lookup_dcookie	3
+213	common	epoll_create		sys_epoll_create	1
 214	64	epoll_ctl_old
 215	64	epoll_wait_old
-216	common	remap_file_pages	sys_remap_file_pages
-217	common	getdents64		sys_getdents64
-218	common	set_tid_address		sys_set_tid_address
-219	common	restart_syscall		sys_restart_syscall
-220	common	semtimedop		sys_semtimedop
-221	common	fadvise64		sys_fadvise64
-222	64	timer_create		sys_timer_create
-223	common	timer_settime		sys_timer_settime
-224	common	timer_gettime		sys_timer_gettime
-225	common	timer_getoverrun	sys_timer_getoverrun
-226	common	timer_delete		sys_timer_delete
-227	common	clock_settime		sys_clock_settime
-228	common	clock_gettime		sys_clock_gettime
-229	common	clock_getres		sys_clock_getres
-230	common	clock_nanosleep		sys_clock_nanosleep
-231	common	exit_group		sys_exit_group
-232	common	epoll_wait		sys_epoll_wait
-233	common	epoll_ctl		sys_epoll_ctl
-234	common	tgkill			sys_tgkill
-235	common	utimes			sys_utimes
+216	common	remap_file_pages	sys_remap_file_pages	5
+217	common	getdents64		sys_getdents64	3
+218	common	set_tid_address		sys_set_tid_address	1
+219	common	restart_syscall		sys_restart_syscall	0
+220	common	semtimedop		sys_semtimedop	4
+221	common	fadvise64		sys_fadvise64	4
+222	64	timer_create		sys_timer_create	3
+223	common	timer_settime		sys_timer_settime	4
+224	common	timer_gettime		sys_timer_gettime	2
+225	common	timer_getoverrun	sys_timer_getoverrun	1
+226	common	timer_delete		sys_timer_delete	1
+227	common	clock_settime		sys_clock_settime	2
+228	common	clock_gettime		sys_clock_gettime	2
+229	common	clock_getres		sys_clock_getres	2
+230	common	clock_nanosleep		sys_clock_nanosleep	4
+231	common	exit_group		sys_exit_group	1
+232	common	epoll_wait		sys_epoll_wait	4
+233	common	epoll_ctl		sys_epoll_ctl	4
+234	common	tgkill			sys_tgkill	3
+235	common	utimes			sys_utimes	2
 236	64	vserver
-237	common	mbind			sys_mbind
-238	common	set_mempolicy		sys_set_mempolicy
-239	common	get_mempolicy		sys_get_mempolicy
-240	common	mq_open			sys_mq_open
-241	common	mq_unlink		sys_mq_unlink
-242	common	mq_timedsend		sys_mq_timedsend
-243	common	mq_timedreceive		sys_mq_timedreceive
-244	64	mq_notify		sys_mq_notify
-245	common	mq_getsetattr		sys_mq_getsetattr
-246	64	kexec_load		sys_kexec_load
-247	64	waitid			sys_waitid
-248	common	add_key			sys_add_key
-249	common	request_key		sys_request_key
-250	common	keyctl			sys_keyctl
-251	common	ioprio_set		sys_ioprio_set
-252	common	ioprio_get		sys_ioprio_get
-253	common	inotify_init		sys_inotify_init
-254	common	inotify_add_watch	sys_inotify_add_watch
-255	common	inotify_rm_watch	sys_inotify_rm_watch
-256	common	migrate_pages		sys_migrate_pages
-257	common	openat			sys_openat
-258	common	mkdirat			sys_mkdirat
-259	common	mknodat			sys_mknodat
-260	common	fchownat		sys_fchownat
-261	common	futimesat		sys_futimesat
-262	common	newfstatat		sys_newfstatat
-263	common	unlinkat		sys_unlinkat
-264	common	renameat		sys_renameat
-265	common	linkat			sys_linkat
-266	common	symlinkat		sys_symlinkat
-267	common	readlinkat		sys_readlinkat
-268	common	fchmodat		sys_fchmodat
-269	common	faccessat		sys_faccessat
-270	common	pselect6		sys_pselect6
-271	common	ppoll			sys_ppoll
-272	common	unshare			sys_unshare
-273	64	set_robust_list		sys_set_robust_list
-274	64	get_robust_list		sys_get_robust_list
-275	common	splice			sys_splice
-276	common	tee			sys_tee
-277	common	sync_file_range		sys_sync_file_range
-278	64	vmsplice		sys_vmsplice
-279	64	move_pages		sys_move_pages
-280	common	utimensat		sys_utimensat
-281	common	epoll_pwait		sys_epoll_pwait
-282	common	signalfd		sys_signalfd
-283	common	timerfd_create		sys_timerfd_create
-284	common	eventfd			sys_eventfd
-285	common	fallocate		sys_fallocate
-286	common	timerfd_settime		sys_timerfd_settime
-287	common	timerfd_gettime		sys_timerfd_gettime
-288	common	accept4			sys_accept4
-289	common	signalfd4		sys_signalfd4
-290	common	eventfd2		sys_eventfd2
-291	common	epoll_create1		sys_epoll_create1
-292	common	dup3			sys_dup3
-293	common	pipe2			sys_pipe2
-294	common	inotify_init1		sys_inotify_init1
-295	64	preadv			sys_preadv
-296	64	pwritev			sys_pwritev
-297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
-298	common	perf_event_open		sys_perf_event_open
-299	64	recvmmsg		sys_recvmmsg
-300	common	fanotify_init		sys_fanotify_init
-301	common	fanotify_mark		sys_fanotify_mark
-302	common	prlimit64		sys_prlimit64
-303	common	name_to_handle_at	sys_name_to_handle_at
-304	common	open_by_handle_at	sys_open_by_handle_at
-305	common	clock_adjtime		sys_clock_adjtime
-306	common	syncfs			sys_syncfs
-307	64	sendmmsg		sys_sendmmsg
-308	common	setns			sys_setns
-309	common	getcpu			sys_getcpu
-310	64	process_vm_readv	sys_process_vm_readv
-311	64	process_vm_writev	sys_process_vm_writev
-312	common	kcmp			sys_kcmp
-313	common	finit_module		sys_finit_module
-314	common	sched_setattr		sys_sched_setattr
-315	common	sched_getattr		sys_sched_getattr
-316	common	renameat2		sys_renameat2
-317	common	seccomp			sys_seccomp
-318	common	getrandom		sys_getrandom
-319	common	memfd_create		sys_memfd_create
-320	common	kexec_file_load		sys_kexec_file_load
-321	common	bpf			sys_bpf
-322	64	execveat		sys_execveat/ptregs
-323	common	userfaultfd		sys_userfaultfd
-324	common	membarrier		sys_membarrier
-325	common	mlock2			sys_mlock2
-326	common	copy_file_range		sys_copy_file_range
-327	64	preadv2			sys_preadv2
-328	64	pwritev2		sys_pwritev2
-329	common	pkey_mprotect		sys_pkey_mprotect
-330	common	pkey_alloc		sys_pkey_alloc
-331	common	pkey_free		sys_pkey_free
-332	common	statx			sys_statx
+237	common	mbind			sys_mbind	6
+238	common	set_mempolicy		sys_set_mempolicy	3
+239	common	get_mempolicy		sys_get_mempolicy	5
+240	common	mq_open			sys_mq_open	4
+241	common	mq_unlink		sys_mq_unlink	1
+242	common	mq_timedsend		sys_mq_timedsend	5
+243	common	mq_timedreceive		sys_mq_timedreceive	5
+244	64	mq_notify		sys_mq_notify	2
+245	common	mq_getsetattr		sys_mq_getsetattr	3
+246	64	kexec_load		sys_kexec_load	4
+247	64	waitid			sys_waitid	5
+248	common	add_key			sys_add_key	5
+249	common	request_key		sys_request_key	4
+250	common	keyctl			sys_keyctl	5
+251	common	ioprio_set		sys_ioprio_set	3
+252	common	ioprio_get		sys_ioprio_get	2
+253	common	inotify_init		sys_inotify_init	0
+254	common	inotify_add_watch	sys_inotify_add_watch	3
+255	common	inotify_rm_watch	sys_inotify_rm_watch	2
+256	common	migrate_pages		sys_migrate_pages	4
+257	common	openat			sys_openat	4
+258	common	mkdirat			sys_mkdirat	3
+259	common	mknodat			sys_mknodat	4
+260	common	fchownat		sys_fchownat	5
+261	common	futimesat		sys_futimesat	3
+262	common	newfstatat		sys_newfstatat	4
+263	common	unlinkat		sys_unlinkat	3
+264	common	renameat		sys_renameat	4
+265	common	linkat			sys_linkat	5
+266	common	symlinkat		sys_symlinkat	3
+267	common	readlinkat		sys_readlinkat	4
+268	common	fchmodat		sys_fchmodat	3
+269	common	faccessat		sys_faccessat	3
+270	common	pselect6		sys_pselect6	6
+271	common	ppoll			sys_ppoll	5
+272	common	unshare			sys_unshare	1
+273	64	set_robust_list		sys_set_robust_list	2
+274	64	get_robust_list		sys_get_robust_list	3
+275	common	splice			sys_splice	6
+276	common	tee			sys_tee	4
+277	common	sync_file_range		sys_sync_file_range	4
+278	64	vmsplice		sys_vmsplice	4
+279	64	move_pages		sys_move_pages	6
+280	common	utimensat		sys_utimensat	4
+281	common	epoll_pwait		sys_epoll_pwait	6
+282	common	signalfd		sys_signalfd	3
+283	common	timerfd_create		sys_timerfd_create	2
+284	common	eventfd			sys_eventfd	1
+285	common	fallocate		sys_fallocate	4
+286	common	timerfd_settime		sys_timerfd_settime	4
+287	common	timerfd_gettime		sys_timerfd_gettime	2
+288	common	accept4			sys_accept4	4
+289	common	signalfd4		sys_signalfd4	4
+290	common	eventfd2		sys_eventfd2	2
+291	common	epoll_create1		sys_epoll_create1	1
+292	common	dup3			sys_dup3	3
+293	common	pipe2			sys_pipe2	2
+294	common	inotify_init1		sys_inotify_init1	1
+295	64	preadv			sys_preadv	5
+296	64	pwritev			sys_pwritev	5
+297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo	4
+298	common	perf_event_open		sys_perf_event_open	5
+299	64	recvmmsg		sys_recvmmsg	5
+300	common	fanotify_init		sys_fanotify_init	2
+301	common	fanotify_mark		sys_fanotify_mark	5
+302	common	prlimit64		sys_prlimit64	4
+303	common	name_to_handle_at	sys_name_to_handle_at	5
+304	common	open_by_handle_at	sys_open_by_handle_at	3
+305	common	clock_adjtime		sys_clock_adjtime	2
+306	common	syncfs			sys_syncfs	1
+307	64	sendmmsg		sys_sendmmsg	4
+308	common	setns			sys_setns	2
+309	common	getcpu			sys_getcpu	3
+310	64	process_vm_readv	sys_process_vm_readv	6
+311	64	process_vm_writev	sys_process_vm_writev	6
+312	common	kcmp			sys_kcmp	5
+313	common	finit_module		sys_finit_module	3
+314	common	sched_setattr		sys_sched_setattr	3
+315	common	sched_getattr		sys_sched_getattr	4
+316	common	renameat2		sys_renameat2	5
+317	common	seccomp			sys_seccomp	3
+318	common	getrandom		sys_getrandom	3
+319	common	memfd_create		sys_memfd_create	2
+320	common	kexec_file_load		sys_kexec_file_load	5
+321	common	bpf			sys_bpf	3
+322	64	execveat		sys_execveat/ptregs	5
+323	common	userfaultfd		sys_userfaultfd	1
+324	common	membarrier		sys_membarrier	2
+325	common	mlock2			sys_mlock2	3
+326	common	copy_file_range		sys_copy_file_range	6
+327	64	preadv2			sys_preadv2	6
+328	64	pwritev2		sys_pwritev2	6
+329	common	pkey_mprotect		sys_pkey_mprotect	4
+330	common	pkey_alloc		sys_pkey_alloc	2
+331	common	pkey_free		sys_pkey_free	1
+332	common	statx			sys_statx	5
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
 # for native 64-bit operation.
 #
-512	x32	rt_sigaction		compat_sys_rt_sigaction
-513	x32	rt_sigreturn		sys32_x32_rt_sigreturn
-514	x32	ioctl			compat_sys_ioctl
-515	x32	readv			compat_sys_readv
-516	x32	writev			compat_sys_writev
-517	x32	recvfrom		compat_sys_recvfrom
-518	x32	sendmsg			compat_sys_sendmsg
-519	x32	recvmsg			compat_sys_recvmsg
-520	x32	execve			compat_sys_execve/ptregs
-521	x32	ptrace			compat_sys_ptrace
-522	x32	rt_sigpending		compat_sys_rt_sigpending
-523	x32	rt_sigtimedwait		compat_sys_rt_sigtimedwait
-524	x32	rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
-525	x32	sigaltstack		compat_sys_sigaltstack
-526	x32	timer_create		compat_sys_timer_create
-527	x32	mq_notify		compat_sys_mq_notify
-528	x32	kexec_load		compat_sys_kexec_load
-529	x32	waitid			compat_sys_waitid
-530	x32	set_robust_list		compat_sys_set_robust_list
-531	x32	get_robust_list		compat_sys_get_robust_list
-532	x32	vmsplice		compat_sys_vmsplice
-533	x32	move_pages		compat_sys_move_pages
-534	x32	preadv			compat_sys_preadv64
-535	x32	pwritev			compat_sys_pwritev64
-536	x32	rt_tgsigqueueinfo	compat_sys_rt_tgsigqueueinfo
-537	x32	recvmmsg		compat_sys_recvmmsg
-538	x32	sendmmsg		compat_sys_sendmmsg
-539	x32	process_vm_readv	compat_sys_process_vm_readv
-540	x32	process_vm_writev	compat_sys_process_vm_writev
-541	x32	setsockopt		compat_sys_setsockopt
-542	x32	getsockopt		compat_sys_getsockopt
-543	x32	io_setup		compat_sys_io_setup
-544	x32	io_submit		compat_sys_io_submit
-545	x32	execveat		compat_sys_execveat/ptregs
-546	x32	preadv2			compat_sys_preadv64v2
-547	x32	pwritev2		compat_sys_pwritev64v2
+512	x32	rt_sigaction		compat_sys_rt_sigaction	5
+513	x32	rt_sigreturn		sys32_x32_rt_sigreturn	0
+514	x32	ioctl			compat_sys_ioctl	3
+515	x32	readv			compat_sys_readv	3
+516	x32	writev			compat_sys_writev	3
+517	x32	recvfrom		compat_sys_recvfrom	6
+518	x32	sendmsg			compat_sys_sendmsg	3
+519	x32	recvmsg			compat_sys_recvmsg	3
+520	x32	execve			compat_sys_execve/ptregs	3
+521	x32	ptrace			compat_sys_ptrace	4
+522	x32	rt_sigpending		compat_sys_rt_sigpending	2
+523	x32	rt_sigtimedwait		compat_sys_rt_sigtimedwait	4
+524	x32	rt_sigqueueinfo		compat_sys_rt_sigqueueinfo	3
+525	x32	sigaltstack		compat_sys_sigaltstack	2
+526	x32	timer_create		compat_sys_timer_create	3
+527	x32	mq_notify		compat_sys_mq_notify	2
+528	x32	kexec_load		compat_sys_kexec_load	4
+529	x32	waitid			compat_sys_waitid	5
+530	x32	set_robust_list		compat_sys_set_robust_list	2
+531	x32	get_robust_list		compat_sys_get_robust_list	3
+532	x32	vmsplice		compat_sys_vmsplice	4
+533	x32	move_pages		compat_sys_move_pages	6
+534	x32	preadv			compat_sys_preadv64	5
+535	x32	pwritev			compat_sys_pwritev64	5
+536	x32	rt_tgsigqueueinfo	compat_sys_rt_tgsigqueueinfo	4
+537	x32	recvmmsg		compat_sys_recvmmsg	5
+538	x32	sendmmsg		compat_sys_sendmmsg	4
+539	x32	process_vm_readv	compat_sys_process_vm_readv	6
+540	x32	process_vm_writev	compat_sys_process_vm_writev	6
+541	x32	setsockopt		compat_sys_setsockopt	5
+542	x32	getsockopt		compat_sys_getsockopt	5
+543	x32	io_setup		compat_sys_io_setup	2
+544	x32	io_submit		compat_sys_io_submit	3
+545	x32	execveat		compat_sys_execveat/ptregs	5
+546	x32	preadv2			compat_sys_preadv64v2	6
+547	x32	pwritev2		compat_sys_pwritev64v2	6
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index d71ef4bd3615..bb8a12f32610 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -8,6 +8,7 @@ syscall_macro() {
     abi="$1"
     nr="$2"
     entry="$3"
+    num="$4"
 
     # Entry can be either just a function name or "function/qualifier"
     real_entry="${entry%%/*}"
@@ -47,7 +48,11 @@ emit() {
 }
 
 grep '^[0-9]' "$in" | sort -n | (
-    while read nr abi name entry compat; do
+    while read nr abi name entry compat num; do
+	case "$compat" in
+	[0-9]*) num="$compat" ; compat="" ;
+	esac
+
 	abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
 	if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then
 	    # COMMON is the same as 64, except that we don't expect X32
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v1 7/8] x86/entry/clearregs: Add 64bit stubs to clear unused arguments regs
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
                   ` (5 preceding siblings ...)
  2018-01-10  1:03 ` [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-10  1:03 ` [PATCH v1 8/8] x86/entry/clearregs: Clear registers for 32bit kernel Andi Kleen
  2018-01-10  1:16 ` x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andy Lutomirski
  8 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

The main system call code doesn't know how many arguments each
system call has. So generate stubs that do the clearing.

Set up macros to generate stubs to clear unused argument registers
for each system call in a 64bit kernel. This uses the syscall
argument count from the syscall tables added earlier.

Each system call will run through its stub which then clears
the registers not used for input arguments before jumping
to the real system calls. It also clears RAX.

We have to move all the __SYSCALL_* users atomically.
This is a larger patch, but it's difficult to do it
git bisect safe otherwise.

Longer term this setup will also allow us to get rid
of the system call table, as it will be possible
to compute the entry point with a simple shift.
So far this is not done here.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/calling.h              | 24 ++++++++++++++++++++++++
 arch/x86/entry/entry_64.S             | 15 +++++++++++++++
 arch/x86/entry/syscall_32.c           |  4 ++--
 arch/x86/entry/syscall_64.c           |  5 +++--
 arch/x86/entry/syscalls/syscalltbl.sh | 15 ++++++++-------
 arch/x86/kernel/asm-offsets_32.c      |  2 +-
 arch/x86/kernel/asm-offsets_64.c      |  4 ++--
 arch/x86/um/sys_call_table_32.c       |  4 ++--
 arch/x86/um/sys_call_table_64.c       |  4 ++--
 9 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9444e7623185..c89a8a8d195c 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -200,6 +200,30 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 	.endm
 
+	/* Clear unused argument registers */
+
+.macro CLEAR_ARGS num		/* zero the argument registers beyond the first "num"; clobbers flags */
+	/* we leave EAX around because it has already been checked */
+	.if \num < 6
+	xorl	%r9d, %r9d	# arg6 (32-bit xor zero-extends into r9)
+	.endif
+	.if \num < 5
+	xorl	%r8d, %r8d	# arg5 (32-bit xor zero-extends into r8)
+	.endif
+	.if \num < 4
+	xorl	%ecx, %ecx	# arg4
+	.endif
+	.if \num < 3
+	xorl	%edx, %edx	# arg3
+	.endif
+	.if \num < 2
+	xorl	%esi, %esi	# arg2
+	.endif
+	.if \num < 1
+	xorl	%edi, %edi	# arg1
+	.endif
+.endm
+
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
  * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6ab4c2aaeabb..5b2456a30b17 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1755,6 +1755,21 @@ nmi_restore:
 	iretq
 END(nmi)
 
+/*
+ * Clear all argument registers not used by a system call.
+ */
+
+.macro gen_arg_stub sym, num		/* emits san_args_<sym>: sanitize registers, then enter <sym> */
+ENTRY(san_args_\sym)
+	CLEAR_ARGS \num			/* zero the argument registers this syscall does not take */
+	xorl	%eax, %eax		/* RAX is zeroed as well before entering the handler */
+	jmp	\sym			/* jump, not call: the handler returns straight to the stub caller */
+END(san_args_\sym)
+.endm
+
+#define __SYSCALL_64(nr, sym, qual, num) gen_arg_stub sym, num
+#include <asm/syscalls_64.h>
+
 ENTRY(ignore_sysret)
 	UNWIND_HINT_EMPTY
 	mov	$-ENOSYS, %eax
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 95c294963612..b31e5c8b7ba7 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -7,11 +7,11 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual, num) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_32.h>
 #undef __SYSCALL_I386
 
-#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, qual, num) [nr] = sym,
 
 extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index ad1ae014f943..963c9c14480f 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -10,11 +10,12 @@
 #define __SYSCALL_64_QUAL_(sym) sym
 #define __SYSCALL_64_QUAL_ptregs(sym) sym
 
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+#define __SYSCALL_64(nr, sym, qual, num) \
+	extern asmlinkage long __SYSCALL_64_QUAL_##qual(san_args_##sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
 
-#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
+#define __SYSCALL_64(nr, sym, qual, num) [nr] = __SYSCALL_64_QUAL_##qual(san_args_##sym),
 
 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index bb8a12f32610..79fff684d75e 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -18,7 +18,7 @@ syscall_macro() {
         qualifier=${entry#*/}
     fi
 
-    echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
+    echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier, $num)"
 }
 
 emit() {
@@ -26,6 +26,7 @@ emit() {
     nr="$2"
     entry="$3"
     compat="$4"
+    num="$5"
 
     if [ "$abi" = "64" -a -n "$compat" ]; then
 	echo "a compat entry for a 64-bit syscall makes no sense" >&2
@@ -34,15 +35,15 @@ emit() {
 
     if [ -z "$compat" ]; then
 	if [ -n "$entry" ]; then
-	    syscall_macro "$abi" "$nr" "$entry"
+	    syscall_macro "$abi" "$nr" "$entry" "$num"
 	fi
     else
 	echo "#ifdef CONFIG_X86_32"
 	if [ -n "$entry" ]; then
-	    syscall_macro "$abi" "$nr" "$entry"
+	    syscall_macro "$abi" "$nr" "$entry" "$num"
 	fi
 	echo "#else"
-	syscall_macro "$abi" "$nr" "$compat"
+	syscall_macro "$abi" "$nr" "$compat" "$num"
 	echo "#endif"
     fi
 }
@@ -58,14 +59,14 @@ grep '^[0-9]' "$in" | sort -n | (
 	    # COMMON is the same as 64, except that we don't expect X32
 	    # programs to use it.  Our expectation has nothing to do with
 	    # any generated code, so treat them the same.
-	    emit 64 "$nr" "$entry" "$compat"
+	    emit 64 "$nr" "$entry" "$compat" "$num"
 	elif [ "$abi" = "X32" ]; then
 	    # X32 is equivalent to 64 on an X32-compatible kernel.
 	    echo "#ifdef CONFIG_X86_X32_ABI"
-	    emit 64 "$nr" "$entry" "$compat"
+	    emit 64 "$nr" "$entry" "$compat" "$num"
 	    echo "#endif"
 	elif [ "$abi" = "I386" ]; then
-	    emit "$abi" "$nr" "$entry" "$compat"
+	    emit "$abi" "$nr" "$entry" "$compat" "$num"
 	else
 	    echo "Unknown abi $abi" >&2
 	    exit 1
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fa1261eefa16..13c7478bfe57 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -5,7 +5,7 @@
 
 #include <asm/ucontext.h>
 
-#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual, num) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_32.h>
 };
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bf51e51d808d..75d92b53240d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -5,11 +5,11 @@
 
 #include <asm/ia32.h>
 
-#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
+#define __SYSCALL_64(nr, sym, qual, num) [nr] = 1,
 static char syscalls_64[] = {
 #include <asm/syscalls_64.h>
 };
-#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual, num) [nr] = 1,
 static char syscalls_ia32[] = {
 #include <asm/syscalls_32.h>
 };
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 9649b5ad2ca2..50002d938cef 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -26,11 +26,11 @@
 
 #define old_mmap sys_old_mmap
 
-#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual, num) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_32.h>
 
 #undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym,
+#define __SYSCALL_I386(nr, sym, qual, num) [ nr ] = sym,
 
 extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index c8bc7fb8cbd6..c39c5b3b8022 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -36,11 +36,11 @@
 #define stub_execveat sys_execveat
 #define stub_rt_sigreturn sys_rt_sigreturn
 
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym, qual, num) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_64.h>
 
 #undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym,
+#define __SYSCALL_64(nr, sym, qual, num) [ nr ] = sym,
 
 extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v1 8/8] x86/entry/clearregs: Clear registers for 32bit kernel
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
                   ` (6 preceding siblings ...)
  2018-01-10  1:03 ` [PATCH v1 7/8] x86/entry/clearregs: Add 64bit stubs to clear unused arguments regs Andi Kleen
@ 2018-01-10  1:03 ` Andi Kleen
  2018-01-10  1:24   ` Andy Lutomirski
  2018-01-10  1:16 ` x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andy Lutomirski
  8 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:03 UTC (permalink / raw)
  To: tglx
  Cc: x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

On a 32bit kernel clearing registers is much simpler than
on 64bit. The arguments for syscalls are initially passed
to a C function through the stack, so there's no need
to figure out how many arguments to clear.

So we always clear all registers (except frame pointer) for
all entry points.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/entry_32.S | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d2ef7f32905b..aee1085534ac 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -221,6 +221,18 @@
 	POP_GS_EX
 .endm
 
+.macro CLEAR_ALL_REGS			/* zero eax/ebx/ecx/edx/edi/esi (ebp only under CONFIG_FRAME_POINTER); clobbers flags */
+#ifdef CONFIG_FRAME_POINTER		/* NOTE(review): changelog says "except frame pointer", yet ebp is zeroed only when frame pointers are enabled - confirm the condition is not inverted */
+	xorl	%ebp, %ebp
+#endif
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+	xorl	%ecx, %ecx
+	xorl	%edx, %edx
+	xorl	%edi, %edi
+	xorl	%esi, %esi
+.endm
+
 /*
  * %eax: prev task
  * %edx: next task
@@ -428,6 +440,7 @@ ENTRY(entry_SYSENTER_32)
 	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
 	pushl	%eax			/* pt_regs->orig_ax */
 	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */
+	CLEAR_ALL_REGS
 
 	/*
 	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
@@ -539,6 +552,7 @@ ENTRY(entry_INT80_32)
 	ASM_CLAC
 	pushl	%eax			/* pt_regs->orig_ax */
 	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */
+	CLEAR_ALL_REGS
 
 	/*
 	 * User mode is traced as though IRQs are on, and the interrupt gate
@@ -673,6 +687,7 @@ common_interrupt:
 	ASM_CLAC
 	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
 	SAVE_ALL
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	TRACE_IRQS_OFF
 	movl	%esp, %eax
@@ -685,6 +700,7 @@ ENTRY(name)				\
 	ASM_CLAC;			\
 	pushl	$~(nr);			\
 	SAVE_ALL;			\
+	CLEAR_ALL_REGS;			\
 	ENCODE_FRAME_POINTER;		\
 	TRACE_IRQS_OFF			\
 	movl	%esp, %eax;		\
@@ -812,6 +828,7 @@ END(spurious_interrupt_bug)
 ENTRY(xen_hypervisor_callback)
 	pushl	$-1				/* orig_ax = -1 => not a system call */
 	SAVE_ALL
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	TRACE_IRQS_OFF
 
@@ -867,6 +884,7 @@ ENTRY(xen_failsafe_callback)
 	jmp	iret_exc
 5:	pushl	$-1				/* orig_ax = -1 => not a system call */
 	SAVE_ALL
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	jmp	ret_from_exception
 
@@ -921,6 +939,7 @@ common_exception:
 	pushl	%edx
 	pushl	%ecx
 	pushl	%ebx
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	cld
 	movl	$(__KERNEL_PERCPU), %ecx
@@ -954,6 +973,7 @@ ENTRY(debug)
 	ASM_CLAC
 	pushl	$-1				# mark this as an int
 	SAVE_ALL
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	xorl	%edx, %edx			# error code 0
 	movl	%esp, %eax			# pt_regs pointer
@@ -998,6 +1018,7 @@ ENTRY(nmi)
 
 	pushl	%eax				# pt_regs->orig_ax
 	SAVE_ALL
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	xorl	%edx, %edx			# zero error code
 	movl	%esp, %eax			# pt_regs pointer
@@ -1038,6 +1059,7 @@ ENTRY(nmi)
 	.endr
 	pushl	%eax
 	SAVE_ALL
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	FIXUP_ESPFIX_STACK			# %eax == %esp
 	xorl	%edx, %edx			# zero error code
@@ -1052,6 +1074,7 @@ ENTRY(int3)
 	ASM_CLAC
 	pushl	$-1				# mark this as an int
 	SAVE_ALL
+	CLEAR_ALL_REGS
 	ENCODE_FRAME_POINTER
 	TRACE_IRQS_OFF
 	xorl	%edx, %edx			# zero error code
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: x86/clearregs: Register sanitizing at kernel entry for speculation hygiene
  2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
                   ` (7 preceding siblings ...)
  2018-01-10  1:03 ` [PATCH v1 8/8] x86/entry/clearregs: Clear registers for 32bit kernel Andi Kleen
@ 2018-01-10  1:16 ` Andy Lutomirski
  2018-01-10  1:34   ` Andi Kleen
  8 siblings, 1 reply; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-10  1:16 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos


> On Jan 9, 2018, at 5:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> 
> This patch kit implements clearing of all unused registers on kernel entries,
> including system calls and all exceptions and interrupt.
> 
> This doesn't fix any known issue, but will make it harder in general
> to exploit the kernel with speculation because it will be harder
> to get user controlled values into kernel code.

I don't like this at all.  Once upon a time, Linux syscalls were supposed to be fast.  Then we learned about the Meltdown screwup, so we mostly fixed it for real upstream and the distros seriously half-arsed their own fixes [1].  This came with a big performance cost, but it can be turned off on non-busted hardware.  So be it.

But now we're proposing to throw out the whole fast path because it might make it a bit harder to do the most obvious attack.  Not very hard, mind you, but a little bit harder.  And there's no off switch for less-leaky hardware.  No thanks.

Meanwhile we're doing nothing whatsoever to mitigate cross-process attacks because we can't do anything about it short of turning IBRS on systemwide.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 5/8] x86/entry/clearregs: Clear registers for 64bit exceptions/interrupts
  2018-01-10  1:03 ` [PATCH v1 5/8] x86/entry/clearregs: Clear registers for 64bit exceptions/interrupts Andi Kleen
@ 2018-01-10  1:23   ` Andy Lutomirski
  0 siblings, 0 replies; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-10  1:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen



> On Jan 9, 2018, at 5:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> 
> From: Andi Kleen <ak@linux.intel.com>
> 
> Clear all registers on entering the 64bit kernel for exceptions and
> interrupts.
> 
> Since there are no arguments this is fairly simple.
> 
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
> arch/x86/entry/entry_64.S | 5 +++++
> 1 file changed, 5 insertions(+)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 632081fd7086..6ab4c2aaeabb 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -636,6 +636,7 @@ END(irq_entries_start)
>    ALLOC_PT_GPREGS_ON_STACK
>    SAVE_C_REGS
>    SAVE_EXTRA_REGS
> +    CLEAR_ALL_REGS
>    ENCODE_FRAME_POINTER
> 
>    testb    $3, CS(%rsp)
> @@ -1192,6 +1193,7 @@ ENTRY(xen_failsafe_callback)
>    ALLOC_PT_GPREGS_ON_STACK
>    SAVE_C_REGS
>    SAVE_EXTRA_REGS
> +    CLEAR_ALL_REGS
>    ENCODE_FRAME_POINTER

If CLEAR_ALL_REGS does what it sounds like, then it's overkill here.

I could get behind this patch in general, though.  Interrupts are so slow that the overhead probably doesn't matter.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 8/8] x86/entry/clearregs: Clear registers for 32bit kernel
  2018-01-10  1:03 ` [PATCH v1 8/8] x86/entry/clearregs: Clear registers for 32bit kernel Andi Kleen
@ 2018-01-10  1:24   ` Andy Lutomirski
  0 siblings, 0 replies; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-10  1:24 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen



> On Jan 9, 2018, at 5:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> 
> From: Andi Kleen <ak@linux.intel.com>
> 
> On a 32bit kernel clearing registers is much simpler than
> on 64bit. The arguments for syscalls are initially passed
> to a C function through the stack, so there's no need
> to figure out how many arguments to clear.

Why are we even trying to improve the situation on 32-bit?  Unless someone actually tries to implement PTI, this seems useless.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables
  2018-01-10  1:03 ` [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables Andi Kleen
@ 2018-01-10  1:26   ` Andy Lutomirski
  2018-01-10  4:37     ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-10  1:26 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen



> On Jan 9, 2018, at 5:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> 
> From: Andi Kleen <ak@linux.intel.com>
> 
> In order to sanitize the system call arguments properly
> we need to know the number of syscall arguments for each
> syscall. Add a new column to the 32bit and 64bit syscall
> tables to list the number of arguments.
> 

Surely we can do this in the SYSCALL_DEFINE macros.  Or at least statically check it.

Also, what attack are we protecting against anyway?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: x86/clearregs: Register sanitizing at kernel entry for speculation hygiene
  2018-01-10  1:16 ` x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andy Lutomirski
@ 2018-01-10  1:34   ` Andi Kleen
  2018-01-10  1:39     ` Andy Lutomirski
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  1:34 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto,
	peterz, thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos

> I don't like this at all.  Once upon a time, Linux syscalls were supposed to be fast.  Then we learned about the Meltdown screwup, so we mostly fixed it for real upstream and the distroa seriously half-arsed their own fixes [1].  This came with a big performance cost, but it can be turned off on non-busted hardware.  So be it.

That's true, but modern CPUs are also a lot faster/wider than the K8
the fast path was originally designed for. A modern CPU can go through
these instructions really fast with a very high IPC because they don't have
dependencies or stalls.

So it shouldn't hurt very much.

Also in fact when the fast path was originally written the ABI still had a
different caller/callee split which made it more beneficial. Later on
it already lost some of its benefits and was less of a win.
 
> But now we're proposing to throw out the whole fast path because it might make it a bit harder to do the most obvious attack.  Not very hard, mind you, but a little bit harder.  And there's no off switch for less-leaky hardware.  No thanks.

Well the off switch is a fast CPU.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: x86/clearregs: Register sanitizing at kernel entry for speculation hygiene
  2018-01-10  1:34   ` Andi Kleen
@ 2018-01-10  1:39     ` Andy Lutomirski
  0 siblings, 0 replies; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-10  1:39 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos



On Jan 9, 2018, at 5:34 PM, Andi Kleen <andi@firstfloor.org> wrote:

>> I don't like this at all.  Once upon a time, Linux syscalls were supposed to be fast.  Then we learned about the Meltdown screwup, so we mostly fixed it for real upstream and the distroa seriously half-arsed their own fixes [1].  This came with a big performance cost, but it can be turned off on non-busted hardware.  So be it.
> 
> That's true, but modern CPUs are also a lot faster/wider than the K8
> the fast path was originally designed for. A modern CPU can go through
> these instructions really fast with a very high IPC because they don't have
> dependencies or stalls.
> 
> So it shouldn't hurt very much.
> 
> Also in fact when the fast path was originally written the ABI still had a
> different caller/callee split which made it more better. Later on
> it already lost some of its benefits and was less of a win.
> 
>> But now we're proposing to throw out the whole fast path because it might make it a bit harder to do the most obvious attack.  Not very hard, mind you, but a little bit harder.  And there's no off switch for less-leaky hardware.  No thanks.
> 
> Well the off switch is a fast CPU.

When I rewrote the fast path, I did it on SNB.  Not much has changed.

This patch should come with benchmarks (with PTI off).

And Intel needs to come up with real fixes for this stuff.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-10  1:03 ` [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call Andi Kleen
@ 2018-01-10  2:46   ` Brian Gerst
  2018-01-11  0:16     ` Andi Kleen
  2018-01-11  2:09   ` Josh Poimboeuf
  2018-01-12  3:22   ` Josh Poimboeuf
  2 siblings, 1 reply; 34+ messages in thread
From: Brian Gerst @ 2018-01-10  2:46 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen

On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Remove the partial stack frame in the 64bit syscall fast path.
> In the next patch we want to clear the extra registers, which requires
> to always save all registers. So remove the partial stack frame
> in the syscall fast path and always save everything.
>
> This actually simplifies the code because the ptregs stubs
> are not needed anymore.
>
> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
> arch/x86/entry/syscall_64.c |  2 +-
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
>  arch/x86/entry/syscall_64.c |  2 +-
>  2 files changed, 5 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 58dbf7a12a05..bbdfbdd817d6 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>         pushq   %r9                             /* pt_regs->r9 */
>         pushq   %r10                            /* pt_regs->r10 */
>         pushq   %r11                            /* pt_regs->r11 */
> -       sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
> +       sub     $(6*8), %rsp
> +       SAVE_EXTRA_REGS
> +

Continue using pushes here

>         UNWIND_HINT_REGS extra=0
>
>         /*
> @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
>         ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
>         movq    %r10, %rcx
>
> -       /*
> -        * This call instruction is handled specially in stub_ptregs_64.
> -        * It might end up jumping to the slow path.  If it jumps, RAX
> -        * and all argument registers are clobbered.
> -        */
>  #ifdef CONFIG_RETPOLINE
>         movq    sys_call_table(, %rax, 8), %rax
>         call    __x86_indirect_thunk_rax
> @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
>         TRACE_IRQS_ON           /* user mode is traced as IRQs on */
>         movq    RIP(%rsp), %rcx
>         movq    EFLAGS(%rsp), %r11
> -       addq    $6*8, %rsp      /* skip extra regs -- they were preserved */
> -       UNWIND_HINT_EMPTY
> -       jmp     .Lpop_c_regs_except_rcx_r11_and_sysret
> +       jmp     syscall_return_via_sysret
>
>  1:
>         /*
> @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
>          */
>         TRACE_IRQS_ON
>         ENABLE_INTERRUPTS(CLBR_ANY)
> -       SAVE_EXTRA_REGS
>         movq    %rsp, %rdi
>         call    syscall_return_slowpath /* returns with IRQs disabled */
>         jmp     return_from_SYSCALL_64
>
>  entry_SYSCALL64_slow_path:
>         /* IRQs are off. */
> -       SAVE_EXTRA_REGS
>         movq    %rsp, %rdi
>         call    do_syscall_64           /* returns with IRQs disabled */
>
> @@ -389,7 +382,6 @@ syscall_return_via_sysret:
>         /* rcx and r11 are already restored (see code above) */
>         UNWIND_HINT_EMPTY
>         POP_EXTRA_REGS
> -.Lpop_c_regs_except_rcx_r11_and_sysret:
>         popq    %rsi    /* skip r11 */
>         popq    %r10
>         popq    %r9
> @@ -420,47 +412,6 @@ syscall_return_via_sysret:
>         USERGS_SYSRET64
>  END(entry_SYSCALL_64)
>
> -ENTRY(stub_ptregs_64)
> -       /*
> -        * Syscalls marked as needing ptregs land here.
> -        * If we are on the fast path, we need to save the extra regs,
> -        * which we achieve by trying again on the slow path.  If we are on
> -        * the slow path, the extra regs are already saved.
> -        *
> -        * RAX stores a pointer to the C function implementing the syscall.
> -        * IRQs are on.
> -        */
> -       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
> -       jne     1f
> -
> -       /*
> -        * Called from fast path -- disable IRQs again, pop return address
> -        * and jump to slow path
> -        */
> -       DISABLE_INTERRUPTS(CLBR_ANY)
> -       TRACE_IRQS_OFF
> -       popq    %rax
> -       UNWIND_HINT_REGS extra=0
> -       jmp     entry_SYSCALL64_slow_path
> -
> -1:
> -       JMP_NOSPEC %rax                         /* Called from C */
> -END(stub_ptregs_64)
> -
> -.macro ptregs_stub func
> -ENTRY(ptregs_\func)
> -       UNWIND_HINT_FUNC
> -       leaq    \func(%rip), %rax
> -       jmp     stub_ptregs_64
> -END(ptregs_\func)
> -.endm
> -
> -/* Instantiate ptregs_stub for each ptregs-using syscall */
> -#define __SYSCALL_64_QUAL_(sym)
> -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
> -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
> -#include <asm/syscalls_64.h>
> -

You can't just blindly remove this.  We need to make sure that
syscalls that modify registers take the slow path exit, because they
may change the registers to be incompatible with SYSRET.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables
  2018-01-10  1:26   ` Andy Lutomirski
@ 2018-01-10  4:37     ` Andi Kleen
  2018-01-10 20:05       ` Andy Lutomirski
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2018-01-10  4:37 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto,
	peterz, thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos

On Tue, Jan 09, 2018 at 05:26:43PM -0800, Andy Lutomirski wrote:
> 
> 
> > On Jan 9, 2018, at 5:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> > 
> > From: Andi Kleen <ak@linux.intel.com>
> > 
> > In order to sanitize the system call arguments properly
> > we need to know the number of syscall arguments for each
> > syscall. Add a new column to the 32bit and 64bit syscall
> > tables to list the number of arguments.
> > 
> 
> Surely we can do this in the SYSCALL_DEFINE macros.  Or at least statically check it.

Possibly. The assembler would be much uglier as inline assembler though.
And adding the number shouldn't be a big burden when adding a system call.

I don't know how to check statically.

> 
> Also, what attack are we protecting against anyway?

There's no specific attack here.

But the idea is to make it harder to inject values into the kernel to abuse
with speculation.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables
  2018-01-10  4:37     ` Andi Kleen
@ 2018-01-10 20:05       ` Andy Lutomirski
  0 siblings, 0 replies; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-10 20:05 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andi Kleen, Thomas Gleixner, X86 ML, LKML, Linus Torvalds,
	Woodhouse, David, Paul Turner, Andrew Lutomirski, Peter Zijlstra,
	Tom Lendacky, Tim Chen, gregkh, Dave Hansen, Jiri Kosina

On Tue, Jan 9, 2018 at 8:37 PM, Andi Kleen <ak@linux.intel.com> wrote:
> On Tue, Jan 09, 2018 at 05:26:43PM -0800, Andy Lutomirski wrote:
>>
>>
>> > On Jan 9, 2018, at 5:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> >
>> > From: Andi Kleen <ak@linux.intel.com>
>> >
>> > In order to sanitize the system call arguments properly
>> > we need to know the number of syscall arguments for each
>> > syscall. Add a new column to the 32bit and 64bit syscall
>> > tables to list the number of arguments.
>> >
>>
>> Surely we can do this in the SYSCALL_DEFINE macros.  Or at least statically check it.
>
> Possibly. The assembler would be much uglier as inline assembler though.
> And adding the number shouldn't be a big burden when adding a system call.
>
> I don't know how to check statically.
>

Somehow parse out the SYSCALL_DEFINE() macros at build time and check
the numbers.  Or munge the number into the SyS_ wrapper so we'd have
SyS0_fork but SyS3_read.

>>
>> Also, what attack are we protecting against anyway?
>
> There's no specific attack here.
>
> But the idea is to make it harder to inject values into the kernel to abuse
> with speculation.

I think a bit stronger justification would be good here.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-10  2:46   ` Brian Gerst
@ 2018-01-11  0:16     ` Andi Kleen
  2018-01-11  0:54       ` Brian Gerst
  2018-01-11  0:55       ` Andy Lutomirski
  0 siblings, 2 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-11  0:16 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Andi Kleen, Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen

On Tue, Jan 09, 2018 at 09:46:16PM -0500, Brian Gerst wrote:
> On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> > From: Andi Kleen <ak@linux.intel.com>
> >
> > Remove the partial stack frame in the 64bit syscall fast path.
> > In the next patch we want to clear the extra registers, which requires
> > to always save all registers. So remove the partial stack frame
> > in the syscall fast path and always save everything.
> >
> > This actually simplifies the code because the ptregs stubs
> > are not needed anymore.
> >
> > arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
> > arch/x86/entry/syscall_64.c |  2 +-
> >
> > Signed-off-by: Andi Kleen <ak@linux.intel.com>
> > ---
> >  arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
> >  arch/x86/entry/syscall_64.c |  2 +-
> >  2 files changed, 5 insertions(+), 54 deletions(-)
> >
> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > index 58dbf7a12a05..bbdfbdd817d6 100644
> > --- a/arch/x86/entry/entry_64.S
> > +++ b/arch/x86/entry/entry_64.S
> > @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
> >         pushq   %r9                             /* pt_regs->r9 */
> >         pushq   %r10                            /* pt_regs->r10 */
> >         pushq   %r11                            /* pt_regs->r11 */
> > -       sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
> > +       sub     $(6*8), %rsp
> > +       SAVE_EXTRA_REGS
> > +
> 
> Continue using pushes here
> 
> >         UNWIND_HINT_REGS extra=0
> >
> >         /*
> > @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
> >         ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
> >         movq    %r10, %rcx
> >
> > -       /*
> > -        * This call instruction is handled specially in stub_ptregs_64.
> > -        * It might end up jumping to the slow path.  If it jumps, RAX
> > -        * and all argument registers are clobbered.
> > -        */
> >  #ifdef CONFIG_RETPOLINE
> >         movq    sys_call_table(, %rax, 8), %rax
> >         call    __x86_indirect_thunk_rax
> > @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
> >         TRACE_IRQS_ON           /* user mode is traced as IRQs on */
> >         movq    RIP(%rsp), %rcx
> >         movq    EFLAGS(%rsp), %r11
> > -       addq    $6*8, %rsp      /* skip extra regs -- they were preserved */
> > -       UNWIND_HINT_EMPTY
> > -       jmp     .Lpop_c_regs_except_rcx_r11_and_sysret
> > +       jmp     syscall_return_via_sysret
> >
> >  1:
> >         /*
> > @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
> >          */
> >         TRACE_IRQS_ON
> >         ENABLE_INTERRUPTS(CLBR_ANY)
> > -       SAVE_EXTRA_REGS
> >         movq    %rsp, %rdi
> >         call    syscall_return_slowpath /* returns with IRQs disabled */
> >         jmp     return_from_SYSCALL_64
> >
> >  entry_SYSCALL64_slow_path:
> >         /* IRQs are off. */
> > -       SAVE_EXTRA_REGS
> >         movq    %rsp, %rdi
> >         call    do_syscall_64           /* returns with IRQs disabled */
> >
> > @@ -389,7 +382,6 @@ syscall_return_via_sysret:
> >         /* rcx and r11 are already restored (see code above) */
> >         UNWIND_HINT_EMPTY
> >         POP_EXTRA_REGS
> > -.Lpop_c_regs_except_rcx_r11_and_sysret:
> >         popq    %rsi    /* skip r11 */
> >         popq    %r10
> >         popq    %r9
> > @@ -420,47 +412,6 @@ syscall_return_via_sysret:
> >         USERGS_SYSRET64
> >  END(entry_SYSCALL_64)
> >
> > -ENTRY(stub_ptregs_64)
> > -       /*
> > -        * Syscalls marked as needing ptregs land here.
> > -        * If we are on the fast path, we need to save the extra regs,
> > -        * which we achieve by trying again on the slow path.  If we are on
> > -        * the slow path, the extra regs are already saved.
> > -        *
> > -        * RAX stores a pointer to the C function implementing the syscall.
> > -        * IRQs are on.
> > -        */
> > -       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
> > -       jne     1f
> > -
> > -       /*
> > -        * Called from fast path -- disable IRQs again, pop return address
> > -        * and jump to slow path
> > -        */
> > -       DISABLE_INTERRUPTS(CLBR_ANY)
> > -       TRACE_IRQS_OFF
> > -       popq    %rax
> > -       UNWIND_HINT_REGS extra=0
> > -       jmp     entry_SYSCALL64_slow_path
> > -
> > -1:
> > -       JMP_NOSPEC %rax                         /* Called from C */
> > -END(stub_ptregs_64)
> > -
> > -.macro ptregs_stub func
> > -ENTRY(ptregs_\func)
> > -       UNWIND_HINT_FUNC
> > -       leaq    \func(%rip), %rax
> > -       jmp     stub_ptregs_64
> > -END(ptregs_\func)
> > -.endm
> > -
> > -/* Instantiate ptregs_stub for each ptregs-using syscall */
> > -#define __SYSCALL_64_QUAL_(sym)
> > -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
> > -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
> > -#include <asm/syscalls_64.h>
> > -
> 
> You can't just blindly remove this.  We need to make sure that
> syscalls that modify registers take the slow path exit, because they
> may change the registers to be incompatible with SYSRET.

That's a good point. I checked the ptregs calls:

iopl: should be fine, we will be restoring the correct IOPL through
	SYSRET

clone/fork: fine too, the original return is fine and ret_from_fork
            takes care of the child

execve et al.: we will be leaking r11 (rflags) and rcx (orig return) into
	    the new process, but that seems acceptable.

rt_sigreturn:  that's the only one that has problems. I added a new
            TIF_FULL_RESTORE to force it into the slow path.
	 

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  0:16     ` Andi Kleen
@ 2018-01-11  0:54       ` Brian Gerst
  2018-01-11  1:02         ` Andi Kleen
  2018-01-11  0:55       ` Andy Lutomirski
  1 sibling, 1 reply; 34+ messages in thread
From: Brian Gerst @ 2018-01-11  0:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen

On Wed, Jan 10, 2018 at 7:16 PM, Andi Kleen <andi@firstfloor.org> wrote:
> On Tue, Jan 09, 2018 at 09:46:16PM -0500, Brian Gerst wrote:
>> On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> > From: Andi Kleen <ak@linux.intel.com>
>> >
>> > Remove the partial stack frame in the 64bit syscall fast path.
>> > In the next patch we want to clear the extra registers, which requires
>> > to always save all registers. So remove the partial stack frame
>> > in the syscall fast path and always save everything.
>> >
>> > This actually simplifies the code because the ptregs stubs
>> > are not needed anymore.
>> >
>> > arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
>> > arch/x86/entry/syscall_64.c |  2 +-
>> >
>> > Signed-off-by: Andi Kleen <ak@linux.intel.com>
>> > ---
>> >  arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
>> >  arch/x86/entry/syscall_64.c |  2 +-
>> >  2 files changed, 5 insertions(+), 54 deletions(-)
>> >
>> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> > index 58dbf7a12a05..bbdfbdd817d6 100644
>> > --- a/arch/x86/entry/entry_64.S
>> > +++ b/arch/x86/entry/entry_64.S
>> > @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>> >         pushq   %r9                             /* pt_regs->r9 */
>> >         pushq   %r10                            /* pt_regs->r10 */
>> >         pushq   %r11                            /* pt_regs->r11 */
>> > -       sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
>> > +       sub     $(6*8), %rsp
>> > +       SAVE_EXTRA_REGS
>> > +
>>
>> Continue using pushes here
>>
>> >         UNWIND_HINT_REGS extra=0
>> >
>> >         /*
>> > @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
>> >         ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
>> >         movq    %r10, %rcx
>> >
>> > -       /*
>> > -        * This call instruction is handled specially in stub_ptregs_64.
>> > -        * It might end up jumping to the slow path.  If it jumps, RAX
>> > -        * and all argument registers are clobbered.
>> > -        */
>> >  #ifdef CONFIG_RETPOLINE
>> >         movq    sys_call_table(, %rax, 8), %rax
>> >         call    __x86_indirect_thunk_rax
>> > @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
>> >         TRACE_IRQS_ON           /* user mode is traced as IRQs on */
>> >         movq    RIP(%rsp), %rcx
>> >         movq    EFLAGS(%rsp), %r11
>> > -       addq    $6*8, %rsp      /* skip extra regs -- they were preserved */
>> > -       UNWIND_HINT_EMPTY
>> > -       jmp     .Lpop_c_regs_except_rcx_r11_and_sysret
>> > +       jmp     syscall_return_via_sysret
>> >
>> >  1:
>> >         /*
>> > @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
>> >          */
>> >         TRACE_IRQS_ON
>> >         ENABLE_INTERRUPTS(CLBR_ANY)
>> > -       SAVE_EXTRA_REGS
>> >         movq    %rsp, %rdi
>> >         call    syscall_return_slowpath /* returns with IRQs disabled */
>> >         jmp     return_from_SYSCALL_64
>> >
>> >  entry_SYSCALL64_slow_path:
>> >         /* IRQs are off. */
>> > -       SAVE_EXTRA_REGS
>> >         movq    %rsp, %rdi
>> >         call    do_syscall_64           /* returns with IRQs disabled */
>> >
>> > @@ -389,7 +382,6 @@ syscall_return_via_sysret:
>> >         /* rcx and r11 are already restored (see code above) */
>> >         UNWIND_HINT_EMPTY
>> >         POP_EXTRA_REGS
>> > -.Lpop_c_regs_except_rcx_r11_and_sysret:
>> >         popq    %rsi    /* skip r11 */
>> >         popq    %r10
>> >         popq    %r9
>> > @@ -420,47 +412,6 @@ syscall_return_via_sysret:
>> >         USERGS_SYSRET64
>> >  END(entry_SYSCALL_64)
>> >
>> > -ENTRY(stub_ptregs_64)
>> > -       /*
>> > -        * Syscalls marked as needing ptregs land here.
>> > -        * If we are on the fast path, we need to save the extra regs,
>> > -        * which we achieve by trying again on the slow path.  If we are on
>> > -        * the slow path, the extra regs are already saved.
>> > -        *
>> > -        * RAX stores a pointer to the C function implementing the syscall.
>> > -        * IRQs are on.
>> > -        */
>> > -       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
>> > -       jne     1f
>> > -
>> > -       /*
>> > -        * Called from fast path -- disable IRQs again, pop return address
>> > -        * and jump to slow path
>> > -        */
>> > -       DISABLE_INTERRUPTS(CLBR_ANY)
>> > -       TRACE_IRQS_OFF
>> > -       popq    %rax
>> > -       UNWIND_HINT_REGS extra=0
>> > -       jmp     entry_SYSCALL64_slow_path
>> > -
>> > -1:
>> > -       JMP_NOSPEC %rax                         /* Called from C */
>> > -END(stub_ptregs_64)
>> > -
>> > -.macro ptregs_stub func
>> > -ENTRY(ptregs_\func)
>> > -       UNWIND_HINT_FUNC
>> > -       leaq    \func(%rip), %rax
>> > -       jmp     stub_ptregs_64
>> > -END(ptregs_\func)
>> > -.endm
>> > -
>> > -/* Instantiate ptregs_stub for each ptregs-using syscall */
>> > -#define __SYSCALL_64_QUAL_(sym)
>> > -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
>> > -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
>> > -#include <asm/syscalls_64.h>
>> > -
>>
>> You can't just blindly remove this.  We need to make sure that
>> syscalls that modify registers take the slow path exit, because they
>> may change the registers to be incompatible with SYSRET.
>
> That's a good point. I checked the ptregs calls:
>
> iopl: should be fine, we will be restoring the correct IOPL through
>         SYSRET
> clone/fork: fine too, the original return is fine and ret_from_fork
>             takes care of the child
> execve et.al.: we will be leaking r11(rflags), rcx(orig return) into
>             the new process. but that seems acceptable.

We still need to check if we are loading a 32-bit binary.  That must
return with IRET.

> rt_sigreturn:  that's the only one who has problems. I added a new
>             TIF_FULL_RESTORE to force it into the slow path.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  0:16     ` Andi Kleen
  2018-01-11  0:54       ` Brian Gerst
@ 2018-01-11  0:55       ` Andy Lutomirski
  2018-01-11  1:01         ` Andi Kleen
  2018-01-11  1:01         ` Brian Gerst
  1 sibling, 2 replies; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-11  0:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Brian Gerst, Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen



> On Jan 10, 2018, at 4:16 PM, Andi Kleen <andi@firstfloor.org> wrote:
> 
>> On Tue, Jan 09, 2018 at 09:46:16PM -0500, Brian Gerst wrote:
>>> On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
>>> From: Andi Kleen <ak@linux.intel.com>
>>> 
>>> Remove the partial stack frame in the 64bit syscall fast path.
>>> In the next patch we want to clear the extra registers, which requires
>>> to always save all registers. So remove the partial stack frame
>>> in the syscall fast path and always save everything.
>>> 
>>> This actually simplifies the code because the ptregs stubs
>>> are not needed anymore.
>>> 
>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
>>> arch/x86/entry/syscall_64.c |  2 +-
>>> 
>>> Signed-off-by: Andi Kleen <ak@linux.intel.com>
>>> ---
>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
>>> arch/x86/entry/syscall_64.c |  2 +-
>>> 2 files changed, 5 insertions(+), 54 deletions(-)
>>> 
>>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>>> index 58dbf7a12a05..bbdfbdd817d6 100644
>>> --- a/arch/x86/entry/entry_64.S
>>> +++ b/arch/x86/entry/entry_64.S
>>> @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>>>        pushq   %r9                             /* pt_regs->r9 */
>>>        pushq   %r10                            /* pt_regs->r10 */
>>>        pushq   %r11                            /* pt_regs->r11 */
>>> -       sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
>>> +       sub     $(6*8), %rsp
>>> +       SAVE_EXTRA_REGS
>>> +
>> 
>> Continue using pushes here
>> 
>>>        UNWIND_HINT_REGS extra=0
>>> 
>>>        /*
>>> @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
>>>        ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
>>>        movq    %r10, %rcx
>>> 
>>> -       /*
>>> -        * This call instruction is handled specially in stub_ptregs_64.
>>> -        * It might end up jumping to the slow path.  If it jumps, RAX
>>> -        * and all argument registers are clobbered.
>>> -        */
>>> #ifdef CONFIG_RETPOLINE
>>>        movq    sys_call_table(, %rax, 8), %rax
>>>        call    __x86_indirect_thunk_rax
>>> @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
>>>        TRACE_IRQS_ON           /* user mode is traced as IRQs on */
>>>        movq    RIP(%rsp), %rcx
>>>        movq    EFLAGS(%rsp), %r11
>>> -       addq    $6*8, %rsp      /* skip extra regs -- they were preserved */
>>> -       UNWIND_HINT_EMPTY
>>> -       jmp     .Lpop_c_regs_except_rcx_r11_and_sysret
>>> +       jmp     syscall_return_via_sysret
>>> 
>>> 1:
>>>        /*
>>> @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
>>>         */
>>>        TRACE_IRQS_ON
>>>        ENABLE_INTERRUPTS(CLBR_ANY)
>>> -       SAVE_EXTRA_REGS
>>>        movq    %rsp, %rdi
>>>        call    syscall_return_slowpath /* returns with IRQs disabled */
>>>        jmp     return_from_SYSCALL_64
>>> 
>>> entry_SYSCALL64_slow_path:
>>>        /* IRQs are off. */
>>> -       SAVE_EXTRA_REGS
>>>        movq    %rsp, %rdi
>>>        call    do_syscall_64           /* returns with IRQs disabled */
>>> 
>>> @@ -389,7 +382,6 @@ syscall_return_via_sysret:
>>>        /* rcx and r11 are already restored (see code above) */
>>>        UNWIND_HINT_EMPTY
>>>        POP_EXTRA_REGS
>>> -.Lpop_c_regs_except_rcx_r11_and_sysret:
>>>        popq    %rsi    /* skip r11 */
>>>        popq    %r10
>>>        popq    %r9
>>> @@ -420,47 +412,6 @@ syscall_return_via_sysret:
>>>        USERGS_SYSRET64
>>> END(entry_SYSCALL_64)
>>> 
>>> -ENTRY(stub_ptregs_64)
>>> -       /*
>>> -        * Syscalls marked as needing ptregs land here.
>>> -        * If we are on the fast path, we need to save the extra regs,
>>> -        * which we achieve by trying again on the slow path.  If we are on
>>> -        * the slow path, the extra regs are already saved.
>>> -        *
>>> -        * RAX stores a pointer to the C function implementing the syscall.
>>> -        * IRQs are on.
>>> -        */
>>> -       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
>>> -       jne     1f
>>> -
>>> -       /*
>>> -        * Called from fast path -- disable IRQs again, pop return address
>>> -        * and jump to slow path
>>> -        */
>>> -       DISABLE_INTERRUPTS(CLBR_ANY)
>>> -       TRACE_IRQS_OFF
>>> -       popq    %rax
>>> -       UNWIND_HINT_REGS extra=0
>>> -       jmp     entry_SYSCALL64_slow_path
>>> -
>>> -1:
>>> -       JMP_NOSPEC %rax                         /* Called from C */
>>> -END(stub_ptregs_64)
>>> -
>>> -.macro ptregs_stub func
>>> -ENTRY(ptregs_\func)
>>> -       UNWIND_HINT_FUNC
>>> -       leaq    \func(%rip), %rax
>>> -       jmp     stub_ptregs_64
>>> -END(ptregs_\func)
>>> -.endm
>>> -
>>> -/* Instantiate ptregs_stub for each ptregs-using syscall */
>>> -#define __SYSCALL_64_QUAL_(sym)
>>> -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
>>> -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
>>> -#include <asm/syscalls_64.h>
>>> -
>> 
>> You can't just blindly remove this.  We need to make sure that
>> syscalls that modify registers take the slow path exit, because they
>> may change the registers to be incompatible with SYSRET.
> 
> That's a good point. I checked the ptregs calls:
> 
> iopl: should be fine, we will be restoring the correct IOPL through
>    SYSRET
> 
> clone/fork: fine too, the original return is fine and ret_from_fork
>            takes care of the child
> 
> execve et.al.: we will be leaking r11(rflags), rcx(orig return) into
>        the new process. but that seems acceptable.
> 
> rt_sigreturn:  that's the only one who has problems. I added a new
>            TIF_FULL_RESTORE to force it into the slow path.        
>     

So your series removes the old declarative annotation and then will add a new TI flag to make it work again?

This whole thing seems to be at the wrong end of the cost benefit curve.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  0:55       ` Andy Lutomirski
@ 2018-01-11  1:01         ` Andi Kleen
  2018-01-11  1:01         ` Brian Gerst
  1 sibling, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-11  1:01 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, Brian Gerst, Thomas Gleixner,
	the arch/x86 maintainers, Linux Kernel Mailing List,
	Linus Torvalds, David Woodhouse, Paul Turner, Andy Lutomirski,
	Peter Zijlstra, Tom Lendacky, Tim Chen, Greg Kroah-Hartman,
	Dave Hansen, Jiri Kosina, Andi Kleen

> So your series removes the old declarative annotation and then will add a new TI flag to make it work again?

The flag is a lot simpler than the previous assembler mess.

I thought the general trend in entry* was to move assembler to C?

> 
> This whole thing seems to be at the wrong end of the cost benefit curve.

But if you prefer assembler mess we can go back to it, no problem.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  0:55       ` Andy Lutomirski
  2018-01-11  1:01         ` Andi Kleen
@ 2018-01-11  1:01         ` Brian Gerst
  2018-01-11  1:22           ` Andy Lutomirski
  1 sibling, 1 reply; 34+ messages in thread
From: Brian Gerst @ 2018-01-11  1:01 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen

On Wed, Jan 10, 2018 at 7:55 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>
>
>> On Jan 10, 2018, at 4:16 PM, Andi Kleen <andi@firstfloor.org> wrote:
>>
>>> On Tue, Jan 09, 2018 at 09:46:16PM -0500, Brian Gerst wrote:
>>>> On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
>>>> From: Andi Kleen <ak@linux.intel.com>
>>>>
>>>> Remove the partial stack frame in the 64bit syscall fast path.
>>>> In the next patch we want to clear the extra registers, which requires
>>>> to always save all registers. So remove the partial stack frame
>>>> in the syscall fast path and always save everything.
>>>>
>>>> This actually simplifies the code because the ptregs stubs
>>>> are not needed anymore.
>>>>
>>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
>>>> arch/x86/entry/syscall_64.c |  2 +-
>>>>
>>>> Signed-off-by: Andi Kleen <ak@linux.intel.com>
>>>> ---
>>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
>>>> arch/x86/entry/syscall_64.c |  2 +-
>>>> 2 files changed, 5 insertions(+), 54 deletions(-)
>>>>
>>>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>>>> index 58dbf7a12a05..bbdfbdd817d6 100644
>>>> --- a/arch/x86/entry/entry_64.S
>>>> +++ b/arch/x86/entry/entry_64.S
>>>> @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>>>>        pushq   %r9                             /* pt_regs->r9 */
>>>>        pushq   %r10                            /* pt_regs->r10 */
>>>>        pushq   %r11                            /* pt_regs->r11 */
>>>> -       sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
>>>> +       sub     $(6*8), %rsp
>>>> +       SAVE_EXTRA_REGS
>>>> +
>>>
>>> Continue using pushes here
>>>
>>>>        UNWIND_HINT_REGS extra=0
>>>>
>>>>        /*
>>>> @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
>>>>        ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
>>>>        movq    %r10, %rcx
>>>>
>>>> -       /*
>>>> -        * This call instruction is handled specially in stub_ptregs_64.
>>>> -        * It might end up jumping to the slow path.  If it jumps, RAX
>>>> -        * and all argument registers are clobbered.
>>>> -        */
>>>> #ifdef CONFIG_RETPOLINE
>>>>        movq    sys_call_table(, %rax, 8), %rax
>>>>        call    __x86_indirect_thunk_rax
>>>> @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
>>>>        TRACE_IRQS_ON           /* user mode is traced as IRQs on */
>>>>        movq    RIP(%rsp), %rcx
>>>>        movq    EFLAGS(%rsp), %r11
>>>> -       addq    $6*8, %rsp      /* skip extra regs -- they were preserved */
>>>> -       UNWIND_HINT_EMPTY
>>>> -       jmp     .Lpop_c_regs_except_rcx_r11_and_sysret
>>>> +       jmp     syscall_return_via_sysret
>>>>
>>>> 1:
>>>>        /*
>>>> @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
>>>>         */
>>>>        TRACE_IRQS_ON
>>>>        ENABLE_INTERRUPTS(CLBR_ANY)
>>>> -       SAVE_EXTRA_REGS
>>>>        movq    %rsp, %rdi
>>>>        call    syscall_return_slowpath /* returns with IRQs disabled */
>>>>        jmp     return_from_SYSCALL_64
>>>>
>>>> entry_SYSCALL64_slow_path:
>>>>        /* IRQs are off. */
>>>> -       SAVE_EXTRA_REGS
>>>>        movq    %rsp, %rdi
>>>>        call    do_syscall_64           /* returns with IRQs disabled */
>>>>
>>>> @@ -389,7 +382,6 @@ syscall_return_via_sysret:
>>>>        /* rcx and r11 are already restored (see code above) */
>>>>        UNWIND_HINT_EMPTY
>>>>        POP_EXTRA_REGS
>>>> -.Lpop_c_regs_except_rcx_r11_and_sysret:
>>>>        popq    %rsi    /* skip r11 */
>>>>        popq    %r10
>>>>        popq    %r9
>>>> @@ -420,47 +412,6 @@ syscall_return_via_sysret:
>>>>        USERGS_SYSRET64
>>>> END(entry_SYSCALL_64)
>>>>
>>>> -ENTRY(stub_ptregs_64)
>>>> -       /*
>>>> -        * Syscalls marked as needing ptregs land here.
>>>> -        * If we are on the fast path, we need to save the extra regs,
>>>> -        * which we achieve by trying again on the slow path.  If we are on
>>>> -        * the slow path, the extra regs are already saved.
>>>> -        *
>>>> -        * RAX stores a pointer to the C function implementing the syscall.
>>>> -        * IRQs are on.
>>>> -        */
>>>> -       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
>>>> -       jne     1f
>>>> -
>>>> -       /*
>>>> -        * Called from fast path -- disable IRQs again, pop return address
>>>> -        * and jump to slow path
>>>> -        */
>>>> -       DISABLE_INTERRUPTS(CLBR_ANY)
>>>> -       TRACE_IRQS_OFF
>>>> -       popq    %rax
>>>> -       UNWIND_HINT_REGS extra=0
>>>> -       jmp     entry_SYSCALL64_slow_path
>>>> -
>>>> -1:
>>>> -       JMP_NOSPEC %rax                         /* Called from C */
>>>> -END(stub_ptregs_64)
>>>> -
>>>> -.macro ptregs_stub func
>>>> -ENTRY(ptregs_\func)
>>>> -       UNWIND_HINT_FUNC
>>>> -       leaq    \func(%rip), %rax
>>>> -       jmp     stub_ptregs_64
>>>> -END(ptregs_\func)
>>>> -.endm
>>>> -
>>>> -/* Instantiate ptregs_stub for each ptregs-using syscall */
>>>> -#define __SYSCALL_64_QUAL_(sym)
>>>> -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
>>>> -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
>>>> -#include <asm/syscalls_64.h>
>>>> -
>>>
>>> You can't just blindly remove this.  We need to make sure that
>>> syscalls that modify registers take the slow path exit, because they
>>> may change the registers to be incompatible with SYSRET.
>>
>> That's a good point. I checked the ptregs calls:
>>
>> iopl: should be fine, we will be restoring the correct IOPL through
>>    SYSRET
>>
>> clone/fork: fine too, the original return is fine and ret_from_fork
>>            takes care of the child
>>
>> execve et.al.: we will be leaking r11(rflags), rcx(orig return) into
>>        the new process. but that seems acceptable.
>>
>> rt_sigreturn:  that's the only one who has problems. I added a new
>>            TIF_FULL_RESTORE to force it into the slow path.
>>
>
> So your series removes the old declarative annotation and then will add a new TI flag to make it work again?
>
> This whole thing seems to be at the wrong end of the cost benefit curve.

We already check TIF flags after the syscall on the fast path.  Adding
another bit to the mask costs nothing.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  0:54       ` Brian Gerst
@ 2018-01-11  1:02         ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-11  1:02 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Andi Kleen, Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen

> > execve et.al.: we will be leaking r11(rflags), rcx(orig return) into
> >             the new process. but that seems acceptable.
> 
> We still need to check if we are loading a 32-bit binary.  That must
> return with IRET.

True. Will fix.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  1:01         ` Brian Gerst
@ 2018-01-11  1:22           ` Andy Lutomirski
  2018-01-11  1:47             ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Andy Lutomirski @ 2018-01-11  1:22 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Andi Kleen, Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen



> On Jan 10, 2018, at 5:01 PM, Brian Gerst <brgerst@gmail.com> wrote:
> 
>> On Wed, Jan 10, 2018 at 7:55 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> 
>> 
>>>> On Jan 10, 2018, at 4:16 PM, Andi Kleen <andi@firstfloor.org> wrote:
>>>> 
>>>>> On Tue, Jan 09, 2018 at 09:46:16PM -0500, Brian Gerst wrote:
>>>>> On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
>>>>> From: Andi Kleen <ak@linux.intel.com>
>>>>> 
>>>>> Remove the partial stack frame in the 64bit syscall fast path.
>>>>> In the next patch we want to clear the extra registers, which requires
>>>>> to always save all registers. So remove the partial stack frame
>>>>> in the syscall fast path and always save everything.
>>>>> 
>>>>> This actually simplifies the code because the ptregs stubs
>>>>> are not needed anymore.
>>>>> 
>>>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
>>>>> arch/x86/entry/syscall_64.c |  2 +-
>>>>> 
>>>>> Signed-off-by: Andi Kleen <ak@linux.intel.com>
>>>>> ---
>>>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
>>>>> arch/x86/entry/syscall_64.c |  2 +-
>>>>> 2 files changed, 5 insertions(+), 54 deletions(-)
>>>>> 
>>>>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>>>>> index 58dbf7a12a05..bbdfbdd817d6 100644
>>>>> --- a/arch/x86/entry/entry_64.S
>>>>> +++ b/arch/x86/entry/entry_64.S
>>>>> @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>>>>>       pushq   %r9                             /* pt_regs->r9 */
>>>>>       pushq   %r10                            /* pt_regs->r10 */
>>>>>       pushq   %r11                            /* pt_regs->r11 */
>>>>> -       sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
>>>>> +       sub     $(6*8), %rsp
>>>>> +       SAVE_EXTRA_REGS
>>>>> +
>>>> 
>>>> Continue using pushes here
>>>> 
>>>>>       UNWIND_HINT_REGS extra=0
>>>>> 
>>>>>       /*
>>>>> @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
>>>>>       ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
>>>>>       movq    %r10, %rcx
>>>>> 
>>>>> -       /*
>>>>> -        * This call instruction is handled specially in stub_ptregs_64.
>>>>> -        * It might end up jumping to the slow path.  If it jumps, RAX
>>>>> -        * and all argument registers are clobbered.
>>>>> -        */
>>>>> #ifdef CONFIG_RETPOLINE
>>>>>       movq    sys_call_table(, %rax, 8), %rax
>>>>>       call    __x86_indirect_thunk_rax
>>>>> @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
>>>>>       TRACE_IRQS_ON           /* user mode is traced as IRQs on */
>>>>>       movq    RIP(%rsp), %rcx
>>>>>       movq    EFLAGS(%rsp), %r11
>>>>> -       addq    $6*8, %rsp      /* skip extra regs -- they were preserved */
>>>>> -       UNWIND_HINT_EMPTY
>>>>> -       jmp     .Lpop_c_regs_except_rcx_r11_and_sysret
>>>>> +       jmp     syscall_return_via_sysret
>>>>> 
>>>>> 1:
>>>>>       /*
>>>>> @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
>>>>>        */
>>>>>       TRACE_IRQS_ON
>>>>>       ENABLE_INTERRUPTS(CLBR_ANY)
>>>>> -       SAVE_EXTRA_REGS
>>>>>       movq    %rsp, %rdi
>>>>>       call    syscall_return_slowpath /* returns with IRQs disabled */
>>>>>       jmp     return_from_SYSCALL_64
>>>>> 
>>>>> entry_SYSCALL64_slow_path:
>>>>>       /* IRQs are off. */
>>>>> -       SAVE_EXTRA_REGS
>>>>>       movq    %rsp, %rdi
>>>>>       call    do_syscall_64           /* returns with IRQs disabled */
>>>>> 
>>>>> @@ -389,7 +382,6 @@ syscall_return_via_sysret:
>>>>>       /* rcx and r11 are already restored (see code above) */
>>>>>       UNWIND_HINT_EMPTY
>>>>>       POP_EXTRA_REGS
>>>>> -.Lpop_c_regs_except_rcx_r11_and_sysret:
>>>>>       popq    %rsi    /* skip r11 */
>>>>>       popq    %r10
>>>>>       popq    %r9
>>>>> @@ -420,47 +412,6 @@ syscall_return_via_sysret:
>>>>>       USERGS_SYSRET64
>>>>> END(entry_SYSCALL_64)
>>>>> 
>>>>> -ENTRY(stub_ptregs_64)
>>>>> -       /*
>>>>> -        * Syscalls marked as needing ptregs land here.
>>>>> -        * If we are on the fast path, we need to save the extra regs,
>>>>> -        * which we achieve by trying again on the slow path.  If we are on
>>>>> -        * the slow path, the extra regs are already saved.
>>>>> -        *
>>>>> -        * RAX stores a pointer to the C function implementing the syscall.
>>>>> -        * IRQs are on.
>>>>> -        */
>>>>> -       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
>>>>> -       jne     1f
>>>>> -
>>>>> -       /*
>>>>> -        * Called from fast path -- disable IRQs again, pop return address
>>>>> -        * and jump to slow path
>>>>> -        */
>>>>> -       DISABLE_INTERRUPTS(CLBR_ANY)
>>>>> -       TRACE_IRQS_OFF
>>>>> -       popq    %rax
>>>>> -       UNWIND_HINT_REGS extra=0
>>>>> -       jmp     entry_SYSCALL64_slow_path
>>>>> -
>>>>> -1:
>>>>> -       JMP_NOSPEC %rax                         /* Called from C */
>>>>> -END(stub_ptregs_64)
>>>>> -
>>>>> -.macro ptregs_stub func
>>>>> -ENTRY(ptregs_\func)
>>>>> -       UNWIND_HINT_FUNC
>>>>> -       leaq    \func(%rip), %rax
>>>>> -       jmp     stub_ptregs_64
>>>>> -END(ptregs_\func)
>>>>> -.endm
>>>>> -
>>>>> -/* Instantiate ptregs_stub for each ptregs-using syscall */
>>>>> -#define __SYSCALL_64_QUAL_(sym)
>>>>> -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
>>>>> -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
>>>>> -#include <asm/syscalls_64.h>
>>>>> -
>>>> 
>>>> You can't just blindly remove this.  We need to make sure that
>>>> syscalls that modify registers take the slow path exit, because they
>>>> may change the registers to be incompatible with SYSRET.
>>> 
>>> That's a good point. I checked the ptregs calls:
>>> 
>>> iopl: should be fine, we will be restoring the correct IOPL through
>>>   SYSRET
>>> 
>>> clone/fork: fine too, the original return is fine and ret_from_fork
>>>           takes care of the child
>>> 
>>> execve et.al.: we will be leaking r11(rflags), rcx(orig return) into
>>>       the new process. but that seems acceptable.
>>> 
>>> rt_sigreturn:  that's the only one who has problems. I added a new
>>>           TIF_FULL_RESTORE to force it into the slow path.
>>> 
>> 
>> So your series removes the old declarative annotation and then will add a new TI flag to make it work again?
>> 
>> This whole thing seems to be at the wrong end of the cost benefit curve.
> 
> We already check TIF flags after the syscall on the fast path.  Adding
> another bit to the mask costs nothing.
> 

What I mean is: this whole series is almost certainly a performance regression, it has no off switch, and it doesn't obviously solve any problem.  It' didn't qualify as a so.  And no one has benchmarked it. I think we should seriously consider just not applying it.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  1:22           ` Andy Lutomirski
@ 2018-01-11  1:47             ` Andi Kleen
  2018-01-11 18:44               ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Andi Kleen @ 2018-01-11  1:47 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Brian Gerst, Andi Kleen, Thomas Gleixner,
	the arch/x86 maintainers, Linux Kernel Mailing List,
	Linus Torvalds, David Woodhouse, Paul Turner, Andy Lutomirski,
	Peter Zijlstra, Tom Lendacky, Tim Chen, Greg Kroah-Hartman,
	Dave Hansen, Jiri Kosina, Andi Kleen

> What I mean is: this whole series is almost certainly a performance regression, it has no off switch, and it doesn't obviously solve any problem.  It' didn't qualify as a so.  And no one has benchmarked it. I think we should seriously consider just not applying it.

Well it's kernel hardening to guard against possible future speculation
attacks. Linus discussed it here for example:

https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1580667.html

For the on/off switch I can add a CONFIG to enable it, even though
it seems somewhat silly.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-10  1:03 ` [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call Andi Kleen
  2018-01-10  2:46   ` Brian Gerst
@ 2018-01-11  2:09   ` Josh Poimboeuf
  2018-01-12  3:22   ` Josh Poimboeuf
  2 siblings, 0 replies; 34+ messages in thread
From: Josh Poimboeuf @ 2018-01-11  2:09 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

On Tue, Jan 09, 2018 at 05:03:21PM -0800, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> Remove the partial stack frame in the 64bit syscall fast path.
> In the next patch we want to clear the extra registers, which requires
> to always save all registers. So remove the partial stack frame
> in the syscall fast path and always save everything.
> 
> This actually simplifies the code because the ptregs stubs
> are not needed anymore.
> 
> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
> arch/x86/entry/syscall_64.c |  2 +-
> 
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------
>  arch/x86/entry/syscall_64.c |  2 +-
>  2 files changed, 5 insertions(+), 54 deletions(-)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 58dbf7a12a05..bbdfbdd817d6 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>  	pushq	%r9				/* pt_regs->r9 */
>  	pushq	%r10				/* pt_regs->r10 */
>  	pushq	%r11				/* pt_regs->r11 */
> -	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
> +	sub	$(6*8), %rsp
> +	SAVE_EXTRA_REGS
> +
>  	UNWIND_HINT_REGS extra=0

Now that the extra regs are being saved, the "extra=0" can be removed
from the unwind hint.

-- 
Josh

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL
  2018-01-10  1:03 ` [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL Andi Kleen
@ 2018-01-11  3:35   ` Brian Gerst
  2018-01-11 18:47     ` Andi Kleen
  2018-01-12  3:45   ` Josh Poimboeuf
  1 sibling, 1 reply; 34+ messages in thread
From: Brian Gerst @ 2018-01-11  3:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen

On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen <andi@firstfloor.org> wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> We clear all the non argument registers for 64bit SYSCALLs
> to minimize any risk of bad speculation using user values.
>
> So far unused argument registers still leak. To be addressed
> in future patches.
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/entry/entry_64.S | 9 +++++++++
>  1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index bbdfbdd817d6..632081fd7086 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -236,6 +236,14 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>         pushq   %r11                            /* pt_regs->r11 */
>         sub     $(6*8), %rsp
>         SAVE_EXTRA_REGS
> +       /* Sanitize registers against speculation attacks */
> +       /* r10 is cleared later, arguments are handled in san_args* */
> +       CLEAR_R11_TO_R15

Don't need to explicitly clear R11 here.  It is clobbered with current_task.

> +#ifndef CONFIG_FRAME_POINTER
> +       xor     %ebp, %ebp
> +#endif
> +       xor     %ebx, %ebx
> +       xor     %ecx, %ecx
>
>         UNWIND_HINT_REGS extra=0
>
> @@ -263,6 +271,7 @@ entry_SYSCALL_64_fastpath:
>  #endif
>         ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
>         movq    %r10, %rcx
> +       xor     %r10, %r10

RCX is already clear, so xchgq %r10, %rcx will be simpler.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-11  1:47             ` Andi Kleen
@ 2018-01-11 18:44               ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-11 18:44 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andy Lutomirski, Brian Gerst, Thomas Gleixner,
	the arch/x86 maintainers, Linux Kernel Mailing List,
	Linus Torvalds, David Woodhouse, Paul Turner, Andy Lutomirski,
	Peter Zijlstra, Tom Lendacky, Tim Chen, Greg Kroah-Hartman,
	Dave Hansen, Jiri Kosina, Andi Kleen

> Well it's kernel hardening to guard against possible future speculation
> attacks. Linus discussed it here for example:
> 
> https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1580667.html
> 
> For the on/off switch I can add a CONFIG to enable it, even though
> it seems somewhat silly.

I did some micro benchmarking now, sampling different real system
calls.

For the entry code (entry to call) I get on average 62 cycles 
for the old code, vs 78 cycles with clear regs and full
stack frame saving on Skylake.

So it's roughly a 20 cycle difference, if we include the restore.

I would conclude 20 cycles are not significant for a syscall,
so there's not a lot of motivation to add a switch
for less security.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL
  2018-01-11  3:35   ` Brian Gerst
@ 2018-01-11 18:47     ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-11 18:47 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Andi Kleen, Thomas Gleixner, the arch/x86 maintainers,
	Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
	Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
	Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina,
	Andi Kleen

On Wed, Jan 10, 2018 at 10:35:58PM -0500, Brian Gerst wrote:
> > @@ -263,6 +271,7 @@ entry_SYSCALL_64_fastpath:
> >  #endif
> >         ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
> >         movq    %r10, %rcx
> > +       xor     %r10, %r10
> 
> RCX is already clear, so xchgq %r10, %rcx will be simpler.

XOR is special cased by the hardware, so it's always more
efficient.

-Andi
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 2/8] x86/entry/clearregs: Add infrastructure to clear registers
  2018-01-10  1:03 ` [PATCH v1 2/8] x86/entry/clearregs: Add infrastructure to clear registers Andi Kleen
@ 2018-01-11 19:58   ` Konrad Rzeszutek Wilk
  2018-01-11 20:10     ` Andi Kleen
  0 siblings, 1 reply; 34+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-01-11 19:58 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

On Tue, Jan 09, 2018 at 05:03:22PM -0800, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> Add 64bit assembler macros to clear registers on kernel entry.
> Used in followon patches.
> 
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/entry/calling.h | 28 ++++++++++++++++++++++++++++
>  1 file changed, 28 insertions(+)
> 
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index 45a63e00a6af..9444e7623185 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -172,6 +172,34 @@ For 32-bit we have the following conventions - kernel is built with
>  	.byte 0xf1
>  	.endm
>  
> +	.macro CLEAR_R11_TO_R15
> +	xorq %r15, %r15
> +	xorq %r14, %r14
> +	xorq %r13, %r13
> +	xorq %r12, %r12
> +	xorq %r11, %r11
> +	.endm
> +
> +	.macro CLEAR_R8_TO_R15
> +	CLEAR_R11_TO_R15
> +	xorq %r10, %r10
> +	xorq %r9, %r9
> +	xorq %r8, %r8
> +	.endm
> +
> +	.macro CLEAR_ALL_REGS
> +	CLEAR_R8_TO_R15
> +	xorl %eax, %eax
> +	xorl %ebx, %ebx

How come you use xorl vs xorq?

> +	xorl %ecx, %ecx
> +	xorl %edx, %edx
> +	xorl %esi, %esi
> +	xorl %edi, %edi
> +#ifndef CONFIG_FRAME_POINTER
> +	xorl %ebp, %ebp
> +#endif
> +	.endm
> +
>  /*
>   * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
>   * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
> -- 
> 2.14.3
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 2/8] x86/entry/clearregs: Add infrastructure to clear registers
  2018-01-11 19:58   ` Konrad Rzeszutek Wilk
@ 2018-01-11 20:10     ` Andi Kleen
  0 siblings, 0 replies; 34+ messages in thread
From: Andi Kleen @ 2018-01-11 20:10 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Andi Kleen, tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto,
	peterz, thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos

> > +	.macro CLEAR_ALL_REGS
> > +	CLEAR_R8_TO_R15
> > +	xorl %eax, %eax
> > +	xorl %ebx, %ebx
> 
> How come you use xorl vs xorq?

In 64bit mode a write to a 32bit register always zero extends to the
full 64bit register, so xorl clears the whole register, and it is one
byte shorter because it doesn't need a REX prefix.

-Andi

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
  2018-01-10  1:03 ` [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call Andi Kleen
  2018-01-10  2:46   ` Brian Gerst
  2018-01-11  2:09   ` Josh Poimboeuf
@ 2018-01-12  3:22   ` Josh Poimboeuf
  2 siblings, 0 replies; 34+ messages in thread
From: Josh Poimboeuf @ 2018-01-12  3:22 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

On Tue, Jan 09, 2018 at 05:03:21PM -0800, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> Remove the partial stack frame in the 64bit syscall fast path.
> In the next patch we want to clear the extra registers, which requires
> to always save all registers. So remove the partial stack frame
> in the syscall fast path and always save everything.
> 
> This actually simplifies the code because the ptregs stubs
> are not needed anymore.
> 
> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
> arch/x86/entry/syscall_64.c |  2 +-

This diffstat doesn't need to be in the changelog.

-- 
Josh

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL
  2018-01-10  1:03 ` [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL Andi Kleen
  2018-01-11  3:35   ` Brian Gerst
@ 2018-01-12  3:45   ` Josh Poimboeuf
  1 sibling, 0 replies; 34+ messages in thread
From: Josh Poimboeuf @ 2018-01-12  3:45 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, x86, linux-kernel, torvalds, dwmw, pjt, luto, peterz,
	thomas.lendacky, tim.c.chen, gregkh, dave.hansen, jikos,
	Andi Kleen

On Tue, Jan 09, 2018 at 05:03:23PM -0800, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> We clear all the non-argument registers for 64-bit SYSCALLs
> to minimize any risk of bad speculation using user values.
> 
> So far, unused argument registers still leak. This is to be addressed
> in future patches.
> 
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  arch/x86/entry/entry_64.S | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index bbdfbdd817d6..632081fd7086 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -236,6 +236,14 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>  	pushq	%r11				/* pt_regs->r11 */
>  	sub	$(6*8), %rsp
>  	SAVE_EXTRA_REGS
> +	/* Sanitize registers against speculation attacks */

This comment isn't necessary, though it would be good to add comments
above the CLEAR macros themselves explaining why they're needed.

> +	/* r10 is cleared later, arguments are handled in san_args* */

What is san_args?

> +	CLEAR_R11_TO_R15
> +#ifndef CONFIG_FRAME_POINTER
> +	xor	%ebp, %ebp
> +#endif

Why is %rbp not cleared with CONFIG_FRAME_POINTER?  Is it because it
will get clobbered by the first called function?

> +	xor	%ebx, %ebx
> +	xor	%ecx, %ecx

I think clearing %ecx isn't needed: it gets clobbered below for the fast
path, and gets clobbered by do_syscall_64() for the slow path.

>  
>  	UNWIND_HINT_REGS extra=0
>  
> @@ -263,6 +271,7 @@ entry_SYSCALL_64_fastpath:
>  #endif
>  	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
>  	movq	%r10, %rcx
> +	xor	%r10, %r10
>  
>  #ifdef CONFIG_RETPOLINE
>  	movq	sys_call_table(, %rax, 8), %rax

Now that the fast path is getting slower, I wonder if it still makes
sense to have a "fast path"?  It would be good to see measurements
comparing the fast and slow paths.

-- 
Josh

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2018-01-12  3:45 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-10  1:03 x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andi Kleen
2018-01-10  1:03 ` [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call Andi Kleen
2018-01-10  2:46   ` Brian Gerst
2018-01-11  0:16     ` Andi Kleen
2018-01-11  0:54       ` Brian Gerst
2018-01-11  1:02         ` Andi Kleen
2018-01-11  0:55       ` Andy Lutomirski
2018-01-11  1:01         ` Andi Kleen
2018-01-11  1:01         ` Brian Gerst
2018-01-11  1:22           ` Andy Lutomirski
2018-01-11  1:47             ` Andi Kleen
2018-01-11 18:44               ` Andi Kleen
2018-01-11  2:09   ` Josh Poimboeuf
2018-01-12  3:22   ` Josh Poimboeuf
2018-01-10  1:03 ` [PATCH v1 2/8] x86/entry/clearregs: Add infrastructure to clear registers Andi Kleen
2018-01-11 19:58   ` Konrad Rzeszutek Wilk
2018-01-11 20:10     ` Andi Kleen
2018-01-10  1:03 ` [PATCH v1 3/8] x86/entry/clearregs: Clear registers for 64bit SYSCALL Andi Kleen
2018-01-11  3:35   ` Brian Gerst
2018-01-11 18:47     ` Andi Kleen
2018-01-12  3:45   ` Josh Poimboeuf
2018-01-10  1:03 ` [PATCH v1 4/8] x86/entry/retpoline: Clear extra registers for compat syscalls Andi Kleen
2018-01-10  1:03 ` [PATCH v1 5/8] x86/entry/clearregs: Clear registers for 64bit exceptions/interrupts Andi Kleen
2018-01-10  1:23   ` Andy Lutomirski
2018-01-10  1:03 ` [PATCH v1 6/8] x86/entry/clearregs: Add number of arguments to syscall tables Andi Kleen
2018-01-10  1:26   ` Andy Lutomirski
2018-01-10  4:37     ` Andi Kleen
2018-01-10 20:05       ` Andy Lutomirski
2018-01-10  1:03 ` [PATCH v1 7/8] x86/entry/clearregs: Add 64bit stubs to clear unused arguments regs Andi Kleen
2018-01-10  1:03 ` [PATCH v1 8/8] x86/entry/clearregs: Clear registers for 32bit kernel Andi Kleen
2018-01-10  1:24   ` Andy Lutomirski
2018-01-10  1:16 ` x86/clearregs: Register sanitizing at kernel entry for speculation hygiene Andy Lutomirski
2018-01-10  1:34   ` Andi Kleen
2018-01-10  1:39     ` Andy Lutomirski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).