* [PATCH 1/5] x86: entry_64.S: delete unused code
From: Denys Vlasenko @ 2014-08-01 14:48 UTC
  To: linux-kernel
  Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

A define (SWFRAME), two macros (FAKE_STACK_FRAME and UNFAKE_STACK_FRAME),
and an unreferenced bit of assembly (ptregscall_common) are gone.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/calling.h |  1 -
 arch/x86/kernel/entry_64.S     | 34 ----------------------------------
 2 files changed, 35 deletions(-)

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73b..e176cea 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
 #define SS		160
 
 #define ARGOFFSET	R11
-#define SWFRAME		ORIG_RAX
 
 	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
 	subq  $9*8+\addskip, %rsp
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b25ca96..dbfd037 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,27 +155,6 @@ ENDPROC(native_usergs_sysret64)
 	movq \tmp,R11+\offset(%rsp)
 	.endm
 
-	.macro FAKE_STACK_FRAME child_rip
-	/* push in order ss, rsp, eflags, cs, rip */
-	xorl %eax, %eax
-	pushq_cfi $__KERNEL_DS /* ss */
-	/*CFI_REL_OFFSET	ss,0*/
-	pushq_cfi %rax /* rsp */
-	CFI_REL_OFFSET	rsp,0
-	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
-	/*CFI_REL_OFFSET	rflags,0*/
-	pushq_cfi $__KERNEL_CS /* cs */
-	/*CFI_REL_OFFSET	cs,0*/
-	pushq_cfi \child_rip /* rip */
-	CFI_REL_OFFSET	rip,0
-	pushq_cfi %rax /* orig rax */
-	.endm
-
-	.macro UNFAKE_STACK_FRAME
-	addq $8*6, %rsp
-	CFI_ADJUST_CFA_OFFSET	-(6*8)
-	.endm
-
 /*
  * initial frame state for interrupts (and exceptions without error code)
  */
@@ -640,19 +619,6 @@ END(\label)
 	FORK_LIKE  vfork
 	FIXED_FRAME stub_iopl, sys_iopl
 
-ENTRY(ptregscall_common)
-	DEFAULT_FRAME 1 8	/* offset 8: return address */
-	RESTORE_TOP_OF_STACK %r11, 8
-	movq_cfi_restore R15+8, r15
-	movq_cfi_restore R14+8, r14
-	movq_cfi_restore R13+8, r13
-	movq_cfi_restore R12+8, r12
-	movq_cfi_restore RBP+8, rbp
-	movq_cfi_restore RBX+8, rbx
-	ret $REST_SKIP		/* pop extended registers */
-	CFI_ENDPROC
-END(ptregscall_common)
-
 ENTRY(stub_execve)
 	CFI_STARTPROC
 	addq $8, %rsp
-- 
1.8.1.4



* [PATCH 2/5] x86: open-code register save/restore in trace_hardirqs thunks
From: Denys Vlasenko @ 2014-08-01 14:48 UTC
  To: linux-kernel
  Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

This is a preparatory patch for a change in "struct pt_regs"
handling in entry_64.S.

The trace_hardirqs thunks were (ab)using a part of the pt_regs
handling code, namely the SAVE_ARGS/RESTORE_ARGS macros,
to save/restore registers across C function calls.

Since SAVE_ARGS is going to be changed, open-code the
register saving/restoring here.

Incidentally, this removes a bit of dead code:
one SAVE_ARGS invocation existed only to emit CFI annotations,
but it also generated unreachable assembly instructions.
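
For reference, the movq_cfi/movq_cfi_restore helpers used below expand
roughly as sketched here (an approximation of the dwarf2.h definitions;
see arch/x86/include/asm/dwarf2.h for the authoritative bodies):

	.macro movq_cfi reg offset=0
	movq %\reg, \offset(%rsp)	/* spill the register to the stack */
	CFI_REL_OFFSET \reg, \offset	/* tell the unwinder where it went */
	.endm

	.macro movq_cfi_restore offset reg
	movq \offset(%rsp), %\reg	/* reload the register from the stack */
	CFI_RESTORE \reg		/* register holds its own value again */
	.endm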

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
 arch/x86/lib/thunk_64.S | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 92d9fea..c1c9131 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -16,10 +16,20 @@
 \name:
 	CFI_STARTPROC
 
-	/* this one pushes 9 elems, the next one would be %rIP */
-	SAVE_ARGS
+	subq  $9*8, %rsp
+	CFI_ADJUST_CFA_OFFSET 9*8
+	movq_cfi rdi, 8*8
+	movq_cfi rsi, 7*8
+	movq_cfi rdx, 6*8
+	movq_cfi rcx, 5*8
+	movq_cfi rax, 4*8
+	movq_cfi r8,  3*8
+	movq_cfi r9,  2*8
+	movq_cfi r10, 1*8
+	movq_cfi r11, 0*8
 
 	.if \put_ret_addr_in_rdi
+	/* 9*8(%rsp) is return addr on stack */
 	movq_cfi_restore 9*8, rdi
 	.endif
 
@@ -38,11 +48,20 @@
 	THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
 #endif
 
-	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
 	CFI_STARTPROC
-	SAVE_ARGS
+	CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-	RESTORE_ARGS
+	movq_cfi_restore 0*8, r11
+	movq_cfi_restore 1*8, r10
+	movq_cfi_restore 2*8, r9
+	movq_cfi_restore 3*8, r8
+	movq_cfi_restore 4*8, rax
+	movq_cfi_restore 5*8, rcx
+	movq_cfi_restore 6*8, rdx
+	movq_cfi_restore 7*8, rsi
+	movq_cfi_restore 8*8, rdi
+	addq 9*8, %rsp
+	CFI_ADJUST_CFA_OFFSET -9*8
 	ret
 	CFI_ENDPROC
 	_ASM_NOKPROBE(restore)
-- 
1.8.1.4



* [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user
From: Denys Vlasenko @ 2014-08-01 14:48 UTC
  To: linux-kernel
  Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

No code changes.

This is a preparatory patch for a change in "struct pt_regs" handling.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/entry_64.S | 88 ++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index dbfd037..37f7d95 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -218,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
 	CFI_REL_OFFSET r15, R15+\offset
 	.endm
 
-/* save partial stack frame */
-	.macro SAVE_ARGS_IRQ
-	cld
-	/* start from rbp in pt_regs and jump over */
-	movq_cfi rdi, (RDI-RBP)
-	movq_cfi rsi, (RSI-RBP)
-	movq_cfi rdx, (RDX-RBP)
-	movq_cfi rcx, (RCX-RBP)
-	movq_cfi rax, (RAX-RBP)
-	movq_cfi  r8,  (R8-RBP)
-	movq_cfi  r9,  (R9-RBP)
-	movq_cfi r10, (R10-RBP)
-	movq_cfi r11, (R11-RBP)
-
-	/* Save rbp so that we can unwind from get_irq_regs() */
-	movq_cfi rbp, 0
-
-	/* Save previous stack value */
-	movq %rsp, %rsi
-
-	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
-	testl $3, CS-RBP(%rsi)
-	je 1f
-	SWAPGS
-	/*
-	 * irq_count is used to check if a CPU is already on an interrupt stack
-	 * or not. While this is essentially redundant with preempt_count it is
-	 * a little cheaper to use a separate counter in the PDA (short of
-	 * moving irq_enter into assembly, which would be too much work)
-	 */
-1:	incl PER_CPU_VAR(irq_count)
-	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
-	CFI_DEF_CFA_REGISTER	rsi
-
-	/* Store previous stack value */
-	pushq %rsi
-	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
-			0x77 /* DW_OP_breg7 */, 0, \
-			0x06 /* DW_OP_deref */, \
-			0x08 /* DW_OP_const1u */, SS+8-RBP, \
-			0x22 /* DW_OP_plus */
-	/* We entered an interrupt context - irqs are off: */
-	TRACE_IRQS_OFF
-	.endm
-
 ENTRY(save_paranoid)
 	XCPT_FRAME 1 RDI+8
 	cld
@@ -731,7 +686,48 @@ END(interrupt)
 	/* reserve pt_regs for scratch regs and rbp */
 	subq $ORIG_RAX-RBP, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
-	SAVE_ARGS_IRQ
+	cld
+	/* start from rbp in pt_regs and jump over */
+	movq_cfi rdi, (RDI-RBP)
+	movq_cfi rsi, (RSI-RBP)
+	movq_cfi rdx, (RDX-RBP)
+	movq_cfi rcx, (RCX-RBP)
+	movq_cfi rax, (RAX-RBP)
+	movq_cfi  r8,  (R8-RBP)
+	movq_cfi  r9,  (R9-RBP)
+	movq_cfi r10, (R10-RBP)
+	movq_cfi r11, (R11-RBP)
+
+	/* Save rbp so that we can unwind from get_irq_regs() */
+	movq_cfi rbp, 0
+
+	/* Save previous stack value */
+	movq %rsp, %rsi
+
+	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
+	testl $3, CS-RBP(%rsi)
+	je 1f
+	SWAPGS
+	/*
+	 * irq_count is used to check if a CPU is already on an interrupt stack
+	 * or not. While this is essentially redundant with preempt_count it is
+	 * a little cheaper to use a separate counter in the PDA (short of
+	 * moving irq_enter into assembly, which would be too much work)
+	 */
+1:	incl PER_CPU_VAR(irq_count)
+	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+	CFI_DEF_CFA_REGISTER	rsi
+
+	/* Store previous stack value */
+	pushq %rsi
+	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
+			0x77 /* DW_OP_breg7 */, 0, \
+			0x06 /* DW_OP_deref */, \
+			0x08 /* DW_OP_const1u */, SS+8-RBP, \
+			0x22 /* DW_OP_plus */
+	/* We entered an interrupt context - irqs are off: */
+	TRACE_IRQS_OFF
+
 	call \func
 	.endm
 
-- 
1.8.1.4



* [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
From: Denys Vlasenko @ 2014-08-01 14:48 UTC
  To: linux-kernel
  Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

64-bit code was using six fewer stack slots by not saving/restoring
registers which are callee-preserved according to the C ABI,
and not allocating space for them.

Only when a syscall needed a complete "struct pt_regs" was
the complete area allocated and filled in.

This proved to be a source of significant obfuscation and subtle bugs.
For example, stub_fork had to pop the return address,
extend the struct, save registers, and push the return address back. Ugly.
ia32_ptregs_common pops the return address and "returns" via a jmp insn,
throwing a wrench into the CPU's return stack cache.

This patch changes code to always allocate a complete "struct pt_regs".
The saving of registers is still done lazily.

Macros which manipulate "struct pt_regs" on stack are reworked:
ALLOC_PTREGS_ON_STACK allocates the structure.
SAVE_C_REGS saves to it those registers which are clobbered by C code.
SAVE_EXTRA_REGS saves to it all other registers.
Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.
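
For illustration, a minimal sketch of how an entry path composes the
new macros ("handle_event" is a hypothetical C handler; real call
sites appear in the diff below, e.g. in preempt.S):

	ALLOC_PTREGS_ON_STACK		/* subq $15*8, %rsp: room for pt_regs */
	SAVE_C_REGS			/* spill the callee-clobbered registers */
	SAVE_EXTRA_REGS			/* spill the callee-preserved ones too */
	movq	%rsp, %rdi		/* arg1: pointer to the new pt_regs */
	call	handle_event
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PTREGS_FROM_STACK	/* addq $15*8, %rsp */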

ia32_ptregs_common, stub_fork, and friends lost their ugly dance with
the return pointer.

LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
instead of magic numbers.

Misleading and slightly wrong comments in "struct pt_regs" are fixed
(four instances).

Patch was run-tested: 64-bit executables, 32-bit executables,
strace works.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
 arch/x86/ia32/ia32entry.S              |  47 +++----
 arch/x86/include/asm/calling.h         | 224 ++++++++++++++++-----------------
 arch/x86/include/asm/irqflags.h        |   4 +-
 arch/x86/include/asm/ptrace.h          |  13 +-
 arch/x86/include/uapi/asm/ptrace-abi.h |  16 ++-
 arch/x86/include/uapi/asm/ptrace.h     |  13 +-
 arch/x86/kernel/entry_64.S             | 132 ++++++++-----------
 arch/x86/kernel/preempt.S              |  16 ++-
 8 files changed, 232 insertions(+), 233 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 4299eb0..ef9ee16 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -62,12 +62,12 @@
 	 */
 	.macro LOAD_ARGS32 offset, _r9=0
 	.if \_r9
-	movl \offset+16(%rsp),%r9d
+	movl \offset+R9(%rsp),%r9d
 	.endif
-	movl \offset+40(%rsp),%ecx
-	movl \offset+48(%rsp),%edx
-	movl \offset+56(%rsp),%esi
-	movl \offset+64(%rsp),%edi
+	movl \offset+RCX(%rsp),%ecx
+	movl \offset+RDX(%rsp),%edx
+	movl \offset+RSI(%rsp),%esi
+	movl \offset+RDI(%rsp),%edi
 	movl %eax,%eax			/* zero extension */
 	.endm
 	
@@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
 	CFI_REL_OFFSET rip,0
 	pushq_cfi %rax
 	cld
-	SAVE_ARGS 0,1,0
+	ALLOC_PTREGS_ON_STACK
+	SAVE_C_REGS_EXCEPT_R891011
  	/* no need to do an access_ok check here because rbp has been
  	   32bit zero extended */ 
 	ASM_STAC
@@ -172,7 +173,8 @@ sysexit_from_sys_call:
 	andl  $~0x200,EFLAGS-R11(%rsp) 
 	movl	RIP-R11(%rsp),%edx		/* User %eip */
 	CFI_REGISTER rip,rdx
-	RESTORE_ARGS 0,24,0,0,0,0
+	RESTORE_RSI_RDI
+	REMOVE_PTREGS_FROM_STACK 8*3
 	xorq	%r8,%r8
 	xorq	%r9,%r9
 	xorq	%r10,%r10
@@ -240,13 +242,13 @@ sysenter_tracesys:
 	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz	sysenter_auditsys
 #endif
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 	jmp	sysenter_do_call
@@ -288,7 +290,8 @@ ENTRY(ia32_cstar_target)
 	 * disabled irqs and here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0,0
+	ALLOC_PTREGS_ON_STACK 8
+	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
 	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
 	movq	%rcx,RIP-ARGOFFSET(%rsp)
@@ -325,7 +328,7 @@ cstar_dispatch:
 	jnz sysretl_audit
 sysretl_from_sys_call:
 	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	RESTORE_ARGS 0,-ARG_SKIP,0,0,0
+	RESTORE_RSI_RDI_RDX
 	movl RIP-ARGOFFSET(%rsp),%ecx
 	CFI_REGISTER rip,rcx
 	movl EFLAGS-ARGOFFSET(%rsp),%r11d	
@@ -356,13 +359,13 @@ cstar_tracesys:
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS 0, r9
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -417,7 +420,8 @@ ENTRY(ia32_syscall)
 	cld
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
-	SAVE_ARGS 0,1,0
+	ALLOC_PTREGS_ON_STACK
+	SAVE_C_REGS_EXCEPT_R891011
 	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz ia32_tracesys
@@ -430,16 +434,16 @@ ia32_sysret:
 	movq %rax,RAX-ARGOFFSET(%rsp)
 ia32_ret_from_sys_call:
 	CLEAR_RREGS -ARGOFFSET
-	jmp int_ret_from_sys_call 
+	jmp int_ret_from_sys_call
 
-ia32_tracesys:			 
-	SAVE_REST
+ia32_tracesys:
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
 	jmp ia32_do_call
@@ -475,7 +479,6 @@ GLOBAL(stub32_clone)
 
 	ALIGN
 ia32_ptregs_common:
-	popq %r11
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
@@ -490,9 +493,9 @@ ia32_ptregs_common:
 /*	CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
 	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
 /*	CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
-	SAVE_REST
+	SAVE_EXTRA_REGS 8
 	call *%rax
-	RESTORE_REST
-	jmp  ia32_sysret	/* misbalances the return cache */
+	RESTORE_EXTRA_REGS 8
+	ret
 	CFI_ENDPROC
 END(ia32_ptregs_common)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index e176cea..10aff1e 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -52,142 +52,132 @@ For 32-bit we have the following conventions - kernel is built with
 
 /*
  * 64-bit system call stack frame layout defines and helpers,
- * for assembly code:
+ * for assembly code.
  */
 
-#define R15		  0
-#define R14		  8
-#define R13		 16
-#define R12		 24
-#define RBP		 32
-#define RBX		 40
-
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11		 48
-#define R10		 56
-#define R9		 64
-#define R8		 72
-#define RAX		 80
-#define RCX		 88
-#define RDX		 96
-#define RSI		104
-#define RDI		112
-#define ORIG_RAX	120       /* + error_code */
-/* end of arguments */
-
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP		128
-#define CS		136
-#define EFLAGS		144
-#define RSP		152
-#define SS		160
-
-#define ARGOFFSET	R11
-
-	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
-	subq  $9*8+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
-	movq_cfi rdi, 8*8
-	movq_cfi rsi, 7*8
-	movq_cfi rdx, 6*8
-
-	.if \save_rcx
-	movq_cfi rcx, 5*8
-	.endif
-
-	movq_cfi rax, 4*8
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15		0*8
+#define R14		1*8
+#define R13		2*8
+#define R12		3*8
+#define RBP		4*8
+#define RBX		5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11		6*8
+#define R10		7*8
+#define R9		8*8
+#define R8		9*8
+#define RAX		10*8
+#define RCX		11*8
+#define RDX		12*8
+#define RSI		13*8
+#define RDI		14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX	15*8
+/* Return frame for iretq */
+#define RIP		16*8
+#define CS		17*8
+#define EFLAGS		18*8
+#define RSP		19*8
+#define SS		20*8
+
+#define ARGOFFSET	0
+
+	.macro ALLOC_PTREGS_ON_STACK addskip=0
+	subq	$15*8+\addskip, %rsp
+	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+	.endm
 
-	.if \save_r891011
-	movq_cfi r8,  3*8
-	movq_cfi r9,  2*8
-	movq_cfi r10, 1*8
-	movq_cfi r11, 0*8
+	.macro SAVE_C_REGS_HELPER rcx=1 r8plus=1
+	movq_cfi rdi, 14*8
+	movq_cfi rsi, 13*8
+	movq_cfi rdx, 12*8
+	.if \rcx
+	movq_cfi rcx, 11*8
 	.endif
-
+	movq_cfi rax, 10*8
+	.if \r8plus
+	movq_cfi r8,  9*8
+	movq_cfi r9,  8*8
+	movq_cfi r10, 7*8
+	movq_cfi r11, 6*8
+	.endif
+	.endm
+	.macro SAVE_C_REGS
+	SAVE_C_REGS_HELPER 1, 1
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_R891011
+	SAVE_C_REGS_HELPER 1, 0
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RCX_R891011
+	SAVE_C_REGS_HELPER 0, 0
 	.endm
 
-#define ARG_SKIP	(9*8)
+	.macro SAVE_EXTRA_REGS offset=0
+	movq_cfi rbx, 5*8+\offset
+	movq_cfi rbp, 4*8+\offset
+	movq_cfi r12, 3*8+\offset
+	movq_cfi r13, 2*8+\offset
+	movq_cfi r14, 1*8+\offset
+	movq_cfi r15, 0*8+\offset
+	.endm
 
-	.macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
-			    rstor_r8910=1, rstor_rdx=1
-	.if \rstor_r11
-	movq_cfi_restore 0*8, r11
-	.endif
+	.macro RESTORE_EXTRA_REGS offset=0
+	movq_cfi_restore 0*8+\offset, r15
+	movq_cfi_restore 1*8+\offset, r14
+	movq_cfi_restore 2*8+\offset, r13
+	movq_cfi_restore 3*8+\offset, r12
+	movq_cfi_restore 4*8+\offset, rbp
+	movq_cfi_restore 5*8+\offset, rbx
+	.endm
 
-	.if \rstor_r8910
-	movq_cfi_restore 1*8, r10
-	movq_cfi_restore 2*8, r9
-	movq_cfi_restore 3*8, r8
+	.macro RESTORE_C_REGS_HELPER rax=1, rcx=1, r11=1, r8910=1, rdx=1
+	.if \r11
+	movq_cfi_restore 6*8, r11
 	.endif
-
-	.if \rstor_rax
-	movq_cfi_restore 4*8, rax
+	.if \r8910
+	movq_cfi_restore 7*8, r10
+	movq_cfi_restore 8*8, r9
+	movq_cfi_restore 9*8, r8
 	.endif
-
-	.if \rstor_rcx
-	movq_cfi_restore 5*8, rcx
+	.if \rax
+	movq_cfi_restore 10*8, rax
 	.endif
-
-	.if \rstor_rdx
-	movq_cfi_restore 6*8, rdx
+	.if \rcx
+	movq_cfi_restore 11*8, rcx
 	.endif
-
-	movq_cfi_restore 7*8, rsi
-	movq_cfi_restore 8*8, rdi
-
-	.if ARG_SKIP+\addskip > 0
-	addq $ARG_SKIP+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET	-(ARG_SKIP+\addskip)
+	.if \rdx
+	movq_cfi_restore 12*8, rdx
 	.endif
+	movq_cfi_restore 13*8, rsi
+	movq_cfi_restore 14*8, rdi
 	.endm
-
-	.macro LOAD_ARGS offset, skiprax=0
-	movq \offset(%rsp),    %r11
-	movq \offset+8(%rsp),  %r10
-	movq \offset+16(%rsp), %r9
-	movq \offset+24(%rsp), %r8
-	movq \offset+40(%rsp), %rcx
-	movq \offset+48(%rsp), %rdx
-	movq \offset+56(%rsp), %rsi
-	movq \offset+64(%rsp), %rdi
-	.if \skiprax
-	.else
-	movq \offset+72(%rsp), %rax
-	.endif
+	.macro RESTORE_C_REGS
+	RESTORE_C_REGS_HELPER 1,1,1,1,1
 	.endm
-
-#define REST_SKIP	(6*8)
-
-	.macro SAVE_REST
-	subq $REST_SKIP, %rsp
-	CFI_ADJUST_CFA_OFFSET	REST_SKIP
-	movq_cfi rbx, 5*8
-	movq_cfi rbp, 4*8
-	movq_cfi r12, 3*8
-	movq_cfi r13, 2*8
-	movq_cfi r14, 1*8
-	movq_cfi r15, 0*8
+	.macro RESTORE_C_REGS_EXCEPT_RAX
+	RESTORE_C_REGS_HELPER 0,1,1,1,1
 	.endm
-
-	.macro RESTORE_REST
-	movq_cfi_restore 0*8, r15
-	movq_cfi_restore 1*8, r14
-	movq_cfi_restore 2*8, r13
-	movq_cfi_restore 3*8, r12
-	movq_cfi_restore 4*8, rbp
-	movq_cfi_restore 5*8, rbx
-	addq $REST_SKIP, %rsp
-	CFI_ADJUST_CFA_OFFSET	-(REST_SKIP)
+	.macro RESTORE_C_REGS_EXCEPT_RCX
+	RESTORE_C_REGS_HELPER 1,0,1,1,1
 	.endm
-
-	.macro SAVE_ALL
-	SAVE_ARGS
-	SAVE_REST
+	.macro RESTORE_RSI_RDI
+	RESTORE_C_REGS_HELPER 0,0,0,0,0
+	.endm
+	.macro RESTORE_RSI_RDI_RDX
+	RESTORE_C_REGS_HELPER 0,0,0,0,1
 	.endm
 
-	.macro RESTORE_ALL addskip=0
-	RESTORE_REST
-	RESTORE_ARGS 1, \addskip
+	.macro REMOVE_PTREGS_FROM_STACK addskip=0
+	addq $15*8+\addskip, %rsp
+	CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
 	.endm
 
 	.macro icebp
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf8..6f98c16 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
 #define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
 	TRACE_IRQS_ON; \
 	sti; \
-	SAVE_REST; \
+	SAVE_EXTRA_REGS; \
 	LOCKDEP_SYS_EXIT; \
-	RESTORE_REST; \
+	RESTORE_EXTRA_REGS; \
 	cli; \
 	TRACE_IRQS_OFF;
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c..c822b35 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
 #else /* __i386__ */
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long bp;
 	unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
 	unsigned long dx;
 	unsigned long si;
 	unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
 	unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
 	unsigned long ip;
 	unsigned long cs;
 	unsigned long flags;
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a..580aee3 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
 #else /* __i386__ */
 
 #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 #define R15 0
 #define R14 8
 #define R13 16
 #define R12 24
 #define RBP 32
 #define RBX 40
-/* arguments: interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 #define R11 48
 #define R10 56
 #define R9 64
@@ -41,15 +45,17 @@
 #define RDX 96
 #define RSI 104
 #define RDI 112
-#define ORIG_RAX 120       /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
 #define RIP 128
 #define CS 136
 #define EFLAGS 144
 #define RSP 152
 #define SS 160
-#define ARGOFFSET R11
 #endif /* __ASSEMBLY__ */
 
 /* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa..bc16115 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
 #ifndef __KERNEL__
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long rbp;
 	unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
 	unsigned long rdx;
 	unsigned long rsi;
 	unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
 	unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
 	unsigned long rip;
 	unsigned long cs;
 	unsigned long eflags;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 37f7d95..b3c3ebb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -26,12 +26,6 @@
  * Some macro usage:
  * - CFI macros are used to generate dwarf2 unwind information for better
  * backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
  * - ENTRY/END Define functions in the symbol table.
  * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
  * frame that is otherwise undefined after a SYSCALL
@@ -264,7 +258,7 @@ ENTRY(ret_from_fork)
 
 	GET_THREAD_INFO(%rcx)
 
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
 	jz   1f
@@ -276,12 +270,10 @@ ENTRY(ret_from_fork)
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
 1:
-	subq $REST_SKIP, %rsp	# leave space for volatiles
-	CFI_ADJUST_CFA_OFFSET	REST_SKIP
 	movq %rbp, %rdi
 	call *%rbx
 	movl $0, RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(ret_from_fork)
@@ -339,7 +331,8 @@ GLOBAL(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0
+	ALLOC_PTREGS_ON_STACK 8
+	SAVE_C_REGS
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -375,9 +368,9 @@ sysret_check:
 	 * sysretq will re-enable interrupts:
 	 */
 	TRACE_IRQS_ON
+	RESTORE_C_REGS_EXCEPT_RCX
 	movq RIP-ARGOFFSET(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
-	RESTORE_ARGS 1,-ARG_SKIP,0
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
 	USERGS_SYSRET64
@@ -429,7 +422,7 @@ auditsys:
 	movq %rax,%rsi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
 	call __audit_syscall_entry
-	LOAD_ARGS 0		/* reload call-clobbered registers */
+	RESTORE_C_REGS			/* reload call-clobbered registers */
 	jmp system_call_fastpath
 
 	/*
@@ -453,7 +446,7 @@ tracesys:
 	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz auditsys
 #endif
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
@@ -463,8 +456,8 @@ tracesys:
 	 * We don't reload %rax because syscall_trace_enter() returned
 	 * the value it wants us to use in the table lookup.
 	 */
-	LOAD_ARGS ARGOFFSET, 1
-	RESTORE_REST
+	RESTORE_C_REGS_EXCEPT_RAX
+	RESTORE_EXTRA_REGS
 #if __SYSCALL_MASK == ~0
 	cmpq $__NR_syscall_max,%rax
 #else
@@ -515,7 +508,7 @@ int_very_careful:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 int_check_syscall_exit_work:
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
@@ -534,7 +527,7 @@ int_signal:
 	call do_notify_resume
 1:	movl $_TIF_WORK_MASK,%edi
 int_restore_rest:
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
@@ -544,15 +537,12 @@ END(system_call)
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
 	CFI_STARTPROC
-	popq	%r11			/* save return address */
-	PARTIAL_FRAME 0
-	SAVE_REST
-	pushq	%r11			/* put it back on stack */
+	DEFAULT_FRAME 0, 8		/* offset 8: return address */
+	SAVE_EXTRA_REGS 8
 	FIXUP_TOP_OF_STACK %r11, 8
-	DEFAULT_FRAME 0 8		/* offset 8: return address */
 	call sys_\func
 	RESTORE_TOP_OF_STACK %r11, 8
-	ret $REST_SKIP		/* pop extended registers */
+	ret
 	CFI_ENDPROC
 END(stub_\func)
 	.endm
@@ -560,7 +550,7 @@ END(stub_\func)
 	.macro FIXED_FRAME label,func
 ENTRY(\label)
 	CFI_STARTPROC
-	PARTIAL_FRAME 0 8		/* offset 8: return address */
+	DEFAULT_FRAME 0, 8		/* offset 8: return address */
 	FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
 	call \func
 	RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
@@ -577,12 +567,12 @@ END(\label)
 ENTRY(stub_execve)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call sys_execve
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_execve)
@@ -594,12 +584,12 @@ END(stub_execve)
 ENTRY(stub_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call sys_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_rt_sigreturn)
@@ -608,12 +598,12 @@ END(stub_rt_sigreturn)
 ENTRY(stub_x32_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call sys32_x32_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
@@ -621,13 +611,13 @@ END(stub_x32_rt_sigreturn)
 ENTRY(stub_x32_execve)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call compat_sys_execve
 	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_execve)
@@ -683,51 +673,31 @@ END(interrupt)
 
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
-	/* reserve pt_regs for scratch regs and rbp */
-	subq $ORIG_RAX-RBP, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
-	cld
-	/* start from rbp in pt_regs and jump over */
-	movq_cfi rdi, (RDI-RBP)
-	movq_cfi rsi, (RSI-RBP)
-	movq_cfi rdx, (RDX-RBP)
-	movq_cfi rcx, (RCX-RBP)
-	movq_cfi rax, (RAX-RBP)
-	movq_cfi  r8,  (R8-RBP)
-	movq_cfi  r9,  (R9-RBP)
-	movq_cfi r10, (R10-RBP)
-	movq_cfi r11, (R11-RBP)
-
-	/* Save rbp so that we can unwind from get_irq_regs() */
-	movq_cfi rbp, 0
-
-	/* Save previous stack value */
-	movq %rsp, %rsi
-
-	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
-	testl $3, CS-RBP(%rsi)
+	ALLOC_PTREGS_ON_STACK
+	SAVE_C_REGS
+	movq %rsp, %rdi	/* arg1 for handler */
+	testl $3, CS(%rsp)
 	je 1f
 	SWAPGS
-	/*
+1:	/*
 	 * irq_count is used to check if a CPU is already on an interrupt stack
 	 * or not. While this is essentially redundant with preempt_count it is
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
-1:	incl PER_CPU_VAR(irq_count)
+	incl PER_CPU_VAR(irq_count)
 	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
-	CFI_DEF_CFA_REGISTER	rsi
+	CFI_DEF_CFA_REGISTER	rdi
 
 	/* Store previous stack value */
-	pushq %rsi
+	pushq %rdi
 	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
 			0x77 /* DW_OP_breg7 */, 0, \
 			0x06 /* DW_OP_deref */, \
-			0x08 /* DW_OP_const1u */, SS+8-RBP, \
+			0x08 /* DW_OP_const1u */, SS+8, \
 			0x22 /* DW_OP_plus */
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
-
 	call \func
 	.endm
 
@@ -749,10 +719,9 @@ ret_from_intr:
 
 	/* Restore saved previous stack */
 	popq %rsi
-	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
-	leaq ARGOFFSET-RBP(%rsi), %rsp
+	CFI_DEF_CFA rsi,SS+8	/* reg/off reset after def_cfa_expr */
+	movq %rsi, %rsp
 	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
 
 exit_intr:
 	GET_THREAD_INFO(%rcx)
@@ -789,7 +758,8 @@ retint_restore_args:	/* return to kernel space */
 	 */
 	TRACE_IRQS_IRETQ
 restore_args:
-	RESTORE_ARGS 1,8,1
+	RESTORE_C_REGS
+	REMOVE_PTREGS_FROM_STACK 8
 
 irq_return:
 	/*
@@ -876,12 +846,12 @@ retint_signal:
 	jz    retint_swapgs
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	movq $-1,ORIG_RAX(%rsp)
 	xorl %esi,%esi		# oldset
 	movq %rsp,%rdi		# &pt_regs
 	call do_notify_resume
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
@@ -1256,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
 	addq $0x30,%rsp
 	CFI_ADJUST_CFA_OFFSET -0x30
 	pushq_cfi $-1 /* orig_ax = -1 => not a system call */
-	SAVE_ALL
+	ALLOC_PTREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 	jmp error_exit
 	CFI_ENDPROC
 END(xen_failsafe_callback)
@@ -1313,11 +1285,15 @@ ENTRY(paranoid_exit)
 paranoid_swapgs:
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
-	RESTORE_ALL 8
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
+	REMOVE_PTREGS_FROM_STACK 8
 	jmp irq_return
 paranoid_restore:
 	TRACE_IRQS_IRETQ_DEBUG 0
-	RESTORE_ALL 8
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
+	REMOVE_PTREGS_FROM_STACK 8
 	jmp irq_return
 paranoid_userspace:
 	GET_THREAD_INFO(%rcx)
@@ -1412,7 +1388,7 @@ END(error_entry)
 ENTRY(error_exit)
 	DEFAULT_FRAME
 	movl %ebx,%eax
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
@@ -1671,8 +1647,10 @@ end_repeat_nmi:
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
 	/* Pop the extra iret frame at once */
-	RESTORE_ALL 6*8
+	REMOVE_PTREGS_FROM_STACK 6*8
 
 	/* Clear the NMI executing stack variable */
 	movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
index ca7f0d5..673da2f 100644
--- a/arch/x86/kernel/preempt.S
+++ b/arch/x86/kernel/preempt.S
@@ -6,9 +6,13 @@
 
 ENTRY(___preempt_schedule)
 	CFI_STARTPROC
-	SAVE_ALL
+	ALLOC_PTREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 	call preempt_schedule
-	RESTORE_ALL
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
+	REMOVE_PTREGS_FROM_STACK
 	ret
 	CFI_ENDPROC
 
@@ -16,9 +20,13 @@ ENTRY(___preempt_schedule)
 
 ENTRY(___preempt_schedule_context)
 	CFI_STARTPROC
-	SAVE_ALL
+	ALLOC_PTREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 	call preempt_schedule_context
-	RESTORE_ALL
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
+	REMOVE_PTREGS_FROM_STACK
 	ret
 	CFI_ENDPROC
 
-- 
1.8.1.4



* [PATCH 5/5] x86: mass removal of ARGOFFSET
From: Denys Vlasenko @ 2014-08-01 14:48 UTC
  To: linux-kernel
  Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

ARGOFFSET is zero now, so removing it changes no code.
A few macros lost their "offset" parameter, since it is always zero now too.
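
Since ARGOFFSET is 0, both spellings assemble to the identical
displacement; an illustrative before/after:

	movq %rax,RAX-ARGOFFSET(%rsp)	/* old: RAX-0 = 10*8 = 80(%rsp) */
	movq %rax,RAX(%rsp)		/* new: same encoding, 80(%rsp) */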

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
 arch/x86/ia32/ia32entry.S      | 136 ++++++++++++++++++++---------------------
 arch/x86/include/asm/calling.h |   2 -
 arch/x86/kernel/entry_64.S     |  62 +++++++++----------
 3 files changed, 99 insertions(+), 101 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ef9ee16..4e51c7c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -41,13 +41,13 @@
 	movl	%edx,%edx	/* zero extension */
 	.endm 
 
-	/* clobbers %eax */	
-	.macro  CLEAR_RREGS offset=0, _r9=rax
+	/* clobbers %eax */
+	.macro  CLEAR_RREGS _r9=rax
 	xorl 	%eax,%eax
-	movq	%rax,\offset+R11(%rsp)
-	movq	%rax,\offset+R10(%rsp)
-	movq	%\_r9,\offset+R9(%rsp)
-	movq	%rax,\offset+R8(%rsp)
+	movq	%rax,R11(%rsp)
+	movq	%rax,R10(%rsp)
+	movq	%\_r9,R9(%rsp)
+	movq	%rax,R8(%rsp)
 	.endm
 
 	/*
@@ -60,14 +60,14 @@
 	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
 	 * an appropriately invalid value.
 	 */
-	.macro LOAD_ARGS32 offset, _r9=0
+	.macro LOAD_ARGS32 _r9=0
 	.if \_r9
-	movl \offset+R9(%rsp),%r9d
+	movl R9(%rsp),%r9d
 	.endif
-	movl \offset+RCX(%rsp),%ecx
-	movl \offset+RDX(%rsp),%edx
-	movl \offset+RSI(%rsp),%esi
-	movl \offset+RDI(%rsp),%edi
+	movl RCX(%rsp),%ecx
+	movl RDX(%rsp),%edx
+	movl RSI(%rsp),%esi
+	movl RDI(%rsp),%edi
 	movl %eax,%eax			/* zero extension */
 	.endm
 	
@@ -152,8 +152,8 @@ ENTRY(ia32_sysenter_target)
 1:	movl	(%rbp),%ebp
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -162,13 +162,13 @@ sysenter_do_call:
 	IA32_ARG_FIXUP
 sysenter_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
-	movq	%rax,RAX-ARGOFFSET(%rsp)
+	movq	%rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
-	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
 	/* clear IF, that popfq doesn't enable interrupts early */
 	andl  $~0x200,EFLAGS-R11(%rsp) 
 	movl	RIP-R11(%rsp),%edx		/* User %eip */
@@ -195,18 +195,18 @@ sysexit_from_sys_call:
 	movl %eax,%esi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%edi	/* 1st arg: audit arch */
 	call __audit_syscall_entry
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
+	movl RAX(%rsp),%eax		/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
 	movl %ebx,%edi			/* reload 1st syscall arg */
-	movl RCX-ARGOFFSET(%rsp),%esi	/* reload 2nd syscall arg */
-	movl RDX-ARGOFFSET(%rsp),%edx	/* reload 3rd syscall arg */
-	movl RSI-ARGOFFSET(%rsp),%ecx	/* reload 4th syscall arg */
-	movl RDI-ARGOFFSET(%rsp),%r8d	/* reload 5th syscall arg */
+	movl RCX(%rsp),%esi		/* reload 2nd syscall arg */
+	movl RDX(%rsp),%edx		/* reload 3rd syscall arg */
+	movl RSI(%rsp),%ecx		/* reload 4th syscall arg */
+	movl RDI(%rsp),%r8d		/* reload 5th syscall arg */
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
@@ -217,13 +217,13 @@ sysexit_from_sys_call:
 1:	setbe %al		/* 1 if error, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	call __audit_syscall_exit
-	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
+	movq RAX(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP)
 	jz \exit
-	CLEAR_RREGS -ARGOFFSET
+	CLEAR_RREGS
 	jmp int_with_check
 	.endm
 
@@ -239,7 +239,7 @@ sysexit_audit:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
 	jz	sysenter_auditsys
 #endif
 	SAVE_EXTRA_REGS
@@ -247,7 +247,7 @@ sysenter_tracesys:
 	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
@@ -293,17 +293,17 @@ ENTRY(ia32_cstar_target)
 	ALLOC_PTREGS_ON_STACK 8
 	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
-	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
-	movq	%rcx,RIP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	movq	%rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+	movq	%rax,ORIG_RAX(%rsp)
+	movq	%rcx,RIP(%rsp)
+	CFI_REL_OFFSET rip,RIP
+	movq	%rbp,RCX(%rsp) /* this lies slightly to ptrace */
 	movl	%ebp,%ecx
-	movq	$__USER32_CS,CS-ARGOFFSET(%rsp)
-	movq	$__USER32_DS,SS-ARGOFFSET(%rsp)
-	movq	%r11,EFLAGS-ARGOFFSET(%rsp)
-	/*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
-	movq	%r8,RSP-ARGOFFSET(%rsp)	
-	CFI_REL_OFFSET rsp,RSP-ARGOFFSET
+	movq	$__USER32_CS,CS(%rsp)
+	movq	$__USER32_DS,SS(%rsp)
+	movq	%r11,EFLAGS(%rsp)
+	/*CFI_REL_OFFSET rflags,EFLAGS*/
+	movq	%r8,RSP(%rsp)
+	CFI_REL_OFFSET rsp,RSP
 	/* no need to do an access_ok check here because r8 has been
 	   32bit zero extended */ 
 	/* hardware stack frame is complete now */	
@@ -311,8 +311,8 @@ ENTRY(ia32_cstar_target)
 1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -321,32 +321,32 @@ cstar_do_call:
 	IA32_ARG_FIXUP 1
 cstar_dispatch:
 	call *ia32_sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz sysretl_audit
 sysretl_from_sys_call:
-	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
 	RESTORE_RSI_RDI_RDX
-	movl RIP-ARGOFFSET(%rsp),%ecx
+	movl RIP(%rsp),%ecx
 	CFI_REGISTER rip,rcx
-	movl EFLAGS-ARGOFFSET(%rsp),%r11d	
+	movl EFLAGS(%rsp),%r11d
 	/*CFI_REGISTER rflags,r11*/
 	xorq	%r10,%r10
 	xorq	%r9,%r9
 	xorq	%r8,%r8
 	TRACE_IRQS_ON
-	movl RSP-ARGOFFSET(%rsp),%esp
+	movl RSP(%rsp),%esp
 	CFI_RESTORE rsp
 	USERGS_SYSRET32
 	
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
 	CFI_RESTORE_STATE
-	movl %r9d,R9-ARGOFFSET(%rsp)	/* register to be clobbered by call */
+	movl %r9d,R9(%rsp)	/* register to be clobbered by call */
 	auditsys_entry_common
-	movl R9-ARGOFFSET(%rsp),%r9d	/* reload 6th syscall arg */
+	movl R9(%rsp),%r9d	/* reload 6th syscall arg */
 	jmp cstar_dispatch
 
 sysretl_audit:
@@ -355,16 +355,16 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
 	SAVE_EXTRA_REGS
-	CLEAR_RREGS 0, r9
+	CLEAR_RREGS r9
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
 	cmpq $(IA32_NR_syscalls-1),%rax
@@ -422,8 +422,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	ALLOC_PTREGS_ON_STACK
 	SAVE_C_REGS_EXCEPT_R891011
-	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
@@ -431,9 +431,9 @@ ia32_do_call:
 	IA32_ARG_FIXUP
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 ia32_ret_from_sys_call:
-	CLEAR_RREGS -ARGOFFSET
+	CLEAR_RREGS
 	jmp int_ret_from_sys_call
 
 ia32_tracesys:
@@ -442,7 +442,7 @@ ia32_tracesys:
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
@@ -450,7 +450,7 @@ ia32_tracesys:
 END(ia32_syscall)
 
 ia32_badsys:
-	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+	movq $0,ORIG_RAX(%rsp)
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
@@ -482,17 +482,17 @@ ia32_ptregs_common:
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
-	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
-	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
-	CFI_REL_OFFSET	rdx,RDX-ARGOFFSET
-	CFI_REL_OFFSET	rsi,RSI-ARGOFFSET
-	CFI_REL_OFFSET	rdi,RDI-ARGOFFSET
-	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
-/*	CFI_REL_OFFSET	cs,CS-ARGOFFSET*/
-/*	CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
-	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
-/*	CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
+	CFI_DEF_CFA	rsp,SS+8
+	CFI_REL_OFFSET	rax,RAX
+	CFI_REL_OFFSET	rcx,RCX
+	CFI_REL_OFFSET	rdx,RDX
+	CFI_REL_OFFSET	rsi,RSI
+	CFI_REL_OFFSET	rdi,RDI
+	CFI_REL_OFFSET	rip,RIP
+/*	CFI_REL_OFFSET	cs,CS*/
+/*	CFI_REL_OFFSET	rflags,EFLAGS*/
+	CFI_REL_OFFSET	rsp,RSP
+/*	CFI_REL_OFFSET	ss,SS*/
 	SAVE_EXTRA_REGS 8
 	call *%rax
 	RESTORE_EXTRA_REGS 8
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 10aff1e..31d0ac8 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -88,8 +88,6 @@ For 32-bit we have the following conventions - kernel is built with
 #define RSP		19*8
 #define SS		20*8
 
-#define ARGOFFSET	0
-
 	.macro ALLOC_PTREGS_ON_STACK addskip=0
 	subq	$15*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b3c3ebb..ec68a07 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -76,7 +76,7 @@ ENDPROC(native_usergs_sysret64)
 #endif /* CONFIG_PARAVIRT */
 
 
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ offset=0
 #ifdef CONFIG_TRACE_IRQFLAGS
 	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
 	jnc  1f
@@ -110,7 +110,7 @@ ENDPROC(native_usergs_sysret64)
 	call debug_stack_reset
 .endm
 
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ_DEBUG offset=0
 	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
 	jnc  1f
 	TRACE_IRQS_ON_DEBUG
@@ -187,16 +187,16 @@ ENDPROC(native_usergs_sysret64)
  * frame that enables calling into C.
  */
 	.macro PARTIAL_FRAME start=1 offset=0
-	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
-	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
-	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
-	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
-	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
-	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
-	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+	XCPT_FRAME \start, ORIG_RAX+\offset
+	CFI_REL_OFFSET rdi, RDI+\offset
+	CFI_REL_OFFSET rsi, RSI+\offset
+	CFI_REL_OFFSET rdx, RDX+\offset
+	CFI_REL_OFFSET rcx, RCX+\offset
+	CFI_REL_OFFSET rax, RAX+\offset
+	CFI_REL_OFFSET r8, R8+\offset
+	CFI_REL_OFFSET r9, R9+\offset
+	CFI_REL_OFFSET r10, R10+\offset
+	CFI_REL_OFFSET r11, R11+\offset
 	.endm
 
 /*
@@ -260,13 +260,13 @@ ENTRY(ret_from_fork)
 
 	RESTORE_EXTRA_REGS
 
-	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
+	testl $3, CS(%rsp)			# from kernel_thread?
 	jz   1f
 
 	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
 	jnz  int_ret_from_sys_call
 
-	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+	RESTORE_TOP_OF_STACK %rdi
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
 1:
@@ -333,10 +333,10 @@ GLOBAL(system_call_after_swapgs)
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	ALLOC_PTREGS_ON_STACK 8
 	SAVE_C_REGS
-	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
-	movq  %rcx,RIP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	movq  %rax,ORIG_RAX(%rsp)
+	movq  %rcx,RIP(%rsp)
+	CFI_REL_OFFSET rip,RIP
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz tracesys
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
@@ -348,7 +348,7 @@ system_call_fastpath:
 	ja badsys
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 /*
  * Syscall return path ending with SYSRET (fast path)
  * Has incomplete stack frame and undefined top of stack.
@@ -360,7 +360,7 @@ sysret_check:
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
+	movl TI_flags+THREAD_INFO(%rsp,RIP),%edx
 	andl %edi,%edx
 	jnz  sysret_careful
 	CFI_REMEMBER_STATE
@@ -369,7 +369,7 @@ sysret_check:
 	 */
 	TRACE_IRQS_ON
 	RESTORE_C_REGS_EXCEPT_RCX
-	movq RIP-ARGOFFSET(%rsp),%rcx
+	movq RIP(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
@@ -401,11 +401,11 @@ sysret_signal:
 	 * These all wind up with the iret return path anyway,
 	 * so just join that path right now.
 	 */
-	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+	FIXUP_TOP_OF_STACK %r11
 	jmp int_check_syscall_exit_work
 
 badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+	movq $-ENOSYS,RAX(%rsp)
 	jmp ret_from_sys_call
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -431,7 +431,7 @@ auditsys:
 	 * masked off.
 	 */
 sysret_audit:
-	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
+	movq RAX(%rsp),%rsi	/* second arg, syscall return value */
 	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
 	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
@@ -443,7 +443,7 @@ sysret_audit:
 	/* Do syscall tracing */
 tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
 	jz auditsys
 #endif
 	SAVE_EXTRA_REGS
@@ -467,7 +467,7 @@ tracesys:
 	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 	/* Use IRET because user could have changed frame */
 
 /*
@@ -551,9 +551,9 @@ END(stub_\func)
 ENTRY(\label)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8		/* offset 8: return address */
-	FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
+	FIXUP_TOP_OF_STACK %r11, 8
 	call \func
-	RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
+	RESTORE_TOP_OF_STACK %r11, 8
 	ret
 	CFI_ENDPROC
 END(\label)
@@ -711,7 +711,7 @@ common_interrupt:
 	ASM_CLAC
 	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
 	interrupt do_IRQ
-	/* 0(%rsp): old_rsp-ARGOFFSET */
+	/* 0(%rsp): old_rsp */
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -725,7 +725,7 @@ ret_from_intr:
 
 exit_intr:
 	GET_THREAD_INFO(%rcx)
-	testl $3,CS-ARGOFFSET(%rsp)
+	testl $3,CS(%rsp)
 	je retint_kernel
 
 	/* Interrupt came from user space */
@@ -863,7 +863,7 @@ retint_signal:
 ENTRY(retint_kernel)
 	cmpl $0,PER_CPU_VAR(__preempt_count)
 	jnz  retint_restore_args
-	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
+	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
 	jmp exit_intr
-- 
1.8.1.4



* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
From: Andy Lutomirski @ 2014-08-01 17:04 UTC
  To: Denys Vlasenko
  Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Frederic Weisbecker,
	X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook

On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 64-bit code was using six fewer stack slots by not saving/restoring
> registers which are callee-preserved according to the C ABI,
> and not allocating space for them

This is great.

Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :)  Maybe I'll give that a shot.

--Andy
>
> Only when a syscall needed a complete "struct pt_regs" was
> the complete area allocated and filled in.
>
> This proved to be a source of significant obfuscation and subtle bugs.
> For example, stub_fork had to pop the return address,
> extend the struct, save registers, and push the return address back. Ugly.
> ia32_ptregs_common pops the return address and "returns" via a jmp insn,
> throwing a wrench into the CPU's return stack cache.
>
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
>
> Macros which manipulate "struct pt_regs" on stack are reworked:
> ALLOC_PTREGS_ON_STACK allocates the structure.
> SAVE_C_REGS saves to it those registers which are clobbered by C code.
> SAVE_EXTRA_REGS saves to it all other registers.
> Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.
>
> ia32_ptregs_common, stub_fork and friends lost their ugly dance with
> return pointer.
>
> LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
> instead of magic numbers.
>
> Misleading and slightly wrong comments in "struct pt_regs" are fixed
> (four instances).
>
> Patch was run-tested: 64-bit executables, 32-bit executables,
> strace works.
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Frederic Weisbecker <fweisbec@gmail.com>
> CC: X86 ML <x86@kernel.org>
> CC: Alexei Starovoitov <ast@plumgrid.com>
> CC: Will Drewry <wad@chromium.org>
> CC: Kees Cook <keescook@chromium.org>
> CC: linux-kernel@vger.kernel.org
> ---
>  arch/x86/ia32/ia32entry.S              |  47 +++----
>  arch/x86/include/asm/calling.h         | 224 ++++++++++++++++-----------------
>  arch/x86/include/asm/irqflags.h        |   4 +-
>  arch/x86/include/asm/ptrace.h          |  13 +-
>  arch/x86/include/uapi/asm/ptrace-abi.h |  16 ++-
>  arch/x86/include/uapi/asm/ptrace.h     |  13 +-
>  arch/x86/kernel/entry_64.S             | 132 ++++++++-----------
>  arch/x86/kernel/preempt.S              |  16 ++-
>  8 files changed, 232 insertions(+), 233 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 4299eb0..ef9ee16 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -62,12 +62,12 @@
>          */
>         .macro LOAD_ARGS32 offset, _r9=0
>         .if \_r9
> -       movl \offset+16(%rsp),%r9d
> +       movl \offset+R9(%rsp),%r9d
>         .endif
> -       movl \offset+40(%rsp),%ecx
> -       movl \offset+48(%rsp),%edx
> -       movl \offset+56(%rsp),%esi
> -       movl \offset+64(%rsp),%edi
> +       movl \offset+RCX(%rsp),%ecx
> +       movl \offset+RDX(%rsp),%edx
> +       movl \offset+RSI(%rsp),%esi
> +       movl \offset+RDI(%rsp),%edi
>         movl %eax,%eax                  /* zero extension */
>         .endm
>
> @@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
>         CFI_REL_OFFSET rip,0
>         pushq_cfi %rax
>         cld
> -       SAVE_ARGS 0,1,0
> +       ALLOC_PTREGS_ON_STACK
> +       SAVE_C_REGS_EXCEPT_R891011
>         /* no need to do an access_ok check here because rbp has been
>            32bit zero extended */
>         ASM_STAC
> @@ -172,7 +173,8 @@ sysexit_from_sys_call:
>         andl  $~0x200,EFLAGS-R11(%rsp)
>         movl    RIP-R11(%rsp),%edx              /* User %eip */
>         CFI_REGISTER rip,rdx
> -       RESTORE_ARGS 0,24,0,0,0,0
> +       RESTORE_RSI_RDI
> +       REMOVE_PTREGS_FROM_STACK 8*3
>         xorq    %r8,%r8
>         xorq    %r9,%r9
>         xorq    %r10,%r10
> @@ -240,13 +242,13 @@ sysenter_tracesys:
>         testl   $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
>         jz      sysenter_auditsys
>  #endif
> -       SAVE_REST
> +       SAVE_EXTRA_REGS
>         CLEAR_RREGS
>         movq    $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
>         movq    %rsp,%rdi        /* &pt_regs -> arg1 */
>         call    syscall_trace_enter
>         LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         cmpq    $(IA32_NR_syscalls-1),%rax
>         ja      int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
>         jmp     sysenter_do_call
> @@ -288,7 +290,8 @@ ENTRY(ia32_cstar_target)
>          * disabled irqs and here we enable it straight after entry:
>          */
>         ENABLE_INTERRUPTS(CLBR_NONE)
> -       SAVE_ARGS 8,0,0
> +       ALLOC_PTREGS_ON_STACK 8
> +       SAVE_C_REGS_EXCEPT_RCX_R891011
>         movl    %eax,%eax       /* zero extension */
>         movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
>         movq    %rcx,RIP-ARGOFFSET(%rsp)
> @@ -325,7 +328,7 @@ cstar_dispatch:
>         jnz sysretl_audit
>  sysretl_from_sys_call:
>         andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> -       RESTORE_ARGS 0,-ARG_SKIP,0,0,0
> +       RESTORE_RSI_RDI_RDX
>         movl RIP-ARGOFFSET(%rsp),%ecx
>         CFI_REGISTER rip,rcx
>         movl EFLAGS-ARGOFFSET(%rsp),%r11d
> @@ -356,13 +359,13 @@ cstar_tracesys:
>         jz cstar_auditsys
>  #endif
>         xchgl %r9d,%ebp
> -       SAVE_REST
> +       SAVE_EXTRA_REGS
>         CLEAR_RREGS 0, r9
>         movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
>         movq %rsp,%rdi        /* &pt_regs -> arg1 */
>         call syscall_trace_enter
>         LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         xchgl %ebp,%r9d
>         cmpq $(IA32_NR_syscalls-1),%rax
>         ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
> @@ -417,7 +420,8 @@ ENTRY(ia32_syscall)
>         cld
>         /* note the registers are not zero extended to the sf.
>            this could be a problem. */
> -       SAVE_ARGS 0,1,0
> +       ALLOC_PTREGS_ON_STACK
> +       SAVE_C_REGS_EXCEPT_R891011
>         orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
>         testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
>         jnz ia32_tracesys
> @@ -430,16 +434,16 @@ ia32_sysret:
>         movq %rax,RAX-ARGOFFSET(%rsp)
>  ia32_ret_from_sys_call:
>         CLEAR_RREGS -ARGOFFSET
> -       jmp int_ret_from_sys_call
> +       jmp int_ret_from_sys_call
>
> -ia32_tracesys:
> -       SAVE_REST
> +ia32_tracesys:
> +       SAVE_EXTRA_REGS
>         CLEAR_RREGS
>         movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
>         movq %rsp,%rdi        /* &pt_regs -> arg1 */
>         call syscall_trace_enter
>         LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         cmpq $(IA32_NR_syscalls-1),%rax
>         ja  int_ret_from_sys_call       /* ia32_tracesys has set RAX(%rsp) */
>         jmp ia32_do_call
> @@ -475,7 +479,6 @@ GLOBAL(stub32_clone)
>
>         ALIGN
>  ia32_ptregs_common:
> -       popq %r11
>         CFI_ENDPROC
>         CFI_STARTPROC32 simple
>         CFI_SIGNAL_FRAME
> @@ -490,9 +493,9 @@ ia32_ptregs_common:
>  /*     CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
>         CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
>  /*     CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
> -       SAVE_REST
> +       SAVE_EXTRA_REGS 8
>         call *%rax
> -       RESTORE_REST
> -       jmp  ia32_sysret        /* misbalances the return cache */
> +       RESTORE_EXTRA_REGS 8
> +       ret
>         CFI_ENDPROC
>  END(ia32_ptregs_common)
> diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
> index e176cea..10aff1e 100644
> --- a/arch/x86/include/asm/calling.h
> +++ b/arch/x86/include/asm/calling.h
> @@ -52,142 +52,132 @@ For 32-bit we have the following conventions - kernel is built with
>
>  /*
>   * 64-bit system call stack frame layout defines and helpers,
> - * for assembly code:
> + * for assembly code.
>   */
>
> -#define R15              0
> -#define R14              8
> -#define R13             16
> -#define R12             24
> -#define RBP             32
> -#define RBX             40
> -
> -/* arguments: interrupts/non tracing syscalls only save up to here: */
> -#define R11             48
> -#define R10             56
> -#define R9              64
> -#define R8              72
> -#define RAX             80
> -#define RCX             88
> -#define RDX             96
> -#define RSI            104
> -#define RDI            112
> -#define ORIG_RAX       120       /* + error_code */
> -/* end of arguments */
> -
> -/* cpu exception frame or undefined in case of fast syscall: */
> -#define RIP            128
> -#define CS             136
> -#define EFLAGS         144
> -#define RSP            152
> -#define SS             160
> -
> -#define ARGOFFSET      R11
> -
> -       .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
> -       subq  $9*8+\addskip, %rsp
> -       CFI_ADJUST_CFA_OFFSET   9*8+\addskip
> -       movq_cfi rdi, 8*8
> -       movq_cfi rsi, 7*8
> -       movq_cfi rdx, 6*8
> -
> -       .if \save_rcx
> -       movq_cfi rcx, 5*8
> -       .endif
> -
> -       movq_cfi rax, 4*8
> +/* The layout forms the "struct pt_regs" on the stack: */
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
> +#define R15            0*8
> +#define R14            1*8
> +#define R13            2*8
> +#define R12            3*8
> +#define RBP            4*8
> +#define RBX            5*8
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
> +#define R11            6*8
> +#define R10            7*8
> +#define R9             8*8
> +#define R8             9*8
> +#define RAX            10*8
> +#define RCX            11*8
> +#define RDX            12*8
> +#define RSI            13*8
> +#define RDI            14*8
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
> +#define ORIG_RAX       15*8
> +/* Return frame for iretq */
> +#define RIP            16*8
> +#define CS             17*8
> +#define EFLAGS         18*8
> +#define RSP            19*8
> +#define SS             20*8
> +
> +#define ARGOFFSET      0
> +
> +       .macro ALLOC_PTREGS_ON_STACK addskip=0
> +       subq    $15*8+\addskip, %rsp
> +       CFI_ADJUST_CFA_OFFSET 15*8+\addskip
> +       .endm
>
> -       .if \save_r891011
> -       movq_cfi r8,  3*8
> -       movq_cfi r9,  2*8
> -       movq_cfi r10, 1*8
> -       movq_cfi r11, 0*8
> +       .macro SAVE_C_REGS_HELPER rcx=1 r8plus=1
> +       movq_cfi rdi, 14*8
> +       movq_cfi rsi, 13*8
> +       movq_cfi rdx, 12*8
> +       .if \rcx
> +       movq_cfi rcx, 11*8
>         .endif
> -
> +       movq_cfi rax, 10*8
> +       .if \r8plus
> +       movq_cfi r8,  9*8
> +       movq_cfi r9,  8*8
> +       movq_cfi r10, 7*8
> +       movq_cfi r11, 6*8
> +       .endif
> +       .endm
> +       .macro SAVE_C_REGS
> +       SAVE_C_REGS_HELPER 1, 1
> +       .endm
> +       .macro SAVE_C_REGS_EXCEPT_R891011
> +       SAVE_C_REGS_HELPER 1, 0
> +       .endm
> +       .macro SAVE_C_REGS_EXCEPT_RCX_R891011
> +       SAVE_C_REGS_HELPER 0, 0
>         .endm
>
> -#define ARG_SKIP       (9*8)
> +       .macro SAVE_EXTRA_REGS offset=0
> +       movq_cfi rbx, 5*8+\offset
> +       movq_cfi rbp, 4*8+\offset
> +       movq_cfi r12, 3*8+\offset
> +       movq_cfi r13, 2*8+\offset
> +       movq_cfi r14, 1*8+\offset
> +       movq_cfi r15, 0*8+\offset
> +       .endm
>
> -       .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
> -                           rstor_r8910=1, rstor_rdx=1
> -       .if \rstor_r11
> -       movq_cfi_restore 0*8, r11
> -       .endif
> +       .macro RESTORE_EXTRA_REGS offset=0
> +       movq_cfi_restore 0*8+\offset, r15
> +       movq_cfi_restore 1*8+\offset, r14
> +       movq_cfi_restore 2*8+\offset, r13
> +       movq_cfi_restore 3*8+\offset, r12
> +       movq_cfi_restore 4*8+\offset, rbp
> +       movq_cfi_restore 5*8+\offset, rbx
> +       .endm
>
> -       .if \rstor_r8910
> -       movq_cfi_restore 1*8, r10
> -       movq_cfi_restore 2*8, r9
> -       movq_cfi_restore 3*8, r8
> +       .macro RESTORE_C_REGS_HELPER rax=1, rcx=1, r11=1, r8910=1, rdx=1
> +       .if \r11
> +       movq_cfi_restore 6*8, r11
>         .endif
> -
> -       .if \rstor_rax
> -       movq_cfi_restore 4*8, rax
> +       .if \r8910
> +       movq_cfi_restore 7*8, r10
> +       movq_cfi_restore 8*8, r9
> +       movq_cfi_restore 9*8, r8
>         .endif
> -
> -       .if \rstor_rcx
> -       movq_cfi_restore 5*8, rcx
> +       .if \rax
> +       movq_cfi_restore 10*8, rax
>         .endif
> -
> -       .if \rstor_rdx
> -       movq_cfi_restore 6*8, rdx
> +       .if \rcx
> +       movq_cfi_restore 11*8, rcx
>         .endif
> -
> -       movq_cfi_restore 7*8, rsi
> -       movq_cfi_restore 8*8, rdi
> -
> -       .if ARG_SKIP+\addskip > 0
> -       addq $ARG_SKIP+\addskip, %rsp
> -       CFI_ADJUST_CFA_OFFSET   -(ARG_SKIP+\addskip)
> +       .if \rdx
> +       movq_cfi_restore 12*8, rdx
>         .endif
> +       movq_cfi_restore 13*8, rsi
> +       movq_cfi_restore 14*8, rdi
>         .endm
> -
> -       .macro LOAD_ARGS offset, skiprax=0
> -       movq \offset(%rsp),    %r11
> -       movq \offset+8(%rsp),  %r10
> -       movq \offset+16(%rsp), %r9
> -       movq \offset+24(%rsp), %r8
> -       movq \offset+40(%rsp), %rcx
> -       movq \offset+48(%rsp), %rdx
> -       movq \offset+56(%rsp), %rsi
> -       movq \offset+64(%rsp), %rdi
> -       .if \skiprax
> -       .else
> -       movq \offset+72(%rsp), %rax
> -       .endif
> +       .macro RESTORE_C_REGS
> +       RESTORE_C_REGS_HELPER 1,1,1,1,1
>         .endm
> -
> -#define REST_SKIP      (6*8)
> -
> -       .macro SAVE_REST
> -       subq $REST_SKIP, %rsp
> -       CFI_ADJUST_CFA_OFFSET   REST_SKIP
> -       movq_cfi rbx, 5*8
> -       movq_cfi rbp, 4*8
> -       movq_cfi r12, 3*8
> -       movq_cfi r13, 2*8
> -       movq_cfi r14, 1*8
> -       movq_cfi r15, 0*8
> +       .macro RESTORE_C_REGS_EXCEPT_RAX
> +       RESTORE_C_REGS_HELPER 0,1,1,1,1
>         .endm
> -
> -       .macro RESTORE_REST
> -       movq_cfi_restore 0*8, r15
> -       movq_cfi_restore 1*8, r14
> -       movq_cfi_restore 2*8, r13
> -       movq_cfi_restore 3*8, r12
> -       movq_cfi_restore 4*8, rbp
> -       movq_cfi_restore 5*8, rbx
> -       addq $REST_SKIP, %rsp
> -       CFI_ADJUST_CFA_OFFSET   -(REST_SKIP)
> +       .macro RESTORE_C_REGS_EXCEPT_RCX
> +       RESTORE_C_REGS_HELPER 1,0,1,1,1
>         .endm
> -
> -       .macro SAVE_ALL
> -       SAVE_ARGS
> -       SAVE_REST
> +       .macro RESTORE_RSI_RDI
> +       RESTORE_C_REGS_HELPER 0,0,0,0,0
> +       .endm
> +       .macro RESTORE_RSI_RDI_RDX
> +       RESTORE_C_REGS_HELPER 0,0,0,0,1
>         .endm
>
> -       .macro RESTORE_ALL addskip=0
> -       RESTORE_REST
> -       RESTORE_ARGS 1, \addskip
> +       .macro REMOVE_PTREGS_FROM_STACK addskip=0
> +       addq $15*8+\addskip, %rsp
> +       CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
>         .endm
>
>         .macro icebp
> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> index bba3cf8..6f98c16 100644
> --- a/arch/x86/include/asm/irqflags.h
> +++ b/arch/x86/include/asm/irqflags.h
> @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
>  #define ARCH_LOCKDEP_SYS_EXIT_IRQ      \
>         TRACE_IRQS_ON; \
>         sti; \
> -       SAVE_REST; \
> +       SAVE_EXTRA_REGS; \
>         LOCKDEP_SYS_EXIT; \
> -       RESTORE_REST; \
> +       RESTORE_EXTRA_REGS; \
>         cli; \
>         TRACE_IRQS_OFF;
>
> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
> index 6205f0c..c822b35 100644
> --- a/arch/x86/include/asm/ptrace.h
> +++ b/arch/x86/include/asm/ptrace.h
> @@ -31,13 +31,17 @@ struct pt_regs {
>  #else /* __i386__ */
>
>  struct pt_regs {
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
>         unsigned long r15;
>         unsigned long r14;
>         unsigned long r13;
>         unsigned long r12;
>         unsigned long bp;
>         unsigned long bx;
> -/* arguments: non interrupts/non tracing syscalls only save up to here*/
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
>         unsigned long r11;
>         unsigned long r10;
>         unsigned long r9;
> @@ -47,9 +51,12 @@ struct pt_regs {
>         unsigned long dx;
>         unsigned long si;
>         unsigned long di;
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
>         unsigned long orig_ax;
> -/* end of arguments */
> -/* cpu exception frame or undefined */
> +/* Return frame for iretq */
>         unsigned long ip;
>         unsigned long cs;
>         unsigned long flags;
> diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
> index 7b0a55a..580aee3 100644
> --- a/arch/x86/include/uapi/asm/ptrace-abi.h
> +++ b/arch/x86/include/uapi/asm/ptrace-abi.h
> @@ -25,13 +25,17 @@
>  #else /* __i386__ */
>
>  #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
>  #define R15 0
>  #define R14 8
>  #define R13 16
>  #define R12 24
>  #define RBP 32
>  #define RBX 40
> -/* arguments: interrupts/non tracing syscalls only save up to here*/
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
>  #define R11 48
>  #define R10 56
>  #define R9 64
> @@ -41,15 +45,17 @@
>  #define RDX 96
>  #define RSI 104
>  #define RDI 112
> -#define ORIG_RAX 120       /* = ERROR */
> -/* end of arguments */
> -/* cpu exception frame or undefined in case of fast syscall. */
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
> +#define ORIG_RAX 120
> +/* Return frame for iretq */
>  #define RIP 128
>  #define CS 136
>  #define EFLAGS 144
>  #define RSP 152
>  #define SS 160
> -#define ARGOFFSET R11
>  #endif /* __ASSEMBLY__ */
>
>  /* top of stack page */
> diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
> index ac4b9aa..bc16115 100644
> --- a/arch/x86/include/uapi/asm/ptrace.h
> +++ b/arch/x86/include/uapi/asm/ptrace.h
> @@ -41,13 +41,17 @@ struct pt_regs {
>  #ifndef __KERNEL__
>
>  struct pt_regs {
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
>         unsigned long r15;
>         unsigned long r14;
>         unsigned long r13;
>         unsigned long r12;
>         unsigned long rbp;
>         unsigned long rbx;
> -/* arguments: non interrupts/non tracing syscalls only save up to here*/
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
>         unsigned long r11;
>         unsigned long r10;
>         unsigned long r9;
> @@ -57,9 +61,12 @@ struct pt_regs {
>         unsigned long rdx;
>         unsigned long rsi;
>         unsigned long rdi;
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
>         unsigned long orig_rax;
> -/* end of arguments */
> -/* cpu exception frame or undefined */
> +/* Return frame for iretq */
>         unsigned long rip;
>         unsigned long cs;
>         unsigned long eflags;
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 37f7d95..b3c3ebb 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -26,12 +26,6 @@
>   * Some macro usage:
>   * - CFI macros are used to generate dwarf2 unwind information for better
>   * backtraces. They don't change any code.
> - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
> - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
> - * There are unfortunately lots of special cases where some registers
> - * not touched. The macro is a big mess that should be cleaned up.
> - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
> - * Gives a full stack frame.
>   * - ENTRY/END Define functions in the symbol table.
>   * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
>   * frame that is otherwise undefined after a SYSCALL
> @@ -264,7 +258,7 @@ ENTRY(ret_from_fork)
>
>         GET_THREAD_INFO(%rcx)
>
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>
>         testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
>         jz   1f
> @@ -276,12 +270,10 @@ ENTRY(ret_from_fork)
>         jmp ret_from_sys_call                   # go to the SYSRET fastpath
>
>  1:
> -       subq $REST_SKIP, %rsp   # leave space for volatiles
> -       CFI_ADJUST_CFA_OFFSET   REST_SKIP
>         movq %rbp, %rdi
>         call *%rbx
>         movl $0, RAX(%rsp)
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         jmp int_ret_from_sys_call
>         CFI_ENDPROC
>  END(ret_from_fork)
> @@ -339,7 +331,8 @@ GLOBAL(system_call_after_swapgs)
>          * and short:
>          */
>         ENABLE_INTERRUPTS(CLBR_NONE)
> -       SAVE_ARGS 8,0
> +       ALLOC_PTREGS_ON_STACK 8
> +       SAVE_C_REGS
>         movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
>         movq  %rcx,RIP-ARGOFFSET(%rsp)
>         CFI_REL_OFFSET rip,RIP-ARGOFFSET
> @@ -375,9 +368,9 @@ sysret_check:
>          * sysretq will re-enable interrupts:
>          */
>         TRACE_IRQS_ON
> +       RESTORE_C_REGS_EXCEPT_RCX
>         movq RIP-ARGOFFSET(%rsp),%rcx
>         CFI_REGISTER    rip,rcx
> -       RESTORE_ARGS 1,-ARG_SKIP,0
>         /*CFI_REGISTER  rflags,r11*/
>         movq    PER_CPU_VAR(old_rsp), %rsp
>         USERGS_SYSRET64
> @@ -429,7 +422,7 @@ auditsys:
>         movq %rax,%rsi                  /* 2nd arg: syscall number */
>         movl $AUDIT_ARCH_X86_64,%edi    /* 1st arg: audit arch */
>         call __audit_syscall_entry
> -       LOAD_ARGS 0             /* reload call-clobbered registers */
> +       RESTORE_C_REGS                  /* reload call-clobbered registers */
>         jmp system_call_fastpath
>
>         /*
> @@ -453,7 +446,7 @@ tracesys:
>         testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
>         jz auditsys
>  #endif
> -       SAVE_REST
> +       SAVE_EXTRA_REGS
>         movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
>         FIXUP_TOP_OF_STACK %rdi
>         movq %rsp,%rdi
> @@ -463,8 +456,8 @@ tracesys:
>          * We don't reload %rax because syscall_trace_enter() returned
>          * the value it wants us to use in the table lookup.
>          */
> -       LOAD_ARGS ARGOFFSET, 1
> -       RESTORE_REST
> +       RESTORE_C_REGS_EXCEPT_RAX
> +       RESTORE_EXTRA_REGS
>  #if __SYSCALL_MASK == ~0
>         cmpq $__NR_syscall_max,%rax
>  #else
> @@ -515,7 +508,7 @@ int_very_careful:
>         TRACE_IRQS_ON
>         ENABLE_INTERRUPTS(CLBR_NONE)
>  int_check_syscall_exit_work:
> -       SAVE_REST
> +       SAVE_EXTRA_REGS
>         /* Check for syscall exit trace */
>         testl $_TIF_WORK_SYSCALL_EXIT,%edx
>         jz int_signal
> @@ -534,7 +527,7 @@ int_signal:
>         call do_notify_resume
>  1:     movl $_TIF_WORK_MASK,%edi
>  int_restore_rest:
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         DISABLE_INTERRUPTS(CLBR_NONE)
>         TRACE_IRQS_OFF
>         jmp int_with_check
> @@ -544,15 +537,12 @@ END(system_call)
>         .macro FORK_LIKE func
>  ENTRY(stub_\func)
>         CFI_STARTPROC
> -       popq    %r11                    /* save return address */
> -       PARTIAL_FRAME 0
> -       SAVE_REST
> -       pushq   %r11                    /* put it back on stack */
> +       DEFAULT_FRAME 0, 8              /* offset 8: return address */
> +       SAVE_EXTRA_REGS 8
>         FIXUP_TOP_OF_STACK %r11, 8
> -       DEFAULT_FRAME 0 8               /* offset 8: return address */
>         call sys_\func
>         RESTORE_TOP_OF_STACK %r11, 8
> -       ret $REST_SKIP          /* pop extended registers */
> +       ret
>         CFI_ENDPROC
>  END(stub_\func)
>         .endm
> @@ -560,7 +550,7 @@ END(stub_\func)
>         .macro FIXED_FRAME label,func
>  ENTRY(\label)
>         CFI_STARTPROC
> -       PARTIAL_FRAME 0 8               /* offset 8: return address */
> +       DEFAULT_FRAME 0, 8              /* offset 8: return address */
>         FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
>         call \func
>         RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
> @@ -577,12 +567,12 @@ END(\label)
>  ENTRY(stub_execve)
>         CFI_STARTPROC
>         addq $8, %rsp
> -       PARTIAL_FRAME 0
> -       SAVE_REST
> +       DEFAULT_FRAME 0
> +       SAVE_EXTRA_REGS
>         FIXUP_TOP_OF_STACK %r11
>         call sys_execve
>         movq %rax,RAX(%rsp)
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         jmp int_ret_from_sys_call
>         CFI_ENDPROC
>  END(stub_execve)
> @@ -594,12 +584,12 @@ END(stub_execve)
>  ENTRY(stub_rt_sigreturn)
>         CFI_STARTPROC
>         addq $8, %rsp
> -       PARTIAL_FRAME 0
> -       SAVE_REST
> +       DEFAULT_FRAME 0
> +       SAVE_EXTRA_REGS
>         FIXUP_TOP_OF_STACK %r11
>         call sys_rt_sigreturn
>         movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         jmp int_ret_from_sys_call
>         CFI_ENDPROC
>  END(stub_rt_sigreturn)
> @@ -608,12 +598,12 @@ END(stub_rt_sigreturn)
>  ENTRY(stub_x32_rt_sigreturn)
>         CFI_STARTPROC
>         addq $8, %rsp
> -       PARTIAL_FRAME 0
> -       SAVE_REST
> +       DEFAULT_FRAME 0
> +       SAVE_EXTRA_REGS
>         FIXUP_TOP_OF_STACK %r11
>         call sys32_x32_rt_sigreturn
>         movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         jmp int_ret_from_sys_call
>         CFI_ENDPROC
>  END(stub_x32_rt_sigreturn)
> @@ -621,13 +611,13 @@ END(stub_x32_rt_sigreturn)
>  ENTRY(stub_x32_execve)
>         CFI_STARTPROC
>         addq $8, %rsp
> -       PARTIAL_FRAME 0
> -       SAVE_REST
> +       DEFAULT_FRAME 0
> +       SAVE_EXTRA_REGS
>         FIXUP_TOP_OF_STACK %r11
>         call compat_sys_execve
>         RESTORE_TOP_OF_STACK %r11
>         movq %rax,RAX(%rsp)
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         jmp int_ret_from_sys_call
>         CFI_ENDPROC
>  END(stub_x32_execve)
> @@ -683,51 +673,31 @@ END(interrupt)
>
>  /* 0(%rsp): ~(interrupt number) */
>         .macro interrupt func
> -       /* reserve pt_regs for scratch regs and rbp */
> -       subq $ORIG_RAX-RBP, %rsp
> -       CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
> -       cld
> -       /* start from rbp in pt_regs and jump over */
> -       movq_cfi rdi, (RDI-RBP)
> -       movq_cfi rsi, (RSI-RBP)
> -       movq_cfi rdx, (RDX-RBP)
> -       movq_cfi rcx, (RCX-RBP)
> -       movq_cfi rax, (RAX-RBP)
> -       movq_cfi  r8,  (R8-RBP)
> -       movq_cfi  r9,  (R9-RBP)
> -       movq_cfi r10, (R10-RBP)
> -       movq_cfi r11, (R11-RBP)
> -
> -       /* Save rbp so that we can unwind from get_irq_regs() */
> -       movq_cfi rbp, 0
> -
> -       /* Save previous stack value */
> -       movq %rsp, %rsi
> -
> -       leaq -RBP(%rsp),%rdi    /* arg1 for handler */
> -       testl $3, CS-RBP(%rsi)
> +       ALLOC_PTREGS_ON_STACK
> +       SAVE_C_REGS
> +       movq %rsp, %rdi /* arg1 for handler */
> +       testl $3, CS(%rsp)
>         je 1f
>         SWAPGS
> -       /*
> +1:     /*
>          * irq_count is used to check if a CPU is already on an interrupt stack
>          * or not. While this is essentially redundant with preempt_count it is
>          * a little cheaper to use a separate counter in the PDA (short of
>          * moving irq_enter into assembly, which would be too much work)
>          */
> -1:     incl PER_CPU_VAR(irq_count)
> +       incl PER_CPU_VAR(irq_count)
>         cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
> -       CFI_DEF_CFA_REGISTER    rsi
> +       CFI_DEF_CFA_REGISTER    rdi
>
>         /* Store previous stack value */
> -       pushq %rsi
> +       pushq %rdi
>         CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
>                         0x77 /* DW_OP_breg7 */, 0, \
>                         0x06 /* DW_OP_deref */, \
> -                       0x08 /* DW_OP_const1u */, SS+8-RBP, \
> +                       0x08 /* DW_OP_const1u */, SS+8, \
>                         0x22 /* DW_OP_plus */
>         /* We entered an interrupt context - irqs are off: */
>         TRACE_IRQS_OFF
> -
>         call \func
>         .endm
>
> @@ -749,10 +719,9 @@ ret_from_intr:
>
>         /* Restore saved previous stack */
>         popq %rsi
> -       CFI_DEF_CFA rsi,SS+8-RBP        /* reg/off reset after def_cfa_expr */
> -       leaq ARGOFFSET-RBP(%rsi), %rsp
> +       CFI_DEF_CFA rsi,SS+8    /* reg/off reset after def_cfa_expr */
> +       movq %rsi, %rsp
>         CFI_DEF_CFA_REGISTER    rsp
> -       CFI_ADJUST_CFA_OFFSET   RBP-ARGOFFSET
>
>  exit_intr:
>         GET_THREAD_INFO(%rcx)
> @@ -789,7 +758,8 @@ retint_restore_args:        /* return to kernel space */
>          */
>         TRACE_IRQS_IRETQ
>  restore_args:
> -       RESTORE_ARGS 1,8,1
> +       RESTORE_C_REGS
> +       REMOVE_PTREGS_FROM_STACK 8
>
>  irq_return:
>         /*
> @@ -876,12 +846,12 @@ retint_signal:
>         jz    retint_swapgs
>         TRACE_IRQS_ON
>         ENABLE_INTERRUPTS(CLBR_NONE)
> -       SAVE_REST
> +       SAVE_EXTRA_REGS
>         movq $-1,ORIG_RAX(%rsp)
>         xorl %esi,%esi          # oldset
>         movq %rsp,%rdi          # &pt_regs
>         call do_notify_resume
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         DISABLE_INTERRUPTS(CLBR_NONE)
>         TRACE_IRQS_OFF
>         GET_THREAD_INFO(%rcx)
> @@ -1256,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
>         addq $0x30,%rsp
>         CFI_ADJUST_CFA_OFFSET -0x30
>         pushq_cfi $-1 /* orig_ax = -1 => not a system call */
> -       SAVE_ALL
> +       ALLOC_PTREGS_ON_STACK
> +       SAVE_C_REGS
> +       SAVE_EXTRA_REGS
>         jmp error_exit
>         CFI_ENDPROC
>  END(xen_failsafe_callback)
> @@ -1313,11 +1285,15 @@ ENTRY(paranoid_exit)
>  paranoid_swapgs:
>         TRACE_IRQS_IRETQ 0
>         SWAPGS_UNSAFE_STACK
> -       RESTORE_ALL 8
> +       RESTORE_EXTRA_REGS
> +       RESTORE_C_REGS
> +       REMOVE_PTREGS_FROM_STACK 8
>         jmp irq_return
>  paranoid_restore:
>         TRACE_IRQS_IRETQ_DEBUG 0
> -       RESTORE_ALL 8
> +       RESTORE_EXTRA_REGS
> +       RESTORE_C_REGS
> +       REMOVE_PTREGS_FROM_STACK 8
>         jmp irq_return
>  paranoid_userspace:
>         GET_THREAD_INFO(%rcx)
> @@ -1412,7 +1388,7 @@ END(error_entry)
>  ENTRY(error_exit)
>         DEFAULT_FRAME
>         movl %ebx,%eax
> -       RESTORE_REST
> +       RESTORE_EXTRA_REGS
>         DISABLE_INTERRUPTS(CLBR_NONE)
>         TRACE_IRQS_OFF
>         GET_THREAD_INFO(%rcx)
> @@ -1671,8 +1647,10 @@ end_repeat_nmi:
>  nmi_swapgs:
>         SWAPGS_UNSAFE_STACK
>  nmi_restore:
> +       RESTORE_EXTRA_REGS
> +       RESTORE_C_REGS
>         /* Pop the extra iret frame at once */
> -       RESTORE_ALL 6*8
> +       REMOVE_PTREGS_FROM_STACK 6*8
>
>         /* Clear the NMI executing stack variable */
>         movq $0, 5*8(%rsp)
> diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
> index ca7f0d5..673da2f 100644
> --- a/arch/x86/kernel/preempt.S
> +++ b/arch/x86/kernel/preempt.S
> @@ -6,9 +6,13 @@
>
>  ENTRY(___preempt_schedule)
>         CFI_STARTPROC
> -       SAVE_ALL
> +       ALLOC_PTREGS_ON_STACK
> +       SAVE_C_REGS
> +       SAVE_EXTRA_REGS
>         call preempt_schedule
> -       RESTORE_ALL
> +       RESTORE_EXTRA_REGS
> +       RESTORE_C_REGS
> +       REMOVE_PTREGS_FROM_STACK
>         ret
>         CFI_ENDPROC
>
> @@ -16,9 +20,13 @@ ENTRY(___preempt_schedule)
>
>  ENTRY(___preempt_schedule_context)
>         CFI_STARTPROC
> -       SAVE_ALL
> +       ALLOC_PTREGS_ON_STACK
> +       SAVE_C_REGS
> +       SAVE_EXTRA_REGS
>         call preempt_schedule_context
> -       RESTORE_ALL
> +       RESTORE_EXTRA_REGS
> +       RESTORE_C_REGS
> +       REMOVE_PTREGS_FROM_STACK
>         ret
>         CFI_ENDPROC
>
> --
> 1.8.1.4
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/5] x86: entry_64.S: delete unused code
  2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
                   ` (3 preceding siblings ...)
  2014-08-01 14:48 ` [PATCH 5/5] x86: mass removal of ARGOFFSET Denys Vlasenko
@ 2014-08-01 18:00 ` Frederic Weisbecker
  4 siblings, 0 replies; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 18:00 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook

On Fri, Aug 01, 2014 at 04:48:14PM +0200, Denys Vlasenko wrote:
> A define, two macros and an unreferenced bit of assembly are gone.
> 
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
  2014-08-01 17:04   ` Andy Lutomirski
@ 2014-08-01 18:09   ` Alexei Starovoitov
  2014-08-01 18:30   ` Oleg Nesterov
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Alexei Starovoitov @ 2014-08-01 18:09 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: LKML, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	Frederic Weisbecker, X86 ML, Will Drewry, Kees Cook

On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 64-bit code was using six stack slots fewer by not saving/restoring
> registers which are callee-preserved according to the C ABI,
> and not allocating space for them.
>
> Only when syscall needed a complete "struct pt_regs",
> the complete area was allocated and filled in.
>
> This proved to be a source of significant obfuscation and subtle bugs.
> For example, stub_fork had to pop the return address,
> extend the struct, save registers, and push return address back. Ugly.
> ia32_ptregs_common pops return address and "returns" via jmp insn,
> throwing a wrench into CPU return stack cache.
>
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
>
> Macros which manipulate "struct pt_regs" on stack are reworked:
> ALLOC_PTREGS_ON_STACK allocates the structure.
> SAVE_C_REGS saves to it those registers which are clobbered by C code.
> SAVE_EXTRA_REGS saves to it all other registers.
> Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.

Looks like a nice cleanup at the cost of extra 48 byte stack gap for fast path.
I'm guessing the gap shouldn't affect performance, but it would be nice
to see performance numbers before/after.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user
  2014-08-01 14:48 ` [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user Denys Vlasenko
@ 2014-08-01 18:30   ` Frederic Weisbecker
  0 siblings, 0 replies; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 18:30 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook

On Fri, Aug 01, 2014 at 04:48:16PM +0200, Denys Vlasenko wrote:
> No code changes.
> 
> This is a preparatory patch for change in "struct pt_regs" handling.
> 
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>

One macro dissolved == one Hop-o'-My-Thumb pebble less to drop before taking
a jump on the editor.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
  2014-08-01 17:04   ` Andy Lutomirski
  2014-08-01 18:09   ` Alexei Starovoitov
@ 2014-08-01 18:30   ` Oleg Nesterov
  2014-08-01 18:35   ` H. Peter Anvin
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Oleg Nesterov @ 2014-08-01 18:30 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: linux-kernel, H. Peter Anvin, Andy Lutomirski,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

On 08/01, Denys Vlasenko wrote:
>
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.

I obviously like this change very much. Unfortunately I can only ack the
intent ;)

I really hope that maintainers will take a closer look.

Oleg.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
                     ` (2 preceding siblings ...)
  2014-08-01 18:30   ` Oleg Nesterov
@ 2014-08-01 18:35   ` H. Peter Anvin
  2014-08-01 22:11     ` Denys Vlasenko
  2014-08-01 22:52   ` Frederic Weisbecker
  2014-08-01 23:19   ` Frederic Weisbecker
  5 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-01 18:35 UTC (permalink / raw)
  To: Denys Vlasenko, linux-kernel
  Cc: Oleg Nesterov, Andy Lutomirski, Frederic Weisbecker, X86 ML,
	Alexei Starovoitov, Will Drewry, Kees Cook

On 08/01/2014 07:48 AM, Denys Vlasenko wrote:
> 
> Patch was run-tested: 64-bit executables, 32-bit executables,
> strace works.
> 

Could you please try to see if there is a measurable change in the
latency of a trivial syscall?

	-hpa



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 18:35   ` H. Peter Anvin
@ 2014-08-01 22:11     ` Denys Vlasenko
  2014-08-01 22:13       ` H. Peter Anvin
  0 siblings, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-01 22:11 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
	Andy Lutomirski, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

On Fri, Aug 1, 2014 at 8:35 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 08/01/2014 07:48 AM, Denys Vlasenko wrote:
>>
>> Patch was run-tested: 64-bit executables, 32-bit executables,
>> strace works.
>>
>
> Could you please try to see if there is a measurable change in the
> latency of a trivial syscall?

Will do.
Something along the lines of "how long does it take to execute two
gazillions of getppid()?"

I'll send v2 shortly - I just figured out what this mysterious
dance with allocating a "not-quite-full struct pt_regs" on the interrupt
path is for: it is intended to give nested interrupts better backtraces.
I broke that with this patch; v2 will retain this feature.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 22:11     ` Denys Vlasenko
@ 2014-08-01 22:13       ` H. Peter Anvin
  2014-08-02 21:14         ` Andy Lutomirski
  0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-01 22:13 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
	Andy Lutomirski, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>
>> Could you please try to see if there is a measurable change in the
>> latency of a trivial syscall?
> 
> Will do.
> Something along the lines of "how long does it take to execute two
> gazillions of getppid()?"
> 

Something like that, yes, but you have to run enough data points so you
can determine if the difference is statistically significant or just noise.

	-hpa


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
                     ` (3 preceding siblings ...)
  2014-08-01 18:35   ` H. Peter Anvin
@ 2014-08-01 22:52   ` Frederic Weisbecker
  2014-08-01 23:19   ` Frederic Weisbecker
  5 siblings, 0 replies; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 22:52 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook

On Fri, Aug 01, 2014 at 04:48:17PM +0200, Denys Vlasenko wrote:
> 64-bit code was using six stack slots fewer by not saving/restoring
> registers which are callee-preserved according to the C ABI,
> and not allocating space for them.
> 
> Only when syscall needed a complete "struct pt_regs",
> the complete area was allocated and filled in.
> 
> This proved to be a source of significant obfuscation and subtle bugs.
> For example, stub_fork had to pop the return address,
> extend the struct, save registers, and push return address back. Ugly.
> ia32_ptregs_common pops return address and "returns" via jmp insn,
> throwing a wrench into CPU return stack cache.
> 
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
> 
> Macros which manipulate "struct pt_regs" on stack are reworked:
> ALLOC_PTREGS_ON_STACK allocates the structure.
> SAVE_C_REGS saves to it those registers which are clobbered by C code.
> SAVE_EXTRA_REGS saves to it all other registers.
> Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.
> 
> ia32_ptregs_common, stub_fork and friends lost their ugly dance with
> return pointer.
> 
> LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
> instead of magic numbers.
> 
> Misleading and slightly wrong comments in "struct pt_regs" are fixed
> (four instances).
> 
> Patch was run-tested: 64-bit executables, 32-bit executables,
> strace works.
> 
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Frederic Weisbecker <fweisbec@gmail.com>
> CC: X86 ML <x86@kernel.org>
> CC: Alexei Starovoitov <ast@plumgrid.com>
> CC: Will Drewry <wad@chromium.org>
> CC: Kees Cook <keescook@chromium.org>
> CC: linux-kernel@vger.kernel.org
> ---
>  arch/x86/ia32/ia32entry.S              |  47 +++----
>  arch/x86/include/asm/calling.h         | 224 ++++++++++++++++-----------------
>  arch/x86/include/asm/irqflags.h        |   4 +-
>  arch/x86/include/asm/ptrace.h          |  13 +-
>  arch/x86/include/uapi/asm/ptrace-abi.h |  16 ++-
>  arch/x86/include/uapi/asm/ptrace.h     |  13 +-
>  arch/x86/kernel/entry_64.S             | 132 ++++++++-----------
>  arch/x86/kernel/preempt.S              |  16 ++-
>  8 files changed, 232 insertions(+), 233 deletions(-)
> 
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 4299eb0..ef9ee16 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -62,12 +62,12 @@
>  	 */
>  	.macro LOAD_ARGS32 offset, _r9=0
>  	.if \_r9
> -	movl \offset+16(%rsp),%r9d
> +	movl \offset+R9(%rsp),%r9d
>  	.endif
> -	movl \offset+40(%rsp),%ecx
> -	movl \offset+48(%rsp),%edx
> -	movl \offset+56(%rsp),%esi
> -	movl \offset+64(%rsp),%edi
> +	movl \offset+RCX(%rsp),%ecx
> +	movl \offset+RDX(%rsp),%edx
> +	movl \offset+RSI(%rsp),%esi
> +	movl \offset+RDI(%rsp),%edi
>  	movl %eax,%eax			/* zero extension */
>  	.endm
>  	
> @@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
>  	CFI_REL_OFFSET rip,0
>  	pushq_cfi %rax
>  	cld
> -	SAVE_ARGS 0,1,0
> +	ALLOC_PTREGS_ON_STACK
> +	SAVE_C_REGS_EXCEPT_R891011
>   	/* no need to do an access_ok check here because rbp has been
>   	   32bit zero extended */ 
>  	ASM_STAC
> @@ -172,7 +173,8 @@ sysexit_from_sys_call:
>  	andl  $~0x200,EFLAGS-R11(%rsp) 
>  	movl	RIP-R11(%rsp),%edx		/* User %eip */
>  	CFI_REGISTER rip,rdx
> -	RESTORE_ARGS 0,24,0,0,0,0
> +	RESTORE_RSI_RDI

I heard there will be a v2, so I'll probably wait for it to review this patch,
which really requires 0 dB where I sit.

But macro names like the ones above look much clearer as well!

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
                     ` (4 preceding siblings ...)
  2014-08-01 22:52   ` Frederic Weisbecker
@ 2014-08-01 23:19   ` Frederic Weisbecker
  2014-08-04  3:03     ` Denys Vlasenko
  5 siblings, 1 reply; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 23:19 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook

On Fri, Aug 01, 2014 at 04:48:17PM +0200, Denys Vlasenko wrote:
>  
>  /* 0(%rsp): ~(interrupt number) */
>  	.macro interrupt func
> -	/* reserve pt_regs for scratch regs and rbp */
> -	subq $ORIG_RAX-RBP, %rsp
> -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
> -	cld
> -	/* start from rbp in pt_regs and jump over */
> -	movq_cfi rdi, (RDI-RBP)
> -	movq_cfi rsi, (RSI-RBP)
> -	movq_cfi rdx, (RDX-RBP)
> -	movq_cfi rcx, (RCX-RBP)
> -	movq_cfi rax, (RAX-RBP)
> -	movq_cfi  r8,  (R8-RBP)
> -	movq_cfi  r9,  (R9-RBP)
> -	movq_cfi r10, (R10-RBP)
> -	movq_cfi r11, (R11-RBP)
> -
> -	/* Save rbp so that we can unwind from get_irq_regs() */
> -	movq_cfi rbp, 0

Hmm, SAVE_C_REGS below doesn't seem to save rbp like we did before.
Perhaps it's implicitly saved somewhere?

> -
> -	/* Save previous stack value */
> -	movq %rsp, %rsi

Also rsp isn't saved in %rsi like before. Maybe
that's because we already save it in rdi? Makes sense since
now arg1 == rsp. More on that later.

> -
> -	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
> -	testl $3, CS-RBP(%rsi)
> +	ALLOC_PTREGS_ON_STACK
> +	SAVE_C_REGS
> +	movq %rsp, %rdi	/* arg1 for handler */
> +	testl $3, CS(%rsp)
>  	je 1f
>  	SWAPGS
> -	/*
> +1:	/*
>  	 * irq_count is used to check if a CPU is already on an interrupt stack
>  	 * or not. While this is essentially redundant with preempt_count it is
>  	 * a little cheaper to use a separate counter in the PDA (short of
>  	 * moving irq_enter into assembly, which would be too much work)
>  	 */
> -1:	incl PER_CPU_VAR(irq_count)
> +	incl PER_CPU_VAR(irq_count)
>  	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
> -	CFI_DEF_CFA_REGISTER	rsi
> +	CFI_DEF_CFA_REGISTER	rdi
>  
>  	/* Store previous stack value */
> -	pushq %rsi
> +	pushq %rdi

So you push rdi instead...

>  	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
>  			0x77 /* DW_OP_breg7 */, 0, \
>  			0x06 /* DW_OP_deref */, \
> -			0x08 /* DW_OP_const1u */, SS+8-RBP, \
> +			0x08 /* DW_OP_const1u */, SS+8, \
>  			0x22 /* DW_OP_plus */
>  	/* We entered an interrupt context - irqs are off: */
>  	TRACE_IRQS_OFF
> -
>  	call \func
>  	.endm
>  
> @@ -749,10 +719,9 @@ ret_from_intr:
>  
>  	/* Restore saved previous stack */
>  	popq %rsi

And then you pop to rsi. OK, that indeed works, but perhaps we should keep it
symmetrical just for clarity? Any reason why we can't reuse rdi here?

> -	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
> -	leaq ARGOFFSET-RBP(%rsi), %rsp
> +	CFI_DEF_CFA rsi,SS+8	/* reg/off reset after def_cfa_expr */
> +	movq %rsi, %rsp
>  	CFI_DEF_CFA_REGISTER	rsp
> -	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET

Thanks.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 22:13       ` H. Peter Anvin
@ 2014-08-02 21:14         ` Andy Lutomirski
  2014-08-02 21:23           ` H. Peter Anvin
  0 siblings, 1 reply; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-02 21:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Denys Vlasenko, Denys Vlasenko, Linux Kernel Mailing List,
	Oleg Nesterov, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

On Fri, Aug 1, 2014 at 3:13 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>>
>>> Could you please try to see if there is a measurable change in the
>>> latency of a trivial syscall?
>>
>> Will do.
>> Something along the lines of "how long does it take to execute two
>> gazillions of getppid()?"
>>
>
> Something like that, yes, but you have to run enough data points so you
> can determine if the difference is statistically significant or just noise.

Denys, if you want to avoid five minutes of programming, you can build this:

https://gitorious.org/linux-test-utils/linux-clock-tests/

and run timing_test_64 10 getpid

It doesn't do real statistics, but it seems to get quite stable results.

--Andy

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-02 21:14         ` Andy Lutomirski
@ 2014-08-02 21:23           ` H. Peter Anvin
  2014-08-02 21:38             ` Andy Lutomirski
  0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-02 21:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Denys Vlasenko, Denys Vlasenko, Linux Kernel Mailing List,
	Oleg Nesterov, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

It would be nice to automate running a T-test on it.

On August 2, 2014 2:14:50 PM PDT, Andy Lutomirski <luto@amacapital.net> wrote:
>On Fri, Aug 1, 2014 at 3:13 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>>>
>>>> Could you please try to see if there is a measurable change in the
>>>> latency of a trivial syscall?
>>>
>>> Will do.
>>> Something along the lines of "how long does it take to execute two
>>> gazillions of getppid()?"
>>>
>>
>> Something like that, yes, but you have to run enough data points so
>you
>> can determine if the difference is statistically significant or just
>noise.
>
>Denys, if you want to avoid five minutes of programming, you can build
>this:
>
>https://gitorious.org/linux-test-utils/linux-clock-tests/
>
>and run timing_test_64 10 getpid
>
>It doesn't do real statistics, but it seems to get quite stable
>results.
>
>--Andy

-- 
Sent from my mobile phone.  Please pardon brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-02 21:23           ` H. Peter Anvin
@ 2014-08-02 21:38             ` Andy Lutomirski
  0 siblings, 0 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-02 21:38 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Denys Vlasenko, Denys Vlasenko, Linux Kernel Mailing List,
	Oleg Nesterov, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

On Sat, Aug 2, 2014 at 2:23 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> It would be nice to automate running a T-test on it.

I'll see if I can do something like that.  Using the t statistic seems
like overkill here, though -- timing_test_64 runs millions of
iterations, so even if I batch them, I'll end up with n ~ 1000 (or
even larger), at which point plain old Gaussians should be fine.
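
Concretely, something like this is all I mean (a sketch - Welch's t on
batch means; none of this is from the actual tool):

	#include <math.h>

	static void mean_var(const double *x, int n, double *m, double *v)
	{
		double s = 0.0, ss = 0.0;
		int i;

		for (i = 0; i < n; i++)
			s += x[i];
		*m = s / n;
		for (i = 0; i < n; i++)
			ss += (x[i] - *m) * (x[i] - *m);
		*v = ss / (n - 1);	/* sample variance */
	}

	/* Welch's t on two sets of batch means: with n ~ 1000 per side,
	 * |t| above ~2 means the difference is very unlikely to be noise. */
	double welch_t(const double *a, int na, const double *b, int nb)
	{
		double ma, va, mb, vb;

		mean_var(a, na, &ma, &va);
		mean_var(b, nb, &mb, &vb);
		return (ma - mb) / sqrt(va/na + vb/nb);
	}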

--Andy

>
> On August 2, 2014 2:14:50 PM PDT, Andy Lutomirski <luto@amacapital.net> wrote:
>>On Fri, Aug 1, 2014 at 3:13 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>>>>
>>>>> Could you please try to see if there is a measurable change in the
>>>>> latency of a trivial syscall?
>>>>
>>>> Will do.
>>>> Something along the lines of "how long does it take to execute two
>>>> gazillions of getppid()?"
>>>>
>>>
>>> Something like that, yes, but you have to run enough data points so
>>you
>>> can determine if the difference is statistically significant or just
>>noise.
>>
>>Denys, if you want to avoid five minutes of programming, you can build
>>this:
>>
>>https://gitorious.org/linux-test-utils/linux-clock-tests/
>>
>>and run timing_test_64 10 getpid
>>
>>It doesn't do real statistics, but it seems to get quite stable
>>results.
>>
>>--Andy
>
> --
> Sent from my mobile phone.  Please pardon brevity and lack of formatting.



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 23:19   ` Frederic Weisbecker
@ 2014-08-04  3:03     ` Denys Vlasenko
  2014-08-04  7:57       ` Borislav Petkov
  2014-08-11  0:46       ` Frederic Weisbecker
  0 siblings, 2 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-04  3:03 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
	H. Peter Anvin, Andy Lutomirski, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> On Fri, Aug 01, 2014 at 04:48:17PM +0200, Denys Vlasenko wrote:
>>
>>  /* 0(%rsp): ~(interrupt number) */
>>       .macro interrupt func
>> -     /* reserve pt_regs for scratch regs and rbp */
>> -     subq $ORIG_RAX-RBP, %rsp
>> -     CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
>> -     cld
>> -     /* start from rbp in pt_regs and jump over */
>> -     movq_cfi rdi, (RDI-RBP)
>> -     movq_cfi rsi, (RSI-RBP)
>> -     movq_cfi rdx, (RDX-RBP)
>> -     movq_cfi rcx, (RCX-RBP)
>> -     movq_cfi rax, (RAX-RBP)
>> -     movq_cfi  r8,  (R8-RBP)
>> -     movq_cfi  r9,  (R9-RBP)
>> -     movq_cfi r10, (R10-RBP)
>> -     movq_cfi r11, (R11-RBP)
>> -
>> -     /* Save rbp so that we can unwind from get_irq_regs() */
>> -     movq_cfi rbp, 0
>
> Hmm, SAVE_C_REGS below doesn't seem to save rbp like we did before.
> Perhaps it's implicitly saved somewhere?
>
>> -
>> -     /* Save previous stack value */
>> -     movq %rsp, %rsi
>
> Also rsp isn't saved in %rsi like before. Maybe
> that's because we already save it in rdi? Makes sense since
> now arg1 == rsp. More on that later.
>
>> -
>> -     leaq -RBP(%rsp),%rdi    /* arg1 for handler */
>> -     testl $3, CS-RBP(%rsi)
>> +     ALLOC_PTREGS_ON_STACK
>> +     SAVE_C_REGS
>> +     movq %rsp, %rdi /* arg1 for handler */
>> +     testl $3, CS(%rsp)
>>       je 1f
>>       SWAPGS
>> -     /*
>> +1:   /*
>>        * irq_count is used to check if a CPU is already on an interrupt stack
>>        * or not. While this is essentially redundant with preempt_count it is
>>        * a little cheaper to use a separate counter in the PDA (short of
>>        * moving irq_enter into assembly, which would be too much work)
>>        */
>> -1:   incl PER_CPU_VAR(irq_count)
>> +     incl PER_CPU_VAR(irq_count)
>>       cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
>> -     CFI_DEF_CFA_REGISTER    rsi
>> +     CFI_DEF_CFA_REGISTER    rdi
>>
>>       /* Store previous stack value */
>> -     pushq %rsi
>> +     pushq %rdi
>
> So you push rdi instead...
>
>>       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
>>                       0x77 /* DW_OP_breg7 */, 0, \
>>                       0x06 /* DW_OP_deref */, \
>> -                     0x08 /* DW_OP_const1u */, SS+8-RBP, \
>> +                     0x08 /* DW_OP_const1u */, SS+8, \
>>                       0x22 /* DW_OP_plus */
>>       /* We entered an interrupt context - irqs are off: */
>>       TRACE_IRQS_OFF
>> -
>>       call \func
>>       .endm
>>
>> @@ -749,10 +719,9 @@ ret_from_intr:
>>
>>       /* Restore saved previous stack */
>>       popq %rsi
>
> And then you pop to rsi. OK, that indeed works, but perhaps we should keep it
> symmetrical just for clarity? Any reason why we can't reuse rdi here?

I changed this entire area in v2: basically, I will not change the logic,
but will add comments explaining what we are doing here, and why.
(Some minor code changes will be done, not affecting the logic).

While we are at it, what does this CFI_ESCAPE thing do here?
As usual, it has no comment :/

-- 
vda

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-04  3:03     ` Denys Vlasenko
@ 2014-08-04  7:57       ` Borislav Petkov
  2014-08-11  0:46       ` Frederic Weisbecker
  1 sibling, 0 replies; 41+ messages in thread
From: Borislav Petkov @ 2014-08-04  7:57 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Frederic Weisbecker, Denys Vlasenko, Linux Kernel Mailing List,
	Oleg Nesterov, H. Peter Anvin, Andy Lutomirski, X86 ML,
	Alexei Starovoitov, Will Drewry, Kees Cook

On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
> I changed this entire area in v2: basically, I will not change the
> logic, but will add comments explaining what are we doing here, and
> why.

Very good idea! This file needs some good commenting.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-01 17:04   ` Andy Lutomirski
@ 2014-08-04 14:28     ` Denys Vlasenko
  2014-08-04 14:47       ` Oleg Nesterov
  2014-08-04 21:03       ` Andy Lutomirski
  0 siblings, 2 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-04 14:28 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Denys Vlasenko, linux-kernel, Oleg Nesterov, H. Peter Anvin,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

On Fri, Aug 1, 2014 at 7:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
>> 64-bit code was using six stack slots fewer by not saving/restoring
>> registers which are callee-preserved according to the C ABI,
>> and not allocating space for them
>
> This is great.
>
> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :)  Maybe I'll give that a shot.

I'm still at the stage of "what does that stuff do anyway?" and at
"why do we need percpu old_rsp thingy?" in particular.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-04 14:28     ` Denys Vlasenko
@ 2014-08-04 14:47       ` Oleg Nesterov
  2014-08-04 15:34         ` Oleg Nesterov
  2014-08-04 21:03       ` Andy Lutomirski
  1 sibling, 1 reply; 41+ messages in thread
From: Oleg Nesterov @ 2014-08-04 14:47 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Andy Lutomirski, Denys Vlasenko, linux-kernel, H. Peter Anvin,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

On 08/04, Denys Vlasenko wrote:
>
> "why do we need percpu old_rsp thingy?" in particular.

See thread_struct->usersp, current_user_stack_pointer().

Oleg.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-04 14:47       ` Oleg Nesterov
@ 2014-08-04 15:34         ` Oleg Nesterov
  0 siblings, 0 replies; 41+ messages in thread
From: Oleg Nesterov @ 2014-08-04 15:34 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Andy Lutomirski, Denys Vlasenko, linux-kernel, H. Peter Anvin,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

On 08/04, Oleg Nesterov wrote:
>
> On 08/04, Denys Vlasenko wrote:
> >
> > "why do we need percpu old_rsp thingy?" in particular.
>
> See thread_struct->usersp, current_user_stack_pointer().

Btw, perhaps it makes sense to document that task_pt_regs(current)->sp
is not current_user_stack_pointer() inside system_call_fastpath.

Oleg.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-04 14:28     ` Denys Vlasenko
  2014-08-04 14:47       ` Oleg Nesterov
@ 2014-08-04 21:03       ` Andy Lutomirski
  2014-08-04 21:23         ` Borislav Petkov
  2014-08-05 10:35         ` Denys Vlasenko
  1 sibling, 2 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-04 21:03 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Denys Vlasenko, linux-kernel, Oleg Nesterov, H. Peter Anvin,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

On Mon, Aug 4, 2014 at 11:28 PM, Denys Vlasenko
<vda.linux@googlemail.com> wrote:
> On Fri, Aug 1, 2014 at 7:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
>>> 64-bit code was using six stack slots fewer by not saving/restoring
>>> registers which are callee-preserved according to the C ABI,
>>> and not allocating space for them
>>
>> This is great.
>>
>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :)  Maybe I'll give that a shot.
>
> I'm still at the stage of "what does that stuff do anyway?" and at
> "why do we need percpu old_rsp thingy?" in particular.

On x86_64, the syscall instruction has no effect on rsp.  That means
that the entry point starts out with no stack.  There are no free
registers whatsoever at the entry point.

That means that the entry code needs to do swapgs, stash rsp somewhere
relative to gs, and then load the kernel's rsp.  old_rsp is the spot
used for this.
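
For concreteness, the opening of that path looks roughly like this
(a sketch; the real entry_64.S spells the first insn via the
SWAPGS_UNSAFE_STACK paravirt macro):

	swapgs					# gs now points at the per-CPU area
	movq	%rsp, PER_CPU_VAR(old_rsp)	# stash the user rsp
	movq	PER_CPU_VAR(kernel_stack), %rsp	# only now do we have a stack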

Now the kernel does an optimization that is, I think, very much not
worth it.  The kernel doesn't bother sticking the old rsp value into
pt_regs (saving two instructions on fast path entries) and doesn't
initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
more instructions.

To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
dance is needed, and there's the usersp crap in the context switch
code, and current_user_stack_pointer(), and probably even more crap
that I haven't noticed.  And I sure hope that nothing in the *compat*
syscall path touches current_user_stack_pointer(), because the compat
code doesn't seem to use old_rsp.
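
For reference, IIRC the fixup amounts to roughly this (from memory,
so treat it as a sketch, not the exact macro):

	.macro FIXUP_TOP_OF_STACK tmp offset=0
	movq PER_CPU_VAR(old_rsp),\tmp
	movq \tmp,RSP+\offset(%rsp)	/* make pt_regs->sp usable */
	movq $__USER_DS,SS+\offset(%rsp)
	movq $__USER_CS,CS+\offset(%rsp)
	movq $-1,RCX+\offset(%rsp)	/* the odd -1 store */
	movq R11+\offset(%rsp),\tmp	/* the r11 slot holds user eflags */
	movq \tmp,EFLAGS+\offset(%rsp)	/* make pt_regs->flags usable */
	.endm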

I think this should all be ripped out.  The only real difficulty will
be that the sysret code needs to restore rsp itself, so the sysret
path will end up needing two more instructions.  Removing all of the
TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
and I wouldn't be surprised if this adds considerably fewer than ten
cycles on any modern chip.

(It's too bad that there's no unlocked xchg; this could be faster if
we had one.  It's also too bad that the syscall ABI didn't choose some
register to unconditionally set to zero, which would have given us the
single scratch register we'd need to avoid this whole mess in the
first place.)

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-04 21:03       ` Andy Lutomirski
@ 2014-08-04 21:23         ` Borislav Petkov
  2014-08-05 10:35         ` Denys Vlasenko
  1 sibling, 0 replies; 41+ messages in thread
From: Borislav Petkov @ 2014-08-04 21:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Denys Vlasenko, Denys Vlasenko, linux-kernel, Oleg Nesterov,
	H. Peter Anvin, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

On Tue, Aug 05, 2014 at 06:03:24AM +0900, Andy Lutomirski wrote:
> (It's too bad that there's no unlocked xchg; this could be faster if
> we had one.

There is - just put both operands in registers. :-)
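
(That is, the implicit LOCK applies only to the memory-operand forms:

	xchgq	%rax, %rcx	# reg-reg: a plain swap, never locked
	xchgq	%rax, (%rdi)	# reg-mem: implicitly LOCKed, expensive

so a register-register xchg carries no bus-lock penalty.)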

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-04 21:03       ` Andy Lutomirski
  2014-08-04 21:23         ` Borislav Petkov
@ 2014-08-05 10:35         ` Denys Vlasenko
  2014-08-05 14:53           ` Andy Lutomirski
  1 sibling, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-05 10:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Denys Vlasenko, linux-kernel, Oleg Nesterov, H. Peter Anvin,
	Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
	Kees Cook

On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :)  Maybe I'll give that a shot.
>>
>> I'm still at the stage of "what does that stuff do anyway?" and at
>> "why do we need percpu old_rsp thingy?" in particular.
>
> On x86_64, the syscall instruction has no effect on rsp.  That means
> that the entry point starts out with no stack.  There are no free
> registers whatsoever at the entry point.
>
> That means that the entry code needs to do swapgs, stash rsp somewhere
> relative to gs, and then load the kernel's rsp.  old_rsp is the spot
> used for this.
>
> Now the kernel does an optimization that is, I think, very much not
> worth it.  The kernel doesn't bother sticking the old rsp value into
> pt_regs (saving two instructions on fast path entries) and doesn't
> initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
> more instructions.
>
> To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
> dance is needed, and there's the usersp crap in the context switch
> code, and current_user_stack_pointer(), and probably even more crap
> that I haven't noticed.  And I sure hope that nothing in the *compat*
> syscall path touches current_user_stack_pointer(), because the compat
> code doesn't seem to use old_rsp.
>
> I think this should all be ripped out.  The only real difficulty will
> be that the sysret code needs to restore rsp itself, so the sysret
> path will end up needing two more instructions.  Removing all of the
> TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
> and I wouldn't be surprised if this adds considerably fewer than ten
> cycles on any modern chip.

Something like this on the fast path? -

        SWAPGS_UNSAFE_STACK
        movq    %rsp,PER_CPU_VAR(old_rsp)
        movq    PER_CPU_VAR(kernel_stack),%rsp
        ENABLE_INTERRUPTS(CLBR_NONE)
        ALLOC_PTREGS_ON_STACK 8         /* +8: space for orig_ax */
        SAVE_C_REGS
        movq  %rax,ORIG_RAX(%rsp)
        movq  %rcx,RIP(%rsp)
+       movq  %r11,EFLAGS(%rsp)
+       movq PER_CPU_VAR(old_rsp),%rcx
+       movq %rcx,RSP(%rsp)
        ...
-       RESTORE_C_REGS_EXCEPT_RCX
+       RESTORE_C_REGS_EXCEPT_RCX_R11
        movq RIP(%rsp),%rcx
+       movq    EFLAGS(%rsp), %r11
-       movq    PER_CPU_VAR(old_rsp), %rsp
+       movq    RSP(%rsp), %rsp
      USERGS_SYSRET64

Looks like only 3 additional insns (unfortunately, one is a memory read).
Do we need to save rcx and r11 in "struct pt_regs" in their
"standard" slots, though? If we don't, we can drop two insns
(SAVE_C_REGS -> SAVE_C_REGS_EXCEPT_RCX_R11).

Then old_rsp can be nuked everywhere else,
RESTORE_TOP_OF_STACK can be nuked, and
FIXUP_TOP_OF_STACK can be reduced to merely:

        movq $__USER_DS,SS(%rsp)
        movq $__USER_CS,CS(%rsp)

(BTW, why does it currently do "movq $-1,RCX+\offset(%rsp)"?)

-- 
vda

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-05 10:35         ` Denys Vlasenko
@ 2014-08-05 14:53           ` Andy Lutomirski
  2014-08-05 15:17             ` Denys Vlasenko
  2014-08-07  9:54             ` Denys Vlasenko
  0 siblings, 2 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-05 14:53 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: X86 ML, Frederic Weisbecker, Denys Vlasenko, Oleg Nesterov,
	linux-kernel, H. Peter Anvin, Alexei Starovoitov, Kees Cook,
	Will Drewry

On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
>
> On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :)  Maybe I'll give that a shot.
> >>
> >> I'm still at the stage of "what does that stuff do anyway?" and at
> >> "why do we need percpu old_rsp thingy?" in particular.
> >
> > On x86_64, the syscall instruction has no effect on rsp.  That means
> > that the entry point starts out with no stack.  There are no free
> > registers whatsoever at the entry point.
> >
> > That means that the entry code needs to do swapgs, stash rsp somewhere
> > relative to gs, and then load the kernel's rsp.  old_rsp is the spot
> > used for this.
> >
> > Now the kernel does an optimization that is, I think, very much not
> > worth it.  The kernel doesn't bother sticking the old rsp value into
> > pt_regs (saving two instructions on fast path entries) and doesn't
> > initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
> > more instructions.
> >
> > To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
> > dance is needed, and there's the usersp crap in the context switch
> > code, and current_user_stack_pointer(), and probably even more crap
> > that I haven't noticed.  And I sure hope that nothing in the *compat*
> > syscall path touches current_user_stack_pointer(), because the compat
> > code doesn't seem to use old_rsp.
> >
> > I think this should all be ripped out.  The only real difficulty will
> > be that the sysret code needs to restore rsp itself, so the sysret
> > path will end up needing two more instructions.  Removing all of the
> > TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
> > and I wouldn't be surprised if this adds considerably fewer than ten
> > cycles on any modern chip.
>
> Something like this on the fast path? -
>
>         SWAPGS_UNSAFE_STACK
>         movq    %rsp,PER_CPU_VAR(old_rsp)
>         movq    PER_CPU_VAR(kernel_stack),%rsp
>         ENABLE_INTERRUPTS(CLBR_NONE)
>         ALLOC_PTREGS_ON_STACK 8         /* +8: space for orig_ax */
>         SAVE_C_REGS
>         movq  %rax,ORIG_RAX(%rsp)
>         movq  %rcx,RIP(%rsp)
> +       movq  %r11,EFLAGS(%rsp)
> +       movq PER_CPU_VAR(old_rsp),%rcx
> +       movq %rcx,RSP(%rsp)
>         ...
> -       RESTORE_C_REGS_EXCEPT_RCX
> +       RESTORE_C_REGS_EXCEPT_RCX_R11
>         movq RIP(%rsp),%rcx
> +       movq    EFLAGS(%rsp), %r11
> -       movq    PER_CPU_VAR(old_rsp), %rsp
> +       movq    RSP(%rsp), %rsp
>       USERGS_SYSRET64

The sysret code still needs the inverse, right?  ptrace can change
RSP.  And, if that happens, then all the context switch code can go,
as can the usersp thread info slot.

>
> Looks like only 3 additional insns (unfortunately, one is a memory read).

The store forwarding buffer should handle that one, I think.

> Do we need to save rcx and r11 in "struct pt_regs" in their
> "standard" slots, though?

ptrace probably wants it.

> If we don't, we can drop two insns
> (SAVE_C_REGS -> SAVE_C_REGS_EXCEPT_RCX_R11).
>
> Then old_rsp can be nuked everywhere else,
> RESTORE_TOP_OF_STACK can be nuked, and
> FIXUP_TOP_OF_STACK can be reduced to merely:
>
>         movq $__USER_DS,SS(%rsp)
>         movq $__USER_CS,CS(%rsp)

Mmm, right.  That's probably better than doing this on the fast path.

>
> (BTW, why does it currently do "movq $-1,RCX+\offset(%rsp)"?)

I would argue this is a bug.  (In fact, I have a patch floating around
to fix it.  The current code is glitchy in a visible-to-user-space
way.)  We should put rcx into both RIP and RCX, since the sysret path
will implicitly do that, and we should restore the same register
values in the iret and sysret paths.

--Andy

>
> --
> vda

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-05 14:53           ` Andy Lutomirski
@ 2014-08-05 15:17             ` Denys Vlasenko
  2014-08-05 23:02               ` Andy Lutomirski
  2014-08-07  9:54             ` Denys Vlasenko
  1 sibling, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-05 15:17 UTC (permalink / raw)
  To: Andy Lutomirski, Denys Vlasenko
  Cc: X86 ML, Frederic Weisbecker, Oleg Nesterov, linux-kernel,
	H. Peter Anvin, Alexei Starovoitov, Kees Cook, Will Drewry

On 08/05/2014 04:53 PM, Andy Lutomirski wrote:
> On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
>>
>> On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :)  Maybe I'll give that a shot.
>>>>
>>>> I'm still at the stage of "what does that stuff do anyway?" and at
>>>> "why do we need percpu old_rsp thingy?" in particular.
>>>
>>> On x86_64, the syscall instruction has no effect on rsp.  That means
>>> that the entry point starts out with no stack.  There are no free
>>> registers whatsoever at the entry point.
>>>
>>> That means that the entry code needs to do swapgs, stash rsp somewhere
>>> relative to gs, and then load the kernel's rsp.  old_rsp is the spot
>>> used for this.
>>>
>>> Now the kernel does an optimization that is, I think, very much not
>>> worth it.  The kernel doesn't bother sticking the old rsp value into
>>> pt_regs (saving two instructions on fast path entries) and doesn't
>>> initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
>>> more instructions.
>>>
>>> To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
>>> dance is needed, and there's the usersp crap in the context switch
>>> code, and current_user_stack_pointer(), and probably even more crap
>>> that I haven't noticed.  And I sure hope that nothing in the *compat*
>>> syscall path touches current_user_stack_pointer(), because the compat
>>> code doesn't seem to use old_rsp.
>>>
>>> I think this should all be ripped out.  The only real difficulty will
>>> be that the sysret code needs to restore rsp itself, so the sysret
>>> path will end up needing two more instructions.  Removing all of the
>>> TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
>>> and I wouldn't be surprised if this adds considerably fewer than ten
>>> cycles on any modern chip.
>>
>> Something like this on the fast path? -
>>
>>         SWAPGS_UNSAFE_STACK
>>         movq    %rsp,PER_CPU_VAR(old_rsp)
>>         movq    PER_CPU_VAR(kernel_stack),%rsp
>>         ENABLE_INTERRUPTS(CLBR_NONE)
>>         ALLOC_PTREGS_ON_STACK 8         /* +8: space for orig_ax */
>>         SAVE_C_REGS
>>         movq  %rax,ORIG_RAX(%rsp)
>>         movq  %rcx,RIP(%rsp)
>> +       movq  %r11,EFLAGS(%rsp)
>> +       movq PER_CPU_VAR(old_rsp),%rcx
>> +       movq %rcx,RSP(%rsp)
>>         ...
>> -       RESTORE_C_REGS_EXCEPT_RCX
>> +       RESTORE_C_REGS_EXCEPT_RCX_R11
>>         movq RIP(%rsp),%rcx
>> +       movq    EFLAGS(%rsp), %r11
>> -       movq    PER_CPU_VAR(old_rsp), %rsp
>> +       movq    RSP(%rsp), %rsp
>>       USERGS_SYSRET64
> 
> The sysret code still needs the inverse, right?

The part after "..." in my sketch is the sysret code.

> ptrace can change RSP.

By writing to pt_regs->rsp, yes. And the above code
will pick it up - we read RSP(%rsp).

>> Do we need to save rcx and r11 in "struct pt_regs" in their
>> "standard" slots, though?
> 
> ptrace probably wants it.

Let's see.
They don't contain any useful information:
With the current code,
pt_regs->r11 is the same as pt_regs->rflags, and
pt_regs->rcx is the same as pt_regs->rip (modulo the weird store of -1).
So reading them via ptrace is... weird - just read
pt_regs->rflags or pt_regs->rip instead!

If ptrace is active, we'll return via iretq.
If ptrace wrote to these pt_regs members, on return
to userspace the current code restores the modified values.
My proposed change does not change this.

So, only ptrace reads of rcx and r11 will be affected.
Hmm. Maybe we can fill them only on the "tracesys:" codepath?
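
Something like this, I mean (hypothetical placement, assuming the new
full-pt_regs layout with the slot names from calling.h):

tracesys:
	/*
	 * The fast path skipped these two slots; fill them here so
	 * ptrace sees consistent values.  After the syscall insn,
	 * rcx = user rip and r11 = user rflags.
	 */
	movq %rcx, RCX(%rsp)
	movq %r11, R11(%rsp)
	...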

>> Then old_rsp can be nuked everywhere else,
>> RESTORE_TOP_OF_STACK can be nuked, and
>> FIXUP_TOP_OF_STACK can be reduced to merely:
>>
>>         movq $__USER_DS,SS(%rsp)
>>         movq $__USER_CS,CS(%rsp)
> 
> Mmm, right.  That's probably better than doing this on the fast path.
> 
>>
>> (BTW, why does it currently do "movq $-1,RCX+\offset(%rsp)"?)
> 
> I would argue this is a bug.

Agree.

> (In fact, I have a patch floating around
> to fix it.  The current code is glitchy in a visible-to-user-space
> way.)  We should put rcx into both RIP and RCX, since the sysret path
> will implicitly do that, and we should restore the same register
> values in the iret and sysret paths.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-05 15:17             ` Denys Vlasenko
@ 2014-08-05 23:02               ` Andy Lutomirski
  0 siblings, 0 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-05 23:02 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: X86 ML, Frederic Weisbecker, Oleg Nesterov, linux-kernel,
	H. Peter Anvin, Denys Vlasenko, Alexei Starovoitov, Will Drewry,
	Kees Cook

On Aug 6, 2014 12:17 AM, "Denys Vlasenko" <dvlasenk@redhat.com> wrote:
>
> On 08/05/2014 04:53 PM, Andy Lutomirski wrote:
> > On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
> >>
> >> On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >>>>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :)  Maybe I'll give that a shot.
> >>>>
> >>>> I'm still at the stage of "what does that stuff do anyway?" and at
> >>>> "why do we need percpu old_rsp thingy?" in particular.
> >>>
> >>> On x86_64, the syscall instruction has no effect on rsp.  That means
> >>> that the entry point starts out with no stack.  There are no free
> >>> registers whatsoever at the entry point.
> >>>
> >>> That means that the entry code needs to do swapgs, stash rsp somewhere
> >>> relative to gs, and then load the kernel's rsp.  old_rsp is the spot
> >>> used for this.
> >>>
> >>> Now the kernel does an optimization that is, I think, very much not
> >>> worth it.  The kernel doesn't bother sticking the old rsp value into
> >>> pt_regs (saving two instructions on fast path entries) and doesn't
> >>> initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
> >>> more instructions.
> >>>
> >>> To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
> >>> dance is needed, and there's the usersp crap in the context switch
> >>> code, and current_user_stack_pointer(), and probably even more crap
> >>> that I haven't noticed.  And I sure hope that nothing in the *compat*
> >>> syscall path touches current_user_stack_pointer(), because the compat
> >>> code doesn't seem to use old_rsp.
> >>>
> >>> I think this should all be ripped out.  The only real difficulty will
> >>> be that the sysret code needs to restore rsp itself, so the sysret
> >>> path will end up needing two more instructions.  Removing all of the
> >>> TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
> >>> and I wouldn't be surprised if this adds considerably fewer than ten
> >>> cycles on any modern chip.
> >>
> >> Something like this on the fast path? -
> >>
> >>         SWAPGS_UNSAFE_STACK
> >>         movq    %rsp,PER_CPU_VAR(old_rsp)
> >>         movq    PER_CPU_VAR(kernel_stack),%rsp
> >>         ENABLE_INTERRUPTS(CLBR_NONE)
> >>         ALLOC_PTREGS_ON_STACK 8         /* +8: space for orig_ax */
> >>         SAVE_C_REGS
> >>         movq  %rax,ORIG_RAX(%rsp)
> >>         movq  %rcx,RIP(%rsp)
> >> +       movq  %r11,EFLAGS(%rsp)
> >> +       movq PER_CPU_VAR(old_rsp),%rcx
> >> +       movq %rcx,RSP(%rsp)
> >>         ...
> >> -       RESTORE_C_REGS_EXCEPT_RCX
> >> +       RESTORE_C_REGS_EXCEPT_RCX_R11
> >>         movq RIP(%rsp),%rcx
> >> +       movq    EFLAGS(%rsp), %r11
> >> -       movq    PER_CPU_VAR(old_rsp), %rsp
> >> +       movq    RSP(%rsp), %rsp
> >>       USERGS_SYSRET64
> >
> > The sysret code still needs the inverse, right?
>
> The part after "..." in my sketch is the sysret code.

Right :)

>
> > ptrace can change RSP.
>
> By writing to pt_regs->rsp, yes. And the above code
> will pick it up - we read RSP(%rsp).

Also correct -- nice :)

>
> >> Do we need to save rcx and r11 in "struct pt_regs" in their
> >> "standard" slots, though?
> >
> > ptrace probably wants it.
>
> Let's see.
> They don't contain any useful information:
> With the current code,
> pt_regs->r11 is the same as pt_regs->rflags, and
> pt_regs->rcx is the same as pt_regs->rip (modulo the weird store of -1).
> So reading them via ptrace is... weird - just read
> pt_regs->rflags or pt_regs->rip instead!

I'm unconvinced.  This is too complicated.

>
> If ptrace is active, we'll return via iretq.
> If ptrace wrote to these pt_regs members, on return
> to userspace the current code restores the modified values.
> My proposed change does not change this.
>
> So, only ptrace reads of rcx and r11 will be affected.
> Hmm. Maybe we can fill them only on the "tracesys:" codepath?

It's not just tracesys -- there's FORK_LIKE, the various pt_regs
stubs, and exit work, too.  Setting these values up early is faster,
anyway -- no loads are needed, only stores, and the stores will
presumably be combined with the rest of the frame setup, so no
additional memory bandwidth should be needed.
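
In code terms, the early variant is nothing but stores into the frame
we are building anyway (a sketch of the idea, not the exact code):

	movq %rcx, RIP(%rsp)	/* syscall put the return rip in rcx */
	movq %rcx, RCX(%rsp)	/* same value: sysret reloads rcx from rip */
	movq %r11, EFLAGS(%rsp)	/* syscall put user rflags in r11 */
	movq %r11, R11(%rsp)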

--Andy

>
> >> Then old_rsp can be nuked everywhere else,
> >> RESTORE_TOP_OF_STACK can be nuked, and
> >> FIXUP_TOP_OF_STACK can be reduced to merely:
> >>
> >>         movq $__USER_DS,SS(%rsp)
> >>         movq $__USER_CS,CS(%rsp)
> >
> > Mmm, right.  That's probably better than doing this on the fast path.
> >
> >>
> >> (BTW, why does it currently do "movq $-1,RCX+\offset(%rsp)"?)
> >
> > I would argue this is a bug.
>
> Agree.
>
> > (In fact, I have a patch floating around
> > to fix it.  The current code is glitchy in a visible-to-user-space
> > way.)  We should put rcx into both RIP and RCX, since the sysret path
> > will implicitly do that, and we should restore the same register
> > values in the iret and sysret paths.
>

--Andy

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-05 14:53           ` Andy Lutomirski
  2014-08-05 15:17             ` Denys Vlasenko
@ 2014-08-07  9:54             ` Denys Vlasenko
  1 sibling, 0 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-07  9:54 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, Frederic Weisbecker, Denys Vlasenko, Oleg Nesterov,
	linux-kernel, H. Peter Anvin, Alexei Starovoitov, Kees Cook,
	Will Drewry

On Tue, Aug 5, 2014 at 4:53 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
>> Then old_rsp can be nuked everywhere else,
>> RESTORE_TOP_OF_STACK can be nuked, and
>> FIXUP_TOP_OF_STACK can be reduced to merely:
>>
>>         movq $__USER_DS,SS(%rsp)
>>         movq $__USER_CS,CS(%rsp)
>
> Mmm, right.  That's probably better than doing this on the fast path.

I measured it. These particular insns have a nasty huge encoding in x86 -
12 bytes each, no less!

48 c7 84 24 88 00 00 00 33 00 00 00   movq   $0x33,0x88(%rsp)
48 c7 84 24 a0 00 00 00 2b 00 00 00   movq   $0x2b,0xa0(%rsp)

With them in the hot path, I measure 55.6 ns per syscall;
with them moved out of the hot path it's 54.8 ns.
This is a 2.70GHz CPU, so the 0.8 ns delta is about 2 CPU cycles
(0.8 ns * 2.70 GHz ~= 2.2).

-- 
vda

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-04  3:03     ` Denys Vlasenko
  2014-08-04  7:57       ` Borislav Petkov
@ 2014-08-11  0:46       ` Frederic Weisbecker
  2014-08-11  8:40         ` Jan Beulich
  1 sibling, 1 reply; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-11  0:46 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
	H. Peter Anvin, Andy Lutomirski, X86 ML, Alexei Starovoitov,
	Will Drewry, Kees Cook

On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
> On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> >>       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
> >>                       0x77 /* DW_OP_breg7 */, 0, \
> >>                       0x06 /* DW_OP_deref */, \
> >> -                     0x08 /* DW_OP_const1u */, SS+8-RBP, \
> >> +                     0x08 /* DW_OP_const1u */, SS+8, \
> >>                       0x22 /* DW_OP_plus */
> >>       /* We entered an interrupt context - irqs are off: */
> >>       TRACE_IRQS_OFF
> >> -
> >>       call \func
> >>       .endm
> >>
> >> @@ -749,10 +719,9 @@ ret_from_intr:
> >>
> >>       /* Restore saved previous stack */
> >>       popq %rsi
> >
> > And then you pop to rsi. OK, that indeed works, but perhaps we should keep it symmetrical
> > just for clarity? Any reason why we can't reuse rdi here?
> 
> I changed this entire area in v2: basically, I will not change the logic,
> but will add comments explaining what we are doing here, and why.
> (Some minor code changes will be done, not affecting the logic).
> 
> While we are at it, what does this CFI_ESCAPE thing do here?
> As usual, it has no comment :/

I don't know, only Jan Beulich understands this CFI black magic.
BTW he doesn't appear to be Cc'ed; we should add him.

> 
> -- 
> vda

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11  0:46       ` Frederic Weisbecker
@ 2014-08-11  8:40         ` Jan Beulich
  2014-08-11  9:07           ` Andy Lutomirski
  2014-08-11 13:26           ` Denys Vlasenko
  0 siblings, 2 replies; 41+ messages in thread
From: Jan Beulich @ 2014-08-11  8:40 UTC (permalink / raw)
  To: Frederic Weisbecker, Denys Vlasenko
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, X86 ML,
	Alexei Starovoitov, Denys Vlasenko, Oleg Nesterov,
	Linux Kernel Mailing List, H. Peter Anvin

>>> On 11.08.14 at 02:46, <fweisbec@gmail.com> wrote:
> On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
>> On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com> 
> wrote:
>> >>       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
>> >>                       0x77 /* DW_OP_breg7 */, 0, \
>> >>                       0x06 /* DW_OP_deref */, \
>> >> -                     0x08 /* DW_OP_const1u */, SS+8-RBP, \
>> >> +                     0x08 /* DW_OP_const1u */, SS+8, \
>> >>                       0x22 /* DW_OP_plus */
>> >>       /* We entered an interrupt context - irqs are off: */
>> >>       TRACE_IRQS_OFF
>> >> -
>> >>       call \func
>> >>       .endm
>> >>
>> >> @@ -749,10 +719,9 @@ ret_from_intr:
>> >>
>> >>       /* Restore saved previous stack */
>> >>       popq %rsi
>> >
>> > And then you pop to rsi. OK, that indeed works, but perhaps we should keep it symmetrical
>> > just for clarity? Any reason why we can't reuse rdi here?
>> 
>> I changed this entire area in v2: basically, I will not change the logic,
>> but will add comments explaining what we are doing here, and why.
>> (Some minor code changes will be done, not affecting the logic).
>> 
>> While we are at it, what does this CFI_ESCAPE thing do here?
>> As usual, it has no comment :/

Each of its lines has a comment; given that other CFI annotations
don't each carry comments, I don't see what else is needed here.

> I don't know, only Jan Beulich understands this CFI black magic.

That would be very sad if true.

In any case: This needs to be a CFI_ESCAPE because there's no
other way I know of to emit the DW_CFA_def_cfa_expression.
And the change to it looks correct to me.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11  8:40         ` Jan Beulich
@ 2014-08-11  9:07           ` Andy Lutomirski
  2014-08-11  9:31             ` Jan Beulich
  2014-08-11 13:26           ` Denys Vlasenko
  1 sibling, 1 reply; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-11  9:07 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Frederic Weisbecker, Denys Vlasenko, Kees Cook, Will Drewry,
	X86 ML, Alexei Starovoitov, Denys Vlasenko, Oleg Nesterov,
	Linux Kernel Mailing List, H. Peter Anvin

On Mon, Aug 11, 2014 at 5:40 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 11.08.14 at 02:46, <fweisbec@gmail.com> wrote:
>> On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
>>> On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com>
>> wrote:
>>> >>       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
>>> >>                       0x77 /* DW_OP_breg7 */, 0, \
>>> >>                       0x06 /* DW_OP_deref */, \
>>> >> -                     0x08 /* DW_OP_const1u */, SS+8-RBP, \
>>> >> +                     0x08 /* DW_OP_const1u */, SS+8, \
>>> >>                       0x22 /* DW_OP_plus */
>>> >>       /* We entered an interrupt context - irqs are off: */
>>> >>       TRACE_IRQS_OFF
>>> >> -
>>> >>       call \func
>>> >>       .endm
>>> >>
>>> >> @@ -749,10 +719,9 @@ ret_from_intr:
>>> >>
>>> >>       /* Restore saved previous stack */
>>> >>       popq %rsi
>>> >
>>> > And then you pop to rsi. OK, that indeed works, but perhaps we should keep it symmetrical
>>> > just for clarity? Any reason why we can't reuse rdi here?
>>>
>>> I changed this entire area in v2: basically, I will not change the logic,
>>> but will add comments explaining what we are doing here, and why.
>>> (Some minor code changes will be done, not affecting the logic).
>>>
>>> While we are at it, what does this CFI_ESCAPE thing do here?
>>> As usual, it has no comment :/
>
> Each of its lines has a comment; given that other CFI annotations
> don't each carry comments, I don't see what else is needed here.
>
>> I don't know, only Jan Beulich understands this CFI black magic.
>
> That would be very sad if true.
>
> In any case: This needs to be a CFI_ESCAPE because there's no
> other way I know of to emit the DW_CFA_def_cfa_expression.
> And the change to it looks correct to me.
>

How does one test the entry CFI annotations?  The best that I know of
is to single-step through using gdb attached to qemu and see whether
backtraces seem to work.
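
Roughly this, using the stock qemu gdbstub options (a sketch of the
setup; nothing here is kernel-specific):

	qemu-system-x86_64 -s -S -kernel arch/x86/boot/bzImage [...]
	gdb vmlinux
	(gdb) target remote :1234
	(gdb) stepi		# repeat through the entry code
	(gdb) bt		# check whether the unwind still holds up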

--Andy

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11  9:07           ` Andy Lutomirski
@ 2014-08-11  9:31             ` Jan Beulich
  0 siblings, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2014-08-11  9:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Kees Cook, Will Drewry, Frederic Weisbecker, Denys Vlasenko,
	X86 ML, Alexei Starovoitov, Denys Vlasenko, Oleg Nesterov,
	Linux Kernel Mailing List, H. Peter Anvin

>>> On 11.08.14 at 11:07, <luto@amacapital.net> wrote:
> How does one test the entry CFI annotations?  The best that I know of
> is to single-step through using gdb attached to qemu and see whether
> backtraces seem to work.

Or have the kernel generate a backtrace from a suitable location and
check that the backtrace is correct (of course provided you have Dwarf
stack unwinding support in your kernel).

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11  8:40         ` Jan Beulich
  2014-08-11  9:07           ` Andy Lutomirski
@ 2014-08-11 13:26           ` Denys Vlasenko
  2014-08-11 14:17             ` Jan Beulich
  1 sibling, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-11 13:26 UTC (permalink / raw)
  To: Jan Beulich, Frederic Weisbecker, Denys Vlasenko
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, X86 ML,
	Alexei Starovoitov, Oleg Nesterov, Linux Kernel Mailing List,
	H. Peter Anvin

On 08/11/2014 10:40 AM, Jan Beulich wrote:
>>>>>       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
>>>>>                       0x77 /* DW_OP_breg7 */, 0, \
>>>>>                       0x06 /* DW_OP_deref */, \
>>>>> -                     0x08 /* DW_OP_const1u */, SS+8-RBP, \
>>>>> +                     0x08 /* DW_OP_const1u */, SS+8, \
>>>>>                       0x22 /* DW_OP_plus */
>>>>>       /* We entered an interrupt context - irqs are off: */
>>>>>       TRACE_IRQS_OFF
...
...
>>> While we are at it, what does this CFI_ESCAPE thing do here?
>>> As usual, it has no comment :/
> 
> Each of its lines has a comment; given that other CFI annotations
> don't each carry comments, I don't see what else is needed here.

The existing comments explain what every byte means.
They are useful if CFI-literate reader wants to check correctness
of the encoding of this annotation.

There is no overall comment what this CFI annotation
*achieves*. In human language, what do we say
to DWARF decoder here?


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11 13:26           ` Denys Vlasenko
@ 2014-08-11 14:17             ` Jan Beulich
  2014-08-11 14:53               ` H. Peter Anvin
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2014-08-11 14:17 UTC (permalink / raw)
  To: Denys Vlasenko, Denys Vlasenko
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
	X86 ML, Alexei Starovoitov, Oleg Nesterov,
	Linux Kernel Mailing List, H. Peter Anvin

>>> On 11.08.14 at 15:26, <dvlasenk@redhat.com> wrote:
> On 08/11/2014 10:40 AM, Jan Beulich wrote:
>>>>>>       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
>>>>>>                       0x77 /* DW_OP_breg7 */, 0, \
>>>>>>                       0x06 /* DW_OP_deref */, \
>>>>>> -                     0x08 /* DW_OP_const1u */, SS+8-RBP, \
>>>>>> +                     0x08 /* DW_OP_const1u */, SS+8, \
>>>>>>                       0x22 /* DW_OP_plus */
>>>>>>       /* We entered an interrupt context - irqs are off: */
>>>>>>       TRACE_IRQS_OFF
> ...
> ...
>>>> While we are at it, what does this CFI_ESCAPE thing do here?
>>>> As usual, it has no comment :/
>> 
>> Each of its lines has a comment; given that other CFI annotations
>> don't each carry comments, I don't see what else is needed here.
> 
> The existing comments explain what every byte means.
> They are useful if CFI-literate reader wants to check correctness
> of the encoding of this annotation.
> 
> There is no overall comment what this CFI annotation
> *achieves*. In human language, what do we say
> to DWARF decoder here?

Short answer: DW_CFA_def_cfa_expression.

Longer response: Just like I said before, what you're asking for is
identical to asking for every other CFI annotation to get an
associated comment telling you what it's doing, which I don't think you
really mean to ask for. (Our main problem here is that we can't
specify expressions with the .cfi_* gas directives, and hence have
to resort to .cfi_escape.)

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11 14:17             ` Jan Beulich
@ 2014-08-11 14:53               ` H. Peter Anvin
  2014-08-11 15:08                 ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-11 14:53 UTC (permalink / raw)
  To: Jan Beulich, Denys Vlasenko, Denys Vlasenko
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
	X86 ML, Alexei Starovoitov, Oleg Nesterov,
	Linux Kernel Mailing List

On 08/11/2014 07:17 AM, Jan Beulich wrote:
>>
>> The existing comments explain what every byte means.
>> They are useful if CFI-literate reader wants to check correctness
>> of the encoding of this annotation.
>>
>> There is no overall comment what this CFI annotation
>> *achieves*. In human language, what do we say
>> to DWARF decoder here?
> 
> Short answer: DW_CFA_def_cfa_expression.
> 
> Longer response: Just like I said before, what you're asking for is
> identical to asking for every other CFI annotation to get an
> associated comment telling you what it's doing, which I don't think you
> really mean to ask for. (Our main problem here is that we can't
> specify expressions with the .cfi_* gas directives, and hence have
> to resort to .cfi_escape.)
> 

No, in *human language*.  What does the DW_CFA_def_cfa_expression
actually aim to accomplish?  If you don't know the innards of the DWARF
spec, the whole thing might as well be Hungarian.

	-hpa


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11 14:53               ` H. Peter Anvin
@ 2014-08-11 15:08                 ` Jan Beulich
  2014-08-11 15:13                   ` H. Peter Anvin
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2014-08-11 15:08 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
	Denys Vlasenko, X86 ML, Alexei Starovoitov, Denys Vlasenko,
	Oleg Nesterov, Linux Kernel Mailing List

>>> On 11.08.14 at 16:53, <hpa@zytor.com> wrote:
> On 08/11/2014 07:17 AM, Jan Beulich wrote:
>>>
>>> The existing comments explain what every byte means.
>>> They are useful if CFI-literate reader wants to check correctness
>>> of the encoding of this annotation.
>>>
>>> There is no overall comment what this CFI annotation
>>> *achieves*. In human language, what do we say
>>> to DWARF decoder here?
>> 
>> Short answer: DW_CFA_def_cfa_expression.
>> 
>> Longer response: Just like I said before, what you're asking for is
>> identical to asking for every other CFI annotation to get an
>> associated comment telling you what it's doing, which I don't think you
>> really mean to ask for. (Our main problem here is that we can't
>> specify expressions with the .cfi_* gas directives, and hence have
>> to resort to .cfi_escape.)
>> 
> 
> No, in *human language*.  What does the DW_CFA_def_cfa_expression
> actually aim to accomplish?  If you don't know the innards of the DWARF
> spec, the whole thing might as well be Hungarian.

Just like the other DW_CFA_def_cfa_* ones it sets the current
frame address (CFA), just not via one of the pre-canned shortcuts,
but via an expression (in the case here de-referencing the stack
pointer to read the top of stack, and then adding the necessary
offset). So it indeed is similar enough to other .cfi_* annotations we
use without further comments.
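
Spelled out, the escaped bytes above encode the expression

	CFA = *(%rsp + 0) + (SS + 8)

i.e. the DW_OP_breg7/deref/const1u/plus sequence in infix form:
dereference the stack pointer to fetch the saved previous-stack value,
then add the offset that points just past the register frame.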

And btw., Hungarian isn't _that_ bad.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11 15:08                 ` Jan Beulich
@ 2014-08-11 15:13                   ` H. Peter Anvin
  2014-08-12  9:31                     ` Denys Vlasenko
  0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-11 15:13 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
	Denys Vlasenko, X86 ML, Alexei Starovoitov, Denys Vlasenko,
	Oleg Nesterov, Linux Kernel Mailing List

On 08/11/2014 08:08 AM, Jan Beulich wrote:
>>
>> No, in *human language*.  What does the DW_CFA_def_cfa_expression
>> actually aim to accomplish?  If you don't know the innards of the DWARF
>> spec, the whole thing might as well be Hungarian.
> 
> Just like the other DW_CFA_def_cfa_* ones it sets the current
> frame address (CFA), just not via one of the pre-canned shortcuts,
> but via an expression (in the case here de-referencing the stack
> pointer to read the top of stack, and then adding the necessary
> offset). So it indeed is similar enough to other .cfi_* annotations we
> use without further comments.
> 

Actually, what you had inside the parentheses there is a
half-decent comment.  I'm going to pretend the rest of this wasn't posted.

	-hpa



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-11 15:13                   ` H. Peter Anvin
@ 2014-08-12  9:31                     ` Denys Vlasenko
  2014-08-12  9:50                       ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-12  9:31 UTC (permalink / raw)
  To: H. Peter Anvin, Jan Beulich
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
	Denys Vlasenko, X86 ML, Alexei Starovoitov, Oleg Nesterov,
	Linux Kernel Mailing List

On 08/11/2014 05:13 PM, H. Peter Anvin wrote:
> On 08/11/2014 08:08 AM, Jan Beulich wrote:
>>> No, in *human language*.  What does the DW_CFA_def_cfa_expression
>>> actually aim to accomplish?  If you don't know the innards of the DWARF
>>> spec, the whole thing might as well be Hungarian.
>>
>> Just like the other DW_CFA_def_cfa_* ones it sets the current
>> frame address (CFA), just not via one of the pre-canned shortcuts,
>> but via an expression (in the case here de-referencing the stack
>> pointer to read the top of stack, and then adding the necessary
>> offset). So it indeed is similar enough to other .cfi_* annotations we
>> use without further comments.
>>
> 
> Actually, what you had inside the parentheses there is a
> half-decent comment.  I'm going to pretend the rest of this wasn't posted.

Jan, Peter, does this look correct _and_ human-understandable?

--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -652,10 +652,14 @@ END(interrupt)
        cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
        CFI_DEF_CFA_REGISTER    rsi
        pushq %rsi
+       /*
+        * For debugger:
+        * "CFA (Current Frame Address) is the value on stack + offset"
+        */
        CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
-                       0x77 /* DW_OP_breg7 */, 0, \
+                       0x77 /* DW_OP_breg7 (rsp) */, 0, \
                        0x06 /* DW_OP_deref */, \
-                       0x08 /* DW_OP_const1u */, SS+8-RBP, \
+                       0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
                        0x22 /* DW_OP_plus */
        /* We entered an interrupt context - irqs are off: */
        TRACE_IRQS_OFF


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
  2014-08-12  9:31                     ` Denys Vlasenko
@ 2014-08-12  9:50                       ` Jan Beulich
  0 siblings, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2014-08-12  9:50 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
	Denys Vlasenko, X86 ML, Alexei Starovoitov, Oleg Nesterov,
	Linux Kernel Mailing List, H. Peter Anvin

>>> On 12.08.14 at 11:31, <dvlasenk@redhat.com> wrote:
> Jan, Peter, does this look correct _and_ human-understandable?
> 
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -652,10 +652,14 @@ END(interrupt)
>         cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
>         CFI_DEF_CFA_REGISTER    rsi
>         pushq %rsi
> +       /*
> +        * For debugger:
> +        * "CFA (Current Frame Address) is the value on stack + offset"
> +        */
>         CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
> -                       0x77 /* DW_OP_breg7 */, 0, \
> +                       0x77 /* DW_OP_breg7 (rsp) */, 0, \

I'm fine with the changes above.

>                         0x06 /* DW_OP_deref */, \
> -                       0x08 /* DW_OP_const1u */, SS+8-RBP, \
> +                       0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \

But I'd rather not see this one alone changed - either change all
similar uses of SS, or leave the one here alone.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread

Thread overview: 41+ messages
2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
2014-08-01 14:48 ` [PATCH 2/5] x86: open-code register save/restore in trace_hardirqs thunks Denys Vlasenko
2014-08-01 14:48 ` [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user Denys Vlasenko
2014-08-01 18:30   ` Frederic Weisbecker
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
2014-08-01 17:04   ` Andy Lutomirski
2014-08-04 14:28     ` Denys Vlasenko
2014-08-04 14:47       ` Oleg Nesterov
2014-08-04 15:34         ` Oleg Nesterov
2014-08-04 21:03       ` Andy Lutomirski
2014-08-04 21:23         ` Borislav Petkov
2014-08-05 10:35         ` Denys Vlasenko
2014-08-05 14:53           ` Andy Lutomirski
2014-08-05 15:17             ` Denys Vlasenko
2014-08-05 23:02               ` Andy Lutomirski
2014-08-07  9:54             ` Denys Vlasenko
2014-08-01 18:09   ` Alexei Starovoitov
2014-08-01 18:30   ` Oleg Nesterov
2014-08-01 18:35   ` H. Peter Anvin
2014-08-01 22:11     ` Denys Vlasenko
2014-08-01 22:13       ` H. Peter Anvin
2014-08-02 21:14         ` Andy Lutomirski
2014-08-02 21:23           ` H. Peter Anvin
2014-08-02 21:38             ` Andy Lutomirski
2014-08-01 22:52   ` Frederic Weisbecker
2014-08-01 23:19   ` Frederic Weisbecker
2014-08-04  3:03     ` Denys Vlasenko
2014-08-04  7:57       ` Borislav Petkov
2014-08-11  0:46       ` Frederic Weisbecker
2014-08-11  8:40         ` Jan Beulich
2014-08-11  9:07           ` Andy Lutomirski
2014-08-11  9:31             ` Jan Beulich
2014-08-11 13:26           ` Denys Vlasenko
2014-08-11 14:17             ` Jan Beulich
2014-08-11 14:53               ` H. Peter Anvin
2014-08-11 15:08                 ` Jan Beulich
2014-08-11 15:13                   ` H. Peter Anvin
2014-08-12  9:31                     ` Denys Vlasenko
2014-08-12  9:50                       ` Jan Beulich
2014-08-01 14:48 ` [PATCH 5/5] x86: mass removal of ARGOFFSET Denys Vlasenko
2014-08-01 18:00 ` [PATCH 1/5] x86: entry_64.S: delete unused code Frederic Weisbecker
