* [PATCH 1/5] x86: entry_64.S: delete unused code
@ 2014-08-01 14:48 Denys Vlasenko
2014-08-01 14:48 ` [PATCH 2/5] x86: open-code register save/restore in trace_hardirqs thunks Denys Vlasenko
` (4 more replies)
0 siblings, 5 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-01 14:48 UTC (permalink / raw)
To: linux-kernel
Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
A define, two macros and an unreferenced bit of assembly are gone.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
arch/x86/include/asm/calling.h | 1 -
arch/x86/kernel/entry_64.S | 34 ----------------------------------
2 files changed, 35 deletions(-)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73b..e176cea 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SS 160
#define ARGOFFSET R11
-#define SWFRAME ORIG_RAX
.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
subq $9*8+\addskip, %rsp
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b25ca96..dbfd037 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,27 +155,6 @@ ENDPROC(native_usergs_sysret64)
movq \tmp,R11+\offset(%rsp)
.endm
- .macro FAKE_STACK_FRAME child_rip
- /* push in order ss, rsp, eflags, cs, rip */
- xorl %eax, %eax
- pushq_cfi $__KERNEL_DS /* ss */
- /*CFI_REL_OFFSET ss,0*/
- pushq_cfi %rax /* rsp */
- CFI_REL_OFFSET rsp,0
- pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
- /*CFI_REL_OFFSET rflags,0*/
- pushq_cfi $__KERNEL_CS /* cs */
- /*CFI_REL_OFFSET cs,0*/
- pushq_cfi \child_rip /* rip */
- CFI_REL_OFFSET rip,0
- pushq_cfi %rax /* orig rax */
- .endm
-
- .macro UNFAKE_STACK_FRAME
- addq $8*6, %rsp
- CFI_ADJUST_CFA_OFFSET -(6*8)
- .endm
-
/*
* initial frame state for interrupts (and exceptions without error code)
*/
@@ -640,19 +619,6 @@ END(\label)
FORK_LIKE vfork
FIXED_FRAME stub_iopl, sys_iopl
-ENTRY(ptregscall_common)
- DEFAULT_FRAME 1 8 /* offset 8: return address */
- RESTORE_TOP_OF_STACK %r11, 8
- movq_cfi_restore R15+8, r15
- movq_cfi_restore R14+8, r14
- movq_cfi_restore R13+8, r13
- movq_cfi_restore R12+8, r12
- movq_cfi_restore RBP+8, rbp
- movq_cfi_restore RBX+8, rbx
- ret $REST_SKIP /* pop extended registers */
- CFI_ENDPROC
-END(ptregscall_common)
-
ENTRY(stub_execve)
CFI_STARTPROC
addq $8, %rsp
--
1.8.1.4
^ permalink raw reply related [flat|nested] 41+ messages in thread
* [PATCH 2/5] x86: open-code register save/restore in trace_hardirqs thunks
2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
@ 2014-08-01 14:48 ` Denys Vlasenko
2014-08-01 14:48 ` [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user Denys Vlasenko
` (3 subsequent siblings)
4 siblings, 0 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-01 14:48 UTC (permalink / raw)
To: linux-kernel
Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
This is a preparatory patch for change in "struct pt_regs"
handling in entry_64.S.
trace_hardirqs thunks were (ab)using a part of pt_regs
handling code, namely SAVE_ARGS/RESTORE_ARGS macros,
to save/restore registers across C function calls.
Since SAVE_ARGS is going to be changed, open-code
register saving/restoring here.
Incidentally, this removes a bit of dead code:
one SAVE_ARGS was just to emit a CFI annotation,
but it also generated unreachable assembly insns.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
arch/x86/lib/thunk_64.S | 29 ++++++++++++++++++++++++-----
1 file changed, 24 insertions(+), 5 deletions(-)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 92d9fea..c1c9131 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -16,10 +16,20 @@
\name:
CFI_STARTPROC
- /* this one pushes 9 elems, the next one would be %rIP */
- SAVE_ARGS
+ subq $9*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 9*8
+ movq_cfi rdi, 8*8
+ movq_cfi rsi, 7*8
+ movq_cfi rdx, 6*8
+ movq_cfi rcx, 5*8
+ movq_cfi rax, 4*8
+ movq_cfi r8, 3*8
+ movq_cfi r9, 2*8
+ movq_cfi r10, 1*8
+ movq_cfi r11, 0*8
.if \put_ret_addr_in_rdi
+ /* 9*8(%rsp) is return addr on stack */
movq_cfi_restore 9*8, rdi
.endif
@@ -38,11 +48,20 @@
THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
- /* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
- SAVE_ARGS
+ CFI_ADJUST_CFA_OFFSET 9*8
restore:
- RESTORE_ARGS
+ movq_cfi_restore 0*8, r11
+ movq_cfi_restore 1*8, r10
+ movq_cfi_restore 2*8, r9
+ movq_cfi_restore 3*8, r8
+ movq_cfi_restore 4*8, rax
+ movq_cfi_restore 5*8, rcx
+ movq_cfi_restore 6*8, rdx
+ movq_cfi_restore 7*8, rsi
+ movq_cfi_restore 8*8, rdi
+ addq 9*8, %rsp
+ CFI_ADJUST_CFA_OFFSET -9*8
ret
CFI_ENDPROC
_ASM_NOKPROBE(restore)
--
1.8.1.4
^ permalink raw reply related [flat|nested] 41+ messages in thread
* [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user
2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
2014-08-01 14:48 ` [PATCH 2/5] x86: open-code register save/restore in trace_hardirqs thunks Denys Vlasenko
@ 2014-08-01 14:48 ` Denys Vlasenko
2014-08-01 18:30 ` Frederic Weisbecker
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
` (2 subsequent siblings)
4 siblings, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-01 14:48 UTC (permalink / raw)
To: linux-kernel
Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
No code changes.
This is a preparatory patch for change in "struct pt_regs" handling.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
arch/x86/kernel/entry_64.S | 88 ++++++++++++++++++++++------------------------
1 file changed, 42 insertions(+), 46 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index dbfd037..37f7d95 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -218,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset
.endm
-/* save partial stack frame */
- .macro SAVE_ARGS_IRQ
- cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
-
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
-
- /* Save previous stack value */
- movq %rsp, %rsi
-
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
- je 1f
- SWAPGS
- /*
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
-1: incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- CFI_DEF_CFA_REGISTER rsi
-
- /* Store previous stack value */
- pushq %rsi
- CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
- 0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
- 0x22 /* DW_OP_plus */
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
- .endm
-
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -731,7 +686,48 @@ END(interrupt)
/* reserve pt_regs for scratch regs and rbp */
subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
- SAVE_ARGS_IRQ
+ cld
+ /* start from rbp in pt_regs and jump over */
+ movq_cfi rdi, (RDI-RBP)
+ movq_cfi rsi, (RSI-RBP)
+ movq_cfi rdx, (RDX-RBP)
+ movq_cfi rcx, (RCX-RBP)
+ movq_cfi rax, (RAX-RBP)
+ movq_cfi r8, (R8-RBP)
+ movq_cfi r9, (R9-RBP)
+ movq_cfi r10, (R10-RBP)
+ movq_cfi r11, (R11-RBP)
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
+ testl $3, CS-RBP(%rsi)
+ je 1f
+ SWAPGS
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl PER_CPU_VAR(irq_count)
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ CFI_DEF_CFA_REGISTER rsi
+
+ /* Store previous stack value */
+ pushq %rsi
+ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
+ 0x77 /* DW_OP_breg7 */, 0, \
+ 0x06 /* DW_OP_deref */, \
+ 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x22 /* DW_OP_plus */
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+
call \func
.endm
--
1.8.1.4
^ permalink raw reply related [flat|nested] 41+ messages in thread
* [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
2014-08-01 14:48 ` [PATCH 2/5] x86: open-code register save/restore in trace_hardirqs thunks Denys Vlasenko
2014-08-01 14:48 ` [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user Denys Vlasenko
@ 2014-08-01 14:48 ` Denys Vlasenko
2014-08-01 17:04 ` Andy Lutomirski
` (5 more replies)
2014-08-01 14:48 ` [PATCH 5/5] x86: mass removal of ARGOFFSET Denys Vlasenko
2014-08-01 18:00 ` [PATCH 1/5] x86: entry_64.S: delete unused code Frederic Weisbecker
4 siblings, 6 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-01 14:48 UTC (permalink / raw)
To: linux-kernel
Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
64-bit code was using six stack slots fewer by not saving/restoring
registers which a callee-preserved according to C ABI,
and not allocating space for them.
Only when syscall needed a complete "struct pt_regs",
the complete area was allocated and filled in.
This proved to be a source of significant obfuscation and subtle bugs.
For example, stub_fork had to pop the return address,
extend the struct, save registers, and push return address back. Ugly.
ia32_ptregs_common pops return address and "returns" via jmp insn,
throwing a wrench into CPU return stack cache.
This patch changes code to always allocate a complete "struct pt_regs".
The saving of registers is still done lazily.
Macros which manipulate "struct pt_regs" on stack are reworked:
ALLOC_PTREGS_ON_STACK allocates the structure.
SAVE_C_REGS saves to it those registers which are clobbered by C code.
SAVE_EXTRA_REGS saves to it all other registers.
Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.
ia32_ptregs_common, stub_fork and friends lost their ugly dance with
return pointer.
LOAD_ARGS32 in ia32entry.S now uses a symbolic stack offsets
instead of magic numbers.
Misleading and slightly wrong comments in "struct pt_regs" are fixed
(four instances).
Patch was run-tested: 64-bit executables, 32-bit executables,
strace works.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
arch/x86/ia32/ia32entry.S | 47 +++----
arch/x86/include/asm/calling.h | 224 ++++++++++++++++-----------------
arch/x86/include/asm/irqflags.h | 4 +-
arch/x86/include/asm/ptrace.h | 13 +-
arch/x86/include/uapi/asm/ptrace-abi.h | 16 ++-
arch/x86/include/uapi/asm/ptrace.h | 13 +-
arch/x86/kernel/entry_64.S | 132 ++++++++-----------
arch/x86/kernel/preempt.S | 16 ++-
8 files changed, 232 insertions(+), 233 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 4299eb0..ef9ee16 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -62,12 +62,12 @@
*/
.macro LOAD_ARGS32 offset, _r9=0
.if \_r9
- movl \offset+16(%rsp),%r9d
+ movl \offset+R9(%rsp),%r9d
.endif
- movl \offset+40(%rsp),%ecx
- movl \offset+48(%rsp),%edx
- movl \offset+56(%rsp),%esi
- movl \offset+64(%rsp),%edi
+ movl \offset+RCX(%rsp),%ecx
+ movl \offset+RDX(%rsp),%edx
+ movl \offset+RSI(%rsp),%esi
+ movl \offset+RDI(%rsp),%edi
movl %eax,%eax /* zero extension */
.endm
@@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
CFI_REL_OFFSET rip,0
pushq_cfi %rax
cld
- SAVE_ARGS 0,1,0
+ ALLOC_PTREGS_ON_STACK
+ SAVE_C_REGS_EXCEPT_R891011
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
ASM_STAC
@@ -172,7 +173,8 @@ sysexit_from_sys_call:
andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
- RESTORE_ARGS 0,24,0,0,0,0
+ RESTORE_RSI_RDI
+ REMOVE_PTREGS_FROM_STACK 8*3
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
@@ -240,13 +242,13 @@ sysenter_tracesys:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz sysenter_auditsys
#endif
- SAVE_REST
+ SAVE_EXTRA_REGS
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
jmp sysenter_do_call
@@ -288,7 +290,8 @@ ENTRY(ia32_cstar_target)
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,0,0
+ ALLOC_PTREGS_ON_STACK 8
+ SAVE_C_REGS_EXCEPT_RCX_R891011
movl %eax,%eax /* zero extension */
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -325,7 +328,7 @@ cstar_dispatch:
jnz sysretl_audit
sysretl_from_sys_call:
andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- RESTORE_ARGS 0,-ARG_SKIP,0,0,0
+ RESTORE_RSI_RDI_RDX
movl RIP-ARGOFFSET(%rsp),%ecx
CFI_REGISTER rip,rcx
movl EFLAGS-ARGOFFSET(%rsp),%r11d
@@ -356,13 +359,13 @@ cstar_tracesys:
jz cstar_auditsys
#endif
xchgl %r9d,%ebp
- SAVE_REST
+ SAVE_EXTRA_REGS
CLEAR_RREGS 0, r9
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ RESTORE_EXTRA_REGS
xchgl %ebp,%r9d
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -417,7 +420,8 @@ ENTRY(ia32_syscall)
cld
/* note the registers are not zero extended to the sf.
this could be a problem. */
- SAVE_ARGS 0,1,0
+ ALLOC_PTREGS_ON_STACK
+ SAVE_C_REGS_EXCEPT_R891011
orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz ia32_tracesys
@@ -430,16 +434,16 @@ ia32_sysret:
movq %rax,RAX-ARGOFFSET(%rsp)
ia32_ret_from_sys_call:
CLEAR_RREGS -ARGOFFSET
- jmp int_ret_from_sys_call
+ jmp int_ret_from_sys_call
-ia32_tracesys:
- SAVE_REST
+ia32_tracesys:
+ SAVE_EXTRA_REGS
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
jmp ia32_do_call
@@ -475,7 +479,6 @@ GLOBAL(stub32_clone)
ALIGN
ia32_ptregs_common:
- popq %r11
CFI_ENDPROC
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
@@ -490,9 +493,9 @@ ia32_ptregs_common:
/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
CFI_REL_OFFSET rsp,RSP-ARGOFFSET
/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
- SAVE_REST
+ SAVE_EXTRA_REGS 8
call *%rax
- RESTORE_REST
- jmp ia32_sysret /* misbalances the return cache */
+ RESTORE_EXTRA_REGS 8
+ ret
CFI_ENDPROC
END(ia32_ptregs_common)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index e176cea..10aff1e 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -52,142 +52,132 @@ For 32-bit we have the following conventions - kernel is built with
/*
* 64-bit system call stack frame layout defines and helpers,
- * for assembly code:
+ * for assembly code.
*/
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
-
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11 48
-#define R10 56
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120 /* + error_code */
-/* end of arguments */
-
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
-
-#define ARGOFFSET R11
-
- .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
- subq $9*8+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET 9*8+\addskip
- movq_cfi rdi, 8*8
- movq_cfi rsi, 7*8
- movq_cfi rdx, 6*8
-
- .if \save_rcx
- movq_cfi rcx, 5*8
- .endif
-
- movq_cfi rax, 4*8
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15 0*8
+#define R14 1*8
+#define R13 2*8
+#define R12 3*8
+#define RBP 4*8
+#define RBX 5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11 6*8
+#define R10 7*8
+#define R9 8*8
+#define R8 9*8
+#define RAX 10*8
+#define RCX 11*8
+#define RDX 12*8
+#define RSI 13*8
+#define RDI 14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 15*8
+/* Return frame for iretq */
+#define RIP 16*8
+#define CS 17*8
+#define EFLAGS 18*8
+#define RSP 19*8
+#define SS 20*8
+
+#define ARGOFFSET 0
+
+ .macro ALLOC_PTREGS_ON_STACK addskip=0
+ subq $15*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+ .endm
- .if \save_r891011
- movq_cfi r8, 3*8
- movq_cfi r9, 2*8
- movq_cfi r10, 1*8
- movq_cfi r11, 0*8
+ .macro SAVE_C_REGS_HELPER rcx=1 r8plus=1
+ movq_cfi rdi, 14*8
+ movq_cfi rsi, 13*8
+ movq_cfi rdx, 12*8
+ .if \rcx
+ movq_cfi rcx, 11*8
.endif
-
+ movq_cfi rax, 10*8
+ .if \r8plus
+ movq_cfi r8, 9*8
+ movq_cfi r9, 8*8
+ movq_cfi r10, 7*8
+ movq_cfi r11, 6*8
+ .endif
+ .endm
+ .macro SAVE_C_REGS
+ SAVE_C_REGS_HELPER 1, 1
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_R891011
+ SAVE_C_REGS_HELPER 1, 0
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_RCX_R891011
+ SAVE_C_REGS_HELPER 0, 0
.endm
-#define ARG_SKIP (9*8)
+ .macro SAVE_EXTRA_REGS offset=0
+ movq_cfi rbx, 5*8+\offset
+ movq_cfi rbp, 4*8+\offset
+ movq_cfi r12, 3*8+\offset
+ movq_cfi r13, 2*8+\offset
+ movq_cfi r14, 1*8+\offset
+ movq_cfi r15, 0*8+\offset
+ .endm
- .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
- rstor_r8910=1, rstor_rdx=1
- .if \rstor_r11
- movq_cfi_restore 0*8, r11
- .endif
+ .macro RESTORE_EXTRA_REGS offset=0
+ movq_cfi_restore 0*8+\offset, r15
+ movq_cfi_restore 1*8+\offset, r14
+ movq_cfi_restore 2*8+\offset, r13
+ movq_cfi_restore 3*8+\offset, r12
+ movq_cfi_restore 4*8+\offset, rbp
+ movq_cfi_restore 5*8+\offset, rbx
+ .endm
- .if \rstor_r8910
- movq_cfi_restore 1*8, r10
- movq_cfi_restore 2*8, r9
- movq_cfi_restore 3*8, r8
+ .macro RESTORE_C_REGS_HELPER rax=1, rcx=1, r11=1, r8910=1, rdx=1
+ .if \r11
+ movq_cfi_restore 6*8, r11
.endif
-
- .if \rstor_rax
- movq_cfi_restore 4*8, rax
+ .if \r8910
+ movq_cfi_restore 7*8, r10
+ movq_cfi_restore 8*8, r9
+ movq_cfi_restore 9*8, r8
.endif
-
- .if \rstor_rcx
- movq_cfi_restore 5*8, rcx
+ .if \rax
+ movq_cfi_restore 10*8, rax
.endif
-
- .if \rstor_rdx
- movq_cfi_restore 6*8, rdx
+ .if \rcx
+ movq_cfi_restore 11*8, rcx
.endif
-
- movq_cfi_restore 7*8, rsi
- movq_cfi_restore 8*8, rdi
-
- .if ARG_SKIP+\addskip > 0
- addq $ARG_SKIP+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
+ .if \rdx
+ movq_cfi_restore 12*8, rdx
.endif
+ movq_cfi_restore 13*8, rsi
+ movq_cfi_restore 14*8, rdi
.endm
-
- .macro LOAD_ARGS offset, skiprax=0
- movq \offset(%rsp), %r11
- movq \offset+8(%rsp), %r10
- movq \offset+16(%rsp), %r9
- movq \offset+24(%rsp), %r8
- movq \offset+40(%rsp), %rcx
- movq \offset+48(%rsp), %rdx
- movq \offset+56(%rsp), %rsi
- movq \offset+64(%rsp), %rdi
- .if \skiprax
- .else
- movq \offset+72(%rsp), %rax
- .endif
+ .macro RESTORE_C_REGS
+ RESTORE_C_REGS_HELPER 1,1,1,1,1
.endm
-
-#define REST_SKIP (6*8)
-
- .macro SAVE_REST
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq_cfi rbx, 5*8
- movq_cfi rbp, 4*8
- movq_cfi r12, 3*8
- movq_cfi r13, 2*8
- movq_cfi r14, 1*8
- movq_cfi r15, 0*8
+ .macro RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_C_REGS_HELPER 0,1,1,1,1
.endm
-
- .macro RESTORE_REST
- movq_cfi_restore 0*8, r15
- movq_cfi_restore 1*8, r14
- movq_cfi_restore 2*8, r13
- movq_cfi_restore 3*8, r12
- movq_cfi_restore 4*8, rbp
- movq_cfi_restore 5*8, rbx
- addq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
+ .macro RESTORE_C_REGS_EXCEPT_RCX
+ RESTORE_C_REGS_HELPER 1,0,1,1,1
.endm
-
- .macro SAVE_ALL
- SAVE_ARGS
- SAVE_REST
+ .macro RESTORE_RSI_RDI
+ RESTORE_C_REGS_HELPER 0,0,0,0,0
+ .endm
+ .macro RESTORE_RSI_RDI_RDX
+ RESTORE_C_REGS_HELPER 0,0,0,0,1
.endm
- .macro RESTORE_ALL addskip=0
- RESTORE_REST
- RESTORE_ARGS 1, \addskip
+ .macro REMOVE_PTREGS_FROM_STACK addskip=0
+ addq $15*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
.endm
.macro icebp
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf8..6f98c16 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
TRACE_IRQS_ON; \
sti; \
- SAVE_REST; \
+ SAVE_EXTRA_REGS; \
LOCKDEP_SYS_EXIT; \
- RESTORE_REST; \
+ RESTORE_EXTRA_REGS; \
cli; \
TRACE_IRQS_OFF;
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c..c822b35 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
#else /* __i386__ */
struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
unsigned long dx;
unsigned long si;
unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
unsigned long ip;
unsigned long cs;
unsigned long flags;
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a..580aee3 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
#else /* __i386__ */
#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
#define R15 0
#define R14 8
#define R13 16
#define R12 24
#define RBP 32
#define RBX 40
-/* arguments: interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
#define R11 48
#define R10 56
#define R9 64
@@ -41,15 +45,17 @@
#define RDX 96
#define RSI 104
#define RDI 112
-#define ORIG_RAX 120 /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
#define RIP 128
#define CS 136
#define EFLAGS 144
#define RSP 152
#define SS 160
-#define ARGOFFSET R11
#endif /* __ASSEMBLY__ */
/* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa..bc16115 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
#ifndef __KERNEL__
struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long rbp;
unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
unsigned long rip;
unsigned long cs;
unsigned long eflags;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 37f7d95..b3c3ebb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -26,12 +26,6 @@
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
* backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
* - ENTRY/END Define functions in the symbol table.
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
@@ -264,7 +258,7 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%rcx)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
jz 1f
@@ -276,12 +270,10 @@ ENTRY(ret_from_fork)
jmp ret_from_sys_call # go to the SYSRET fastpath
1:
- subq $REST_SKIP, %rsp # leave space for volatiles
- CFI_ADJUST_CFA_OFFSET REST_SKIP
movq %rbp, %rdi
call *%rbx
movl $0, RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(ret_from_fork)
@@ -339,7 +331,8 @@ GLOBAL(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,0
+ ALLOC_PTREGS_ON_STACK 8
+ SAVE_C_REGS
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -375,9 +368,9 @@ sysret_check:
* sysretq will re-enable interrupts:
*/
TRACE_IRQS_ON
+ RESTORE_C_REGS_EXCEPT_RCX
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -429,7 +422,7 @@ auditsys:
movq %rax,%rsi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
call __audit_syscall_entry
- LOAD_ARGS 0 /* reload call-clobbered registers */
+ RESTORE_C_REGS /* reload call-clobbered registers */
jmp system_call_fastpath
/*
@@ -453,7 +446,7 @@ tracesys:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz auditsys
#endif
- SAVE_REST
+ SAVE_EXTRA_REGS
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
@@ -463,8 +456,8 @@ tracesys:
* We don't reload %rax because syscall_trace_enter() returned
* the value it wants us to use in the table lookup.
*/
- LOAD_ARGS ARGOFFSET, 1
- RESTORE_REST
+ RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
#else
@@ -515,7 +508,7 @@ int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
- SAVE_REST
+ SAVE_EXTRA_REGS
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
@@ -534,7 +527,7 @@ int_signal:
call do_notify_resume
1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
@@ -544,15 +537,12 @@ END(system_call)
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
- popq %r11 /* save return address */
- PARTIAL_FRAME 0
- SAVE_REST
- pushq %r11 /* put it back on stack */
+ DEFAULT_FRAME 0, 8 /* offset 8: return address */
+ SAVE_EXTRA_REGS 8
FIXUP_TOP_OF_STACK %r11, 8
- DEFAULT_FRAME 0 8 /* offset 8: return address */
call sys_\func
RESTORE_TOP_OF_STACK %r11, 8
- ret $REST_SKIP /* pop extended registers */
+ ret
CFI_ENDPROC
END(stub_\func)
.endm
@@ -560,7 +550,7 @@ END(stub_\func)
.macro FIXED_FRAME label,func
ENTRY(\label)
CFI_STARTPROC
- PARTIAL_FRAME 0 8 /* offset 8: return address */
+ DEFAULT_FRAME 0, 8 /* offset 8: return address */
FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
call \func
RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
@@ -577,12 +567,12 @@ END(\label)
ENTRY(stub_execve)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_execve
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
@@ -594,12 +584,12 @@ END(stub_execve)
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_rt_sigreturn)
@@ -608,12 +598,12 @@ END(stub_rt_sigreturn)
ENTRY(stub_x32_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys32_x32_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_rt_sigreturn)
@@ -621,13 +611,13 @@ END(stub_x32_rt_sigreturn)
ENTRY(stub_x32_execve)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call compat_sys_execve
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_execve)
@@ -683,51 +673,31 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- /* reserve pt_regs for scratch regs and rbp */
- subq $ORIG_RAX-RBP, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
- cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
-
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
-
- /* Save previous stack value */
- movq %rsp, %rsi
-
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
+ ALLOC_PTREGS_ON_STACK
+ SAVE_C_REGS
+ movq %rsp, %rdi /* arg1 for handler */
+ testl $3, CS(%rsp)
je 1f
SWAPGS
- /*
+1: /*
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl PER_CPU_VAR(irq_count)
+ incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- CFI_DEF_CFA_REGISTER rsi
+ CFI_DEF_CFA_REGISTER rdi
/* Store previous stack value */
- pushq %rsi
+ pushq %rdi
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
0x77 /* DW_OP_breg7 */, 0, \
0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x08 /* DW_OP_const1u */, SS+8, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
-
call \func
.endm
@@ -749,10 +719,9 @@ ret_from_intr:
/* Restore saved previous stack */
popq %rsi
- CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
- leaq ARGOFFSET-RBP(%rsi), %rsp
+ CFI_DEF_CFA rsi,SS+8 /* reg/off reset after def_cfa_expr */
+ movq %rsi, %rsp
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
exit_intr:
GET_THREAD_INFO(%rcx)
@@ -789,7 +758,8 @@ retint_restore_args: /* return to kernel space */
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 1,8,1
+ RESTORE_C_REGS
+ REMOVE_PTREGS_FROM_STACK 8
irq_return:
/*
@@ -876,12 +846,12 @@ retint_signal:
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
+ SAVE_EXTRA_REGS
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -1256,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
pushq_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
+ ALLOC_PTREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
@@ -1313,11 +1285,15 @@ ENTRY(paranoid_exit)
paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
- RESTORE_ALL 8
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PTREGS_FROM_STACK 8
jmp irq_return
paranoid_restore:
TRACE_IRQS_IRETQ_DEBUG 0
- RESTORE_ALL 8
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PTREGS_FROM_STACK 8
jmp irq_return
paranoid_userspace:
GET_THREAD_INFO(%rcx)
@@ -1412,7 +1388,7 @@ END(error_entry)
ENTRY(error_exit)
DEFAULT_FRAME
movl %ebx,%eax
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -1671,8 +1647,10 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
/* Pop the extra iret frame at once */
- RESTORE_ALL 6*8
+ REMOVE_PTREGS_FROM_STACK 6*8
/* Clear the NMI executing stack variable */
movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
index ca7f0d5..673da2f 100644
--- a/arch/x86/kernel/preempt.S
+++ b/arch/x86/kernel/preempt.S
@@ -6,9 +6,13 @@
ENTRY(___preempt_schedule)
CFI_STARTPROC
- SAVE_ALL
+ ALLOC_PTREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
call preempt_schedule
- RESTORE_ALL
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PTREGS_FROM_STACK
ret
CFI_ENDPROC
@@ -16,9 +20,13 @@ ENTRY(___preempt_schedule)
ENTRY(___preempt_schedule_context)
CFI_STARTPROC
- SAVE_ALL
+ ALLOC_PTREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
call preempt_schedule_context
- RESTORE_ALL
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PTREGS_FROM_STACK
ret
CFI_ENDPROC
--
1.8.1.4
^ permalink raw reply related [flat|nested] 41+ messages in thread
* [PATCH 5/5] x86: mass removal of ARGOFFSET
2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
` (2 preceding siblings ...)
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
@ 2014-08-01 14:48 ` Denys Vlasenko
2014-08-01 18:00 ` [PATCH 1/5] x86: entry_64.S: delete unused code Frederic Weisbecker
4 siblings, 0 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-01 14:48 UTC (permalink / raw)
To: linux-kernel
Cc: Denys Vlasenko, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
ARGOFFSET is zero now, removing it changes no code.
A few macros lost "offset" parameter, since it is always zero now too.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
---
arch/x86/ia32/ia32entry.S | 136 ++++++++++++++++++++---------------------
arch/x86/include/asm/calling.h | 2 -
arch/x86/kernel/entry_64.S | 62 +++++++++----------
3 files changed, 99 insertions(+), 101 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ef9ee16..4e51c7c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -41,13 +41,13 @@
movl %edx,%edx /* zero extension */
.endm
- /* clobbers %eax */
- .macro CLEAR_RREGS offset=0, _r9=rax
+ /* clobbers %eax */
+ .macro CLEAR_RREGS _r9=rax
xorl %eax,%eax
- movq %rax,\offset+R11(%rsp)
- movq %rax,\offset+R10(%rsp)
- movq %\_r9,\offset+R9(%rsp)
- movq %rax,\offset+R8(%rsp)
+ movq %rax,R11(%rsp)
+ movq %rax,R10(%rsp)
+ movq %\_r9,R9(%rsp)
+ movq %rax,R8(%rsp)
.endm
/*
@@ -60,14 +60,14 @@
* If it's -1 to make us punt the syscall, then (u32)-1 is still
* an appropriately invalid value.
*/
- .macro LOAD_ARGS32 offset, _r9=0
+ .macro LOAD_ARGS32 _r9=0
.if \_r9
- movl \offset+R9(%rsp),%r9d
+ movl R9(%rsp),%r9d
.endif
- movl \offset+RCX(%rsp),%ecx
- movl \offset+RDX(%rsp),%edx
- movl \offset+RSI(%rsp),%esi
- movl \offset+RDI(%rsp),%edi
+ movl RCX(%rsp),%ecx
+ movl RDX(%rsp),%edx
+ movl RSI(%rsp),%esi
+ movl RDI(%rsp),%edi
movl %eax,%eax /* zero extension */
.endm
@@ -152,8 +152,8 @@ ENTRY(ia32_sysenter_target)
1: movl (%rbp),%ebp
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
@@ -162,13 +162,13 @@ sysenter_do_call:
IA32_ARG_FIXUP
sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
jnz sysexit_audit
sysexit_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
/* clear IF, that popfq doesn't enable interrupts early */
andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
@@ -195,18 +195,18 @@ sysexit_from_sys_call:
movl %eax,%esi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
call __audit_syscall_entry
- movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
+ movl RAX(%rsp),%eax /* reload syscall number */
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
movl %ebx,%edi /* reload 1st syscall arg */
- movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
- movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
- movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
- movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
+ movl RCX(%rsp),%esi /* reload 2nd syscall arg */
+ movl RDX(%rsp),%edx /* reload 3rd syscall arg */
+ movl RSI(%rsp),%ecx /* reload 4th syscall arg */
+ movl RDI(%rsp),%r8d /* reload 5th syscall arg */
.endm
.macro auditsys_exit exit
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -217,13 +217,13 @@ sysexit_from_sys_call:
1: setbe %al /* 1 if error, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
call __audit_syscall_exit
- movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */
+ movq RAX(%rsp),%rax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl %edi,TI_flags+THREAD_INFO(%rsp,RIP)
jz \exit
- CLEAR_RREGS -ARGOFFSET
+ CLEAR_RREGS
jmp int_with_check
.endm
@@ -239,7 +239,7 @@ sysexit_audit:
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
jz sysenter_auditsys
#endif
SAVE_EXTRA_REGS
@@ -247,7 +247,7 @@ sysenter_tracesys:
movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
+ LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
@@ -293,17 +293,17 @@ ENTRY(ia32_cstar_target)
ALLOC_PTREGS_ON_STACK 8
SAVE_C_REGS_EXCEPT_RCX_R891011
movl %eax,%eax /* zero extension */
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+ movq %rax,ORIG_RAX(%rsp)
+ movq %rcx,RIP(%rsp)
+ CFI_REL_OFFSET rip,RIP
+ movq %rbp,RCX(%rsp) /* this lies slightly to ptrace */
movl %ebp,%ecx
- movq $__USER32_CS,CS-ARGOFFSET(%rsp)
- movq $__USER32_DS,SS-ARGOFFSET(%rsp)
- movq %r11,EFLAGS-ARGOFFSET(%rsp)
- /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- movq %r8,RSP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
+ movq $__USER32_CS,CS(%rsp)
+ movq $__USER32_DS,SS(%rsp)
+ movq %r11,EFLAGS(%rsp)
+ /*CFI_REL_OFFSET rflags,EFLAGS*/
+ movq %r8,RSP(%rsp)
+ CFI_REL_OFFSET rsp,RSP
/* no need to do an access_ok check here because r8 has been
32bit zero extended */
/* hardware stack frame is complete now */
@@ -311,8 +311,8 @@ ENTRY(ia32_cstar_target)
1: movl (%r8),%r9d
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cmpq $IA32_NR_syscalls-1,%rax
@@ -321,32 +321,32 @@ cstar_do_call:
IA32_ARG_FIXUP 1
cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
jnz sysretl_audit
sysretl_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
RESTORE_RSI_RDI_RDX
- movl RIP-ARGOFFSET(%rsp),%ecx
+ movl RIP(%rsp),%ecx
CFI_REGISTER rip,rcx
- movl EFLAGS-ARGOFFSET(%rsp),%r11d
+ movl EFLAGS(%rsp),%r11d
/*CFI_REGISTER rflags,r11*/
xorq %r10,%r10
xorq %r9,%r9
xorq %r8,%r8
TRACE_IRQS_ON
- movl RSP-ARGOFFSET(%rsp),%esp
+ movl RSP(%rsp),%esp
CFI_RESTORE rsp
USERGS_SYSRET32
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
CFI_RESTORE_STATE
- movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
+ movl %r9d,R9(%rsp) /* register to be clobbered by call */
auditsys_entry_common
- movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
+ movl R9(%rsp),%r9d /* reload 6th syscall arg */
jmp cstar_dispatch
sysretl_audit:
@@ -355,16 +355,16 @@ sysretl_audit:
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
jz cstar_auditsys
#endif
xchgl %r9d,%ebp
SAVE_EXTRA_REGS
- CLEAR_RREGS 0, r9
+ CLEAR_RREGS r9
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
+ LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
RESTORE_EXTRA_REGS
xchgl %ebp,%r9d
cmpq $(IA32_NR_syscalls-1),%rax
@@ -422,8 +422,8 @@ ENTRY(ia32_syscall)
this could be a problem. */
ALLOC_PTREGS_ON_STACK
SAVE_C_REGS_EXCEPT_R891011
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
jnz ia32_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
@@ -431,9 +431,9 @@ ia32_do_call:
IA32_ARG_FIXUP
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
ia32_ret_from_sys_call:
- CLEAR_RREGS -ARGOFFSET
+ CLEAR_RREGS
jmp int_ret_from_sys_call
ia32_tracesys:
@@ -442,7 +442,7 @@ ia32_tracesys:
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
+ LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
@@ -450,7 +450,7 @@ ia32_tracesys:
END(ia32_syscall)
ia32_badsys:
- movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+ movq $0,ORIG_RAX(%rsp)
movq $-ENOSYS,%rax
jmp ia32_sysret
@@ -482,17 +482,17 @@ ia32_ptregs_common:
CFI_ENDPROC
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-ARGOFFSET
- CFI_REL_OFFSET rax,RAX-ARGOFFSET
- CFI_REL_OFFSET rcx,RCX-ARGOFFSET
- CFI_REL_OFFSET rdx,RDX-ARGOFFSET
- CFI_REL_OFFSET rsi,RSI-ARGOFFSET
- CFI_REL_OFFSET rdi,RDI-ARGOFFSET
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
-/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
-/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
+ CFI_DEF_CFA rsp,SS+8
+ CFI_REL_OFFSET rax,RAX
+ CFI_REL_OFFSET rcx,RCX
+ CFI_REL_OFFSET rdx,RDX
+ CFI_REL_OFFSET rsi,RSI
+ CFI_REL_OFFSET rdi,RDI
+ CFI_REL_OFFSET rip,RIP
+/* CFI_REL_OFFSET cs,CS*/
+/* CFI_REL_OFFSET rflags,EFLAGS*/
+ CFI_REL_OFFSET rsp,RSP
+/* CFI_REL_OFFSET ss,SS*/
SAVE_EXTRA_REGS 8
call *%rax
RESTORE_EXTRA_REGS 8
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 10aff1e..31d0ac8 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -88,8 +88,6 @@ For 32-bit we have the following conventions - kernel is built with
#define RSP 19*8
#define SS 20*8
-#define ARGOFFSET 0
-
.macro ALLOC_PTREGS_ON_STACK addskip=0
subq $15*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 15*8+\addskip
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b3c3ebb..ec68a07 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -76,7 +76,7 @@ ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ offset=0
#ifdef CONFIG_TRACE_IRQFLAGS
bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
jnc 1f
@@ -110,7 +110,7 @@ ENDPROC(native_usergs_sysret64)
call debug_stack_reset
.endm
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ_DEBUG offset=0
bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON_DEBUG
@@ -187,16 +187,16 @@ ENDPROC(native_usergs_sysret64)
* frame that enables calling into C.
*/
.macro PARTIAL_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
- CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
- CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
- CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
- CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
- CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
- CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
- CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ XCPT_FRAME \start, ORIG_RAX+\offset
+ CFI_REL_OFFSET rdi, RDI+\offset
+ CFI_REL_OFFSET rsi, RSI+\offset
+ CFI_REL_OFFSET rdx, RDX+\offset
+ CFI_REL_OFFSET rcx, RCX+\offset
+ CFI_REL_OFFSET rax, RAX+\offset
+ CFI_REL_OFFSET r8, R8+\offset
+ CFI_REL_OFFSET r9, R9+\offset
+ CFI_REL_OFFSET r10, R10+\offset
+ CFI_REL_OFFSET r11, R11+\offset
.endm
/*
@@ -260,13 +260,13 @@ ENTRY(ret_from_fork)
RESTORE_EXTRA_REGS
- testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
+ testl $3, CS(%rsp) # from kernel_thread?
jz 1f
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
- RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+ RESTORE_TOP_OF_STACK %rdi
jmp ret_from_sys_call # go to the SYSRET fastpath
1:
@@ -333,10 +333,10 @@ GLOBAL(system_call_after_swapgs)
ENABLE_INTERRUPTS(CLBR_NONE)
ALLOC_PTREGS_ON_STACK 8
SAVE_C_REGS
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ movq %rax,ORIG_RAX(%rsp)
+ movq %rcx,RIP(%rsp)
+ CFI_REL_OFFSET rip,RIP
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
@@ -348,7 +348,7 @@ system_call_fastpath:
ja badsys
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
/*
* Syscall return path ending with SYSRET (fast path)
* Has incomplete stack frame and undefined top of stack.
@@ -360,7 +360,7 @@ sysret_check:
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
+ movl TI_flags+THREAD_INFO(%rsp,RIP),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -369,7 +369,7 @@ sysret_check:
*/
TRACE_IRQS_ON
RESTORE_C_REGS_EXCEPT_RCX
- movq RIP-ARGOFFSET(%rsp),%rcx
+ movq RIP(%rsp),%rcx
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
@@ -401,11 +401,11 @@ sysret_signal:
* These all wind up with the iret return path anyway,
* so just join that path right now.
*/
- FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+ FIXUP_TOP_OF_STACK %r11
jmp int_check_syscall_exit_work
badsys:
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+ movq $-ENOSYS,RAX(%rsp)
jmp ret_from_sys_call
#ifdef CONFIG_AUDITSYSCALL
@@ -431,7 +431,7 @@ auditsys:
* masked off.
*/
sysret_audit:
- movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
+ movq RAX(%rsp),%rsi /* second arg, syscall return value */
cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
setbe %al /* 1 if so, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
@@ -443,7 +443,7 @@ sysret_audit:
/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
jz auditsys
#endif
SAVE_EXTRA_REGS
@@ -467,7 +467,7 @@ tracesys:
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
/* Use IRET because user could have changed frame */
/*
@@ -551,9 +551,9 @@ END(stub_\func)
ENTRY(\label)
CFI_STARTPROC
DEFAULT_FRAME 0, 8 /* offset 8: return address */
- FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
+ FIXUP_TOP_OF_STACK %r11, 8
call \func
- RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
+ RESTORE_TOP_OF_STACK %r11, 8
ret
CFI_ENDPROC
END(\label)
@@ -711,7 +711,7 @@ common_interrupt:
ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
- /* 0(%rsp): old_rsp-ARGOFFSET */
+ /* 0(%rsp): old_rsp */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -725,7 +725,7 @@ ret_from_intr:
exit_intr:
GET_THREAD_INFO(%rcx)
- testl $3,CS-ARGOFFSET(%rsp)
+ testl $3,CS(%rsp)
je retint_kernel
/* Interrupt came from user space */
@@ -863,7 +863,7 @@ retint_signal:
ENTRY(retint_kernel)
cmpl $0,PER_CPU_VAR(__preempt_count)
jnz retint_restore_args
- bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
--
1.8.1.4
^ permalink raw reply related [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
@ 2014-08-01 17:04 ` Andy Lutomirski
2014-08-04 14:28 ` Denys Vlasenko
2014-08-01 18:09 ` Alexei Starovoitov
` (4 subsequent siblings)
5 siblings, 1 reply; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-01 17:04 UTC (permalink / raw)
To: Denys Vlasenko
Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Frederic Weisbecker,
X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook
On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 64-bit code was using six stack slots fewer by not saving/restoring
> registers which a callee-preserved according to C ABI,
> and not allocating space for them
This is great.
Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :) Maybe I'll give that a shot.
--Andy
.
>
> Only when syscall needed a complete "struct pt_regs",
> the complete area was allocated and filled in.
>
> This proved to be a source of significant obfuscation and subtle bugs.
> For example, stub_fork had to pop the return address,
> extend the struct, save registers, and push return address back. Ugly.
> ia32_ptregs_common pops return address and "returns" via jmp insn,
> throwing a wrench into CPU return stack cache.
>
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
>
> Macros which manipulate "struct pt_regs" on stack are reworked:
> ALLOC_PTREGS_ON_STACK allocates the structure.
> SAVE_C_REGS saves to it those registers which are clobbered by C code.
> SAVE_EXTRA_REGS saves to it all other registers.
> Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.
>
> ia32_ptregs_common, stub_fork and friends lost their ugly dance with
> return pointer.
>
> LOAD_ARGS32 in ia32entry.S now uses a symbolic stack offsets
> instead of magic numbers.
>
> Misleading and slightly wrong comments in "struct pt_regs" are fixed
> (four instances).
>
> Patch was run-tested: 64-bit executables, 32-bit executables,
> strace works.
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Frederic Weisbecker <fweisbec@gmail.com>
> CC: X86 ML <x86@kernel.org>
> CC: Alexei Starovoitov <ast@plumgrid.com>
> CC: Will Drewry <wad@chromium.org>
> CC: Kees Cook <keescook@chromium.org>
> CC: linux-kernel@vger.kernel.org
> ---
> arch/x86/ia32/ia32entry.S | 47 +++----
> arch/x86/include/asm/calling.h | 224 ++++++++++++++++-----------------
> arch/x86/include/asm/irqflags.h | 4 +-
> arch/x86/include/asm/ptrace.h | 13 +-
> arch/x86/include/uapi/asm/ptrace-abi.h | 16 ++-
> arch/x86/include/uapi/asm/ptrace.h | 13 +-
> arch/x86/kernel/entry_64.S | 132 ++++++++-----------
> arch/x86/kernel/preempt.S | 16 ++-
> 8 files changed, 232 insertions(+), 233 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 4299eb0..ef9ee16 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -62,12 +62,12 @@
> */
> .macro LOAD_ARGS32 offset, _r9=0
> .if \_r9
> - movl \offset+16(%rsp),%r9d
> + movl \offset+R9(%rsp),%r9d
> .endif
> - movl \offset+40(%rsp),%ecx
> - movl \offset+48(%rsp),%edx
> - movl \offset+56(%rsp),%esi
> - movl \offset+64(%rsp),%edi
> + movl \offset+RCX(%rsp),%ecx
> + movl \offset+RDX(%rsp),%edx
> + movl \offset+RSI(%rsp),%esi
> + movl \offset+RDI(%rsp),%edi
> movl %eax,%eax /* zero extension */
> .endm
>
> @@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
> CFI_REL_OFFSET rip,0
> pushq_cfi %rax
> cld
> - SAVE_ARGS 0,1,0
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS_EXCEPT_R891011
> /* no need to do an access_ok check here because rbp has been
> 32bit zero extended */
> ASM_STAC
> @@ -172,7 +173,8 @@ sysexit_from_sys_call:
> andl $~0x200,EFLAGS-R11(%rsp)
> movl RIP-R11(%rsp),%edx /* User %eip */
> CFI_REGISTER rip,rdx
> - RESTORE_ARGS 0,24,0,0,0,0
> + RESTORE_RSI_RDI
> + REMOVE_PTREGS_FROM_STACK 8*3
> xorq %r8,%r8
> xorq %r9,%r9
> xorq %r10,%r10
> @@ -240,13 +242,13 @@ sysenter_tracesys:
> testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> jz sysenter_auditsys
> #endif
> - SAVE_REST
> + SAVE_EXTRA_REGS
> CLEAR_RREGS
> movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
> movq %rsp,%rdi /* &pt_regs -> arg1 */
> call syscall_trace_enter
> LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> cmpq $(IA32_NR_syscalls-1),%rax
> ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
> jmp sysenter_do_call
> @@ -288,7 +290,8 @@ ENTRY(ia32_cstar_target)
> * disabled irqs and here we enable it straight after entry:
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> - SAVE_ARGS 8,0,0
> + ALLOC_PTREGS_ON_STACK 8
> + SAVE_C_REGS_EXCEPT_RCX_R891011
> movl %eax,%eax /* zero extension */
> movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
> movq %rcx,RIP-ARGOFFSET(%rsp)
> @@ -325,7 +328,7 @@ cstar_dispatch:
> jnz sysretl_audit
> sysretl_from_sys_call:
> andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> - RESTORE_ARGS 0,-ARG_SKIP,0,0,0
> + RESTORE_RSI_RDI_RDX
> movl RIP-ARGOFFSET(%rsp),%ecx
> CFI_REGISTER rip,rcx
> movl EFLAGS-ARGOFFSET(%rsp),%r11d
> @@ -356,13 +359,13 @@ cstar_tracesys:
> jz cstar_auditsys
> #endif
> xchgl %r9d,%ebp
> - SAVE_REST
> + SAVE_EXTRA_REGS
> CLEAR_RREGS 0, r9
> movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
> movq %rsp,%rdi /* &pt_regs -> arg1 */
> call syscall_trace_enter
> LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> xchgl %ebp,%r9d
> cmpq $(IA32_NR_syscalls-1),%rax
> ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
> @@ -417,7 +420,8 @@ ENTRY(ia32_syscall)
> cld
> /* note the registers are not zero extended to the sf.
> this could be a problem. */
> - SAVE_ARGS 0,1,0
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS_EXCEPT_R891011
> orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> jnz ia32_tracesys
> @@ -430,16 +434,16 @@ ia32_sysret:
> movq %rax,RAX-ARGOFFSET(%rsp)
> ia32_ret_from_sys_call:
> CLEAR_RREGS -ARGOFFSET
> - jmp int_ret_from_sys_call
> + jmp int_ret_from_sys_call
>
> -ia32_tracesys:
> - SAVE_REST
> +ia32_tracesys:
> + SAVE_EXTRA_REGS
> CLEAR_RREGS
> movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
> movq %rsp,%rdi /* &pt_regs -> arg1 */
> call syscall_trace_enter
> LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> cmpq $(IA32_NR_syscalls-1),%rax
> ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
> jmp ia32_do_call
> @@ -475,7 +479,6 @@ GLOBAL(stub32_clone)
>
> ALIGN
> ia32_ptregs_common:
> - popq %r11
> CFI_ENDPROC
> CFI_STARTPROC32 simple
> CFI_SIGNAL_FRAME
> @@ -490,9 +493,9 @@ ia32_ptregs_common:
> /* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
> CFI_REL_OFFSET rsp,RSP-ARGOFFSET
> /* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
> - SAVE_REST
> + SAVE_EXTRA_REGS 8
> call *%rax
> - RESTORE_REST
> - jmp ia32_sysret /* misbalances the return cache */
> + RESTORE_EXTRA_REGS 8
> + ret
> CFI_ENDPROC
> END(ia32_ptregs_common)
> diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
> index e176cea..10aff1e 100644
> --- a/arch/x86/include/asm/calling.h
> +++ b/arch/x86/include/asm/calling.h
> @@ -52,142 +52,132 @@ For 32-bit we have the following conventions - kernel is built with
>
> /*
> * 64-bit system call stack frame layout defines and helpers,
> - * for assembly code:
> + * for assembly code.
> */
>
> -#define R15 0
> -#define R14 8
> -#define R13 16
> -#define R12 24
> -#define RBP 32
> -#define RBX 40
> -
> -/* arguments: interrupts/non tracing syscalls only save up to here: */
> -#define R11 48
> -#define R10 56
> -#define R9 64
> -#define R8 72
> -#define RAX 80
> -#define RCX 88
> -#define RDX 96
> -#define RSI 104
> -#define RDI 112
> -#define ORIG_RAX 120 /* + error_code */
> -/* end of arguments */
> -
> -/* cpu exception frame or undefined in case of fast syscall: */
> -#define RIP 128
> -#define CS 136
> -#define EFLAGS 144
> -#define RSP 152
> -#define SS 160
> -
> -#define ARGOFFSET R11
> -
> - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
> - subq $9*8+\addskip, %rsp
> - CFI_ADJUST_CFA_OFFSET 9*8+\addskip
> - movq_cfi rdi, 8*8
> - movq_cfi rsi, 7*8
> - movq_cfi rdx, 6*8
> -
> - .if \save_rcx
> - movq_cfi rcx, 5*8
> - .endif
> -
> - movq_cfi rax, 4*8
> +/* The layout forms the "struct pt_regs" on the stack: */
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
> +#define R15 0*8
> +#define R14 1*8
> +#define R13 2*8
> +#define R12 3*8
> +#define RBP 4*8
> +#define RBX 5*8
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
> +#define R11 6*8
> +#define R10 7*8
> +#define R9 8*8
> +#define R8 9*8
> +#define RAX 10*8
> +#define RCX 11*8
> +#define RDX 12*8
> +#define RSI 13*8
> +#define RDI 14*8
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
> +#define ORIG_RAX 15*8
> +/* Return frame for iretq */
> +#define RIP 16*8
> +#define CS 17*8
> +#define EFLAGS 18*8
> +#define RSP 19*8
> +#define SS 20*8
> +
> +#define ARGOFFSET 0
> +
> + .macro ALLOC_PTREGS_ON_STACK addskip=0
> + subq $15*8+\addskip, %rsp
> + CFI_ADJUST_CFA_OFFSET 15*8+\addskip
> + .endm
>
> - .if \save_r891011
> - movq_cfi r8, 3*8
> - movq_cfi r9, 2*8
> - movq_cfi r10, 1*8
> - movq_cfi r11, 0*8
> + .macro SAVE_C_REGS_HELPER rcx=1 r8plus=1
> + movq_cfi rdi, 14*8
> + movq_cfi rsi, 13*8
> + movq_cfi rdx, 12*8
> + .if \rcx
> + movq_cfi rcx, 11*8
> .endif
> -
> + movq_cfi rax, 10*8
> + .if \r8plus
> + movq_cfi r8, 9*8
> + movq_cfi r9, 8*8
> + movq_cfi r10, 7*8
> + movq_cfi r11, 6*8
> + .endif
> + .endm
> + .macro SAVE_C_REGS
> + SAVE_C_REGS_HELPER 1, 1
> + .endm
> + .macro SAVE_C_REGS_EXCEPT_R891011
> + SAVE_C_REGS_HELPER 1, 0
> + .endm
> + .macro SAVE_C_REGS_EXCEPT_RCX_R891011
> + SAVE_C_REGS_HELPER 0, 0
> .endm
>
> -#define ARG_SKIP (9*8)
> + .macro SAVE_EXTRA_REGS offset=0
> + movq_cfi rbx, 5*8+\offset
> + movq_cfi rbp, 4*8+\offset
> + movq_cfi r12, 3*8+\offset
> + movq_cfi r13, 2*8+\offset
> + movq_cfi r14, 1*8+\offset
> + movq_cfi r15, 0*8+\offset
> + .endm
>
> - .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
> - rstor_r8910=1, rstor_rdx=1
> - .if \rstor_r11
> - movq_cfi_restore 0*8, r11
> - .endif
> + .macro RESTORE_EXTRA_REGS offset=0
> + movq_cfi_restore 0*8+\offset, r15
> + movq_cfi_restore 1*8+\offset, r14
> + movq_cfi_restore 2*8+\offset, r13
> + movq_cfi_restore 3*8+\offset, r12
> + movq_cfi_restore 4*8+\offset, rbp
> + movq_cfi_restore 5*8+\offset, rbx
> + .endm
>
> - .if \rstor_r8910
> - movq_cfi_restore 1*8, r10
> - movq_cfi_restore 2*8, r9
> - movq_cfi_restore 3*8, r8
> + .macro RESTORE_C_REGS_HELPER rax=1, rcx=1, r11=1, r8910=1, rdx=1
> + .if \r11
> + movq_cfi_restore 6*8, r11
> .endif
> -
> - .if \rstor_rax
> - movq_cfi_restore 4*8, rax
> + .if \r8910
> + movq_cfi_restore 7*8, r10
> + movq_cfi_restore 8*8, r9
> + movq_cfi_restore 9*8, r8
> .endif
> -
> - .if \rstor_rcx
> - movq_cfi_restore 5*8, rcx
> + .if \rax
> + movq_cfi_restore 10*8, rax
> .endif
> -
> - .if \rstor_rdx
> - movq_cfi_restore 6*8, rdx
> + .if \rcx
> + movq_cfi_restore 11*8, rcx
> .endif
> -
> - movq_cfi_restore 7*8, rsi
> - movq_cfi_restore 8*8, rdi
> -
> - .if ARG_SKIP+\addskip > 0
> - addq $ARG_SKIP+\addskip, %rsp
> - CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
> + .if \rdx
> + movq_cfi_restore 12*8, rdx
> .endif
> + movq_cfi_restore 13*8, rsi
> + movq_cfi_restore 14*8, rdi
> .endm
> -
> - .macro LOAD_ARGS offset, skiprax=0
> - movq \offset(%rsp), %r11
> - movq \offset+8(%rsp), %r10
> - movq \offset+16(%rsp), %r9
> - movq \offset+24(%rsp), %r8
> - movq \offset+40(%rsp), %rcx
> - movq \offset+48(%rsp), %rdx
> - movq \offset+56(%rsp), %rsi
> - movq \offset+64(%rsp), %rdi
> - .if \skiprax
> - .else
> - movq \offset+72(%rsp), %rax
> - .endif
> + .macro RESTORE_C_REGS
> + RESTORE_C_REGS_HELPER 1,1,1,1,1
> .endm
> -
> -#define REST_SKIP (6*8)
> -
> - .macro SAVE_REST
> - subq $REST_SKIP, %rsp
> - CFI_ADJUST_CFA_OFFSET REST_SKIP
> - movq_cfi rbx, 5*8
> - movq_cfi rbp, 4*8
> - movq_cfi r12, 3*8
> - movq_cfi r13, 2*8
> - movq_cfi r14, 1*8
> - movq_cfi r15, 0*8
> + .macro RESTORE_C_REGS_EXCEPT_RAX
> + RESTORE_C_REGS_HELPER 0,1,1,1,1
> .endm
> -
> - .macro RESTORE_REST
> - movq_cfi_restore 0*8, r15
> - movq_cfi_restore 1*8, r14
> - movq_cfi_restore 2*8, r13
> - movq_cfi_restore 3*8, r12
> - movq_cfi_restore 4*8, rbp
> - movq_cfi_restore 5*8, rbx
> - addq $REST_SKIP, %rsp
> - CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
> + .macro RESTORE_C_REGS_EXCEPT_RCX
> + RESTORE_C_REGS_HELPER 1,0,1,1,1
> .endm
> -
> - .macro SAVE_ALL
> - SAVE_ARGS
> - SAVE_REST
> + .macro RESTORE_RSI_RDI
> + RESTORE_C_REGS_HELPER 0,0,0,0,0
> + .endm
> + .macro RESTORE_RSI_RDI_RDX
> + RESTORE_C_REGS_HELPER 0,0,0,0,1
> .endm
>
> - .macro RESTORE_ALL addskip=0
> - RESTORE_REST
> - RESTORE_ARGS 1, \addskip
> + .macro REMOVE_PTREGS_FROM_STACK addskip=0
> + addq $15*8+\addskip, %rsp
> + CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
> .endm
>
> .macro icebp
> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> index bba3cf8..6f98c16 100644
> --- a/arch/x86/include/asm/irqflags.h
> +++ b/arch/x86/include/asm/irqflags.h
> @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
> #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
> TRACE_IRQS_ON; \
> sti; \
> - SAVE_REST; \
> + SAVE_EXTRA_REGS; \
> LOCKDEP_SYS_EXIT; \
> - RESTORE_REST; \
> + RESTORE_EXTRA_REGS; \
> cli; \
> TRACE_IRQS_OFF;
>
> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
> index 6205f0c..c822b35 100644
> --- a/arch/x86/include/asm/ptrace.h
> +++ b/arch/x86/include/asm/ptrace.h
> @@ -31,13 +31,17 @@ struct pt_regs {
> #else /* __i386__ */
>
> struct pt_regs {
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
> unsigned long r15;
> unsigned long r14;
> unsigned long r13;
> unsigned long r12;
> unsigned long bp;
> unsigned long bx;
> -/* arguments: non interrupts/non tracing syscalls only save up to here*/
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
> unsigned long r11;
> unsigned long r10;
> unsigned long r9;
> @@ -47,9 +51,12 @@ struct pt_regs {
> unsigned long dx;
> unsigned long si;
> unsigned long di;
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
> unsigned long orig_ax;
> -/* end of arguments */
> -/* cpu exception frame or undefined */
> +/* Return frame for iretq */
> unsigned long ip;
> unsigned long cs;
> unsigned long flags;
> diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
> index 7b0a55a..580aee3 100644
> --- a/arch/x86/include/uapi/asm/ptrace-abi.h
> +++ b/arch/x86/include/uapi/asm/ptrace-abi.h
> @@ -25,13 +25,17 @@
> #else /* __i386__ */
>
> #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
> #define R15 0
> #define R14 8
> #define R13 16
> #define R12 24
> #define RBP 32
> #define RBX 40
> -/* arguments: interrupts/non tracing syscalls only save up to here*/
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
> #define R11 48
> #define R10 56
> #define R9 64
> @@ -41,15 +45,17 @@
> #define RDX 96
> #define RSI 104
> #define RDI 112
> -#define ORIG_RAX 120 /* = ERROR */
> -/* end of arguments */
> -/* cpu exception frame or undefined in case of fast syscall. */
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
> +#define ORIG_RAX 120
> +/* Return frame for iretq */
> #define RIP 128
> #define CS 136
> #define EFLAGS 144
> #define RSP 152
> #define SS 160
> -#define ARGOFFSET R11
> #endif /* __ASSEMBLY__ */
>
> /* top of stack page */
> diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
> index ac4b9aa..bc16115 100644
> --- a/arch/x86/include/uapi/asm/ptrace.h
> +++ b/arch/x86/include/uapi/asm/ptrace.h
> @@ -41,13 +41,17 @@ struct pt_regs {
> #ifndef __KERNEL__
>
> struct pt_regs {
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
> unsigned long r15;
> unsigned long r14;
> unsigned long r13;
> unsigned long r12;
> unsigned long rbp;
> unsigned long rbx;
> -/* arguments: non interrupts/non tracing syscalls only save up to here*/
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
> unsigned long r11;
> unsigned long r10;
> unsigned long r9;
> @@ -57,9 +61,12 @@ struct pt_regs {
> unsigned long rdx;
> unsigned long rsi;
> unsigned long rdi;
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
> unsigned long orig_rax;
> -/* end of arguments */
> -/* cpu exception frame or undefined */
> +/* Return frame for iretq */
> unsigned long rip;
> unsigned long cs;
> unsigned long eflags;
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 37f7d95..b3c3ebb 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -26,12 +26,6 @@
> * Some macro usage:
> * - CFI macros are used to generate dwarf2 unwind information for better
> * backtraces. They don't change any code.
> - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
> - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
> - * There are unfortunately lots of special cases where some registers
> - * not touched. The macro is a big mess that should be cleaned up.
> - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
> - * Gives a full stack frame.
> * - ENTRY/END Define functions in the symbol table.
> * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
> * frame that is otherwise undefined after a SYSCALL
> @@ -264,7 +258,7 @@ ENTRY(ret_from_fork)
>
> GET_THREAD_INFO(%rcx)
>
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
>
> testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
> jz 1f
> @@ -276,12 +270,10 @@ ENTRY(ret_from_fork)
> jmp ret_from_sys_call # go to the SYSRET fastpath
>
> 1:
> - subq $REST_SKIP, %rsp # leave space for volatiles
> - CFI_ADJUST_CFA_OFFSET REST_SKIP
> movq %rbp, %rdi
> call *%rbx
> movl $0, RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(ret_from_fork)
> @@ -339,7 +331,8 @@ GLOBAL(system_call_after_swapgs)
> * and short:
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> - SAVE_ARGS 8,0
> + ALLOC_PTREGS_ON_STACK 8
> + SAVE_C_REGS
> movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
> movq %rcx,RIP-ARGOFFSET(%rsp)
> CFI_REL_OFFSET rip,RIP-ARGOFFSET
> @@ -375,9 +368,9 @@ sysret_check:
> * sysretq will re-enable interrupts:
> */
> TRACE_IRQS_ON
> + RESTORE_C_REGS_EXCEPT_RCX
> movq RIP-ARGOFFSET(%rsp),%rcx
> CFI_REGISTER rip,rcx
> - RESTORE_ARGS 1,-ARG_SKIP,0
> /*CFI_REGISTER rflags,r11*/
> movq PER_CPU_VAR(old_rsp), %rsp
> USERGS_SYSRET64
> @@ -429,7 +422,7 @@ auditsys:
> movq %rax,%rsi /* 2nd arg: syscall number */
> movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
> call __audit_syscall_entry
> - LOAD_ARGS 0 /* reload call-clobbered registers */
> + RESTORE_C_REGS /* reload call-clobbered registers */
> jmp system_call_fastpath
>
> /*
> @@ -453,7 +446,7 @@ tracesys:
> testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> jz auditsys
> #endif
> - SAVE_REST
> + SAVE_EXTRA_REGS
> movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
> FIXUP_TOP_OF_STACK %rdi
> movq %rsp,%rdi
> @@ -463,8 +456,8 @@ tracesys:
> * We don't reload %rax because syscall_trace_enter() returned
> * the value it wants us to use in the table lookup.
> */
> - LOAD_ARGS ARGOFFSET, 1
> - RESTORE_REST
> + RESTORE_C_REGS_EXCEPT_RAX
> + RESTORE_EXTRA_REGS
> #if __SYSCALL_MASK == ~0
> cmpq $__NR_syscall_max,%rax
> #else
> @@ -515,7 +508,7 @@ int_very_careful:
> TRACE_IRQS_ON
> ENABLE_INTERRUPTS(CLBR_NONE)
> int_check_syscall_exit_work:
> - SAVE_REST
> + SAVE_EXTRA_REGS
> /* Check for syscall exit trace */
> testl $_TIF_WORK_SYSCALL_EXIT,%edx
> jz int_signal
> @@ -534,7 +527,7 @@ int_signal:
> call do_notify_resume
> 1: movl $_TIF_WORK_MASK,%edi
> int_restore_rest:
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> DISABLE_INTERRUPTS(CLBR_NONE)
> TRACE_IRQS_OFF
> jmp int_with_check
> @@ -544,15 +537,12 @@ END(system_call)
> .macro FORK_LIKE func
> ENTRY(stub_\func)
> CFI_STARTPROC
> - popq %r11 /* save return address */
> - PARTIAL_FRAME 0
> - SAVE_REST
> - pushq %r11 /* put it back on stack */
> + DEFAULT_FRAME 0, 8 /* offset 8: return address */
> + SAVE_EXTRA_REGS 8
> FIXUP_TOP_OF_STACK %r11, 8
> - DEFAULT_FRAME 0 8 /* offset 8: return address */
> call sys_\func
> RESTORE_TOP_OF_STACK %r11, 8
> - ret $REST_SKIP /* pop extended registers */
> + ret
> CFI_ENDPROC
> END(stub_\func)
> .endm
> @@ -560,7 +550,7 @@ END(stub_\func)
> .macro FIXED_FRAME label,func
> ENTRY(\label)
> CFI_STARTPROC
> - PARTIAL_FRAME 0 8 /* offset 8: return address */
> + DEFAULT_FRAME 0, 8 /* offset 8: return address */
> FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
> call \func
> RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
> @@ -577,12 +567,12 @@ END(\label)
> ENTRY(stub_execve)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call sys_execve
> movq %rax,RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_execve)
> @@ -594,12 +584,12 @@ END(stub_execve)
> ENTRY(stub_rt_sigreturn)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call sys_rt_sigreturn
> movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_rt_sigreturn)
> @@ -608,12 +598,12 @@ END(stub_rt_sigreturn)
> ENTRY(stub_x32_rt_sigreturn)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call sys32_x32_rt_sigreturn
> movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_x32_rt_sigreturn)
> @@ -621,13 +611,13 @@ END(stub_x32_rt_sigreturn)
> ENTRY(stub_x32_execve)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call compat_sys_execve
> RESTORE_TOP_OF_STACK %r11
> movq %rax,RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_x32_execve)
> @@ -683,51 +673,31 @@ END(interrupt)
>
> /* 0(%rsp): ~(interrupt number) */
> .macro interrupt func
> - /* reserve pt_regs for scratch regs and rbp */
> - subq $ORIG_RAX-RBP, %rsp
> - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
> - cld
> - /* start from rbp in pt_regs and jump over */
> - movq_cfi rdi, (RDI-RBP)
> - movq_cfi rsi, (RSI-RBP)
> - movq_cfi rdx, (RDX-RBP)
> - movq_cfi rcx, (RCX-RBP)
> - movq_cfi rax, (RAX-RBP)
> - movq_cfi r8, (R8-RBP)
> - movq_cfi r9, (R9-RBP)
> - movq_cfi r10, (R10-RBP)
> - movq_cfi r11, (R11-RBP)
> -
> - /* Save rbp so that we can unwind from get_irq_regs() */
> - movq_cfi rbp, 0
> -
> - /* Save previous stack value */
> - movq %rsp, %rsi
> -
> - leaq -RBP(%rsp),%rdi /* arg1 for handler */
> - testl $3, CS-RBP(%rsi)
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS
> + movq %rsp, %rdi /* arg1 for handler */
> + testl $3, CS(%rsp)
> je 1f
> SWAPGS
> - /*
> +1: /*
> * irq_count is used to check if a CPU is already on an interrupt stack
> * or not. While this is essentially redundant with preempt_count it is
> * a little cheaper to use a separate counter in the PDA (short of
> * moving irq_enter into assembly, which would be too much work)
> */
> -1: incl PER_CPU_VAR(irq_count)
> + incl PER_CPU_VAR(irq_count)
> cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
> - CFI_DEF_CFA_REGISTER rsi
> + CFI_DEF_CFA_REGISTER rdi
>
> /* Store previous stack value */
> - pushq %rsi
> + pushq %rdi
> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
> 0x77 /* DW_OP_breg7 */, 0, \
> 0x06 /* DW_OP_deref */, \
> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
> + 0x08 /* DW_OP_const1u */, SS+8, \
> 0x22 /* DW_OP_plus */
> /* We entered an interrupt context - irqs are off: */
> TRACE_IRQS_OFF
> -
> call \func
> .endm
>
> @@ -749,10 +719,9 @@ ret_from_intr:
>
> /* Restore saved previous stack */
> popq %rsi
> - CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
> - leaq ARGOFFSET-RBP(%rsi), %rsp
> + CFI_DEF_CFA rsi,SS+8 /* reg/off reset after def_cfa_expr */
> + movq %rsi, %rsp
> CFI_DEF_CFA_REGISTER rsp
> - CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
>
> exit_intr:
> GET_THREAD_INFO(%rcx)
> @@ -789,7 +758,8 @@ retint_restore_args: /* return to kernel space */
> */
> TRACE_IRQS_IRETQ
> restore_args:
> - RESTORE_ARGS 1,8,1
> + RESTORE_C_REGS
> + REMOVE_PTREGS_FROM_STACK 8
>
> irq_return:
> /*
> @@ -876,12 +846,12 @@ retint_signal:
> jz retint_swapgs
> TRACE_IRQS_ON
> ENABLE_INTERRUPTS(CLBR_NONE)
> - SAVE_REST
> + SAVE_EXTRA_REGS
> movq $-1,ORIG_RAX(%rsp)
> xorl %esi,%esi # oldset
> movq %rsp,%rdi # &pt_regs
> call do_notify_resume
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> DISABLE_INTERRUPTS(CLBR_NONE)
> TRACE_IRQS_OFF
> GET_THREAD_INFO(%rcx)
> @@ -1256,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
> addq $0x30,%rsp
> CFI_ADJUST_CFA_OFFSET -0x30
> pushq_cfi $-1 /* orig_ax = -1 => not a system call */
> - SAVE_ALL
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS
> + SAVE_EXTRA_REGS
> jmp error_exit
> CFI_ENDPROC
> END(xen_failsafe_callback)
> @@ -1313,11 +1285,15 @@ ENTRY(paranoid_exit)
> paranoid_swapgs:
> TRACE_IRQS_IRETQ 0
> SWAPGS_UNSAFE_STACK
> - RESTORE_ALL 8
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> + REMOVE_PTREGS_FROM_STACK 8
> jmp irq_return
> paranoid_restore:
> TRACE_IRQS_IRETQ_DEBUG 0
> - RESTORE_ALL 8
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> + REMOVE_PTREGS_FROM_STACK 8
> jmp irq_return
> paranoid_userspace:
> GET_THREAD_INFO(%rcx)
> @@ -1412,7 +1388,7 @@ END(error_entry)
> ENTRY(error_exit)
> DEFAULT_FRAME
> movl %ebx,%eax
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> DISABLE_INTERRUPTS(CLBR_NONE)
> TRACE_IRQS_OFF
> GET_THREAD_INFO(%rcx)
> @@ -1671,8 +1647,10 @@ end_repeat_nmi:
> nmi_swapgs:
> SWAPGS_UNSAFE_STACK
> nmi_restore:
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> /* Pop the extra iret frame at once */
> - RESTORE_ALL 6*8
> + REMOVE_PTREGS_FROM_STACK 6*8
>
> /* Clear the NMI executing stack variable */
> movq $0, 5*8(%rsp)
> diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
> index ca7f0d5..673da2f 100644
> --- a/arch/x86/kernel/preempt.S
> +++ b/arch/x86/kernel/preempt.S
> @@ -6,9 +6,13 @@
>
> ENTRY(___preempt_schedule)
> CFI_STARTPROC
> - SAVE_ALL
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS
> + SAVE_EXTRA_REGS
> call preempt_schedule
> - RESTORE_ALL
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> + REMOVE_PTREGS_FROM_STACK
> ret
> CFI_ENDPROC
>
> @@ -16,9 +20,13 @@ ENTRY(___preempt_schedule)
>
> ENTRY(___preempt_schedule_context)
> CFI_STARTPROC
> - SAVE_ALL
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS
> + SAVE_EXTRA_REGS
> call preempt_schedule_context
> - RESTORE_ALL
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> + REMOVE_PTREGS_FROM_STACK
> ret
> CFI_ENDPROC
>
> --
> 1.8.1.4
>
--
Andy Lutomirski
AMA Capital Management, LLC
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 1/5] x86: entry_64.S: delete unused code
2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
` (3 preceding siblings ...)
2014-08-01 14:48 ` [PATCH 5/5] x86: mass removal of ARGOFFSET Denys Vlasenko
@ 2014-08-01 18:00 ` Frederic Weisbecker
4 siblings, 0 replies; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 18:00 UTC (permalink / raw)
To: Denys Vlasenko
Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook
On Fri, Aug 01, 2014 at 04:48:14PM +0200, Denys Vlasenko wrote:
> A define, two macros and an unreferenced bit of assembly are gone.
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
2014-08-01 17:04 ` Andy Lutomirski
@ 2014-08-01 18:09 ` Alexei Starovoitov
2014-08-01 18:30 ` Oleg Nesterov
` (3 subsequent siblings)
5 siblings, 0 replies; 41+ messages in thread
From: Alexei Starovoitov @ 2014-08-01 18:09 UTC (permalink / raw)
To: Denys Vlasenko
Cc: LKML, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
Frederic Weisbecker, X86 ML, Will Drewry, Kees Cook
On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 64-bit code was using six stack slots fewer by not saving/restoring
> registers which a callee-preserved according to C ABI,
> and not allocating space for them.
>
> Only when syscall needed a complete "struct pt_regs",
> the complete area was allocated and filled in.
>
> This proved to be a source of significant obfuscation and subtle bugs.
> For example, stub_fork had to pop the return address,
> extend the struct, save registers, and push return address back. Ugly.
> ia32_ptregs_common pops return address and "returns" via jmp insn,
> throwing a wrench into CPU return stack cache.
>
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
>
> Macros which manipulate "struct pt_regs" on stack are reworked:
> ALLOC_PTREGS_ON_STACK allocates the structure.
> SAVE_C_REGS saves to it those registers which are clobbered by C code.
> SAVE_EXTRA_REGS saves to it all other registers.
> Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.
Looks like a nice cleanup at the cost of extra 48 byte stack gap for fast path.
I'm guessing the gap shouldn't affect performance, but would be nice
to see performance numbers before/after.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user
2014-08-01 14:48 ` [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user Denys Vlasenko
@ 2014-08-01 18:30 ` Frederic Weisbecker
0 siblings, 0 replies; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 18:30 UTC (permalink / raw)
To: Denys Vlasenko
Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook
On Fri, Aug 01, 2014 at 04:48:16PM +0200, Denys Vlasenko wrote:
> No code changes.
>
> This is a preparatory patch for change in "struct pt_regs" handling.
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>
One macro dissolved == one Hop-o'-My-Thumb pebble less to drop before taking
a jump on the editor.
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
2014-08-01 17:04 ` Andy Lutomirski
2014-08-01 18:09 ` Alexei Starovoitov
@ 2014-08-01 18:30 ` Oleg Nesterov
2014-08-01 18:35 ` H. Peter Anvin
` (2 subsequent siblings)
5 siblings, 0 replies; 41+ messages in thread
From: Oleg Nesterov @ 2014-08-01 18:30 UTC (permalink / raw)
To: Denys Vlasenko
Cc: linux-kernel, H. Peter Anvin, Andy Lutomirski,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
On 08/01, Denys Vlasenko wrote:
>
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
I obviously like this change very much. Unfortunately I can only ack the
intent ;)
I really hope that maintainers will take a closer look.
Oleg.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
` (2 preceding siblings ...)
2014-08-01 18:30 ` Oleg Nesterov
@ 2014-08-01 18:35 ` H. Peter Anvin
2014-08-01 22:11 ` Denys Vlasenko
2014-08-01 22:52 ` Frederic Weisbecker
2014-08-01 23:19 ` Frederic Weisbecker
5 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-01 18:35 UTC (permalink / raw)
To: Denys Vlasenko, linux-kernel
Cc: Oleg Nesterov, Andy Lutomirski, Frederic Weisbecker, X86 ML,
Alexei Starovoitov, Will Drewry, Kees Cook
On 08/01/2014 07:48 AM, Denys Vlasenko wrote:
>
> Patch was run-tested: 64-bit executables, 32-bit executables,
> strace works.
>
Could you please try to see if there is a measurable change in the
latency of a trivial syscall?
-hpa
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 18:35 ` H. Peter Anvin
@ 2014-08-01 22:11 ` Denys Vlasenko
2014-08-01 22:13 ` H. Peter Anvin
0 siblings, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-01 22:11 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
Andy Lutomirski, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
On Fri, Aug 1, 2014 at 8:35 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 08/01/2014 07:48 AM, Denys Vlasenko wrote:
>>
>> Patch was run-tested: 64-bit executables, 32-bit executables,
>> strace works.
>>
>
> Could you please try to see if there is a measurable change in the
> latency of a trivial syscall?
Will do.
Something along the lines of "how long does it take to execute two
gazillions of getppid()?"
I'll send v2 shortly - I just figured out what was this mysterious
dance with allocating "not-quite-full struct pt_regs" on interrupt path:
It is intended to make nested interrupts to have better
backtraces. I broke that by this patch, v2 will have this feature retained.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 22:11 ` Denys Vlasenko
@ 2014-08-01 22:13 ` H. Peter Anvin
2014-08-02 21:14 ` Andy Lutomirski
0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-01 22:13 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
Andy Lutomirski, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>
>> Could you please try to see if there is a measurable change in the
>> latency of a trivial syscall?
>
> Will do.
> Something along the lines of "how long does it take to execute two
> gazillions of getppid()?"
>
Something like that, yes, but you have to run enough data points so you
can determine if the difference is statistically significant or just noise.
-hpa
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
` (3 preceding siblings ...)
2014-08-01 18:35 ` H. Peter Anvin
@ 2014-08-01 22:52 ` Frederic Weisbecker
2014-08-01 23:19 ` Frederic Weisbecker
5 siblings, 0 replies; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 22:52 UTC (permalink / raw)
To: Denys Vlasenko
Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook
On Fri, Aug 01, 2014 at 04:48:17PM +0200, Denys Vlasenko wrote:
> 64-bit code was using six stack slots fewer by not saving/restoring
> registers which a callee-preserved according to C ABI,
> and not allocating space for them.
>
> Only when syscall needed a complete "struct pt_regs",
> the complete area was allocated and filled in.
>
> This proved to be a source of significant obfuscation and subtle bugs.
> For example, stub_fork had to pop the return address,
> extend the struct, save registers, and push return address back. Ugly.
> ia32_ptregs_common pops return address and "returns" via jmp insn,
> throwing a wrench into CPU return stack cache.
>
> This patch changes code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
>
> Macros which manipulate "struct pt_regs" on stack are reworked:
> ALLOC_PTREGS_ON_STACK allocates the structure.
> SAVE_C_REGS saves to it those registers which are clobbered by C code.
> SAVE_EXTRA_REGS saves to it all other registers.
> Corresponding RESTORE_* and REMOVE_PTREGS_FROM_STACK macros reverse it.
>
> ia32_ptregs_common, stub_fork and friends lost their ugly dance with
> return pointer.
>
> LOAD_ARGS32 in ia32entry.S now uses a symbolic stack offsets
> instead of magic numbers.
>
> Misleading and slightly wrong comments in "struct pt_regs" are fixed
> (four instances).
>
> Patch was run-tested: 64-bit executables, 32-bit executables,
> strace works.
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Frederic Weisbecker <fweisbec@gmail.com>
> CC: X86 ML <x86@kernel.org>
> CC: Alexei Starovoitov <ast@plumgrid.com>
> CC: Will Drewry <wad@chromium.org>
> CC: Kees Cook <keescook@chromium.org>
> CC: linux-kernel@vger.kernel.org
> ---
> arch/x86/ia32/ia32entry.S | 47 +++----
> arch/x86/include/asm/calling.h | 224 ++++++++++++++++-----------------
> arch/x86/include/asm/irqflags.h | 4 +-
> arch/x86/include/asm/ptrace.h | 13 +-
> arch/x86/include/uapi/asm/ptrace-abi.h | 16 ++-
> arch/x86/include/uapi/asm/ptrace.h | 13 +-
> arch/x86/kernel/entry_64.S | 132 ++++++++-----------
> arch/x86/kernel/preempt.S | 16 ++-
> 8 files changed, 232 insertions(+), 233 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 4299eb0..ef9ee16 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -62,12 +62,12 @@
> */
> .macro LOAD_ARGS32 offset, _r9=0
> .if \_r9
> - movl \offset+16(%rsp),%r9d
> + movl \offset+R9(%rsp),%r9d
> .endif
> - movl \offset+40(%rsp),%ecx
> - movl \offset+48(%rsp),%edx
> - movl \offset+56(%rsp),%esi
> - movl \offset+64(%rsp),%edi
> + movl \offset+RCX(%rsp),%ecx
> + movl \offset+RDX(%rsp),%edx
> + movl \offset+RSI(%rsp),%esi
> + movl \offset+RDI(%rsp),%edi
> movl %eax,%eax /* zero extension */
> .endm
>
> @@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
> CFI_REL_OFFSET rip,0
> pushq_cfi %rax
> cld
> - SAVE_ARGS 0,1,0
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS_EXCEPT_R891011
> /* no need to do an access_ok check here because rbp has been
> 32bit zero extended */
> ASM_STAC
> @@ -172,7 +173,8 @@ sysexit_from_sys_call:
> andl $~0x200,EFLAGS-R11(%rsp)
> movl RIP-R11(%rsp),%edx /* User %eip */
> CFI_REGISTER rip,rdx
> - RESTORE_ARGS 0,24,0,0,0,0
> + RESTORE_RSI_RDI
I heard there will be a v2 so I'll probably wait for it to review this patch
which really requires 0db where I sit.
But the macro names like above look much clearer as well!
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
` (4 preceding siblings ...)
2014-08-01 22:52 ` Frederic Weisbecker
@ 2014-08-01 23:19 ` Frederic Weisbecker
2014-08-04 3:03 ` Denys Vlasenko
5 siblings, 1 reply; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-01 23:19 UTC (permalink / raw)
To: Denys Vlasenko
Cc: linux-kernel, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
X86 ML, Alexei Starovoitov, Will Drewry, Kees Cook
On Fri, Aug 01, 2014 at 04:48:17PM +0200, Denys Vlasenko wrote:
>
> /* 0(%rsp): ~(interrupt number) */
> .macro interrupt func
> - /* reserve pt_regs for scratch regs and rbp */
> - subq $ORIG_RAX-RBP, %rsp
> - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
> - cld
> - /* start from rbp in pt_regs and jump over */
> - movq_cfi rdi, (RDI-RBP)
> - movq_cfi rsi, (RSI-RBP)
> - movq_cfi rdx, (RDX-RBP)
> - movq_cfi rcx, (RCX-RBP)
> - movq_cfi rax, (RAX-RBP)
> - movq_cfi r8, (R8-RBP)
> - movq_cfi r9, (R9-RBP)
> - movq_cfi r10, (R10-RBP)
> - movq_cfi r11, (R11-RBP)
> -
> - /* Save rbp so that we can unwind from get_irq_regs() */
> - movq_cfi rbp, 0
Hmm SAVEE_C_REGS below doesn't seem to save rbp like we did before.
Perhaps it's implicitely saved somewhere?
> -
> - /* Save previous stack value */
> - movq %rsp, %rsi
Also rsp isn't saved in %rsi like before. Maybe
that's because we already save it in rdi? Makes sense since
now arg1 == rsp. More on that later.
> -
> - leaq -RBP(%rsp),%rdi /* arg1 for handler */
> - testl $3, CS-RBP(%rsi)
> + ALLOC_PTREGS_ON_STACK
> + SAVE_C_REGS
> + movq %rsp, %rdi /* arg1 for handler */
> + testl $3, CS(%rsp)
> je 1f
> SWAPGS
> - /*
> +1: /*
> * irq_count is used to check if a CPU is already on an interrupt stack
> * or not. While this is essentially redundant with preempt_count it is
> * a little cheaper to use a separate counter in the PDA (short of
> * moving irq_enter into assembly, which would be too much work)
> */
> -1: incl PER_CPU_VAR(irq_count)
> + incl PER_CPU_VAR(irq_count)
> cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
> - CFI_DEF_CFA_REGISTER rsi
> + CFI_DEF_CFA_REGISTER rdi
>
> /* Store previous stack value */
> - pushq %rsi
> + pushq %rdi
So you push rdi instead...
> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
> 0x77 /* DW_OP_breg7 */, 0, \
> 0x06 /* DW_OP_deref */, \
> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
> + 0x08 /* DW_OP_const1u */, SS+8, \
> 0x22 /* DW_OP_plus */
> /* We entered an interrupt context - irqs are off: */
> TRACE_IRQS_OFF
> -
> call \func
> .endm
>
> @@ -749,10 +719,9 @@ ret_from_intr:
>
> /* Restore saved previous stack */
> popq %rsi
And then you pop to rsi. Ok that indeed works but perhaps we should keep it symetrical
just for clarity? Any reason why we can't reuse rdi here?
> - CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
> - leaq ARGOFFSET-RBP(%rsi), %rsp
> + CFI_DEF_CFA rsi,SS+8 /* reg/off reset after def_cfa_expr */
> + movq %rsi, %rsp
> CFI_DEF_CFA_REGISTER rsp
> - CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
Thanks.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 22:13 ` H. Peter Anvin
@ 2014-08-02 21:14 ` Andy Lutomirski
2014-08-02 21:23 ` H. Peter Anvin
0 siblings, 1 reply; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-02 21:14 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Denys Vlasenko, Denys Vlasenko, Linux Kernel Mailing List,
Oleg Nesterov, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
On Fri, Aug 1, 2014 at 3:13 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>>
>>> Could you please try to see if there is a measurable change in the
>>> latency of a trivial syscall?
>>
>> Will do.
>> Something along the lines of "how long does it take to execute two
>> gazillions of getppid()?"
>>
>
> Something like that, yes, but you have to run enough data points so you
> can determine if the difference is statistically significant or just noise.
Denys, if you want to avoid five minutes of programming, you can build this:
https://gitorious.org/linux-test-utils/linux-clock-tests/
and run timing_test_64 10 getpid
It doesn't do real statistics, but it seems to get quite stable results.
--Andy
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-02 21:14 ` Andy Lutomirski
@ 2014-08-02 21:23 ` H. Peter Anvin
2014-08-02 21:38 ` Andy Lutomirski
0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-02 21:23 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Denys Vlasenko, Denys Vlasenko, Linux Kernel Mailing List,
Oleg Nesterov, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
It would be nice to automate running a T-test on it.
On August 2, 2014 2:14:50 PM PDT, Andy Lutomirski <luto@amacapital.net> wrote:
>On Fri, Aug 1, 2014 at 3:13 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>>>
>>>> Could you please try to see if there is a measurable change in the
>>>> latency of a trivial syscall?
>>>
>>> Will do.
>>> Something along the lines of "how long does it take to execute two
>>> gazillions of getppid()?"
>>>
>>
>> Something like that, yes, but you have to run enough data points so
>you
>> can determine if the difference is statistically significant or just
>noise.
>
>Denys, if you want to avoid five minutes of programming, you can build
>this:
>
>https://gitorious.org/linux-test-utils/linux-clock-tests/
>
>and run timing_test_64 10 getpid
>
>It doesn't do real statistics, but it seems to get quite stable
>results.
>
>--Andy
--
Sent from my mobile phone. Please pardon brevity and lack of formatting.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-02 21:23 ` H. Peter Anvin
@ 2014-08-02 21:38 ` Andy Lutomirski
0 siblings, 0 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-02 21:38 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Denys Vlasenko, Denys Vlasenko, Linux Kernel Mailing List,
Oleg Nesterov, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
On Sat, Aug 2, 2014 at 2:23 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> It would be nice to automate running a T-test on it.
I'll see if I can do something like that. Using the t statistic seems
like overkill here, though -- timing_test_64 runs millions of
iterations, so even if I batch them, I'll end up with n ~ 1000 (or
even larger), at which point plain old Gaussians should be fine.
--Andy
>
> On August 2, 2014 2:14:50 PM PDT, Andy Lutomirski <luto@amacapital.net> wrote:
>>On Fri, Aug 1, 2014 at 3:13 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> On 08/01/2014 03:11 PM, Denys Vlasenko wrote:
>>>>>
>>>>> Could you please try to see if there is a measurable change in the
>>>>> latency of a trivial syscall?
>>>>
>>>> Will do.
>>>> Something along the lines of "how long does it take to execute two
>>>> gazillions of getppid()?"
>>>>
>>>
>>> Something like that, yes, but you have to run enough data points so
>>you
>>> can determine if the difference is statistically significant or just
>>noise.
>>
>>Denys, if you want to avoid five minutes of programming, you can build
>>this:
>>
>>https://gitorious.org/linux-test-utils/linux-clock-tests/
>>
>>and run timing_test_64 10 getpid
>>
>>It doesn't do real statistics, but it seems to get quite stable
>>results.
>>
>>--Andy
>
> --
> Sent from my mobile phone. Please pardon brevity and lack of formatting.
--
Andy Lutomirski
AMA Capital Management, LLC
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 23:19 ` Frederic Weisbecker
@ 2014-08-04 3:03 ` Denys Vlasenko
2014-08-04 7:57 ` Borislav Petkov
2014-08-11 0:46 ` Frederic Weisbecker
0 siblings, 2 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-04 3:03 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
H. Peter Anvin, Andy Lutomirski, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> On Fri, Aug 01, 2014 at 04:48:17PM +0200, Denys Vlasenko wrote:
>>
>> /* 0(%rsp): ~(interrupt number) */
>> .macro interrupt func
>> - /* reserve pt_regs for scratch regs and rbp */
>> - subq $ORIG_RAX-RBP, %rsp
>> - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
>> - cld
>> - /* start from rbp in pt_regs and jump over */
>> - movq_cfi rdi, (RDI-RBP)
>> - movq_cfi rsi, (RSI-RBP)
>> - movq_cfi rdx, (RDX-RBP)
>> - movq_cfi rcx, (RCX-RBP)
>> - movq_cfi rax, (RAX-RBP)
>> - movq_cfi r8, (R8-RBP)
>> - movq_cfi r9, (R9-RBP)
>> - movq_cfi r10, (R10-RBP)
>> - movq_cfi r11, (R11-RBP)
>> -
>> - /* Save rbp so that we can unwind from get_irq_regs() */
>> - movq_cfi rbp, 0
>
> Hmm SAVEE_C_REGS below doesn't seem to save rbp like we did before.
> Perhaps it's implicitely saved somewhere?
>
>> -
>> - /* Save previous stack value */
>> - movq %rsp, %rsi
>
> Also rsp isn't saved in %rsi like before. Maybe
> that's because we already save it in rdi? Makes sense since
> now arg1 == rsp. More on that later.
>
>> -
>> - leaq -RBP(%rsp),%rdi /* arg1 for handler */
>> - testl $3, CS-RBP(%rsi)
>> + ALLOC_PTREGS_ON_STACK
>> + SAVE_C_REGS
>> + movq %rsp, %rdi /* arg1 for handler */
>> + testl $3, CS(%rsp)
>> je 1f
>> SWAPGS
>> - /*
>> +1: /*
>> * irq_count is used to check if a CPU is already on an interrupt stack
>> * or not. While this is essentially redundant with preempt_count it is
>> * a little cheaper to use a separate counter in the PDA (short of
>> * moving irq_enter into assembly, which would be too much work)
>> */
>> -1: incl PER_CPU_VAR(irq_count)
>> + incl PER_CPU_VAR(irq_count)
>> cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
>> - CFI_DEF_CFA_REGISTER rsi
>> + CFI_DEF_CFA_REGISTER rdi
>>
>> /* Store previous stack value */
>> - pushq %rsi
>> + pushq %rdi
>
> So you push rdi instead...
>
>> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
>> 0x77 /* DW_OP_breg7 */, 0, \
>> 0x06 /* DW_OP_deref */, \
>> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
>> + 0x08 /* DW_OP_const1u */, SS+8, \
>> 0x22 /* DW_OP_plus */
>> /* We entered an interrupt context - irqs are off: */
>> TRACE_IRQS_OFF
>> -
>> call \func
>> .endm
>>
>> @@ -749,10 +719,9 @@ ret_from_intr:
>>
>> /* Restore saved previous stack */
>> popq %rsi
>
> And then you pop to rsi. Ok that indeed works but perhaps we should keep it symetrical
> just for clarity? Any reason why we can't reuse rdi here?
I changed this entire area in v2: basically, I will not change the logic,
but will add comments explaining what are we doing here, and why.
(Some minor code changes will be done, not affecting the logic).
While we are at it, what this CFI_ESCAPE thing does here?
As usual, it has no comment :/
--
vda
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-04 3:03 ` Denys Vlasenko
@ 2014-08-04 7:57 ` Borislav Petkov
2014-08-11 0:46 ` Frederic Weisbecker
1 sibling, 0 replies; 41+ messages in thread
From: Borislav Petkov @ 2014-08-04 7:57 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Frederic Weisbecker, Denys Vlasenko, Linux Kernel Mailing List,
Oleg Nesterov, H. Peter Anvin, Andy Lutomirski, X86 ML,
Alexei Starovoitov, Will Drewry, Kees Cook
On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
> I changed this entire area in v2: basically, I will not change the
> logic, but will add comments explaining what are we doing here, and
> why.
Very good idea! This file needs some good commenting.
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-01 17:04 ` Andy Lutomirski
@ 2014-08-04 14:28 ` Denys Vlasenko
2014-08-04 14:47 ` Oleg Nesterov
2014-08-04 21:03 ` Andy Lutomirski
0 siblings, 2 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-04 14:28 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Denys Vlasenko, linux-kernel, Oleg Nesterov, H. Peter Anvin,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
On Fri, Aug 1, 2014 at 7:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
>> 64-bit code was using six stack slots fewer by not saving/restoring
>> registers which a callee-preserved according to C ABI,
>> and not allocating space for them
>
> This is great.
>
> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :) Maybe I'll give that a shot.
I'm yet at the stage "what that stuff does anyway?" and at
"why do we need percpu old_rsp thingy?" in particular.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-04 14:28 ` Denys Vlasenko
@ 2014-08-04 14:47 ` Oleg Nesterov
2014-08-04 15:34 ` Oleg Nesterov
2014-08-04 21:03 ` Andy Lutomirski
1 sibling, 1 reply; 41+ messages in thread
From: Oleg Nesterov @ 2014-08-04 14:47 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Andy Lutomirski, Denys Vlasenko, linux-kernel, H. Peter Anvin,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
On 08/04, Denys Vlasenko wrote:
>
> "why do we need percpu old_rsp thingy?" in particular.
See thread_struct->usersp, current_user_stack_pointer().
Oleg.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-04 14:47 ` Oleg Nesterov
@ 2014-08-04 15:34 ` Oleg Nesterov
0 siblings, 0 replies; 41+ messages in thread
From: Oleg Nesterov @ 2014-08-04 15:34 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Andy Lutomirski, Denys Vlasenko, linux-kernel, H. Peter Anvin,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
On 08/04, Oleg Nesterov wrote:
>
> On 08/04, Denys Vlasenko wrote:
> >
> > "why do we need percpu old_rsp thingy?" in particular.
>
> See thread_struct->usersp, current_user_stack_pointer().
Btw, perhaps it makes sense to document that task_pt_regs(current)->sp
is not current_user_stack_pointer() inside system_call_fastpath.
Oleg.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-04 14:28 ` Denys Vlasenko
2014-08-04 14:47 ` Oleg Nesterov
@ 2014-08-04 21:03 ` Andy Lutomirski
2014-08-04 21:23 ` Borislav Petkov
2014-08-05 10:35 ` Denys Vlasenko
1 sibling, 2 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-04 21:03 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Denys Vlasenko, linux-kernel, Oleg Nesterov, H. Peter Anvin,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
On Mon, Aug 4, 2014 at 11:28 PM, Denys Vlasenko
<vda.linux@googlemail.com> wrote:
> On Fri, Aug 1, 2014 at 7:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Aug 1, 2014 at 7:48 AM, Denys Vlasenko <dvlasenk@redhat.com> wrote:
>>> 64-bit code was using six stack slots fewer by not saving/restoring
>>> registers which a callee-preserved according to C ABI,
>>> and not allocating space for them
>>
>> This is great.
>>
>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :) Maybe I'll give that a shot.
>
> I'm yet at the stage "what that stuff does anyway?" and at
> "why do we need percpu old_rsp thingy?" in particular.
On x86_64, the syscall instruction has no effect on rsp. That means
that the entry point starts out with no stack. There are no free
registers whatsoever at the entry point.
That means that the entry code needs to do swapgs, stash rsp somewhere
relative to gs, and then load the kernel's rsp. old_rsp is the spot
used for this.
Now the kernel does an optimization that is, I think, very much not
worth it. The kernel doesn't bother sticking the old rsp value into
pt_regs (saving two instructions on fast path entries) and doesn't
initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
more instructions.
To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
dance is needed, and there's the usersp crap in the context switch
code, and current_user_stack_pointer(), and probably even more crap
that I haven't noticed. And I sure hope that nothing in the *compat*
syscall path touches current_user_stack_pointer(), because the compat
code doesn't seem to use old_rsp.
I think this should all be ripped out. The only real difficulty will
be that the sysret code needs to restore rsp itself, so the sysret
path will end up needing two more instructions. Removing all of the
TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
and I wouldn't be surprised if this adds considerably fewer than ten
cycles on any modern chip.
(It's too bad that there's no unlocked xchg; this could be faster if
we had one. It's also too bad that the syscall ABI didn't choose some
register to unconditionally set to zero, which would have given us the
single scratch register we'd need to avoid this whole mess in the
first place.)
--Andy
--
Andy Lutomirski
AMA Capital Management, LLC
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-04 21:03 ` Andy Lutomirski
@ 2014-08-04 21:23 ` Borislav Petkov
2014-08-05 10:35 ` Denys Vlasenko
1 sibling, 0 replies; 41+ messages in thread
From: Borislav Petkov @ 2014-08-04 21:23 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Denys Vlasenko, Denys Vlasenko, linux-kernel, Oleg Nesterov,
H. Peter Anvin, Frederic Weisbecker, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
On Tue, Aug 05, 2014 at 06:03:24AM +0900, Andy Lutomirski wrote:
> (It's too bad that there's no unlocked xchg; this could be faster if
> we had one.
There is - just put both operands in registers. :-)
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-04 21:03 ` Andy Lutomirski
2014-08-04 21:23 ` Borislav Petkov
@ 2014-08-05 10:35 ` Denys Vlasenko
2014-08-05 14:53 ` Andy Lutomirski
1 sibling, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-05 10:35 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Denys Vlasenko, linux-kernel, Oleg Nesterov, H. Peter Anvin,
Frederic Weisbecker, X86 ML, Alexei Starovoitov, Will Drewry,
Kees Cook
On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :) Maybe I'll give that a shot.
>>
>> I'm yet at the stage "what that stuff does anyway?" and at
>> "why do we need percpu old_rsp thingy?" in particular.
>
> On x86_64, the syscall instruction has no effect on rsp. That means
> that the entry point starts out with no stack. There are no free
> registers whatsoever at the entry point.
>
> That means that the entry code needs to do swapgs, stash rsp somewhere
> relative to gs, and then load the kernel's rsp. old_rsp is the spot
> used for this.
>
> Now the kernel does an optimization that is, I think, very much not
> worth it. The kernel doesn't bother sticking the old rsp value into
> pt_regs (saving two instructions on fast path entries) and doesn't
> initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
> more instructions.
>
> To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
> dance is needed, and there's the usersp crap in the context switch
> code, and current_user_stack_pointer(), and probably even more crap
> that I haven't noticed. And I sure hope that nothing in the *compat*
> syscall path touches current_user_stack_pointer(), because the compat
> code doesn't seem to use old_rsp.
>
> I think this should all be ripped out. The only real difficulty will
> be that the sysret code needs to restore rsp itself, so the sysret
> path will end up needing two more instructions. Removing all of the
> TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
> and I wouldn't be surprised if this adds considerably fewer than ten
> cycles on any modern chip.
Something like this on the fast path? -
SWAPGS_UNSAFE_STACK
movq %rsp,PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack),%rsp
ENABLE_INTERRUPTS(CLBR_NONE)
ALLOC_PTREGS_ON_STACK 8 /* +8: space for orig_ax */
SAVE_C_REGS
movq %rax,ORIG_RAX(%rsp)
movq %rcx,RIP(%rsp)
+ movq %r11,EFLAGS(%rsp)
+ movq PER_CPU_VAR(old_rsp),%rcx
+ movq %rcx,RSP(%rsp)
...
- RESTORE_C_REGS_EXCEPT_RCX
+ RESTORE_C_REGS_EXCEPT_RCX_R11
movq RIP(%rsp),%rcx
+ movq EFLAGS(%rsp), %r11
- movq PER_CPU_VAR(old_rsp), %rsp
+ movq RSP(%rsp), %rsp
USERGS_SYSRET64
Looks like only 3 additional insns (unfortunately, one is memory read).
Do we need to save rsc and r11 in "struct pt_regs" in their
"standard" slots, though? If we don't, we can drop two insns
(SAVE_C_REGS -> SAVE_C_REGS_EXCEPT_RCX_R11).
Then old_rsp can be nuked everywhere else,
RESTORE_TOP_OF_STACK can be nuked, and
FIXUP_TOP_OF_STACK can be reduced to merely:
movq $__USER_DS,SS(%rsp)
movq $__USER_CS,CS(%rsp)
(BTW, why currently it does "movq $-1,RCX+\offset(%rsp)?)
--
vda
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-05 10:35 ` Denys Vlasenko
@ 2014-08-05 14:53 ` Andy Lutomirski
2014-08-05 15:17 ` Denys Vlasenko
2014-08-07 9:54 ` Denys Vlasenko
0 siblings, 2 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-05 14:53 UTC (permalink / raw)
To: Denys Vlasenko
Cc: X86 ML, Frederic Weisbecker, Denys Vlasenko, Oleg Nesterov,
linux-kernel, H. Peter Anvin, Alexei Starovoitov, Kees Cook,
Will Drewry
On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
>
> On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :) Maybe I'll give that a shot.
> >>
> >> I'm yet at the stage "what that stuff does anyway?" and at
> >> "why do we need percpu old_rsp thingy?" in particular.
> >
> > On x86_64, the syscall instruction has no effect on rsp. That means
> > that the entry point starts out with no stack. There are no free
> > registers whatsoever at the entry point.
> >
> > That means that the entry code needs to do swapgs, stash rsp somewhere
> > relative to gs, and then load the kernel's rsp. old_rsp is the spot
> > used for this.
> >
> > Now the kernel does an optimization that is, I think, very much not
> > worth it. The kernel doesn't bother sticking the old rsp value into
> > pt_regs (saving two instructions on fast path entries) and doesn't
> > initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
> > more instructions.
> >
> > To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
> > dance is needed, and there's the usersp crap in the context switch
> > code, and current_user_stack_pointer(), and probably even more crap
> > that I haven't noticed. And I sure hope that nothing in the *compat*
> > syscall path touches current_user_stack_pointer(), because the compat
> > code doesn't seem to use old_rsp.
> >
> > I think this should all be ripped out. The only real difficulty will
> > be that the sysret code needs to restore rsp itself, so the sysret
> > path will end up needing two more instructions. Removing all of the
> > TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
> > and I wouldn't be surprised if this adds considerably fewer than ten
> > cycles on any modern chip.
>
> Something like this on the fast path? -
>
> SWAPGS_UNSAFE_STACK
> movq %rsp,PER_CPU_VAR(old_rsp)
> movq PER_CPU_VAR(kernel_stack),%rsp
> ENABLE_INTERRUPTS(CLBR_NONE)
> ALLOC_PTREGS_ON_STACK 8 /* +8: space for orig_ax */
> SAVE_C_REGS
> movq %rax,ORIG_RAX(%rsp)
> movq %rcx,RIP(%rsp)
> + movq %r11,EFLAGS(%rsp)
> + movq PER_CPU_VAR(old_rsp),%rcx
> + movq %rcx,RSP(%rsp)
> ...
> - RESTORE_C_REGS_EXCEPT_RCX
> + RESTORE_C_REGS_EXCEPT_RCX_R11
> movq RIP(%rsp),%rcx
> + movq EFLAGS(%rsp), %r11
> - movq PER_CPU_VAR(old_rsp), %rsp
> + movq RSP(%rsp), %rsp
> USERGS_SYSRET64
The sysret code still needs the inverse, right? ptrace can change
RSP. And, if that happens, then all the context switch code can go,
as can the usersp thread info slot.
>
> Looks like only 3 additional insns (unfortunately, one is memory read).
The store forwarding buffer should handle that one, I think.
> Do we need to save rsc and r11 in "struct pt_regs" in their
> "standard" slots, though?
ptrace probably wants it.
> If we don't, we can drop two insns
> (SAVE_C_REGS -> SAVE_C_REGS_EXCEPT_RCX_R11).
>
> Then old_rsp can be nuked everywhere else,
> RESTORE_TOP_OF_STACK can be nuked, and
> FIXUP_TOP_OF_STACK can be reduced to merely:
>
> movq $__USER_DS,SS(%rsp)
> movq $__USER_CS,CS(%rsp)
Mmm, right. That's probably better than doing this on the fast path.
>
> (BTW, why currently it does "movq $-1,RCX+\offset(%rsp)?)
I would argue this is a bug. (In fact, I have a patch floating around
to fix it. The current code is glitchy in a visible-to-user-space
way.) We should put rcx into both RIP and RCX, since the sysret path
will implicitly do that, and we should restore the same register
values in the iret and sysret paths.
--Andy
>
> --
> vda
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-05 14:53 ` Andy Lutomirski
@ 2014-08-05 15:17 ` Denys Vlasenko
2014-08-05 23:02 ` Andy Lutomirski
2014-08-07 9:54 ` Denys Vlasenko
1 sibling, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-05 15:17 UTC (permalink / raw)
To: Andy Lutomirski, Denys Vlasenko
Cc: X86 ML, Frederic Weisbecker, Oleg Nesterov, linux-kernel,
H. Peter Anvin, Alexei Starovoitov, Kees Cook, Will Drewry
On 08/05/2014 04:53 PM, Andy Lutomirski wrote:
> On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
>>
>> On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :) Maybe I'll give that a shot.
>>>>
>>>> I'm yet at the stage "what that stuff does anyway?" and at
>>>> "why do we need percpu old_rsp thingy?" in particular.
>>>
>>> On x86_64, the syscall instruction has no effect on rsp. That means
>>> that the entry point starts out with no stack. There are no free
>>> registers whatsoever at the entry point.
>>>
>>> That means that the entry code needs to do swapgs, stash rsp somewhere
>>> relative to gs, and then load the kernel's rsp. old_rsp is the spot
>>> used for this.
>>>
>>> Now the kernel does an optimization that is, I think, very much not
>>> worth it. The kernel doesn't bother sticking the old rsp value into
>>> pt_regs (saving two instructions on fast path entries) and doesn't
>>> initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
>>> more instructions.
>>>
>>> To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
>>> dance is needed, and there's the usersp crap in the context switch
>>> code, and current_user_stack_pointer(), and probably even more crap
>>> that I haven't noticed. And I sure hope that nothing in the *compat*
>>> syscall path touches current_user_stack_pointer(), because the compat
>>> code doesn't seem to use old_rsp.
>>>
>>> I think this should all be ripped out. The only real difficulty will
>>> be that the sysret code needs to restore rsp itself, so the sysret
>>> path will end up needing two more instructions. Removing all of the
>>> TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
>>> and I wouldn't be surprised if this adds considerably fewer than ten
>>> cycles on any modern chip.
>>
>> Something like this on the fast path? -
>>
>> SWAPGS_UNSAFE_STACK
>> movq %rsp,PER_CPU_VAR(old_rsp)
>> movq PER_CPU_VAR(kernel_stack),%rsp
>> ENABLE_INTERRUPTS(CLBR_NONE)
>> ALLOC_PTREGS_ON_STACK 8 /* +8: space for orig_ax */
>> SAVE_C_REGS
>> movq %rax,ORIG_RAX(%rsp)
>> movq %rcx,RIP(%rsp)
>> + movq %r11,EFLAGS(%rsp)
>> + movq PER_CPU_VAR(old_rsp),%rcx
>> + movq %rcx,RSP(%rsp)
>> ...
>> - RESTORE_C_REGS_EXCEPT_RCX
>> + RESTORE_C_REGS_EXCEPT_RCX_R11
>> movq RIP(%rsp),%rcx
>> + movq EFLAGS(%rsp), %r11
>> - movq PER_CPU_VAR(old_rsp), %rsp
>> + movq RSP(%rsp), %rsp
>> USERGS_SYSRET64
>
> The sysret code still needs the inverse, right?
The part after "..." in my skecth is the sysret code.
> ptrace can change RSP.
By writing to pt_regs->rsp, yes. And the above code
will pick it up - we read RSP(%rsp).
>> Do we need to save rcx and r11 in "struct pt_regs" in their
>> "standard" slots, though?
>
> ptrace probably wants it.
Let's see.
They don't contain any useful information:
With current code,
pt_regs->r11 is the same as pt_regs->rflags,
pt_regs->rcx is the same as pt_regs->rip (modulo weird store of -1).
So reading them by ptrace is... weird - just read
pt_regs->rflags or pt_regs->rip instead!
If ptrace is active, we'll return via iretq.
If ptrace wrote to these pt_regs members, on return
to userspace current code restores modified values.
My proposed change does not change this.
So, only ptrace reads of rcx and r11 will be affected.
Hmm. Maybe we can fill them only on "tracesys:" codepath?
>> Then old_rsp can be nuked everywhere else,
>> RESTORE_TOP_OF_STACK can be nuked, and
>> FIXUP_TOP_OF_STACK can be reduced to merely:
>>
>> movq $__USER_DS,SS(%rsp)
>> movq $__USER_CS,CS(%rsp)
>
> Mmm, right. That's probably better than doing this on the fast path.
>
>>
>> (BTW, why currently it does "movq $-1,RCX+\offset(%rsp)?)
>
> I would argue this is a bug.
Agree.
> (In fact, I have a patch floating around
> to fix it. The current code is glitchy in a visible-to-user-space
> way.) We should put rcx into both RIP and RCX, since the sysret path
> will implicitly do that, and we should restore the same register
> values in the iret and sysret paths.
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-05 15:17 ` Denys Vlasenko
@ 2014-08-05 23:02 ` Andy Lutomirski
0 siblings, 0 replies; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-05 23:02 UTC (permalink / raw)
To: Denys Vlasenko
Cc: X86 ML, Frederic Weisbecker, Oleg Nesterov, linux-kernel,
H. Peter Anvin, Denys Vlasenko, Alexei Starovoitov, Will Drewry,
Kees Cook
On Aug 6, 2014 12:17 AM, "Denys Vlasenko" <dvlasenk@redhat.com> wrote:
>
> On 08/05/2014 04:53 PM, Andy Lutomirski wrote:
> > On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
> >>
> >> On Mon, Aug 4, 2014 at 11:03 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >>>>> Next up: remove FIXUP/RESTORE_TOP_OF_STACK? :) Maybe I'll give that a shot.
> >>>>
> >>>> I'm yet at the stage "what that stuff does anyway?" and at
> >>>> "why do we need percpu old_rsp thingy?" in particular.
> >>>
> >>> On x86_64, the syscall instruction has no effect on rsp. That means
> >>> that the entry point starts out with no stack. There are no free
> >>> registers whatsoever at the entry point.
> >>>
> >>> That means that the entry code needs to do swapgs, stash rsp somewhere
> >>> relative to gs, and then load the kernel's rsp. old_rsp is the spot
> >>> used for this.
> >>>
> >>> Now the kernel does an optimization that is, I think, very much not
> >>> worth it. The kernel doesn't bother sticking the old rsp value into
> >>> pt_regs (saving two instructions on fast path entries) and doesn't
> >>> initialize the SS, CS, RCX, and EFLAGS fields in pt_regs, saving four
> >>> more instructions.
> >>>
> >>> To make this optimization work, the whole FIXUP/RESTORE_TOP_OF_STACK
> >>> dance is needed, and there's the usersp crap in the context switch
> >>> code, and current_user_stack_pointer(), and probably even more crap
> >>> that I haven't noticed. And I sure hope that nothing in the *compat*
> >>> syscall path touches current_user_stack_pointer(), because the compat
> >>> code doesn't seem to use old_rsp.
> >>>
> >>> I think this should all be ripped out. The only real difficulty will
> >>> be that the sysret code needs to restore rsp itself, so the sysret
> >>> path will end up needing two more instructions. Removing all of the
> >>> TOP_OF_STACK stuff will add ten instructions to fast path syscalls,
> >>> and I wouldn't be surprised if this adds considerably fewer than ten
> >>> cycles on any modern chip.
> >>
> >> Something like this on the fast path? -
> >>
> >> SWAPGS_UNSAFE_STACK
> >> movq %rsp,PER_CPU_VAR(old_rsp)
> >> movq PER_CPU_VAR(kernel_stack),%rsp
> >> ENABLE_INTERRUPTS(CLBR_NONE)
> >> ALLOC_PTREGS_ON_STACK 8 /* +8: space for orig_ax */
> >> SAVE_C_REGS
> >> movq %rax,ORIG_RAX(%rsp)
> >> movq %rcx,RIP(%rsp)
> >> + movq %r11,EFLAGS(%rsp)
> >> + movq PER_CPU_VAR(old_rsp),%rcx
> >> + movq %rcx,RSP(%rsp)
> >> ...
> >> - RESTORE_C_REGS_EXCEPT_RCX
> >> + RESTORE_C_REGS_EXCEPT_RCX_R11
> >> movq RIP(%rsp),%rcx
> >> + movq EFLAGS(%rsp), %r11
> >> - movq PER_CPU_VAR(old_rsp), %rsp
> >> + movq RSP(%rsp), %rsp
> >> USERGS_SYSRET64
> >
> > The sysret code still needs the inverse, right?
>
> The part after "..." in my skecth is the sysret code.
Right :)
>
> > ptrace can change RSP.
>
> By writing to pt_regs->rsp, yes. And the above code
> will pick it up - we read RSP(%rsp).
Also correct -- nice :)
>
> >> Do we need to save rcx and r11 in "struct pt_regs" in their
> >> "standard" slots, though?
> >
> > ptrace probably wants it.
>
> Let's see.
> They don't contain any useful information:
> With current code,
> pt_regs->r11 is the same as pt_regs->rflags,
> pt_regs->rcx is the same as pt_regs->rip (modulo weird store of -1).
> So reading them by ptrace is... weird - just read
> pt_regs->rflags or pt_regs->rip instead!
I'm unconvinced. This is too complicated.
>
> If ptrace is active, we'll return via iretq.
> If ptrace wrote to these pt_regs members, on return
> to userspace current code restores modified values.
> My proposed change does not change this.
>
> So, only ptrace reads of rcx and r11 will be affected.
> Hmm. Maybe we can fill them only on "tracesys:" codepath?
It's not just tracesys -- there's FORK_LIKE, the various pt_regs
stubs, and exit work, too. Setting these values up early is faster,
anyway -- no loads are needed, only stores, and the stores will
presumably be combined with the rest of the frame setup, so no
additional memory bandwidth should be needed.
--Andy
>
> >> Then old_rsp can be nuked everywhere else,
> >> RESTORE_TOP_OF_STACK can be nuked, and
> >> FIXUP_TOP_OF_STACK can be reduced to merely:
> >>
> >> movq $__USER_DS,SS(%rsp)
> >> movq $__USER_CS,CS(%rsp)
> >
> > Mmm, right. That's probably better than doing this on the fast path.
> >
> >>
> >> (BTW, why currently it does "movq $-1,RCX+\offset(%rsp)?)
> >
> > I would argue this is a bug.
>
> Agree.
>
> > (In fact, I have a patch floating around
> > to fix it. The current code is glitchy in a visible-to-user-space
> > way.) We should put rcx into both RIP and RCX, since the sysret path
> > will implicitly do that, and we should restore the same register
> > values in the iret and sysret paths.
>
--Andy
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-05 14:53 ` Andy Lutomirski
2014-08-05 15:17 ` Denys Vlasenko
@ 2014-08-07 9:54 ` Denys Vlasenko
1 sibling, 0 replies; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-07 9:54 UTC (permalink / raw)
To: Andy Lutomirski
Cc: X86 ML, Frederic Weisbecker, Denys Vlasenko, Oleg Nesterov,
linux-kernel, H. Peter Anvin, Alexei Starovoitov, Kees Cook,
Will Drewry
On Tue, Aug 5, 2014 at 4:53 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Aug 5, 2014 7:36 PM, "Denys Vlasenko" <vda.linux@googlemail.com> wrote:
>> Then old_rsp can be nuked everywhere else,
>> RESTORE_TOP_OF_STACK can be nuked, and
>> FIXUP_TOP_OF_STACK can be reduced to merely:
>>
>> movq $__USER_DS,SS(%rsp)
>> movq $__USER_CS,CS(%rsp)
>
> Mmm, right. That's probably better than doing this on the fast path.
I measured it. These particular insns have a nasty huge encoding in x86 -
12 bytes each, no less!
48 c7 84 24 88 00 00 00 33 00 00 00 movq $0x33,0x88(%rsp)
48 c7 84 24 a0 00 00 00 2b 00 00 00 movq $0x2b,0xa0(%rsp)
With them in hot path, I measure 55.6 ns per syscall,
with them moved out of hot path it's 54.8 ns.
This is 2.70GHz CPU, so it must be 2 CPU cycles.
--
vda
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-04 3:03 ` Denys Vlasenko
2014-08-04 7:57 ` Borislav Petkov
@ 2014-08-11 0:46 ` Frederic Weisbecker
2014-08-11 8:40 ` Jan Beulich
1 sibling, 1 reply; 41+ messages in thread
From: Frederic Weisbecker @ 2014-08-11 0:46 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Denys Vlasenko, Linux Kernel Mailing List, Oleg Nesterov,
H. Peter Anvin, Andy Lutomirski, X86 ML, Alexei Starovoitov,
Will Drewry, Kees Cook
On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
> On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> >> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
> >> 0x77 /* DW_OP_breg7 */, 0, \
> >> 0x06 /* DW_OP_deref */, \
> >> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
> >> + 0x08 /* DW_OP_const1u */, SS+8, \
> >> 0x22 /* DW_OP_plus */
> >> /* We entered an interrupt context - irqs are off: */
> >> TRACE_IRQS_OFF
> >> -
> >> call \func
> >> .endm
> >>
> >> @@ -749,10 +719,9 @@ ret_from_intr:
> >>
> >> /* Restore saved previous stack */
> >> popq %rsi
> >
> > And then you pop to rsi. Ok that indeed works but perhaps we should keep it symetrical
> > just for clarity? Any reason why we can't reuse rdi here?
>
> I changed this entire area in v2: basically, I will not change the logic,
> but will add comments explaining what are we doing here, and why.
> (Some minor code changes will be done, not affecting the logic).
>
> While we are at it, what this CFI_ESCAPE thing does here?
> As usual, it has no comment :/
I don't know, only Jan Beulich understands those CFI black magic.
BTW he doesn't appears to be Cc, we should add him.
>
> --
> vda
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 0:46 ` Frederic Weisbecker
@ 2014-08-11 8:40 ` Jan Beulich
2014-08-11 9:07 ` Andy Lutomirski
2014-08-11 13:26 ` Denys Vlasenko
0 siblings, 2 replies; 41+ messages in thread
From: Jan Beulich @ 2014-08-11 8:40 UTC (permalink / raw)
To: Frederic Weisbecker, Denys Vlasenko
Cc: Andy Lutomirski, Kees Cook, Will Drewry, X86 ML,
Alexei Starovoitov, Denys Vlasenko, Oleg Nesterov,
Linux Kernel Mailing List, H. Peter Anvin
>>> On 11.08.14 at 02:46, <fweisbec@gmail.com> wrote:
> On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
>> On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com>
> wrote:
>> >> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
>> >> 0x77 /* DW_OP_breg7 */, 0, \
>> >> 0x06 /* DW_OP_deref */, \
>> >> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
>> >> + 0x08 /* DW_OP_const1u */, SS+8, \
>> >> 0x22 /* DW_OP_plus */
>> >> /* We entered an interrupt context - irqs are off: */
>> >> TRACE_IRQS_OFF
>> >> -
>> >> call \func
>> >> .endm
>> >>
>> >> @@ -749,10 +719,9 @@ ret_from_intr:
>> >>
>> >> /* Restore saved previous stack */
>> >> popq %rsi
>> >
>> > And then you pop to rsi. Ok that indeed works but perhaps we should keep it symetrical
>> > just for clarity? Any reason why we can't reuse rdi here?
>>
>> I changed this entire area in v2: basically, I will not change the logic,
>> but will add comments explaining what are we doing here, and why.
>> (Some minor code changes will be done, not affecting the logic).
>>
>> While we are at it, what this CFI_ESCAPE thing does here?
>> As usual, it has no comment :/
Each of its lines has a comment; with other CFI annotations not
each having comments, I don't see what else is needed here.
> I don't know, only Jan Beulich understands those CFI black magic.
That would be very said if true.
In any case: This needs to be a CFI_ESCAPE because there's no
other way I know of to emit the DW_CFA_def_cfa_expression.
And the change to it looks correct to me.
Jan
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 8:40 ` Jan Beulich
@ 2014-08-11 9:07 ` Andy Lutomirski
2014-08-11 9:31 ` Jan Beulich
2014-08-11 13:26 ` Denys Vlasenko
1 sibling, 1 reply; 41+ messages in thread
From: Andy Lutomirski @ 2014-08-11 9:07 UTC (permalink / raw)
To: Jan Beulich
Cc: Frederic Weisbecker, Denys Vlasenko, Kees Cook, Will Drewry,
X86 ML, Alexei Starovoitov, Denys Vlasenko, Oleg Nesterov,
Linux Kernel Mailing List, H. Peter Anvin
On Mon, Aug 11, 2014 at 5:40 PM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 11.08.14 at 02:46, <fweisbec@gmail.com> wrote:
>> On Mon, Aug 04, 2014 at 05:03:42AM +0200, Denys Vlasenko wrote:
>>> On Sat, Aug 2, 2014 at 1:19 AM, Frederic Weisbecker <fweisbec@gmail.com>
>> wrote:
>>> >> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
>>> >> 0x77 /* DW_OP_breg7 */, 0, \
>>> >> 0x06 /* DW_OP_deref */, \
>>> >> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
>>> >> + 0x08 /* DW_OP_const1u */, SS+8, \
>>> >> 0x22 /* DW_OP_plus */
>>> >> /* We entered an interrupt context - irqs are off: */
>>> >> TRACE_IRQS_OFF
>>> >> -
>>> >> call \func
>>> >> .endm
>>> >>
>>> >> @@ -749,10 +719,9 @@ ret_from_intr:
>>> >>
>>> >> /* Restore saved previous stack */
>>> >> popq %rsi
>>> >
>>> > And then you pop to rsi. Ok that indeed works but perhaps we should keep it symetrical
>>> > just for clarity? Any reason why we can't reuse rdi here?
>>>
>>> I changed this entire area in v2: basically, I will not change the logic,
>>> but will add comments explaining what are we doing here, and why.
>>> (Some minor code changes will be done, not affecting the logic).
>>>
>>> While we are at it, what this CFI_ESCAPE thing does here?
>>> As usual, it has no comment :/
>
> Each of its lines has a comment; with other CFI annotations not
> each having comments, I don't see what else is needed here.
>
>> I don't know, only Jan Beulich understands those CFI black magic.
>
> That would be very said if true.
>
> In any case: This needs to be a CFI_ESCAPE because there's no
> other way I know of to emit the DW_CFA_def_cfa_expression.
> And the change to it looks correct to me.
>
How does one test the entry CFI annotations? The best that I know of
is to single-step through using gdb attached to qemu and see whether
backtraces seem to work.
--Andy
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 9:07 ` Andy Lutomirski
@ 2014-08-11 9:31 ` Jan Beulich
0 siblings, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2014-08-11 9:31 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Kees Cook, Will Drewry, Frederic Weisbecker, Denys Vlasenko,
X86 ML, Alexei Starovoitov, Denys Vlasenko, Oleg Nesterov,
Linux Kernel Mailing List, H. Peter Anvin
>>> On 11.08.14 at 11:07, <luto@amacapital.net> wrote:
> How does one test the entry CFI annotations? The best that I know of
> is to single-step through using gdb attached to qemu and see whether
> backtraces seem to work.
Or have the kernel generate a backtrace from a suitable location and
check that the backtrace is correct (of course provided you have Dwarf
stack unwinding support in your kernel).
Jan
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 8:40 ` Jan Beulich
2014-08-11 9:07 ` Andy Lutomirski
@ 2014-08-11 13:26 ` Denys Vlasenko
2014-08-11 14:17 ` Jan Beulich
1 sibling, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-11 13:26 UTC (permalink / raw)
To: Jan Beulich, Frederic Weisbecker, Denys Vlasenko
Cc: Andy Lutomirski, Kees Cook, Will Drewry, X86 ML,
Alexei Starovoitov, Oleg Nesterov, Linux Kernel Mailing List,
H. Peter Anvin
On 08/11/2014 10:40 AM, Jan Beulich wrote:
>>>>> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
>>>>> 0x77 /* DW_OP_breg7 */, 0, \
>>>>> 0x06 /* DW_OP_deref */, \
>>>>> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
>>>>> + 0x08 /* DW_OP_const1u */, SS+8, \
>>>>> 0x22 /* DW_OP_plus */
>>>>> /* We entered an interrupt context - irqs are off: */
>>>>> TRACE_IRQS_OFF
...
...
>>> While we are at it, what this CFI_ESCAPE thing does here?
>>> As usual, it has no comment :/
>
> Each of its lines has a comment; with other CFI annotations not
> each having comments, I don't see what else is needed here.
The existing comments explain what every byte means.
They are useful if CFI-literate reader wants to check correctness
of the encoding of this annotation.
There is no overall comment what this CFI annotation
*achieves*. In human language, what do we say
to DWARF decoder here?
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 13:26 ` Denys Vlasenko
@ 2014-08-11 14:17 ` Jan Beulich
2014-08-11 14:53 ` H. Peter Anvin
0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2014-08-11 14:17 UTC (permalink / raw)
To: Denys Vlasenko, Denys Vlasenko
Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
X86 ML, Alexei Starovoitov, Oleg Nesterov,
Linux Kernel Mailing List, H. Peter Anvin
>>> On 11.08.14 at 15:26, <dvlasenk@redhat.com> wrote:
> On 08/11/2014 10:40 AM, Jan Beulich wrote:
>>>>>> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
>>>>>> 0x77 /* DW_OP_breg7 */, 0, \
>>>>>> 0x06 /* DW_OP_deref */, \
>>>>>> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
>>>>>> + 0x08 /* DW_OP_const1u */, SS+8, \
>>>>>> 0x22 /* DW_OP_plus */
>>>>>> /* We entered an interrupt context - irqs are off: */
>>>>>> TRACE_IRQS_OFF
> ...
> ...
>>>> While we are at it, what this CFI_ESCAPE thing does here?
>>>> As usual, it has no comment :/
>>
>> Each of its lines has a comment; with other CFI annotations not
>> each having comments, I don't see what else is needed here.
>
> The existing comments explain what every byte means.
> They are useful if CFI-literate reader wants to check correctness
> of the encoding of this annotation.
>
> There is no overall comment what this CFI annotation
> *achieves*. In human language, what do we say
> to DWARF decoder here?
Short answer: DW_CFA_def_cfa_expression.
Longer response: Just like I said before, what you're asking for is
identical to ask for each other CFI annotation to get a comment
associated to tell you what it's doing, which I don't think you
really mean to ask for. (Our main problem here is that we can't
specify expressions with the .cfi_* gas directives, and hence have
to resort to .cfi_escape.)
Jan
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 14:17 ` Jan Beulich
@ 2014-08-11 14:53 ` H. Peter Anvin
2014-08-11 15:08 ` Jan Beulich
0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-11 14:53 UTC (permalink / raw)
To: Jan Beulich, Denys Vlasenko, Denys Vlasenko
Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
X86 ML, Alexei Starovoitov, Oleg Nesterov,
Linux Kernel Mailing List
On 08/11/2014 07:17 AM, Jan Beulich wrote:
>>
>> The existing comments explain what every byte means.
>> They are useful if CFI-literate reader wants to check correctness
>> of the encoding of this annotation.
>>
>> There is no overall comment what this CFI annotation
>> *achieves*. In human language, what do we say
>> to DWARF decoder here?
>
> Short answer: DW_CFA_def_cfa_expression.
>
> Longer response: Just like I said before, what you're asking for is
> identical to ask for each other CFI annotation to get a comment
> associated to tell you what it's doing, which I don't think you
> really mean to ask for. (Our main problem here is that we can't
> specify expressions with the .cfi_* gas directives, and hence have
> to resort to .cfi_escape.)
>
No, in *human language*. What does the DW_CFA_def_cfa_expression
actually aim to accomplish? If you don't know the innards of the DWARF
spec, the whole thing might as well be Hungarian.
-hpa
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 14:53 ` H. Peter Anvin
@ 2014-08-11 15:08 ` Jan Beulich
2014-08-11 15:13 ` H. Peter Anvin
0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2014-08-11 15:08 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
Denys Vlasenko, X86 ML, Alexei Starovoitov, Denys Vlasenko,
Oleg Nesterov, Linux Kernel Mailing List
>>> On 11.08.14 at 16:53, <hpa@zytor.com> wrote:
> On 08/11/2014 07:17 AM, Jan Beulich wrote:
>>>
>>> The existing comments explain what every byte means.
>>> They are useful if CFI-literate reader wants to check correctness
>>> of the encoding of this annotation.
>>>
>>> There is no overall comment what this CFI annotation
>>> *achieves*. In human language, what do we say
>>> to DWARF decoder here?
>>
>> Short answer: DW_CFA_def_cfa_expression.
>>
>> Longer response: Just like I said before, what you're asking for is
>> identical to ask for each other CFI annotation to get a comment
>> associated to tell you what it's doing, which I don't think you
>> really mean to ask for. (Our main problem here is that we can't
>> specify expressions with the .cfi_* gas directives, and hence have
>> to resort to .cfi_escape.)
>>
>
> No, in *human language*. What does the DW_CFA_def_cfa_expression
> actually aim to accomplish? If you don't know the innards of the DWARF
> spec, the whole thing might as well be Hungarian.
Just like the other DW_CFA_def_cfa_* ones it sets the current
frame address (CFA), just not via one of the pre-canned shortcuts,
but via an expression (in the case here de-referencing the stack
pointer to read the top of stack, and then adding the necessary
offset). So it indeed is similar enough to other .cfi_* annotations we
use without further comments.
And btw., Hungarian isn't _that_ bad.
Jan
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 15:08 ` Jan Beulich
@ 2014-08-11 15:13 ` H. Peter Anvin
2014-08-12 9:31 ` Denys Vlasenko
0 siblings, 1 reply; 41+ messages in thread
From: H. Peter Anvin @ 2014-08-11 15:13 UTC (permalink / raw)
To: Jan Beulich
Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
Denys Vlasenko, X86 ML, Alexei Starovoitov, Denys Vlasenko,
Oleg Nesterov, Linux Kernel Mailing List
On 08/11/2014 08:08 AM, Jan Beulich wrote:
>>
>> No, in *human language*. What does the DW_CFA_def_cfa_expression
>> actually aim to accomplish? If you don't know the innards of the DWARF
>> spec, the whole thing might as well be Hungarian.
>
> Just like the other DW_CFA_def_cfa_* ones it sets the current
> frame address (CFA), just not via one of the pre-canned shortcuts,
> but via an expression (in the case here de-referencing the stack
> pointer to read the top of stack, and then adding the necessary
> offset). So it indeed is similar enough to other .cfi_* annotations we
> use without further comments.
>
Actually, what you had inside the parenteses there is actually a
half-decent comment. I'm going to pretend the rest of this wasn't posted.
-hpa
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-11 15:13 ` H. Peter Anvin
@ 2014-08-12 9:31 ` Denys Vlasenko
2014-08-12 9:50 ` Jan Beulich
0 siblings, 1 reply; 41+ messages in thread
From: Denys Vlasenko @ 2014-08-12 9:31 UTC (permalink / raw)
To: H. Peter Anvin, Jan Beulich
Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
Denys Vlasenko, X86 ML, Alexei Starovoitov, Oleg Nesterov,
Linux Kernel Mailing List
On 08/11/2014 05:13 PM, H. Peter Anvin wrote:
> On 08/11/2014 08:08 AM, Jan Beulich wrote:
>>> No, in *human language*. What does the DW_CFA_def_cfa_expression
>>> actually aim to accomplish? If you don't know the innards of the DWARF
>>> spec, the whole thing might as well be Hungarian.
>>
>> Just like the other DW_CFA_def_cfa_* ones it sets the current
>> frame address (CFA), just not via one of the pre-canned shortcuts,
>> but via an expression (in the case here de-referencing the stack
>> pointer to read the top of stack, and then adding the necessary
>> offset). So it indeed is similar enough to other .cfi_* annotations we
>> use without further comments.
>>
>
> Actually, what you had inside the parenteses there is actually a
> half-decent comment. I'm going to pretend the rest of this wasn't posted.
Jan, Pater, does this look correct _and_ human-understandable?
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -652,10 +652,14 @@ END(interrupt)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
pushq %rsi
+ /*
+ * For debugger:
+ * "CFA (Current Frame Address) is the value on stack + offset"
+ */
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
+ 0x77 /* DW_OP_breg7 (rsp) */, 0, \
0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs"
2014-08-12 9:31 ` Denys Vlasenko
@ 2014-08-12 9:50 ` Jan Beulich
0 siblings, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2014-08-12 9:50 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Andy Lutomirski, Kees Cook, Will Drewry, Frederic Weisbecker,
Denys Vlasenko, X86 ML, Alexei Starovoitov, Oleg Nesterov,
Linux Kernel Mailing List, H. Peter Anvin
>>> On 12.08.14 at 11:31, <dvlasenk@redhat.com> wrote:
> Jan, Pater, does this look correct _and_ human-understandable?
>
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -652,10 +652,14 @@ END(interrupt)
> cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
> CFI_DEF_CFA_REGISTER rsi
> pushq %rsi
> + /*
> + * For debugger:
> + * "CFA (Current Frame Address) is the value on stack + offset"
> + */
> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
> - 0x77 /* DW_OP_breg7 */, 0, \
> + 0x77 /* DW_OP_breg7 (rsp) */, 0, \
I'm fine with the changes above.
> 0x06 /* DW_OP_deref */, \
> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
> + 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
But I'd rather not see this one alone changed - either change all
similar uses of SS, or leave the one here alone.
Jan
^ permalink raw reply [flat|nested] 41+ messages in thread
end of thread, other threads:[~2014-08-12 9:50 UTC | newest]
Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-01 14:48 [PATCH 1/5] x86: entry_64.S: delete unused code Denys Vlasenko
2014-08-01 14:48 ` [PATCH 2/5] x86: open-code register save/restore in trace_hardirqs thunks Denys Vlasenko
2014-08-01 14:48 ` [PATCH 3/5] x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user Denys Vlasenko
2014-08-01 18:30 ` Frederic Weisbecker
2014-08-01 14:48 ` [PATCH 4/5] x86: entry_64.S: always allocate complete "struct pt_regs" Denys Vlasenko
2014-08-01 17:04 ` Andy Lutomirski
2014-08-04 14:28 ` Denys Vlasenko
2014-08-04 14:47 ` Oleg Nesterov
2014-08-04 15:34 ` Oleg Nesterov
2014-08-04 21:03 ` Andy Lutomirski
2014-08-04 21:23 ` Borislav Petkov
2014-08-05 10:35 ` Denys Vlasenko
2014-08-05 14:53 ` Andy Lutomirski
2014-08-05 15:17 ` Denys Vlasenko
2014-08-05 23:02 ` Andy Lutomirski
2014-08-07 9:54 ` Denys Vlasenko
2014-08-01 18:09 ` Alexei Starovoitov
2014-08-01 18:30 ` Oleg Nesterov
2014-08-01 18:35 ` H. Peter Anvin
2014-08-01 22:11 ` Denys Vlasenko
2014-08-01 22:13 ` H. Peter Anvin
2014-08-02 21:14 ` Andy Lutomirski
2014-08-02 21:23 ` H. Peter Anvin
2014-08-02 21:38 ` Andy Lutomirski
2014-08-01 22:52 ` Frederic Weisbecker
2014-08-01 23:19 ` Frederic Weisbecker
2014-08-04 3:03 ` Denys Vlasenko
2014-08-04 7:57 ` Borislav Petkov
2014-08-11 0:46 ` Frederic Weisbecker
2014-08-11 8:40 ` Jan Beulich
2014-08-11 9:07 ` Andy Lutomirski
2014-08-11 9:31 ` Jan Beulich
2014-08-11 13:26 ` Denys Vlasenko
2014-08-11 14:17 ` Jan Beulich
2014-08-11 14:53 ` H. Peter Anvin
2014-08-11 15:08 ` Jan Beulich
2014-08-11 15:13 ` H. Peter Anvin
2014-08-12 9:31 ` Denys Vlasenko
2014-08-12 9:50 ` Jan Beulich
2014-08-01 14:48 ` [PATCH 5/5] x86: mass removal of ARGOFFSET Denys Vlasenko
2014-08-01 18:00 ` [PATCH 1/5] x86: entry_64.S: delete unused code Frederic Weisbecker
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.