* [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS
@ 2018-02-07 20:15 Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
` (6 more replies)
0 siblings, 7 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
The starting point for this series was the intention to interleave
the register clearing with register PUSH or MOV instructions, in order
to minimize the cost of the additional instructions required for the
register clearing. While at it, I noticed that a couple of macros in
arch/x86/entry/calling.h are unused and can be cleaned up.
Based on a preliminary version of this patch,[*] Linus suggested to
merge further codepaths and to use PUSH instead of MOV, as this should
be faster on newer CPUs.
[*] http://lkml.kernel.org/r/20180206212546.GA2026@light.dominikbrodowski.net and
http://lkml.kernel.org/r/20180206213202.GB2026@light.dominikbrodowski.net
NOTE / WARNING: Please be *extremely* stringent in reviewing these
patches, *especially* concerning patch 6/7 (x86/entry: get rid of
ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS).
Dominik Brodowski (7):
x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused
extensions
x86/entry: merge POP_C_REGS and POP_EXTRA_REGS
x86/entry: interleave XOR register clearing with PUSH instructions
x86/entry: introduce PUSH_AND_CLEAN_REGS
x86/entry: use PUSH_AND_CLEAN_REGS in more cases
x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly
arch/x86/entry/calling.h | 118 ++++++++++++++++++----------------------------
arch/x86/entry/entry_64.S | 104 +++++++---------------------------------
2 files changed, 62 insertions(+), 160 deletions(-)
--
2.16.1
^ permalink raw reply [flat|nested] 15+ messages in thread
* [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS Dominik Brodowski
` (5 subsequent siblings)
6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
All current code paths call SAVE_C_REGS and then immediately
SAVE_EXTRA_REGS. Therefore, merge these two macros and order the MOV
sequences properly.
While at it, remove the macros to save all except specific registers,
as these macros have been unused for a long time.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
arch/x86/entry/calling.h | 57 +++++++++++++----------------------------------
arch/x86/entry/entry_64.S | 12 ++++------
2 files changed, 19 insertions(+), 50 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4b129d4af42..8907a6593b42 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -101,49 +101,22 @@ For 32-bit we have the following conventions - kernel is built with
addq $-(15*8), %rsp
.endm
- .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
- .if \r11
- movq %r11, 6*8+\offset(%rsp)
- .endif
- .if \r8910
- movq %r10, 7*8+\offset(%rsp)
- movq %r9, 8*8+\offset(%rsp)
- movq %r8, 9*8+\offset(%rsp)
- .endif
- .if \rax
- movq %rax, 10*8+\offset(%rsp)
- .endif
- .if \rcx
- movq %rcx, 11*8+\offset(%rsp)
- .endif
- movq %rdx, 12*8+\offset(%rsp)
- movq %rsi, 13*8+\offset(%rsp)
+ .macro SAVE_REGS offset=0
movq %rdi, 14*8+\offset(%rsp)
- UNWIND_HINT_REGS offset=\offset extra=0
- .endm
- .macro SAVE_C_REGS offset=0
- SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
- .endm
- .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
- SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
- .endm
- .macro SAVE_C_REGS_EXCEPT_R891011
- SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
- .endm
- .macro SAVE_C_REGS_EXCEPT_RCX_R891011
- SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
- .endm
- .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
- SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
- .endm
-
- .macro SAVE_EXTRA_REGS offset=0
- movq %r15, 0*8+\offset(%rsp)
- movq %r14, 1*8+\offset(%rsp)
- movq %r13, 2*8+\offset(%rsp)
- movq %r12, 3*8+\offset(%rsp)
- movq %rbp, 4*8+\offset(%rsp)
+ movq %rsi, 13*8+\offset(%rsp)
+ movq %rdx, 12*8+\offset(%rsp)
+ movq %rcx, 11*8+\offset(%rsp)
+ movq %rax, 10*8+\offset(%rsp)
+ movq %r8, 9*8+\offset(%rsp)
+ movq %r9, 8*8+\offset(%rsp)
+ movq %r10, 7*8+\offset(%rsp)
+ movq %r11, 6*8+\offset(%rsp)
movq %rbx, 5*8+\offset(%rsp)
+ movq %rbp, 4*8+\offset(%rsp)
+ movq %r12, 3*8+\offset(%rsp)
+ movq %r13, 2*8+\offset(%rsp)
+ movq %r14, 1*8+\offset(%rsp)
+ movq %r15, 0*8+\offset(%rsp)
UNWIND_HINT_REGS offset=\offset
.endm
@@ -197,7 +170,7 @@ For 32-bit we have the following conventions - kernel is built with
* is just setting the LSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
- * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts
+ * NOTE: This macro must be used *after* SAVE_REGS because it corrupts
* the original rbp.
*/
.macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9e48002b953b..91e8d84c2496 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -573,8 +573,7 @@ END(irq_entries_start)
1:
ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
+ SAVE_REGS
CLEAR_REGS_NOSPEC
ENCODE_FRAME_POINTER
@@ -1132,8 +1131,7 @@ ENTRY(xen_failsafe_callback)
UNWIND_HINT_IRET_REGS
pushq $-1 /* orig_ax = -1 => not a system call */
ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
+ SAVE_REGS
CLEAR_REGS_NOSPEC
ENCODE_FRAME_POINTER
jmp error_exit
@@ -1178,8 +1176,7 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
+ SAVE_REGS 8
CLEAR_REGS_NOSPEC
ENCODE_FRAME_POINTER 8
movl $1, %ebx
@@ -1231,8 +1228,7 @@ END(paranoid_exit)
ENTRY(error_entry)
UNWIND_HINT_FUNC
cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
+ SAVE_REGS 8
CLEAR_REGS_NOSPEC
ENCODE_FRAME_POINTER 8
testb $3, CS+8(%rsp)
--
2.16.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions Dominik Brodowski
` (4 subsequent siblings)
6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
The two special, opencoded cases for POP_C_REGS can be handled by ASM
macros.
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
arch/x86/entry/calling.h | 15 +++++++++++----
arch/x86/entry/entry_64.S | 26 ++++----------------------
2 files changed, 15 insertions(+), 26 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 8907a6593b42..3bda31736a7b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -139,25 +139,32 @@ For 32-bit we have the following conventions - kernel is built with
xorq %r15, %r15
.endm
- .macro POP_EXTRA_REGS
+ .macro POP_REGS pop_rdi=1 skip_r11rcx=0
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
- .endm
-
- .macro POP_C_REGS
+ .if \skip_r11rcx
+ popq %rsi
+ .else
popq %r11
+ .endif
popq %r10
popq %r9
popq %r8
popq %rax
+ .if \skip_r11rcx
+ popq %rsi
+ .else
popq %rcx
+ .endif
popq %rdx
popq %rsi
+ .if \pop_rdi
popq %rdi
+ .endif
.endm
.macro icebp
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 91e8d84c2496..286b24b650da 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -334,15 +334,7 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
UNWIND_HINT_EMPTY
- POP_EXTRA_REGS
- popq %rsi /* skip r11 */
- popq %r10
- popq %r9
- popq %r8
- popq %rax
- popq %rsi /* skip rcx */
- popq %rdx
- popq %rsi
+ POP_REGS pop_rdi=0 skip_r11rcx=1
/*
* Now all regs are restored except RSP and RDI.
@@ -635,15 +627,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
ud2
1:
#endif
- POP_EXTRA_REGS
- popq %r11
- popq %r10
- popq %r9
- popq %r8
- popq %rax
- popq %rcx
- popq %rdx
- popq %rsi
+ POP_REGS pop_rdi=0
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
@@ -701,8 +685,7 @@ GLOBAL(restore_regs_and_return_to_kernel)
ud2
1:
#endif
- POP_EXTRA_REGS
- POP_C_REGS
+ POP_REGS
addq $8, %rsp /* skip regs->orig_ax */
INTERRUPT_RETURN
@@ -1661,8 +1644,7 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
- POP_EXTRA_REGS
- POP_C_REGS
+ POP_REGS
/*
* Skip orig_ax and the "outermost" frame to point RSP at the "iret"
--
2.16.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS Dominik Brodowski
` (3 subsequent siblings)
6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
Same as is done for syscalls, interleave XOR with PUSH instructions
for exceptions/interrupts, in order to minimize the cost of the
additional instructions required for register clearing.
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
arch/x86/entry/calling.h | 40 +++++++++++++++++++---------------------
arch/x86/entry/entry_64.S | 30 +++++++++++++++++++++---------
2 files changed, 40 insertions(+), 30 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3bda31736a7b..a05cbb81268d 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -101,44 +101,42 @@ For 32-bit we have the following conventions - kernel is built with
addq $-(15*8), %rsp
.endm
- .macro SAVE_REGS offset=0
+ .macro SAVE_AND_CLEAR_REGS offset=0
+ /*
+ * Save registers and sanitize registers of values that a
+ * speculation attack might otherwise want to exploit. The
+ * lower registers are likely clobbered well before they
+ * could be put to use in a speculative execution gadget.
+ * Interleave XOR with PUSH for better uop scheduling:
+ */
movq %rdi, 14*8+\offset(%rsp)
movq %rsi, 13*8+\offset(%rsp)
movq %rdx, 12*8+\offset(%rsp)
movq %rcx, 11*8+\offset(%rsp)
movq %rax, 10*8+\offset(%rsp)
movq %r8, 9*8+\offset(%rsp)
+ xorq %r8, %r8 /* nospec r8 */
movq %r9, 8*8+\offset(%rsp)
+ xorq %r9, %r9 /* nospec r9 */
movq %r10, 7*8+\offset(%rsp)
+ xorq %r10, %r10 /* nospec r10 */
movq %r11, 6*8+\offset(%rsp)
+ xorq %r11, %r11 /* nospec r11 */
movq %rbx, 5*8+\offset(%rsp)
+ xorl %ebx, %ebx /* nospec rbx */
movq %rbp, 4*8+\offset(%rsp)
+ xorl %ebp, %ebp /* nospec rbp */
movq %r12, 3*8+\offset(%rsp)
+ xorq %r12, %r12 /* nospec r12 */
movq %r13, 2*8+\offset(%rsp)
+ xorq %r13, %r13 /* nospec r13 */
movq %r14, 1*8+\offset(%rsp)
+ xorq %r14, %r14 /* nospec r14 */
movq %r15, 0*8+\offset(%rsp)
+ xorq %r15, %r15 /* nospec r15 */
UNWIND_HINT_REGS offset=\offset
.endm
- /*
- * Sanitize registers of values that a speculation attack
- * might otherwise want to exploit. The lower registers are
- * likely clobbered well before they could be put to use in
- * a speculative execution gadget:
- */
- .macro CLEAR_REGS_NOSPEC
- xorl %ebp, %ebp
- xorl %ebx, %ebx
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r10
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- xorq %r14, %r14
- xorq %r15, %r15
- .endm
-
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
popq %r15
popq %r14
@@ -177,7 +175,7 @@ For 32-bit we have the following conventions - kernel is built with
* is just setting the LSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
- * NOTE: This macro must be used *after* SAVE_REGS because it corrupts
+ * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
* the original rbp.
*/
.macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 286b24b650da..1194814ee12b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -565,8 +565,7 @@ END(irq_entries_start)
1:
ALLOC_PT_GPREGS_ON_STACK
- SAVE_REGS
- CLEAR_REGS_NOSPEC
+ SAVE_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
testb $3, CS(%rsp)
@@ -1114,8 +1113,7 @@ ENTRY(xen_failsafe_callback)
UNWIND_HINT_IRET_REGS
pushq $-1 /* orig_ax = -1 => not a system call */
ALLOC_PT_GPREGS_ON_STACK
- SAVE_REGS
- CLEAR_REGS_NOSPEC
+ SAVE_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
@@ -1159,8 +1157,7 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
cld
- SAVE_REGS 8
- CLEAR_REGS_NOSPEC
+ SAVE_AND_CLEAR_REGS 8
ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
@@ -1211,8 +1208,7 @@ END(paranoid_exit)
ENTRY(error_entry)
UNWIND_HINT_FUNC
cld
- SAVE_REGS 8
- CLEAR_REGS_NOSPEC
+ SAVE_AND_CLEAR_REGS 8
ENCODE_FRAME_POINTER 8
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1399,18 +1395,34 @@ ENTRY(nmi)
pushq (%rdx) /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq %rax /* pt_regs->ax */
+ /*
+ * Sanitize registers of values that a speculation attack
+ * might otherwise want to exploit. The lower registers are
+ * likely clobbered well before they could be put to use in
+ * a speculative execution gadget. Interleave XOR with PUSH
+ * for better uop scheduling:
+ */
pushq %r8 /* pt_regs->r8 */
+ xorq %r8, %r8 /* nospec r8 */
pushq %r9 /* pt_regs->r9 */
+ xorq %r9, %r9 /* nospec r9 */
pushq %r10 /* pt_regs->r10 */
+ xorq %r10, %r10 /* nospec r10 */
pushq %r11 /* pt_regs->r11 */
+ xorq %r11, %r11 /* nospec r11*/
pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx*/
pushq %rbp /* pt_regs->rbp */
+ xorl %ebp, %ebp /* nospec rbp*/
pushq %r12 /* pt_regs->r12 */
+ xorq %r12, %r12 /* nospec r12*/
pushq %r13 /* pt_regs->r13 */
+ xorq %r13, %r13 /* nospec r13*/
pushq %r14 /* pt_regs->r14 */
+ xorq %r14, %r14 /* nospec r14*/
pushq %r15 /* pt_regs->r15 */
+ xorq %r15, %r15 /* nospec r15*/
UNWIND_HINT_REGS
- CLEAR_REGS_NOSPEC
ENCODE_FRAME_POINTER
/*
--
2.16.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
` (2 preceding siblings ...)
2018-02-07 20:15 ` [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases Dominik Brodowski
` (2 subsequent siblings)
6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
Those instances where ALLOC_PT_GPREGS_ON_STACK is called just before
SAVE_AND_CLEAR_REGS can trivially be replaced by PUSH_AND_CLEAR_REGS.
This macro uses PUSH instead of MOV and should therefore be faster, at
least on newer CPUs.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
arch/x86/entry/calling.h | 36 ++++++++++++++++++++++++++++++++++++
arch/x86/entry/entry_64.S | 6 ++----
2 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a05cbb81268d..57b1b87a04f0 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,6 +137,42 @@ For 32-bit we have the following conventions - kernel is built with
UNWIND_HINT_REGS offset=\offset
.endm
+ .macro PUSH_AND_CLEAR_REGS
+ /*
+ * Push registers and sanitize registers of values that a
+ * speculation attack might otherwise want to exploit. The
+ * lower registers are likely clobbered well before they
+ * could be put to use in a speculative execution gadget.
+ * Interleave XOR with PUSH for better uop scheduling:
+ */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ xorq %r8, %r8 /* nospec r8 */
+ pushq %r9 /* pt_regs->r9 */
+ xorq %r9, %r9 /* nospec r9 */
+ pushq %r10 /* pt_regs->r10 */
+ xorq %r10, %r10 /* nospec r10 */
+ pushq %r11 /* pt_regs->r11 */
+ xorq %r11, %r11 /* nospec r11*/
+ pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx*/
+ pushq %rbp /* pt_regs->rbp */
+ xorl %ebp, %ebp /* nospec rbp*/
+ pushq %r12 /* pt_regs->r12 */
+ xorq %r12, %r12 /* nospec r12*/
+ pushq %r13 /* pt_regs->r13 */
+ xorq %r13, %r13 /* nospec r13*/
+ pushq %r14 /* pt_regs->r14 */
+ xorq %r14, %r14 /* nospec r14*/
+ pushq %r15 /* pt_regs->r15 */
+ xorq %r15, %r15 /* nospec r15*/
+ UNWIND_HINT_REGS
+ .endm
+
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
popq %r15
popq %r14
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1194814ee12b..9dd3fbfdc75d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -564,8 +564,7 @@ END(irq_entries_start)
call switch_to_thread_stack
1:
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_AND_CLEAR_REGS
+ PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
testb $3, CS(%rsp)
@@ -1112,8 +1111,7 @@ ENTRY(xen_failsafe_callback)
addq $0x30, %rsp
UNWIND_HINT_IRET_REGS
pushq $-1 /* orig_ax = -1 => not a system call */
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_AND_CLEAR_REGS
+ PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
--
2.16.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
` (3 preceding siblings ...)
2018-02-07 20:15 ` [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly Dominik Brodowski
6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
entry_SYSCALL_64_after_hwframe and nmi can be converted to use
PUSH_AND_CLEAR_REGS instead of opencoded variants thereof. Due to
the interleaving, the additional XOR-based clearing of r8 and r9
in entry_SYSCALL_64_after_hwframe should not have any noticeable
negative implications.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
arch/x86/entry/calling.h | 6 ++---
arch/x86/entry/entry_64.S | 65 +++--------------------------------------------
2 files changed, 6 insertions(+), 65 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 57b1b87a04f0..d6a97e2945ee 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,7 +137,7 @@ For 32-bit we have the following conventions - kernel is built with
UNWIND_HINT_REGS offset=\offset
.endm
- .macro PUSH_AND_CLEAR_REGS
+ .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
/*
* Push registers and sanitize registers of values that a
* speculation attack might otherwise want to exploit. The
@@ -147,9 +147,9 @@ For 32-bit we have the following conventions - kernel is built with
*/
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
- pushq %rdx /* pt_regs->dx */
+ pushq \rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
- pushq %rax /* pt_regs->ax */
+ pushq \rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
xorq %r8, %r8 /* nospec r8 */
pushq %r9 /* pt_regs->r9 */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9dd3fbfdc75d..9c4fe360db42 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -227,35 +227,8 @@ ENTRY(entry_SYSCALL_64)
pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq %r8 /* pt_regs->r8 */
- pushq %r9 /* pt_regs->r9 */
- pushq %r10 /* pt_regs->r10 */
- /*
- * Clear extra registers that a speculation attack might
- * otherwise want to exploit. Interleave XOR with PUSH
- * for better uop scheduling:
- */
- xorq %r10, %r10 /* nospec r10 */
- pushq %r11 /* pt_regs->r11 */
- xorq %r11, %r11 /* nospec r11 */
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx */
- pushq %rbp /* pt_regs->rbp */
- xorl %ebp, %ebp /* nospec rbp */
- pushq %r12 /* pt_regs->r12 */
- xorq %r12, %r12 /* nospec r12 */
- pushq %r13 /* pt_regs->r13 */
- xorq %r13, %r13 /* nospec r13 */
- pushq %r14 /* pt_regs->r14 */
- xorq %r14, %r14 /* nospec r14 */
- pushq %r15 /* pt_regs->r15 */
- xorq %r15, %r15 /* nospec r15 */
- UNWIND_HINT_REGS
+
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
TRACE_IRQS_OFF
@@ -1388,39 +1361,7 @@ ENTRY(nmi)
pushq 1*8(%rdx) /* pt_regs->rip */
UNWIND_HINT_IRET_REGS
pushq $-1 /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- pushq (%rdx) /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
- pushq %rax /* pt_regs->ax */
- /*
- * Sanitize registers of values that a speculation attack
- * might otherwise want to exploit. The lower registers are
- * likely clobbered well before they could be put to use in
- * a speculative execution gadget. Interleave XOR with PUSH
- * for better uop scheduling:
- */
- pushq %r8 /* pt_regs->r8 */
- xorq %r8, %r8 /* nospec r8 */
- pushq %r9 /* pt_regs->r9 */
- xorq %r9, %r9 /* nospec r9 */
- pushq %r10 /* pt_regs->r10 */
- xorq %r10, %r10 /* nospec r10 */
- pushq %r11 /* pt_regs->r11 */
- xorq %r11, %r11 /* nospec r11*/
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx*/
- pushq %rbp /* pt_regs->rbp */
- xorl %ebp, %ebp /* nospec rbp*/
- pushq %r12 /* pt_regs->r12 */
- xorq %r12, %r12 /* nospec r12*/
- pushq %r13 /* pt_regs->r13 */
- xorq %r13, %r13 /* nospec r13*/
- pushq %r14 /* pt_regs->r14 */
- xorq %r14, %r14 /* nospec r14*/
- pushq %r15 /* pt_regs->r15 */
- xorq %r15, %r15 /* nospec r15*/
- UNWIND_HINT_REGS
+ PUSH_AND_CLEAR_REGS rdx=(%rdx)
ENCODE_FRAME_POINTER
/*
--
2.16.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
` (4 preceding siblings ...)
2018-02-07 20:15 ` [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
2018-02-07 20:44 ` Linus Torvalds
2018-02-07 20:15 ` [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly Dominik Brodowski
6 siblings, 1 reply; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
Previously, error_entry() and paranoid_entry() saved the GP registers
onto stack space previously allocated by its callers. Combine these two
steps in the callers, and use the generic PUSH_AND_CLEAR_REGS macro
for that.
Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
modification. Previously %rsp was manually decreased by 15*8; with
this patch, %rsp is decreased by 15 pushq instructions. Moreover,
error_entry did and does the exact same test (with offset=8) after
the registers have been moved/pushed and cleared.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
arch/x86/entry/calling.h | 42 +-----------------------------------------
arch/x86/entry/entry_64.S | 15 +++++++--------
2 files changed, 8 insertions(+), 49 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d6a97e2945ee..59675010c9a0 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,46 +97,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
- .macro ALLOC_PT_GPREGS_ON_STACK
- addq $-(15*8), %rsp
- .endm
-
- .macro SAVE_AND_CLEAR_REGS offset=0
- /*
- * Save registers and sanitize registers of values that a
- * speculation attack might otherwise want to exploit. The
- * lower registers are likely clobbered well before they
- * could be put to use in a speculative execution gadget.
- * Interleave XOR with PUSH for better uop scheduling:
- */
- movq %rdi, 14*8+\offset(%rsp)
- movq %rsi, 13*8+\offset(%rsp)
- movq %rdx, 12*8+\offset(%rsp)
- movq %rcx, 11*8+\offset(%rsp)
- movq %rax, 10*8+\offset(%rsp)
- movq %r8, 9*8+\offset(%rsp)
- xorq %r8, %r8 /* nospec r8 */
- movq %r9, 8*8+\offset(%rsp)
- xorq %r9, %r9 /* nospec r9 */
- movq %r10, 7*8+\offset(%rsp)
- xorq %r10, %r10 /* nospec r10 */
- movq %r11, 6*8+\offset(%rsp)
- xorq %r11, %r11 /* nospec r11 */
- movq %rbx, 5*8+\offset(%rsp)
- xorl %ebx, %ebx /* nospec rbx */
- movq %rbp, 4*8+\offset(%rsp)
- xorl %ebp, %ebp /* nospec rbp */
- movq %r12, 3*8+\offset(%rsp)
- xorq %r12, %r12 /* nospec r12 */
- movq %r13, 2*8+\offset(%rsp)
- xorq %r13, %r13 /* nospec r13 */
- movq %r14, 1*8+\offset(%rsp)
- xorq %r14, %r14 /* nospec r14 */
- movq %r15, 0*8+\offset(%rsp)
- xorq %r15, %r15 /* nospec r15 */
- UNWIND_HINT_REGS offset=\offset
- .endm
-
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
/*
* Push registers and sanitize registers of values that a
@@ -211,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
* is just setting the LSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
- * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
* the original rbp.
*/
.macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9c4fe360db42..fa56a974d1c1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -871,7 +871,9 @@ ENTRY(\sym)
pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
- ALLOC_PT_GPREGS_ON_STACK
+ /* Save all registers in pt_regs */
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
.if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
@@ -1121,15 +1123,13 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
#endif
/*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
cld
- SAVE_AND_CLEAR_REGS 8
- ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
@@ -1173,14 +1173,12 @@ ENTRY(paranoid_exit)
END(paranoid_exit)
/*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
* Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
UNWIND_HINT_FUNC
cld
- SAVE_AND_CLEAR_REGS 8
- ENCODE_FRAME_POINTER 8
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1571,7 +1569,8 @@ end_repeat_nmi:
* frame to point back to repeat_nmi.
*/
pushq $-1 /* ORIG_RAX: no syscall to restart */
- ALLOC_PT_GPREGS_ON_STACK
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
--
2.16.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
` (5 preceding siblings ...)
2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto
... same as the other macros in arch/x86/entry/calling.h
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
arch/x86/entry/calling.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 59675010c9a0..6985440c68fa 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,7 +97,7 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
- .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
/*
* Push registers and sanitize registers of values that a
* speculation attack might otherwise want to exploit. The
@@ -131,9 +131,9 @@ For 32-bit we have the following conventions - kernel is built with
pushq %r15 /* pt_regs->r15 */
xorq %r15, %r15 /* nospec r15*/
UNWIND_HINT_REGS
- .endm
+.endm
- .macro POP_REGS pop_rdi=1 skip_r11rcx=0
+.macro POP_REGS pop_rdi=1 skip_r11rcx=0
popq %r15
popq %r14
popq %r13
@@ -163,7 +163,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro icebp
.byte 0xf1
- .endm
+.endm
/*
* This is a sneaky trick to help the unwinder find pt_regs on the stack. The
--
2.16.1
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
@ 2018-02-07 20:44 ` Linus Torvalds
2018-02-07 21:29 ` Dominik Brodowski
0 siblings, 1 reply; 15+ messages in thread
From: Linus Torvalds @ 2018-02-07 20:44 UTC (permalink / raw)
To: Dominik Brodowski
Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski
On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
<linux@dominikbrodowski.net> wrote:
>
> Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> modification. Previously %rsp was manually decreased by 15*8; with
> this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> error_entry did and does the exact same test (with offset=8) after
> the registers have been moved/pushed and cleared.
So this has the problem that now those save/clear instructions will
all be done in that idtentry macro.
So now that code will be duplicated for all the users of that macro.
The old code did the saving in the common error_entry and
paranoid_entry routines, in order to be able to share all the code,
and making the duplicated stub functions generated by the idtentry
macro smaller.
Now, admittedly the new push sequence is much smaller than the old
movq sequence, so the duplication doesn't hurt as much, but it's still
likely quite noticeable.
So this removes lines of asm code, but it adds a lot of instructions
to the end result thanks to the macro, I think.
Linus
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-07 20:44 ` Linus Torvalds
@ 2018-02-07 21:29 ` Dominik Brodowski
2018-02-07 21:58 ` Linus Torvalds
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 21:29 UTC (permalink / raw)
To: Linus Torvalds
Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski
On Wed, Feb 07, 2018 at 12:44:41PM -0800, Linus Torvalds wrote:
> On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
> <linux@dominikbrodowski.net> wrote:
> >
> > Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> > modification. Previously %rsp was manually decreased by 15*8; with
> > this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> > error_entry did and does the exact same test (with offset=8) after
> > the registers have been moved/pushed and cleared.
>
> So this has the problem that now those save/clear instructions will
> all be done in that idtentry macro.
>
> So now that code will be duplicated for all the users of that macro.
>
> The old code did the saving in the common error_entry and
> paranoid_entry routines, in order to be able to share all the code,
> and making the duplicated stub functions generated by the idtentry
> macro smaller.
>
> Now, admittedly the new push sequence is much smaller than the old
> movq sequence, so the duplication doesn't hurt as much, but it's still
> likely quite noticeable.
>
> So this removes lines of asm code, but it adds a lot of instructions
> to the end result thanks to the macro, I think.
Indeed, that is the case (see below). However, if we want to switch to
PUSH instructions and do this in a routine which is call'ed and which
ret'urns, %rsp needs to be moved around even more often than the old
ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
IIUYC). Or do I miss something?
text data bss dec hex filename
19500 0 0 19500 4c2c arch/x86/entry/entry_64.o-orig
19510 0 0 19510 4c36 arch/x86/entry/entry_64.o-3_of_7
21105 0 0 21105 5271 arch/x86/entry/entry_64.o-5_of_7
24307 0 0 24307 5ef3 arch/x86/entry/entry_64.o-7_of_7
In any case, here's a v2.1 for this patch 6/7, which silences an objtool
warning in error_entry.
Thanks,
Dominik
----------------------------------------------------
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Wed, 7 Feb 2018 20:56:13 +0100
Subject: [PATCH] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
Previously, error_entry() and paranoid_entry() saved the GP registers
onto stack space previously allocated by their callers. Combine these two
steps in the callers, and use the generic PUSH_AND_CLEAR_REGS macro
for that.
Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
modification. Previously %rsp was manually decreased by 15*8; with
this patch, %rsp is decreased by 15 pushq instructions. Moreover,
error_entry did and does the exact same test (with offset=8) after
the registers have been moved/pushed and cleared.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d6a97e2945ee..59675010c9a0 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,46 +97,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
- .macro ALLOC_PT_GPREGS_ON_STACK
- addq $-(15*8), %rsp
- .endm
-
- .macro SAVE_AND_CLEAR_REGS offset=0
- /*
- * Save registers and sanitize registers of values that a
- * speculation attack might otherwise want to exploit. The
- * lower registers are likely clobbered well before they
- * could be put to use in a speculative execution gadget.
- * Interleave XOR with PUSH for better uop scheduling:
- */
- movq %rdi, 14*8+\offset(%rsp)
- movq %rsi, 13*8+\offset(%rsp)
- movq %rdx, 12*8+\offset(%rsp)
- movq %rcx, 11*8+\offset(%rsp)
- movq %rax, 10*8+\offset(%rsp)
- movq %r8, 9*8+\offset(%rsp)
- xorq %r8, %r8 /* nospec r8 */
- movq %r9, 8*8+\offset(%rsp)
- xorq %r9, %r9 /* nospec r9 */
- movq %r10, 7*8+\offset(%rsp)
- xorq %r10, %r10 /* nospec r10 */
- movq %r11, 6*8+\offset(%rsp)
- xorq %r11, %r11 /* nospec r11 */
- movq %rbx, 5*8+\offset(%rsp)
- xorl %ebx, %ebx /* nospec rbx */
- movq %rbp, 4*8+\offset(%rsp)
- xorl %ebp, %ebp /* nospec rbp */
- movq %r12, 3*8+\offset(%rsp)
- xorq %r12, %r12 /* nospec r12 */
- movq %r13, 2*8+\offset(%rsp)
- xorq %r13, %r13 /* nospec r13 */
- movq %r14, 1*8+\offset(%rsp)
- xorq %r14, %r14 /* nospec r14 */
- movq %r15, 0*8+\offset(%rsp)
- xorq %r15, %r15 /* nospec r15 */
- UNWIND_HINT_REGS offset=\offset
- .endm
-
.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
/*
* Push registers and sanitize registers of values that a
@@ -211,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
* is just setting the LSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
- * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
* the original rbp.
*/
.macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9c4fe360db42..c6e5d44eb322 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -871,7 +871,9 @@ ENTRY(\sym)
pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
- ALLOC_PT_GPREGS_ON_STACK
+ /* Save all registers in pt_regs */
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
.if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
@@ -1121,15 +1123,13 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
#endif
/*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
cld
- SAVE_AND_CLEAR_REGS 8
- ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
@@ -1173,14 +1173,13 @@ ENTRY(paranoid_exit)
END(paranoid_exit)
/*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
* Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
UNWIND_HINT_FUNC
+ UNWIND_HINT_REGS offset=8
cld
- SAVE_AND_CLEAR_REGS 8
- ENCODE_FRAME_POINTER 8
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1571,7 +1570,8 @@ end_repeat_nmi:
* frame to point back to repeat_nmi.
*/
pushq $-1 /* ORIG_RAX: no syscall to restart */
- ALLOC_PT_GPREGS_ON_STACK
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-07 21:29 ` Dominik Brodowski
@ 2018-02-07 21:58 ` Linus Torvalds
2018-02-08 7:20 ` Dominik Brodowski
2018-02-08 9:47 ` Ingo Molnar
2018-02-08 18:35 ` Josh Poimboeuf
2 siblings, 1 reply; 15+ messages in thread
From: Linus Torvalds @ 2018-02-07 21:58 UTC (permalink / raw)
To: Dominik Brodowski
Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski
On Wed, Feb 7, 2018 at 1:29 PM, Dominik Brodowski
<linux@dominikbrodowski.net> wrote:
>>
>> So this removes lines of asm code, but it adds a lot of instructions
>> to the end result thanks to the macro, I think.
>
> Indeed, that is the case (see below). However, if we want to switch to
> PUSH instructions and do this in a routine which is call'ed and which
> ret'urns, %rsp needs to be moved around even more often than the old
> ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
> IIUYC). Or do I miss something?
So I agree that your approach makes for a lot simpler stack setup.
I was just hoping that we could play some tricks.
For example, right now your PUSH_AND_CLEAR_REGS starts off with
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
....
and maybe we could still use this in paranoid_entry and error_entry if
we made it something like
/* if 'save_ret' is set, we pop the return point into %rsi */
.macro PUSH_AND_CLEAR_REGS save_ret=0
.if \save_ret
pushq %%rsi
movq 8(%%rsp),%rsi
movq %%rdi,8(%%rsp)
.else
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
.endif
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
....
which would allow error_entry and paranoid_entry to do something like this:
PUSH_AND_CLEAR_REGS save_ret=1
pushq %rsi
... do the other common code ..
ret
(totally untested, I'm just doing a "stream-of-consciousness" thing in
the email.)
See what I'm saying?
That said, maybe the pushq sequence is now so small that it doesn't
even matter, and duplicating it isn't a big problem.
Because your version sure is simpler.
Linus
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-07 21:58 ` Linus Torvalds
@ 2018-02-08 7:20 ` Dominik Brodowski
0 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-08 7:20 UTC (permalink / raw)
To: Linus Torvalds
Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski
On Wed, Feb 07, 2018 at 01:58:20PM -0800, Linus Torvalds wrote:
> I was just hoping that we could play some tricks.
>
> [...]
>
> See what I'm saying?
Clever. Though I'd include the "pushq %rsi" in the macro, to be even more
tricky.
text data bss dec hex filename
19500 0 0 19500 4c2c arch/x86/entry/entry_64.o-orig
24307 0 0 24307 5ef3 arch/x86/entry/entry_64.o-7_of_7
20987 0 0 20987 51fb arch/x86/entry/entry_64.o-trick
I'm not really sure yet where the increase in text size comes from in my
patch set, though.
I *hope* that the CS-ORIG_RAX(%rsp) testb is correct; again, exercise
an extremely stringent review of this patch, please.
Thanks,
Dominik
--------------------------------------------------------
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Wed, 7 Feb 2018 20:56:13 +0100
Subject: [PATCH] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
Previously, error_entry() and paranoid_entry() saved the GP registers
onto stack space previously allocated by their callers. Combine these two
steps in the callee, and use the generic PUSH_AND_CLEAR_REGS macro
for that, but play a little trick in it -- suggested by Linus -- to insert
the GP registers "above" the original return address.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d6a97e2945ee..dc60365547e1 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,47 +97,7 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
- .macro ALLOC_PT_GPREGS_ON_STACK
- addq $-(15*8), %rsp
- .endm
-
- .macro SAVE_AND_CLEAR_REGS offset=0
- /*
- * Save registers and sanitize registers of values that a
- * speculation attack might otherwise want to exploit. The
- * lower registers are likely clobbered well before they
- * could be put to use in a speculative execution gadget.
- * Interleave XOR with PUSH for better uop scheduling:
- */
- movq %rdi, 14*8+\offset(%rsp)
- movq %rsi, 13*8+\offset(%rsp)
- movq %rdx, 12*8+\offset(%rsp)
- movq %rcx, 11*8+\offset(%rsp)
- movq %rax, 10*8+\offset(%rsp)
- movq %r8, 9*8+\offset(%rsp)
- xorq %r8, %r8 /* nospec r8 */
- movq %r9, 8*8+\offset(%rsp)
- xorq %r9, %r9 /* nospec r9 */
- movq %r10, 7*8+\offset(%rsp)
- xorq %r10, %r10 /* nospec r10 */
- movq %r11, 6*8+\offset(%rsp)
- xorq %r11, %r11 /* nospec r11 */
- movq %rbx, 5*8+\offset(%rsp)
- xorl %ebx, %ebx /* nospec rbx */
- movq %rbp, 4*8+\offset(%rsp)
- xorl %ebp, %ebp /* nospec rbp */
- movq %r12, 3*8+\offset(%rsp)
- xorq %r12, %r12 /* nospec r12 */
- movq %r13, 2*8+\offset(%rsp)
- xorq %r13, %r13 /* nospec r13 */
- movq %r14, 1*8+\offset(%rsp)
- xorq %r14, %r14 /* nospec r14 */
- movq %r15, 0*8+\offset(%rsp)
- xorq %r15, %r15 /* nospec r15 */
- UNWIND_HINT_REGS offset=\offset
- .endm
-
- .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
+ .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
/*
* Push registers and sanitize registers of values that a
* speculation attack might otherwise want to exploit. The
@@ -145,8 +105,14 @@ For 32-bit we have the following conventions - kernel is built with
* could be put to use in a speculative execution gadget.
* Interleave XOR with PUSH for better uop scheduling:
*/
+ .if \save_ret
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* temporarily store ret address in %rsi */
+ movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original ret) */
+ .else
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
+ .endif
pushq \rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq \rax /* pt_regs->ax */
@@ -171,6 +137,9 @@ For 32-bit we have the following conventions - kernel is built with
pushq %r15 /* pt_regs->r15 */
xorq %r15, %r15 /* nospec r15*/
UNWIND_HINT_REGS
+ .if \save_ret
+ pushq %rsi /* return address on top of stack */
+ .endif
.endm
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
@@ -211,7 +180,7 @@ For 32-bit we have the following conventions - kernel is built with
* is just setting the LSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
- * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
* the original rbp.
*/
.macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9c4fe360db42..a2e41177e390 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -871,10 +871,8 @@ ENTRY(\sym)
pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
- ALLOC_PT_GPREGS_ON_STACK
-
.if \paranoid < 2
- testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
+ testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */
jnz .Lfrom_usermode_switch_stack_\@
.endif
@@ -1128,7 +1126,7 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
cld
- SAVE_AND_CLEAR_REGS 8
+ PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
@@ -1179,7 +1177,7 @@ END(paranoid_exit)
ENTRY(error_entry)
UNWIND_HINT_FUNC
cld
- SAVE_AND_CLEAR_REGS 8
+ PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1571,7 +1569,6 @@ end_repeat_nmi:
* frame to point back to repeat_nmi.
*/
pushq $-1 /* ORIG_RAX: no syscall to restart */
- ALLOC_PT_GPREGS_ON_STACK
/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-07 21:29 ` Dominik Brodowski
2018-02-07 21:58 ` Linus Torvalds
@ 2018-02-08 9:47 ` Ingo Molnar
2018-02-08 17:39 ` Linus Torvalds
2018-02-08 18:35 ` Josh Poimboeuf
2 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2018-02-08 9:47 UTC (permalink / raw)
To: Dominik Brodowski
Cc: Linus Torvalds, Linux Kernel Mailing List,
the arch/x86 maintainers, Dan Williams, Thomas Gleixner,
Andi Kleen, Andrew Lutomirski, Josh Poimboeuf, Brian Gerst,
Denys Vlasenko, Borislav Petkov, Peter Zijlstra
* Dominik Brodowski <linux@dominikbrodowski.net> wrote:
> On Wed, Feb 07, 2018 at 12:44:41PM -0800, Linus Torvalds wrote:
> > On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
> > <linux@dominikbrodowski.net> wrote:
> > >
> > > Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> > > modification. Previously %rsp was manually decreased by 15*8; with
> > > this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> > > error_entry did and does the exact same test (with offset=8) after
> > > the registers have been moved/pushed and cleared.
> >
> > So this has the problem that now those save/clear instructions will
> > all be done in that idtentry macro.
> >
> > So now that code will be duplicated for all the users of that macro.
> >
> > The old code did the saving in the common error_entry and
> > paranoid_entry routines, in order to be able to share all the code,
> > and making the duplicated stub functions generated by the idtentry
> > macro smaller.
> >
> > Now, admittedly the new push sequence is much smaller than the old
> > movq sequence, so the duplication doesn't hurt as much, but it's still
> > likely quite noticeable.
> >
> > So this removes lines of asm code, but it adds a lot of instructions
> > to the end result thanks to the macro, I think.
>
> Indeed, that is the case (see below). However, if we want to switch to
> PUSH instructions and do this in a routine which is call'ed and which
> ret'urns, %rsp needs to be moved around even more often than the old
> ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
> IIUYC). Or do I miss something?
>
> text data bss dec hex filename
> 19500 0 0 19500 4c2c arch/x86/entry/entry_64.o-orig
> 19510 0 0 19510 4c36 arch/x86/entry/entry_64.o-3_of_7
> 21105 0 0 21105 5271 arch/x86/entry/entry_64.o-5_of_7
> 24307 0 0 24307 5ef3 arch/x86/entry/entry_64.o-7_of_7
So while this shows a +~5K increase in text size, these numbers also _very_
significantly over-represent the extra footprint. The assumptions that resulted in
us compressing the IRQ entry code have changed very significantly with the new x86
IRQ allocation code we introduced in the last year:
- IRQ vectors are usually populated in tightly clustered groups.
With our new vector allocator code the typical per CPU allocation percentage on
x86 systems is ~3 device vectors and ~10 fixed vectors out of ~220 vectors -
i.e. a very low ~6% utilization (!). This means that the _real_ text footprint
increase is probably closer to 0.1K of text...
This is a very typical picture on an average desktop system:
/sys/kernel/debug/irq/domains/VECTORS:
Online bitmaps: 40
Global available: 8007
Global reserved: 24
Total allocated: 73
System: 42: 0-19,32,50,128,237-255
| CPU | avl | man | act | vectors
0 199 0 3 33-34,48
1 199 0 3 33-35
2 199 0 3 33-35
3 199 0 3 33-35
4 199 0 3 33-35
5 199 0 3 33-35
6 199 0 3 33-35
7 199 0 3 33-35
8 199 0 3 33-35
9 199 0 3 33-35
10 201 0 1 33
11 201 0 1 33
12 201 0 1 33
13 201 0 1 33
14 201 0 1 33
15 201 0 1 33
16 201 0 1 33
17 201 0 1 33
18 201 0 1 33
19 201 0 1 33
20 199 0 3 33-35
21 199 0 3 33-35
22 199 0 3 33-35
23 199 0 3 33-35
24 199 0 3 33-35
25 199 0 3 33-35
26 199 0 3 33-35
27 199 0 3 33-35
28 199 0 3 33-35
29 199 0 3 33-35
30 201 0 1 33
31 201 0 1 33
32 201 0 1 33
33 202 0 0
34 202 0 0
35 202 0 0
36 202 0 0
37 202 0 0
38 202 0 0
39 202 0 0
I.e. the average number of (device) IRQ vectors per CPU is around 2, with the
max being 3.
The days where we allocated a lot of vectors on every CPU and the compression
of the IRQ entry code text mattered are over.
- Another issue is that only a small minority of vectors is frequent enough to
actually matter to cache utilization in practice: 3-4 key IPIs and 1-2 device
IRQs at most - and those vectors tend to be tightly clustered as well into about
two groups, and are probably already on 2-3 cache lines in practice.
For the common case of 'cache cold' IRQs it's the depth of the call chain and
the fragmentation of the resulting I$ that should be the main performance limit
- not the overall size of it.
- The CPU side cost of IRQ delivery is still very expensive even in the best, most
cached case, as in 'over a thousand cycles'. So much stuff is done that maybe
contemporary x86 IRQ entry microcode already prefetches the IDT entry and its
expected call target address.
So for those reasons I'm really tempted by the all around simplification offered
by this series:
2 files changed, 62 insertions(+), 160 deletions(-)
Basically in this specific case I'd like to turn the argument around,
use this simpler design and put the burden of proof on the patches that
want to _complicate it_ beyond this simple, straightforward model: show us
the numbers that it truly makes a difference: it's not that hard to
spray a CPU with IRQs by using IPIs, and the cache miss rate and the
overall perf counter stats should tell a clear story about whether it
makes sense to cache-compress (and complicate) the IRQ entry sequences.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-08 9:47 ` Ingo Molnar
@ 2018-02-08 17:39 ` Linus Torvalds
0 siblings, 0 replies; 15+ messages in thread
From: Linus Torvalds @ 2018-02-08 17:39 UTC (permalink / raw)
To: Ingo Molnar
Cc: Dominik Brodowski, Linux Kernel Mailing List,
the arch/x86 maintainers, Dan Williams, Thomas Gleixner,
Andi Kleen, Andrew Lutomirski, Josh Poimboeuf, Brian Gerst,
Denys Vlasenko, Borislav Petkov, Peter Zijlstra
On Thu, Feb 8, 2018 at 1:47 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> So for those reasons I'm really tempted by the all around simplification offered
> by this series:
>
> 2 files changed, 62 insertions(+), 160 deletions(-)
>
> Basically in this specific case I'd like to turn the argument around,
> use this simpler design and put the burden of proof on the patches that
> want to _complicate it_ beyond this simple, straightforward model
You make a good argument.
If the only real downside is slightly bigger static footprint (but the
actual real dynamic I$ footprint is not really impacted), then let's
just go for the simpler code.
It really would be lovely if our entry asm was understandable to mere mortals.
Linus
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
2018-02-07 21:29 ` Dominik Brodowski
2018-02-07 21:58 ` Linus Torvalds
2018-02-08 9:47 ` Ingo Molnar
@ 2018-02-08 18:35 ` Josh Poimboeuf
2 siblings, 0 replies; 15+ messages in thread
From: Josh Poimboeuf @ 2018-02-08 18:35 UTC (permalink / raw)
To: Dominik Brodowski
Cc: Linus Torvalds, Linux Kernel Mailing List, Ingo Molnar,
the arch/x86 maintainers, Dan Williams, Thomas Gleixner,
Andi Kleen, Andrew Lutomirski
On Wed, Feb 07, 2018 at 10:29:15PM +0100, Dominik Brodowski wrote:
> On Wed, Feb 07, 2018 at 12:44:41PM -0800, Linus Torvalds wrote:
> > On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
> > <linux@dominikbrodowski.net> wrote:
> > >
> > > Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> > > modification. Previously %rsp was manually decreased by 15*8; with
> > > this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> > > error_entry did and does the exact same test (with offset=8) after
> > > the registers have been moved/pushed and cleared.
> >
> > So this has the problem that now those save/clear instructions will
> > all be done in that idtentry macro.
> >
> > So now that code will be duplicated for all the users of that macro.
> >
> > The old code did the saving in the common error_entry and
> > paranoid_entry routines, in order to be able to share all the code,
> > and making the duplicated stub functions generated by the idtentry
> > macro smaller.
> >
> > Now, admittedly the new push sequence is much smaller than the old
> > movq sequence, so the duplication doesn't hurt as much, but it's still
> > likely quite noticeable.
> >
> > So this removes lines of asm code, but it adds a lot of instructions
> > to the end result thanks to the macro, I think.
>
> Indeed, that is the case (see below). However, if we want to switch to
> PUSH instructions and do this in a routine which is call'ed and which
> ret'urns, %rsp needs to be moved around even more often than the old
> ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
> IIUYC). Or do I miss something?
>
> text data bss dec hex filename
> 19500 0 0 19500 4c2c arch/x86/entry/entry_64.o-orig
> 19510 0 0 19510 4c36 arch/x86/entry/entry_64.o-3_of_7
> 21105 0 0 21105 5271 arch/x86/entry/entry_64.o-5_of_7
> 24307 0 0 24307 5ef3 arch/x86/entry/entry_64.o-7_of_7
>
> In any case, here's a v2.1 for this patch 6/7, which silences an objtool
> warning in error_entry.
>
> Thanks,
> Dominik
I like it. Some minor unwind hint improvements below, feel free to fold
them into this patch:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ae1b09a2e872..d7bd489fa360 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1135,7 +1135,6 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
- UNWIND_HINT_FUNC
cld
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
@@ -1149,7 +1148,7 @@ ENTRY(paranoid_entry)
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
ret
-END(paranoid_entry)
+ENDPROC(paranoid_entry)
/*
* "Paranoid" exit path from exception stack. This is invoked
@@ -1184,7 +1183,6 @@ END(paranoid_exit)
* Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
- UNWIND_HINT_FUNC
UNWIND_HINT_REGS offset=8
cld
testb $3, CS+8(%rsp)
^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2018-02-08 18:35 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
2018-02-07 20:44 ` Linus Torvalds
2018-02-07 21:29 ` Dominik Brodowski
2018-02-07 21:58 ` Linus Torvalds
2018-02-08 7:20 ` Dominik Brodowski
2018-02-08 9:47 ` Ingo Molnar
2018-02-08 17:39 ` Linus Torvalds
2018-02-08 18:35 ` Josh Poimboeuf
2018-02-07 20:15 ` [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly Dominik Brodowski
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).