All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS
@ 2018-02-07 20:15 Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
                   ` (6 more replies)
  0 siblings, 7 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

The starting point for this series was the intention to interleave
the register clearing with register PUSH or MOV instructions, in order
to minimize the cost of the additional instructions required for the
register clearing. While at it, I noticed that a couple of macros in
arch/x86/entry/calling.h are unused and can be cleaned up.

Based on a preliminary version of this patch,[*] Linus suggested to
merge further codepaths and to use PUSH instead of MOV, as this should
be faster on newer CPUs.

[*] http://lkml.kernel.org/r/20180206212546.GA2026@light.dominikbrodowski.net and
    http://lkml.kernel.org/r/20180206213202.GB2026@light.dominikbrodowski.net

NOTE / WARNING: Please be *extremely* stringent in reviewing these
patches, *especially* concerning patch 6/7 (x86/entry: get rid of 
ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS).


Dominik Brodowski (7):
  x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused
    extensions
  x86/entry: merge POP_C_REGS and POP_EXTRA_REGS
  x86/entry: interleave XOR register clearing with PUSH instructions
  x86/entry: introduce PUSH_AND_CLEAN_REGS
  x86/entry: use PUSH_AND_CLEAN_REGS in more cases
  x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly

 arch/x86/entry/calling.h  | 118 ++++++++++++++++++----------------------------
 arch/x86/entry/entry_64.S | 104 +++++++---------------------------------
 2 files changed, 62 insertions(+), 160 deletions(-)

-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions
  2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS Dominik Brodowski
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

All current code paths call SAVE_C_REGS and then immediately
SAVE_EXTRA_REGS. Therefore, merge these two macros and order the MOV
sequeneces properly.

While at it, remove the macros to save all except specific registers,
as these macros have been unused for a long time.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/calling.h  | 57 +++++++++++++----------------------------------
 arch/x86/entry/entry_64.S | 12 ++++------
 2 files changed, 19 insertions(+), 50 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4b129d4af42..8907a6593b42 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -101,49 +101,22 @@ For 32-bit we have the following conventions - kernel is built with
 	addq	$-(15*8), %rsp
 	.endm
 
-	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
-	.if \r11
-	movq %r11, 6*8+\offset(%rsp)
-	.endif
-	.if \r8910
-	movq %r10, 7*8+\offset(%rsp)
-	movq %r9,  8*8+\offset(%rsp)
-	movq %r8,  9*8+\offset(%rsp)
-	.endif
-	.if \rax
-	movq %rax, 10*8+\offset(%rsp)
-	.endif
-	.if \rcx
-	movq %rcx, 11*8+\offset(%rsp)
-	.endif
-	movq %rdx, 12*8+\offset(%rsp)
-	movq %rsi, 13*8+\offset(%rsp)
+	.macro SAVE_REGS offset=0
 	movq %rdi, 14*8+\offset(%rsp)
-	UNWIND_HINT_REGS offset=\offset extra=0
-	.endm
-	.macro SAVE_C_REGS offset=0
-	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
-	SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_R891011
-	SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_RCX_R891011
-	SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
-	SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
-	.endm
-
-	.macro SAVE_EXTRA_REGS offset=0
-	movq %r15, 0*8+\offset(%rsp)
-	movq %r14, 1*8+\offset(%rsp)
-	movq %r13, 2*8+\offset(%rsp)
-	movq %r12, 3*8+\offset(%rsp)
-	movq %rbp, 4*8+\offset(%rsp)
+	movq %rsi, 13*8+\offset(%rsp)
+	movq %rdx, 12*8+\offset(%rsp)
+	movq %rcx, 11*8+\offset(%rsp)
+	movq %rax, 10*8+\offset(%rsp)
+	movq %r8,  9*8+\offset(%rsp)
+	movq %r9,  8*8+\offset(%rsp)
+	movq %r10, 7*8+\offset(%rsp)
+	movq %r11, 6*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
+	movq %rbp, 4*8+\offset(%rsp)
+	movq %r12, 3*8+\offset(%rsp)
+	movq %r13, 2*8+\offset(%rsp)
+	movq %r14, 1*8+\offset(%rsp)
+	movq %r15, 0*8+\offset(%rsp)
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
@@ -197,7 +170,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts
+ * NOTE: This macro must be used *after* SAVE_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9e48002b953b..91e8d84c2496 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -573,8 +573,7 @@ END(irq_entries_start)
 1:
 
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_C_REGS
-	SAVE_EXTRA_REGS
+	SAVE_REGS
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 
@@ -1132,8 +1131,7 @@ ENTRY(xen_failsafe_callback)
 	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_C_REGS
-	SAVE_EXTRA_REGS
+	SAVE_REGS
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
@@ -1178,8 +1176,7 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_C_REGS 8
-	SAVE_EXTRA_REGS 8
+	SAVE_REGS 8
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
@@ -1231,8 +1228,7 @@ END(paranoid_exit)
 ENTRY(error_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_C_REGS 8
-	SAVE_EXTRA_REGS 8
+	SAVE_REGS 8
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS
  2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions Dominik Brodowski
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

The two special, opencoded cases for POP_C_REGS can be handled by ASM
macros.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/calling.h  | 15 +++++++++++----
 arch/x86/entry/entry_64.S | 26 ++++----------------------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 8907a6593b42..3bda31736a7b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -139,25 +139,32 @@ For 32-bit we have the following conventions - kernel is built with
 	xorq %r15, %r15
 	.endm
 
-	.macro POP_EXTRA_REGS
+	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
 	popq %r13
 	popq %r12
 	popq %rbp
 	popq %rbx
-	.endm
-
-	.macro POP_C_REGS
+	.if \skip_r11rcx
+	popq %rsi
+	.else
 	popq %r11
+	.endif
 	popq %r10
 	popq %r9
 	popq %r8
 	popq %rax
+	.if \skip_r11rcx
+	popq %rsi
+	.else
 	popq %rcx
+	.endif
 	popq %rdx
 	popq %rsi
+	.if \pop_rdi
 	popq %rdi
+	.endif
 	.endm
 
 	.macro icebp
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 91e8d84c2496..286b24b650da 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -334,15 +334,7 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	UNWIND_HINT_EMPTY
-	POP_EXTRA_REGS
-	popq	%rsi	/* skip r11 */
-	popq	%r10
-	popq	%r9
-	popq	%r8
-	popq	%rax
-	popq	%rsi	/* skip rcx */
-	popq	%rdx
-	popq	%rsi
+	POP_REGS pop_rdi=0 skip_r11rcx=1
 
 	/*
 	 * Now all regs are restored except RSP and RDI.
@@ -635,15 +627,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	ud2
 1:
 #endif
-	POP_EXTRA_REGS
-	popq	%r11
-	popq	%r10
-	popq	%r9
-	popq	%r8
-	popq	%rax
-	popq	%rcx
-	popq	%rdx
-	popq	%rsi
+	POP_REGS pop_rdi=0
 
 	/*
 	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
@@ -701,8 +685,7 @@ GLOBAL(restore_regs_and_return_to_kernel)
 	ud2
 1:
 #endif
-	POP_EXTRA_REGS
-	POP_C_REGS
+	POP_REGS
 	addq	$8, %rsp	/* skip regs->orig_ax */
 	INTERRUPT_RETURN
 
@@ -1661,8 +1644,7 @@ end_repeat_nmi:
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
-	POP_EXTRA_REGS
-	POP_C_REGS
+	POP_REGS
 
 	/*
 	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions
  2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS Dominik Brodowski
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

Same as is done for syscalls, interleave XOR with PUSH instructions
for exceptions/interrupts, in order to minimize the cost of the
additional instructions required for register clearing.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/calling.h  | 40 +++++++++++++++++++---------------------
 arch/x86/entry/entry_64.S | 30 +++++++++++++++++++++---------
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3bda31736a7b..a05cbb81268d 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -101,44 +101,42 @@ For 32-bit we have the following conventions - kernel is built with
 	addq	$-(15*8), %rsp
 	.endm
 
-	.macro SAVE_REGS offset=0
+	.macro SAVE_AND_CLEAR_REGS offset=0
+	/*
+	 * Save registers and sanitize registers of values that a
+	 * speculation attack might otherwise want to exploit. The
+	 * lower registers are likely clobbered well before they
+	 * could be put to use in a speculative execution gadget.
+	 * Interleave XOR with PUSH for better uop scheduling:
+	 */
 	movq %rdi, 14*8+\offset(%rsp)
 	movq %rsi, 13*8+\offset(%rsp)
 	movq %rdx, 12*8+\offset(%rsp)
 	movq %rcx, 11*8+\offset(%rsp)
 	movq %rax, 10*8+\offset(%rsp)
 	movq %r8,  9*8+\offset(%rsp)
+	xorq %r8, %r8				/* nospec r8 */
 	movq %r9,  8*8+\offset(%rsp)
+	xorq %r9, %r9				/* nospec r9 */
 	movq %r10, 7*8+\offset(%rsp)
+	xorq %r10, %r10				/* nospec r10 */
 	movq %r11, 6*8+\offset(%rsp)
+	xorq %r11, %r11				/* nospec r11 */
 	movq %rbx, 5*8+\offset(%rsp)
+	xorl %ebx, %ebx				/* nospec rbx */
 	movq %rbp, 4*8+\offset(%rsp)
+	xorl %ebp, %ebp				/* nospec rbp */
 	movq %r12, 3*8+\offset(%rsp)
+	xorq %r12, %r12				/* nospec r12 */
 	movq %r13, 2*8+\offset(%rsp)
+	xorq %r13, %r13				/* nospec r13 */
 	movq %r14, 1*8+\offset(%rsp)
+	xorq %r14, %r14				/* nospec r14 */
 	movq %r15, 0*8+\offset(%rsp)
+	xorq %r15, %r15				/* nospec r15 */
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
-	/*
-	 * Sanitize registers of values that a speculation attack
-	 * might otherwise want to exploit. The lower registers are
-	 * likely clobbered well before they could be put to use in
-	 * a speculative execution gadget:
-	 */
-	.macro CLEAR_REGS_NOSPEC
-	xorl %ebp, %ebp
-	xorl %ebx, %ebx
-	xorq %r8, %r8
-	xorq %r9, %r9
-	xorq %r10, %r10
-	xorq %r11, %r11
-	xorq %r12, %r12
-	xorq %r13, %r13
-	xorq %r14, %r14
-	xorq %r15, %r15
-	.endm
-
 	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
@@ -177,7 +175,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_REGS because it corrupts
+ * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 286b24b650da..1194814ee12b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -565,8 +565,7 @@ END(irq_entries_start)
 1:
 
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_REGS
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 
 	testb	$3, CS(%rsp)
@@ -1114,8 +1113,7 @@ ENTRY(xen_failsafe_callback)
 	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_REGS
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
 END(xen_failsafe_callback)
@@ -1159,8 +1157,7 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_REGS 8
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS 8
 	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
@@ -1211,8 +1208,7 @@ END(paranoid_exit)
 ENTRY(error_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_REGS 8
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS 8
 	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
@@ -1399,18 +1395,34 @@ ENTRY(nmi)
 	pushq   (%rdx)		/* pt_regs->dx */
 	pushq   %rcx		/* pt_regs->cx */
 	pushq   %rax		/* pt_regs->ax */
+	/*
+	 * Sanitize registers of values that a speculation attack
+	 * might otherwise want to exploit. The lower registers are
+	 * likely clobbered well before they could be put to use in
+	 * a speculative execution gadget. Interleave XOR with PUSH
+	 * for better uop scheduling:
+	 */
 	pushq   %r8		/* pt_regs->r8 */
+	xorq    %r8, %r8	/* nospec   r8 */
 	pushq   %r9		/* pt_regs->r9 */
+	xorq    %r9, %r9	/* nospec   r9 */
 	pushq   %r10		/* pt_regs->r10 */
+	xorq    %r10, %r10	/* nospec   r10 */
 	pushq   %r11		/* pt_regs->r11 */
+	xorq    %r11, %r11	/* nospec   r11*/
 	pushq	%rbx		/* pt_regs->rbx */
+	xorl    %ebx, %ebx	/* nospec   rbx*/
 	pushq	%rbp		/* pt_regs->rbp */
+	xorl    %ebp, %ebp	/* nospec   rbp*/
 	pushq	%r12		/* pt_regs->r12 */
+	xorq    %r12, %r12	/* nospec   r12*/
 	pushq	%r13		/* pt_regs->r13 */
+	xorq    %r13, %r13	/* nospec   r13*/
 	pushq	%r14		/* pt_regs->r14 */
+	xorq    %r14, %r14	/* nospec   r14*/
 	pushq	%r15		/* pt_regs->r15 */
+	xorq    %r15, %r15	/* nospec   r15*/
 	UNWIND_HINT_REGS
-	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 
 	/*
-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS
  2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
                   ` (2 preceding siblings ...)
  2018-02-07 20:15 ` [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases Dominik Brodowski
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

Those instances where ALLOC_PT_GPREGS_ON_STACK is called just before
SAVE_AND_CLEAR_REGS can trivially be replaced by PUSH_AND_CLEAN_REGS.
This macro uses PUSH instead of MOV and should therefore be faster, at
least on newer CPUs.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/calling.h  | 36 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64.S |  6 ++----
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a05cbb81268d..57b1b87a04f0 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,6 +137,42 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
+	.macro PUSH_AND_CLEAR_REGS
+	/*
+	 * Push registers and sanitize registers of values that a
+	 * speculation attack might otherwise want to exploit. The
+	 * lower registers are likely clobbered well before they
+	 * could be put to use in a speculative execution gadget.
+	 * Interleave XOR with PUSH for better uop scheduling:
+	 */
+	pushq   %rdi		/* pt_regs->di */
+	pushq   %rsi		/* pt_regs->si */
+	pushq   %rdx		/* pt_regs->dx */
+	pushq   %rcx		/* pt_regs->cx */
+	pushq   %rax		/* pt_regs->ax */
+	pushq   %r8		/* pt_regs->r8 */
+	xorq    %r8, %r8	/* nospec   r8 */
+	pushq   %r9		/* pt_regs->r9 */
+	xorq    %r9, %r9	/* nospec   r9 */
+	pushq   %r10		/* pt_regs->r10 */
+	xorq    %r10, %r10	/* nospec   r10 */
+	pushq   %r11		/* pt_regs->r11 */
+	xorq    %r11, %r11	/* nospec   r11*/
+	pushq	%rbx		/* pt_regs->rbx */
+	xorl    %ebx, %ebx	/* nospec   rbx*/
+	pushq	%rbp		/* pt_regs->rbp */
+	xorl    %ebp, %ebp	/* nospec   rbp*/
+	pushq	%r12		/* pt_regs->r12 */
+	xorq    %r12, %r12	/* nospec   r12*/
+	pushq	%r13		/* pt_regs->r13 */
+	xorq    %r13, %r13	/* nospec   r13*/
+	pushq	%r14		/* pt_regs->r14 */
+	xorq    %r14, %r14	/* nospec   r14*/
+	pushq	%r15		/* pt_regs->r15 */
+	xorq    %r15, %r15	/* nospec   r15*/
+	UNWIND_HINT_REGS
+	.endm
+
 	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1194814ee12b..9dd3fbfdc75d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -564,8 +564,7 @@ END(irq_entries_start)
 	call	switch_to_thread_stack
 1:
 
-	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_AND_CLEAR_REGS
+	PUSH_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 
 	testb	$3, CS(%rsp)
@@ -1112,8 +1111,7 @@ ENTRY(xen_failsafe_callback)
 	addq	$0x30, %rsp
 	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
-	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_AND_CLEAR_REGS
+	PUSH_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
 END(xen_failsafe_callback)
-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases
  2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
                   ` (3 preceding siblings ...)
  2018-02-07 20:15 ` [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
  2018-02-07 20:15 ` [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly Dominik Brodowski
  6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

entry_SYSCALL_64_after_hwframe and nmi can be converted to use
PUSH_AND_CLEAN_REGS instead of opencoded variants thereof. Due to
the interleaving, the additional XOR-based clearing of r8 and r9
in entry_SYSCALL_64_after_hwframe should not have any noticeable
negative implications.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/calling.h  |  6 ++---
 arch/x86/entry/entry_64.S | 65 +++--------------------------------------------
 2 files changed, 6 insertions(+), 65 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 57b1b87a04f0..d6a97e2945ee 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,7 +137,7 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
-	.macro PUSH_AND_CLEAR_REGS
+	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
 	/*
 	 * Push registers and sanitize registers of values that a
 	 * speculation attack might otherwise want to exploit. The
@@ -147,9 +147,9 @@ For 32-bit we have the following conventions - kernel is built with
 	 */
 	pushq   %rdi		/* pt_regs->di */
 	pushq   %rsi		/* pt_regs->si */
-	pushq   %rdx		/* pt_regs->dx */
+	pushq	\rdx		/* pt_regs->dx */
 	pushq   %rcx		/* pt_regs->cx */
-	pushq   %rax		/* pt_regs->ax */
+	pushq   \rax		/* pt_regs->ax */
 	pushq   %r8		/* pt_regs->r8 */
 	xorq    %r8, %r8	/* nospec   r8 */
 	pushq   %r9		/* pt_regs->r9 */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9dd3fbfdc75d..9c4fe360db42 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -227,35 +227,8 @@ ENTRY(entry_SYSCALL_64)
 	pushq	%rcx				/* pt_regs->ip */
 GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%rax				/* pt_regs->orig_ax */
-	pushq	%rdi				/* pt_regs->di */
-	pushq	%rsi				/* pt_regs->si */
-	pushq	%rdx				/* pt_regs->dx */
-	pushq	%rcx				/* pt_regs->cx */
-	pushq	$-ENOSYS			/* pt_regs->ax */
-	pushq	%r8				/* pt_regs->r8 */
-	pushq	%r9				/* pt_regs->r9 */
-	pushq	%r10				/* pt_regs->r10 */
-	/*
-	 * Clear extra registers that a speculation attack might
-	 * otherwise want to exploit. Interleave XOR with PUSH
-	 * for better uop scheduling:
-	 */
-	xorq	%r10, %r10			/* nospec   r10 */
-	pushq	%r11				/* pt_regs->r11 */
-	xorq	%r11, %r11			/* nospec   r11 */
-	pushq	%rbx				/* pt_regs->rbx */
-	xorl	%ebx, %ebx			/* nospec   rbx */
-	pushq	%rbp				/* pt_regs->rbp */
-	xorl	%ebp, %ebp			/* nospec   rbp */
-	pushq	%r12				/* pt_regs->r12 */
-	xorq	%r12, %r12			/* nospec   r12 */
-	pushq	%r13				/* pt_regs->r13 */
-	xorq	%r13, %r13			/* nospec   r13 */
-	pushq	%r14				/* pt_regs->r14 */
-	xorq	%r14, %r14			/* nospec   r14 */
-	pushq	%r15				/* pt_regs->r15 */
-	xorq	%r15, %r15			/* nospec   r15 */
-	UNWIND_HINT_REGS
+
+	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
 
 	TRACE_IRQS_OFF
 
@@ -1388,39 +1361,7 @@ ENTRY(nmi)
 	pushq	1*8(%rdx)	/* pt_regs->rip */
 	UNWIND_HINT_IRET_REGS
 	pushq   $-1		/* pt_regs->orig_ax */
-	pushq   %rdi		/* pt_regs->di */
-	pushq   %rsi		/* pt_regs->si */
-	pushq   (%rdx)		/* pt_regs->dx */
-	pushq   %rcx		/* pt_regs->cx */
-	pushq   %rax		/* pt_regs->ax */
-	/*
-	 * Sanitize registers of values that a speculation attack
-	 * might otherwise want to exploit. The lower registers are
-	 * likely clobbered well before they could be put to use in
-	 * a speculative execution gadget. Interleave XOR with PUSH
-	 * for better uop scheduling:
-	 */
-	pushq   %r8		/* pt_regs->r8 */
-	xorq    %r8, %r8	/* nospec   r8 */
-	pushq   %r9		/* pt_regs->r9 */
-	xorq    %r9, %r9	/* nospec   r9 */
-	pushq   %r10		/* pt_regs->r10 */
-	xorq    %r10, %r10	/* nospec   r10 */
-	pushq   %r11		/* pt_regs->r11 */
-	xorq    %r11, %r11	/* nospec   r11*/
-	pushq	%rbx		/* pt_regs->rbx */
-	xorl    %ebx, %ebx	/* nospec   rbx*/
-	pushq	%rbp		/* pt_regs->rbp */
-	xorl    %ebp, %ebp	/* nospec   rbp*/
-	pushq	%r12		/* pt_regs->r12 */
-	xorq    %r12, %r12	/* nospec   r12*/
-	pushq	%r13		/* pt_regs->r13 */
-	xorq    %r13, %r13	/* nospec   r13*/
-	pushq	%r14		/* pt_regs->r14 */
-	xorq    %r14, %r14	/* nospec   r14*/
-	pushq	%r15		/* pt_regs->r15 */
-	xorq    %r15, %r15	/* nospec   r15*/
-	UNWIND_HINT_REGS
+	PUSH_AND_CLEAR_REGS rdx=(%rdx)
 	ENCODE_FRAME_POINTER
 
 	/*
-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
                   ` (4 preceding siblings ...)
  2018-02-07 20:15 ` [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
  2018-02-07 20:44   ` Linus Torvalds
  2018-02-07 20:15 ` [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly Dominik Brodowski
  6 siblings, 1 reply; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

Previously, error_entry() and paranoid_entry() saved the GP registers
onto stack space previously allocated by its callers. Combine these two
steps in the callers, and use the generic PUSH_AND_CLEAR_REGS macro
for that.

Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
modification. Previously %rsp was manually decreased by 15*8; with
this patch, %rsp is decreased by 15 pushq instructions. Moreover,
error_entry did and does the exact same test (with offset=8) after
the registers have been moved/pushed and cleared.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/calling.h  | 42 +-----------------------------------------
 arch/x86/entry/entry_64.S | 15 +++++++--------
 2 files changed, 8 insertions(+), 49 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d6a97e2945ee..59675010c9a0 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,46 +97,6 @@ For 32-bit we have the following conventions - kernel is built with
 
 #define SIZEOF_PTREGS	21*8
 
-	.macro ALLOC_PT_GPREGS_ON_STACK
-	addq	$-(15*8), %rsp
-	.endm
-
-	.macro SAVE_AND_CLEAR_REGS offset=0
-	/*
-	 * Save registers and sanitize registers of values that a
-	 * speculation attack might otherwise want to exploit. The
-	 * lower registers are likely clobbered well before they
-	 * could be put to use in a speculative execution gadget.
-	 * Interleave XOR with PUSH for better uop scheduling:
-	 */
-	movq %rdi, 14*8+\offset(%rsp)
-	movq %rsi, 13*8+\offset(%rsp)
-	movq %rdx, 12*8+\offset(%rsp)
-	movq %rcx, 11*8+\offset(%rsp)
-	movq %rax, 10*8+\offset(%rsp)
-	movq %r8,  9*8+\offset(%rsp)
-	xorq %r8, %r8				/* nospec r8 */
-	movq %r9,  8*8+\offset(%rsp)
-	xorq %r9, %r9				/* nospec r9 */
-	movq %r10, 7*8+\offset(%rsp)
-	xorq %r10, %r10				/* nospec r10 */
-	movq %r11, 6*8+\offset(%rsp)
-	xorq %r11, %r11				/* nospec r11 */
-	movq %rbx, 5*8+\offset(%rsp)
-	xorl %ebx, %ebx				/* nospec rbx */
-	movq %rbp, 4*8+\offset(%rsp)
-	xorl %ebp, %ebp				/* nospec rbp */
-	movq %r12, 3*8+\offset(%rsp)
-	xorq %r12, %r12				/* nospec r12 */
-	movq %r13, 2*8+\offset(%rsp)
-	xorq %r13, %r13				/* nospec r13 */
-	movq %r14, 1*8+\offset(%rsp)
-	xorq %r14, %r14				/* nospec r14 */
-	movq %r15, 0*8+\offset(%rsp)
-	xorq %r15, %r15				/* nospec r15 */
-	UNWIND_HINT_REGS offset=\offset
-	.endm
-
 	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
 	/*
 	 * Push registers and sanitize registers of values that a
@@ -211,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9c4fe360db42..fa56a974d1c1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -871,7 +871,9 @@ ENTRY(\sym)
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 
-	ALLOC_PT_GPREGS_ON_STACK
+	/* Save all registers in pt_regs */
+	PUSH_AND_CLEAR_REGS
+	ENCODE_FRAME_POINTER
 
 	.if \paranoid < 2
 	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
@@ -1121,15 +1123,13 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 #endif
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
  * Use slow, but surefire "are we in kernel?" check.
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_AND_CLEAR_REGS 8
-	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
 	rdmsr
@@ -1173,14 +1173,12 @@ ENTRY(paranoid_exit)
 END(paranoid_exit)
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
  * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_AND_CLEAR_REGS 8
-	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
 
@@ -1571,7 +1569,8 @@ end_repeat_nmi:
 	 * frame to point back to repeat_nmi.
 	 */
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-	ALLOC_PT_GPREGS_ON_STACK
+	PUSH_AND_CLEAR_REGS
+	ENCODE_FRAME_POINTER
 
 	/*
 	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly
  2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
                   ` (5 preceding siblings ...)
  2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
@ 2018-02-07 20:15 ` Dominik Brodowski
  6 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 20:15 UTC (permalink / raw)
  To: linux-kernel, mingo, x86; +Cc: dan.j.williams, tglx, ak, torvalds, luto

... same as the other macros in arch/x86/entry/calling.h

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/calling.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 59675010c9a0..6985440c68fa 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,7 +97,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 #define SIZEOF_PTREGS	21*8
 
-	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
 	/*
 	 * Push registers and sanitize registers of values that a
 	 * speculation attack might otherwise want to exploit. The
@@ -131,9 +131,9 @@ For 32-bit we have the following conventions - kernel is built with
 	pushq	%r15		/* pt_regs->r15 */
 	xorq    %r15, %r15	/* nospec   r15*/
 	UNWIND_HINT_REGS
-	.endm
+.endm
 
-	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
+.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
 	popq %r13
@@ -163,7 +163,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 	.macro icebp
 	.byte 0xf1
-	.endm
+.endm
 
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
-- 
2.16.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
@ 2018-02-07 20:44   ` Linus Torvalds
  2018-02-07 21:29     ` Dominik Brodowski
  0 siblings, 1 reply; 15+ messages in thread
From: Linus Torvalds @ 2018-02-07 20:44 UTC (permalink / raw)
  To: Dominik Brodowski
  Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
	Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski

On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
<linux@dominikbrodowski.net> wrote:
>
> Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> modification. Previously %rsp was manually decreased by 15*8; with
> this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> error_entry did and does the exact same test (with offset=8) after
> the registers have been moved/pushed and cleared.

So this has the problem that now those save/clear instructions will
all be done in that idtentry macro.

So now that code will be duplicated for all the users of that macro.

The old code did the saving in the common error_entry and
paranoid_entry routines, in order to be able to share all the code,
and making the duplicated stub functions generated by the idtentry
macro smaller.

Now, admittedly the new push sequence is much smaller than the old
movq sequence, so the duplication doesn't hurt as much, but it's still
likely quite noticeable.

So this removes lines of asm code, but it adds a lot of instructions
to the end result thanks to the macro, I think.

               Linus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-07 20:44   ` Linus Torvalds
@ 2018-02-07 21:29     ` Dominik Brodowski
  2018-02-07 21:58       ` Linus Torvalds
                         ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-07 21:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
	Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski

On Wed, Feb 07, 2018 at 12:44:41PM -0800, Linus Torvalds wrote:
> On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
> <linux@dominikbrodowski.net> wrote:
> >
> > Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> > modification. Previously %rsp was manually decreased by 15*8; with
> > this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> > error_entry did and does the exact same test (with offset=8) after
> > the registers have been moved/pushed and cleared.
> 
> So this has the problem that now those save/clear instructions will
> all be done in that idtentry macro.
> 
> So now that code will be duplicated for all the users of that macro.
> 
> The old code did the saving in the common error_entry and
> paranoid_entry routines, in order to be able to share all the code,
> and making the duplicated stub functions generated by the idtentry
> macro smaller.
> 
> Now, admittedly the new push sequence is much smaller than the old
> movq sequence, so the duplication doesn't hurt as much, but it's still
> likely quite noticeable.
> 
> So this removes lines of asm code, but it adds a lot of instructions
> to the end result thanks to the macro, I think.

Indeed, that is the case (see below). However, if we want to switch to
PUSH instructions and do this in a routine which is call'ed and which
ret'urns, %rsp needs to be moved around even more often than the old
ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
IIUYC). Or do I miss something?

   text	   data	    bss	    dec	    hex	filename
  19500	      0	      0	  19500	   4c2c	arch/x86/entry/entry_64.o-orig
  19510	      0	      0	  19510	   4c36	arch/x86/entry/entry_64.o-3_of_7
  21105	      0	      0	  21105	   5271	arch/x86/entry/entry_64.o-5_of_7
  24307	      0	      0	  24307	   5ef3	arch/x86/entry/entry_64.o-7_of_7

In any case, here's a v2.1 for this patch 6/7, which silences an objtool
warning in error_entry.

Thanks,
	Dominik

----------------------------------------------------
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Wed, 7 Feb 2018 20:56:13 +0100
Subject: [PATCH] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS

Previously, error_entry() and paranoid_entry() saved the GP registers
onto stack space previously allocated by its callers. Combine these two
steps in the callers, and use the generic PUSH_AND_CLEAR_REGS macro
for that.

Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
modification. Previously %rsp was manually decreased by 15*8; with
this patch, %rsp is decreased by 15 pushq instructions. Moreover,
error_entry did and does the exact same test (with offset=8) after
the registers have been moved/pushed and cleared.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d6a97e2945ee..59675010c9a0 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,46 +97,6 @@ For 32-bit we have the following conventions - kernel is built with
 
 #define SIZEOF_PTREGS	21*8
 
-	.macro ALLOC_PT_GPREGS_ON_STACK
-	addq	$-(15*8), %rsp
-	.endm
-
-	.macro SAVE_AND_CLEAR_REGS offset=0
-	/*
-	 * Save registers and sanitize registers of values that a
-	 * speculation attack might otherwise want to exploit. The
-	 * lower registers are likely clobbered well before they
-	 * could be put to use in a speculative execution gadget.
-	 * Interleave XOR with PUSH for better uop scheduling:
-	 */
-	movq %rdi, 14*8+\offset(%rsp)
-	movq %rsi, 13*8+\offset(%rsp)
-	movq %rdx, 12*8+\offset(%rsp)
-	movq %rcx, 11*8+\offset(%rsp)
-	movq %rax, 10*8+\offset(%rsp)
-	movq %r8,  9*8+\offset(%rsp)
-	xorq %r8, %r8				/* nospec r8 */
-	movq %r9,  8*8+\offset(%rsp)
-	xorq %r9, %r9				/* nospec r9 */
-	movq %r10, 7*8+\offset(%rsp)
-	xorq %r10, %r10				/* nospec r10 */
-	movq %r11, 6*8+\offset(%rsp)
-	xorq %r11, %r11				/* nospec r11 */
-	movq %rbx, 5*8+\offset(%rsp)
-	xorl %ebx, %ebx				/* nospec rbx */
-	movq %rbp, 4*8+\offset(%rsp)
-	xorl %ebp, %ebp				/* nospec rbp */
-	movq %r12, 3*8+\offset(%rsp)
-	xorq %r12, %r12				/* nospec r12 */
-	movq %r13, 2*8+\offset(%rsp)
-	xorq %r13, %r13				/* nospec r13 */
-	movq %r14, 1*8+\offset(%rsp)
-	xorq %r14, %r14				/* nospec r14 */
-	movq %r15, 0*8+\offset(%rsp)
-	xorq %r15, %r15				/* nospec r15 */
-	UNWIND_HINT_REGS offset=\offset
-	.endm
-
 	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
 	/*
 	 * Push registers and sanitize registers of values that a
@@ -211,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9c4fe360db42..c6e5d44eb322 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -871,7 +871,9 @@ ENTRY(\sym)
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 
-	ALLOC_PT_GPREGS_ON_STACK
+	/* Save all registers in pt_regs */
+	PUSH_AND_CLEAR_REGS
+	ENCODE_FRAME_POINTER
 
 	.if \paranoid < 2
 	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
@@ -1121,15 +1123,13 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 #endif
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
  * Use slow, but surefire "are we in kernel?" check.
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_AND_CLEAR_REGS 8
-	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
 	rdmsr
@@ -1173,14 +1173,13 @@ ENTRY(paranoid_exit)
 END(paranoid_exit)
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
  * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
 	UNWIND_HINT_FUNC
+	UNWIND_HINT_REGS offset=8
 	cld
-	SAVE_AND_CLEAR_REGS 8
-	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
 
@@ -1571,7 +1570,8 @@ end_repeat_nmi:
 	 * frame to point back to repeat_nmi.
 	 */
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-	ALLOC_PT_GPREGS_ON_STACK
+	PUSH_AND_CLEAR_REGS
+	ENCODE_FRAME_POINTER
 
 	/*
 	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-07 21:29     ` Dominik Brodowski
@ 2018-02-07 21:58       ` Linus Torvalds
  2018-02-08  7:20         ` Dominik Brodowski
  2018-02-08  9:47       ` Ingo Molnar
  2018-02-08 18:35       ` Josh Poimboeuf
  2 siblings, 1 reply; 15+ messages in thread
From: Linus Torvalds @ 2018-02-07 21:58 UTC (permalink / raw)
  To: Dominik Brodowski
  Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
	Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski

On Wed, Feb 7, 2018 at 1:29 PM, Dominik Brodowski
<linux@dominikbrodowski.net> wrote:
>>
>> So this removes lines of asm code, but it adds a lot of instructions
>> to the end result thanks to the macro, I think.
>
> Indeed, that is the case (see below). However, if we want to switch to
> PUSH instructions and do this in a routine which is call'ed and which
> ret'urns, %rsp needs to be moved around even more often than the old
> ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
> IIUYC). Or do I miss something?

So I agree that your approach makes for a lot simpler stack setup.

I was just hoping that we could play some tricks.

For example, right now your PUSH_AND_CLEAR_REGS starts off with

        pushq   %rdi            /* pt_regs->di */
        pushq   %rsi            /* pt_regs->si */
        pushq   %rdx            /* pt_regs->dx */
        pushq   %rcx            /* pt_regs->cx */
        ....

and maybe we could still use this in paranoid_entry and error_entry if
we made it something like

/* if 'save_ret' is set, we pop the return point into %rsi */
.macro PUSH_AND_CLEAR_REGS save_ret=0
        .if \save_ret
        pushq %%rsi
        movq 8(%%rsp),%rsi
        movq %%rdi,8(%%rsp)
        .else
        pushq   %rdi            /* pt_regs->di */
        pushq   %rsi            /* pt_regs->si */
        .endif
        pushq   %rdx            /* pt_regs->dx */
        pushq   %rcx            /* pt_regs->cx */
        ....

which would allow error_entry and paranoid_entry to do something like this:

        PUSH_AND_CLEAR_REGS save_ret=1
        pushq %rsi
        ... do the other common code ..
        ret

(totally untested, I'm just doing a "stream-of-consciousness" thing in
the email.

See what I'm saying?

That said, maybe the pushq sequence is now so small that it doesn't
even matter, and duplicating it isn't a big problem.

Because your version sure is simpler.

         Linus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-07 21:58       ` Linus Torvalds
@ 2018-02-08  7:20         ` Dominik Brodowski
  0 siblings, 0 replies; 15+ messages in thread
From: Dominik Brodowski @ 2018-02-08  7:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, Ingo Molnar, the arch/x86 maintainers,
	Dan Williams, Thomas Gleixner, Andi Kleen, Andrew Lutomirski

On Wed, Feb 07, 2018 at 01:58:20PM -0800, Linus Torvalds wrote:
> I was just hoping that we could play some tricks.
> 
> [...]
> 
> See what I'm saying?

Clever. Though I'd include the "pushq %rsi" in the macro, to be even more
tricky.

   text    data     bss     dec     hex filename
  19500       0       0   19500    4c2c arch/x86/entry/entry_64.o-orig
  24307       0       0   24307    5ef3 arch/x86/entry/entry_64.o-7_of_7
  20987	      0	      0	  20987	   51fb arch/x86/entry/entry_64.o-trick

I'm not really sure yet where the increase in text size comes from in my
patch set, though.

I *hope* that the CS-ORIG_RAX(%rsp) testb is correct; again, excercise
an extremely stringent review of this patch, please.

Thanks,
	Dominik

--------------------------------------------------------
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Wed, 7 Feb 2018 20:56:13 +0100
Subject: [PATCH] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS

Previously, error_entry() and paranoid_entry() saved the GP registers
onto stack space previously allocated by its callers. Combine these two
steps in the callee, and use the generic PUSH_AND_CLEAR_REGS macro
for that, but play a litte trick in it -- suggested by Linus -- to insert
the GP registers "above" the original return address.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d6a97e2945ee..dc60365547e1 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,47 +97,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 #define SIZEOF_PTREGS	21*8
 
-	.macro ALLOC_PT_GPREGS_ON_STACK
-	addq	$-(15*8), %rsp
-	.endm
-
-	.macro SAVE_AND_CLEAR_REGS offset=0
-	/*
-	 * Save registers and sanitize registers of values that a
-	 * speculation attack might otherwise want to exploit. The
-	 * lower registers are likely clobbered well before they
-	 * could be put to use in a speculative execution gadget.
-	 * Interleave XOR with PUSH for better uop scheduling:
-	 */
-	movq %rdi, 14*8+\offset(%rsp)
-	movq %rsi, 13*8+\offset(%rsp)
-	movq %rdx, 12*8+\offset(%rsp)
-	movq %rcx, 11*8+\offset(%rsp)
-	movq %rax, 10*8+\offset(%rsp)
-	movq %r8,  9*8+\offset(%rsp)
-	xorq %r8, %r8				/* nospec r8 */
-	movq %r9,  8*8+\offset(%rsp)
-	xorq %r9, %r9				/* nospec r9 */
-	movq %r10, 7*8+\offset(%rsp)
-	xorq %r10, %r10				/* nospec r10 */
-	movq %r11, 6*8+\offset(%rsp)
-	xorq %r11, %r11				/* nospec r11 */
-	movq %rbx, 5*8+\offset(%rsp)
-	xorl %ebx, %ebx				/* nospec rbx */
-	movq %rbp, 4*8+\offset(%rsp)
-	xorl %ebp, %ebp				/* nospec rbp */
-	movq %r12, 3*8+\offset(%rsp)
-	xorq %r12, %r12				/* nospec r12 */
-	movq %r13, 2*8+\offset(%rsp)
-	xorq %r13, %r13				/* nospec r13 */
-	movq %r14, 1*8+\offset(%rsp)
-	xorq %r14, %r14				/* nospec r14 */
-	movq %r15, 0*8+\offset(%rsp)
-	xorq %r15, %r15				/* nospec r15 */
-	UNWIND_HINT_REGS offset=\offset
-	.endm
-
-	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
+	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
 	/*
 	 * Push registers and sanitize registers of values that a
 	 * speculation attack might otherwise want to exploit. The
@@ -145,8 +105,14 @@ For 32-bit we have the following conventions - kernel is built with
 	 * could be put to use in a speculative execution gadget.
 	 * Interleave XOR with PUSH for better uop scheduling:
 	 */
+	.if \save_ret
+	pushq	%rsi		/* pt_regs->si */
+	movq	8(%rsp), %rsi	/* temporarily store ret address in %rsi */
+	movq	%rdi, 8(%rsp)	/* pt_regs->di (overwriting original ret) */
+	.else
 	pushq   %rdi		/* pt_regs->di */
 	pushq   %rsi		/* pt_regs->si */
+	.endif
 	pushq	\rdx		/* pt_regs->dx */
 	pushq   %rcx		/* pt_regs->cx */
 	pushq   \rax		/* pt_regs->ax */
@@ -171,6 +137,9 @@ For 32-bit we have the following conventions - kernel is built with
 	pushq	%r15		/* pt_regs->r15 */
 	xorq    %r15, %r15	/* nospec   r15*/
 	UNWIND_HINT_REGS
+	.if \save_ret
+	pushq	%rsi		/* return address on top of stack */
+	.endif
 	.endm
 
 	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
@@ -211,7 +180,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9c4fe360db42..a2e41177e390 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -871,10 +871,8 @@ ENTRY(\sym)
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 
-	ALLOC_PT_GPREGS_ON_STACK
-
 	.if \paranoid < 2
-	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
+	testb	$3, CS-ORIG_RAX(%rsp)		/* If coming from userspace, switch stacks */
 	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
 
@@ -1128,7 +1126,7 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_AND_CLEAR_REGS 8
+	PUSH_AND_CLEAR_REGS save_ret=1
 	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
@@ -1179,7 +1177,7 @@ END(paranoid_exit)
 ENTRY(error_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_AND_CLEAR_REGS 8
+	PUSH_AND_CLEAR_REGS save_ret=1
 	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
@@ -1571,7 +1569,6 @@ end_repeat_nmi:
 	 * frame to point back to repeat_nmi.
 	 */
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-	ALLOC_PT_GPREGS_ON_STACK
 
 	/*
 	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-07 21:29     ` Dominik Brodowski
  2018-02-07 21:58       ` Linus Torvalds
@ 2018-02-08  9:47       ` Ingo Molnar
  2018-02-08 17:39         ` Linus Torvalds
  2018-02-08 18:35       ` Josh Poimboeuf
  2 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2018-02-08  9:47 UTC (permalink / raw)
  To: Dominik Brodowski
  Cc: Linus Torvalds, Linux Kernel Mailing List,
	the arch/x86 maintainers, Dan Williams, Thomas Gleixner,
	Andi Kleen, Andrew Lutomirski, Josh Poimboeuf, Brian Gerst,
	Denys Vlasenko, Borislav Petkov, Peter Zijlstra


* Dominik Brodowski <linux@dominikbrodowski.net> wrote:

> On Wed, Feb 07, 2018 at 12:44:41PM -0800, Linus Torvalds wrote:
> > On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
> > <linux@dominikbrodowski.net> wrote:
> > >
> > > Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> > > modification. Previously %rsp was manually decreased by 15*8; with
> > > this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> > > error_entry did and does the exact same test (with offset=8) after
> > > the registers have been moved/pushed and cleared.
> > 
> > So this has the problem that now those save/clear instructions will
> > all be done in that idtentry macro.
> > 
> > So now that code will be duplicated for all the users of that macro.
> > 
> > The old code did the saving in the common error_entry and
> > paranoid_entry routines, in order to be able to share all the code,
> > and making the duplicated stub functions generated by the idtentry
> > macro smaller.
> > 
> > Now, admittedly the new push sequence is much smaller than the old
> > movq sequence, so the duplication doesn't hurt as much, but it's still
> > likely quite noticeable.
> > 
> > So this removes lines of asm code, but it adds a lot of instructions
> > to the end result thanks to the macro, I think.
> 
> Indeed, that is the case (see below). However, if we want to switch to
> PUSH instructions and do this in a routine which is call'ed and which
> ret'urns, %rsp needs to be moved around even more often than the old
> ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
> IIUYC). Or do I miss something?
> 
>    text	   data	    bss	    dec	    hex	filename
>   19500	      0	      0	  19500	   4c2c	arch/x86/entry/entry_64.o-orig
>   19510	      0	      0	  19510	   4c36	arch/x86/entry/entry_64.o-3_of_7
>   21105	      0	      0	  21105	   5271	arch/x86/entry/entry_64.o-5_of_7
>   24307	      0	      0	  24307	   5ef3	arch/x86/entry/entry_64.o-7_of_7

So while this shows a +~5K increase in text size, these numbers also _very_ 
significantly over-represent the extra footprint. The assumptions that resulted in 
us compressing the IRQ entry code have changed very significantly with the new x86 
IRQ allocation code we introduced in the last year:

- IRQ vectors are usually populated in tightly clustered groups.

  With our new vector allocator code the typical per CPU allocation percentage on
  x86 systems is ~3 device vectors and ~10 fixed vectors out of ~220 vectors -
  i.e. a very low ~6% utilization (!). This means that the _real_ text footprint
  increase is probably closer to 0.1K of text...

  This is a very typical picture on an average desktop system:

  /sys/kernel/debug/irq/domains/VECTORS:

   Online bitmaps:       40
   Global available:   8007
   Global reserved:      24
   Total allocated:      73
   System: 42: 0-19,32,50,128,237-255

 | CPU | avl | man | act | vectors
     0   199     0     3  33-34,48
     1   199     0     3  33-35
     2   199     0     3  33-35
     3   199     0     3  33-35
     4   199     0     3  33-35
     5   199     0     3  33-35
     6   199     0     3  33-35
     7   199     0     3  33-35
     8   199     0     3  33-35
     9   199     0     3  33-35
    10   201     0     1  33
    11   201     0     1  33
    12   201     0     1  33
    13   201     0     1  33
    14   201     0     1  33
    15   201     0     1  33
    16   201     0     1  33
    17   201     0     1  33
    18   201     0     1  33
    19   201     0     1  33
    20   199     0     3  33-35
    21   199     0     3  33-35
    22   199     0     3  33-35
    23   199     0     3  33-35
    24   199     0     3  33-35
    25   199     0     3  33-35
    26   199     0     3  33-35
    27   199     0     3  33-35
    28   199     0     3  33-35
    29   199     0     3  33-35
    30   201     0     1  33
    31   201     0     1  33
    32   201     0     1  33
    33   202     0     0  
    34   202     0     0  
    35   202     0     0  
    36   202     0     0  
    37   202     0     0  
    38   202     0     0  
    39   202     0     0  

  I.e. the average number of (device) IRQ vectors per CPU is around 2, with the 
  max being 3.

  The days where we allocated a lot of vectors on every CPU and the compression 
  of the IRQ entry code text mattered are over.

- Another issue is that only a small minority of vectors is frequent enough to
  actually matter to cache utilization in practice: 3-4 key IPIs and 1-2 device 
  IRQs at most - and those vectors tend to be tightly clustered as well into about 
  two groups, and are probably already on 2-3 cache lines in practice.

  For the common case of 'cache cold' IRQs it's the depth of the call chain and 
  the fragmentation of the resulting I$ that should be the main performance limit
  - not the overall size of it.

- The CPU side cost of IRQ delivery is still very expensive even in the best, most 
  cached case, as in 'over a thousand cycles'. So much stuff is done that maybe 
  contemporary x86 IRQ entry microcode already prefetches the IDT entry and its 
  expected call target address.

So for those reasons I'm really tempted by the all around simplification offered 
by this series:

   2 files changed, 62 insertions(+), 160 deletions(-)

Basically in this specific case I'd like to turn the argument around,
use this simpler design and put the burden of proof on the patches that
want to _complicate it_ beyond this simple, straightforward model: show us
the numbers that it truly makes a difference: it's not that hard to 
spray a CPU with IRQs by using IPIs, and the cache miss rate and the
overall perf counter stats should tell a clear story about whether it
makes sense to cache-compress (and complicate) the IRQ entry sequences.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-08  9:47       ` Ingo Molnar
@ 2018-02-08 17:39         ` Linus Torvalds
  0 siblings, 0 replies; 15+ messages in thread
From: Linus Torvalds @ 2018-02-08 17:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Dominik Brodowski, Linux Kernel Mailing List,
	the arch/x86 maintainers, Dan Williams, Thomas Gleixner,
	Andi Kleen, Andrew Lutomirski, Josh Poimboeuf, Brian Gerst,
	Denys Vlasenko, Borislav Petkov, Peter Zijlstra

On Thu, Feb 8, 2018 at 1:47 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> So for those reasons I'm really tempted by the all around simplification offered
> by this series:
>
>    2 files changed, 62 insertions(+), 160 deletions(-)
>
> Basically in this specific case I'd like to turn the argument around,
> use this simpler design and put the burden of proof on the patches that
> want to _complicate it_ beyond this simple, straightforward model

You make a good argument.

If the only real downside is slightly bigger static footprint (but the
actual real dynamic I$ footprint is not really impacted), then let's
just go for the simpler code.

It really would be lovely if our entry asm was understandable to mere mortals.

               Linus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS
  2018-02-07 21:29     ` Dominik Brodowski
  2018-02-07 21:58       ` Linus Torvalds
  2018-02-08  9:47       ` Ingo Molnar
@ 2018-02-08 18:35       ` Josh Poimboeuf
  2 siblings, 0 replies; 15+ messages in thread
From: Josh Poimboeuf @ 2018-02-08 18:35 UTC (permalink / raw)
  To: Dominik Brodowski
  Cc: Linus Torvalds, Linux Kernel Mailing List, Ingo Molnar,
	the arch/x86 maintainers, Dan Williams, Thomas Gleixner,
	Andi Kleen, Andrew Lutomirski

On Wed, Feb 07, 2018 at 10:29:15PM +0100, Dominik Brodowski wrote:
> On Wed, Feb 07, 2018 at 12:44:41PM -0800, Linus Torvalds wrote:
> > On Wed, Feb 7, 2018 at 12:15 PM, Dominik Brodowski
> > <linux@dominikbrodowski.net> wrote:
> > >
> > > Note: The testb $3, CS(%rsp) instruction in idtentry() does not need
> > > modification. Previously %rsp was manually decreased by 15*8; with
> > > this patch, %rsp is decreased by 15 pushq instructions. Moreover,
> > > error_entry did and does the exact same test (with offset=8) after
> > > the registers have been moved/pushed and cleared.
> > 
> > So this has the problem that now those save/clear instructions will
> > all be done in that idtentry macro.
> > 
> > So now that code will be duplicated for all the users of that macro.
> > 
> > The old code did the saving in the common error_entry and
> > paranoid_entry routines, in order to be able to share all the code,
> > and making the duplicated stub functions generated by the idtentry
> > macro smaller.
> > 
> > Now, admittedly the new push sequence is much smaller than the old
> > movq sequence, so the duplication doesn't hurt as much, but it's still
> > likely quite noticeable.
> > 
> > So this removes lines of asm code, but it adds a lot of instructions
> > to the end result thanks to the macro, I think.
> 
> Indeed, that is the case (see below). However, if we want to switch to
> PUSH instructions and do this in a routine which is call'ed and which
> ret'urns, %rsp needs to be moved around even more often than the old
> ALLOC_PT_GPREGS_ON_STACK macro did (which you wanted to get rid of,
> IIUYC). Or do I miss something?
> 
>    text	   data	    bss	    dec	    hex	filename
>   19500	      0	      0	  19500	   4c2c	arch/x86/entry/entry_64.o-orig
>   19510	      0	      0	  19510	   4c36	arch/x86/entry/entry_64.o-3_of_7
>   21105	      0	      0	  21105	   5271	arch/x86/entry/entry_64.o-5_of_7
>   24307	      0	      0	  24307	   5ef3	arch/x86/entry/entry_64.o-7_of_7
> 
> In any case, here's a v2.1 for this patch 6/7, which silences an objtool
> warning in error_entry.
> 
> Thanks,
> 	Dominik

I like it.  Some minor unwind hint improvements below, feel free to fold
them into this patch:

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ae1b09a2e872..d7bd489fa360 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1135,7 +1135,6 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
-	UNWIND_HINT_FUNC
 	cld
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
@@ -1149,7 +1148,7 @@ ENTRY(paranoid_entry)
 	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 
 	ret
-END(paranoid_entry)
+ENDPROC(paranoid_entry)
 
 /*
  * "Paranoid" exit path from exception stack.  This is invoked
@@ -1184,7 +1183,6 @@ END(paranoid_exit)
  * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
-	UNWIND_HINT_FUNC
 	UNWIND_HINT_REGS offset=8
 	cld
 	testb	$3, CS+8(%rsp)

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2018-02-08 18:35 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-07 20:15 [RFC v2 PATCH 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 3/7] x86/entry: interleave XOR register clearing with PUSH instructions Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases Dominik Brodowski
2018-02-07 20:15 ` [RFC v2 PATCH 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
2018-02-07 20:44   ` Linus Torvalds
2018-02-07 21:29     ` Dominik Brodowski
2018-02-07 21:58       ` Linus Torvalds
2018-02-08  7:20         ` Dominik Brodowski
2018-02-08  9:47       ` Ingo Molnar
2018-02-08 17:39         ` Linus Torvalds
2018-02-08 18:35       ` Josh Poimboeuf
2018-02-07 20:15 ` [RFC v2 PATCH 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly Dominik Brodowski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.