linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH 0/2] x86/retpoline: Retpoline on a diet
@ 2021-02-18 16:59 Peter Zijlstra
  2021-02-18 16:59 ` [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-02-18 16:59 UTC (permalink / raw)
  To: x86, tony.luck, pjt; +Cc: linux-kernel, peterz, r.marek, jpoimboe, jikos

Hi!

The first patch rearranges the implementation and consolidates unused bytes.
The second patch uses INT3 over LFENCE to shrink the retpoline to 15 bytes, by
which 4 can live in a cacheline.

Patches have been boot tested on my IVB.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines
  2021-02-18 16:59 [RFC][PATCH 0/2] x86/retpoline: Retpoline on a diet Peter Zijlstra
@ 2021-02-18 16:59 ` Peter Zijlstra
  2021-02-22 11:36   ` Peter Zijlstra
  2021-02-18 16:59 ` [RFC][PATCH 2/2] x86/retpoline: Compress retpolines Peter Zijlstra
  2021-02-18 18:46 ` [RFC PATCH] x86/retpolines: Prevent speculation after RET Borislav Petkov
  2 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-02-18 16:59 UTC (permalink / raw)
  To: x86, tony.luck, pjt; +Cc: linux-kernel, peterz, r.marek, jpoimboe, jikos

Currently our retpolines consist of 2 symbols,
__x86_indirect_thunk_\reg, which is the compiler target, and
__x86_retpoline_\reg, which is the actual retpoline. Both are
consecutive in code and aligned such that for any one register they
both live in the same cacheline:

  0000000000000000 <__x86_indirect_thunk_rax>:
   0:   ff e0                   jmpq   *%rax
   2:   90                      nop
   3:   90                      nop
   4:   90                      nop

  0000000000000005 <__x86_retpoline_rax>:
   5:   e8 07 00 00 00          callq  11 <__x86_retpoline_rax+0xc>
   a:   f3 90                   pause
   c:   0f ae e8                lfence
   f:   eb f9                   jmp    a <__x86_retpoline_rax+0x5>
  11:   48 89 04 24             mov    %rax,(%rsp)
  15:   c3                      retq
  16:   66 2e 0f 1f 84 00 00 00 00 00   nopw   %cs:0x0(%rax,%rax,1)

The thunk is an alternative_2, where one option is a jmp to the
retpoline. Observe that we can fold the entire retpoline into the
alternative to simplify and consolidate unused bytes:

  0000000000000000 <__x86_indirect_thunk_rax>:
   0:   ff e0                   jmpq   *%rax
   2:   90                      nop
   3:   90                      nop
   4:   90                      nop
   5:   90                      nop
   6:   90                      nop
   7:   90                      nop
   8:   90                      nop
   9:   90                      nop
   a:   90                      nop
   b:   90                      nop
   c:   90                      nop
   d:   90                      nop
   e:   90                      nop
   f:   90                      nop
  10:   90                      nop
  11:   66 66 2e 0f 1f 84 00 00 00 00 00        data16 nopw %cs:0x0(%rax,%rax,1)
  1c:   0f 1f 40 00             nopl   0x0(%rax)

Notice that since our longest alternative sequence is now:

   0:   e8 07 00 00 00          callq  c <.altinstr_replacement+0xc>
   5:   f3 90                   pause
   7:   0f ae e8                lfence
   a:   eb f9                   jmp    5 <.altinstr_replacement+0x5>
   c:   48 89 04 24             mov    %rax,(%rsp)
  10:   c3                      retq

17 bytes, we have 15 bytes NOP at the end of our 32 byte slot.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/asm-prototypes.h |    7 -------
 arch/x86/include/asm/nospec-branch.h  |    6 +++---
 arch/x86/lib/retpoline.S              |   34 +++++++++++++++++-----------------
 tools/objtool/check.c                 |    3 +--
 4 files changed, 21 insertions(+), 29 deletions(-)

--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -22,15 +22,8 @@ extern void cmpxchg8b_emu(void);
 #define DECL_INDIRECT_THUNK(reg) \
 	extern asmlinkage void __x86_indirect_thunk_ ## reg (void);
 
-#define DECL_RETPOLINE(reg) \
-	extern asmlinkage void __x86_retpoline_ ## reg (void);
-
 #undef GEN
 #define GEN(reg) DECL_INDIRECT_THUNK(reg)
 #include <asm/GEN-for-each-reg.h>
 
-#undef GEN
-#define GEN(reg) DECL_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
-
 #endif /* CONFIG_RETPOLINE */
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -81,7 +81,7 @@
 .macro JMP_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
 	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
-		      __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+		      __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
 		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
 #else
 	jmp	*%\reg
@@ -91,7 +91,7 @@
 .macro CALL_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
 	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
-		      __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+		      __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
 		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD
 #else
 	call	*%\reg
@@ -129,7 +129,7 @@
 	ALTERNATIVE_2(						\
 	ANNOTATE_RETPOLINE_SAFE					\
 	"call *%[thunk_target]\n",				\
-	"call __x86_retpoline_%V[thunk_target]\n",		\
+	"call __x86_indirect_thunk_%V[thunk_target]\n",		\
 	X86_FEATURE_RETPOLINE,					\
 	"lfence;\n"						\
 	ANNOTATE_RETPOLINE_SAFE					\
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -10,27 +10,31 @@
 #include <asm/unwind_hints.h>
 #include <asm/frame.h>
 
-.macro THUNK reg
-	.section .text.__x86.indirect_thunk
-
-	.align 32
-SYM_FUNC_START(__x86_indirect_thunk_\reg)
-	JMP_NOSPEC \reg
-SYM_FUNC_END(__x86_indirect_thunk_\reg)
-
-SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
+.macro RETPOLINE reg
 	ANNOTATE_INTRA_FUNCTION_CALL
-	call	.Ldo_rop_\@
+	call    .Ldo_rop_\@
 .Lspec_trap_\@:
 	UNWIND_HINT_EMPTY
 	pause
 	lfence
-	jmp	.Lspec_trap_\@
+	jmp .Lspec_trap_\@
 .Ldo_rop_\@:
-	mov	%\reg, (%_ASM_SP)
+	mov     %\reg, (%_ASM_SP)
 	UNWIND_HINT_FUNC
 	ret
-SYM_FUNC_END(__x86_retpoline_\reg)
+.endm
+
+.macro THUNK reg
+	.section .text.__x86.indirect_thunk
+
+	.align 32
+SYM_FUNC_START(__x86_indirect_thunk_\reg)
+
+	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
+		      __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
+		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+
+SYM_FUNC_END(__x86_indirect_thunk_\reg)
 
 .endm
 
@@ -48,7 +52,6 @@ SYM_FUNC_END(__x86_retpoline_\reg)
 
 #define __EXPORT_THUNK(sym)	_ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
 #define EXPORT_THUNK(reg)	__EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
-#define EXPORT_RETPOLINE(reg)  __EXPORT_THUNK(__x86_retpoline_ ## reg)
 
 #undef GEN
 #define GEN(reg) THUNK reg
@@ -58,6 +61,3 @@ SYM_FUNC_END(__x86_retpoline_\reg)
 #define GEN(reg) EXPORT_THUNK(reg)
 #include <asm/GEN-for-each-reg.h>
 
-#undef GEN
-#define GEN(reg) EXPORT_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -800,8 +800,7 @@ static int add_jump_destinations(struct
 		} else if (reloc->sym->type == STT_SECTION) {
 			dest_sec = reloc->sym->sec;
 			dest_off = arch_dest_reloc_offset(reloc->addend);
-		} else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21) ||
-			   !strncmp(reloc->sym->name, "__x86_retpoline_", 16)) {
+		} else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21)) {
 			/*
 			 * Retpoline jumps are really dynamic jumps in
 			 * disguise, so convert them accordingly.



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC][PATCH 2/2] x86/retpoline: Compress retpolines
  2021-02-18 16:59 [RFC][PATCH 0/2] x86/retpoline: Retpoline on a diet Peter Zijlstra
  2021-02-18 16:59 ` [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines Peter Zijlstra
@ 2021-02-18 16:59 ` Peter Zijlstra
  2021-02-19  7:14   ` Borislav Petkov
  2021-02-18 18:46 ` [RFC PATCH] x86/retpolines: Prevent speculation after RET Borislav Petkov
  2 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-02-18 16:59 UTC (permalink / raw)
  To: x86, tony.luck, pjt; +Cc: linux-kernel, peterz, r.marek, jpoimboe, jikos

By using int3 as a speculation fence instead of lfence, we can shrink
the longest alternative to just 15 bytes:

  0:   e8 05 00 00 00          callq  a <.altinstr_replacement+0xa>
  5:   f3 90                   pause  
  7:   cc                      int3   
  8:   eb fb                   jmp    5 <.altinstr_replacement+0x5>
  a:   48 89 04 24             mov    %rax,(%rsp)
  e:   c3                      retq   

This means we can change the alignment from 32 to 16 bytes and get 4
retpolines per cacheline, $I win.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/lib/retpoline.S |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -16,7 +16,7 @@
 .Lspec_trap_\@:
 	UNWIND_HINT_EMPTY
 	pause
-	lfence
+	int3
 	jmp .Lspec_trap_\@
 .Ldo_rop_\@:
 	mov     %\reg, (%_ASM_SP)
@@ -27,7 +27,7 @@
 .macro THUNK reg
 	.section .text.__x86.indirect_thunk
 
-	.align 32
+	.align 16
 SYM_FUNC_START(__x86_indirect_thunk_\reg)
 
 	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC PATCH] x86/retpolines: Prevent speculation after RET
  2021-02-18 16:59 [RFC][PATCH 0/2] x86/retpoline: Retpoline on a diet Peter Zijlstra
  2021-02-18 16:59 ` [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines Peter Zijlstra
  2021-02-18 16:59 ` [RFC][PATCH 2/2] x86/retpoline: Compress retpolines Peter Zijlstra
@ 2021-02-18 18:46 ` Borislav Petkov
  2021-02-18 19:02   ` Peter Zijlstra
  2 siblings, 1 reply; 12+ messages in thread
From: Borislav Petkov @ 2021-02-18 18:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos,
	Dave Hansen, Andrew Cooper

On Thu, Feb 18, 2021 at 05:59:38PM +0100, Peter Zijlstra wrote:
> Hi!
> 
> The first patch rearranges the implementation and consolidates unused bytes.
> The second patch uses INT3 over LFENCE to shrink the retpoline to 15 bytes, by
> which 4 can live in a cacheline.
> 
> Patches have been boot tested on my IVB.

And here's the patch that prompted all this:

---
From: Borislav Petkov <bp@suse.de>
Date: Thu, 18 Feb 2021 17:21:24 +0100

Both vendors speculate after a near RET in some way:

Intel:

"Unlike near indirect CALL and near indirect JMP, the processor will not
speculatively execute the next sequential instruction after a near RET
unless that instruction is also the target of a jump or is a target in a
branch predictor."

AMD:

"Some AMD processors when they first encounter a branch do not stall
dispatch and use the branches dynamic execution to determine the target.
Therefore, they will speculatively dispatch the sequential instructions
after the branch. This happens for near return instructions where it is
not clear what code may exist sequentially after the return instruction.
This behavior also occurs with jmp/call instructions with indirect
targets. Software should place a LFENCE or another dispatch serializing
instruction after the return or jmp/call indirect instruction to prevent
this sequential speculation."

The AMD side doesn't really need the LFENCE because it'll do LFENCE;
JMP/CALL <target> due to X86_FEATURE_RETPOLINE_AMD, before it reaches
the RET.

Objtool bits provided by Peter Zijlstra (Intel) <peterz@infradead.org>

Reported-by: Rudolf Marek <r.marek@assembler.cz>
Signed-off-by: Borislav Petkov <bp@suse.de>
Co-developed-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/lib/retpoline.S             | 1 +
 tools/objtool/arch/x86/decode.c      | 5 +++++
 tools/objtool/check.c                | 6 ++++++
 tools/objtool/include/objtool/arch.h | 1 +
 4 files changed, 13 insertions(+)

diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 397d408e8244..3f8652aaf84d 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -31,6 +31,7 @@ SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
 	mov	%\reg, (%_ASM_SP)
 	UNWIND_HINT_FUNC
 	ret
+	lfence
 SYM_FUNC_END(__x86_retpoline_\reg)
 
 .endm
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 549813cff8ab..84a5e3cfa72d 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -464,6 +464,11 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec,
 				op->src.type = OP_SRC_POP;
 				op->dest.type = OP_DEST_MEM;
 			}
+
+		} else if (op2 == 0xae && modrm == 0xe8) {
+
+			/* lfence */
+			*type = INSN_NOSPEC;
 		}
 
 		break;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 331a763d8775..9ab84f0c4032 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2868,6 +2868,12 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
 	      insn->jump_dest && insn->jump_dest->type == INSN_BUG)))
 		return true;
 
+	/*
+	 * We allow speculation traps after RETURN instructions.
+	 */
+	if (prev_insn->type == INSN_RETURN && insn->type == INSN_NOSPEC)
+		return true;
+
 	/*
 	 * Check if this (or a subsequent) instruction is related to
 	 * CONFIG_UBSAN or CONFIG_KASAN.
diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index 6ff0685f5cc5..faf0c0afd938 100644
--- a/tools/objtool/include/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -26,6 +26,7 @@ enum insn_type {
 	INSN_CLAC,
 	INSN_STD,
 	INSN_CLD,
+	INSN_NOSPEC,
 	INSN_OTHER,
 };
 
-- 
2.29.2

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] x86/retpolines: Prevent speculation after RET
  2021-02-18 18:46 ` [RFC PATCH] x86/retpolines: Prevent speculation after RET Borislav Petkov
@ 2021-02-18 19:02   ` Peter Zijlstra
  2021-02-18 19:11     ` Borislav Petkov
  2021-02-19  9:28     ` David Laight
  0 siblings, 2 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-02-18 19:02 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos,
	Dave Hansen, Andrew Cooper

On Thu, Feb 18, 2021 at 07:46:39PM +0100, Borislav Petkov wrote:
> Both vendors speculate after a near RET in some way:
> 
> Intel:
> 
> "Unlike near indirect CALL and near indirect JMP, the processor will not
> speculatively execute the next sequential instruction after a near RET
> unless that instruction is also the target of a jump or is a target in a
> branch predictor."

Right, the way I read that means it's not a problem for us here.

> AMD:
> 
> "Some AMD processors when they first encounter a branch do not stall
> dispatch and use the branches dynamic execution to determine the target.
> Therefore, they will speculatively dispatch the sequential instructions
> after the branch. This happens for near return instructions where it is
> not clear what code may exist sequentially after the return instruction.
> This behavior also occurs with jmp/call instructions with indirect
> targets. Software should place a LFENCE or another dispatch serializing
> instruction after the return or jmp/call indirect instruction to prevent
> this sequential speculation."
> 
> The AMD side doesn't really need the LFENCE because it'll do LFENCE;
> JMP/CALL <target> due to X86_FEATURE_RETPOLINE_AMD, before it reaches
> the RET.

It never reached the RET.

So all in all, I really don't see why we'd need this.

Now, if AMD were to say something like: hey, that retpoline is pretty
awesome, we ought to use that instead of an unconditional LFENCE, then
sure, but as is, I don't think so.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] x86/retpolines: Prevent speculation after RET
  2021-02-18 19:02   ` Peter Zijlstra
@ 2021-02-18 19:11     ` Borislav Petkov
  2021-02-19  8:15       ` Peter Zijlstra
  2021-02-19  9:28     ` David Laight
  1 sibling, 1 reply; 12+ messages in thread
From: Borislav Petkov @ 2021-02-18 19:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos,
	Dave Hansen, Andrew Cooper

On Thu, Feb 18, 2021 at 08:02:31PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 18, 2021 at 07:46:39PM +0100, Borislav Petkov wrote:
> > Both vendors speculate after a near RET in some way:
> > 
> > Intel:
> > 
> > "Unlike near indirect CALL and near indirect JMP, the processor will not
> > speculatively execute the next sequential instruction after a near RET
> > unless that instruction is also the target of a jump or is a target in a
> > branch predictor."
> 
> Right, the way I read that means it's not a problem for us here.

Look at that other thread: the instruction *after* the RET can be
speculatively executed if that instruction is the target of a jump or it
is in a branch predictor.

And yes, the text is confusing and no one from Intel has clarified
definitively yet what that text means exactly.

> Now, if AMD were to say something like: hey, that retpoline is pretty
> awesome, we ought to use that instead of an uconditional LFENCE, then
> sure, but as is, I don't think so.

AMD prefers the LFENCE instead of the retpoline sequence.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 2/2] x86/retpoline: Compress retpolines
  2021-02-18 16:59 ` [RFC][PATCH 2/2] x86/retpoline: Compress retpolines Peter Zijlstra
@ 2021-02-19  7:14   ` Borislav Petkov
  2021-02-22 11:27     ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Borislav Petkov @ 2021-02-19  7:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos

On Thu, Feb 18, 2021 at 05:59:40PM +0100, Peter Zijlstra wrote:
> By using int3 as a speculation fence instead of lfence, we can shrink
> the longest alternative to just 15 bytes:
> 
>   0:   e8 05 00 00 00          callq  a <.altinstr_replacement+0xa>
>   5:   f3 90                   pause  
>   7:   cc                      int3   
>   8:   eb fb                   jmp    5 <.altinstr_replacement+0x5>
>   a:   48 89 04 24             mov    %rax,(%rsp)
>   e:   c3                      retq   
> 
> This means we can change the alignment from 32 to 16 bytes and get 4
> retpolines per cacheline, $I win.

You mean I$ :)

In any case, for both:

Reviewed-by: Borislav Petkov <bp@suse.de>

and it looks real nice here, the size:

 readelf -s vmlinux | grep __x86_indirect
 78966: ffffffff81c023e0    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 81653: ffffffff81c02390    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 82338: ffffffff81c02430    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 82955: ffffffff81c02380    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 85057: ffffffff81c023f0    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 89996: ffffffff81c023a0    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 91094: ffffffff81c02400    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 91278: ffffffff81c023b0    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 92015: ffffffff81c02360    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 92722: ffffffff81c023c0    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 97062: ffffffff81c02410    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 98687: ffffffff81c023d0    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 99076: ffffffff81c02350    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
 99500: ffffffff81c02370    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]
100579: ffffffff81c02420    15 FUNC    GLOBAL DEFAULT    1 __x86_indirect_t[...]

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] x86/retpolines: Prevent speculation after RET
  2021-02-18 19:11     ` Borislav Petkov
@ 2021-02-19  8:15       ` Peter Zijlstra
  2021-02-19 12:08         ` Andrew Cooper
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-02-19  8:15 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos,
	Dave Hansen, Andrew Cooper

On Thu, Feb 18, 2021 at 08:11:38PM +0100, Borislav Petkov wrote:
> On Thu, Feb 18, 2021 at 08:02:31PM +0100, Peter Zijlstra wrote:
> > On Thu, Feb 18, 2021 at 07:46:39PM +0100, Borislav Petkov wrote:
> > > Both vendors speculate after a near RET in some way:
> > > 
> > > Intel:
> > > 
> > > "Unlike near indirect CALL and near indirect JMP, the processor will not
> > > speculatively execute the next sequential instruction after a near RET
> > > unless that instruction is also the target of a jump or is a target in a
> > > branch predictor."
> > 
> > Right, the way I read that means it's not a problem for us here.
> 
> Look at that other thread: the instruction *after* the RET can be
> speculatively executed if that instruction is the target of a jump or it
> is in a branch predictor.

Right, but that has nothing to do with the RET instruction itself. You
can speculatively execute any random instruction by training the BTB,
which is I suppose the entire point of things :-)

So the way I read it is that: RET does not 'leak' speculation, but if
you target the instruction after RET with any other speculation crud,
ofcourse you can get it to 'run'.

And until further clarified, I'll stick with that :-)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC PATCH] x86/retpolines: Prevent speculation after RET
  2021-02-18 19:02   ` Peter Zijlstra
  2021-02-18 19:11     ` Borislav Petkov
@ 2021-02-19  9:28     ` David Laight
  1 sibling, 0 replies; 12+ messages in thread
From: David Laight @ 2021-02-19  9:28 UTC (permalink / raw)
  To: 'Peter Zijlstra', Borislav Petkov
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos,
	Dave Hansen, Andrew Cooper

From: Peter Zijlstra
> Sent: 18 February 2021 19:03
> 
> On Thu, Feb 18, 2021 at 07:46:39PM +0100, Borislav Petkov wrote:
> > Both vendors speculate after a near RET in some way:
> >
> > Intel:
> >
> > "Unlike near indirect CALL and near indirect JMP, the processor will not
> > speculatively execute the next sequential instruction after a near RET
> > unless that instruction is also the target of a jump or is a target in a
> > branch predictor."
> 
> Right, the way I read that means it's not a problem for us here.

They got a lawyer to write that sentence :-)
What on earth is that 'unless' clause about?
Either:
1) The instructions might be speculatively executed for some entirely
   different reason.
or:
2) The cpu might use the BTB to determine the instruction that follows the
   RET - and so might happen to execute the instruction that follows it.

I can't manage to read it in any way that suggests that the cpu will
ignore the fact it is a RET and start executing the instruction that
follows.
(Unlike some ARM cpus which do seem to do that.)

> > AMD:
> >
> > "Some AMD processors when they first encounter a branch do not stall
> > dispatch and use the branches dynamic execution to determine the target.
> > Therefore, they will speculatively dispatch the sequential instructions
> > after the branch. This happens for near return instructions where it is
> > not clear what code may exist sequentially after the return instruction.

Sounds like the conditional branch prediction (and the BTB?) get used for RET
instructions when the 'return address stack' is invalid.

> > This behavior also occurs with jmp/call instructions with indirect
> > targets. Software should place a LFENCE or another dispatch serializing
> > instruction after the return or jmp/call indirect instruction to prevent
> > this sequential speculation."
> >
> > The AMD side doesn't really need the LFENCE because it'll do LFENCE;
> > JMP/CALL <target> due to X86_FEATURE_RETPOLINE_AMD, before it reaches
> > the RET.
> 
> It never reached the RET.
> 
> So all in all, I really don't see why we'd need this.

I read that as implying that some AMD cpu can sometimes treat the RET as
a conditional branch and so speculatively assume it isn't taken.
So you need an LFENCE (or ???) following the RET at the end of every function.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH] x86/retpolines: Prevent speculation after RET
  2021-02-19  8:15       ` Peter Zijlstra
@ 2021-02-19 12:08         ` Andrew Cooper
  0 siblings, 0 replies; 12+ messages in thread
From: Andrew Cooper @ 2021-02-19 12:08 UTC (permalink / raw)
  To: Peter Zijlstra, Borislav Petkov
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos, Dave Hansen

On 19/02/2021 08:15, Peter Zijlstra wrote:
> On Thu, Feb 18, 2021 at 08:11:38PM +0100, Borislav Petkov wrote:
>> On Thu, Feb 18, 2021 at 08:02:31PM +0100, Peter Zijlstra wrote:
>>> On Thu, Feb 18, 2021 at 07:46:39PM +0100, Borislav Petkov wrote:
>>>> Both vendors speculate after a near RET in some way:
>>>>
>>>> Intel:
>>>>
>>>> "Unlike near indirect CALL and near indirect JMP, the processor will not
>>>> speculatively execute the next sequential instruction after a near RET
>>>> unless that instruction is also the target of a jump or is a target in a
>>>> branch predictor."
>>> Right, the way I read that means it's not a problem for us here.
>> Look at that other thread: the instruction *after* the RET can be
>> speculatively executed if that instruction is the target of a jump or it
>> is in a branch predictor.
> Right, but that has nothing to do with the RET instruction itself. You
> can speculatively execute any random instruction by training the BTB,
> which is I suppose the entire point of things :-)
>
> So the way I read it is that: RET does not 'leak' speculation, but if
> you target the instruction after RET with any other speculation crud,
> ofcourse you can get it to 'run'.
>
> And until further clarified, I'll stick with that :-)

https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf
Final page, Mitigation G-5

Some parts (before Milan I believe that CPUID rule translates into) may
speculatively execute the instructions sequentially following a call/jmp
indirect or ret instruction.

For Intel, its just call/jmp instructions.  From SDM Vol2 for CALL (and
similar for JMP)

"Certain situations may lead to the next sequential instruction after a
near indirect CALL being speculatively executed. If software needs to
prevent this (e.g., in order to prevent a speculative execution side
channel), then an LFENCE instruction opcode can be placed after the near
indirect CALL in order to block speculative execution."


In both cases, the reason LFENCE is given is for the CALL case, where
there is sequential architectural execution.  JMP and RET do not have
architectural execution following them, so can use a shorter speculation
blocker.

When compiling with retpoline, all CALL/JMP indirects are removed, other
than within the __x86_indirect_thunk_%reg blocks, and those can be fixed
by hand.  That just leaves RET speculation, which has no following
architectural execution, at which point `ret; int3` is the shortest way
of halting speculation, at half the size of `ret; lfence`.

With a gcc toolchain, it does actually work if you macro 'ret' (and
retl/q) to be .byte 0xc3, 0xcc, but this doesn't work for Clang IAS
which refuses to macro real instructions.

What would be massively helpful if is the toolchains could have their
existing ARM straight-line-speculation support hooked up appropriately
so we get some new code gen options on x86, and don't have to resort to
the macro bodges above.

~Andrew

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 2/2] x86/retpoline: Compress retpolines
  2021-02-19  7:14   ` Borislav Petkov
@ 2021-02-22 11:27     ` Peter Zijlstra
  0 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-02-22 11:27 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: x86, tony.luck, pjt, linux-kernel, r.marek, jpoimboe, jikos,
	andrew.cooper3

On Fri, Feb 19, 2021 at 08:14:39AM +0100, Borislav Petkov wrote:
> On Thu, Feb 18, 2021 at 05:59:40PM +0100, Peter Zijlstra wrote:
> > By using int3 as a speculation fence instead of lfence, we can shrink
> > the longest alternative to just 15 bytes:
> > 
> >   0:   e8 05 00 00 00          callq  a <.altinstr_replacement+0xa>
> >   5:   f3 90                   pause  
> >   7:   cc                      int3   
> >   8:   eb fb                   jmp    5 <.altinstr_replacement+0x5>
> >   a:   48 89 04 24             mov    %rax,(%rsp)
> >   e:   c3                      retq   
> > 
> > This means we can change the alignment from 32 to 16 bytes and get 4
> > retpolines per cacheline, $I win.
> 
> You mean I$ :)

Typin' so hard.

> In any case, for both:
> 
> Reviewed-by: Borislav Petkov <bp@suse.de>

Thanks, except I've been told there is a performance implication. But
since all that happened in sekrit, none of that is recorded :/

I was hoping for some people (Tony, Paul) to respond with more data.
Also, Andrew said that if we ditch the lfence we could also ditch the
pause.

So people, please speak up, and if possible share any data you still
might have from back when retpolines were developed such that we can
have it on record.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines
  2021-02-18 16:59 ` [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines Peter Zijlstra
@ 2021-02-22 11:36   ` Peter Zijlstra
  0 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-02-22 11:36 UTC (permalink / raw)
  To: x86, tony.luck, pjt; +Cc: linux-kernel, r.marek, jpoimboe, jikos

On Thu, Feb 18, 2021 at 05:59:39PM +0100, Peter Zijlstra wrote:
> Currently our retpolines consist of 2 symbols,
> __x86_indirect_thunk_\reg, which is the compiler target, and
> __x86_retpoline_\reg, which is the actual retpoline. Both are
> consecutive in code and aligned such that for any one register they
> both live in the same cacheline:
> 
>   0000000000000000 <__x86_indirect_thunk_rax>:
>    0:   ff e0                   jmpq   *%rax
>    2:   90                      nop
>    3:   90                      nop
>    4:   90                      nop
> 
>   0000000000000005 <__x86_retpoline_rax>:
>    5:   e8 07 00 00 00          callq  11 <__x86_retpoline_rax+0xc>
>    a:   f3 90                   pause
>    c:   0f ae e8                lfence
>    f:   eb f9                   jmp    a <__x86_retpoline_rax+0x5>
>   11:   48 89 04 24             mov    %rax,(%rsp)
>   15:   c3                      retq
>   16:   66 2e 0f 1f 84 00 00 00 00 00   nopw   %cs:0x0(%rax,%rax,1)
> 
> The thunk is an alternative_2, where one option is a jmp to the
> retpoline.

So the reason I originally did that was because objtool could not deal
with alternatives with stack ops. But we've recently fixed that.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-02-22 11:37 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-18 16:59 [RFC][PATCH 0/2] x86/retpoline: Retpoline on a diet Peter Zijlstra
2021-02-18 16:59 ` [RFC][PATCH 1/2] x86/retpoline: Simplify retpolines Peter Zijlstra
2021-02-22 11:36   ` Peter Zijlstra
2021-02-18 16:59 ` [RFC][PATCH 2/2] x86/retpoline: Compress retpolines Peter Zijlstra
2021-02-19  7:14   ` Borislav Petkov
2021-02-22 11:27     ` Peter Zijlstra
2021-02-18 18:46 ` [RFC PATCH] x86/retpolines: Prevent speculation after RET Borislav Petkov
2021-02-18 19:02   ` Peter Zijlstra
2021-02-18 19:11     ` Borislav Petkov
2021-02-19  8:15       ` Peter Zijlstra
2021-02-19 12:08         ` Andrew Cooper
2021-02-19  9:28     ` David Laight

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).