All of lore.kernel.org
 help / color / mirror / Atom feed
* Avoid speculative indirect calls in kernel
@ 2018-01-03 23:09 Andi Kleen
  2018-01-03 23:09 ` [PATCH 01/11] x86/retpoline: Define retpoline indirect thunk and macros Andi Kleen
                   ` (11 more replies)
  0 siblings, 12 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx; +Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen

This is a fix for Variant 2 in 
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html

Any speculative indirect calls in the kernel can be tricked 
to execute any kernel code, which may allow side channel
attacks that can leak arbitrary kernel data.

So we want to avoid speculative indirect calls in the kernel.

There's a special code sequence called a retpoline that can
do indirect calls without speculation. We use a new compiler
option -mindirect-branch=thunk-extern (gcc patch will be released
separately) to recompile the kernel with this new sequence.

We also patch all the assembler code in the kernel to use
the new sequence.

The patches were originally from David Woodhouse and Tim Chen,
but then reworked and enhanced by me.

No performance numbers at this point. 32bit is only boot tested.

Git tree available in 
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc spec/retpoline-415-1

v1: Initial post.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* [PATCH 01/11] x86/retpoline: Define retpoline indirect thunk and macros
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 02/11] x86/retpoline/crypto: Convert crypto assembler indirect jumps Andi Kleen
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Dave Hansen, Andi Kleen

From: Dave Hansen <dave.hansen@linux.intel.com>

From: David Woodhouse <dwmw@amazon.co.uk>

retpoline is a special sequence on Intel CPUs to stop speculation for
indirect branches.

Provide assembler infrastructure to use retpoline by the compiler
and for assembler. We add the out of line trampoline used by the
compiler, and NOSPEC_JUMP / NOSPEC_CALL macros for assembler

[Originally from David and Tim, heavily hacked by AK]

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/jump-asm.h | 47 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux.lds.S   |  1 +
 arch/x86/lib/Makefile           |  1 +
 arch/x86/lib/retpoline.S        | 35 ++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+)
 create mode 100644 arch/x86/include/asm/jump-asm.h
 create mode 100644 arch/x86/lib/retpoline.S

diff --git a/arch/x86/include/asm/jump-asm.h b/arch/x86/include/asm/jump-asm.h
new file mode 100644
index 000000000000..953c391991b9
--- /dev/null
+++ b/arch/x86/include/asm/jump-asm.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef JUMP_ASM_H
+#define JUMP_ASM_H 1
+
+#ifdef __ASSEMBLY__
+
+/*
+ * Jump to an indirect pointer without speculation.
+ *
+ * The out of line __x86.indirect_thunk has special code sequences
+ * to stop speculation.
+ */
+
+.macro NOSPEC_JMP target
+	push	\target
+	jmp	__x86.indirect_thunk
+.endm
+
+
+/*
+ * Call an indirect pointer without speculation.
+ */
+
+.macro NOSPEC_CALL target
+	jmp     1221f
+1222:
+	push	\target
+	jmp	__x86.indirect_thunk
+1221:
+	call	1222b
+.endm
+
+#else /* __ASSEMBLY__ */
+
+#define NOSPEC_JUMP(t) \
+	"push " t "; "				\
+	"jmp __x86.indirect_thunk; "
+
+#define NOSPEC_CALL(t) \
+	"	jmp 1221f; "			\
+	"1222:	push " t ";"			\
+	"	jmp __x86.indirect_thunk;"	\
+	"1221:	call 1222b;"
+
+#endif /* !__ASSEMBLY */
+
+#endif
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1e413a9326aa..2e64241a6664 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -103,6 +103,7 @@ SECTIONS
 		/* bootstrapping code */
 		HEAD_TEXT
 		. = ALIGN(8);
+		*(.text.__x86.indirect_thunk)
 		TEXT_TEXT
 		SCHED_TEXT
 		CPUIDLE_TEXT
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7b181b61170e..ec7a329b9b3c 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+lib-y += retpoline.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
new file mode 100644
index 000000000000..cb40781adbfe
--- /dev/null
+++ b/arch/x86/lib/retpoline.S
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Out of line jump trampoline for calls that disable speculation.
+ *
+ * This is a special sequence that prevents the CPU speculating
+ * for indirect calls.
+ *
+ * This can be called by gcc generated code, or with the asm macros
+ * in asm/jump-asm.h
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/export.h>
+
+	.section	.text.__x86.indirect_thunk,"ax"
+
+ENTRY(__x86.indirect_thunk)
+	CFI_STARTPROC
+	call	retpoline_call_target
+2:
+	lfence		/* stop speculation */
+	jmp	2b
+retpoline_call_target:
+#ifdef CONFIG_64BIT
+	lea	8(%rsp), %rsp
+#else
+	lea	4(%esp), %esp
+#endif
+	ret
+	CFI_ENDPROC
+ENDPROC(__x86.indirect_thunk)
+
+	EXPORT_SYMBOL(__x86.indirect_thunk)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 02/11] x86/retpoline/crypto: Convert crypto assembler indirect jumps
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
  2018-01-03 23:09 ` [PATCH 01/11] x86/retpoline: Define retpoline indirect thunk and macros Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 03/11] x86/retpoline/entry: Convert entry " Andi Kleen
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Convert all indirect jumps in crypto assembler code to use
non speculative sequences.

Based on code from David Woodhouse and Tim Chen

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/crypto/aesni-intel_asm.S            | 5 +++--
 arch/x86/crypto/camellia-aesni-avx-asm_64.S  | 3 ++-
 arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 2 +-
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S    | 3 ++-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 16627fec80b2..f6c964f30ede 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,7 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 #include <asm/frame.h>
+#include <asm/jump-asm.h>
 
 /*
  * The following macros are used to move an (un)aligned 16 byte value to/from
@@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8)
 	pxor INC, STATE4
 	movdqu IV, 0x30(OUTP)
 
-	call *%r11
+	NOSPEC_CALL %r11
 
 	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
@@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8)
 	_aesni_gf128mul_x_ble()
 	movups IV, (IVP)
 
-	call *%r11
+	NOSPEC_CALL %r11
 
 	movdqu 0x40(OUTP), INC
 	pxor INC, STATE1
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index f7c495e2863c..9006543227b2 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -17,6 +17,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/jump-asm.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way:
 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
 
-	call *%r9;
+	NOSPEC_CALL %r9;
 
 	addq $(16 * 16), %rsp;
 
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index eee5b3982cfd..1743e6850e00 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -1343,7 +1343,7 @@ camellia_xts_crypt_32way:
 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-	call *%r9;
+	NOSPEC_CALL %r9;
 
 	addq $(16 * 32), %rsp;
 
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 7a7de27c6f41..969000c64456 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -45,6 +45,7 @@
 
 #include <asm/inst.h>
 #include <linux/linkage.h>
+#include <asm/jump-asm.h>
 
 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
 
@@ -172,7 +173,7 @@ continue_block:
 	movzxw  (bufp, %rax, 2), len
 	lea	crc_array(%rip), bufp
 	lea     (bufp, len, 1), bufp
-	jmp     *bufp
+	NOSPEC_JMP bufp
 
 	################################################################
 	## 2a) PROCESS FULL BLOCKS:
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 03/11] x86/retpoline/entry: Convert entry assembler indirect jumps
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
  2018-01-03 23:09 ` [PATCH 01/11] x86/retpoline: Define retpoline indirect thunk and macros Andi Kleen
  2018-01-03 23:09 ` [PATCH 02/11] x86/retpoline/crypto: Convert crypto assembler indirect jumps Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 04/11] x86/retpoline/ftrace: Convert ftrace " Andi Kleen
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Convert all indirect jumps in core 32/64bit entry assembler code to use
non speculative sequences.

Based on code from David Woodhouse and Tim Chen

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/entry/entry_32.S |  5 +++--
 arch/x86/entry/entry_64.S | 12 +++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ace8f321a5a1..a4b88260d6f7 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,6 +44,7 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/frame.h>
+#include <asm/jump-asm.h>
 
 	.section .entry.text, "ax"
 
@@ -290,7 +291,7 @@ ENTRY(ret_from_fork)
 
 	/* kernel thread */
 1:	movl	%edi, %eax
-	call	*%ebx
+	NOSPEC_CALL	%ebx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
@@ -919,7 +920,7 @@ common_exception:
 	movl	%ecx, %es
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
-	call	*%edi
+	NOSPEC_CALL	%edi
 	jmp	ret_from_exception
 END(common_exception)
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f048e384ff54..486990fb3e4d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -37,6 +37,7 @@
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
 #include <asm/frame.h>
+#include <asm/jump-asm.h>
 #include <linux/err.h>
 
 #include "calling.h"
@@ -191,7 +192,7 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	 */
 	pushq	%rdi
 	movq	$entry_SYSCALL_64_stage2, %rdi
-	jmp	*%rdi
+	NOSPEC_JMP	%rdi
 END(entry_SYSCALL_64_trampoline)
 
 	.popsection
@@ -269,8 +270,9 @@ entry_SYSCALL_64_fastpath:
 	 * This call instruction is handled specially in stub_ptregs_64.
 	 * It might end up jumping to the slow path.  If it jumps, RAX
 	 * and all argument registers are clobbered.
-	 */
-	call	*sys_call_table(, %rax, 8)
+	*/
+	movq	sys_call_table(, %rax, 8), %rax
+	NOSPEC_CALL	%rax
 .Lentry_SYSCALL_64_after_fastpath_call:
 
 	movq	%rax, RAX(%rsp)
@@ -442,7 +444,7 @@ ENTRY(stub_ptregs_64)
 	jmp	entry_SYSCALL64_slow_path
 
 1:
-	jmp	*%rax				/* Called from C */
+	NOSPEC_JMP	%rax				/* Called from C */
 END(stub_ptregs_64)
 
 .macro ptregs_stub func
@@ -521,7 +523,7 @@ ENTRY(ret_from_fork)
 1:
 	/* kernel thread */
 	movq	%r12, %rdi
-	call	*%rbx
+	NOSPEC_CALL	%rbx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 04/11] x86/retpoline/ftrace: Convert ftrace assembler indirect jumps
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (2 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 03/11] x86/retpoline/entry: Convert entry " Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 05/11] x86/retpoline/hyperv: Convert " Andi Kleen
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Convert all indirect jumps in ftrace assembler code to use
non speculative sequences.

Based on code from David Woodhouse and Tim Chen

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/ftrace_32.S | 3 ++-
 arch/x86/kernel/ftrace_64.S | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index b6c6468e10bc..c0c10ba53375 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -8,6 +8,7 @@
 #include <asm/segment.h>
 #include <asm/export.h>
 #include <asm/ftrace.h>
+#include <asm/jump-asm.h>
 
 #ifdef CC_USING_FENTRY
 # define function_hook	__fentry__
@@ -241,5 +242,5 @@ return_to_handler:
 	movl	%eax, %ecx
 	popl	%edx
 	popl	%eax
-	jmp	*%ecx
+	NOSPEC_JMP	%ecx
 #endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index c832291d948a..4814c4b30b8d 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -7,7 +7,7 @@
 #include <asm/ptrace.h>
 #include <asm/ftrace.h>
 #include <asm/export.h>
-
+#include <asm/jump-asm.h>
 
 	.code64
 	.section .entry.text, "ax"
@@ -286,7 +286,7 @@ trace:
 	 * ip and parent ip are used and the list function is called when
 	 * function tracing is enabled.
 	 */
-	call   *ftrace_trace_function
+	NOSPEC_CALL ftrace_trace_function
 
 	restore_mcount_regs
 
@@ -329,5 +329,5 @@ GLOBAL(return_to_handler)
 	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	addq $24, %rsp
-	jmp *%rdi
+	NOSPEC_JMP %rdi
 #endif
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 05/11] x86/retpoline/hyperv: Convert assembler indirect jumps
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (3 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 04/11] x86/retpoline/ftrace: Convert ftrace " Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 06/11] x86/retpoline/crypto: Convert xen " Andi Kleen
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Convert all indirect jumps in hyperv inline asm code to use
non speculative sequences.

Based on code from David Woodhouse and Tim Chen

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/mshyperv.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 5400add2885b..2dfcdd401d4c 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -7,6 +7,7 @@
 #include <linux/nmi.h>
 #include <asm/io.h>
 #include <asm/hyperv.h>
+#include <asm/jump-asm.h>
 
 /*
  * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
@@ -186,7 +187,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 		return U64_MAX;
 
 	__asm__ __volatile__("mov %4, %%r8\n"
-			     "call *%5"
+			     NOSPEC_CALL("%5")
 			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 			       "+c" (control), "+d" (input_address)
 			     :  "r" (output_address), "m" (hv_hypercall_pg)
@@ -200,7 +201,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	if (!hv_hypercall_pg)
 		return U64_MAX;
 
-	__asm__ __volatile__("call *%7"
+	__asm__ __volatile__(NOSPEC_CALL("%7")
 			     : "=A" (hv_status),
 			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
 			     : "A" (control),
@@ -227,7 +228,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 
 #ifdef CONFIG_X86_64
 	{
-		__asm__ __volatile__("call *%4"
+		__asm__ __volatile__(NOSPEC_CALL("%4")
 				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 				       "+c" (control), "+d" (input1)
 				     : "m" (hv_hypercall_pg)
@@ -238,7 +239,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 		u32 input1_hi = upper_32_bits(input1);
 		u32 input1_lo = lower_32_bits(input1);
 
-		__asm__ __volatile__ ("call *%5"
+		__asm__ __volatile__ (NOSPEC_CALL("%5")
 				      : "=A"(hv_status),
 					"+c"(input1_lo),
 					ASM_CALL_CONSTRAINT
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 06/11] x86/retpoline/crypto: Convert xen assembler indirect jumps
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (4 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 05/11] x86/retpoline/hyperv: Convert " Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 07/11] x86/retpoline/checksum32: Convert " Andi Kleen
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Convert all indirect jumps in xen inline assembler code to use
non speculative sequences.

Based on code from David Woodhouse and Tim Chen

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 1 +
 arch/x86/include/asm/xen/hypercall.h         | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 1743e6850e00..9cd8450a2050 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -12,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/jump-asm.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 7cb282e9e587..91de35bcce5e 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -44,6 +44,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/smap.h>
+#include <asm/jump-asm.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
@@ -217,7 +218,7 @@ privcmd_call(unsigned call,
 	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
 
 	stac();
-	asm volatile("call *%[call]"
+	asm volatile(NOSPEC_CALL("%[call]")
 		     : __HYPERCALL_5PARAM
 		     : [call] "a" (&hypercall_page[call])
 		     : __HYPERCALL_CLOBBER5);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 07/11] x86/retpoline/checksum32: Convert  assembler indirect jumps
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (5 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 06/11] x86/retpoline/crypto: Convert xen " Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 08/11] x86/retpoline/irq32: " Andi Kleen
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Convert all indirect jumps in 32bit checksum assembler code to use
non speculative sequences.

Based on code from David Woodhouse and Tim Chen

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/lib/checksum_32.S | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4d34bb548b41..d19862183d4b 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -29,6 +29,7 @@
 #include <asm/errno.h>
 #include <asm/asm.h>
 #include <asm/export.h>
+#include <asm/jump-asm.h>
 				
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
@@ -156,7 +157,7 @@ ENTRY(csum_partial)
 	negl %ebx
 	lea 45f(%ebx,%ebx,2), %ebx
 	testl %esi, %esi
-	jmp *%ebx
+	NOSPEC_JMP %ebx
 
 	# Handle 2-byte-aligned regions
 20:	addw (%esi), %ax
@@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic)
 	andl $-32,%edx
 	lea 3f(%ebx,%ebx), %ebx
 	testl %esi, %esi 
-	jmp *%ebx
+	NOSPEC_JMP %ebx
 1:	addl $64,%esi
 	addl $64,%edi 
 	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 08/11] x86/retpoline/irq32: Convert assembler indirect jumps
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (6 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 07/11] x86/retpoline/checksum32: Convert " Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:09 ` [PATCH 09/11] x86/retpoline: Finally enable retpoline for C code Andi Kleen
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Convert all indirect jumps in 32bit irq inline asm code to use
non speculative sequences.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/irq_32.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a83b3346a0e1..2dce2fdd2c4e 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 
 #include <asm/apic.h>
+#include <asm/jump-asm.h>
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 
@@ -55,7 +56,7 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
 static void call_on_stack(void *func, void *stack)
 {
 	asm volatile("xchgl	%%ebx,%%esp	\n"
-		     "call	*%%edi		\n"
+		     NOSPEC_CALL("%%edi")
 		     "movl	%%ebx,%%esp	\n"
 		     : "=b" (stack)
 		     : "0" (stack),
@@ -95,7 +96,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
 		call_on_stack(print_stack_overflow, isp);
 
 	asm volatile("xchgl	%%ebx,%%esp	\n"
-		     "call	*%%edi		\n"
+		     NOSPEC_CALL("%%edi")
 		     "movl	%%ebx,%%esp	\n"
 		     : "=a" (arg1), "=b" (isp)
 		     :  "0" (desc),   "1" (isp),
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 09/11] x86/retpoline: Finally enable retpoline for C code
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (7 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 08/11] x86/retpoline/irq32: " Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-04  8:28   ` Greg KH
  2018-01-03 23:09 ` [PATCH 10/11] retpoline/taint: Taint kernel for missing retpoline in compiler Andi Kleen
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Dave Hansen

From: Dave Hansen <dave.hansen@linux.intel.com>

From: David Woodhouse <dwmw@amazon.co.uk>

Add retpoline compile option in Makefile

Update Makefile with retpoline compile options.  This requires a gcc with the
retpoline compiler patches enabled.

Print a warning when the compiler doesn't support retpoline

[Originally from David and Tim, but hacked by AK]

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 arch/x86/Makefile | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3e73bc255e4e..dad4b24abdc9 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -230,6 +230,15 @@ KBUILD_CFLAGS += -Wno-sign-compare
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
+#
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern)
+ifneq ($(RETPOLINE_CFLAGS),)
+	KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
+	KBUILD_AFLAGS += -DRETPOLINE
+else
+        $(warning Retpoline not supported in compiler. Kernel may be insecure.)
+endif
+
 archscripts: scripts_basic
 	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 10/11] retpoline/taint: Taint kernel for missing retpoline in compiler
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (8 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 09/11] x86/retpoline: Finally enable retpoline for C code Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-04  0:29   ` Thomas Gleixner
  2018-01-03 23:09 ` [PATCH 11/11] retpoline/objtool: Disable some objtool warnings Andi Kleen
  2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
  11 siblings, 1 reply; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

When the kernel or a module hasn't been compiled with a retpoline
aware compiler, print a warning and set a taint flag.

For modules it is checked at compile time, however it cannot
check assembler or other non compiled objects used in the module link.

Due to lack of better letter it uses taint option 'Z'

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 Documentation/admin-guide/tainted-kernels.rst |  3 +++
 arch/x86/kernel/setup.c                       |  6 ++++++
 include/linux/kernel.h                        |  4 +++-
 kernel/module.c                               | 11 ++++++++++-
 kernel/panic.c                                |  1 +
 scripts/mod/modpost.c                         |  9 +++++++++
 6 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst
index 1df03b5cb02f..800261b6bd6f 100644
--- a/Documentation/admin-guide/tainted-kernels.rst
+++ b/Documentation/admin-guide/tainted-kernels.rst
@@ -52,6 +52,9 @@ characters, each representing a particular tainted value.
 
  16) ``K`` if the kernel has been live patched.
 
+ 17) ``Z`` if the x86 kernel or a module hasn't been compiled with
+     a retpoline aware compiler and may be vulnerable to data leaks.
+
 The primary reason for the **'Tainted: '** string is to tell kernel
 debuggers if this is a clean kernel or if anything unusual has
 occurred.  Tainting is permanent: even if an offending module is
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8af2e8d0c0a1..5b4f4d3a897b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1296,6 +1296,12 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 	unwind_init();
+
+#ifndef RETPOLINE
+	add_taint(TAINT_NO_RETPOLINE, LOCKDEP_STILL_OK);
+	pr_warn("No support for retpoline in kernel compiler\n");
+	pr_warn("Kernel may be vulnerable to data leaks.\n");
+#endif
 }
 
 #ifdef CONFIG_X86_32
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index ce51455e2adf..fbb4d3baffcc 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -550,7 +550,9 @@ extern enum system_states {
 #define TAINT_SOFTLOCKUP		14
 #define TAINT_LIVEPATCH			15
 #define TAINT_AUX			16
-#define TAINT_FLAGS_COUNT		17
+#define TAINT_NO_RETPOLINE		17
+
+#define TAINT_FLAGS_COUNT		18
 
 struct taint_flag {
 	char c_true;	/* character printed when tainted */
diff --git a/kernel/module.c b/kernel/module.c
index dea01ac9cb74..92db3f59a29a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3028,7 +3028,16 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
 				mod->name);
 		add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
 	}
-
+#ifdef RETPOLINE
+	if (!get_modinfo(info, "retpoline")) {
+		if (!test_taint(TAINT_NO_RETPOLINE)) {
+			pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+				mod->name);
+			pr_warn("Kernel may be vulnerable to data leaks.\n");
+		}
+		add_taint_module(mod, TAINT_NO_RETPOLINE, LOCKDEP_STILL_OK);
+	}
+#endif
 	if (get_modinfo(info, "staging")) {
 		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
 		pr_warn("%s: module is from the staging directory, the quality "
diff --git a/kernel/panic.c b/kernel/panic.c
index 2cfef408fec9..6686c67b6e4b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -325,6 +325,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
 	{ 'L', ' ', false },	/* TAINT_SOFTLOCKUP */
 	{ 'K', ' ', true },	/* TAINT_LIVEPATCH */
 	{ 'X', ' ', true },	/* TAINT_AUX */
+	{ 'Z', ' ', true },	/* TAINT_NO_RETPOLINE */
 };
 
 /**
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index f51cf977c65b..6510536c06df 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -2165,6 +2165,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
 		buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
 }
 
+/* Cannot check for assembler */
+static void add_retpoline(struct buffer *b)
+{
+	buf_printf(b, "\n#ifdef RETPOLINE\n");
+	buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
+	buf_printf(b, "#endif\n");
+}
+
 static void add_staging_flag(struct buffer *b, const char *name)
 {
 	static const char *staging_dir = "drivers/staging";
@@ -2506,6 +2514,7 @@ int main(int argc, char **argv)
 		err |= check_modname_len(mod);
 		add_header(&buf, mod);
 		add_intree_flag(&buf, !external_module);
+		add_retpoline(&buf);
 		add_staging_flag(&buf, mod->name);
 		err |= add_versions(&buf, mod);
 		add_depends(&buf, mod, modules);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* [PATCH 11/11] retpoline/objtool: Disable some objtool warnings
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (9 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 10/11] retpoline/taint: Taint kernel for missing retpoline in compiler Andi Kleen
@ 2018-01-03 23:09 ` Andi Kleen
  2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
  11 siblings, 0 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-03 23:09 UTC (permalink / raw)
  To: tglx
  Cc: torvalds, gregkh, dwmw, tim.c.chen, linux-kernel, dave.hansen,
	Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

With the indirect call thunk enabled compiler two objtool
warnings are triggered very frequently and make the build
very noisy.

I don't see a good way to avoid them, so just disable them
for now.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/objtool/check.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9b341584eb1b..435c71f944dc 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -503,8 +503,13 @@ static int add_call_destinations(struct objtool_file *file)
 			insn->call_dest = find_symbol_by_offset(insn->sec,
 								dest_off);
 			if (!insn->call_dest) {
+#if 0
+				/* Compilers with -mindirect-branch=thunk-extern trigger
+				 * this everywhere on x86. Disable for now.
+				 */
 				WARN_FUNC("can't find call dest symbol at offset 0x%lx",
 					  insn->sec, insn->offset, dest_off);
+#endif
 				return -1;
 			}
 		} else if (rela->sym->type == STT_SECTION) {
@@ -1716,8 +1721,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 					return 1;
 
 			} else if (func && has_modified_stack_frame(&state)) {
+#if 0
+				/* Compilers with -mindirect-branch=thunk-extern trigger
+				 * this everywhere on x86. Disable for now.
+				 */
+
 				WARN_FUNC("sibling call from callable instruction with modified stack frame",
 					  sec, insn->offset);
+#endif
 				return 1;
 			}
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
                   ` (10 preceding siblings ...)
  2018-01-03 23:09 ` [PATCH 11/11] retpoline/objtool: Disable some objtool warnings Andi Kleen
@ 2018-01-03 23:51 ` Linus Torvalds
  2018-01-04  0:00   ` Alan Cox
                     ` (4 more replies)
  11 siblings, 5 replies; 107+ messages in thread
From: Linus Torvalds @ 2018-01-03 23:51 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Wed, Jan 3, 2018 at 3:09 PM, Andi Kleen <andi@firstfloor.org> wrote:
> This is a fix for Variant 2 in
> https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
>
> Any speculative indirect calls in the kernel can be tricked
> to execute any kernel code, which may allow side channel
> attacks that can leak arbitrary kernel data.

Why is this all done without any configuration options?

A *competent* CPU engineer would fix this by making sure speculation
doesn't happen across protection domains. Maybe even a L1 I$ that is
keyed by CPL.

I think somebody inside of Intel needs to really take a long hard look
at their CPU's, and actually admit that they have issues instead of
writing PR blurbs that say that everything works as designed.

.. and that really means that all these mitigation patches should be
written with "not all CPU's are crap" in mind.

Or is Intel basically saying "we are committed to selling you shit
forever and ever, and never fixing anything"?

Because if that's the case, maybe we should start looking towards the
ARM64 people more.

Please talk to management. Because I really see exactly two possibibilities:

 - Intel never intends to fix anything

OR

 - these workarounds should have a way to disable them.

Which of the two is it?

                   Linus

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
@ 2018-01-04  0:00   ` Alan Cox
  2018-01-04  0:09   ` Andi Kleen
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 107+ messages in thread
From: Alan Cox @ 2018-01-04  0:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, tglx, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

>  - these workarounds should have a way to disable them.
> 
> Which of the two is it?

The latter clearly - because there are processors today that don't have
those problems because they are sufficiently dumb.

As for future products - you know perfectly well that none of the vendors
can answer that here because of the US laws on public companies and
forward looking statements.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
  2018-01-04  0:00   ` Alan Cox
@ 2018-01-04  0:09   ` Andi Kleen
  2018-01-04  0:12     ` Thomas Gleixner
  2018-01-04  0:18     ` David Lang
  2018-01-04  1:00   ` Paul Turner
                     ` (2 subsequent siblings)
  4 siblings, 2 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-04  0:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, tglx, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

Hi Linus,

On Wed, Jan 03, 2018 at 03:51:35PM -0800, Linus Torvalds wrote:
> On Wed, Jan 3, 2018 at 3:09 PM, Andi Kleen <andi@firstfloor.org> wrote:
> > This is a fix for Variant 2 in
> > https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
> >
> > Any speculative indirect calls in the kernel can be tricked
> > to execute any kernel code, which may allow side channel
> > attacks that can leak arbitrary kernel data.
> 
> Why is this all done without any configuration options?

I was thinking of a config option, but I was struggling with a name.

CONFIG_INSECURE_KERNEL, CONFIG_LEAK_MEMORY? 

And should it be positive or negative?

So I opted to be secure uncontionally.

It would be simple to add however, all hooks are either in the Makefile
or in asm/jump-asm.h

>  - these workarounds should have a way to disable them.
> 

There will be soon patches to add other ways and they have a way
to patch out most of the retpoline overhead at runtime
(basically replace the trampoline with a pure ret)

We just wanted to get the retpoline code out first because
it's the most basic and widest applicable fix.

-Andi

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:09   ` Andi Kleen
@ 2018-01-04  0:12     ` Thomas Gleixner
  2018-01-04  0:15       ` Andi Kleen
  2018-01-04  0:20       ` Linus Torvalds
  2018-01-04  0:18     ` David Lang
  1 sibling, 2 replies; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-04  0:12 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Linus Torvalds, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Wed, 3 Jan 2018, Andi Kleen wrote:
> On Wed, Jan 03, 2018 at 03:51:35PM -0800, Linus Torvalds wrote:
> > On Wed, Jan 3, 2018 at 3:09 PM, Andi Kleen <andi@firstfloor.org> wrote:
> > > This is a fix for Variant 2 in
> > > https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
> > >
> > > Any speculative indirect calls in the kernel can be tricked
> > > to execute any kernel code, which may allow side channel
> > > attacks that can leak arbitrary kernel data.
> > 
> > Why is this all done without any configuration options?
> 
> I was thinking of a config option, but I was struggling with a name.
> 
> CONFIG_INSECURE_KERNEL, CONFIG_LEAK_MEMORY?
> 
> And should it be positive or negative?

It should be a CPU_BUG bit as we have for the other mess. And that can be
used for patching.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:12     ` Thomas Gleixner
@ 2018-01-04  0:15       ` Andi Kleen
  2018-01-04  0:19         ` Jiri Kosina
  2018-01-04  0:29         ` Alan Cox
  2018-01-04  0:20       ` Linus Torvalds
  1 sibling, 2 replies; 107+ messages in thread
From: Andi Kleen @ 2018-01-04  0:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andi Kleen, Linus Torvalds, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

> It should be a CPU_BUG bit as we have for the other mess. And that can be
> used for patching.

It has to be done at compile time because it requires a compiler option.

Most of the indirect calls are in C code.

So it cannot just patched in, only partially out.

-Andi

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:09   ` Andi Kleen
  2018-01-04  0:12     ` Thomas Gleixner
@ 2018-01-04  0:18     ` David Lang
  1 sibling, 0 replies; 107+ messages in thread
From: David Lang @ 2018-01-04  0:18 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Linus Torvalds, tglx, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Wed, 3 Jan 2018, Andi Kleen wrote:

>>
>> Why is this all done without any configuration options?
>
> I was thinking of a config option, but I was struggling with a name.
>
> CONFIG_INSECURE_KERNEL, CONFIG_LEAK_MEMORY?

CONFIG_BUGGY_INTEL_CACHE (or similar)

something that indicates that this is to support the Intel CPUs that have this 
bug in them.

We've had such CPU specific support options in the past.

Some people will need the speed more than the protection, some people will be 
running on CPUs that don't need this.

Why is this needed? because of an Intel bug, so name it accordingly.

David Lang

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:15       ` Andi Kleen
@ 2018-01-04  0:19         ` Jiri Kosina
  2018-01-05  2:01           ` james harvey
  2018-01-04  0:29         ` Alan Cox
  1 sibling, 1 reply; 107+ messages in thread
From: Jiri Kosina @ 2018-01-04  0:19 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, Linus Torvalds, Greg Kroah-Hartman, dwmw,
	Tim Chen, Linux Kernel Mailing List, Dave Hansen

On Wed, 3 Jan 2018, Andi Kleen wrote:

> > It should be a CPU_BUG bit as we have for the other mess. And that can be
> > used for patching.
> 
> It has to be done at compile time because it requires a compiler option.

If gcc anotates indirect calls/jumps in a way that we could patch them 
using alternatives in runtime, that'd be enough.

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:12     ` Thomas Gleixner
  2018-01-04  0:15       ` Andi Kleen
@ 2018-01-04  0:20       ` Linus Torvalds
  2018-01-04  0:26         ` Thomas Gleixner
  1 sibling, 1 reply; 107+ messages in thread
From: Linus Torvalds @ 2018-01-04  0:20 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andi Kleen, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Wed, Jan 3, 2018 at 4:12 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
>
> It should be a CPU_BUG bit as we have for the other mess. And that can be
> used for patching.

That would definitely be the right approach.

However, that's also probably quite challenging for the gcc option.

              Linus

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:20       ` Linus Torvalds
@ 2018-01-04  0:26         ` Thomas Gleixner
  0 siblings, 0 replies; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-04  0:26 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Wed, 3 Jan 2018, Linus Torvalds wrote:
> On Wed, Jan 3, 2018 at 4:12 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > It should be a CPU_BUG bit as we have for the other mess. And that can be
> > used for patching.
> 
> That would definitely be the right approach.
> 
> However, that's also probably quite challenging for the gcc option.

AFAICT, the thunk which GCC injects can be supplied to GCC, so it should be
doable.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:15       ` Andi Kleen
  2018-01-04  0:19         ` Jiri Kosina
@ 2018-01-04  0:29         ` Alan Cox
  2018-01-04  0:31           ` Thomas Gleixner
  1 sibling, 1 reply; 107+ messages in thread
From: Alan Cox @ 2018-01-04  0:29 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, Linus Torvalds, Greg Kroah-Hartman, dwmw,
	Tim Chen, Linux Kernel Mailing List, Dave Hansen

On Wed, 3 Jan 2018 16:15:01 -0800
Andi Kleen <andi@firstfloor.org> wrote:

> > It should be a CPU_BUG bit as we have for the other mess. And that can be
> > used for patching.  
> 
> It has to be done at compile time because it requires a compiler option.
> 
> Most of the indirect calls are in C code.
> 
> So it cannot just patched in, only partially out.

You can replace the pushl ; jmp  with an alternatives section (although
there might be a lot of them). Even if gcc isn't smart enough to do that
perl is.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: [PATCH 10/11] retpoline/taint: Taint kernel for missing retpoline in compiler
  2018-01-03 23:09 ` [PATCH 10/11] retpoline/taint: Taint kernel for missing retpoline in compiler Andi Kleen
@ 2018-01-04  0:29   ` Thomas Gleixner
  2018-01-04  0:35     ` Randy Dunlap
  0 siblings, 1 reply; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-04  0:29 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, torvalds, gregkh, dwmw, tim.c.chen, linux-kernel,
	dave.hansen, Andi Kleen

On Wed, 3 Jan 2018, Andi Kleen wrote:
>  	unwind_init();
> +
> +#ifndef RETPOLINE
> +	add_taint(TAINT_NO_RETPOLINE, LOCKDEP_STILL_OK);
> +	pr_warn("No support for retpoline in kernel compiler\n");
> +	pr_warn("Kernel may be vulnerable to data leaks.\n");

That's blantantly wrong.

The kernel is not vulnerable to data leaks. The hardware is.

An that's what the CPU_BUG bit is for. If the mitigation is in place,
activate the proper feature bit like we did with PTI

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:29         ` Alan Cox
@ 2018-01-04  0:31           ` Thomas Gleixner
  2018-01-04  0:38             ` Alan Cox
  2018-01-04  0:40             ` Andi Kleen
  0 siblings, 2 replies; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-04  0:31 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andi Kleen, Linus Torvalds, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Thu, 4 Jan 2018, Alan Cox wrote:
> On Wed, 3 Jan 2018 16:15:01 -0800
> Andi Kleen <andi@firstfloor.org> wrote:
> 
> > > It should be a CPU_BUG bit as we have for the other mess. And that can be
> > > used for patching.  
> > 
> > It has to be done at compile time because it requires a compiler option.
> > 
> > Most of the indirect calls are in C code.
> > 
> > So it cannot just patched in, only partially out.
> 
> You can replace the pushl ; jmp  with an alternatives section (although
> there might be a lot of them). Even if gcc isn't smart enough to do that
> perl is.

So you say, that we finally need a perl interpreter in the kernel to do
alternative patching?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: [PATCH 10/11] retpoline/taint: Taint kernel for missing retpoline in compiler
  2018-01-04  0:29   ` Thomas Gleixner
@ 2018-01-04  0:35     ` Randy Dunlap
  0 siblings, 0 replies; 107+ messages in thread
From: Randy Dunlap @ 2018-01-04  0:35 UTC (permalink / raw)
  To: Thomas Gleixner, Andi Kleen
  Cc: tglx, torvalds, gregkh, dwmw, tim.c.chen, linux-kernel,
	dave.hansen, Andi Kleen

On 01/03/2018 04:29 PM, Thomas Gleixner wrote:
> On Wed, 3 Jan 2018, Andi Kleen wrote:
>>  	unwind_init();
>> +
>> +#ifndef RETPOLINE
>> +	add_taint(TAINT_NO_RETPOLINE, LOCKDEP_STILL_OK);
>> +	pr_warn("No support for retpoline in kernel compiler\n");
>> +	pr_warn("Kernel may be vulnerable to data leaks.\n");
> 
> That's blantantly wrong.
> 
> The kernel is not vulnerable to data leaks. The hardware is.

Well it's the kernel that is vulnerable.  The warning doesn't say what is
causing the vulnerability, unless it's the "kernel compiler."

> An that's what the CPU_BUG bit is for. If the mitigation is in place,
> activate the proper feature bit like we did with PTI

> So you say, that we finally need a perl interpreter in the kernel to do
> alternative patching?

Yes, Finally.

-- 
~Randy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:31           ` Thomas Gleixner
@ 2018-01-04  0:38             ` Alan Cox
  2018-01-04  0:40             ` Andi Kleen
  1 sibling, 0 replies; 107+ messages in thread
From: Alan Cox @ 2018-01-04  0:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andi Kleen, Linus Torvalds, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

> So you say, that we finally need a perl interpreter in the kernel to do
> alternative patching?

No but for weird cases like that

gcc -S
perl -e
as

does work.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:31           ` Thomas Gleixner
  2018-01-04  0:38             ` Alan Cox
@ 2018-01-04  0:40             ` Andi Kleen
  2018-01-04  8:15               ` Woodhouse, David
  1 sibling, 1 reply; 107+ messages in thread
From: Andi Kleen @ 2018-01-04  0:40 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alan Cox, Andi Kleen, Linus Torvalds, Greg Kroah-Hartman, dwmw,
	Tim Chen, Linux Kernel Mailing List, Dave Hansen

> So you say, that we finally need a perl interpreter in the kernel to do
> alternative patching?

I don't think perl or objtool makes sense. That would be just incredibly
fragile because compilers can reorder and mix code. 

It could be done with a gcc change I suppose. That should be reliable.

But that would need to be developed first. We don't have it right now.

As the first step a compile time approach should be sufficient.
We can add a CONFIG option so people can chose at compile time.

Then later we can investigate run time patching.

-Andi

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
  2018-01-04  0:00   ` Alan Cox
  2018-01-04  0:09   ` Andi Kleen
@ 2018-01-04  1:00   ` Paul Turner
  2018-01-04  1:41   ` Paolo Bonzini
  2018-01-04 11:26   ` Pavel Machek
  4 siblings, 0 replies; 107+ messages in thread
From: Paul Turner @ 2018-01-04  1:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, tglx, Greg Kroah-Hartman, Woodhouse, David, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Wed, Jan 3, 2018 at 3:51 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Wed, Jan 3, 2018 at 3:09 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> This is a fix for Variant 2 in
>> https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
>>
>> Any speculative indirect calls in the kernel can be tricked
>> to execute any kernel code, which may allow side channel
>> attacks that can leak arbitrary kernel data.
>
> Why is this all done without any configuration options?
>
> A *competent* CPU engineer would fix this by making sure speculation
> doesn't happen across protection domains. Maybe even a L1 I$ that is
> keyed by CPL.
>
> I think somebody inside of Intel needs to really take a long hard look
> at their CPU's, and actually admit that they have issues instead of
> writing PR blurbs that say that everything works as designed.
>
> .. and that really means that all these mitigation patches should be
> written with "not all CPU's are crap" in mind.
>
> Or is Intel basically saying "we are committed to selling you shit
> forever and ever, and never fixing anything"?
>
> Because if that's the case, maybe we should start looking towards the
> ARM64 people more.
>
> Please talk to management. Because I really see exactly two possibibilities:
>
>  - Intel never intends to fix anything
>
> OR
>
>  - these workarounds should have a way to disable them.
>
> Which of the two is it?
>
>                    Linus

With all of today's excitement these raced slightly with a post we are
making explaining the technique and its application.
The modifications are mostly at the compiler level, to produce
binaries which can safely execute on an affected target.
(Discussing how such a binary could be cross-compiled for vulnerable
and non-vulnerable targets is an interesting discussion, but right
now, all targets are obviously vulnerable.)

Let me finish getting the post up and I'll bring back more context here.

- Paul

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
                     ` (2 preceding siblings ...)
  2018-01-04  1:00   ` Paul Turner
@ 2018-01-04  1:41   ` Paolo Bonzini
  2018-01-04  1:59     ` Alan Cox
  2018-01-04 11:26   ` Pavel Machek
  4 siblings, 1 reply; 107+ messages in thread
From: Paolo Bonzini @ 2018-01-04  1:41 UTC (permalink / raw)
  To: Linus Torvalds, Andi Kleen
  Cc: tglx, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On 04/01/2018 00:51, Linus Torvalds wrote:
> On Wed, Jan 3, 2018 at 3:09 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> This is a fix for Variant 2 in
>> https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
>>
>> Any speculative indirect calls in the kernel can be tricked
>> to execute any kernel code, which may allow side channel
>> attacks that can leak arbitrary kernel data.
> 
> Why is this all done without any configuration options?
> 
> A *competent* CPU engineer would fix this by making sure speculation
> doesn't happen across protection domains. Maybe even a L1 I$ that is
> keyed by CPL.

It's not that simple; Andi's patches don't fix this yet, but there's
also the case of context switching, where two processes run in the same
protection domain and still shouldn't be able to interfere with each other.

But then, exactly because the retpoline approach adds quite some cruft
and leaves something to be desired, why even bother?  Intel has also
started releasing microcode updates that basically add some chicken bits
and also let you flush branch predictor state to handle the context
switch case.  Why not just require people to have their microcode
updated, and DTRT from the beginning?

Also, according to Google the KVM PoC can be broken simply by clearing
the registers on every exit to the hypervisor.  Of course it's just
mitigation, but perhaps _that_ is where we should start fixing the
user/kernel boundary too.

Paolo

> I think somebody inside of Intel needs to really take a long hard look
> at their CPU's, and actually admit that they have issues instead of
> writing PR blurbs that say that everything works as designed.
> 
> .. and that really means that all these mitigation patches should be
> written with "not all CPU's are crap" in mind.
> 
> Or is Intel basically saying "we are committed to selling you shit
> forever and ever, and never fixing anything"?
> 
> Because if that's the case, maybe we should start looking towards the
> ARM64 people more.
> 
> Please talk to management. Because I really see exactly two possibibilities:
> 
>  - Intel never intends to fix anything
> 
> OR
> 
>  - these workarounds should have a way to disable them.
> 
> Which of the two is it?
> 
>                    Linus
> 

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  1:41   ` Paolo Bonzini
@ 2018-01-04  1:59     ` Alan Cox
  2018-01-04  2:11       ` Paolo Bonzini
  0 siblings, 1 reply; 107+ messages in thread
From: Alan Cox @ 2018-01-04  1:59 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Linus Torvalds, Andi Kleen, tglx, Greg Kroah-Hartman, dwmw,
	Tim Chen, Linux Kernel Mailing List, Dave Hansen

> But then, exactly because the retpoline approach adds quite some cruft
> and leaves something to be desired, why even bother?  Intel has also

Performance

> Also, according to Google the KVM PoC can be broken simply by clearing
> the registers on every exit to the hypervisor.  Of course it's just
> mitigation, but perhaps _that_ is where we should start fixing the
> user/kernel boundary too.

The syscall boundary isn't quite that simple and clearing registers make
things harder but not impossible. It's a good hardening exercise as are
things like anding the top bit off userspace addresses on x86 64bit so
that even if someone speculates through a user copy they get to steal
their own data.

Other hardening possibilities include moving processes between cores,
yielding to another task for a bit or clearing L1 data if a syscall
returns an error, running only processes for the same uid on hyperthreaded
pairs/quads (more easy to do with VMs and something some cloud folk kind
of do anyway so that you more clearly get what you pay for in CPU time)
etc

Buffer overruns went from fly swatting, through organized analysis,
hardening, tools, better interfaces and language changes. History usually
repeats itself.

But absolutely - yes we should be looking at effective hardening
mechanisms in the kernel just as people will be in the hardware.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  1:59     ` Alan Cox
@ 2018-01-04  2:11       ` Paolo Bonzini
  2018-01-04  8:20         ` Woodhouse, David
  0 siblings, 1 reply; 107+ messages in thread
From: Paolo Bonzini @ 2018-01-04  2:11 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Andi Kleen, tglx, Greg Kroah-Hartman, dwmw,
	Tim Chen, Linux Kernel Mailing List, Dave Hansen

On 04/01/2018 02:59, Alan Cox wrote:
>> But then, exactly because the retpoline approach adds quite some cruft
>> and leaves something to be desired, why even bother?
>
> Performance

Dunno.  If I care about mitigating this threat, I wouldn't stop at
retpolines even if the full solution has pretty bad performance (it's
roughly in the same ballpark as PTI).  But if I don't care, I wouldn't
want retpolines either, since they do introduce a small slowdown (10-20
cycles per indirect branch, meaning that after a thousand such papercuts
they become slower than the full solution).

A couple manually written asm retpolines may be good as mitigation to
block the simplest PoCs (Linus may disagree), but patching the compiler,
getting alternatives right, etc. will take a while.  The only redeeming
grace of retpolines is that they don't require a microcode update, but
the microcode will be out there long before these patches are included
and trickle down to distros...  I just don't see the point in starting
from retpolines or drawing the line there.

Paolo

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:40             ` Andi Kleen
@ 2018-01-04  8:15               ` Woodhouse, David
  2018-01-04 15:53                 ` Andi Kleen
  0 siblings, 1 reply; 107+ messages in thread
From: Woodhouse, David @ 2018-01-04  8:15 UTC (permalink / raw)
  To: Andi Kleen, Thomas Gleixner
  Cc: Alan Cox, Linus Torvalds, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen


[-- Attachment #1.1: Type: text/plain, Size: 1823 bytes --]

On Wed, 2018-01-03 at 16:40 -0800, Andi Kleen wrote:
> > 
> > So you say, that we finally need a perl interpreter in the kernel
> > to do
> > alternative patching?
> I don't think perl or objtool makes sense. That would be just
> incredibly
> fragile because compilers can reorder and mix code. 
> 
> It could be done with a gcc change I suppose. That should be
> reliable.
> 
> But that would need to be developed first. We don't have it right
> now.
> 
> As the first step a compile time approach should be sufficient.
> We can add a CONFIG option so people can chose at compile time.
> 
> Then later we can investigate run time patching.

My original code, that Intel had included in the osv_v7.1 patch set,
already did runtime patching for all the explicit call sites:

+.macro JMP_THUNK reg:req
+#ifdef RETPOLINE
+       ALTERNATIVE __stringify(jmp __x86.indirect_thunk.\reg), __stringify(jmp *%\reg), X86_FEATURE_IBRS_ATT
+#else
+       jmp *\reg
+#endif
+.endm


.. and the thunks themselves were also using alternatives:

+ENTRY(__x86.indirect_thunk.\reg)
+       CFI_STARTPROC
+       ALTERNATIVE "call 2f", __stringify(jmp *%\reg), X86_FEATURE_IBRS_ATT
+1:
+       lfence
+       jmp     1b
+2:
+       mov     %\reg, (%\sp)
+       ret
+       CFI_ENDPROC
+ENDPROC(__x86.indirect_thunk.\reg)


So all you were left with was the single static jump in the call sites
which GCC emitted, which are basically harmless. It's not clear that it
makes sense to post-process GCC output just to eliminate those. 

Andi, you seem to have made a lot of changes, some cosmetic and some
make it look like you were working on an older version of the code.

For reference, here's my original version.

[-- Attachment #1.2: Type: text/x-patch, Size: 2469 bytes --]

From 970687a4c18ac5adb2124c673e31fbfe079220ca Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 23 Dec 2017 00:51:24 +0000
Subject: [PATCH 1/8] x86: Add initial retpoline support with
 -mindirect-branch=thunk-extern

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/Makefile           |  7 +++++++
 arch/x86/kernel/Makefile    |  1 +
 arch/x86/kernel/retpoline.S | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)
 create mode 100644 arch/x86/kernel/retpoline.S

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a20eacd..a6d5d39 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -235,6 +235,13 @@ KBUILD_CFLAGS += -Wno-sign-compare
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
+#
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
+ifneq ($(RETPOLINE_CFLAGS),)
+	KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
+	KBUILD_AFLAGS += -DRETPOLINE
+endif
+
 archscripts: scripts_basic
 	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 295abaa..d1af553 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
+obj-y			+= retpoline.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o
 obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)	+= ksysfs.o
diff --git a/arch/x86/kernel/retpoline.S b/arch/x86/kernel/retpoline.S
new file mode 100644
index 0000000..b2b17b1
--- /dev/null
+++ b/arch/x86/kernel/retpoline.S
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef RETPOLINE
+
+#include <linux/stringify.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+
+.macro THUNK sp reg
+	.section .text.__x86.indirect_thunk.\reg
+
+ENTRY(__x86.indirect_thunk.\reg)
+	CFI_STARTPROC
+	ALTERNATIVE "call 2f", __stringify(jmp *%\reg), X86_FEATURE_IBRS_ATT
+1:
+	lfence
+	jmp	1b
+2:
+	mov	%\reg, (%\sp)
+	ret
+	CFI_ENDPROC
+ENDPROC(__x86.indirect_thunk.\reg)
+.endm
+
+#ifdef CONFIG_64BIT
+.irp reg rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+	THUNK rsp \reg
+.endr
+#else
+.irp reg eax ebx ecx edx esi edi ebp
+	THUNK esp \reg
+.endr
+#endif
+
+#endif /* RETPOLINE */
-- 
2.7.4


[-- Attachment #1.3: Type: text/x-patch, Size: 8359 bytes --]

From 68521da88d6d05ed2b9ad5e57a7475e8d3fdc904 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 23 Dec 2017 01:59:04 +0000
Subject: [PATCH 2/8] x86: Replace indirect branches in asm with
 CALL_THUNK/JMP_THUNK macro

This uses the same retpoline thunks that the -mindirect-branch= GCC option
uses, to avoid speculative branch prediction problems.

Open-code a version of the thunk in the syscall trampoline because we
can't just jump to the existing ones.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/crypto/aesni-intel_asm.S            |  5 +++--
 arch/x86/crypto/camellia-aesni-avx-asm_64.S  |  3 ++-
 arch/x86/crypto/camellia-aesni-avx2-asm_64.S |  3 ++-
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S    |  3 ++-
 arch/x86/entry/entry_32.S                    |  4 ++--
 arch/x86/entry/entry_64.S                    | 19 +++++++++++++++----
 arch/x86/include/asm/spec_ctrl.h             | 16 ++++++++++++++++
 arch/x86/kernel/ftrace_32.S                  |  6 ++++--
 arch/x86/kernel/ftrace_64.S                  |  7 ++++---
 arch/x86/lib/checksum_32.S                   |  5 +++--
 10 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 16627fe..52160d1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,7 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 #include <asm/frame.h>
+#include <asm/spec_ctrl.h>
 
 /*
  * The following macros are used to move an (un)aligned 16 byte value to/from
@@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8)
 	pxor INC, STATE4
 	movdqu IV, 0x30(OUTP)
 
-	call *%r11
+	CALL_THUNK r11
 
 	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
@@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8)
 	_aesni_gf128mul_x_ble()
 	movups IV, (IVP)
 
-	call *%r11
+	CALL_THUNK r11
 
 	movdqu 0x40(OUTP), INC
 	pxor INC, STATE1
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index f7c495e..3bf23db 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -17,6 +17,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/spec_ctrl.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way:
 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
 
-	call *%r9;
+	CALL_THUNK r9;
 
 	addq $(16 * 16), %rsp;
 
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index eee5b39..1f06c98 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -12,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/spec_ctrl.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way:
 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-	call *%r9;
+	CALL_THUNK r9;
 
 	addq $(16 * 32), %rsp;
 
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 7a7de27..fd89745 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -45,6 +45,7 @@
 
 #include <asm/inst.h>
 #include <linux/linkage.h>
+#include <asm/spec_ctrl.h>
 
 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
 
@@ -172,7 +173,7 @@ continue_block:
 	movzxw  (bufp, %rax, 2), len
 	lea	crc_array(%rip), bufp
 	lea     (bufp, len, 1), bufp
-	jmp     *bufp
+	JMP_THUNK bufp
 
 	################################################################
 	## 2a) PROCESS FULL BLOCKS:
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ace8f32..608d198 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -290,7 +290,7 @@ ENTRY(ret_from_fork)
 
 	/* kernel thread */
 1:	movl	%edi, %eax
-	call	*%ebx
+	CALL_THUNK ebx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
@@ -919,7 +919,7 @@ common_exception:
 	movl	%ecx, %es
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
-	call	*%edi
+	CALL_THUNK edi
 	jmp	ret_from_exception
 END(common_exception)
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index c6db10f..0d3ff7f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -189,7 +189,17 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	 */
 	pushq	%rdi
 	movq	$entry_SYSCALL_64_stage2, %rdi
-	jmp	*%rdi
+	/*
+	 * Open-code the retpoline from retpoline.S, because we can't
+	 * just jump to it directly.
+	 */
+	ALTERNATIVE "call 2f", "jmp *%rdi", X86_FEATURE_IBRS_ATT
+1:
+	lfence
+	jmp	1b
+2:
+	mov	%rdi, (%rsp)
+	ret
 END(entry_SYSCALL_64_trampoline)
 
 	.popsection
@@ -276,7 +286,8 @@ entry_SYSCALL_64_fastpath:
 	 * It might end up jumping to the slow path.  If it jumps, RAX
 	 * and all argument registers are clobbered.
 	 */
-	call	*sys_call_table(, %rax, 8)
+	movq	sys_call_table(, %rax, 8), %rax
+	CALL_THUNK rax
 .Lentry_SYSCALL_64_after_fastpath_call:
 
 	movq	%rax, RAX(%rsp)
@@ -449,7 +460,7 @@ ENTRY(stub_ptregs_64)
 	jmp	entry_SYSCALL64_slow_path
 
 1:
-	jmp	*%rax				/* Called from C */
+	JMP_THUNK rax				/* Called from C */
 END(stub_ptregs_64)
 
 .macro ptregs_stub func
@@ -528,7 +539,7 @@ ENTRY(ret_from_fork)
 1:
 	/* kernel thread */
 	movq	%r12, %rdi
-	call	*%rbx
+	CALL_THUNK rbx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h
index e8ed6fd..59b664e 100644
--- a/arch/x86/include/asm/spec_ctrl.h
+++ b/arch/x86/include/asm/spec_ctrl.h
@@ -180,5 +180,21 @@ ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL
 ALTERNATIVE __stringify(__ASM_STUFF_RSB), "", X86_FEATURE_SMEP
 .endm
 
+.macro JMP_THUNK reg:req
+#ifdef RETPOLINE
+	ALTERNATIVE __stringify(jmp __x86.indirect_thunk.\reg), __stringify(jmp *%\reg), X86_FEATURE_IBRS_ATT
+#else
+	jmp *\reg
+#endif
+.endm
+
+.macro CALL_THUNK reg:req
+#ifdef RETPOLINE
+	ALTERNATIVE __stringify(call __x86.indirect_thunk.\reg), __stringify(call *%\reg), X86_FEATURE_IBRS_ATT
+#else
+	jmp *\reg
+#endif
+.endm
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_SPEC_CTRL_H */
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index b6c6468..e89c6be 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -8,6 +8,7 @@
 #include <asm/segment.h>
 #include <asm/export.h>
 #include <asm/ftrace.h>
+#include <asm/spec_ctrl.h>
 
 #ifdef CC_USING_FENTRY
 # define function_hook	__fentry__
@@ -197,7 +198,8 @@ ftrace_stub:
 	movl	0x4(%ebp), %edx
 	subl	$MCOUNT_INSN_SIZE, %eax
 
-	call	*ftrace_trace_function
+	movl	ftrace_trace_function, %ecx
+	CALL_THUNK ecx
 
 	popl	%edx
 	popl	%ecx
@@ -241,5 +243,5 @@ return_to_handler:
 	movl	%eax, %ecx
 	popl	%edx
 	popl	%eax
-	jmp	*%ecx
+	JMP_THUNK ecx
 #endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index c832291..6c97463 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -7,6 +7,7 @@
 #include <asm/ptrace.h>
 #include <asm/ftrace.h>
 #include <asm/export.h>
+#include <asm/spec_ctrl.h>
 
 
 	.code64
@@ -286,8 +287,8 @@ trace:
 	 * ip and parent ip are used and the list function is called when
 	 * function tracing is enabled.
 	 */
-	call   *ftrace_trace_function
-
+	movq ftrace_trace_function, %r8
+	CALL_THUNK r8
 	restore_mcount_regs
 
 	jmp fgraph_trace
@@ -329,5 +330,5 @@ GLOBAL(return_to_handler)
 	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	addq $24, %rsp
-	jmp *%rdi
+	JMP_THUNK rdi
 #endif
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4d34bb5..86df74c 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -29,7 +29,8 @@
 #include <asm/errno.h>
 #include <asm/asm.h>
 #include <asm/export.h>
-				
+#include <asm/spec_ctrl.h>
+
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
  */
@@ -156,7 +157,7 @@ ENTRY(csum_partial)
 	negl %ebx
 	lea 45f(%ebx,%ebx,2), %ebx
 	testl %esi, %esi
-	jmp *%ebx
+	JMP_THUNK ebx
 
 	# Handle 2-byte-aligned regions
 20:	addw (%esi), %ax
-- 
2.7.4


[-- Attachment #1.4: Type: text/x-patch, Size: 5961 bytes --]

From 0441f6cf7f593502ecfec0da954ef3e15cae7b70 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 23 Dec 2017 02:11:18 +0000
Subject: [PATCH 3/8] x86: Use retpoline for calls in inline asm

Fix up PV operations and hypercalls to avoid indirect calls.

There is perhaps scope for optimising the pvops case. For ops which aren't
going to be changed at runtime (all of them?), we could just patch the code
to make it use *direct* jumps instead of indirect.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/mshyperv.h       | 18 ++++++++++--------
 arch/x86/include/asm/paravirt_types.h | 15 ++++++++++++---
 arch/x86/include/asm/spec_ctrl.h      | 13 +++++++++++++
 arch/x86/include/asm/xen/hypercall.h  |  5 +++--
 4 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 581bb54..ca7463b 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -7,6 +7,7 @@
 #include <linux/nmi.h>
 #include <asm/io.h>
 #include <asm/hyperv.h>
+#include <asm/spec_ctrl.h>
 
 /*
  * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
@@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 		return U64_MAX;
 
 	__asm__ __volatile__("mov %4, %%r8\n"
-			     "call *%5"
+			     CALL_THUNK
 			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 			       "+c" (control), "+d" (input_address)
-			     :  "r" (output_address), "m" (hv_hypercall_pg)
+			     :  "r" (output_address),
+				THUNK_TARGET(hv_hypercall_pg)
 			     : "cc", "memory", "r8", "r9", "r10", "r11");
 #else
 	u32 input_address_hi = upper_32_bits(input_address);
@@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	if (!hv_hypercall_pg)
 		return U64_MAX;
 
-	__asm__ __volatile__("call *%7"
+	__asm__ __volatile__(CALL_THUNK
 			     : "=A" (hv_status),
 			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
 			     : "A" (control),
 			       "b" (input_address_hi),
 			       "D"(output_address_hi), "S"(output_address_lo),
-			       "m" (hv_hypercall_pg)
+			       THUNK_TARGET(hv_hypercall_pg)
 			     : "cc", "memory");
 #endif /* !x86_64 */
 	return hv_status;
@@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 
 #ifdef CONFIG_X86_64
 	{
-		__asm__ __volatile__("call *%4"
+		__asm__ __volatile__(CALL_THUNK
 				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 				       "+c" (control), "+d" (input1)
-				     : "m" (hv_hypercall_pg)
+				     : THUNK_TARGET(hv_hypercall_pg)
 				     : "cc", "r8", "r9", "r10", "r11");
 	}
 #else
@@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 		u32 input1_hi = upper_32_bits(input1);
 		u32 input1_lo = lower_32_bits(input1);
 
-		__asm__ __volatile__ ("call *%5"
+		__asm__ __volatile__ (CALL_THUNK
 				      : "=A"(hv_status),
 					"+c"(input1_lo),
 					ASM_CALL_CONSTRAINT
 				      :	"A" (control),
 					"b" (input1_hi),
-					"m" (hv_hypercall_pg)
+					THUNK_TARGET(hv_hypercall_pg)
 				      : "cc", "edi", "esi");
 	}
 #endif
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 6ec54d0..dfff11e 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -336,11 +336,17 @@ extern struct pv_lock_ops pv_lock_ops;
 #define PARAVIRT_PATCH(x)					\
 	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
 
+#define paravirt_clobber(clobber)		\
+	[paravirt_clobber] "i" (clobber)
+#ifdef RETPOLINE
+#define paravirt_type(op)				\
+	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
+	[paravirt_opptr] "r" ((op))
+#else
 #define paravirt_type(op)				\
 	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
 	[paravirt_opptr] "i" (&(op))
-#define paravirt_clobber(clobber)		\
-	[paravirt_clobber] "i" (clobber)
+#endif
 
 /*
  * Generate some code, and mark it as patchable by the
@@ -392,8 +398,11 @@ int paravirt_disable_iospace(void);
  * offset into the paravirt_patch_template structure, and can therefore be
  * freely converted back into a structure offset.
  */
+#ifdef RETPOLINE
+#define PARAVIRT_CALL	"call __x86.indirect_thunk.%V[paravirt_opptr];"
+#else
 #define PARAVIRT_CALL	"call *%c[paravirt_opptr];"
-
+#endif
 /*
  * These macros are intended to wrap calls through one of the paravirt
  * ops structs, so that they can be later identified and patched at
diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h
index 59b664e..58a49a2 100644
--- a/arch/x86/include/asm/spec_ctrl.h
+++ b/arch/x86/include/asm/spec_ctrl.h
@@ -196,5 +196,18 @@ ALTERNATIVE __stringify(__ASM_STUFF_RSB), "", X86_FEATURE_SMEP
 #endif
 .endm
 
+#else /* __ASSEMBLY__ */
+
+#ifdef RETPOLINE
+#define CALL_THUNK ALTERNATIVE(					\
+        "call __x86.indirect_thunk.%V[thunk_target]",		\
+	"call *%[thunk_target]", X86_FEATURE_IBRS_ATT)
+#define THUNK_TARGET(addr) [thunk_target] "r" (addr)
+#else
+#define CALL_THUNK "call *%[thunk_target]"
+#define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
+
 #endif /* __ASSEMBLY__ */
+
 #endif /* _ASM_X86_SPEC_CTRL_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 7cb282e..90df832 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -44,6 +44,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/smap.h>
+#include <asm/spec_ctrl.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
@@ -217,9 +218,9 @@ privcmd_call(unsigned call,
 	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
 
 	stac();
-	asm volatile("call *%[call]"
+	asm volatile(CALL_THUNK
 		     : __HYPERCALL_5PARAM
-		     : [call] "a" (&hypercall_page[call])
+		     : [thunk_target] "a" (&hypercall_page[call])
 		     : __HYPERCALL_CLOBBER5);
 	clac();
 
-- 
2.7.4


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  2:11       ` Paolo Bonzini
@ 2018-01-04  8:20         ` Woodhouse, David
  2018-01-04 11:42           ` Pavel Machek
  2018-01-04 19:57           ` Jon Masters
  0 siblings, 2 replies; 107+ messages in thread
From: Woodhouse, David @ 2018-01-04  8:20 UTC (permalink / raw)
  To: Paolo Bonzini, Alan Cox
  Cc: Linus Torvalds, Andi Kleen, tglx, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

[-- Attachment #1: Type: text/plain, Size: 1656 bytes --]

On Thu, 2018-01-04 at 03:11 +0100, Paolo Bonzini wrote:
> On 04/01/2018 02:59, Alan Cox wrote:
> >> But then, exactly because the retpoline approach adds quite some cruft
> >> and leaves something to be desired, why even bother?
> >
> > Performance
> 
> Dunno.  If I care about mitigating this threat, I wouldn't stop at
> retpolines even if the full solution has pretty bad performance (it's
> roughly in the same ballpark as PTI).  But if I don't care, I wouldn't
> want retpolines either, since they do introduce a small slowdown (10-20
> cycles per indirect branch, meaning that after a thousand such papercuts
> they become slower than the full solution).
> 
> A couple manually written asm retpolines may be good as mitigation to
> block the simplest PoCs (Linus may disagree), but patching the compiler,
> getting alternatives right, etc. will take a while.  The only redeeming
> grace of retpolines is that they don't require a microcode update, but
> the microcode will be out there long before these patches are included
> and trickle down to distros...  I just don't see the point in starting
> from retpolines or drawing the line there.

No, really. The full mitigation with the microcode update and IBRS
support is *slow*. Horribly slow.

By using retpoline, we avoid the need to set IBRS on *every* entry into
the kernel. It gives you *most* of that performance back.

It's horrid, but it seems to be the best option we have. And in my
original patch set, it goes away almost completely if it isn't being
used. (Apart from the fact that your eyes may still bleed; I can't do
anything about that. Sorry).


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: [PATCH 09/11] x86/retpoline: Finally enable retpoline for C code
  2018-01-03 23:09 ` [PATCH 09/11] x86/retpoline: Finally enable retpoline for C code Andi Kleen
@ 2018-01-04  8:28   ` Greg KH
  2018-01-04  8:30     ` Dave Hansen
  0 siblings, 1 reply; 107+ messages in thread
From: Greg KH @ 2018-01-04  8:28 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, torvalds, dwmw, tim.c.chen, linux-kernel, dave.hansen, Dave Hansen

On Wed, Jan 03, 2018 at 03:09:32PM -0800, Andi Kleen wrote:
> From: Dave Hansen <dave.hansen@linux.intel.com>
> 
> From: David Woodhouse <dwmw@amazon.co.uk>

We do now accept "Co-Developed-by:" in the signed-off-by area to handle
patches that are worked on by multiple people.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: [PATCH 09/11] x86/retpoline: Finally enable retpoline for C code
  2018-01-04  8:28   ` Greg KH
@ 2018-01-04  8:30     ` Dave Hansen
  0 siblings, 0 replies; 107+ messages in thread
From: Dave Hansen @ 2018-01-04  8:30 UTC (permalink / raw)
  To: Greg KH, Andi Kleen
  Cc: tglx, torvalds, dwmw, tim.c.chen, linux-kernel, Dave Hansen

On 01/04/2018 12:28 AM, Greg KH wrote:
> On Wed, Jan 03, 2018 at 03:09:32PM -0800, Andi Kleen wrote:
>> From: Dave Hansen <dave.hansen@linux.intel.com>
>>
>> From: David Woodhouse <dwmw@amazon.co.uk>
> We do now accept "Co-Developed-by:" in the signed-off-by area to handle
> patches that are worked on by multiple people.

FWIW, this was probably just some artifact from some stupid scripts on
my end and going from quilt to git a few times.  I didn't do anything
useful here.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
                     ` (3 preceding siblings ...)
  2018-01-04  1:41   ` Paolo Bonzini
@ 2018-01-04 11:26   ` Pavel Machek
  2018-01-04 11:54     ` Alan Cox
  2018-01-04 18:33     ` Linus Torvalds
  4 siblings, 2 replies; 107+ messages in thread
From: Pavel Machek @ 2018-01-04 11:26 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, tglx, Greg Kroah-Hartman, dwmw, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

[-- Attachment #1: Type: text/plain, Size: 1260 bytes --]

On Wed 2018-01-03 15:51:35, Linus Torvalds wrote:
> On Wed, Jan 3, 2018 at 3:09 PM, Andi Kleen <andi@firstfloor.org> wrote:
> > This is a fix for Variant 2 in
> > https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
> >
> > Any speculative indirect calls in the kernel can be tricked
> > to execute any kernel code, which may allow side channel
> > attacks that can leak arbitrary kernel data.
> 
> Why is this all done without any configuration options?
> 
> A *competent* CPU engineer would fix this by making sure speculation
> doesn't happen across protection domains. Maybe even a L1 I$ that is
> keyed by CPL.

Would that be enough?

AFAICT this will be pretty tricky to fix; it looks like you could
"attack" one userland application from another. Probing does not have
to work on L1 cache level; even main memory has "state".

It seems that complete fix would be considering any cache modification
and any memory access as a "side effect" -- so not okay to do
speculatively.

But that sounds... quite expensive for the performance...?

									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  8:20         ` Woodhouse, David
@ 2018-01-04 11:42           ` Pavel Machek
  2018-01-04 11:47             ` Woodhouse, David
  2018-01-04 19:57           ` Jon Masters
  1 sibling, 1 reply; 107+ messages in thread
From: Pavel Machek @ 2018-01-04 11:42 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen, tglx,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Dave Hansen

On Thu 2018-01-04 08:20:14, Woodhouse, David wrote:
> On Thu, 2018-01-04 at 03:11 +0100, Paolo Bonzini wrote:
> > On 04/01/2018 02:59, Alan Cox wrote:
> > >> But then, exactly because the retpoline approach adds quite some cruft
> > >> and leaves something to be desired, why even bother?
> > >
> > > Performance
> > 
> > Dunno.  If I care about mitigating this threat, I wouldn't stop at
> > retpolines even if the full solution has pretty bad performance (it's
> > roughly in the same ballpark as PTI).  But if I don't care, I wouldn't
> > want retpolines either, since they do introduce a small slowdown (10-20
> > cycles per indirect branch, meaning that after a thousand such papercuts
> > they become slower than the full solution).
> > 
> > A couple manually written asm retpolines may be good as mitigation to
> > block the simplest PoCs (Linus may disagree), but patching the compiler,
> > getting alternatives right, etc. will take a while.  The only redeeming
> > grace of retpolines is that they don't require a microcode update, but
> > the microcode will be out there long before these patches are included
> > and trickle down to distros...  I just don't see the point in starting
> > from retpolines or drawing the line there.
> 
> No, really. The full mitigation with the microcode update and IBRS
> support is *slow*. Horribly slow.

What is IBRS? Invalidate BRanch prediction bufferS?

> By using retpoline, we avoid the need to set IBRS on *every* entry into
> the kernel. It gives you *most* of that performance back.
> 
> It's horrid, but it seems to be the best option we have. And in my
> original patch set, it goes away almost completely if it isn't being
> used. (Apart from the fact that your eyes may still bleed; I can't do
> anything about that. Sorry).

Bleeding eyes sound like quite serious side-effect :-).

								Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 11:42           ` Pavel Machek
@ 2018-01-04 11:47             ` Woodhouse, David
  2018-01-04 14:20               ` Paolo Bonzini
  0 siblings, 1 reply; 107+ messages in thread
From: Woodhouse, David @ 2018-01-04 11:47 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen, tglx,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Dave Hansen

[-- Attachment #1: Type: text/plain, Size: 881 bytes --]

On Thu, 2018-01-04 at 12:42 +0100, Pavel Machek wrote:
> 
> > No, really. The full mitigation with the microcode update and IBRS
> > support is *slow*. Horribly slow.
> 
> What is IBRS? Invalidate BRanch prediction bufferS?

That isn't the precise acronym, but yes.

The branch predictor flush that, without retpoline, we have to do on
every entry to the kernel. Requires new microcode, and the patches that
I believe Intel are *about* to post...

The first variant (all they can do on current CPUs with a microcode
update) is really slow, and thus retpoline is *very* much the preferred
option to protect the kernel on current CPUs.

Later CPUs will apparently have a better version of IBRS which is
preferred, so we'll ALTERNATIVE out the retpoline if we discover we're
running on one of those.

Public docs will, presumably, be forthcoming Real Soon Now™.

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 11:26   ` Pavel Machek
@ 2018-01-04 11:54     ` Alan Cox
  2018-01-04 18:33     ` Linus Torvalds
  1 sibling, 0 replies; 107+ messages in thread
From: Alan Cox @ 2018-01-04 11:54 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Linus Torvalds, Andi Kleen, tglx, Greg Kroah-Hartman, dwmw,
	Tim Chen, Linux Kernel Mailing List, Dave Hansen

On Thu, 4 Jan 2018 12:26:14 +0100
Pavel Machek <pavel@ucw.cz> wrote:

> On Wed 2018-01-03 15:51:35, Linus Torvalds wrote:
> > On Wed, Jan 3, 2018 at 3:09 PM, Andi Kleen <andi@firstfloor.org> wrote:  
> > > This is a fix for Variant 2 in
> > > https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
> > >
> > > Any speculative indirect calls in the kernel can be tricked
> > > to execute any kernel code, which may allow side channel
> > > attacks that can leak arbitrary kernel data.  
> > 
> > Why is this all done without any configuration options?
> > 
> > A *competent* CPU engineer would fix this by making sure speculation
> > doesn't happen across protection domains. Maybe even a L1 I$ that is
> > keyed by CPL.  
> 
> Would that be enough?

For the entire system - no. To start with the current most dangerous
attack is the javascript one. And that is an attack by a process on
itself. Likewise simply keying L1I by CPL wouldn't stop ring 3
processes attacking one another or deal with virtual machines properly.
For some of those cases (notably the JIT ones) it's quite probable that
there isn't enough information for the processor to even infer what is
needed.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 11:47             ` Woodhouse, David
@ 2018-01-04 14:20               ` Paolo Bonzini
  2018-01-04 14:51                 ` Andrew Cooper
                                   ` (2 more replies)
  0 siblings, 3 replies; 107+ messages in thread
From: Paolo Bonzini @ 2018-01-04 14:20 UTC (permalink / raw)
  To: Woodhouse, David, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

On 04/01/2018 12:47, Woodhouse, David wrote:
> On Thu, 2018-01-04 at 12:42 +0100, Pavel Machek wrote:
>>
>>> No, really. The full mitigation with the microcode update and IBRS
>>> support is *slow*. Horribly slow.
>>
>> What is IBRS? Invalidate BRanch prediction bufferS?
> 
> That isn't the precise acronym, but yes.

My stab at backronyming it was "indirect branch restricted speculation",
and there's also IBPB which is "indirect branch prediction barrier".

> The branch predictor flush that, without retpoline, we have to do on
> every entry to the kernel. Requires new microcode, and the patches that
> I believe Intel are *about* to post...

Based on our experiments it was actually pretty good on Skylake, and bad
to horrible on earlier machines.  Still, microbenchmarks were in the
same ballpark as PTI.

BTW, the choices we have implemented in RHEL are basically:

* do nothing

* turn off indirect branch prediction in kernel mode (IBRS=1 in ring 0)

* completely turn off indirect branch prediction (IBRS=1 always, which
would also let future CPUs do their thing), which can be good for
paranoia but also for performance in some worklods

* never turn off indirect branch prediction, but use a branch prediction
barrier on every mode switch (needed for current AMD microcode)

and though we're obviously not wed to the specific debugfs paths and
names, I'd really like all of them to be available upstream too.  The
effect on performance (or lack of performance...) *is* there, so people
should be able to understand the effect on their workloads and customize
accordingly.

I hope that, after the first few weeks of panic, people will learn to
decide on a case-by-case basis whether to enable both PTI and IBRS/IBPB,
or none.  Distros will provide tuning guide and automatic tuning
profiles, etc., and the world will go on.

Paolo

ps: BTW^2 (and this is of course not about you, David) I'm disappointed
that for "Spectre" there was no discussion between upstream developers,
or between Linux vendors, or in fact hardly any discussion beyond "these
are the patches".  I understand that (unlike PTI) there was no back
story to cover up the actual vulnerability, but... grow up, folks.
Seriously, "these are the patches" won't fly with either upstream or
distros.


> The first variant (all they can do on current CPUs with a microcode
> update) is really slow, and thus retpoline is *very* much the preferred
> option to protect the kernel on current CPUs.

> Later CPUs will apparently have a better version of IBRS which is
> preferred, so we'll ALTERNATIVE out the retpoline if we discover we're
> running on one of those.
> 
> Public docs will, presumably, be forthcoming Real Soon Now™.
> 
> 
> 
> Amazon Web Services UK Limited. Registered in England and Wales with
> registration number 08650665 and which has its registered office at 60
> Holborn Viaduct, London EC1A 2FD, United Kingdom.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 14:20               ` Paolo Bonzini
@ 2018-01-04 14:51                 ` Andrew Cooper
  2018-01-04 15:29                   ` Woodhouse, David
  2018-01-04 15:32                   ` Paolo Bonzini
  2018-01-04 14:55                 ` Woodhouse, David
  2018-01-04 18:24                 ` Pavel Machek
  2 siblings, 2 replies; 107+ messages in thread
From: Andrew Cooper @ 2018-01-04 14:51 UTC (permalink / raw)
  To: Paolo Bonzini, Woodhouse, David, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

On 04/01/18 14:20, Paolo Bonzini wrote:
> On 04/01/2018 12:47, Woodhouse, David wrote:
>> On Thu, 2018-01-04 at 12:42 +0100, Pavel Machek wrote:
>>>> No, really. The full mitigation with the microcode update and IBRS
>>>> support is *slow*. Horribly slow.
>>> What is IBRS? Invalidate BRanch prediction bufferS?
>> That isn't the precise acronym, but yes.
> My stab at backronyming it was "indirect branch restricted speculation",
> and there's also IBPB which is "indirect branch prediction barrier".
>
>> The branch predictor flush that, without retpoline, we have to do on
>> every entry to the kernel. Requires new microcode, and the patches that
>> I believe Intel are *about* to post...
> Based on our experiments it was actually pretty good on Skylake, and bad
> to horrible on earlier machines.  Still, microbenchmarks were in the
> same ballpark as PTI.
>
> BTW, the choices we have implemented in RHEL are basically:
>
> * do nothing
>
> * turn off indirect branch prediction in kernel mode (IBRS=1 in ring 0)
>
> * completely turn off indirect branch prediction (IBRS=1 always, which
> would also let future CPUs do their thing), which can be good for
> paranoia but also for performance in some worklods
>
> * never turn off indirect branch prediction, but use a branch prediction
> barrier on every mode switch (needed for current AMD microcode)

Where have you got this idea from?  Using IBPB on every mode switch
would be an insane overhead to take, and isn't necessary.

Also, remember that PTI and these mitigations are for orthogonal issues.

Perhaps it is easiest to refer directly to the Xen SP2 mitigations and
my commentary of what is going on:
http://xenbits.xen.org/gitweb/?p=people/andrewcoop/xen.git;a=blob;f=xen/arch/x86/spec_ctrl.c;h=79aedf774a390293dfd564ce978500085344e305;hb=refs/heads/sp2-mitigations-v6.5#l192

With the GCC -mindirect-branch=thunk-external support, and microcode,
Xen will make a boot-time choice between using Retpoline, Lfence (which
is the better AMD option, and more performant than retpoline), or IBRS
on Skylake and newer processors where it is strictly necessary, as well
as using IBPB whenever available.

It also supports virtualising IBRS for guest usage when the kernel has
chosen not to use it; a configuration I haven't seen in any of the Linux
patch series thusfar.

~Andrew

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 14:20               ` Paolo Bonzini
  2018-01-04 14:51                 ` Andrew Cooper
@ 2018-01-04 14:55                 ` Woodhouse, David
  2018-01-04 18:24                 ` Pavel Machek
  2 siblings, 0 replies; 107+ messages in thread
From: Woodhouse, David @ 2018-01-04 14:55 UTC (permalink / raw)
  To: Paolo Bonzini, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

[-- Attachment #1: Type: text/plain, Size: 2379 bytes --]

On Thu, 2018-01-04 at 15:20 +0100, Paolo Bonzini wrote:
> On 04/01/2018 12:47, Woodhouse, David wrote:
> > 
> > On Thu, 2018-01-04 at 12:42 +0100, Pavel Machek wrote:
> > > 
> > > 
> > > > 
> > > > No, really. The full mitigation with the microcode update and IBRS
> > > > support is *slow*. Horribly slow.
> > > What is IBRS? Invalidate BRanch prediction bufferS?
> > That isn't the precise acronym, but yes.
>
> My stab at backronyming it was "indirect branch restricted speculation",
> and there's also IBPB which is "indirect branch prediction barrier".

Something like that, yeah. But remember, setting IBRS is a barrier too.
You can't just set it and forget it; you have to do it on *every* entry
into the kernel.

Later CPUs are intended to have an 'IBRS all the time' feature which is
set-and-forget, and will perform much better, I believe. If we find
we're running on a CPU with that, we'll turn off the retpoline with
alternatives.

> > 
> > The branch predictor flush that, without retpoline, we have to do on
> > every entry to the kernel. Requires new microcode, and the patches that
> > I believe Intel are *about* to post...
> Based on our experiments it was actually pretty good on Skylake, and bad
> to horrible on earlier machines.  Still, microbenchmarks were in the
> same ballpark as PTI.

That's good, because retpoline doesn't work on Skylake (since Skylake
will actually predict rets too, and then you're just completely hosed).

So on Skylake, we'll be using the basic IBRS support too, and also
alternativing out the retpoline.

> ps: BTW^2 (and this is of course not about you, David) I'm disappointed
> that for "Spectre" there was no discussion between upstream developers,
> or between Linux vendors, or in fact hardly any discussion beyond "these
> are the patches".  I understand that (unlike PTI) there was no back
> story to cover up the actual vulnerability, but... grow up, folks.
> Seriously, "these are the patches" won't fly with either upstream or
> distros.

Yeah, that could have been improved. Although I note the patch sets
that Intel were waving around did have fixes from both you and me, by
the end. It wasn't *entirely* behind closed doors.

On the retpoline side, HJ was *very* responsive to suggestions, and was
giving me a new set of GCC patches almost daily at one point.

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 14:51                 ` Andrew Cooper
@ 2018-01-04 15:29                   ` Woodhouse, David
  2018-01-04 15:32                     ` Paolo Bonzini
                                       ` (2 more replies)
  2018-01-04 15:32                   ` Paolo Bonzini
  1 sibling, 3 replies; 107+ messages in thread
From: Woodhouse, David @ 2018-01-04 15:29 UTC (permalink / raw)
  To: Andrew Cooper, Paolo Bonzini, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

[-- Attachment #1: Type: text/plain, Size: 1837 bytes --]

On Thu, 2018-01-04 at 14:51 +0000, Andrew Cooper wrote:
> 
> > * never turn off indirect branch prediction, but use a branch prediction
> > barrier on every mode switch (needed for current AMD microcode)
> 
> Where have you got this idea from?  Using IBPB on every mode switch
> would be an insane overhead to take, and isn't necessary.

AMD *only* has IBPB and not IBRS, but IIRC you don't need to do it on
every context switch into the kernel; only when switching between
VMs/processes?

> Also, remember that PTI and these mitigations are for orthogonal issues.
> 
> Perhaps it is easiest to refer directly to the Xen SP2 mitigations and
> my commentary of what is going on:
> http://xenbits.xen.org/gitweb/?p=people/andrewcoop/xen.git;a=blob;f=xen/arch/x86/spec_ctrl.c;h=79aedf774a390293dfd564ce978500085344e305;hb=refs/heads/sp2-mitigations-v6.5#l192
> 
> With the GCC -mindirect-branch=thunk-external support, and microcode,
> Xen will make a boot-time choice between using Retpoline, Lfence (which
> is the better AMD option, and more performant than retpoline), or IBRS
> on Skylake and newer processors where it is strictly necessary, as well
> as using IBPB whenever available.

I need to pull in the AMD lfence alternative for retpoline, giving us a
3-way choice of the existing retpoline thunk, "lfence; jmp *%\reg", and
a bare "jmp *%\reg".

Then the IBRS bits can be added on top.

> It also supports virtualising IBRS for guest usage when the kernel has
> chosen not to use it; a configuration I haven't seen in any of the Linux
> patch series thusfar.

Adding that for KVM is in the Linux IBRS patch set that I've seen.
Didn't we already have a conversation about how the Linux patch set
does it as an atomically-switched MSR while you've done it manually in
Xen because it's faster?

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 14:51                 ` Andrew Cooper
  2018-01-04 15:29                   ` Woodhouse, David
@ 2018-01-04 15:32                   ` Paolo Bonzini
  2018-01-04 16:25                     ` Andrea Arcangeli
  1 sibling, 1 reply; 107+ messages in thread
From: Paolo Bonzini @ 2018-01-04 15:32 UTC (permalink / raw)
  To: Andrew Cooper, Woodhouse, David, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

On 04/01/2018 15:51, Andrew Cooper wrote:
> Where have you got this idea from?  Using IBPB on every mode switch
> would be an insane overhead to take, and isn't necessary.

IIRC it started as a paranoia mode for AMD, but then we found out it was
actually faster than IBRS on some Intel processor where IBRS performance
was horrible.  But I don't remember the details of the performance
testing, sorry.

Paolo

> Also, remember that PTI and these mitigations are for orthogonal issues.
> 
> Perhaps it is easiest to refer directly to the Xen SP2 mitigations and
> my commentary of what is going on:
> http://xenbits.xen.org/gitweb/?p=people/andrewcoop/xen.git;a=blob;f=xen/arch/x86/spec_ctrl.c;h=79aedf774a390293dfd564ce978500085344e305;hb=refs/heads/sp2-mitigations-v6.5#l192
> 
> With the GCC -mindirect-branch=thunk-external support, and microcode,
> Xen will make a boot-time choice between using Retpoline, Lfence (which
> is the better AMD option, and more performant than retpoline), or IBRS
> on Skylake and newer processors where it is strictly necessary, as well
> as using IBPB whenever available.
> 
> It also supports virtualising IBRS for guest usage when the kernel has
> chosen not to use it; a configuration I haven't seen in any of the Linux
> patch series thusfar.
> 
> ~Andrew
> 

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 15:29                   ` Woodhouse, David
@ 2018-01-04 15:32                     ` Paolo Bonzini
  2018-01-04 15:37                       ` Andrew Cooper
  2018-01-04 16:15                     ` David Woodhouse
  2018-01-04 16:52                     ` Andrea Arcangeli
  2 siblings, 1 reply; 107+ messages in thread
From: Paolo Bonzini @ 2018-01-04 15:32 UTC (permalink / raw)
  To: Woodhouse, David, pavel, andrew.cooper3
  Cc: linux-kernel, tim.c.chen, torvalds, tglx, andi, aarcange, gnomes,
	dave.hansen, gregkh

On 04/01/2018 16:29, Woodhouse, David wrote:
> Adding that for KVM is in the Linux IBRS patch set that I've seen.
> Didn't we already have a conversation about how the Linux patch set
> does it as an atomically-switched MSR while you've done it manually in
> Xen because it's faster?

I'm also doing it manually in the RHEL versions of the KVM patches, for
what it's worth.

Paolo

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 15:32                     ` Paolo Bonzini
@ 2018-01-04 15:37                       ` Andrew Cooper
  0 siblings, 0 replies; 107+ messages in thread
From: Andrew Cooper @ 2018-01-04 15:37 UTC (permalink / raw)
  To: Paolo Bonzini, Woodhouse, David, pavel
  Cc: linux-kernel, tim.c.chen, torvalds, tglx, andi, aarcange, gnomes,
	dave.hansen, gregkh

On 04/01/18 15:32, Paolo Bonzini wrote:
> On 04/01/2018 16:29, Woodhouse, David wrote:
>> Adding that for KVM is in the Linux IBRS patch set that I've seen.
>> Didn't we already have a conversation about how the Linux patch set
>> does it as an atomically-switched MSR while you've done it manually in
>> Xen because it's faster?
> I'm also doing it manually in the RHEL versions of the KVM patches, for
> what it's worth.

Actually, I did it manually in Xen because I was expecting IBRS on AMD,
and there are no MSR load/save lists for PV or SVM guests.  (Also yes,
I've been reliably informed that manually is much faster than VT-x
load/save lists.)

~Andrew

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  8:15               ` Woodhouse, David
@ 2018-01-04 15:53                 ` Andi Kleen
  2018-01-04 15:55                   ` Woodhouse, David
  0 siblings, 1 reply; 107+ messages in thread
From: Andi Kleen @ 2018-01-04 15:53 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: tglx, andi, torvalds, linux-kernel, gregkh, gnomes, tim.c.chen,
	dave.hansen

> +.macro JMP_THUNK reg:req
> +#ifdef RETPOLINE
> +       ALTERNATIVE __stringify(jmp __x86.indirect_thunk.\reg), __stringify(jmp *%\reg), X86_FEATURE_IBRS_ATT
> +#else
> +       jmp *\reg
> +#endif
> +.endm

I remove that because what you're testing for doesn't exist in the tree yet. 

Yes it can be added later.

Right now we just want a basic static version to work reliably.

-Andi

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 15:53                 ` Andi Kleen
@ 2018-01-04 15:55                   ` Woodhouse, David
  0 siblings, 0 replies; 107+ messages in thread
From: Woodhouse, David @ 2018-01-04 15:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: tglx, torvalds, linux-kernel, gregkh, gnomes, tim.c.chen, dave.hansen

[-- Attachment #1: Type: text/plain, Size: 574 bytes --]

On Thu, 2018-01-04 at 07:53 -0800, Andi Kleen wrote:
> 
> I remove that because what you're testing for doesn't exist in the
> tree yet. 
> 
> Yes it can be added later.
> 
> Right now we just want a basic static version to work reliably.

But it doesn't. You can't protect VMs from each other, or userspace
processes from each other, without IBPB. And you really ought to be
setting IBRS before calling any runtime firmware APIs too.

So we really do need to add the microcode support on top right away,
even if we refactor the series to put retpoline first.

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 15:29                   ` Woodhouse, David
  2018-01-04 15:32                     ` Paolo Bonzini
@ 2018-01-04 16:15                     ` David Woodhouse
  2018-01-04 20:00                       ` Tom Lendacky
  2018-01-04 16:52                     ` Andrea Arcangeli
  2 siblings, 1 reply; 107+ messages in thread
From: David Woodhouse @ 2018-01-04 16:15 UTC (permalink / raw)
  To: Andrew Cooper, Paolo Bonzini, pavel, Tom Lendacky
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

[-- Attachment #1: Type: text/plain, Size: 1355 bytes --]

On Thu, 2018-01-04 at 15:29 +0000, Woodhouse, David wrote:
> 
> > With the GCC -mindirect-branch=thunk-external support, and microcode,
> > Xen will make a boot-time choice between using Retpoline, Lfence (which
> > is the better AMD option, and more performant than retpoline), or IBRS
> > on Skylake and newer processors where it is strictly necessary, as well
> > as using IBPB whenever available.
> 
> I need to pull in the AMD lfence alternative for retpoline, giving us a
> 3-way choice of the existing retpoline thunk, "lfence; jmp *%\reg", and
> a bare "jmp *%\reg".

I think I can abuse X86_FEATURE_SYSCALL for that, right? So it would
look something like this:

 --- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -12,7 +12,7 @@
 
 ENTRY(__x86.indirect_thunk.\reg)
        CFI_STARTPROC
-       ALTERNATIVE "call 2f", __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
+       ALTERNATIVE_2 "call 2f", __stringify(lfence;jmp *%\reg), X86_FEATURE_SYSCALL, __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
 1:
        lfence
        ASM_UNREACHABLE


However, I would very much like to see a categorical statement from AMD
that the lfence is sufficient in all cases. Remember, Intel were saying
that too for a while, before finding that it was not *quite* good
enough.

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5213 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 15:32                   ` Paolo Bonzini
@ 2018-01-04 16:25                     ` Andrea Arcangeli
  2018-01-04 17:04                       ` Alan Cox
  2018-01-04 17:13                       ` Dave Hansen
  0 siblings, 2 replies; 107+ messages in thread
From: Andrea Arcangeli @ 2018-01-04 16:25 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andrew Cooper, Woodhouse, David, pavel, tim.c.chen, linux-kernel,
	torvalds, tglx, andi, gnomes, dave.hansen, gregkh

Hello,

On Thu, Jan 04, 2018 at 04:32:01PM +0100, Paolo Bonzini wrote:
> On 04/01/2018 15:51, Andrew Cooper wrote:
> > Where have you got this idea from?  Using IBPB on every mode switch
> > would be an insane overhead to take, and isn't necessary.

It's only on kernel entry and vmexit.

> IIRC it started as a paranoia mode for AMD, but then we found out it was
> actually faster than IBRS on some Intel processor where IBRS performance
> was horrible.  But I don't remember the details of the performance
> testing, sorry.

Yes, it depends on the workload what is faster. ibrs 0 ibpb 2 is
possible to use on CPUs with SPEC_CTRL too in fact.

It's only where SPEC_CTRL is missing and only IBPB_SUPPORT is
available, that ibrs 0 ibpb 2 is the only option to fix variant#2 for
good.

If you run lots of syscalls ibrs 1 ibpb 1 is much faster. If you do
infrequent syscalls computing a lot in kernel like I/O with large
buffers getting copied, ibrs 0 ibpb 2 is much faster than ibrs 1 ibpb
1 (on those microcodes where ibrs 1 reduces performance a lot, not all
microcodes implementing SPEC_CTRL are inefficient like that).

If SPEC_CTRL is available ibrs 1 ibpb 1 should be preferred even if it
may not always be faster in every workload.

AMD website says
https://www.amd.com/en/corporate/speculative-execution

"Differences in AMD architecture mean there is a near zero risk of
exploitation of this variant."

ibrs 0 ibpb 2 brings the probability down to zero even when SPEC_CTRL
is missing and only IBPB_SUPPORT is available in microcode, if you
need that kind of piece of mind.

What exactly would be the point of shipping fixes for variant#2 if we
leave spectre variant#2 unfixed also in cases where we could have
fixed it?

The problem is, it's very unlikely, but if by accident somebody can
mount and setup such an attack, then spectre variant#2 becomes a
problem almost as bad as spectre variant#1 is and your hypervisor
guest/host isolation is fully compromised.

It's not up to us to decide if to leave something with "near zero
risk" unfixed by default, so for now we provided a fix that brings the
probability of such spectre variant#2 attack to zero whenever
possible so that such a spectre varaint#2 attack becomes impossible
(not just "near zero risk"").

Of course we made sure the performance comes back at runtime no matter
what after running this:

    echo 0 >/sys/kernel/debug/x86/ibpb_enabled
    echo 0 >/sys/kernel/debug/x86/ibrs_enabled

Or if you prefer at boot time with "noibrs noibpb". Not everyone
will necessarily care about that kind of variant#2 attacks of course.

NOTE: if those two tunables both read as 0 it means the fix for
variant#2 isn't activated by the running kernel and you need to
contact your CPU manufacturer for a microcode update providing
SPEC_CTRL or at least IBPB_SUPPORT (in the latter case the fix will
generally tend to perform worse and ibrs 0 ibpb 2 mode will
auto-engage).

For meltdown variant#3 same thing: if you want to disable the fix at
runtime because it's a guest kernel and it's running a single
microservice with a single app (similar to unikernel) or something
like that, you can with "nopti" or:

    echo 0 >/sys/kernel/debug/x86/pti_enabled

Same issue if it's a bare metal host and it's running a single app and
it doesn't store secure data in kernel space etc... There's always an
option to disable the fixes.

Only spectre variant#1 fix is always on, as there's no performance
overhead to it.

By default it boots in the most secure setting possible so that all
spectre variant#1 and variant2 and meltdown variant#3 are fixed.

Thanks,
Andrea

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 15:29                   ` Woodhouse, David
  2018-01-04 15:32                     ` Paolo Bonzini
  2018-01-04 16:15                     ` David Woodhouse
@ 2018-01-04 16:52                     ` Andrea Arcangeli
  2 siblings, 0 replies; 107+ messages in thread
From: Andrea Arcangeli @ 2018-01-04 16:52 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: pavel, pbonzini, andrew.cooper3, linux-kernel, tim.c.chen,
	torvalds, tglx, andi, gnomes, dave.hansen, gregkh

On Thu, Jan 04, 2018 at 03:29:37PM +0000, Woodhouse, David wrote:
> On Thu, 2018-01-04 at 14:51 +0000, Andrew Cooper wrote:
> > 
> > > * never turn off indirect branch prediction, but use a branch prediction
> > > barrier on every mode switch (needed for current AMD microcode)
> > 
> > Where have you got this idea from?  Using IBPB on every mode switch
> > would be an insane overhead to take, and isn't necessary.
> 
> AMD *only* has IBPB and not IBRS, but IIRC you don't need to do it on

AMD 0x10 0x12 0x16 basically have IBRS and no IBPB, those works
perfectly fine in ibrs 2 ibpb 1 mode, variant#2 fixed and zero
overhead.

> every context switch into the kernel; only when switching between
> VMs/processes?

Some AMD only has IBPB and no IBRS, then IBPB has to be called in
every enter kernel or vmexit to give the same security as ibrs 1 ibpb
1 (modulo SMT/HT but that's not the spectre PoC and you can rule that
out mathematically also by simply using cpu pinning as you already do
or disabling SMT if you care that much). Note ibrs 1 ibpb 1 also won't
cover HT effects of guest/user mode vs guest/user mode so cpu pinning
may be advisable anyway in your case (even with ibrs 1 ibpb 1 no
difference).

Of course everything can be trivially opted out at runtime and all
measurable performance restored, but by default it boots in the most
secure config available and it will make spectre variant#2 attack
impossible with only ibpb available.

> I need to pull in the AMD lfence alternative for retpoline, giving us a
> 3-way choice of the existing retpoline thunk, "lfence; jmp *%\reg", and
> a bare "jmp *%\reg".
> 
> Then the IBRS bits can be added on top.

"AMD lfence and reptoline" in the same sentence sounds like somebody
else also cares about spectre variant#2 on AMD. "Reptoline" only ever
makes sense in spectre variant#2 context so either ibrs 0 ibpb 2 mode
makes some sense too, or special lfence repotline for AMD should not
be worth mentioning in the first place.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 16:25                     ` Andrea Arcangeli
@ 2018-01-04 17:04                       ` Alan Cox
  2018-01-04 17:40                         ` Andrea Arcangeli
  2018-01-04 17:13                       ` Dave Hansen
  1 sibling, 1 reply; 107+ messages in thread
From: Alan Cox @ 2018-01-04 17:04 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Paolo Bonzini, Andrew Cooper, Woodhouse, David, pavel,
	tim.c.chen, linux-kernel, torvalds, tglx, andi, dave.hansen,
	gregkh

> If you run lots of syscalls ibrs 1 ibpb 1 is much faster. If you do
> infrequent syscalls computing a lot in kernel like I/O with large
> buffers getting copied, ibrs 0 ibpb 2 is much faster than ibrs 1 ibpb
> 1 (on those microcodes where ibrs 1 reduces performance a lot, not all
> microcodes implementing SPEC_CTRL are inefficient like that).

Have you looked at whether you can measure activity and switch
automatically between the two (or by task). It seems silly to leave
something the machine can accurately assess toa human ?

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 16:25                     ` Andrea Arcangeli
  2018-01-04 17:04                       ` Alan Cox
@ 2018-01-04 17:13                       ` Dave Hansen
  2018-01-04 17:15                         ` Paolo Bonzini
  1 sibling, 1 reply; 107+ messages in thread
From: Dave Hansen @ 2018-01-04 17:13 UTC (permalink / raw)
  To: Andrea Arcangeli, Paolo Bonzini
  Cc: Andrew Cooper, Woodhouse, David, pavel, tim.c.chen, linux-kernel,
	torvalds, tglx, andi, gnomes, gregkh

On 01/04/2018 08:25 AM, Andrea Arcangeli wrote:
> It's only where SPEC_CTRL is missing and only IBPB_SUPPORT is
> available, that ibrs 0 ibpb 2 is the only option to fix variant#2 for
> good.

Could you help us decode what "ibrs 0 ibpb 2" means to you?

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 17:13                       ` Dave Hansen
@ 2018-01-04 17:15                         ` Paolo Bonzini
  2018-01-04 18:05                           ` Andrea Arcangeli
  0 siblings, 1 reply; 107+ messages in thread
From: Paolo Bonzini @ 2018-01-04 17:15 UTC (permalink / raw)
  To: Dave Hansen, Andrea Arcangeli
  Cc: Andrew Cooper, Woodhouse, David, pavel, tim.c.chen, linux-kernel,
	torvalds, tglx, andi, gnomes, gregkh

On 04/01/2018 18:13, Dave Hansen wrote:
> On 01/04/2018 08:25 AM, Andrea Arcangeli wrote:
>> It's only where SPEC_CTRL is missing and only IBPB_SUPPORT is
>> available, that ibrs 0 ibpb 2 is the only option to fix variant#2 for
>> good.
> 
> Could you help us decode what "ibrs 0 ibpb 2" means to you?

IBRS 0 = disabled
IBRS 1 = only kernel sets IBRS=1
IBRS 2 = indirect branch prediction fully disabled, or do the right
thing on future processors

IBPB 0 = disabled
IBPB 1 = on context switch
IBPB 2 = on every kernel or hypervisor entry

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 17:04                       ` Alan Cox
@ 2018-01-04 17:40                         ` Andrea Arcangeli
  0 siblings, 0 replies; 107+ messages in thread
From: Andrea Arcangeli @ 2018-01-04 17:40 UTC (permalink / raw)
  To: Alan Cox
  Cc: Paolo Bonzini, Andrew Cooper, Woodhouse, David, pavel,
	tim.c.chen, linux-kernel, torvalds, tglx, andi, dave.hansen,
	gregkh

Hi Alan,

On Thu, Jan 04, 2018 at 05:04:42PM +0000, Alan Cox wrote:
> > If you run lots of syscalls ibrs 1 ibpb 1 is much faster. If you do
> > infrequent syscalls computing a lot in kernel like I/O with large
> > buffers getting copied, ibrs 0 ibpb 2 is much faster than ibrs 1 ibpb
> > 1 (on those microcodes where ibrs 1 reduces performance a lot, not all
> > microcodes implementing SPEC_CTRL are inefficient like that).
> 
> Have you looked at whether you can measure activity and switch
> automatically between the two (or by task). It seems silly to leave
> something the machine can accurately assess toa human ?

We didn't but it'd be definitely reasonable to investigate and it's a
good idea for those CPUs where the updated microcode has to shutdown
way more than just indirect branch prediction speculation to achieve
the ibrs 1 semantics.

If the workload changes from frequent syscalls to reasonably large
read/writes and less frequent syscalls or lots of interrupts in idle
CPUs, it would work well to switch between ibrs 1 ibpb 1 and ibpb 2
ibrs 0 automatically. As long as the pattern keeps repeating for a
while... that is the question ;).

Thanks!
Andrea

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 17:15                         ` Paolo Bonzini
@ 2018-01-04 18:05                           ` Andrea Arcangeli
  0 siblings, 0 replies; 107+ messages in thread
From: Andrea Arcangeli @ 2018-01-04 18:05 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Dave Hansen, Andrew Cooper, Woodhouse, David, pavel, tim.c.chen,
	linux-kernel, torvalds, tglx, andi, gnomes, gregkh

Hello,

On Thu, Jan 04, 2018 at 06:15:01PM +0100, Paolo Bonzini wrote:
> On 04/01/2018 18:13, Dave Hansen wrote:
> > On 01/04/2018 08:25 AM, Andrea Arcangeli wrote:
> >> It's only where SPEC_CTRL is missing and only IBPB_SUPPORT is
> >> available, that ibrs 0 ibpb 2 is the only option to fix variant#2 for
> >> good.
> > 
> > Could you help us decode what "ibrs 0 ibpb 2" means to you?
> 
> IBRS 0 = disabled
> IBRS 1 = only kernel sets IBRS=1
> IBRS 2 = indirect branch prediction fully disabled, or do the right
> thing on future processors
> 
> IBPB 0 = disabled
> IBPB 1 = on context switch
> IBPB 2 = on every kernel or hypervisor entry

Yes.

ibrs 0 ibpb 2 kernel entry and vmexit.

ibpb 2 if set, is forcing ibrs to 0 (it's sharing the same branch in
the kernel entry points and it wouldn't make sense anyway to enable
ibrs with ibpb 2).

ibrs 0 ibpb 2 is only ever activated if SPEC_CTRL is missing but
IBPB_SUPPORT is present and it does the same as stuff_RSB, imagine it
like a stuff_IBP where stuff_RSB is already called.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 14:20               ` Paolo Bonzini
  2018-01-04 14:51                 ` Andrew Cooper
  2018-01-04 14:55                 ` Woodhouse, David
@ 2018-01-04 18:24                 ` Pavel Machek
  2 siblings, 0 replies; 107+ messages in thread
From: Pavel Machek @ 2018-01-04 18:24 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Woodhouse, David, tim.c.chen, linux-kernel, torvalds, tglx, andi,
	gnomes, dave.hansen, gregkh, Andrea Arcangeli

[-- Attachment #1: Type: text/plain, Size: 726 bytes --]

Hi!

> ps: BTW^2 (and this is of course not about you, David) I'm disappointed
> that for "Spectre" there was no discussion between upstream developers,
> or between Linux vendors, or in fact hardly any discussion beyond "these
> are the patches".  I understand that (unlike PTI) there was no back
> story to cover up the actual vulnerability, but... grow up, folks.
> Seriously, "these are the patches" won't fly with either upstream or
> distros.

I still hope that the discussion will now.... Or is someone still
under impression that embargo is in place?
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 11:26   ` Pavel Machek
  2018-01-04 11:54     ` Alan Cox
@ 2018-01-04 18:33     ` Linus Torvalds
  2018-01-04 20:08       ` Jon Masters
  1 sibling, 1 reply; 107+ messages in thread
From: Linus Torvalds @ 2018-01-04 18:33 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Andi Kleen, tglx, Greg Kroah-Hartman, David Woodhouse, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen

On Thu, Jan 4, 2018 at 3:26 AM, Pavel Machek <pavel@ucw.cz> wrote:
> On Wed 2018-01-03 15:51:35, Linus Torvalds wrote:
>>
>> A *competent* CPU engineer would fix this by making sure speculation
>> doesn't happen across protection domains. Maybe even a L1 I$ that is
>> keyed by CPL.
>
> Would that be enough?

No, you'd need to add the CPL to the branch target buffer itself, not the I$ L1.

And as somebody pointed out, that only helps the user space messing
with the kernel. It doesn't help the "one user context fools another
user context to mispredict". (Where the user contexts might be a
JIT'ed JS vs the rest of the web browser).

So you really would want to just make sure the full address is used to
index (or at least verify) the BTB lookup, and even then you'd then
need to invalidate the BTB on context switches so that one context
can't fill in data for another context.

           Linus

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  8:20         ` Woodhouse, David
  2018-01-04 11:42           ` Pavel Machek
@ 2018-01-04 19:57           ` Jon Masters
  2018-01-05  0:41             ` Jon Masters
  1 sibling, 1 reply; 107+ messages in thread
From: Jon Masters @ 2018-01-04 19:57 UTC (permalink / raw)
  To: Woodhouse, David, Paolo Bonzini, Alan Cox
  Cc: Linus Torvalds, Andi Kleen, tglx, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

+ Jeff Law, Nick Clifton

On 01/04/2018 03:20 AM, Woodhouse, David wrote:
> On Thu, 2018-01-04 at 03:11 +0100, Paolo Bonzini wrote:
>> On 04/01/2018 02:59, Alan Cox wrote:
>>>> But then, exactly because the retpoline approach adds quite some cruft
>>>> and leaves something to be desired, why even bother?
>>>
>>> Performance
>>
>> Dunno.  If I care about mitigating this threat, I wouldn't stop at
>> retpolines even if the full solution has pretty bad performance (it's
>> roughly in the same ballpark as PTI).  But if I don't care, I wouldn't
>> want retpolines either, since they do introduce a small slowdown (10-20
>> cycles per indirect branch, meaning that after a thousand such papercuts
>> they become slower than the full solution).
>>
>> A couple manually written asm retpolines may be good as mitigation to
>> block the simplest PoCs (Linus may disagree), but patching the compiler,
>> getting alternatives right, etc. will take a while.  The only redeeming
>> grace of retpolines is that they don't require a microcode update, but
>> the microcode will be out there long before these patches are included
>> and trickle down to distros...  I just don't see the point in starting
>> from retpolines or drawing the line there.
> 
> No, really. The full mitigation with the microcode update and IBRS
> support is *slow*. Horribly slow.

It is horribly slow, though the story changes with CPU generation as
others noted (and what needs disabling in the microcode). We did various
analysis of the retpoline patches, including benchmarks, and we decided
that the fastest and safest approach for Tue^W yesterday was to use the
new MSRs. Especially in light of the corner cases we would need to
address for an empty RSB, etc. I'm adding Jeff Law because he and the
tools team have done analysis on this and he may have thoughts.

There's also a cross-architecture concern here in that different
solutions are needed across architectures. Retpolines are not endorsed
or recommended by every architecture vendor at this time. It's important
to make sure the necessary cross-vendor discussion happens now that it
can happen in the open.

Longer term, it'll be good to see BTBs tagged using the full address
space (including any address space IDs...) in future silicon.

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 16:15                     ` David Woodhouse
@ 2018-01-04 20:00                       ` Tom Lendacky
  2018-01-04 20:05                         ` David Woodhouse
  0 siblings, 1 reply; 107+ messages in thread
From: Tom Lendacky @ 2018-01-04 20:00 UTC (permalink / raw)
  To: David Woodhouse, Andrew Cooper, Paolo Bonzini, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli



On 1/4/2018 10:15 AM, David Woodhouse wrote:
> On Thu, 2018-01-04 at 15:29 +0000, Woodhouse, David wrote:
>>
>>> With the GCC -mindirect-branch=thunk-external support, and microcode,
>>> Xen will make a boot-time choice between using Retpoline, Lfence (which
>>> is the better AMD option, and more performant than retpoline), or IBRS
>>> on Skylake and newer processors where it is strictly necessary, as well
>>> as using IBPB whenever available.
>>
>> I need to pull in the AMD lfence alternative for retpoline, giving us a
>> 3-way choice of the existing retpoline thunk, "lfence; jmp *%\reg", and
>> a bare "jmp *%\reg".
> 
> I think I can abuse X86_FEATURE_SYSCALL for that, right? So it would
> look something like this:
> 
>  --- a/arch/x86/lib/retpoline.S
> +++ b/arch/x86/lib/retpoline.S
> @@ -12,7 +12,7 @@
>  
>  ENTRY(__x86.indirect_thunk.\reg)
>         CFI_STARTPROC
> -       ALTERNATIVE "call 2f", __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
> +       ALTERNATIVE_2 "call 2f", __stringify(lfence;jmp *%\reg), X86_FEATURE_SYSCALL, __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
>  1:
>         lfence
>         ASM_UNREACHABLE
> 
> 
> However, I would very much like to see a categorical statement from AMD
> that the lfence is sufficient in all cases. Remember, Intel were saying
> that too for a while, before finding that it was not *quite* good
> enough.

Yes, lfence is sufficient.  As long as the target is in the register
before the lfence and we jump through the register all is good, i.e.:

Include a dispatch serializing instruction after the load of an indirect
branch target. For instance, change this code:

1: jmp *[rax] ; jump to address pointed to by RAX

To this:

1: mov [rax], rax ; load target address
2: lfence ; dispatch serializing instruction
3: jmp *rax

The processor will stop dispatching instructions until all older
instructions have returned their results and are capable of being retired
by the processor. At this point the branch target will be in the general
purpose register (rax in this example) and available at dispatch for
execution such that the speculative execution window is not large enough
to be exploited.

Thanks,
Tom

> 

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 20:00                       ` Tom Lendacky
@ 2018-01-04 20:05                         ` David Woodhouse
  2018-01-04 23:47                           ` Tom Lendacky
  0 siblings, 1 reply; 107+ messages in thread
From: David Woodhouse @ 2018-01-04 20:05 UTC (permalink / raw)
  To: Tom Lendacky, Andrew Cooper, Paolo Bonzini, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

[-- Attachment #1: Type: text/plain, Size: 1193 bytes --]

On Thu, 2018-01-04 at 14:00 -0600, Tom Lendacky wrote:
> Yes, lfence is sufficient.  As long as the target is in the register
> before the lfence and we jump through the register all is good, i.e.:

Thanks. Can I have a Reviewed-by: for this then please:

http://git.infradead.org/users/dwmw2/linux-retpoline.git/commitdiff/08d9eda03

From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 4 Jan 2018 20:01:53 +0000
Subject: [PATCH] x86/retpoline: Simplify AMD variant of retpoline thunk

On AMD (which is X86_FEATURE_K8), just the lfence is sufficient.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/lib/retpoline.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index bbdda5cc136e..26070976bff0 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -11,7 +11,7 @@
 
 ENTRY(__x86.indirect_thunk.\reg)
 	CFI_STARTPROC
-	ALTERNATIVE "call 2f", __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
+	ALTERNATIVE_2 "call 2f", __stringify(lfence;jmp *%\reg), X86_FEATURE_K8, __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
 1:
 	lfence
 	jmp	1b
-- 
2.14.3

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5213 bytes --]

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 18:33     ` Linus Torvalds
@ 2018-01-04 20:08       ` Jon Masters
  0 siblings, 0 replies; 107+ messages in thread
From: Jon Masters @ 2018-01-04 20:08 UTC (permalink / raw)
  To: Linus Torvalds, Pavel Machek
  Cc: Andi Kleen, Thomas Gleixner, Greg Kroah-Hartman, David Woodhouse,
	Tim Chen, Linux Kernel Mailing List, Dave Hansen

On 01/04/2018 01:33 PM, Linus Torvalds wrote:
> On Thu, Jan 4, 2018 at 3:26 AM, Pavel Machek <pavel@ucw.cz> wrote:
>> On Wed 2018-01-03 15:51:35, Linus Torvalds wrote:
>>>
>>> A *competent* CPU engineer would fix this by making sure speculation
>>> doesn't happen across protection domains. Maybe even a L1 I$ that is
>>> keyed by CPL.
>>
>> Would that be enough?
> 
> No, you'd need to add the CPL to the branch target buffer itself, not the I$ L1.
> 
> And as somebody pointed out, that only helps the user space messing
> with the kernel. It doesn't help the "one user context fools another
> user context to mispredict". (Where the user contexts might be a
> JIT'ed JS vs the rest of the web browser).
> 
> So you really would want to just make sure the full address is used to
> index (or at least verify) the BTB lookup, and even then you'd then
> need to invalidate the BTB on context switches so that one context
> can't fill in data for another context.

IMO the correct hardware fix is to index the BTB using the full VA
including the ASID/PCID. And guarantee (as is the case) that there is
not a live conflict between address space identifiers with entries.

The sad thing is that even the latest academic courses recommend
"optimizing" branch predictors with a few low order bits (e.g. 31 in
Intel's case, various others for different vendors). The fix for variant
3 is similarly not that difficult in new hardware: don't allow the
speculated load to happen by enforcing the permission check at the right
time. The last several editions of Computer Architecture spell this out
in Appendix B (page 37 or thereabouts).

Jon.


-- 
Computer Architect | Sent from my Fedora powered laptop

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 20:05                         ` David Woodhouse
@ 2018-01-04 23:47                           ` Tom Lendacky
  2018-01-05  0:06                             ` Andrew Cooper
  2018-01-05  0:26                             ` Tom Lendacky
  0 siblings, 2 replies; 107+ messages in thread
From: Tom Lendacky @ 2018-01-04 23:47 UTC (permalink / raw)
  To: David Woodhouse, Andrew Cooper, Paolo Bonzini, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

On 1/4/2018 2:05 PM, David Woodhouse wrote:
> On Thu, 2018-01-04 at 14:00 -0600, Tom Lendacky wrote:
>> Yes, lfence is sufficient.  As long as the target is in the register
>> before the lfence and we jump through the register all is good, i.e.:
> 
> Thanks. Can I have a Reviewed-by: for this then please:

Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>

While this works, a more efficient way to do the lfence support would be
to not use the retpoline in this case.  Changing the indirect jumps to
do the "mov [rax], rax; lfence; jmp *rax" sequence would be quicker. I'm
not sure if this is feasible given the need to do a retpoline if you can't
use lfence, though.

Thanks,
Tom

> 
> http://git.infradead.org/users/dwmw2/linux-retpoline.git/commitdiff/08d9eda03
> 
> From: David Woodhouse <dwmw@amazon.co.uk>
> Date: Thu, 4 Jan 2018 20:01:53 +0000
> Subject: [PATCH] x86/retpoline: Simplify AMD variant of retpoline thunk
> 
> On AMD (which is X86_FEATURE_K8), just the lfence is sufficient.
> 
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
>  arch/x86/lib/retpoline.S | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
> index bbdda5cc136e..26070976bff0 100644
> --- a/arch/x86/lib/retpoline.S
> +++ b/arch/x86/lib/retpoline.S
> @@ -11,7 +11,7 @@
>  
>  ENTRY(__x86.indirect_thunk.\reg)
>  	CFI_STARTPROC
> -	ALTERNATIVE "call 2f", __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
> +	ALTERNATIVE_2 "call 2f", __stringify(lfence;jmp *%\reg), X86_FEATURE_K8, __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
>  1:
>  	lfence
>  	jmp	1b
> -- 
> 2.14.3
> 

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 23:47                           ` Tom Lendacky
@ 2018-01-05  0:06                             ` Andrew Cooper
  2018-01-05  0:26                             ` Tom Lendacky
  1 sibling, 0 replies; 107+ messages in thread
From: Andrew Cooper @ 2018-01-05  0:06 UTC (permalink / raw)
  To: Tom Lendacky, David Woodhouse, Paolo Bonzini, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

On 04/01/2018 23:47, Tom Lendacky wrote:
> On 1/4/2018 2:05 PM, David Woodhouse wrote:
>> On Thu, 2018-01-04 at 14:00 -0600, Tom Lendacky wrote:
>>> Yes, lfence is sufficient.  As long as the target is in the register
>>> before the lfence and we jump through the register all is good, i.e.:
>> Thanks. Can I have a Reviewed-by: for this then please:
> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
>
> While this works, a more efficient way to do the lfence support would be
> to not use the retpoline in this case.  Changing the indirect jumps to
> do the "mov [rax], rax; lfence; jmp *rax" sequence would be quicker. I'm
> not sure if this is feasible given the need to do a retpoline if you can't
> use lfence, though.

That would be most efficient for AMD, but it isn't compatible with
having a single binary which can mitigate itself most efficiently
wherever it was booted.  On most hardware, we'll want to dynamically
chose between repoline and lfence depending on vendor.

One option would be to teach GCC/Clang/Other to output alternative
patch-point data for indirect branches in the format Linux/Xen could
consume, and feed this into the alternatives framework.

The practical option to actually deploy in the timeframe is to use
__x86.indirect_thunk.%reg and alternate between repoline and lfence in
15 locations, which does add an unconditional call/jmp over the most
efficient alternative, but allows us to switch the thunk-in-use at boot
time.

~Andrew

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 23:47                           ` Tom Lendacky
  2018-01-05  0:06                             ` Andrew Cooper
@ 2018-01-05  0:26                             ` Tom Lendacky
  1 sibling, 0 replies; 107+ messages in thread
From: Tom Lendacky @ 2018-01-05  0:26 UTC (permalink / raw)
  To: David Woodhouse, Andrew Cooper, Paolo Bonzini, pavel
  Cc: tim.c.chen, linux-kernel, torvalds, tglx, andi, gnomes,
	dave.hansen, gregkh, Andrea Arcangeli

On 1/4/2018 5:47 PM, Tom Lendacky wrote:
> On 1/4/2018 2:05 PM, David Woodhouse wrote:
>> On Thu, 2018-01-04 at 14:00 -0600, Tom Lendacky wrote:
>>> Yes, lfence is sufficient.  As long as the target is in the register
>>> before the lfence and we jump through the register all is good, i.e.:
>>
>> Thanks. Can I have a Reviewed-by: for this then please:
> 
> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
> 
> While this works, a more efficient way to do the lfence support would be
> to not use the retpoline in this case.  Changing the indirect jumps to
> do the "mov [rax], rax; lfence; jmp *rax" sequence would be quicker. I'm
> not sure if this is feasible given the need to do a retpoline if you can't
> use lfence, though.
> 
> Thanks,
> Tom
> 

I do need to send the patches that make lfence a serializing instruction
for AMD.  I'll get those out as soon as I can.

Thanks,
Tom

>>
>> http://git.infradead.org/users/dwmw2/linux-retpoline.git/commitdiff/08d9eda03
>>
>> From: David Woodhouse <dwmw@amazon.co.uk>
>> Date: Thu, 4 Jan 2018 20:01:53 +0000
>> Subject: [PATCH] x86/retpoline: Simplify AMD variant of retpoline thunk
>>
>> On AMD (which is X86_FEATURE_K8), just the lfence is sufficient.
>>
>> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
>> ---
>>  arch/x86/lib/retpoline.S | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
>> index bbdda5cc136e..26070976bff0 100644
>> --- a/arch/x86/lib/retpoline.S
>> +++ b/arch/x86/lib/retpoline.S
>> @@ -11,7 +11,7 @@
>>  
>>  ENTRY(__x86.indirect_thunk.\reg)
>>  	CFI_STARTPROC
>> -	ALTERNATIVE "call 2f", __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
>> +	ALTERNATIVE_2 "call 2f", __stringify(lfence;jmp *%\reg), X86_FEATURE_K8, __stringify(jmp *%\reg), X86_BUG_NO_RETPOLINE
>>  1:
>>  	lfence
>>  	jmp	1b
>> -- 
>> 2.14.3
>>

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 19:57           ` Jon Masters
@ 2018-01-05  0:41             ` Jon Masters
  2018-01-05  0:54               ` Thomas Gleixner
  0 siblings, 1 reply; 107+ messages in thread
From: Jon Masters @ 2018-01-05  0:41 UTC (permalink / raw)
  To: Woodhouse, David, Paolo Bonzini, Alan Cox
  Cc: Linus Torvalds, Andi Kleen, tglx, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On 01/04/2018 02:57 PM, Jon Masters wrote:
> + Jeff Law, Nick Clifton
> 
> On 01/04/2018 03:20 AM, Woodhouse, David wrote:
>> On Thu, 2018-01-04 at 03:11 +0100, Paolo Bonzini wrote:
>>> On 04/01/2018 02:59, Alan Cox wrote:
>>>>> But then, exactly because the retpoline approach adds quite some cruft
>>>>> and leaves something to be desired, why even bother?
>>>>
>>>> Performance
>>>
>>> Dunno.  If I care about mitigating this threat, I wouldn't stop at
>>> retpolines even if the full solution has pretty bad performance (it's
>>> roughly in the same ballpark as PTI).  But if I don't care, I wouldn't
>>> want retpolines either, since they do introduce a small slowdown (10-20
>>> cycles per indirect branch, meaning that after a thousand such papercuts
>>> they become slower than the full solution).
>>>
>>> A couple manually written asm retpolines may be good as mitigation to
>>> block the simplest PoCs (Linus may disagree), but patching the compiler,
>>> getting alternatives right, etc. will take a while.  The only redeeming
>>> grace of retpolines is that they don't require a microcode update, but
>>> the microcode will be out there long before these patches are included
>>> and trickle down to distros...  I just don't see the point in starting
>>> from retpolines or drawing the line there.
>>
>> No, really. The full mitigation with the microcode update and IBRS
>> support is *slow*. Horribly slow.
> 
> It is horribly slow, though the story changes with CPU generation as
> others noted (and what needs disabling in the microcode). We did various
> analysis of the retpoline patches, including benchmarks, and we decided
> that the fastest and safest approach for Tue^W yesterday was to use the
> new MSRs. Especially in light of the corner cases we would need to
> address for an empty RSB, etc. I'm adding Jeff Law because he and the
> tools team have done analysis on this and he may have thoughts.
> 
> There's also a cross-architecture concern here in that different
> solutions are needed across architectures. Retpolines are not endorsed
> or recommended by every architecture vendor at this time. It's important
> to make sure the necessary cross-vendor discussion happens now that it
> can happen in the open.
> 
> Longer term, it'll be good to see BTBs tagged using the full address
> space (including any address space IDs...) in future silicon.

P.S. I've an internal document where I've been tracking "nice to haves"
for later, and one of them is whether it makes sense to tag binaries as
"trusted" (e.g. extended attribute, label, whatever). It was something I
wanted to bring up at some point as potentially worth considering.

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  0:41             ` Jon Masters
@ 2018-01-05  0:54               ` Thomas Gleixner
  2018-01-05  4:11                 ` Jon Masters
                                   ` (3 more replies)
  0 siblings, 4 replies; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-05  0:54 UTC (permalink / raw)
  To: Jon Masters
  Cc: Woodhouse, David, Paolo Bonzini, Alan Cox, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Thu, 4 Jan 2018, Jon Masters wrote:
> P.S. I've an internal document where I've been tracking "nice to haves"
> for later, and one of them is whether it makes sense to tag binaries as
> "trusted" (e.g. extended attribute, label, whatever). It was something I
> wanted to bring up at some point as potentially worth considering.

Scratch that. There is no such thing as a trusted binary.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  0:19         ` Jiri Kosina
@ 2018-01-05  2:01           ` james harvey
  2018-01-05 10:40             ` Woodhouse, David
  2018-01-05 12:06             ` Alan Cox
  0 siblings, 2 replies; 107+ messages in thread
From: james harvey @ 2018-01-05  2:01 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Andi Kleen, Thomas Gleixner, Linus Torvalds, Greg Kroah-Hartman,
	dwmw, Tim Chen, Linux Kernel Mailing List, Dave Hansen

On Wed, Jan 3, 2018 at 7:19 PM, Jiri Kosina <jikos@kernel.org> wrote:
> On Wed, 3 Jan 2018, Andi Kleen wrote:
>
>> > It should be a CPU_BUG bit as we have for the other mess. And that can be
>> > used for patching.
>>
>> It has to be done at compile time because it requires a compiler option.
>
> If gcc anotates indirect calls/jumps in a way that we could patch them
> using alternatives in runtime, that'd be enough.
>
> --
> Jiri Kosina
> SUSE Labs

I understand the GCC patches being discussed will fix the
vulnerability because newly compiled kernels will be compiled with a
GCC with these patches.

But, are the GCC patches being discussed also expected to fix the
vulnerability because user binaries will be compiled using them?  In
such case, a binary could be maliciously changed back, or a custom GCC
made with the patches reverted.

Please forgive me if my ignorance about all the related GCC patches
makes this a stupid question.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  0:54               ` Thomas Gleixner
@ 2018-01-05  4:11                 ` Jon Masters
  2018-01-05  9:59                   ` Thomas Gleixner
  2018-01-05  6:49                 ` Willy Tarreau
                                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 107+ messages in thread
From: Jon Masters @ 2018-01-05  4:11 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Woodhouse, David, Paolo Bonzini, Alan Cox, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On 01/04/2018 07:54 PM, Thomas Gleixner wrote:
> On Thu, 4 Jan 2018, Jon Masters wrote:
>> P.S. I've an internal document where I've been tracking "nice to haves"
>> for later, and one of them is whether it makes sense to tag binaries as
>> "trusted" (e.g. extended attribute, label, whatever). It was something I
>> wanted to bring up at some point as potentially worth considering.
> 
> Scratch that. There is no such thing as a trusted binary.

I agree with your sentiment, but for those mitigations that carry a
significant performance overhead (for example IBRS at the moment, and on
some other architectures where we might not end up with retpolines)
there /could/ be some value in leaving them on by default but allowing a
sysadmin to decide to trust a given application/container and accept the
risk. Sure, it's selectively weakened security, I get that. I am not
necessarily advocating this, just suggesting it be discussed.

[ I also totally get that you can extend variant 2 to have any
application that interacts with another abuse it (even over a pipe or a
socket, etc. provided they share the same cache and take untrusted data
that can lead to some kind of load within a speculation window), and
there are a ton of ways to still cause an attack in that case. ]

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  0:54               ` Thomas Gleixner
  2018-01-05  4:11                 ` Jon Masters
@ 2018-01-05  6:49                 ` Willy Tarreau
  2018-01-05  6:57                   ` Dave Hansen
  2018-01-05 12:12                 ` Alan Cox
       [not found]                 ` <CAL9bgJ8XNJgCtxR6+M+Vm9eDBVZ4Dyi_-Lt-Q1ei9N=TE2c6cg@mail.gmail.com>
  3 siblings, 1 reply; 107+ messages in thread
From: Willy Tarreau @ 2018-01-05  6:49 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Alan Cox,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Fri, Jan 05, 2018 at 01:54:13AM +0100, Thomas Gleixner wrote:
> On Thu, 4 Jan 2018, Jon Masters wrote:
> > P.S. I've an internal document where I've been tracking "nice to haves"
> > for later, and one of them is whether it makes sense to tag binaries as
> > "trusted" (e.g. extended attribute, label, whatever). It was something I
> > wanted to bring up at some point as potentially worth considering.
> 
> Scratch that. There is no such thing as a trusted binary.

I disagree with you on this Thomas. "trusted" means "we agree to share the
risk this binary takes because it's critical to our service". When you
build a load balancing appliance on which 100% of the service is assured
by a single executable and the rest is just config management, you'd better
trust that process. If the binary or process cannot be trusted, the product
is dead anyway. It doesn't mean the binary is safe. It just means that for
the product there's nothing worse than its compromission or failure. And
when it suffers from the performance impact of workarounds supposed to
protect the whole device against this process' possible abuses, you
easily see how the situation becomes ridiculous.

We need to still think about performance a lot. There's already an ongoing
trend of kernel bypass mechanisms in the wild for performance reasons, and
the new increase of syscall costs will necessarily amplify this willingness
to avoid the kernel. I personally don't want to see the kernel being reduced
to booting and executing SSH to manage the machines.

Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  6:49                 ` Willy Tarreau
@ 2018-01-05  6:57                   ` Dave Hansen
  2018-01-05  7:13                     ` Willy Tarreau
  0 siblings, 1 reply; 107+ messages in thread
From: Dave Hansen @ 2018-01-05  6:57 UTC (permalink / raw)
  To: Willy Tarreau, Thomas Gleixner
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Alan Cox,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Jeff Law, Nick Clifton

On 01/04/2018 10:49 PM, Willy Tarreau wrote:
> On Fri, Jan 05, 2018 at 01:54:13AM +0100, Thomas Gleixner wrote:
>> On Thu, 4 Jan 2018, Jon Masters wrote:
>>> P.S. I've an internal document where I've been tracking "nice to haves"
>>> for later, and one of them is whether it makes sense to tag binaries as
>>> "trusted" (e.g. extended attribute, label, whatever). It was something I
>>> wanted to bring up at some point as potentially worth considering.
>> Scratch that. There is no such thing as a trusted binary.
> I disagree with you on this Thomas. "trusted" means "we agree to share the
> risk this binary takes because it's critical to our service". When you
> build a load balancing appliance on which 100% of the service is assured
> by a single executable and the rest is just config management, you'd better
> trust that process.

So you want to run this "one binary" as fast as possible and without
mitigations in place?  But, you want mitigations *available* on that
system at the same time?  For what?  If there's only one binary, why not
just disable the mitigations entirely?

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  6:57                   ` Dave Hansen
@ 2018-01-05  7:13                     ` Willy Tarreau
  2018-01-07 14:14                       ` Borislav Petkov
  0 siblings, 1 reply; 107+ messages in thread
From: Willy Tarreau @ 2018-01-05  7:13 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Thomas Gleixner, Jon Masters, Woodhouse, David, Paolo Bonzini,
	Alan Cox, Linus Torvalds, Andi Kleen, Greg Kroah-Hartman,
	Tim Chen, Linux Kernel Mailing List, Jeff Law, Nick Clifton

On Thu, Jan 04, 2018 at 10:57:19PM -0800, Dave Hansen wrote:
> On 01/04/2018 10:49 PM, Willy Tarreau wrote:
> > On Fri, Jan 05, 2018 at 01:54:13AM +0100, Thomas Gleixner wrote:
> >> On Thu, 4 Jan 2018, Jon Masters wrote:
> >>> P.S. I've an internal document where I've been tracking "nice to haves"
> >>> for later, and one of them is whether it makes sense to tag binaries as
> >>> "trusted" (e.g. extended attribute, label, whatever). It was something I
> >>> wanted to bring up at some point as potentially worth considering.
> >> Scratch that. There is no such thing as a trusted binary.
> > I disagree with you on this Thomas. "trusted" means "we agree to share the
> > risk this binary takes because it's critical to our service". When you
> > build a load balancing appliance on which 100% of the service is assured
> > by a single executable and the rest is just config management, you'd better
> > trust that process.
> 
> So you want to run this "one binary" as fast as possible and without
> mitigations in place?  But, you want mitigations *available* on that
> system at the same time?  For what?  If there's only one binary, why not
> just disable the mitigations entirely?

I'm not fond of running the mitigations, but given that a few sysops can
connect to the machine to collect stats or counters, I think it would be
better to ensure these people can't happily play with the exploits to
dump stuff they shouldn't have access to. It's even easier to understand
on a database or key-value server for example, where you may expect the
highest performance the CPU can bring for a specific process and the rest
can be mitigated and will never ever notice any performance impact at all.

That's why I was saying in another thread that it would be nice over the
long term if we could 1) make the mitigation dynamic, and 2) make it
possible for an admin to disable it for certain processes/programs.

Don't get me wrong, I'm perfectly aware that it's far from being simple
and for now we need to get a reliable mitigation. I'm just saying that
the performance impact is a huge loss for certain use cases and that
once things settle down we should start to work on ways to recover what
was lost.

Regards,
Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  4:11                 ` Jon Masters
@ 2018-01-05  9:59                   ` Thomas Gleixner
  2018-01-08 10:28                     ` Andrea Arcangeli
  0 siblings, 1 reply; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-05  9:59 UTC (permalink / raw)
  To: Jon Masters
  Cc: Woodhouse, David, Paolo Bonzini, Alan Cox, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Thu, 4 Jan 2018, Jon Masters wrote:
> On 01/04/2018 07:54 PM, Thomas Gleixner wrote:
> > On Thu, 4 Jan 2018, Jon Masters wrote:
> >> P.S. I've an internal document where I've been tracking "nice to haves"
> >> for later, and one of them is whether it makes sense to tag binaries as
> >> "trusted" (e.g. extended attribute, label, whatever). It was something I
> >> wanted to bring up at some point as potentially worth considering.
> > 
> > Scratch that. There is no such thing as a trusted binary.
> 
> I agree with your sentiment, but for those mitigations that carry a
> significant performance overhead (for example IBRS at the moment, and on
> some other architectures where we might not end up with retpolines)
> there /could/ be some value in leaving them on by default but allowing a
> sysadmin to decide to trust a given application/container and accept the
> risk. Sure, it's selectively weakened security, I get that. I am not
> necessarily advocating this, just suggesting it be discussed.
> 
> [ I also totally get that you can extend variant 2 to have any
> application that interacts with another abuse it (even over a pipe or a
> socket, etc. provided they share the same cache and take untrusted data
> that can lead to some kind of load within a speculation window), and
> there are a ton of ways to still cause an attack in that case. ]

Correct.

We have neither the basic mitigations in place nor has anyone fully
understood the implications and possible further issues.

So can we please all sit back and fix the problems at hand in a sane way
before we start discussing things like selective trust or whatever?

I've seen the insanities which were crammed into the distro kernels, which
have sysctls and whatever, but at the same time these kernels shipped in a
haste do not even boot on a specific class of machines. Great engineering
work.

The thing which sits between the ears is not an acronyn for:

    Big Revenue All Intelligence Nuked

But it seems that in some ways it has been degraded to exactly that or do
you have a sane explanation why quite some of the chip vendors ignored the
textbooks from the 90es about speculative execution, which clearly say that
speculation has to stop on domain borders and permission violations.

We already lost a lot of precious time due to other even more disgusting
big corporate games and many of us haven't had a quite moment in the past
two month.

So can we please fix the stuff on the oldest and most important principle
of engineering "Correctness first" and then once that done think about ways
how to optimize that w/o digging yet another hole.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  2:01           ` james harvey
@ 2018-01-05 10:40             ` Woodhouse, David
  2018-01-05 12:29               ` james harvey
  2018-01-05 12:06             ` Alan Cox
  1 sibling, 1 reply; 107+ messages in thread
From: Woodhouse, David @ 2018-01-05 10:40 UTC (permalink / raw)
  To: jikos, elena.reshetova, jamespharvey20
  Cc: tglx, torvalds, gregkh, linux-kernel, andi, tim.c.chen, dave.hansen


[-- Attachment #1.1: Type: text/plain, Size: 1912 bytes --]

On Thu, 2018-01-04 at 21:01 -0500, james harvey wrote:
> 
> 
> I understand the GCC patches being discussed will fix the
> vulnerability because newly compiled kernels will be compiled with a
> GCC with these patches.

The GCC patches work by eliminating all indirect branches, thus
avoiding 'variant 2' of the three problems which have been discovered.

Note that we also need to eliminate all the indirect branches which
occurred in native assembler code too, and provide the 'thunks' that
GCC uses instead, which is why there's a series of kernel patches to go
with it.

But building a kernel this way is *only* sufficient to protect the
kernel. Attacks between userspace processes are still possible — you
need the updated microcode, with branch-predictor flushes/restrictions,
to protect existing userspace processes from each other.

> But, are the GCC patches being discussed also expected to fix the
> vulnerability because user binaries will be compiled using them? 

It would be possible to do that. Sensitive userspace processes could be
built this way, rendering them invulnerable to 'variant 2' attacks
without the kernel having to use the microcode features.

> In such case, a binary could be maliciously changed back, or a custom GCC
> made with the patches reverted.

If the attacker can replace the sensitive binary, or replace the
compiler with which the sensitive binary was compiled, then we have
other problems. I'm not going to lose sleep over that.


Note that *none* of this addresses 'variant 1'. There's a separate
patch series which addresses likely 'variant 1 gadgets' in the kernel,
which I haven't seen posted in public yet. And I'm not sure what we do
about that for userspace except extending the existing Coverity ruleset
and teaching GCC to emit barriers automatically in the right places,
which is a bit far-fetched right now. Elena?

[-- Attachment #1.2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

[-- Attachment #2.1: Type: text/plain, Size: 197 bytes --]




Amazon Web Services UK Limited. Registered in England and Wales with registration number 08650665 and which has its registered office at 60 Holborn Viaduct, London EC1A 2FD, United Kingdom.

[-- Attachment #2.2: Type: text/html, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  2:01           ` james harvey
  2018-01-05 10:40             ` Woodhouse, David
@ 2018-01-05 12:06             ` Alan Cox
  1 sibling, 0 replies; 107+ messages in thread
From: Alan Cox @ 2018-01-05 12:06 UTC (permalink / raw)
  To: james harvey
  Cc: Jiri Kosina, Andi Kleen, Thomas Gleixner, Linus Torvalds,
	Greg Kroah-Hartman, dwmw, Tim Chen, Linux Kernel Mailing List,
	Dave Hansen

> But, are the GCC patches being discussed also expected to fix the
> vulnerability because user binaries will be compiled using them?  In

If you have a system with just a few user binaries where you are
concerned about such a thing you might go that way.

> such case, a binary could be maliciously changed back, or a custom GCC
> made with the patches reverted.

If I can change your gcc or your binary then instead of removing the
speculation protection I can make it encrypt all your files instead. Much
simpler.

At the point I can do this you already lost.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  0:54               ` Thomas Gleixner
  2018-01-05  4:11                 ` Jon Masters
  2018-01-05  6:49                 ` Willy Tarreau
@ 2018-01-05 12:12                 ` Alan Cox
  2018-01-09  1:44                   ` Samir Bellabes
       [not found]                 ` <CAL9bgJ8XNJgCtxR6+M+Vm9eDBVZ4Dyi_-Lt-Q1ei9N=TE2c6cg@mail.gmail.com>
  3 siblings, 1 reply; 107+ messages in thread
From: Alan Cox @ 2018-01-05 12:12 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Fri, 5 Jan 2018 01:54:13 +0100 (CET)
Thomas Gleixner <tglx@linutronix.de> wrote:

> On Thu, 4 Jan 2018, Jon Masters wrote:
> > P.S. I've an internal document where I've been tracking "nice to haves"
> > for later, and one of them is whether it makes sense to tag binaries as
> > "trusted" (e.g. extended attribute, label, whatever). It was something I
> > wanted to bring up at some point as potentially worth considering.  
> 
> Scratch that. There is no such thing as a trusted binary.

There is if you are using signing and the like. I'm sure SELiux and
friends will grow the ability to set per process policy but that's
certainly not a priority.

However the question is wrong. 'trusted' is a binary operator not a unary
one.

The question that matters is

	If I am executing A and about to switch to B does B trust A

because if B trusts A (which in Linuxspeak is 'can A ptrace B') then
there's not much point worrying about protection between them because what
you are trying to prevent is already expressly permitted.

It's even more important if there is a cost to the barrier imposition
because not only can you skip it sometimes but your scheduler can
schedule considering that cost just as it does cache eviction costs.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05 10:40             ` Woodhouse, David
@ 2018-01-05 12:29               ` james harvey
  0 siblings, 0 replies; 107+ messages in thread
From: james harvey @ 2018-01-05 12:29 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: jikos, elena.reshetova, tglx, torvalds, gregkh, linux-kernel,
	andi, tim.c.chen, dave.hansen

On Fri, Jan 5, 2018 at 5:40 AM, Woodhouse, David <dwmw@amazon.co.uk> wrote:
> On Thu, 2018-01-04 at 21:01 -0500, james harvey wrote:
>>
>>
>> I understand the GCC patches being discussed will fix the
>> vulnerability because newly compiled kernels will be compiled with a
>> GCC with these patches.
>
> The GCC patches work by eliminating all indirect branches, thus
> avoiding 'variant 2' of the three problems which have been discovered.
>
> Note that we also need to eliminate all the indirect branches which
> occurred in native assembler code too, and provide the 'thunks' that
> GCC uses instead, which is why there's a series of kernel patches to go
> with it.
>
> But building a kernel this way is *only* sufficient to protect the
> kernel. Attacks between userspace processes are still possible — you
> need the updated microcode, with branch-predictor flushes/restrictions,
> to protect existing userspace processes from each other.
>
>> But, are the GCC patches being discussed also expected to fix the
>> vulnerability because user binaries will be compiled using them?
>
> It would be possible to do that. Sensitive userspace processes could be
> built this way, rendering them invulnerable to 'variant 2' attacks
> without the kernel having to use the microcode features.
>
>> In such case, a binary could be maliciously changed back, or a custom GCC
>> made with the patches reverted.
>
> If the attacker can replace the sensitive binary, or replace the
> compiler with which the sensitive binary was compiled, then we have
> other problems. I'm not going to lose sleep over that.
>
>
> Note that *none* of this addresses 'variant 1'. There's a separate
> patch series which addresses likely 'variant 1 gadgets' in the kernel,
> which I haven't seen posted in public yet. And I'm not sure what we do
> about that for userspace except extending the existing Coverity ruleset
> and teaching GCC to emit barriers automatically in the right places,
> which is a bit far-fetched right now. Elena?
> Amazon Web Services UK Limited. Registered in England and Wales with registration number 08650665 and which has its registered office at 60 Holborn Viaduct, London EC1A 2FD, United Kingdom.

I could have written more clearly.  What I'm getting at is if any of
the GCC patches are intended to prevent an exploit from being able to
be attempted, rather than making binaries immune from attacks.  So, I
don't mean being able to modify a sensitive binary or the system's
compiler.  I mean someone has non-root SSH access.  Compiles GCC with
whatever patches wind up for variant 1-3 reverted, or uses an old
version.  Leaves their custom compiled GCC in their user directory, to
compile malicious code.  Executes the compiled malicious code from
their user directory.  (Or, compiles a malicious program on their own
system compatible in architecture, kernel and library versions, using
GCC without new patches, and just copies over the binary to their user
directory to execute.)  Or, malicious customer with a VM on a shared
machine who has root access within their VM, reverting GCC patches
attempting to see into the host or other VM's on the machine.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Fwd: Avoid speculative indirect calls in kernel
       [not found]                 ` <CAL9bgJ8XNJgCtxR6+M+Vm9eDBVZ4Dyi_-Lt-Q1ei9N=TE2c6cg@mail.gmail.com>
@ 2018-01-07  5:04                   ` Kiernan Hager
  2018-01-07  6:39                     ` Willy Tarreau
  2018-01-07 14:01                     ` Alan Cox
  0 siblings, 2 replies; 107+ messages in thread
From: Kiernan Hager @ 2018-01-07  5:04 UTC (permalink / raw)
  To: linux-kernel

On Thu, Jan 4, 2018 at 5:54 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Thu, 4 Jan 2018, Jon Masters wrote:
>> P.S. I've an internal document where I've been tracking "nice to haves"
>> for later, and one of them is whether it makes sense to tag binaries as
>> "trusted" (e.g. extended attribute, label, whatever). It was something I
>> wanted to bring up at some point as potentially worth considering.
>
> Scratch that. There is no such thing as a trusted binary.
>

I disagree. When there are patches that slow execution down up to 30%,
I want to be able to mark a binary as "trusted" so that I can run it
without those patches if it is important. This is a boon to
configurability and helps lessen the significant performance impact of
these patches. Besides, anything run as root can already not only
read, but also write kernel memory and other processes' memory, so
it's not like this particular ability for processes trusted by the
user is anything new. This flag should probably only be settable by
root though, for obvious reasons.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Fwd: Avoid speculative indirect calls in kernel
  2018-01-07  5:04                   ` Fwd: " Kiernan Hager
@ 2018-01-07  6:39                     ` Willy Tarreau
  2018-01-07 14:01                     ` Alan Cox
  1 sibling, 0 replies; 107+ messages in thread
From: Willy Tarreau @ 2018-01-07  6:39 UTC (permalink / raw)
  To: Kiernan Hager; +Cc: linux-kernel

On Sat, Jan 06, 2018 at 10:04:26PM -0700, Kiernan Hager wrote:
> On Thu, Jan 4, 2018 at 5:54 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Thu, 4 Jan 2018, Jon Masters wrote:
> >> P.S. I've an internal document where I've been tracking "nice to haves"
> >> for later, and one of them is whether it makes sense to tag binaries as
> >> "trusted" (e.g. extended attribute, label, whatever). It was something I
> >> wanted to bring up at some point as potentially worth considering.
> >
> > Scratch that. There is no such thing as a trusted binary.
> >
> 
> I disagree. When there are patches that slow execution down up to 30%,
> I want to be able to mark a binary as "trusted" so that I can run it
> without those patches if it is important. This is a boon to
> configurability and helps lessen the significant performance impact of
> these patches. Besides, anything run as root can already not only
> read, but also write kernel memory and other processes' memory, so
> it's not like this particular ability for processes trusted by the
> user is anything new. This flag should probably only be settable by
> root though, for obvious reasons.

I think everyone agrees on this, but most developers are still very
busy trying to get all issues addressed first. We should simply start
to work in parallel on what could consistute the next steps without
polluting them.

BTW the performance loss can be even worse, I have a packet generator
here whose performance was divided by 4 in a VM :-) No tests yet on
bare metal (it's easier to reboot a VM).

Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07  5:04                   ` Fwd: " Kiernan Hager
  2018-01-07  6:39                     ` Willy Tarreau
@ 2018-01-07 14:01                     ` Alan Cox
  2018-01-07 17:47                       ` Willy Tarreau
  1 sibling, 1 reply; 107+ messages in thread
From: Alan Cox @ 2018-01-07 14:01 UTC (permalink / raw)
  To: Kiernan Hager; +Cc: linux-kernel

> I disagree. When there are patches that slow execution down up to 30%,
> I want to be able to mark a binary as "trusted" so that I can run it

It's not a binary that is trusted - it's a binary in a given use case.
You could easily have the same binary being run in two situations on the
same box at the same time and run just one of them 'trusted'.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  7:13                     ` Willy Tarreau
@ 2018-01-07 14:14                       ` Borislav Petkov
  2018-01-07 17:21                         ` David Lang
  2018-01-07 17:44                         ` Willy Tarreau
  0 siblings, 2 replies; 107+ messages in thread
From: Borislav Petkov @ 2018-01-07 14:14 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Dave Hansen, Thomas Gleixner, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

On Fri, Jan 05, 2018 at 08:13:33AM +0100, Willy Tarreau wrote:
> I'm not fond of running the mitigations, but given that a few sysops can
> connect to the machine to collect stats or counters, I think it would be
> better to ensure these people can't happily play with the exploits to
> dump stuff they shouldn't have access to.

So if someone exploits the "trusted" process, and then dumps all memory,
you have practically lost.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 14:14                       ` Borislav Petkov
@ 2018-01-07 17:21                         ` David Lang
  2018-01-07 18:49                           ` Borislav Petkov
  2018-01-07 17:44                         ` Willy Tarreau
  1 sibling, 1 reply; 107+ messages in thread
From: David Lang @ 2018-01-07 17:21 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Willy Tarreau, Dave Hansen, Thomas Gleixner, Jon Masters,
	Woodhouse, David, Paolo Bonzini, Alan Cox, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Jeff Law, Nick Clifton

The point is that in many cases, if someone explits the "trusted" process, they 
already have everything that the machine is able to do anyway.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 14:14                       ` Borislav Petkov
  2018-01-07 17:21                         ` David Lang
@ 2018-01-07 17:44                         ` Willy Tarreau
  2018-01-07 18:55                           ` Borislav Petkov
  1 sibling, 1 reply; 107+ messages in thread
From: Willy Tarreau @ 2018-01-07 17:44 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Hansen, Thomas Gleixner, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

On Sun, Jan 07, 2018 at 03:14:10PM +0100, Borislav Petkov wrote:
> On Fri, Jan 05, 2018 at 08:13:33AM +0100, Willy Tarreau wrote:
> > I'm not fond of running the mitigations, but given that a few sysops can
> > connect to the machine to collect stats or counters, I think it would be
> > better to ensure these people can't happily play with the exploits to
> > dump stuff they shouldn't have access to.
> 
> So if someone exploits the "trusted" process, and then dumps all memory,
> you have practically lost.

Exactly, but there's much more to gain by owning this process anyway in
certain cases than just dumping a few hundreds of kernel bytes.

That's where I consider that "trusted" is more "critical" than "safe" :
if it dies, we all die anyway. Just like you have to trust your plane's
pilot eventhough you don't know him personally.

Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 14:01                     ` Alan Cox
@ 2018-01-07 17:47                       ` Willy Tarreau
  2018-01-07 18:01                         ` Ivan Ivanov
  0 siblings, 1 reply; 107+ messages in thread
From: Willy Tarreau @ 2018-01-07 17:47 UTC (permalink / raw)
  To: Alan Cox; +Cc: Kiernan Hager, linux-kernel

On Sun, Jan 07, 2018 at 02:01:38PM +0000, Alan Cox wrote:
> > I disagree. When there are patches that slow execution down up to 30%,
> > I want to be able to mark a binary as "trusted" so that I can run it
> 
> It's not a binary that is trusted - it's a binary in a given use case.
> You could easily have the same binary being run in two situations on the
> same box at the same time and run just one of them 'trusted'.

That's what I like with the prctl approach. This can end up as a config
option in the application itself. At least I'd see it like this in
haproxy. Basically :
  - start it with enough privileges (always the case to warrant chroot()
    then setuid())

  - if config option "disable-kpti" is set, run prctl() to disable it.


It is sufficiently inconvenient to ensure that it's only done where
relevant and regardless of the executable itself (ie it should not be
an xattr on the FS for example).

Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 17:47                       ` Willy Tarreau
@ 2018-01-07 18:01                         ` Ivan Ivanov
  2018-01-07 18:16                           ` Woodhouse, David
  0 siblings, 1 reply; 107+ messages in thread
From: Ivan Ivanov @ 2018-01-07 18:01 UTC (permalink / raw)
  To: kah.listaddress, Dave Hansen, Thomas Gleixner, Jon Masters,
	Woodhouse, David, Paolo Bonzini, Alan Cox, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Jeff Law, Nick Clifton, bp

Make sure that your patches do not affect AMD CPU,
because they are unaffected by Meltdown vulnerability
for which this "30% slowdown Intel patch" is required

All your security patches regarding Meltdown should be like:
*) if its Intel, it is " cpu_insecure " ==> take a safe and slow route
*) if its AMD, it is " secure cpu " ==> take a normal route

AMD users should not suffer because of Intel screwups.
if Intel is responsible they should accept the CPU returns

Best regards
Ivan Ivanov,
coreboot developer and
open source enthusiast
<div id="DAB4FAD8-2DD7-40BB-A1B8-4E2AA1F9FDF2"><br /> <table
style="border-top: 1px solid #D3D4DE;">
	<tr>
      <td style="width: 55px; padding-top: 18px;"><a
href="https://www.avast.com/sig-email?utm_medium=email&utm_source=link&utm_campaign=sig-email&utm_content=webmail"
target="_blank"><img
src="https://ipmcdn.avast.com/images/icons/icon-envelope-tick-round-orange-animated-no-repeat-v1.gif"
alt="" width="46" height="29" style="width: 46px; height: 29px;"
/></a></td>
		<td style="width: 470px; padding-top: 17px; color: #41424e;
font-size: 13px; font-family: Arial, Helvetica, sans-serif;
line-height: 18px;">Без вирусов. <a
href="https://www.avast.com/sig-email?utm_medium=email&utm_source=link&utm_campaign=sig-email&utm_content=webmail"
target="_blank" style="color: #4453ea;">www.avast.ru</a> 		</td>
	</tr>
</table>
<a href="#DAB4FAD8-2DD7-40BB-A1B8-4E2AA1F9FDF2" width="1" height="1"></a></div>

2018-01-07 20:47 GMT+03:00 Willy Tarreau <w@1wt.eu>:
> On Sun, Jan 07, 2018 at 02:01:38PM +0000, Alan Cox wrote:
>> > I disagree. When there are patches that slow execution down up to 30%,
>> > I want to be able to mark a binary as "trusted" so that I can run it
>>
>> It's not a binary that is trusted - it's a binary in a given use case.
>> You could easily have the same binary being run in two situations on the
>> same box at the same time and run just one of them 'trusted'.
>
> That's what I like with the prctl approach. This can end up as a config
> option in the application itself. At least I'd see it like this in
> haproxy. Basically :
>   - start it with enough privileges (always the case to warrant chroot()
>     then setuid())
>
>   - if config option "disable-kpti" is set, run prctl() to disable it.
>
>
> It is sufficiently inconvenient to ensure that it's only done where
> relevant and regardless of the executable itself (ie it should not be
> an xattr on the FS for example).
>
> Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 18:01                         ` Ivan Ivanov
@ 2018-01-07 18:16                           ` Woodhouse, David
  0 siblings, 0 replies; 107+ messages in thread
From: Woodhouse, David @ 2018-01-07 18:16 UTC (permalink / raw)
  To: Ivan Ivanov, kah.listaddress, Dave Hansen, Thomas Gleixner,
	Jon Masters, Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton, bp

[-- Attachment #1: Type: text/plain, Size: 344 bytes --]

On Sun, 2018-01-07 at 21:01 +0300, Ivan Ivanov wrote:
> Make sure that your patches do not affect AMD CPU,
> because they are unaffected by Meltdown vulnerability
> for which this "30% slowdown Intel patch" is required

These patches *do* affect AMD CPUs, because they address one of the
issues for which AMD CPUs are also vulnerable.


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5210 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 17:21                         ` David Lang
@ 2018-01-07 18:49                           ` Borislav Petkov
  0 siblings, 0 replies; 107+ messages in thread
From: Borislav Petkov @ 2018-01-07 18:49 UTC (permalink / raw)
  To: David Lang
  Cc: Willy Tarreau, Dave Hansen, Thomas Gleixner, Jon Masters,
	Woodhouse, David, Paolo Bonzini, Alan Cox, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Jeff Law, Nick Clifton

On Sun, Jan 07, 2018 at 09:21:44AM -0800, David Lang wrote:
> The point is that in many cases, if someone explits the "trusted" process,
> they already have everything that the machine is able to do anyway.

...and then we don't need the per-process complication anyway.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 17:44                         ` Willy Tarreau
@ 2018-01-07 18:55                           ` Borislav Petkov
  2018-01-07 22:10                             ` Willy Tarreau
  0 siblings, 1 reply; 107+ messages in thread
From: Borislav Petkov @ 2018-01-07 18:55 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Dave Hansen, Thomas Gleixner, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

On Sun, Jan 07, 2018 at 06:44:51PM +0100, Willy Tarreau wrote:
> Exactly, but there's much more to gain by owning this process anyway in
> certain cases than just dumping a few hundreds of kernel bytes.

A few hundred? It is *all* machine bytes.

> That's where I consider that "trusted" is more "critical" than "safe" :
> if it dies, we all die anyway.

No, not die. Exploit it and since it is "trusted", use it to dump all
memory. All your memories belongs to us.

> Just like you have to trust your plane's pilot eventhough you don't
> know him personally.

Funny you should make that analogy. Remember that germanwings pilot?
People trusted him too.

Now imagine if the plane had protection against insane pilots... some of
those people might still be alive, who knows...

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 18:55                           ` Borislav Petkov
@ 2018-01-07 22:10                             ` Willy Tarreau
  2018-01-08  9:18                               ` Thomas Gleixner
  2018-01-08 16:22                               ` Borislav Petkov
  0 siblings, 2 replies; 107+ messages in thread
From: Willy Tarreau @ 2018-01-07 22:10 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Hansen, Thomas Gleixner, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

On Sun, Jan 07, 2018 at 07:55:11PM +0100, Borislav Petkov wrote:
> > Just like you have to trust your plane's pilot eventhough you don't
> > know him personally.
> 
> Funny you should make that analogy. Remember that germanwings pilot?
> People trusted him too.
> 
> Now imagine if the plane had protection against insane pilots... some of
> those people might still be alive, who knows...

Sure but despite this case many people continue to take the plane because
it's their only option to cross half of the world in a reasonable time.

Boris, I'm *not* contesting the performance resulting from the fixes,
and I would never have been able to produce them myself had I to, so
I'm really glad we have them. I just want to be clear that the big drop
some of us are facing is not an option *at all* for certain processes
in certain environments and that we'll either continue to run with
pti=off or with pti=on + a finer grained setting ASAP.

I mean, the kernel is not the only sensitive part in a system (and
sometimes it's even not at all). A kernel + a userland processes
deliver a service, each in it role. Breaking one or the other can be
similar or sometimes the trouble can be worse for one than the other.
But for some situations, the good work condition of the combination of
the two is critical, and even a kernel compromission could be a detail
compared to the impact of something crashing at full load. Sometimes a
userspace compromission would already be critical enough that the risk
is not higher by accepting to take it for the kernel as well.

In my specific case, on LB appliances, I don't really care what happens
once haproxy has already been compromised, it's too late. End of the
game, all sensitive information are already disclosed at this point.
What I'd rather avoid however is the occasional sysop who has an account
on the machine to retrieve some stats once in a while that would suddenly
be able to get more than these stats. That's where I draw the line for
*this* use case. Plenty of others will have plenty of other perception
and that's fine.

Cheers,
Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 22:10                             ` Willy Tarreau
@ 2018-01-08  9:18                               ` Thomas Gleixner
  2018-01-08  9:29                                 ` Willy Tarreau
  2018-01-08 16:22                               ` Borislav Petkov
  1 sibling, 1 reply; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-08  9:18 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Borislav Petkov, Dave Hansen, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

On Sun, 7 Jan 2018, Willy Tarreau wrote:
> On Sun, Jan 07, 2018 at 07:55:11PM +0100, Borislav Petkov wrote:
> > > Just like you have to trust your plane's pilot eventhough you don't
> > > know him personally.
> > 
> > Funny you should make that analogy. Remember that germanwings pilot?
> > People trusted him too.
> > 
> > Now imagine if the plane had protection against insane pilots... some of
> > those people might still be alive, who knows...
> 
> Sure but despite this case many people continue to take the plane because
> it's their only option to cross half of the world in a reasonable time.
> 
> Boris, I'm *not* contesting the performance resulting from the fixes,
> and I would never have been able to produce them myself had I to, so
> I'm really glad we have them. I just want to be clear that the big drop
> some of us are facing is not an option *at all* for certain processes
> in certain environments and that we'll either continue to run with
> pti=off or with pti=on + a finer grained setting ASAP.

No argument about that. We've looked into per process PTI very early and
decided not to go that route because of the time pressure and the risk. I'm
glad that we managed to pull it off at all without breaking the world
completely. It's surely doable and we all know that it has to be done, just
not right now as we have to fast track at least the basic protections for
the other two attack vectors.

You can be sure, that all people involved hate it more than you do.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-08  9:18                               ` Thomas Gleixner
@ 2018-01-08  9:29                                 ` Willy Tarreau
  0 siblings, 0 replies; 107+ messages in thread
From: Willy Tarreau @ 2018-01-08  9:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Borislav Petkov, Dave Hansen, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

Hi Thomas,

On Mon, Jan 08, 2018 at 10:18:09AM +0100, Thomas Gleixner wrote:
> On Sun, 7 Jan 2018, Willy Tarreau wrote:
> > On Sun, Jan 07, 2018 at 07:55:11PM +0100, Borislav Petkov wrote:
> > > > Just like you have to trust your plane's pilot eventhough you don't
> > > > know him personally.
> > > 
> > > Funny you should make that analogy. Remember that germanwings pilot?
> > > People trusted him too.
> > > 
> > > Now imagine if the plane had protection against insane pilots... some of
> > > those people might still be alive, who knows...
> > 
> > Sure but despite this case many people continue to take the plane because
> > it's their only option to cross half of the world in a reasonable time.
> > 
> > Boris, I'm *not* contesting the performance resulting from the fixes,
> > and I would never have been able to produce them myself had I to, so
> > I'm really glad we have them. I just want to be clear that the big drop
> > some of us are facing is not an option *at all* for certain processes
> > in certain environments and that we'll either continue to run with
> > pti=off or with pti=on + a finer grained setting ASAP.
> 
> No argument about that. We've looked into per process PTI very early and
> decided not to go that route because of the time pressure and the risk. I'm
> glad that we managed to pull it off at all without breaking the world
> completely. It's surely doable and we all know that it has to be done, just
> not right now as we have to fast track at least the basic protections for
> the other two attack vectors.

I know that most people with the skills to do it are very busy, which is
why I started to take a look at it, not being involved at all in this and
having interest in seeing it done. For me the road is long, progressively
discovering asid/pcid etc in the code, you can guess I won't come up with
something testable any time soon ;-)

My idea would be to use a privileged prctl() call to set a new TIF_NOPTI
on the task and to see where to check for this to avoid switching to the
user-only PGD when returning to userspace. I have no idea if this is
doable at all nor if this would be sufficient (I hope so) but reading
the code to try to figure whether it makes sense cannot hurt.

> You can be sure, that all people involved hate it more than you do.

I'm definitely convinced about this, we're all proud to save one CPU
cycle here and there from time to time and having to suddenly flush TLBs
and throw hundreds or thousands of cycles at once down the drain must be
a very hard decision to take. And by the way I don't hate what was done
because there's a config option and I still have the choice. Other OS
users probably don't even have this choice, so thanks to all involved
for this!

Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05  9:59                   ` Thomas Gleixner
@ 2018-01-08 10:28                     ` Andrea Arcangeli
  2018-01-08 20:42                       ` [tip:x86/pti] x86/tboot: Unbreak tboot with PTI enabled tip-bot for Dave Hansen
  2018-01-08 20:53                       ` Avoid speculative indirect calls in kernel Thomas Gleixner
  0 siblings, 2 replies; 107+ messages in thread
From: Andrea Arcangeli @ 2018-01-08 10:28 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Alan Cox,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Fri, Jan 05, 2018 at 10:59:28AM +0100, Thomas Gleixner wrote:
> I've seen the insanities which were crammed into the distro kernels, which
> have sysctls and whatever, but at the same time these kernels shipped in a

Debugfs tunables only, there are no sysctl, quoting Greg:

http://lkml.kernel.org/r/20180107082026.GA11510@kroah.com

"It's a debugfs api, it can be changed at any time, to be anything we
want, and all is fine :)"

> haste do not even boot on a specific class of machines. [..]

If you refer to the two efi_64.c and tboot.c corner case boot failures
found over the last weekend those affected upstream 4.15-rc 4.14.12
and all PTI branches in linux-tip too (perhaps less reproducible there
because of differences in old_memmap handling).

I sent you a better version of the efi_64.c fix from Jiri privately
and you still miss the tboot fix in linux-tip so you still got a boot
failure to fix there.

This is incremental with
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/log/?h=WIP.x86/pti
where the "Unbreak EFI old_memmap" fix is applied.

I respinned it after doing the more correct fix in this case too (same
as the efi_64.c improvement) while leaving the attribution to the fix
to Dave as he did the hard part.

>From 0c480d1eeabd56379144a4ed6b6fb24f3b84e40e Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Sat, 6 Jan 2018 18:41:14 +0100
Subject: [PATCH 1/1] x86/kaiser/efi: unbreak tboot

This is another case similar to what EFI does: create a new set of
page tables, map some code at a low address, and jump to it.  PTI
mistakes this low address for userspace and mistakenly marks it
non-executable in an effort to make it unusable for userspace.  Undo
the poison to allow execution.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ning Sun <ning.sun@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: tboot-devel@lists.sourceforge.net
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/kernel/tboot.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a4eb27918ceb..75869a4b6c41 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 	p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
 	if (!p4d)
 		return -1;
+	pgd->pgd &= ~_PAGE_NX;
 	pud = pud_alloc(&tboot_mm, p4d, vaddr);
 	if (!pud)
 		return -1;

If I can help and assist in any other way let me know.

Thanks,
Andrea

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-07 22:10                             ` Willy Tarreau
  2018-01-08  9:18                               ` Thomas Gleixner
@ 2018-01-08 16:22                               ` Borislav Petkov
  2018-01-08 16:53                                 ` Willy Tarreau
  1 sibling, 1 reply; 107+ messages in thread
From: Borislav Petkov @ 2018-01-08 16:22 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Dave Hansen, Thomas Gleixner, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

On Sun, Jan 07, 2018 at 11:10:38PM +0100, Willy Tarreau wrote:
>  I just want to be clear that the big drop some of us are facing is
> not an option *at all* for certain processes in certain environments
> and that we'll either continue to run with pti=off or with pti=on + a
> finer grained setting ASAP.

And that's all I'm saying: do pti=off in that case. The finer-grained
"solution" is just silly.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-08 16:22                               ` Borislav Petkov
@ 2018-01-08 16:53                                 ` Willy Tarreau
  0 siblings, 0 replies; 107+ messages in thread
From: Willy Tarreau @ 2018-01-08 16:53 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Hansen, Thomas Gleixner, Jon Masters, Woodhouse, David,
	Paolo Bonzini, Alan Cox, Linus Torvalds, Andi Kleen,
	Greg Kroah-Hartman, Tim Chen, Linux Kernel Mailing List,
	Jeff Law, Nick Clifton

On Mon, Jan 08, 2018 at 05:22:41PM +0100, Borislav Petkov wrote:
> On Sun, Jan 07, 2018 at 11:10:38PM +0100, Willy Tarreau wrote:
> >  I just want to be clear that the big drop some of us are facing is
> > not an option *at all* for certain processes in certain environments
> > and that we'll either continue to run with pti=off or with pti=on + a
> > finer grained setting ASAP.
> 
> And that's all I'm saying: do pti=off in that case. The finer-grained
> "solution" is just silly.

I disagree because I want that, as much as possible, occasional
unprivileged local users can't exploit it. pti=off gives them full
access. The finer-grained solution ensures that only a few processes
share the same risk as the kernel as they work together to deliver
the service. And that's what I've implemented in a patch series I
sent in another thread :-)

   https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1580131.html

Cheers,
Willy

^ permalink raw reply	[flat|nested] 107+ messages in thread

* [tip:x86/pti] x86/tboot: Unbreak tboot with PTI enabled
  2018-01-08 10:28                     ` Andrea Arcangeli
@ 2018-01-08 20:42                       ` tip-bot for Dave Hansen
  2018-01-08 20:53                       ` Avoid speculative indirect calls in kernel Thomas Gleixner
  1 sibling, 0 replies; 107+ messages in thread
From: tip-bot for Dave Hansen @ 2018-01-08 20:42 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: hpa, law, jcm, tim.c.chen, aarcange, dwmw, gnomes, pbonzini,
	dave.hansen, torvalds, nickc, andi, tglx, dave.hansen, gregkh,
	mingo, linux-kernel

Commit-ID:  262b6b30087246abf09d6275eb0c0dc421bcbe38
Gitweb:     https://git.kernel.org/tip/262b6b30087246abf09d6275eb0c0dc421bcbe38
Author:     Dave Hansen <dave.hansen@linux.intel.com>
AuthorDate: Sat, 6 Jan 2018 18:41:14 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 8 Jan 2018 17:29:18 +0100

x86/tboot: Unbreak tboot with PTI enabled

This is another case similar to what EFI does: create a new set of
page tables, map some code at a low address, and jump to it.  PTI
mistakes this low address for userspace and mistakenly marks it
non-executable in an effort to make it unusable for userspace.

Undo the poison to allow execution.

Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig")
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Jon Masters <jcm@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Jeff Law <law@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: David" <dwmw@amazon.co.uk>
Cc: Nick Clifton <nickc@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com
---
 arch/x86/kernel/tboot.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a4eb279..75869a4 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 	p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
 	if (!p4d)
 		return -1;
+	pgd->pgd &= ~_PAGE_NX;
 	pud = pud_alloc(&tboot_mm, p4d, vaddr);
 	if (!pud)
 		return -1;

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-08 10:28                     ` Andrea Arcangeli
  2018-01-08 20:42                       ` [tip:x86/pti] x86/tboot: Unbreak tboot with PTI enabled tip-bot for Dave Hansen
@ 2018-01-08 20:53                       ` Thomas Gleixner
  2018-01-08 21:32                         ` Andrea Arcangeli
  1 sibling, 1 reply; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-08 20:53 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Alan Cox,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Mon, 8 Jan 2018, Andrea Arcangeli wrote:
> On Fri, Jan 05, 2018 at 10:59:28AM +0100, Thomas Gleixner wrote:
> I sent you a better version of the efi_64.c fix from Jiri privately
> and you still miss the tboot fix in linux-tip so you still got a boot
> failure to fix there.

Missed that in the pile ...

> This is incremental with
> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/log/?h=WIP.x86/pti
> where the "Unbreak EFI old_memmap" fix is applied.
> 
> I respinned it after doing the more correct fix in this case too (same
> as the efi_64.c improvement) while leaving the attribution to the fix
> to Dave as he did the hard part.

Thanks for resending it.

> >From 0c480d1eeabd56379144a4ed6b6fb24f3b84e40e Mon Sep 17 00:00:00 2001
> From: Dave Hansen <dave.hansen@linux.intel.com>
> Date: Sat, 6 Jan 2018 18:41:14 +0100
> Subject: [PATCH 1/1] x86/kaiser/efi: unbreak tboot
> 
> This is another case similar to what EFI does: create a new set of
> page tables, map some code at a low address, and jump to it.  PTI
> mistakes this low address for userspace and mistakenly marks it
> non-executable in an effort to make it unusable for userspace.  Undo
> the poison to allow execution.
> 
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Ning Sun <ning.sun@intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Cc: tboot-devel@lists.sourceforge.net
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> ---
>  arch/x86/kernel/tboot.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
> index a4eb27918ceb..75869a4b6c41 100644
> --- a/arch/x86/kernel/tboot.c
> +++ b/arch/x86/kernel/tboot.c
> @@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
>  	p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
>  	if (!p4d)
>  		return -1;
> +	pgd->pgd &= ~_PAGE_NX;
>  	pud = pud_alloc(&tboot_mm, p4d, vaddr);
>  	if (!pud)
>  		return -1;
> 
> If I can help and assist in any other way let me know.
> 
> Thanks,
> Andrea
> 

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-08 20:53                       ` Avoid speculative indirect calls in kernel Thomas Gleixner
@ 2018-01-08 21:32                         ` Andrea Arcangeli
  2018-01-10  0:45                           ` Thomas Gleixner
  0 siblings, 1 reply; 107+ messages in thread
From: Andrea Arcangeli @ 2018-01-08 21:32 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Alan Cox,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Mon, Jan 08, 2018 at 09:53:02PM +0100, Thomas Gleixner wrote:
> Thanks for resending it.

Thanks to you for the PTI improvements!

Did my best to do the cleanest patch for tip, but I now figured Dave's
original comment was spot on: a _PAGE_NX clear then becomes necessary
also after pud_alloc not only after p4d_alloc.

pmd_alloc would run into the same with x86 32bit non-PAE too.

So there are two choices, either going back to one single _PAGE_NX
clear from the original Dave's original patch as below, or to add
multiple clear after each level which was my objective and is more
robust, but it may be overkill in this case. As long as it was one
line it looked a clear improvement.

Considering the caller in both cases is going to abort I guess we can
use the one liner approach as Dave and Jiri did originally.

It's up to you, doing it at each level would be more resilent in case
the caller is changed.

For the efi_64 same issue, the current tip patch will work better, but
it can still be cleaned up with pgd_efi instead of pgd_offset_k().

I got partly fooled because it worked great with 4levels, but it
wasn't ok anyway for 32bit non-PAE. Sometime it's the simpler stuff
that gets more subtle.

Andrea

>From 391517951e904cdd231dda9943c36a25a7bf01b9 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Sat, 6 Jan 2018 18:41:14 +0100
Subject: [PATCH 1/1] x86/kaiser/efi: unbreak tboot

This is another case similar to what EFI does: create a new set of
page tables, map some code at a low address, and jump to it.  PTI
mistakes this low address for userspace and mistakenly marks it
non-executable in an effort to make it unusable for userspace.  Undo
the poison to allow execution.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ning Sun <ning.sun@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: tboot-devel@lists.sourceforge.net
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/kernel/tboot.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a4eb27918ceb..a2486f444073 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 		return -1;
 	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
 	pte_unmap(pte);
+
+	/*
+	 * PTI poisons low addresses in the kernel page tables in the
+	 * name of making them unusable for userspace.  To execute
+	 * code at such a low address, the poison must be cleared.
+	 *
+	 * Note: 'pgd' actually gets set in p4d_alloc() _or_
+	 * pud_alloc() depending on 4/5-level paging.
+	 */
+	pgd->pgd &= ~_PAGE_NX;
+
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-05 12:12                 ` Alan Cox
@ 2018-01-09  1:44                   ` Samir Bellabes
  0 siblings, 0 replies; 107+ messages in thread
From: Samir Bellabes @ 2018-01-09  1:44 UTC (permalink / raw)
  To: Alan Cox
  Cc: Thomas Gleixner, Jon Masters, Woodhouse, David, Paolo Bonzini,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton,
	Willy Tarreau

Alan Cox <gnomes@lxorguk.ukuu.org.uk> writes:

> On Fri, 5 Jan 2018 01:54:13 +0100 (CET)
> Thomas Gleixner <tglx@linutronix.de> wrote:
>
>> On Thu, 4 Jan 2018, Jon Masters wrote:
>> > P.S. I've an internal document where I've been tracking "nice to haves"
>> > for later, and one of them is whether it makes sense to tag binaries as
>> > "trusted" (e.g. extended attribute, label, whatever). It was something I
>> > wanted to bring up at some point as potentially worth considering.  
>> 
>> Scratch that. There is no such thing as a trusted binary.
>
> There is if you are using signing and the like. I'm sure SELiux and
> friends will grow the ability to set per process policy but that's
> certainly not a priority.

There was a proposed security module providing such a per-process
policy. 

When a process want to execute a specific networking syscall regarding
specific "transport protocol", the security module catches the syscall
at the LSM hook, and ask user about the "verdict" (authorized or not ?) 

Verdicts are put inside "tickets" (it's a struct of information
regarding the autorization). Verdicts can have timeout or live
forever. They are managed by a hashtable.

The policy can be define by attaching tickets to process with a
userspace tool. Interface between userspace command tool and kernel is
using netlink protocol.

I managed to do the same on process and memory. memory access requires
process to delivery a available ticket. Sharing memory is like "process
A has a ticket required to access memory of process B"

Of course, direct assignation, throught asm code or operation like :
buffer[x] = y;
are impossible to catch at this level. It requires hooks at the asm
level.

As I understand, Willy needs to built such a took to classify "trusted"
binaries from others.

This is just the top of the iceberg, because, after starting to mark
process as "trusted" or not, there is a need of an architecture to track
such operations, evaluate incoherences, evaluate the convergence of such
classification, regarding thousands of binaries, in a lot of
contexts. This was the big part of the job.


last series I propose was years ago under the name :
[RFC,v3,00/10] snet: Security for NETwork syscalls

and particulary :
[RFC,v3,08/10] snet: introduce snet_ticket
http://patchwork.ozlabs.org/patch/93808/ 



thanks;
sam

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-08 21:32                         ` Andrea Arcangeli
@ 2018-01-10  0:45                           ` Thomas Gleixner
  2018-01-10  1:11                             ` Dave Hansen
  0 siblings, 1 reply; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-10  0:45 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Alan Cox,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton,
	Andy Lutomirski, Peter Zijlstra

On Mon, 8 Jan 2018, Andrea Arcangeli wrote:

> On Mon, Jan 08, 2018 at 09:53:02PM +0100, Thomas Gleixner wrote:
> > Thanks for resending it.
> 
> Thanks to you for the PTI improvements!
> 
> Did my best to do the cleanest patch for tip, but I now figured Dave's
> original comment was spot on: a _PAGE_NX clear then becomes necessary
> also after pud_alloc not only after p4d_alloc.
> 
> pmd_alloc would run into the same with x86 32bit non-PAE too.
> 
> So there are two choices, either going back to one single _PAGE_NX
> clear from the original Dave's original patch as below, or to add
> multiple clear after each level which was my objective and is more
> robust, but it may be overkill in this case. As long as it was one
> line it looked a clear improvement.
> 
> Considering the caller in both cases is going to abort I guess we can
> use the one liner approach as Dave and Jiri did originally.

Dave ?

> 
> It's up to you, doing it at each level would be more resilent in case
> the caller is changed.
> 
> For the efi_64 same issue, the current tip patch will work better, but
> it can still be cleaned up with pgd_efi instead of pgd_offset_k().
> 
> I got partly fooled because it worked great with 4levels, but it
> wasn't ok anyway for 32bit non-PAE. Sometime it's the simpler stuff
> that gets more subtle.
> 
> Andrea
> 
> >From 391517951e904cdd231dda9943c36a25a7bf01b9 Mon Sep 17 00:00:00 2001
> From: Dave Hansen <dave.hansen@linux.intel.com>
> Date: Sat, 6 Jan 2018 18:41:14 +0100
> Subject: [PATCH 1/1] x86/kaiser/efi: unbreak tboot
> 
> This is another case similar to what EFI does: create a new set of
> page tables, map some code at a low address, and jump to it.  PTI
> mistakes this low address for userspace and mistakenly marks it
> non-executable in an effort to make it unusable for userspace.  Undo
> the poison to allow execution.
> 
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Ning Sun <ning.sun@intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Cc: tboot-devel@lists.sourceforge.net
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> ---
>  arch/x86/kernel/tboot.c | 11 +++++++++++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
> index a4eb27918ceb..a2486f444073 100644
> --- a/arch/x86/kernel/tboot.c
> +++ b/arch/x86/kernel/tboot.c
> @@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
>  		return -1;
>  	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
>  	pte_unmap(pte);
> +
> +	/*
> +	 * PTI poisons low addresses in the kernel page tables in the
> +	 * name of making them unusable for userspace.  To execute
> +	 * code at such a low address, the poison must be cleared.
> +	 *
> +	 * Note: 'pgd' actually gets set in p4d_alloc() _or_
> +	 * pud_alloc() depending on 4/5-level paging.
> +	 */
> +	pgd->pgd &= ~_PAGE_NX;
> +
>  	return 0;
>  }
>  
> 

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-10  0:45                           ` Thomas Gleixner
@ 2018-01-10  1:11                             ` Dave Hansen
  2018-01-10 16:02                               ` Thomas Gleixner
  0 siblings, 1 reply; 107+ messages in thread
From: Dave Hansen @ 2018-01-10  1:11 UTC (permalink / raw)
  To: Thomas Gleixner, Andrea Arcangeli
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Alan Cox,
	Linus Torvalds, Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Jeff Law, Nick Clifton,
	Andy Lutomirski, Peter Zijlstra

[-- Attachment #1: Type: text/plain, Size: 1457 bytes --]

On 01/09/2018 04:45 PM, Thomas Gleixner wrote:
> On Mon, 8 Jan 2018, Andrea Arcangeli wrote:
>> On Mon, Jan 08, 2018 at 09:53:02PM +0100, Thomas Gleixner wrote:
>> Did my best to do the cleanest patch for tip, but I now figured Dave's
>> original comment was spot on: a _PAGE_NX clear then becomes necessary
>> also after pud_alloc not only after p4d_alloc.
>>
>> pmd_alloc would run into the same with x86 32bit non-PAE too.

non-PAE doesn't have an NX bit. :)

But we #define _PAGE_NX down to 0 there so it's harmless.

>> So there are two choices, either going back to one single _PAGE_NX
>> clear from the original Dave's original patch as below, or to add
>> multiple clear after each level which was my objective and is more
>> robust, but it may be overkill in this case. As long as it was one
>> line it looked a clear improvement.
>>
>> Considering the caller in both cases is going to abort I guess we can
>> use the one liner approach as Dave and Jiri did originally.
> 
> Dave ?

I agree with Andrea.  The patch in -tip potentially misses the pgd
clearing if pud_alloc() sets a PGD.  It would also be nice to have that
comment back.

Note that the -tip commit probably works in *practice* because for two
adjacent calls to map_tboot_page() that share a PGD entry, the first
will clear NX, *then* allocate and set the PGD (without NX clear).  The
second call will *not* allocate but will clear the NX bit.

The patch I think we want is attached.

[-- Attachment #2: pti-tboot-fix.patch --]
[-- Type: text/x-patch, Size: 1449 bytes --]


From: Dave Hansen <dave.hansen@linux.intel.com>

This is another case similar to what EFI does: create a new set of
page tables, map some code at a low address, and jump to it.  PTI
mistakes this low address for userspace and mistakenly marks it
non-executable in an effort to make it unusable for userspace.  Undo
the poison to allow execution.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ning Sun <ning.sun@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: tboot-devel@lists.sourceforge.net
Cc: linux-kernel@vger.kernel.org
---

 b/arch/x86/kernel/tboot.c |   11 +++++++++++
 1 file changed, 11 insertions(+)

diff -puN arch/x86/kernel/tboot.c~pti-tboot-fix arch/x86/kernel/tboot.c
--- a/arch/x86/kernel/tboot.c~pti-tboot-fix	2018-01-05 21:50:55.755554960 -0800
+++ b/arch/x86/kernel/tboot.c	2018-01-05 23:51:41.368536890 -0800
@@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long
 		return -1;
 	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
 	pte_unmap(pte);
+
+	/*
+	 * PTI poisons low addresses in the kernel page tables in the
+	 * name of making them unusable for userspace.  To execute
+	 * code at such a low address, the poison must be cleared.
+	 *
+	 * Note: 'pgd' actually gets set in p4d_alloc() _or_
+	 * pud_alloc() depending on 4/5-level paging.
+	 */
+	pgd->pgd &= ~_PAGE_NX;
+
 	return 0;
 }
 
_

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-10  1:11                             ` Dave Hansen
@ 2018-01-10 16:02                               ` Thomas Gleixner
  0 siblings, 0 replies; 107+ messages in thread
From: Thomas Gleixner @ 2018-01-10 16:02 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andrea Arcangeli, Jon Masters, Woodhouse, David, Paolo Bonzini,
	Alan Cox, Linus Torvalds, Andi Kleen, Greg Kroah-Hartman,
	Tim Chen, Linux Kernel Mailing List, Jeff Law, Nick Clifton,
	Andy Lutomirski, Peter Zijlstra

On Tue, 9 Jan 2018, Dave Hansen wrote:
> On 01/09/2018 04:45 PM, Thomas Gleixner wrote:
> > On Mon, 8 Jan 2018, Andrea Arcangeli wrote:
> >> On Mon, Jan 08, 2018 at 09:53:02PM +0100, Thomas Gleixner wrote:
> >> Did my best to do the cleanest patch for tip, but I now figured Dave's
> >> original comment was spot on: a _PAGE_NX clear then becomes necessary
> >> also after pud_alloc not only after p4d_alloc.
> >>
> >> pmd_alloc would run into the same with x86 32bit non-PAE too.
> 
> non-PAE doesn't have an NX bit. :)
> 
> But we #define _PAGE_NX down to 0 there so it's harmless.
> 
> >> So there are two choices, either going back to one single _PAGE_NX
> >> clear from the original Dave's original patch as below, or to add
> >> multiple clear after each level which was my objective and is more
> >> robust, but it may be overkill in this case. As long as it was one
> >> line it looked a clear improvement.
> >>
> >> Considering the caller in both cases is going to abort I guess we can
> >> use the one liner approach as Dave and Jiri did originally.
> > 
> > Dave ?
> 
> I agree with Andrea.  The patch in -tip potentially misses the pgd
> clearing if pud_alloc() sets a PGD.  It would also be nice to have that
> comment back.
> 
> Note that the -tip commit probably works in *practice* because for two
> adjacent calls to map_tboot_page() that share a PGD entry, the first
> will clear NX, *then* allocate and set the PGD (without NX clear).  The
> second call will *not* allocate but will clear the NX bit.
> 
> The patch I think we want is attached.

Color me confused. I have queued the one below in tip. It lacks the comment
and does the !NX at a different place.

Thanks,

	tglx

8<-----------------	

commit 262b6b30087246abf09d6275eb0c0dc421bcbe38
Author: Dave Hansen <dave.hansen@linux.intel.com>
Date:   Sat Jan 6 18:41:14 2018 +0100

    x86/tboot: Unbreak tboot with PTI enabled
    
    This is another case similar to what EFI does: create a new set of
    page tables, map some code at a low address, and jump to it.  PTI
    mistakes this low address for userspace and mistakenly marks it
    non-executable in an effort to make it unusable for userspace.
    
    Undo the poison to allow execution.
    
    Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig")
    Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
    Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
    Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
    Cc: Tim Chen <tim.c.chen@linux.intel.com>
    Cc: Jon Masters <jcm@redhat.com>
    Cc: Dave Hansen <dave.hansen@intel.com>
    Cc: Andi Kleen <andi@firstfloor.org>
    Cc: Jeff Law <law@redhat.com>
    Cc: Paolo Bonzini <pbonzini@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
    Cc: David" <dwmw@amazon.co.uk>
    Cc: Nick Clifton <nickc@redhat.com>
    Cc: stable@vger.kernel.org
    Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a4eb27918ceb..75869a4b6c41 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 	p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
 	if (!p4d)
 		return -1;
+	pgd->pgd &= ~_PAGE_NX;
 	pud = pud_alloc(&tboot_mm, p4d, vaddr);
 	if (!pud)
 		return -1;

^ permalink raw reply related	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
@ 2018-02-23 21:10 Ywe Cærlyn
  0 siblings, 0 replies; 107+ messages in thread
From: Ywe Cærlyn @ 2018-02-23 21:10 UTC (permalink / raw)
  To: LKML

Patchmeister Torvalds:

"Or is Intel basically saying "we are committed to selling you shit
forever and ever, and never fixing anything"?"

Back in Celeron days, Intel was popular because you could clock the 
lesser cached Celeron 300mhz to ~500mhz.

Everybody knew then not to get anything pricier. But still Intel sells 
Xeons at 3x the price, for little noticable gain?

Basically I did research on philosophy, and indeed the mainconcept of a 
culture is what determines a cultures behaviour. Even the internal 
design of the cpu seems to be inspired by "God".

I have tried the zén-realized version "Zün", instead, God of absolute 
reality. Because regressions in philosophy, is regressions in computing, 
is ultimately Bill Gates talking about fecal water.

-- 
Fredelige hilsener,
Ywe Cærlyn,

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
@ 2018-01-12  8:20 Dr. Greg Wettstein
  0 siblings, 0 replies; 107+ messages in thread
From: Dr. Greg Wettstein @ 2018-01-12  8:20 UTC (permalink / raw)
  To: Alan Cox, Thomas Gleixner
  Cc: Jon Masters, Woodhouse, David, Paolo Bonzini, Linus Torvalds,
	Andi Kleen, Greg Kroah-Hartman, Tim Chen,
	Linux Kernel Mailing List, Dave Hansen, Jeff Law, Nick Clifton

On Jan 5, 12:12pm, Alan Cox wrote:
} Subject: Re: Avoid speculative indirect calls in kernel

Good morning to everyone, a bit behind on mail given everything which
has been going on.

> On Fri, 5 Jan 2018 01:54:13 +0100 (CET)
> Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > On Thu, 4 Jan 2018, Jon Masters wrote:
> > > P.S. I've an internal document where I've been tracking "nice to haves"
> > > for later, and one of them is whether it makes sense to tag binaries as
> > > "trusted" (e.g. extended attribute, label, whatever). It was something I
> > > wanted to bring up at some point as potentially worth considering.  
> > 
> > Scratch that. There is no such thing as a trusted binary.

> There is if you are using signing and the like. I'm sure SELinux and
> friends will grow the ability to set per process policy but that's
> certainly not a priority.
>
> However the question is wrong. 'trusted' is a binary operator not a
> unary one.

Alan's observations are correct.

In our autonomous introspection work we apply the notion that
'trusted' is a binary characteristic of a context of execution (COE).
Its value is an expression of whether or not the information exchange
events it has been involved in have deviated from the desired
execution trajectory path of the system.

It is a decidedly different way of thinking about things.  Most
importantly it is a namespaceable characteristic.

We have already written the futuristic LSM that Alan aludes to in
order to implement per COE security policies and forensics for
actors/COE's that have gone over to the 'dark side'.

> Alan

Have a good weekend.

Dr. Greg

}-- End of excerpt from Alan Cox

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Given a choice between a complex, difficult-to-understand,
 disconcerting explanation and a simplistic, comforting one, many
 prefer simplistic comfort if it's remotely plausible, especially if it
 involves blaming someone else for their problems."
                                -- Bob Lewis
                                   _Infoworld_

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 12:09   ` Alan Cox
@ 2018-01-04 13:32     ` Pavel Machek
  0 siblings, 0 replies; 107+ messages in thread
From: Pavel Machek @ 2018-01-04 13:32 UTC (permalink / raw)
  To: Alan Cox; +Cc: Andi Kleen, tglx, torvalds, gregkh, linux-kernel, tim.c.chen

[-- Attachment #1: Type: text/plain, Size: 2020 bytes --]

On Thu 2018-01-04 12:09:14, Alan Cox wrote:
> On Thu, 4 Jan 2018 12:49:17 +0100
> Pavel Machek <pavel@ucw.cz> wrote:
> 
> > Hi!
> > 
> > > This is a fix for Variant 2 in 
> > > https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
> > > 
> > > Any speculative indirect calls in the kernel can be tricked 
> > > to execute any kernel code, which may allow side channel
> > > attacks that can leak arbitrary kernel data.  
> > 
> > Ok.
> > 
> > > So we want to avoid speculative indirect calls in the kernel.
> > > 
> > > There's a special code sequence called a retpoline that can
> > > do indirect calls without speculation. We use a new compiler
> > > option -mindirect-branch=thunk-extern (gcc patch will be released
> > > separately) to recompile the kernel with this new sequence.  
> > 
> > So... this "retpoline" code is quite tricky; I guess it does the right
> > on recent Intel CPUs. Does it also do the right thing on all the AMD,
> > Cyrix, ... variants?
> > 
> > Is it neccessary on all the CPUs? I guess 486 does not need this?
> 
> It's architecturally valid x86 code so it should work on any x86-32
> processor.
> 
> You are correct older processors and some of the newer ones don't need
> it. AMD and VIA I don't know about but for Intel we can probably avoid it
> on older Atom, on Quark, and the really old CPUs nobody actually runs any
> more.

Ok. I guess I'd like to see comment in file explaining that.

> That's all an optimization once we have correctness.

Well, correctness first sounds as a good idea. OTOH for Spectre
problem, seems like a fix would be microcode update to flush branch
predictor caches on priviledge and context switches. That should make
it impractical to exploit for all the systems, not just latest Linux,
compiled by lastest Gcc, right?
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04 11:49 ` Pavel Machek
@ 2018-01-04 12:09   ` Alan Cox
  2018-01-04 13:32     ` Pavel Machek
  0 siblings, 1 reply; 107+ messages in thread
From: Alan Cox @ 2018-01-04 12:09 UTC (permalink / raw)
  To: Pavel Machek; +Cc: Andi Kleen, tglx, torvalds, gregkh, linux-kernel, tim.c.chen

On Thu, 4 Jan 2018 12:49:17 +0100
Pavel Machek <pavel@ucw.cz> wrote:

> Hi!
> 
> > This is a fix for Variant 2 in 
> > https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
> > 
> > Any speculative indirect calls in the kernel can be tricked 
> > to execute any kernel code, which may allow side channel
> > attacks that can leak arbitrary kernel data.  
> 
> Ok.
> 
> > So we want to avoid speculative indirect calls in the kernel.
> > 
> > There's a special code sequence called a retpoline that can
> > do indirect calls without speculation. We use a new compiler
> > option -mindirect-branch=thunk-extern (gcc patch will be released
> > separately) to recompile the kernel with this new sequence.  
> 
> So... this "retpoline" code is quite tricky; I guess it does the right
> on recent Intel CPUs. Does it also do the right thing on all the AMD,
> Cyrix, ... variants?
> 
> Is it neccessary on all the CPUs? I guess 486 does not need this?

It's architecturally valid x86 code so it should work on any x86-32
processor.

You are correct older processors and some of the newer ones don't need
it. AMD and VIA I don't know about but for Intel we can probably avoid it
on older Atom, on Quark, and the really old CPUs nobody actually runs any
more.

That's all an optimization once we have correctness.

Alan

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Re: Avoid speculative indirect calls in kernel
  2018-01-04  2:00 Andi Kleen
@ 2018-01-04 11:49 ` Pavel Machek
  2018-01-04 12:09   ` Alan Cox
  0 siblings, 1 reply; 107+ messages in thread
From: Pavel Machek @ 2018-01-04 11:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: tglx, torvalds, gregkh, linux-kernel, tim.c.chen

[-- Attachment #1: Type: text/plain, Size: 1039 bytes --]

Hi!

> This is a fix for Variant 2 in 
> https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
> 
> Any speculative indirect calls in the kernel can be tricked 
> to execute any kernel code, which may allow side channel
> attacks that can leak arbitrary kernel data.

Ok.

> So we want to avoid speculative indirect calls in the kernel.
> 
> There's a special code sequence called a retpoline that can
> do indirect calls without speculation. We use a new compiler
> option -mindirect-branch=thunk-extern (gcc patch will be released
> separately) to recompile the kernel with this new sequence.

So... this "retpoline" code is quite tricky; I guess it does the right
on recent Intel CPUs. Does it also do the right thing on all the AMD,
Cyrix, ... variants?

Is it neccessary on all the CPUs? I guess 486 does not need this?

									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 107+ messages in thread

* Avoid speculative indirect calls in kernel
@ 2018-01-04  2:00 Andi Kleen
  2018-01-04 11:49 ` Pavel Machek
  0 siblings, 1 reply; 107+ messages in thread
From: Andi Kleen @ 2018-01-04  2:00 UTC (permalink / raw)
  To: tglx; +Cc: torvalds, gregkh, linux-kernel, tim.c.chen

This is a fix for Variant 2 in 
https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html

Any speculative indirect calls in the kernel can be tricked 
to execute any kernel code, which may allow side channel
attacks that can leak arbitrary kernel data.

So we want to avoid speculative indirect calls in the kernel.

There's a special code sequence called a retpoline that can
do indirect calls without speculation. We use a new compiler
option -mindirect-branch=thunk-extern (gcc patch will be released
separately) to recompile the kernel with this new sequence.

We also patch all the assembler code in the kernel to use
the new sequence.

The patches were originally from David Woodhouse and Tim Chen,
but then reworked and enhanced by me.

No performance numbers at this point. 32bit is only boot tested.

Git tree available in 
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc spec/retpoline-415-2

v1: Initial post.
v2:
Add CONFIG_RETPOLINE to build kernel without it.
Change warning messages.
Hide modpost warning message

^ permalink raw reply	[flat|nested] 107+ messages in thread

end of thread, other threads:[~2018-02-23 21:09 UTC | newest]

Thread overview: 107+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-03 23:09 Avoid speculative indirect calls in kernel Andi Kleen
2018-01-03 23:09 ` [PATCH 01/11] x86/retpoline: Define retpoline indirect thunk and macros Andi Kleen
2018-01-03 23:09 ` [PATCH 02/11] x86/retpoline/crypto: Convert crypto assembler indirect jumps Andi Kleen
2018-01-03 23:09 ` [PATCH 03/11] x86/retpoline/entry: Convert entry " Andi Kleen
2018-01-03 23:09 ` [PATCH 04/11] x86/retpoline/ftrace: Convert ftrace " Andi Kleen
2018-01-03 23:09 ` [PATCH 05/11] x86/retpoline/hyperv: Convert " Andi Kleen
2018-01-03 23:09 ` [PATCH 06/11] x86/retpoline/crypto: Convert xen " Andi Kleen
2018-01-03 23:09 ` [PATCH 07/11] x86/retpoline/checksum32: Convert " Andi Kleen
2018-01-03 23:09 ` [PATCH 08/11] x86/retpoline/irq32: " Andi Kleen
2018-01-03 23:09 ` [PATCH 09/11] x86/retpoline: Finally enable retpoline for C code Andi Kleen
2018-01-04  8:28   ` Greg KH
2018-01-04  8:30     ` Dave Hansen
2018-01-03 23:09 ` [PATCH 10/11] retpoline/taint: Taint kernel for missing retpoline in compiler Andi Kleen
2018-01-04  0:29   ` Thomas Gleixner
2018-01-04  0:35     ` Randy Dunlap
2018-01-03 23:09 ` [PATCH 11/11] retpoline/objtool: Disable some objtool warnings Andi Kleen
2018-01-03 23:51 ` Avoid speculative indirect calls in kernel Linus Torvalds
2018-01-04  0:00   ` Alan Cox
2018-01-04  0:09   ` Andi Kleen
2018-01-04  0:12     ` Thomas Gleixner
2018-01-04  0:15       ` Andi Kleen
2018-01-04  0:19         ` Jiri Kosina
2018-01-05  2:01           ` james harvey
2018-01-05 10:40             ` Woodhouse, David
2018-01-05 12:29               ` james harvey
2018-01-05 12:06             ` Alan Cox
2018-01-04  0:29         ` Alan Cox
2018-01-04  0:31           ` Thomas Gleixner
2018-01-04  0:38             ` Alan Cox
2018-01-04  0:40             ` Andi Kleen
2018-01-04  8:15               ` Woodhouse, David
2018-01-04 15:53                 ` Andi Kleen
2018-01-04 15:55                   ` Woodhouse, David
2018-01-04  0:20       ` Linus Torvalds
2018-01-04  0:26         ` Thomas Gleixner
2018-01-04  0:18     ` David Lang
2018-01-04  1:00   ` Paul Turner
2018-01-04  1:41   ` Paolo Bonzini
2018-01-04  1:59     ` Alan Cox
2018-01-04  2:11       ` Paolo Bonzini
2018-01-04  8:20         ` Woodhouse, David
2018-01-04 11:42           ` Pavel Machek
2018-01-04 11:47             ` Woodhouse, David
2018-01-04 14:20               ` Paolo Bonzini
2018-01-04 14:51                 ` Andrew Cooper
2018-01-04 15:29                   ` Woodhouse, David
2018-01-04 15:32                     ` Paolo Bonzini
2018-01-04 15:37                       ` Andrew Cooper
2018-01-04 16:15                     ` David Woodhouse
2018-01-04 20:00                       ` Tom Lendacky
2018-01-04 20:05                         ` David Woodhouse
2018-01-04 23:47                           ` Tom Lendacky
2018-01-05  0:06                             ` Andrew Cooper
2018-01-05  0:26                             ` Tom Lendacky
2018-01-04 16:52                     ` Andrea Arcangeli
2018-01-04 15:32                   ` Paolo Bonzini
2018-01-04 16:25                     ` Andrea Arcangeli
2018-01-04 17:04                       ` Alan Cox
2018-01-04 17:40                         ` Andrea Arcangeli
2018-01-04 17:13                       ` Dave Hansen
2018-01-04 17:15                         ` Paolo Bonzini
2018-01-04 18:05                           ` Andrea Arcangeli
2018-01-04 14:55                 ` Woodhouse, David
2018-01-04 18:24                 ` Pavel Machek
2018-01-04 19:57           ` Jon Masters
2018-01-05  0:41             ` Jon Masters
2018-01-05  0:54               ` Thomas Gleixner
2018-01-05  4:11                 ` Jon Masters
2018-01-05  9:59                   ` Thomas Gleixner
2018-01-08 10:28                     ` Andrea Arcangeli
2018-01-08 20:42                       ` [tip:x86/pti] x86/tboot: Unbreak tboot with PTI enabled tip-bot for Dave Hansen
2018-01-08 20:53                       ` Avoid speculative indirect calls in kernel Thomas Gleixner
2018-01-08 21:32                         ` Andrea Arcangeli
2018-01-10  0:45                           ` Thomas Gleixner
2018-01-10  1:11                             ` Dave Hansen
2018-01-10 16:02                               ` Thomas Gleixner
2018-01-05  6:49                 ` Willy Tarreau
2018-01-05  6:57                   ` Dave Hansen
2018-01-05  7:13                     ` Willy Tarreau
2018-01-07 14:14                       ` Borislav Petkov
2018-01-07 17:21                         ` David Lang
2018-01-07 18:49                           ` Borislav Petkov
2018-01-07 17:44                         ` Willy Tarreau
2018-01-07 18:55                           ` Borislav Petkov
2018-01-07 22:10                             ` Willy Tarreau
2018-01-08  9:18                               ` Thomas Gleixner
2018-01-08  9:29                                 ` Willy Tarreau
2018-01-08 16:22                               ` Borislav Petkov
2018-01-08 16:53                                 ` Willy Tarreau
2018-01-05 12:12                 ` Alan Cox
2018-01-09  1:44                   ` Samir Bellabes
     [not found]                 ` <CAL9bgJ8XNJgCtxR6+M+Vm9eDBVZ4Dyi_-Lt-Q1ei9N=TE2c6cg@mail.gmail.com>
2018-01-07  5:04                   ` Fwd: " Kiernan Hager
2018-01-07  6:39                     ` Willy Tarreau
2018-01-07 14:01                     ` Alan Cox
2018-01-07 17:47                       ` Willy Tarreau
2018-01-07 18:01                         ` Ivan Ivanov
2018-01-07 18:16                           ` Woodhouse, David
2018-01-04 11:26   ` Pavel Machek
2018-01-04 11:54     ` Alan Cox
2018-01-04 18:33     ` Linus Torvalds
2018-01-04 20:08       ` Jon Masters
2018-01-04  2:00 Andi Kleen
2018-01-04 11:49 ` Pavel Machek
2018-01-04 12:09   ` Alan Cox
2018-01-04 13:32     ` Pavel Machek
2018-01-12  8:20 Dr. Greg Wettstein
2018-02-23 21:10 Ywe Cærlyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.