* [patch V11 00/16] SSB 0
@ 2018-05-02 21:51 Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 01/16] SSB 1 Thomas Gleixner
                   ` (17 more replies)
  0 siblings, 18 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Changes since V10:

  - Addressed Ingo's review feedback

  - Picked up Reviewed-bys

Delta patch below. The bundle is coming in a separate mail. Git repo branches
are updated as well. The master branch also contains the fix for the lost IBRS
issue Tim was seeing.
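
Since the delta also touches the PR_GET/PR_SET_SPECULATION_CTRL numbers and
their documentation, here is a minimal user-space sketch of querying the new
per-task control. It is purely illustrative and not part of the series; the
constants mirror include/uapi/linux/prctl.h after this delta, and on a kernel
without the series the call simply fails with EINVAL.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_SPECULATION_CTRL
#define PR_GET_SPECULATION_CTRL 52
#endif
#ifndef PR_SPEC_STORE_BYPASS
#define PR_SPEC_STORE_BYPASS 0
#endif

int main(void)
{
        /* Return value uses bits 0-2, see Documentation/userspace-api/spec_ctrl.rst */
        int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);

        if (state < 0) {
                perror("prctl(PR_GET_SPECULATION_CTRL)");
                return 1;
        }
        printf("speculative store bypass state: 0x%x\n", state);
        return 0;
}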

If there are no further issues or nitpicks, I'm going to make these changes
immutable, and any further changes will need to go incrementally on top.

Thanks,

	tglx

8<--------------------
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 29984fd3dd18..a8d2ae1e335b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4051,11 +4051,12 @@
 
 			on     - Unconditionally disable Speculative Store Bypass
 			off    - Unconditionally enable Speculative Store Bypass
-			auto   - Kernel detects whether the CPU model contains a
+			auto   - Kernel detects whether the CPU model contains an
 				 implementation of Speculative Store Bypass and
-				 picks the most appropriate mitigation
-			prctl  - Control Speculative Store Bypass for a thread
-				 via prctl. By default it is enabled. The state
+				 picks the most appropriate mitigation.
+			prctl  - Control Speculative Store Bypass per thread
+				 via prctl. Speculative Store Bypass is enabled
+				 for a process by default. The state of the control
 				 is inherited on fork.
 
 			Not specifying this option is equivalent to
diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst
index 8ff39a26a992..ddbebcd01208 100644
--- a/Documentation/userspace-api/spec_ctrl.rst
+++ b/Documentation/userspace-api/spec_ctrl.rst
@@ -10,7 +10,7 @@ The kernel provides mitigation for such vulnerabilities in various
 forms. Some of these mitigations are compile time configurable and some on
 the kernel command line.
 
-There is also a class of mitigations which is very expensive, but they can
+There is also a class of mitigations which are very expensive, but they can
 be restricted to a certain set of processes or tasks in controlled
 environments. The mechanism to control these mitigations is via
 :manpage:`prctl(2)`.
@@ -25,7 +25,7 @@ PR_GET_SPECULATION_CTRL
 -----------------------
 
 PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
-which is selected with arg2 of prctl(2). The return value uses bit 0-2 with
+which is selected with arg2 of prctl(2). The return value uses bits 0-2 with
 the following meaning:
 
 ==== ================ ===================================================
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5bee7a2ca4ff..810f50bb338d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -70,7 +70,11 @@
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
-#define ARCH_CAP_RDS_NO			(1 << 4)   /* Not susceptible to speculative store bypass */
+#define ARCH_CAP_RDS_NO			(1 << 4)   /*
+						    * Not susceptible to Speculative Store Bypass
+						    * attack, so no Reduced Data Speculation control
+						    * required.
+						    */
 
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 023e2edc0f3c..71ad01422655 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -225,8 +225,8 @@ enum spectre_v2_mitigation {
  * ourselves and always use this as the base for SPEC_CTRL.
  * We also use this when handling guest entry/exit as below.
  */
-extern void x86_set_spec_ctrl(u64);
-extern u64 x86_get_default_spec_ctrl(void);
+extern void x86_spec_ctrl_set(u64);
+extern u64 x86_spec_ctrl_get_default(void);
 
 /* The Speculative Store Bypass disable variants */
 enum ssb_mitigation {
@@ -285,7 +285,7 @@ static inline void indirect_branch_prediction_barrier(void)
  */
 #define firmware_restrict_branch_speculation_start()			\
 do {									\
-	u64 val = x86_get_default_spec_ctrl() | SPEC_CTRL_IBRS;		\
+	u64 val = x86_spec_ctrl_get_default() | SPEC_CTRL_IBRS;		\
 									\
 	preempt_disable();						\
 	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
@@ -294,7 +294,7 @@ do {									\
 
 #define firmware_restrict_branch_speculation_end()			\
 do {									\
-	u64 val = x86_get_default_spec_ctrl();				\
+	u64 val = x86_spec_ctrl_get_default();				\
 									\
 	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 607236af4008..45ef00ad5105 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -12,8 +12,8 @@
  * shadowable for guests but this is not (currently) the case.
  * Takes the guest view of SPEC_CTRL MSR as a parameter.
  */
-extern void x86_set_guest_spec_ctrl(u64);
-extern void x86_restore_host_spec_ctrl(u64);
+extern void x86_spec_ctrl_set_guest(u64);
+extern void x86_spec_ctrl_restore_host(u64);
 
 /* AMD specific Speculative Store Bypass MSR data */
 extern u64 x86_amd_ls_cfg_base;
@@ -30,7 +30,7 @@ static inline u64 rds_tif_to_spec_ctrl(u64 tifn)
 
 static inline u64 rds_tif_to_amd_ls_cfg(u64 tifn)
 {
-	return tifn & _TIF_RDS ? x86_amd_ls_cfg_rds_mask : 0ULL;
+	return (tifn & _TIF_RDS) ? x86_amd_ls_cfg_rds_mask : 0ULL;
 }
 
 extern void speculative_store_bypass_update(void);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 50c6ba6d031b..18efc33a8d2e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -572,7 +572,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
 			setup_force_cpu_cap(X86_FEATURE_RDS);
 			setup_force_cpu_cap(X86_FEATURE_AMD_RDS);
-			x86_amd_ls_cfg_rds_mask = (1ULL << bit);
+			x86_amd_ls_cfg_rds_mask = 1ULL << bit;
 		}
 	}
 }
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c28856e475c8..15f77d4518c7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -32,7 +32,7 @@ static void __init spectre_v2_select_mitigation(void);
 static void __init ssb_select_mitigation(void);
 
 /*
- * Our boot-time value of SPEC_CTRL MSR. We read it once so that any
+ * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
  * writes to SPEC_CTRL contain whatever reserved bits have been set.
  */
 u64 __ro_after_init x86_spec_ctrl_base;
@@ -41,11 +41,11 @@ u64 __ro_after_init x86_spec_ctrl_base;
  * The vendor and possibly platform specific bits which can be modified in
  * x86_spec_ctrl_base.
  */
-static u64 __ro_after_init x86_spec_ctrl_mask = ~(SPEC_CTRL_IBRS);
+static u64 __ro_after_init x86_spec_ctrl_mask = ~SPEC_CTRL_IBRS;
 
 /*
- * AMD specific MSR info for Store Bypass control.  x86_amd_ls_cfg_rds_mask
- * is initialized in identify_boot_cpu().
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_rds_mask is initialized in identify_boot_cpu().
  */
 u64 __ro_after_init x86_amd_ls_cfg_base;
 u64 __ro_after_init x86_amd_ls_cfg_rds_mask;
@@ -61,7 +61,7 @@ void __init check_bugs(void)
 
 	/*
 	 * Read the SPEC_CTRL MSR to account for reserved bits which may
-	 * have unknown values. AMD64_LS_CFG msr is cached in the early AMD
+	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
 	 * init code as it is not enumerated and depends on the family.
 	 */
 	if (boot_cpu_has(X86_FEATURE_IBRS))
@@ -131,22 +131,22 @@ static const char *spectre_v2_strings[] = {
 
 static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
 
-void x86_set_spec_ctrl(u64 val)
+void x86_spec_ctrl_set(u64 val)
 {
 	if (val & x86_spec_ctrl_mask)
 		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
 	else
 		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
 }
-EXPORT_SYMBOL_GPL(x86_set_spec_ctrl);
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_set);
 
-u64 x86_get_default_spec_ctrl(void)
+u64 x86_spec_ctrl_get_default(void)
 {
 	return x86_spec_ctrl_base;
 }
-EXPORT_SYMBOL_GPL(x86_get_default_spec_ctrl);
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
 
-void x86_set_guest_spec_ctrl(u64 guest_spec_ctrl)
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
 {
 	u64 host = x86_spec_ctrl_base;
 
@@ -159,9 +159,9 @@ void x86_set_guest_spec_ctrl(u64 guest_spec_ctrl)
 	if (host != guest_spec_ctrl)
 		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
 }
-EXPORT_SYMBOL_GPL(x86_set_guest_spec_ctrl);
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_set_guest);
 
-void x86_restore_host_spec_ctrl(u64 guest_spec_ctrl)
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
 {
 	u64 host = x86_spec_ctrl_base;
 
@@ -174,7 +174,7 @@ void x86_restore_host_spec_ctrl(u64 guest_spec_ctrl)
 	if (host != guest_spec_ctrl)
 		wrmsrl(MSR_IA32_SPEC_CTRL, host);
 }
-EXPORT_SYMBOL_GPL(x86_restore_host_spec_ctrl);
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
 
 static void x86_amd_rds_enable(void)
 {
@@ -504,8 +504,8 @@ static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			x86_spec_ctrl_base |= SPEC_CTRL_RDS;
-			x86_spec_ctrl_mask &= ~(SPEC_CTRL_RDS);
-			x86_set_spec_ctrl(SPEC_CTRL_RDS);
+			x86_spec_ctrl_mask &= ~SPEC_CTRL_RDS;
+			x86_spec_ctrl_set(SPEC_CTRL_RDS);
 			break;
 		case X86_VENDOR_AMD:
 			x86_amd_rds_enable();
@@ -560,7 +560,7 @@ static int ssb_prctl_get(void)
 	}
 }
 
-int arch_prctl_set_spec_ctrl(unsigned long which, unsigned long ctrl)
+int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
 {
 	if (ctrl != PR_SPEC_ENABLE && ctrl != PR_SPEC_DISABLE)
 		return -ERANGE;
@@ -573,7 +573,7 @@ int arch_prctl_set_spec_ctrl(unsigned long which, unsigned long ctrl)
 	}
 }
 
-int arch_prctl_get_spec_ctrl(unsigned long which)
+int arch_prctl_spec_ctrl_get(unsigned long which)
 {
 	switch (which) {
 	case PR_SPEC_STORE_BYPASS:
@@ -583,10 +583,10 @@ int arch_prctl_get_spec_ctrl(unsigned long which)
 	}
 }
 
-void x86_setup_ap_spec_ctrl(void)
+void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_IBRS))
-		x86_set_spec_ctrl(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
+		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
 
 	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
 		x86_amd_rds_enable();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f3dbdde978a4..e0517bcee446 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -848,6 +848,11 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_power = edx;
 	}
 
+	if (c->extended_cpuid_level >= 0x80000008) {
+		cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+		c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+	}
+
 	if (c->extended_cpuid_level >= 0x8000000a)
 		c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
 
@@ -871,7 +876,6 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c)
 
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
-		c->x86_capability[CPUID_8000_0008_EBX] = ebx;
 	}
 #ifdef CONFIG_X86_32
 	else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
@@ -924,26 +928,26 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
 };
 
 static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_PINEVIEW },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_LINCROFT },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_PENWELL },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_CLOVERVIEW },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_CEDARVIEW },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SILVERMONT1 },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_AIRMONT },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_SILVERMONT2 },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_ATOM_MERRIFIELD },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_CORE_YONAH },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_XEON_PHI_KNL },
-	{ X86_VENDOR_INTEL,     6, INTEL_FAM6_XEON_PHI_KNM },
-	{ X86_VENDOR_CENTAUR,	5 },
-	{ X86_VENDOR_INTEL,	5 },
-	{ X86_VENDOR_NSC,	5 },
-	{ X86_VENDOR_AMD,	0xf },
-	{ X86_VENDOR_AMD,	0x10 },
-	{ X86_VENDOR_AMD,	0x11 },
-	{ X86_VENDOR_AMD,	0x12 },
-	{ X86_VENDOR_ANY,	4 },
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_PINEVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_LINCROFT	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_PENWELL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_CLOVERVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_CEDARVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_CORE_YONAH		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
+	{ X86_VENDOR_CENTAUR,	5,					},
+	{ X86_VENDOR_INTEL,	5,					},
+	{ X86_VENDOR_NSC,	5,					},
+	{ X86_VENDOR_AMD,	0x12,					},
+	{ X86_VENDOR_AMD,	0x11,					},
+	{ X86_VENDOR_AMD,	0x10,					},
+	{ X86_VENDOR_AMD,	0xf,					},
+	{ X86_VENDOR_ANY,	4,					},
 	{}
 };
 
@@ -1384,7 +1388,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
 #endif
 	mtrr_ap_init();
 	validate_apic_and_package_id(c);
-	x86_setup_ap_spec_ctrl();
+	x86_spec_ctrl_setup_ap();
 }
 
 static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index faaabc160293..37672d299e35 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -50,6 +50,6 @@ extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
 
 unsigned int aperfmperf_get_khz(int cpu);
 
-extern void x86_setup_ap_spec_ctrl(void);
+extern void x86_spec_ctrl_setup_ap(void);
 
 #endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ba4763e9a285..437c1b371129 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5557,7 +5557,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	x86_set_guest_spec_ctrl(svm->spec_ctrl);
+	x86_spec_ctrl_set_guest(svm->spec_ctrl);
 
 	asm volatile (
 		"push %%" _ASM_BP "; \n\t"
@@ -5669,7 +5669,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	x86_restore_host_spec_ctrl(svm->spec_ctrl);
+	x86_spec_ctrl_restore_host(svm->spec_ctrl);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9744e48457d6..16a111e44691 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9722,7 +9722,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	x86_set_guest_spec_ctrl(vmx->spec_ctrl);
+	x86_spec_ctrl_set_guest(vmx->spec_ctrl);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
 
@@ -9870,7 +9870,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	x86_restore_host_spec_ctrl(vmx->spec_ctrl);
+	x86_spec_ctrl_restore_host(vmx->spec_ctrl);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();
diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index 1e63a0a90e96..700bb8a4e4ea 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -57,7 +57,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 })
 
 /* Speculation control prctl */
-int arch_prctl_set_spec_ctrl(unsigned long which, unsigned long ctrl);
-int arch_prctl_get_spec_ctrl(unsigned long which);
+int arch_prctl_spec_ctrl_get(unsigned long which);
+int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl);
 
 #endif /* _LINUX_NOSPEC_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 4e7a160d3b28..ebf057ac1346 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -208,8 +208,8 @@ struct prctl_mm_map {
 # define PR_SVE_VL_INHERIT		(1 << 17) /* inherit across exec */
 
 /* Per task speculation control */
-#define PR_SET_SPECULATION_CTRL		52
-#define PR_GET_SPECULATION_CTRL		53
+#define PR_GET_SPECULATION_CTRL		52
+#define PR_SET_SPECULATION_CTRL		53
 /* Speculation control variants */
 # define PR_SPEC_STORE_BYPASS		0
 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */
diff --git a/kernel/sys.c b/kernel/sys.c
index d7afe29319f1..b76dee23bdc9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2244,12 +2244,12 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 	return 1;
 }
 
-int __weak arch_prctl_set_spec_ctrl(unsigned long which, unsigned long ctrl)
+int __weak arch_prctl_spec_ctrl_get(unsigned long which)
 {
 	return -EINVAL;
 }
 
-int __weak arch_prctl_get_spec_ctrl(unsigned long which)
+int __weak arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
 {
 	return -EINVAL;
 }
@@ -2462,15 +2462,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SVE_GET_VL:
 		error = SVE_GET_VL();
 		break;
-	case PR_SET_SPECULATION_CTRL:
-		if (arg4 || arg5)
-			return -EINVAL;
-		error = arch_prctl_set_spec_ctrl(arg2, arg3);
-		break;
 	case PR_GET_SPECULATION_CTRL:
 		if (arg3 || arg4 || arg5)
 			return -EINVAL;
-		error = arch_prctl_get_spec_ctrl(arg2);
+		error = arch_prctl_spec_ctrl_get(arg2);
+		break;
+	case PR_SET_SPECULATION_CTRL:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = arch_prctl_spec_ctrl_set(arg2, arg3);
 		break;
 	default:
 		error = -EINVAL;


* [patch V11 01/16] SSB 1
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 02/16] SSB 2 Thomas Gleixner
                   ` (16 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

The macro is not type safe, and I did look into why that "g" constraint for
the asm doesn't work: it's because the asm is more fundamentally wrong.

It does

        movl %[val], %%eax

but "val" isn't a 32-bit value, so then gcc will pass it in a register, 
and generate code like

        movl %rsi, %eax

and gas will complain about a nonsensical 'mov' instruction (it's moving a 
64-bit register to a 32-bit one).

Passing it through memory will just hide the real bug - gcc still thinks 
the memory location is 64-bit, but the "movl" will only load the first 32 
bits and it all happens to work because x86 is little-endian.

Convert it to a type safe inline function with a little trick which hands
the feature into the ALTERNATIVE macro.
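
For illustration, here is a stand-alone user-space sketch of the operand size
problem and of the type safe shape used below. It is only a sketch: a plain
nop stands in for the privileged wrmsr, and the broken variant is kept in a
comment so the file still builds.

#include <stdint.h>

/*
 * Broken shape: with a 64-bit "val" the compiler substitutes a 64-bit
 * register into "movl %[val], %%eax" and gas rejects the size mismatch.
 *
 * static void spec_msr_write(uint32_t msr, uint64_t val)
 * {
 *         asm volatile("movl %[msr], %%ecx\n\t"
 *                      "movl %[val], %%eax\n\t"
 *                      "movl $0, %%edx\n\t"
 *                      : : [msr] "g" (msr), [val] "g" (val)
 *                      : "eax", "ecx", "edx");
 * }
 */

/* Type safe shape: let the compiler load ECX/EAX/EDX via constraints. */
static void spec_msr_write(uint32_t msr, uint64_t val)
{
        asm volatile("nop"                      /* "wrmsr" in the real code */
                     : : "c" (msr), "a" ((uint32_t)val), "d" ((uint32_t)(val >> 32))
                     : "memory");
}

int main(void)
{
        spec_msr_write(0x48, 1ULL << 0);        /* SPEC_CTRL / IBRS, sample values only */
        return 0;
}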

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h |   19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -241,15 +241,16 @@ static inline void vmexit_fill_RSB(void)
 #endif
 }
 
-#define alternative_msr_write(_msr, _val, _feature)		\
-	asm volatile(ALTERNATIVE("",				\
-				 "movl %[msr], %%ecx\n\t"	\
-				 "movl %[val], %%eax\n\t"	\
-				 "movl $0, %%edx\n\t"		\
-				 "wrmsr",			\
-				 _feature)			\
-		     : : [msr] "i" (_msr), [val] "i" (_val)	\
-		     : "eax", "ecx", "edx", "memory")
+static __always_inline
+void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
+{
+	asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
+		: : "c" (msr),
+		    "a" (val),
+		    "d" (val >> 32),
+		    [feature] "i" (feature)
+		: "memory");
+}
 
 static inline void indirect_branch_prediction_barrier(void)
 {


* [patch V11 02/16] SSB 2
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 01/16] SSB 1 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 03/16] SSB 3 Thomas Gleixner
                   ` (15 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Combine the various pieces of logic which go through all those
x86_cpu_id matching structures into one function.

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
v2: New patch
v3: Added Reviewed-by
v4: Changed title of the patch
---
 arch/x86/kernel/cpu/common.c |   21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -927,21 +927,27 @@ static const __initconst struct x86_cpu_
 	{}
 };
 
-static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
+	if (x86_match_cpu(cpu_no_speculation))
+		return;
+
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+
 	if (x86_match_cpu(cpu_no_meltdown))
-		return false;
+		return;
 
 	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
 
 	/* Rogue Data Cache Load? No! */
 	if (ia32_cap & ARCH_CAP_RDCL_NO)
-		return false;
+		return;
 
-	return true;
+	setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 }
 
 /*
@@ -992,12 +998,7 @@ static void __init early_identify_cpu(st
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
-	if (!x86_match_cpu(cpu_no_speculation)) {
-		if (cpu_vulnerable_to_meltdown(c))
-			setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
-		setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
-		setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
-	}
+	cpu_set_bug_bits(c);
 
 	fpu__init_system(c);
 


* [patch V11 03/16] SSB 3
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 01/16] SSB 1 Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 02/16] SSB 2 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 04/16] SSB 4 Thomas Gleixner
                   ` (14 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Those sysfs functions have a similar preamble; as such, factor out
common code to handle them.

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
v2: New patch
v3: Added Reviewed-by
v4: Fixed the title
---
 arch/x86/kernel/cpu/bugs.c | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bfca937bdcc3..ad613f73d071 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -314,30 +314,48 @@ static void __init spectre_v2_select_mitigation(void)
 #undef pr_fmt
 
 #ifdef CONFIG_SYSFS
-ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+
+ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+			char *buf, unsigned int bug)
 {
-	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+	if (!boot_cpu_has_bug(bug))
 		return sprintf(buf, "Not affected\n");
-	if (boot_cpu_has(X86_FEATURE_PTI))
-		return sprintf(buf, "Mitigation: PTI\n");
+
+	switch (bug) {
+	case X86_BUG_CPU_MELTDOWN:
+		if (boot_cpu_has(X86_FEATURE_PTI))
+			return sprintf(buf, "Mitigation: PTI\n");
+
+		break;
+
+	case X86_BUG_SPECTRE_V1:
+		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+
+	case X86_BUG_SPECTRE_V2:
+		return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+			       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
+			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+			       spectre_v2_module_string());
+
+	default:
+		break;
+	}
+
 	return sprintf(buf, "Vulnerable\n");
 }
 
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
 ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
-		return sprintf(buf, "Not affected\n");
-	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
 }
 
 ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-		return sprintf(buf, "Not affected\n");
-
-	return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
-		       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
-		       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
-		       spectre_v2_module_string());
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
 }
 #endif
-- 
2.14.3


* [patch V11 04/16] SSB 4
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (2 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 03/16] SSB 3 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 05/16] SSB 5 Thomas Gleixner
                   ` (13 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

The 336996-Speculative-Execution-Side-Channel-Mitigations.pdf refers to all
the other bits as reserved. The Intel SDM glossary defines reserved as
implementation specific - aka unknown.

As such, at bootup this must be taken into account and proper masking for
the bits in use must be applied.
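
Reduced to its skeleton, the pattern applied below looks roughly like this
(illustration only; the actual rdmsrl/wrmsrl accesses need ring 0 and are
left out):

#include <stdint.h>

#define SPEC_CTRL_IBRS  (1ULL << 0)     /* the only bit this patch knows about */

static uint64_t x86_spec_ctrl_base;     /* captured once from the MSR at boot */

static uint64_t spec_ctrl_value_to_write(uint64_t val)
{
        /* Refuse bits we do not know about ... */
        if (val & ~SPEC_CTRL_IBRS)
                return x86_spec_ctrl_base;
        /* ... and always keep the reserved bits read at boot. */
        return x86_spec_ctrl_base | val;
}

int main(void)
{
        x86_spec_ctrl_base = 0xf0;      /* pretend firmware left reserved bits set */
        return spec_ctrl_value_to_write(SPEC_CTRL_IBRS) == 0xf1 ? 0 : 1;
}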

A copy of this document is available at
https://bugzilla.kernel.org/show_bug.cgi?id=199511

[ tglx: Made x86_spec_ctrl_base __ro_after_init ]

Suggested-by: Jon Masters <jcm@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h |   24 ++++++++++++++++++++----
 arch/x86/kernel/cpu/bugs.c           |   28 ++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -217,6 +217,17 @@ enum spectre_v2_mitigation {
 	SPECTRE_V2_IBRS,
 };
 
+/*
+ * The Intel specification for the SPEC_CTRL MSR requires that we
+ * preserve any already set reserved bits at boot time (e.g. for
+ * future additions that this kernel is not currently aware of).
+ * We then set any additional mitigation bits that we want
+ * ourselves and always use this as the base for SPEC_CTRL.
+ * We also use this when handling guest entry/exit as below.
+ */
+extern void x86_spec_ctrl_set(u64);
+extern u64 x86_spec_ctrl_get_default(void);
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
@@ -254,8 +265,9 @@ void alternative_msr_write(unsigned int
 
 static inline void indirect_branch_prediction_barrier(void)
 {
-	alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,
-			      X86_FEATURE_USE_IBPB);
+	u64 val = PRED_CMD_IBPB;
+
+	alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
 }
 
 /*
@@ -266,14 +278,18 @@ static inline void indirect_branch_predi
  */
 #define firmware_restrict_branch_speculation_start()			\
 do {									\
+	u64 val = x86_spec_ctrl_get_default() | SPEC_CTRL_IBRS;		\
+									\
 	preempt_disable();						\
-	alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS,	\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
 } while (0)
 
 #define firmware_restrict_branch_speculation_end()			\
 do {									\
-	alternative_msr_write(MSR_IA32_SPEC_CTRL, 0,			\
+	u64 val = x86_spec_ctrl_get_default();				\
+									\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
 	preempt_enable();						\
 } while (0)
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -28,6 +28,12 @@
 
 static void __init spectre_v2_select_mitigation(void);
 
+/*
+ * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
+ * writes to SPEC_CTRL contain whatever reserved bits have been set.
+ */
+static u64 __ro_after_init x86_spec_ctrl_base;
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -37,6 +43,13 @@ void __init check_bugs(void)
 		print_cpu_info(&boot_cpu_data);
 	}
 
+	/*
+	 * Read the SPEC_CTRL MSR to account for reserved bits which may
+	 * have unknown values.
+	 */
+	if (boot_cpu_has(X86_FEATURE_IBRS))
+		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
 	/* Select the proper spectre mitigation before patching alternatives */
 	spectre_v2_select_mitigation();
 
@@ -95,6 +108,21 @@ static const char *spectre_v2_strings[]
 
 static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
 
+void x86_spec_ctrl_set(u64 val)
+{
+	if (val & ~SPEC_CTRL_IBRS)
+		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
+	else
+		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_set);
+
+u64 x86_spec_ctrl_get_default(void)
+{
+	return x86_spec_ctrl_base;
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
+
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
 


* [patch V11 05/16] SSB 5
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (3 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 04/16] SSB 4 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-10 17:52   ` [MODERATED] " Andi Kleen
  2018-05-02 21:51 ` [patch V11 06/16] SSB 6 Thomas Gleixner
                   ` (12 subsequent siblings)
  17 siblings, 1 reply; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

A guest may modify the SPEC_CTRL MSR from the value used by the
kernel. Since the kernel doesn't use IBRS, this means a value of zero is
what is needed in the host.

But the 336996-Speculative-Execution-Side-Channel-Mitigations.pdf refers to
the other bits as reserved, so the kernel should respect the boot time
SPEC_CTRL value and use that.

This allows dealing with future extensions to the SPEC_CTRL interface, if
there are any at all.

Note: This uses wrmsrl() instead of native_wrmsrl(). It does not make any
difference as paravirt will overwrite the callq *0xfff.. with the wrmsrl
assembler code.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
v2: New patch
v3: Use the two accessory functions instead of poking at the global variable.
v4: Use x86_get_spec_ctrl instead of global variable.
v5: Use x86_get_default_spec_ctrl instead of x86_get_spec_ctrl
---
 arch/x86/include/asm/nospec-branch.h |   10 ++++++++++
 arch/x86/kernel/cpu/bugs.c           |   18 ++++++++++++++++++
 arch/x86/kvm/svm.c                   |    6 ++----
 arch/x86/kvm/vmx.c                   |    6 ++----
 4 files changed, 32 insertions(+), 8 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -228,6 +228,16 @@ enum spectre_v2_mitigation {
 extern void x86_spec_ctrl_set(u64);
 extern u64 x86_spec_ctrl_get_default(void);
 
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter.
+ */
+extern void x86_spec_ctrl_set_guest(u64);
+extern void x86_spec_ctrl_restore_host(u64);
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -123,6 +123,24 @@ u64 x86_spec_ctrl_get_default(void)
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_default);
 
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
+{
+	if (!boot_cpu_has(X86_FEATURE_IBRS))
+		return;
+	if (x86_spec_ctrl_base != guest_spec_ctrl)
+		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_set_guest);
+
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
+{
+	if (!boot_cpu_has(X86_FEATURE_IBRS))
+		return;
+	if (x86_spec_ctrl_base != guest_spec_ctrl)
+		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+}
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
+
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
 
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5557,8 +5557,7 @@ static void svm_vcpu_run(struct kvm_vcpu
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (svm->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+	x86_spec_ctrl_set_guest(svm->spec_ctrl);
 
 	asm volatile (
 		"push %%" _ASM_BP "; \n\t"
@@ -5670,8 +5669,7 @@ static void svm_vcpu_run(struct kvm_vcpu
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	if (svm->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	x86_spec_ctrl_restore_host(svm->spec_ctrl);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9720,8 +9720,7 @@ static void __noclone vmx_vcpu_run(struc
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (vmx->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	x86_spec_ctrl_set_guest(vmx->spec_ctrl);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
 
@@ -9869,8 +9868,7 @@ static void __noclone vmx_vcpu_run(struc
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	if (vmx->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	x86_spec_ctrl_restore_host(vmx->spec_ctrl);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();


* [patch V11 06/16] SSB 6
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (4 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 05/16] SSB 5 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 07/16] SSB 7 Thomas Gleixner
                   ` (11 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Add the sysfs file for the new vulnerability. It does not do much except
show the word 'Vulnerable' for recent x86 cores.

Intel cores prior to family 6 are known not to be vulnerable, and so are
some Atoms and some Xeon Phi.

It assumes that older Cyrix, Centaur, etc. cores are immune.
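
For completeness, a tiny user-space sketch that reads the new file (purely
illustrative; the path comes from the ABI documentation hunk below, and
kernels without this patch simply do not have the file):

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/spec_store_bypass", "r");

        if (!f) {
                perror("spec_store_bypass");    /* absent on kernels without this patch */
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                fputs(line, stdout);            /* e.g. "Vulnerable" or "Not affected" */
        fclose(f);
        return 0;
}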

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
v1.3: Remove AMD
    s/md/mdd/
v1.4: s/mdd/sbb/
v3: s/SSB/SPEC_STORE_BYPASS
  Rework the logic in cpu_set_bug_bits to be inverse.
v4: Expanded the not affected array
  - s/X86_BUG_CPU_SPEC_STORE_BYPASS/X86_BUG_SPEC_STORE_BYPASS/
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |    1 
 arch/x86/include/asm/cpufeatures.h                 |    1 
 arch/x86/kernel/cpu/bugs.c                         |    5 ++++
 arch/x86/kernel/cpu/common.c                       |   23 +++++++++++++++++++++
 drivers/base/cpu.c                                 |    8 +++++++
 include/linux/cpu.h                                |    2 +
 6 files changed, 40 insertions(+)

--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -478,6 +478,7 @@ What:		/sys/devices/system/cpu/vulnerabi
 		/sys/devices/system/cpu/vulnerabilities/meltdown
 		/sys/devices/system/cpu/vulnerabilities/spectre_v1
 		/sys/devices/system/cpu/vulnerabilities/spectre_v2
+		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -363,5 +363,6 @@
 #define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
 #define X86_BUG_SPECTRE_V1		X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
 #define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS	X86_BUG(17) /* CPU is affected by speculative store bypass attack */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -404,4 +404,9 @@ ssize_t cpu_show_spectre_v2(struct devic
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
 }
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+}
 #endif
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -927,10 +927,33 @@ static const __initconst struct x86_cpu_
 	{}
 };
 
+static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_PINEVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_LINCROFT	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_PENWELL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_CLOVERVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_CEDARVIEW	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_CORE_YONAH		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
+	{ X86_VENDOR_CENTAUR,	5,					},
+	{ X86_VENDOR_INTEL,	5,					},
+	{ X86_VENDOR_NSC,	5,					},
+	{ X86_VENDOR_ANY,	4,					},
+	{}
+};
+
 static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
+	if (!x86_match_cpu(cpu_no_spec_store_bypass))
+		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
 	if (x86_match_cpu(cpu_no_speculation))
 		return;
 
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -534,14 +534,22 @@ ssize_t __weak cpu_show_spectre_v2(struc
 	return sprintf(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_spec_store_bypass(struct device *dev,
+					  struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
 	&dev_attr_spectre_v1.attr,
 	&dev_attr_spectre_v2.attr,
+	&dev_attr_spec_store_bypass.attr,
 	NULL
 };
 
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -53,6 +53,8 @@ extern ssize_t cpu_show_spectre_v1(struc
 				   struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_spectre_v2(struct device *dev,
 				   struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
+					  struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,


* [patch V11 07/16] SSB 7
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (5 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 06/16] SSB 6 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 08/16] SSB 8 Thomas Gleixner
                   ` (10 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Add the CPU feature bit CPUID.7.0.EDX[31] which indicates whether the CPU
supports Reduced Data Speculation.

[ tglx: Split it out from a later patch ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h |    1 +
 1 file changed, 1 insertion(+)

--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -334,6 +334,7 @@
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_RDS			(18*32+31) /* Reduced Data Speculation */
 
 /*
  * BUG word(s)


* [patch V11 08/16] SSB 8
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (6 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 07/16] SSB 7 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 09/16] SSB 9 Thomas Gleixner
                   ` (9 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Contemporary high performance processors use a common industry-wide
optimization known as "Speculative Store Bypass" in which loads from
addresses to which a recent store has occurred may (speculatively) see an
older value. Intel refers to this feature as "Memory Disambiguation" which
is part of their "Smart Memory Access" capability.

Memory Disambiguation can expose a cache side-channel attack against such
speculatively read values. An attacker can create exploit code that allows
them to read memory outside of a sandbox environment (for example,
malicious JavaScript in a web page), or to perform more complex attacks
against code running within the same privilege level, e.g. via the stack.
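
Purely as an illustration of the shape of the problem (this is not an
exploit, and the names are made up):

#include <stdint.h>

/*
 * The CPU may start the load from *ptr_b before it has resolved whether
 * ptr_a and ptr_b alias, speculatively bypassing the in-flight store and
 * briefly observing the stale value.  The dependent table access then
 * leaves a cache footprint which a side channel can measure.
 */
uint8_t ssb_shape(uint8_t *ptr_a, uint8_t *ptr_b, const uint8_t *table)
{
        *ptr_a = 0;                     /* recent store */
        uint8_t stale = *ptr_b;         /* load may speculatively see the old value */

        return table[stale * 64];       /* dependent, cache-visible access */
}

int main(void)
{
        static uint8_t a, b;
        static uint8_t table[256 * 64];

        return ssb_shape(&a, &b, table);        /* architecturally just table[0] */
}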

As a first step to mitigate against such attacks, provide two boot command
line control knobs:

 nospec_store_bypass_disable
 spec_store_bypass_disable=[off,auto,on]

By default affected x86 processors will power on with Speculative
Store Bypass enabled. Hence the provided kernel parameters are written
from the point of view of whether to enable a mitigation or not.
The parameters are as follows:

 - auto - Kernel detects whether your CPU model contains an implementation
	  of Speculative Store Bypass and picks the most appropriate
	  mitigation.

 - on   - disable Speculative Store Bypass
 - off  - enable Speculative Store Bypass

[ tglx: Reordered the checks so that the whole evaluation is not done
  	when the CPU does not support RDS ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>

---
v1.3:
 - Wrong array was used when figuring mdd_v4= arguments
 - If broken microcode found, also disable MD feature.
 - s/mdd_v4/mdd/
 - s/kernel/userpsace/
 - Fix up the commit description
v1.4:
 - Incorporate various feedback and suggestions from Jon Masters
 - Change mdd parameters to allow only big hammer on/off for now
 - Introduce "ssbd" industry standard term for cross-architecture
 - Use mdd_ and MDD_ prefixes to clearly spell out that "enable"
   means we are actually disabling a CPU processor feature (MD)
v2:
 - s/mdd/ssb/ in the x86 code to match with the 'ssb' part
v3:
 - rip out mdd.
 - remove 'Haswell' and such and replace with 'contemporary'
 - s/ssb/spec_store_bypass/ and various other cleanups (Jon Masters)
v4: Rip out 'Memory Disambiguation Disable'
 - Fix up documentation.
 - Rip out the X86_FEATURE_STBUF_BYPASS
 - s/X86_FEATURE_STBUF_MITIGATE/X86_FEATURE_STBUF_BYPASS_MITIGATE/
 - On the functions change spec_store_bypass_ to ssb_
v5: Changed title.
  - s/X86_FEATURE_STBUF_BYPASS_MITIGATE/X86_FEATURE_SPEC_STORE_BYPASS_DISABLE/
  - Add logic to get out of ssb_parse_cmdline is bug is not present
---
 Documentation/admin-guide/kernel-parameters.txt |   33 +++++++
 arch/x86/include/asm/cpufeatures.h              |    1 
 arch/x86/include/asm/nospec-branch.h            |    6 +
 arch/x86/kernel/cpu/bugs.c                      |  103 ++++++++++++++++++++++++
 4 files changed, 143 insertions(+)

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2680,6 +2680,9 @@
 			allow data leaks with this option, which is equivalent
 			to spectre_v2=off.
 
+	nospec_store_bypass_disable
+			[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
+
 	noxsave		[BUGS=X86] Disables x86 extended register state save
 			and restore using xsave. The kernel will fallback to
 			enabling legacy floating-point and sse state.
@@ -4025,6 +4028,36 @@
 			Not specifying this option is equivalent to
 			spectre_v2=auto.
 
+	spec_store_bypass_disable=
+			[HW] Control Speculative Store Bypass (SSB) Disable mitigation
+			(Speculative Store Bypass vulnerability)
+
+			Certain CPUs are vulnerable to an exploit against a
+			a common industry wide performance optimization known
+			as "Speculative Store Bypass" in which recent stores
+			to the same memory location may not be observed by
+			later loads during speculative execution. The idea
+			is that such stores are unlikely and that they can
+			be detected prior to instruction retirement at the
+			end of a particular speculation execution window.
+
+			In vulnerable processors, the speculatively forwarded
+			store can be used in a cache side channel attack, for
+			example to read memory to which the attacker does not
+			directly have access (e.g. inside sandboxed code).
+
+			This parameter controls whether the Speculative Store
+			Bypass optimization is used.
+
+			on     - Unconditionally disable Speculative Store Bypass
+			off    - Unconditionally enable Speculative Store Bypass
+			auto   - Kernel detects whether the CPU model contains an
+				 implementation of Speculative Store Bypass and
+				 picks the most appropriate mitigation
+
+			Not specifying this option is equivalent to
+			spec_store_bypass_disable=auto.
+
 	spia_io_base=	[HW,MTD]
 	spia_fio_base=
 	spia_pedr=
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -214,6 +214,7 @@
 
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -238,6 +238,12 @@ extern u64 x86_spec_ctrl_get_default(voi
 extern void x86_spec_ctrl_set_guest(u64);
 extern void x86_spec_ctrl_restore_host(u64);
 
+/* The Speculative Store Bypass disable variants */
+enum ssb_mitigation {
+	SPEC_STORE_BYPASS_NONE,
+	SPEC_STORE_BYPASS_DISABLE,
+};
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -27,6 +27,7 @@
 #include <asm/intel-family.h>
 
 static void __init spectre_v2_select_mitigation(void);
+static void __init ssb_select_mitigation(void);
 
 /*
  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
@@ -53,6 +54,12 @@ void __init check_bugs(void)
 	/* Select the proper spectre mitigation before patching alternatives */
 	spectre_v2_select_mitigation();
 
+	/*
+	 * Select proper mitigation for any exposure to the Speculative Store
+	 * Bypass vulnerability.
+	 */
+	ssb_select_mitigation();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -358,6 +365,99 @@ static void __init spectre_v2_select_mit
 }
 
 #undef pr_fmt
+#define pr_fmt(fmt)	"Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode = SPEC_STORE_BYPASS_NONE;
+
+/* The kernel command line selection */
+enum ssb_mitigation_cmd {
+	SPEC_STORE_BYPASS_CMD_NONE,
+	SPEC_STORE_BYPASS_CMD_AUTO,
+	SPEC_STORE_BYPASS_CMD_ON,
+};
+
+static const char *ssb_strings[] = {
+	[SPEC_STORE_BYPASS_NONE]	= "Vulnerable",
+	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled"
+};
+
+static const struct {
+	const char *option;
+	enum ssb_mitigation_cmd cmd;
+} ssb_mitigation_options[] = {
+	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
+	{ "on",		SPEC_STORE_BYPASS_CMD_ON },   /* Disable Speculative Store Bypass */
+	{ "off",	SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
+};
+
+static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+{
+	enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+	char arg[20];
+	int ret, i;
+
+	if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
+		return SPEC_STORE_BYPASS_CMD_NONE;
+	} else {
+		ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
+					  arg, sizeof(arg));
+		if (ret < 0)
+			return SPEC_STORE_BYPASS_CMD_AUTO;
+
+		for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
+			if (!match_option(arg, ret, ssb_mitigation_options[i].option))
+				continue;
+
+			cmd = ssb_mitigation_options[i].cmd;
+			break;
+		}
+
+		if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
+			pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+			return SPEC_STORE_BYPASS_CMD_AUTO;
+		}
+	}
+
+	return cmd;
+}
+
+static enum ssb_mitigation_cmd __init __ssb_select_mitigation(void)
+{
+	enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
+	enum ssb_mitigation_cmd cmd;
+
+	if (!boot_cpu_has(X86_FEATURE_RDS))
+		return mode;
+
+	cmd = ssb_parse_cmdline();
+	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
+	    (cmd == SPEC_STORE_BYPASS_CMD_NONE ||
+	     cmd == SPEC_STORE_BYPASS_CMD_AUTO))
+		return mode;
+
+	switch (cmd) {
+	case SPEC_STORE_BYPASS_CMD_AUTO:
+	case SPEC_STORE_BYPASS_CMD_ON:
+		mode = SPEC_STORE_BYPASS_DISABLE;
+		break;
+	case SPEC_STORE_BYPASS_CMD_NONE:
+		break;
+	}
+
+	if (mode != SPEC_STORE_BYPASS_NONE)
+		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+	return mode;
+}
+
+static void ssb_select_mitigation()
+{
+	ssb_mode = __ssb_select_mitigation();
+
+	if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+		pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+#undef pr_fmt
 
 #ifdef CONFIG_SYSFS
 
@@ -383,6 +483,9 @@ ssize_t cpu_show_common(struct device *d
 			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
 			       spectre_v2_module_string());
 
+	case X86_BUG_SPEC_STORE_BYPASS:
+		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+
 	default:
 		break;
 	}


* [patch V11 09/16] SSB 9
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (7 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 08/16] SSB 8 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 10/16] SSB 10 Thomas Gleixner
                   ` (8 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Intel CPUs expose methods to:

 - Detect whether the RDS capability is available via CPUID.7.0.EDX[31],

 - Enable RDS by setting bit 2 of the SPEC_CTRL MSR (0x48),

 - Report via MSR_IA32_ARCH_CAPABILITIES bit 4 that there is no need to
   enable RDS.

With that in mind, if spec_store_bypass_disable=[auto,on] is selected, set the
SPEC_CTRL MSR at boot time to enable RDS if the platform requires it.
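
The enumeration side is easy to poke at from user space (sketch only,
assuming GCC's <cpuid.h>; the two MSRs need ring 0 and are not touched here):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 7 not available");
                return 1;
        }
        printf("RDS (CPUID.7.0.EDX[31]): %s\n",
               (edx & (1u << 31)) ? "enumerated" : "not enumerated");
        return 0;
}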

Note that this does not fix the KVM case where the SPEC_CTRL MSR is exposed to
guests, which can muck with it; see the patch titled:
 KVM/SVM/VMX/x86/spectre_v2: Support the combination of guest and host IBRS.

And for the firmware case (IBRS to be set), see the patch titled:
 x86/spectre_v2: Read SPEC_CTRL MSR during boot and re-use reserved bits

[ tglx: Disentangled it from the Intel implementation and kept the call order ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>

---

v1.2: Expand on the commit description
  s/md_v4/mdd/
  s/spec_ctrl_msr_on/spec_ctrl_priv/
  s/spec_ctrl_msr_off/spec_ctrp_unpriv/

v1.3:
 - Add comment about privilege level changes.

v1.4: Simplify and incorporate various suggestions from Jon Masters
 - Export a single x86_spec_ctrl_base value with initial bits

v2: Rip out the c_fix_cpu.
 Depend on synthetic CPU flag
v3: Move the generic_identify to be done _after_ we figure out whether
  we can do the mitigation.
v4: s/MDD/RDS/
   s/Memory Disambiguation Disable/Reduced Data Speculation/
   Tweak the various 'disable', enabled now that it is called RDS.
   Set the x86_spec_ctrl with SPEC_CTRL_RDS if RDS is detected
   Fixup x86_set_spec_ctrl to deal with two Bitfields.
v5: s/X86_FEATURE_DISABLE_SSB/X86_FEATURE_SPEC_STORE_BYPASS_DISABLE/
   Also check MSR_IA32_ARCH_CAPABILITIES for Bit(4)
   Add documentation on what those three flags mean
   Add docs on why we set x86_spec_ctrl only on Intel
   Add extra check in ssb_parse_cmdline for RDS be available
   In init_intel drop the check for RDS as the X86_FEATURE_SPEC_STORE_BYPASS_DISABLE
    is implicitly set only iff RDS has been set in ssb_parse_cmdline.
---
 arch/x86/include/asm/msr-index.h |    6 ++++++
 arch/x86/kernel/cpu/bugs.c       |   30 ++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/common.c     |   10 ++++++----
 arch/x86/kernel/cpu/cpu.h        |    2 ++
 arch/x86/kernel/cpu/intel.c      |    1 +
 5 files changed, 43 insertions(+), 6 deletions(-)

--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -42,6 +42,7 @@
 #define MSR_IA32_SPEC_CTRL		0x00000048 /* Speculation Control */
 #define SPEC_CTRL_IBRS			(1 << 0)   /* Indirect Branch Restricted Speculation */
 #define SPEC_CTRL_STIBP			(1 << 1)   /* Single Thread Indirect Branch Predictors */
+#define SPEC_CTRL_RDS			(1 << 2)   /* Reduced Data Speculation */
 
 #define MSR_IA32_PRED_CMD		0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB			(1 << 0)   /* Indirect Branch Prediction Barrier */
@@ -68,6 +69,11 @@
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
+#define ARCH_CAP_RDS_NO			(1 << 4)   /*
+						    * Not susceptible to Speculative Store Bypass
+						    * attack, so no Reduced Data Speculation control
+						    * required.
+						    */
 
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -117,7 +117,7 @@ static enum spectre_v2_mitigation spectr
 
 void x86_spec_ctrl_set(u64 val)
 {
-	if (val & ~SPEC_CTRL_IBRS)
+	if (val & ~(SPEC_CTRL_IBRS | SPEC_CTRL_RDS))
 		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
 	else
 		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
@@ -444,8 +444,28 @@ static enum ssb_mitigation_cmd __init __
 		break;
 	}
 
-	if (mode != SPEC_STORE_BYPASS_NONE)
+	/*
+	 * We have three CPU feature flags that are in play here:
+	 *  - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+	 *  - X86_FEATURE_RDS - CPU is able to turn off speculative store bypass
+	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+	 */
+	if (mode != SPEC_STORE_BYPASS_NONE) {
 		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+		/*
+		 * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
+		 * a completely different MSR and bit dependent on family.
+		 */
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			x86_spec_ctrl_base |= SPEC_CTRL_RDS;
+			x86_spec_ctrl_set(SPEC_CTRL_RDS);
+			break;
+		case X86_VENDOR_AMD:
+			break;
+		}
+	}
+
 	return mode;
 }
 
@@ -459,6 +479,12 @@ static void ssb_select_mitigation()
 
 #undef pr_fmt
 
+void x86_spec_ctrl_setup_ap(void)
+{
+	if (boot_cpu_has(X86_FEATURE_IBRS))
+		x86_spec_ctrl_set(x86_spec_ctrl_base & (SPEC_CTRL_IBRS | SPEC_CTRL_RDS));
+}
+
 #ifdef CONFIG_SYSFS
 
 ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -951,7 +951,11 @@ static void __init cpu_set_bug_bits(stru
 {
 	u64 ia32_cap = 0;
 
-	if (!x86_match_cpu(cpu_no_spec_store_bypass))
+	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+	if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
+	   !(ia32_cap & ARCH_CAP_RDS_NO))
 		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
 
 	if (x86_match_cpu(cpu_no_speculation))
@@ -963,9 +967,6 @@ static void __init cpu_set_bug_bits(stru
 	if (x86_match_cpu(cpu_no_meltdown))
 		return;
 
-	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
-
 	/* Rogue Data Cache Load? No! */
 	if (ia32_cap & ARCH_CAP_RDCL_NO)
 		return;
@@ -1383,6 +1384,7 @@ void identify_secondary_cpu(struct cpuin
 #endif
 	mtrr_ap_init();
 	validate_apic_and_package_id(c);
+	x86_spec_ctrl_setup_ap();
 }
 
 static __init int setup_noclflush(char *arg)
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -50,4 +50,6 @@ extern void cpu_detect_cache_sizes(struc
 
 unsigned int aperfmperf_get_khz(int cpu);
 
+extern void x86_spec_ctrl_setup_ap(void);
+
 #endif /* ARCH_X86_CPU_H */
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -189,6 +189,7 @@ static void early_init_intel(struct cpui
 		setup_clear_cpu_cap(X86_FEATURE_STIBP);
 		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
 		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+		setup_clear_cpu_cap(X86_FEATURE_RDS);
 	}
 
 	/*

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [patch V11 10/16] SSB 10
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (8 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 09/16] SSB 9 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 11/16] SSB 11 Thomas Gleixner
                   ` (7 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Intel and AMD SPEC_CTRL (0x48) MSR semantics may differ in the
future (or in fact use different MSRs for the same functionality).

As such, a run-time mechanism is required to whitelist the appropriate MSR
values.
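
To make the whitelist idea concrete, a small worked example using the names
from the hunks below:

  /* x86_spec_ctrl_mask starts as ~SPEC_CTRL_IBRS; vendors clear the bits they own */
  x86_spec_ctrl_mask &= ~SPEC_CTRL_RDS;         /* done in the Intel branch below  */

  x86_spec_ctrl_set(SPEC_CTRL_RDS);             /* ok: RDS is now whitelisted      */
  x86_spec_ctrl_set(SPEC_CTRL_STIBP);           /* WARNs: STIBP is not whitelisted */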

[ tglx: Made the variable __ro_after_init ]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>

---
v5: New patch
---
 arch/x86/kernel/cpu/bugs.c |   11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -35,6 +35,12 @@ static void __init ssb_select_mitigation
  */
 static u64 __ro_after_init x86_spec_ctrl_base;
 
+/*
+ * The vendor and possibly platform specific bits which can be modified in
+ * x86_spec_ctrl_base.
+ */
+static u64 __ro_after_init x86_spec_ctrl_mask = ~SPEC_CTRL_IBRS;
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -117,7 +123,7 @@ static enum spectre_v2_mitigation spectr
 
 void x86_spec_ctrl_set(u64 val)
 {
-	if (val & ~(SPEC_CTRL_IBRS | SPEC_CTRL_RDS))
+	if (val & x86_spec_ctrl_mask)
 		WARN_ONCE(1, "SPEC_CTRL MSR value 0x%16llx is unknown.\n", val);
 	else
 		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base | val);
@@ -459,6 +465,7 @@ static enum ssb_mitigation_cmd __init __
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			x86_spec_ctrl_base |= SPEC_CTRL_RDS;
+			x86_spec_ctrl_mask &= ~SPEC_CTRL_RDS;
 			x86_spec_ctrl_set(SPEC_CTRL_RDS);
 			break;
 		case X86_VENDOR_AMD:
@@ -482,7 +489,7 @@ static void ssb_select_mitigation()
 void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_IBRS))
-		x86_spec_ctrl_set(x86_spec_ctrl_base & (SPEC_CTRL_IBRS | SPEC_CTRL_RDS));
+		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
 }
 
 #ifdef CONFIG_SYSFS

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [patch V11 11/16] SSB 11
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (9 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 10/16] SSB 10 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-04 20:58   ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-05-02 21:51 ` [patch V11 12/16] SSB 12 Thomas Gleixner
                   ` (6 subsequent siblings)
  17 siblings, 1 reply; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

AMD does not need the Speculative Store Bypass mitigation to be enabled.

The control for this is already available via MSR C001_1020. Each family
uses a different bit in that MSR for this.
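
In short, the enable path added below boils down to a single MSR write with a
family dependent bit; the bit and the cached base value are set up in
bsp_init_amd():

  /* Fam 15h: bit 54, Fam 16h: bit 33, Fam 17h: bit 10 */
  u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_rds_mask;

  wrmsrl(MSR_AMD64_LS_CFG, msrval);             /* MSR C001_1020 */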

[ tglx: Expose the bit mask via a variable and move the actual MSR fiddling
  	into the bugs code as that's the right thing to do and also required
	to prepare for dynamic enable/disable ]

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>

---
v2: New patch
v3: Use the right CPU features
    Move the whole logic in early_init_amd and init_amd
v5: Set X86_FEATURE_RDS on Fam16->17h
    Change title to have 'bugs' in it.
v7: Use a bit mask and a cached base
v8: Fix the bit mask ....
---
 arch/x86/include/asm/cpufeatures.h   |    1 +
 arch/x86/include/asm/nospec-branch.h |    4 ++++
 arch/x86/kernel/cpu/amd.c            |   26 ++++++++++++++++++++++++++
 arch/x86/kernel/cpu/bugs.c           |   27 ++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/common.c         |    4 ++++
 5 files changed, 61 insertions(+), 1 deletion(-)

--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -215,6 +215,7 @@
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
 #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
+#define X86_FEATURE_AMD_RDS		(7*32+24)  /* "" AMD RDS implementation */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -244,6 +244,10 @@ enum ssb_mitigation {
 	SPEC_STORE_BYPASS_DISABLE,
 };
 
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_rds_mask;
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -10,6 +10,7 @@
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
+#include <asm/nospec-branch.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
 #include <asm/delay.h>
@@ -554,6 +555,26 @@ static void bsp_init_amd(struct cpuinfo_
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
 		nodes_per_socket = ((value >> 3) & 7) + 1;
 	}
+
+	if (c->x86 >= 0x15 && c->x86 <= 0x17) {
+		unsigned int bit;
+
+		switch (c->x86) {
+		case 0x15: bit = 54; break;
+		case 0x16: bit = 33; break;
+		case 0x17: bit = 10; break;
+		default: return;
+		}
+		/*
+		 * Try to cache the base value so further operations can
+		 * avoid RMW. If that faults, do not enable RDS.
+		 */
+		if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+			setup_force_cpu_cap(X86_FEATURE_RDS);
+			setup_force_cpu_cap(X86_FEATURE_AMD_RDS);
+			x86_amd_ls_cfg_rds_mask = 1ULL << bit;
+		}
+	}
 }
 
 static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -898,6 +919,11 @@ static void init_amd(struct cpuinfo_x86
 	/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
 	if (!cpu_has(c, X86_FEATURE_XENPV))
 		set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+	if (boot_cpu_has(X86_FEATURE_AMD_RDS)) {
+		set_cpu_cap(c, X86_FEATURE_RDS);
+		set_cpu_cap(c, X86_FEATURE_AMD_RDS);
+	}
 }
 
 #ifdef CONFIG_X86_32
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -41,6 +41,13 @@ static u64 __ro_after_init x86_spec_ctrl
  */
 static u64 __ro_after_init x86_spec_ctrl_mask = ~SPEC_CTRL_IBRS;
 
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_rds_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_rds_mask;
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -52,7 +59,8 @@ void __init check_bugs(void)
 
 	/*
 	 * Read the SPEC_CTRL MSR to account for reserved bits which may
-	 * have unknown values.
+	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+	 * init code as it is not enumerated and depends on the family.
 	 */
 	if (boot_cpu_has(X86_FEATURE_IBRS))
 		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
@@ -154,6 +162,14 @@ void x86_spec_ctrl_restore_host(u64 gues
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
 
+static void x86_amd_rds_enable(void)
+{
+	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_rds_mask;
+
+	if (boot_cpu_has(X86_FEATURE_AMD_RDS))
+		wrmsrl(MSR_AMD64_LS_CFG, msrval);
+}
+
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
 
@@ -443,6 +459,11 @@ static enum ssb_mitigation_cmd __init __
 
 	switch (cmd) {
 	case SPEC_STORE_BYPASS_CMD_AUTO:
+		/*
+		 * AMD platforms by default don't need SSB mitigation.
+		 */
+		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+			break;
 	case SPEC_STORE_BYPASS_CMD_ON:
 		mode = SPEC_STORE_BYPASS_DISABLE;
 		break;
@@ -469,6 +490,7 @@ static enum ssb_mitigation_cmd __init __
 			x86_spec_ctrl_set(SPEC_CTRL_RDS);
 			break;
 		case X86_VENDOR_AMD:
+			x86_amd_rds_enable();
 			break;
 		}
 	}
@@ -490,6 +512,9 @@ void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_IBRS))
 		x86_spec_ctrl_set(x86_spec_ctrl_base & ~x86_spec_ctrl_mask);
+
+	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+		x86_amd_rds_enable();
 }
 
 #ifdef CONFIG_SYSFS
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -943,6 +943,10 @@ static const __initconst struct x86_cpu_
 	{ X86_VENDOR_CENTAUR,	5,					},
 	{ X86_VENDOR_INTEL,	5,					},
 	{ X86_VENDOR_NSC,	5,					},
+	{ X86_VENDOR_AMD,	0x12,					},
+	{ X86_VENDOR_AMD,	0x11,					},
+	{ X86_VENDOR_AMD,	0x10,					},
+	{ X86_VENDOR_AMD,	0xf,					},
 	{ X86_VENDOR_ANY,	4,					},
 	{}
 };

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [patch V11 12/16] SSB 12
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (10 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 11/16] SSB 11 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 13/16] SSB 13 Thomas Gleixner
                   ` (5 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Expose the CPUID.7.EDX[31] bit to the guest, and also guard against various
combinations of SPEC_CTRL MSR values.

The handling of the MSR (to take into account the host value of SPEC_CTRL
Bit(2)) is taken care of in patch:

  KVM/SVM/VMX/x86/spectre_v2: Support the combination of guest and host IBRS

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>

---
v2: New patch.
v4: s/MDD/RDS/
---
 arch/x86/kvm/cpuid.c |    2 +-
 arch/x86/kvm/vmx.c   |    8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -407,7 +407,7 @@ static inline int __do_cpuid_ent(struct
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
-		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(RDS) |
 		F(ARCH_CAPABILITIES);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3524,7 +3524,8 @@ static int vmx_get_msr(struct kvm_vcpu *
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_RDS))
 			return 1;
 
 		msr_info->data = to_vmx(vcpu)->spec_ctrl;
@@ -3643,11 +3644,12 @@ static int vmx_set_msr(struct kvm_vcpu *
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_RDS))
 			return 1;
 
 		/* The STIBP bit doesn't fault even if it's not advertised */
-		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_RDS))
 			return 1;
 
 		vmx->spec_ctrl = data;

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [patch V11 13/16] SSB 13
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (11 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 12/16] SSB 12 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 21:51 ` [patch V11 14/16] SSB 14 Thomas Gleixner
                   ` (4 subsequent siblings)
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Having everything in nospec-branch.h creates a hell of dependencies when
adding the prctl based switching mechanism. Move everything which is not
required in nospec-branch.h to spec-ctrl.h and fix up the includes in the
relevant files.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h |   14 --------------
 arch/x86/include/asm/spec-ctrl.h     |   21 +++++++++++++++++++++
 arch/x86/kernel/cpu/amd.c            |    2 +-
 arch/x86/kernel/cpu/bugs.c           |    2 +-
 arch/x86/kvm/svm.c                   |    2 +-
 arch/x86/kvm/vmx.c                   |    2 +-
 6 files changed, 25 insertions(+), 18 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -228,26 +228,12 @@ enum spectre_v2_mitigation {
 extern void x86_spec_ctrl_set(u64);
 extern u64 x86_spec_ctrl_get_default(void);
 
-/*
- * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
- * the guest has, while on VMEXIT we restore the host view. This
- * would be easier if SPEC_CTRL were architecturally maskable or
- * shadowable for guests but this is not (currently) the case.
- * Takes the guest view of SPEC_CTRL MSR as a parameter.
- */
-extern void x86_spec_ctrl_set_guest(u64);
-extern void x86_spec_ctrl_restore_host(u64);
-
 /* The Speculative Store Bypass disable variants */
 enum ssb_mitigation {
 	SPEC_STORE_BYPASS_NONE,
 	SPEC_STORE_BYPASS_DISABLE,
 };
 
-/* AMD specific Speculative Store Bypass MSR data */
-extern u64 x86_amd_ls_cfg_base;
-extern u64 x86_amd_ls_cfg_rds_mask;
-
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
--- /dev/null
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECCTRL_H_
+#define _ASM_X86_SPECCTRL_H_
+
+#include <asm/nospec-branch.h>
+
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter.
+ */
+extern void x86_spec_ctrl_set_guest(u64);
+extern void x86_spec_ctrl_restore_host(u64);
+
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_rds_mask;
+
+#endif
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -10,7 +10,7 @@
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
 #include <asm/delay.h>
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -13,7 +13,7 @@
 #include <linux/cpu.h>
 #include <linux/module.h>
 
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
 #include <asm/bugs.h>
 #include <asm/processor.h>
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -49,7 +49,7 @@
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,7 +51,7 @@
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/mshyperv.h>
 
 #include "trace.h"

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [patch V11 14/16] SSB 14
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (12 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 13/16] SSB 13 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-03  7:19   ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-05-03  7:22   ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-05-02 21:51 ` [patch V11 15/16] SSB 15 Thomas Gleixner
                   ` (3 subsequent siblings)
  17 siblings, 2 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Add two new prctls to control aspects of speculation-related vulnerabilities
and their mitigations, providing finer-grained control over these
performance-impacting mitigations.

PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
which is selected with arg2 of prctl(2). The return value uses bits 0-2 with
the following meaning:

Bit  Define           Description
0    PR_SPEC_PRCTL    Mitigation can be controlled per task by
                      PR_SET_SPECULATION_CTRL
1    PR_SPEC_ENABLE   The speculation feature is enabled, mitigation is
                      disabled
2    PR_SPEC_DISABLE  The speculation feature is disabled, mitigation is
                      enabled

If all bits are 0 the CPU is not affected by the speculation misfeature.

If PR_SPEC_PRCTL is set, then the per task control of the mitigation is
available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation
misfeature will fail.

PR_SET_SPECULATION_CTRL allows controlling the speculation misfeature, which
is selected per task by arg2 of prctl(2). arg3 is used to hand in the
control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE.

The common return values are:

EINVAL  prctl is not implemented by the architecture or the unused prctl()
        arguments are not 0
ENODEV  arg2 selects an unsupported speculation misfeature

PR_SET_SPECULATION_CTRL has these additional return values:

ERANGE  arg3 is incorrect, i.e. it's neither PR_SPEC_ENABLE nor PR_SPEC_DISABLE
ENXIO   prctl control of the selected speculation misfeature is disabled

The first supported controllable speculation misfeature is
PR_SPEC_STORE_BYPASS. Add the define so this can be shared between
architectures.
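
For illustration only (not part of this patch), a minimal userspace sketch of
the interface described above. The PR_* values come from <linux/prctl.h> once
this series is applied; the fallback defines are only for building against
older headers:

  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_SET_SPECULATION_CTRL
  # define PR_GET_SPECULATION_CTRL  52
  # define PR_SET_SPECULATION_CTRL  53
  # define PR_SPEC_STORE_BYPASS     0
  # define PR_SPEC_PRCTL            (1UL << 0)
  # define PR_SPEC_ENABLE           (1UL << 1)
  # define PR_SPEC_DISABLE          (1UL << 2)
  #endif

  int main(void)
  {
          int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);

          if (state < 0) {
                  perror("PR_GET_SPECULATION_CTRL");      /* EINVAL or ENODEV */
                  return 1;
          }
          if (!(state & PR_SPEC_PRCTL)) {
                  printf("No per-task control available (state 0x%x)\n", state);
                  return 1;
          }
          /* Engage the mitigation for this task only */
          if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
                    PR_SPEC_DISABLE, 0, 0)) {
                  perror("PR_SET_SPECULATION_CTRL");      /* ERANGE or ENXIO */
                  return 1;
          }
          printf("Speculative Store Bypass disabled for this task\n");
          return 0;
  }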

TODO: Write a man prctl(2) patch.

Based on an initial patch from Tim Chen and mostly rewritten.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
V2: Reword documentation and add unused argument check
V3: Use EINVAL for unused argument check
---
 Documentation/userspace-api/index.rst     |    1 
 Documentation/userspace-api/spec_ctrl.rst |   86 ++++++++++++++++++++++++++++++
 include/linux/nospec.h                    |    5 +
 include/uapi/linux/prctl.h                |   11 +++
 kernel/sys.c                              |   22 +++++++
 5 files changed, 125 insertions(+)

--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -19,6 +19,7 @@ place where this information is gathered
    no_new_privs
    seccomp_filter
    unshare
+   spec_ctrl
 
 .. only::  subproject and html
 
--- /dev/null
+++ b/Documentation/userspace-api/spec_ctrl.rst
@@ -0,0 +1,86 @@
+===================
+Speculation Control
+===================
+
+Quite some CPUs have speculation related misfeatures which are in fact
+vulnerabilites causing data leaks in various forms even accross privilege
+domains.
+
+The kernel provides mitigation for such vulnerabilities in various
+forms. Some of these mitigations are compile time configurable and some on
+the kernel command line.
+
+There is also a class of mitigations which are very expensive, but they can
+be restricted to a certain set of processes or tasks in controlled
+environments. The mechanism to control these mitigations is via
+:manpage:`prctl(2)`.
+
+There are two prctl options which are related to this:
+
+ * PR_GET_SPECULATION_CTRL
+
+ * PR_SET_SPECULATION_CTRL
+
+PR_GET_SPECULATION_CTRL
+-----------------------
+
+PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
+which is selected with arg2 of prctl(2). The return value uses bits 0-2 with
+the following meaning:
+
+==== ================ ===================================================
+Bit  Define           Description
+==== ================ ===================================================
+0    PR_SPEC_PRCTL    Mitigation can be controlled per task by
+                      PR_SET_SPECULATION_CTRL
+1    PR_SPEC_ENABLE   The speculation feature is enabled, mitigation is
+                      disabled
+2    PR_SPEC_DISABLE  The speculation feature is disabled, mitigation is
+                      enabled
+==== ================ ===================================================
+
+If all bits are 0 the CPU is not affected by the speculation misfeature.
+
+If PR_SPEC_PRCTL is set, then the per task control of the mitigation is
+available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation
+misfeature will fail.
+
+PR_SET_SPECULATION_CTRL
+-----------------------
+PR_SET_SPECULATION_CTRL allows to control the speculation misfeature, which
+is selected by arg2 of :manpage:`prctl(2)` per task. arg3 is used to hand
+in the control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE.
+
+Common error codes
+------------------
+======= =================================================================
+Value   Meaning
+======= =================================================================
+EINVAL  The prctl is not implemented by the architecture or unused
+        prctl(2) arguments are not 0
+
+ENODEV  arg2 is selecting a not supported speculation misfeature
+======= =================================================================
+
+PR_SET_SPECULATION_CTRL error codes
+-----------------------------------
+======= =================================================================
+Value   Meaning
+======= =================================================================
+0       Success
+
+ERANGE  arg3 is incorrect, i.e. it's neither PR_SPEC_ENABLE nor
+        PR_SPEC_DISABLE
+
+ENXIO   Control of the selected speculation misfeature is not possible.
+        See PR_GET_SPECULATION_CTRL.
+======= =================================================================
+
+Speculation misfeature controls
+-------------------------------
+- PR_SPEC_STORE_BYPASS: Speculative Store Bypass
+
+  Invocations:
+   * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);
+   * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
+   * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -55,4 +55,9 @@ static inline unsigned long array_index_
 									\
 	(typeof(_i)) (_i & _mask);					\
 })
+
+/* Speculation control prctl */
+int arch_prctl_spec_ctrl_get(unsigned long which);
+int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl);
+
 #endif /* _LINUX_NOSPEC_H */
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -207,4 +207,15 @@ struct prctl_mm_map {
 # define PR_SVE_VL_LEN_MASK		0xffff
 # define PR_SVE_VL_INHERIT		(1 << 17) /* inherit across exec */
 
+/* Per task speculation control */
+#define PR_GET_SPECULATION_CTRL		52
+#define PR_SET_SPECULATION_CTRL		53
+/* Speculation control variants */
+# define PR_SPEC_STORE_BYPASS		0
+/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
+# define PR_SPEC_NOT_AFFECTED		0
+# define PR_SPEC_PRCTL			(1UL << 0)
+# define PR_SPEC_ENABLE			(1UL << 1)
+# define PR_SPEC_DISABLE		(1UL << 2)
+
 #endif /* _LINUX_PRCTL_H */
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -61,6 +61,8 @@
 #include <linux/uidgid.h>
 #include <linux/cred.h>
 
+#include <linux/nospec.h>
+
 #include <linux/kmsg_dump.h>
 /* Move somewhere else to avoid recompiling? */
 #include <generated/utsrelease.h>
@@ -2242,6 +2244,16 @@ static int propagate_has_child_subreaper
 	return 1;
 }
 
+int __weak arch_prctl_spec_ctrl_get(unsigned long which)
+{
+	return -EINVAL;
+}
+
+int __weak arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
+{
+	return -EINVAL;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2450,6 +2462,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
 	case PR_SVE_GET_VL:
 		error = SVE_GET_VL();
 		break;
+	case PR_GET_SPECULATION_CTRL:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_prctl_spec_ctrl_get(arg2);
+		break;
+	case PR_SET_SPECULATION_CTRL:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = arch_prctl_spec_ctrl_set(arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [patch V11 15/16] SSB 15
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (13 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 14/16] SSB 14 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-03  7:21   ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-05-02 21:51 ` [patch V11 16/16] SSB 16 Thomas Gleixner
                   ` (2 subsequent siblings)
  17 siblings, 1 reply; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

The Speculative Store Bypass vulnerability can be mitigated with the Reduced
Data Speculation (RDS) feature. To allow finer-grained control of this
potentially expensive mitigation, a per-task mitigation control is required.

Add a new TIF_RDS flag and put it into the group of TIF flags which are
evaluated for mismatch in switch_to(). If these bits differ in the previous
and the next task, then the slow path function __switch_to_xtra() is
invoked. Implement the TIF_RDS dependent mitigation control in the slow
path.

If the prctl for controlling Speculative Store Bypass is disabled or no
task uses the prctl then there is no overhead in the switch_to() fast
path.
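
As a worked illustration of the TIF to MSR conversion added below (the helpers
and constants are the ones introduced by this patch; the numeric comments are
just examples):

  u64 tifn = _TIF_RDS;                  /* bit 5 of the thread flags */

  /* Intel: shift the flag down to the SPEC_CTRL bit position */
  u64 spec_ctrl = x86_spec_ctrl_base | rds_tif_to_spec_ctrl(tifn);
  /* rds_tif_to_spec_ctrl(1 << 5) == (1 << 5) >> (5 - 2) == SPEC_CTRL_RDS (bit 2) */

  /* AMD: select the family dependent LS_CFG bit instead */
  u64 ls_cfg = x86_amd_ls_cfg_base | rds_tif_to_amd_ls_cfg(tifn);
  /* == x86_amd_ls_cfg_rds_mask when _TIF_RDS is set, e.g. 1ULL << 10 on Fam 17h */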

Update the KVM-related speculation control functions to take TIF_RDS into
account as well.

Based on a patch from Tim Chen. Completely rewritten.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/msr-index.h   |    3 ++-
 arch/x86/include/asm/spec-ctrl.h   |   17 +++++++++++++++++
 arch/x86/include/asm/thread_info.h |    4 +++-
 arch/x86/kernel/cpu/bugs.c         |   20 ++++++++++++++++----
 arch/x86/kernel/process.c          |   22 ++++++++++++++++++++++
 5 files changed, 60 insertions(+), 6 deletions(-)

--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -42,7 +42,8 @@
 #define MSR_IA32_SPEC_CTRL		0x00000048 /* Speculation Control */
 #define SPEC_CTRL_IBRS			(1 << 0)   /* Indirect Branch Restricted Speculation */
 #define SPEC_CTRL_STIBP			(1 << 1)   /* Single Thread Indirect Branch Predictors */
-#define SPEC_CTRL_RDS			(1 << 2)   /* Reduced Data Speculation */
+#define SPEC_CTRL_RDS_SHIFT		2	   /* Reduced Data Speculation bit */
+#define SPEC_CTRL_RDS			(1 << SPEC_CTRL_RDS_SHIFT)   /* Reduced Data Speculation */
 
 #define MSR_IA32_PRED_CMD		0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB			(1 << 0)   /* Indirect Branch Prediction Barrier */
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_SPECCTRL_H_
 #define _ASM_X86_SPECCTRL_H_
 
+#include <linux/thread_info.h>
 #include <asm/nospec-branch.h>
 
 /*
@@ -18,4 +19,20 @@ extern void x86_spec_ctrl_restore_host(u
 extern u64 x86_amd_ls_cfg_base;
 extern u64 x86_amd_ls_cfg_rds_mask;
 
+/* The Intel SPEC CTRL MSR base value cache */
+extern u64 x86_spec_ctrl_base;
+
+static inline u64 rds_tif_to_spec_ctrl(u64 tifn)
+{
+	BUILD_BUG_ON(TIF_RDS < SPEC_CTRL_RDS_SHIFT);
+	return (tifn & _TIF_RDS) >> (TIF_RDS - SPEC_CTRL_RDS_SHIFT);
+}
+
+static inline u64 rds_tif_to_amd_ls_cfg(u64 tifn)
+{
+	return (tifn & _TIF_RDS) ? x86_amd_ls_cfg_rds_mask : 0ULL;
+}
+
+extern void speculative_store_bypass_update(void);
+
 #endif
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
+#define TIF_RDS			5	/* Reduced data speculation */
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
@@ -105,6 +106,7 @@ struct thread_info {
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
+#define _TIF_RDS		(1 << TIF_RDS)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
@@ -144,7 +146,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
+	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_RDS)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -33,7 +33,7 @@ static void __init ssb_select_mitigation
  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
  * writes to SPEC_CTRL contain whatever reserved bits have been set.
  */
-static u64 __ro_after_init x86_spec_ctrl_base;
+u64 __ro_after_init x86_spec_ctrl_base;
 
 /*
  * The vendor and possibly platform specific bits which can be modified in
@@ -146,19 +146,31 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_get_defa
 
 void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl)
 {
+	u64 host = x86_spec_ctrl_base;
+
 	if (!boot_cpu_has(X86_FEATURE_IBRS))
 		return;
-	if (x86_spec_ctrl_base != guest_spec_ctrl)
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		host |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+
+	if (host != guest_spec_ctrl)
 		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_set_guest);
 
 void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl)
 {
+	u64 host = x86_spec_ctrl_base;
+
 	if (!boot_cpu_has(X86_FEATURE_IBRS))
 		return;
-	if (x86_spec_ctrl_base != guest_spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		host |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
+
+	if (host != guest_spec_ctrl)
+		wrmsrl(MSR_IA32_SPEC_CTRL, host);
 }
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_restore_host);
 
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -38,6 +38,7 @@
 #include <asm/switch_to.h>
 #include <asm/desc.h>
 #include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -278,6 +279,24 @@ static inline void switch_to_bitmap(stru
 	}
 }
 
+static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
+{
+	u64 msr;
+
+	if (static_cpu_has(X86_FEATURE_AMD_RDS)) {
+		msr = x86_amd_ls_cfg_base | rds_tif_to_amd_ls_cfg(tifn);
+		wrmsrl(MSR_AMD64_LS_CFG, msr);
+	} else {
+		msr = x86_spec_ctrl_base | rds_tif_to_spec_ctrl(tifn);
+		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+	}
+}
+
+void speculative_store_bypass_update(void)
+{
+	__speculative_store_bypass_update(current_thread_info()->flags);
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
@@ -309,6 +328,9 @@ void __switch_to_xtra(struct task_struct
 
 	if ((tifp ^ tifn) & _TIF_NOCPUID)
 		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+	if ((tifp ^ tifn) & _TIF_RDS)
+		__speculative_store_bypass_update(tifn);
 }
 
 /*

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [patch V11 16/16] SSB 16
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (14 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 15/16] SSB 15 Thomas Gleixner
@ 2018-05-02 21:51 ` Thomas Gleixner
  2018-05-02 23:21 ` [patch V11 00/16] SSB 0 Thomas Gleixner
  2018-05-03  4:27 ` [MODERATED] Encrypted Message Tim Chen
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 21:51 UTC (permalink / raw)
  To: speck

Add prctl-based control for the Speculative Store Bypass mitigation and make
it the default mitigation for Intel and AMD.

Andi Kleen provided the following rationale (slightly redacted):

 There are multiple levels of impact of Speculative Store Bypass:

 1) JITed sandbox.
    It cannot invoke system calls, but can do PRIME+PROBE and may have call
    interfaces to other code

 2) Native code process.
    No protection inside the process at this level.

 3) Kernel.

 4) Between processes. 

 The prctl tries to protect against attacks mounted from case (1).

 If the untrusted code can do random system calls then control is already
 lost in a much worse way. So there needs to be system call protection in
 some way (using a JIT not allowing them or seccomp). Or rather if the
 process can subvert its environment somehow to do the prctl it can already
 execute arbitrary code, which is much worse than SSB.

 To put it differently, the point of the prctl is to not allow JITed code
 to read data it shouldn't read from its JITed sandbox. If it already has
 escaped its sandbox then it can already read everything it wants in its
 address space, and do much worse.

 The ability to control Speculative Store Bypass allows enabling the
 protection selectively without affecting overall system performance.
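
For illustration (not part of the patch), the intended usage for case 1 above:
a sandbox or JIT host opts in for itself before running untrusted code, and
the setting is inherited by children forked afterwards:

  #include <sys/prctl.h>
  #include <linux/prctl.h>
  #include <err.h>

  static void harden_before_untrusted_code(void)
  {
          /*
           * With spec_store_bypass_disable=prctl (what auto now selects)
           * this engages the mitigation for the calling task. With =on or
           * =off the mode is fixed at boot and the call fails with ENXIO.
           */
          if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
                    PR_SPEC_DISABLE, 0, 0))
                  warn("could not disable Speculative Store Bypass");
  }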

Based on an initial patch from Tim Chen. Completely rewritten.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
V9: Fixed the sbb typo and made AMD default to prctl as well
---
 Documentation/admin-guide/kernel-parameters.txt |    6 +
 arch/x86/include/asm/nospec-branch.h            |    1 
 arch/x86/kernel/cpu/bugs.c                      |   83 +++++++++++++++++++++---
 3 files changed, 79 insertions(+), 11 deletions(-)

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4053,7 +4053,11 @@
 			off    - Unconditionally enable Speculative Store Bypass
 			auto   - Kernel detects whether the CPU model contains an
 				 implementation of Speculative Store Bypass and
-				 picks the most appropriate mitigation
+				 picks the most appropriate mitigation.
+			prctl  - Control Speculative Store Bypass per thread
+				 via prctl. Speculative Store Bypass is enabled
+				 for a process by default. The state of the control
+				 is inherited on fork.
 
 			Not specifying this option is equivalent to
 			spec_store_bypass_disable=auto.
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -232,6 +232,7 @@ extern u64 x86_spec_ctrl_get_default(voi
 enum ssb_mitigation {
 	SPEC_STORE_BYPASS_NONE,
 	SPEC_STORE_BYPASS_DISABLE,
+	SPEC_STORE_BYPASS_PRCTL,
 };
 
 extern char __indirect_thunk_start[];
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -12,6 +12,8 @@
 #include <linux/utsname.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
 
 #include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
@@ -408,20 +410,23 @@ enum ssb_mitigation_cmd {
 	SPEC_STORE_BYPASS_CMD_NONE,
 	SPEC_STORE_BYPASS_CMD_AUTO,
 	SPEC_STORE_BYPASS_CMD_ON,
+	SPEC_STORE_BYPASS_CMD_PRCTL,
 };
 
 static const char *ssb_strings[] = {
 	[SPEC_STORE_BYPASS_NONE]	= "Vulnerable",
-	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled"
+	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled",
+	[SPEC_STORE_BYPASS_PRCTL]	= "Mitigation: Speculative Store Bypass disabled via prctl"
 };
 
 static const struct {
 	const char *option;
 	enum ssb_mitigation_cmd cmd;
 } ssb_mitigation_options[] = {
-	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
-	{ "on",		SPEC_STORE_BYPASS_CMD_ON },   /* Disable Speculative Store Bypass */
-	{ "off",	SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
+	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO },  /* Platform decides */
+	{ "on",		SPEC_STORE_BYPASS_CMD_ON },    /* Disable Speculative Store Bypass */
+	{ "off",	SPEC_STORE_BYPASS_CMD_NONE },  /* Don't touch Speculative Store Bypass */
+	{ "prctl",	SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */
 };
 
 static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
@@ -471,14 +476,15 @@ static enum ssb_mitigation_cmd __init __
 
 	switch (cmd) {
 	case SPEC_STORE_BYPASS_CMD_AUTO:
-		/*
-		 * AMD platforms by default don't need SSB mitigation.
-		 */
-		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-			break;
+		/* Choose prctl as the default mode */
+		mode = SPEC_STORE_BYPASS_PRCTL;
+		break;
 	case SPEC_STORE_BYPASS_CMD_ON:
 		mode = SPEC_STORE_BYPASS_DISABLE;
 		break;
+	case SPEC_STORE_BYPASS_CMD_PRCTL:
+		mode = SPEC_STORE_BYPASS_PRCTL;
+		break;
 	case SPEC_STORE_BYPASS_CMD_NONE:
 		break;
 	}
@@ -489,7 +495,7 @@ static enum ssb_mitigation_cmd __init __
 	 *  - X86_FEATURE_RDS - CPU is able to turn off speculative store bypass
 	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
 	 */
-	if (mode != SPEC_STORE_BYPASS_NONE) {
+	if (mode == SPEC_STORE_BYPASS_DISABLE) {
 		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
 		/*
 		 * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
@@ -520,6 +526,63 @@ static void ssb_select_mitigation()
 
 #undef pr_fmt
 
+static int ssb_prctl_set(unsigned long ctrl)
+{
+	bool rds = !!test_tsk_thread_flag(current, TIF_RDS);
+
+	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL)
+		return -ENXIO;
+
+	if (ctrl == PR_SPEC_ENABLE)
+		clear_tsk_thread_flag(current, TIF_RDS);
+	else
+		set_tsk_thread_flag(current, TIF_RDS);
+
+	if (rds != !!test_tsk_thread_flag(current, TIF_RDS))
+		speculative_store_bypass_update();
+
+	return 0;
+}
+
+static int ssb_prctl_get(void)
+{
+	switch (ssb_mode) {
+	case SPEC_STORE_BYPASS_DISABLE:
+		return PR_SPEC_DISABLE;
+	case SPEC_STORE_BYPASS_PRCTL:
+		if (test_tsk_thread_flag(current, TIF_RDS))
+			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+	default:
+		if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+			return PR_SPEC_ENABLE;
+		return PR_SPEC_NOT_AFFECTED;
+	}
+}
+
+int arch_prctl_spec_ctrl_set(unsigned long which, unsigned long ctrl)
+{
+	if (ctrl != PR_SPEC_ENABLE && ctrl != PR_SPEC_DISABLE)
+		return -ERANGE;
+
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssb_prctl_set(ctrl);
+	default:
+		return -ENODEV;
+	}
+}
+
+int arch_prctl_spec_ctrl_get(unsigned long which)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssb_prctl_get();
+	default:
+		return -ENODEV;
+	}
+}
+
 void x86_spec_ctrl_setup_ap(void)
 {
 	if (boot_cpu_has(X86_FEATURE_IBRS))

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [patch V11 00/16] SSB 0
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (15 preceding siblings ...)
  2018-05-02 21:51 ` [patch V11 16/16] SSB 16 Thomas Gleixner
@ 2018-05-02 23:21 ` Thomas Gleixner
  2018-05-03  4:27 ` [MODERATED] Encrypted Message Tim Chen
  17 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-02 23:21 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 482 bytes --]

On Wed, 2 May 2018, speck for Thomas Gleixner wrote:

> Changes since V10:
> 
>   - Addressed Ingos review feedback
> 
>   - Picked up Reviewed-bys
> 
> Delta patch below. Bundle is coming in separate mail. Git repo branches are
> updated as well. The master branch contains also the fix for the lost IBRS
> issue Tim was seeing.
> 
> If there are no further issues and nitpicks, I'm going to make the
> changes immutable and changes need to go incremental on top.

Bundle attached.

[-- Attachment #2: Type: application/octet-stream, Size: 29655 bytes --]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Encrypted Message
  2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
                   ` (16 preceding siblings ...)
  2018-05-02 23:21 ` [patch V11 00/16] SSB 0 Thomas Gleixner
@ 2018-05-03  4:27 ` Tim Chen
  2018-05-03  6:10   ` [MODERATED] Re: [patch V11 00/16] SSB 0 Ingo Molnar
  2018-05-03  6:30   ` Thomas Gleixner
  17 siblings, 2 replies; 35+ messages in thread
From: Tim Chen @ 2018-05-03  4:27 UTC (permalink / raw)
  To: speck

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/rfc822-headers; protected-headers="v1", Size: 133 bytes --]

From: Tim Chen <tim.c.chen@linux.intel.com>
To: speck for Thomas Gleixner <speck@linutronix.de>
Subject: Re: [patch V11 00/16] SSB 0

[-- Attachment #2: Type: text/plain, Size: 1580 bytes --]



On 05/02/2018 02:51 PM, speck for Thomas Gleixner wrote:
> Changes since V10:
> 
>   - Addressed Ingos review feedback
> 
>   - Picked up Reviewed-bys
> 
> Delta patch below. Bundle is coming in separate mail. Git repo branches are
> updated as well. The master branch contains also the fix for the lost IBRS
> issue Tim was seeing.
> 
> If there are no further issues and nitpicks, I'm going to make the
> changes immutable and changes need to go incremental on top.
> 
> Thanks,
> 
> 	tglx
> 
> 

I notice that this code ignores the current process's TIF_RDS setting
in the prctl case:

#define firmware_restrict_branch_speculation_end()                      \
do {                                                                    \
        u64 val = x86_get_default_spec_ctrl();                          \
                                                                        \
        alternative_msr_write(MSR_IA32_SPEC_CTRL, val,                  \
                              X86_FEATURE_USE_IBRS_FW);                 \
        preempt_enable();                                               \
} while (0)

x86_get_default_spec_ctrl() will return x86_spec_ctrl_base, which will
result in x86_spec_ctrl_base being written to the MSR in the prctl case on
Intel CPUs.  That incorrectly ignores the current process's TIF_RDS setting,
so the RDS bit will not be set.

Instead, the following value should have been written to the MSR for Intel
CPUs:
x86_spec_ctrl_base | rds_tif_to_spec_ctrl(current_thread_info()->flags)
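
[ For illustration, one possible shape of such a fix, folding the per-task
  bit into the value the firmware exit path restores. x86_spec_ctrl_current()
  is a made-up helper name, not something from this series: ]

  static inline u64 x86_spec_ctrl_current(void)
  {
          u64 val = x86_spec_ctrl_base;

          if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                  val |= rds_tif_to_spec_ctrl(current_thread_info()->flags);
          return val;
  }

  #define firmware_restrict_branch_speculation_end()                      \
  do {                                                                    \
          u64 val = x86_spec_ctrl_current();                              \
                                                                          \
          alternative_msr_write(MSR_IA32_SPEC_CTRL, val,                  \
                                X86_FEATURE_USE_IBRS_FW);                 \
          preempt_enable();                                               \
  } while (0)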

Thanks.

Tim


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 00/16] SSB 0
  2018-05-03  4:27 ` [MODERATED] Encrypted Message Tim Chen
@ 2018-05-03  6:10   ` Ingo Molnar
  2018-05-03  6:30   ` Thomas Gleixner
  1 sibling, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2018-05-03  6:10 UTC (permalink / raw)
  To: speck


* speck for Tim Chen <speck@linutronix.de> wrote:


> On 05/02/2018 02:51 PM, speck for Thomas Gleixner wrote:
> > Changes since V10:
> > 
> >   - Addressed Ingos review feedback
> > 
> >   - Picked up Reviewed-bys
> > 
> > Delta patch below. Bundle is coming in separate mail. Git repo branches are
> > updated as well. The master branch contains also the fix for the lost IBRS
> > issue Tim was seeing.
> > 
> > If there are no further issues and nitpicks, I'm going to make the
> > changes immutable and changes need to go incremental on top.
> > 
> > Thanks,
> > 
> > 	tglx
> > 
> > 
> 
> I notice that this code ignores the current process's TIF_RDS setting
> in the prctl case:
> 
> #define firmware_restrict_branch_speculation_end()                      \
> do {                                                                    \
>         u64 val = x86_get_default_spec_ctrl();                          \
>                                                                         \
>         alternative_msr_write(MSR_IA32_SPEC_CTRL, val,                  \
>                               X86_FEATURE_USE_IBRS_FW);                 \
>         preempt_enable();                                               \
> } while (0)
> 
> x86_get_default_spec_ctrl will return x86_spec_ctrl_base, which
> will result in x86_spec_ctrl_base written to the MSR
> in the prctl case for Intel CPU.  That incorrectly ignores current
> process's TIF_RDS setting and the RDS bit will not be set.
> 
> Instead, the following value should have been written to the MSR
> for Intel CPU:
> x86_spec_ctrl_base | rds_tif_to_spec_ctrl(current_thread_info()->flags)

I wanted to suggest testing on affected and non-affected CPUs, both AMD and
Intel, because the runtime TIF and MSR indexing logic looks sufficiently
complex to me that it could be wrong in some of the scenarios.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [patch V11 00/16] SSB 0
  2018-05-03  4:27 ` [MODERATED] Encrypted Message Tim Chen
  2018-05-03  6:10   ` [MODERATED] Re: [patch V11 00/16] SSB 0 Ingo Molnar
@ 2018-05-03  6:30   ` Thomas Gleixner
  1 sibling, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-03  6:30 UTC (permalink / raw)
  To: speck

On Wed, 2 May 2018, speck for Tim Chen wrote:
> I notice that this code ignores the current process's TIF_RDS setting
> in the prctl case:
> 
> #define firmware_restrict_branch_speculation_end()                      \
> do {                                                                    \
>         u64 val = x86_get_default_spec_ctrl();                          \
>                                                                         \
>         alternative_msr_write(MSR_IA32_SPEC_CTRL, val,                  \
>                               X86_FEATURE_USE_IBRS_FW);                 \
>         preempt_enable();                                               \
> } while (0)
> 
> x86_get_default_spec_ctrl will return x86_spec_ctrl_base, which
> will result in x86_spec_ctrl_base written to the MSR
> in the prctl case for Intel CPU.  That incorrectly ignores current
> process's TIF_RDS setting and the RDS bit will not be set.
> 
> Instead, the following value should have been written to the MSR
> for Intel CPU:
> x86_spec_ctrl_base | rds_tif_to_spec_ctrl(current_thread_info()->flags)

Good catch. I'll have a look at how to solve that properly. Not sure off the
top of my head whether x86_spec_ctrl_get_default() should take that into
account or not.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 14/16] SSB 14
  2018-05-02 21:51 ` [patch V11 14/16] SSB 14 Thomas Gleixner
@ 2018-05-03  7:19   ` Konrad Rzeszutek Wilk
  2018-05-03  7:31     ` Thomas Gleixner
  2018-05-03  7:22   ` [MODERATED] " Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-03  7:19 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 4358 bytes --]

> The first supported controlable speculation misfeature is
> PR_SPEC_STORE_BYPASS. Add the define so this can be shared between
> architectures.
> 
> TODO: Write a man prctl(2) patch.

I think you can delete that? I wrote the patch to it - do you want to include it
in your patchset?

See attached and inline

From 12805eeaf003b99993805c01613a5897355ee12a Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 30 Apr 2018 13:25:20 -0400
Subject: [PATCH] SSB MANPAGE #1

prctl.2: PR_[SET|GET]_SPECULATION_CTRL

field.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v8: New patch
v9: s/EUCLEAN/EINVAL/
   Also add section in PR_SET_SPECULATION_CTRL about arg[4,5] being zero.
---
 man2/prctl.2 | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/man2/prctl.2 b/man2/prctl.2
index 54764d881..3614ff4b6 100644
--- a/man2/prctl.2
+++ b/man2/prctl.2
@@ -1008,6 +1008,88 @@ the "securebits" flags of the calling thread.
 See
 .BR capabilities (7).
 .TP
+.BR PR_GET_SPECULATION_CTRL
+Returns the state of the speculation misfeature which is selected with
+the value of
+.IR arg2 ,
+which must be
+.B PR_SPEC_STORE_BYPASS.
+Otherwise the call fails with the error
+.BR ENODEV .
+The return value uses bit 0-2 with the following meaning:
+.RS
+.TP
+.BR PR_SPEC_PRCTL
+Mitigation can be controlled per task by
+.B PR_SET_SPECULATION_CTRL
+.TP
+.BR PR_SPEC_ENABLE
+The speculation feature is enabled, mitigation is disabled.
+.TP
+.BR PR_SPEC_DISABLE
+The speculation feature is disabled, mitigation is enabled
+.RE
+.IP
+If all bits are
+.B 0
+then the CPU is not affected by the speculation misfeature.
+.IP
+If
+.B PR_SPEC_PRCTL
+is set, then the per task control of the mitigation is available. If not set,
+.B prctl()
+for the speculation misfeature will fail.
+In the above operation
+.I arg3
+,
+.I arg4,
+and
+.I arg5
+must be specified as 0, otherwise the call fails with the error
+.BR EINVAL.
+.TP
+.BR PR_SET_SPECULATION_CTRL
+Sets the state of the speculation misfeature which is selected with
+the value of
+.IR arg2 ,
+which must be
+.B PR_SPEC_STORE_BYPASS.
+Otherwise the call fails with the error
+.BR ENODEV .
+This control is per task. The
+.IR arg3
+is used to hand in the control value, which can be either:
+.RS
+.TP
+.BR PR_SPEC_ENABLE
+The speculation feature is enabled, mitigation is disabled.
+.TP
+.BR PR_SPEC_DISABLE
+The speculation feature is disabled, mitigation is enabled
+.RE
+.IP
+Any other value in
+.IR arg3
+will result in the call failure with the error
+.BR ERANGE .
+Also
+.I arg4,
+and
+.I arg5
+must be specified as 0, otherwise the call fails with ethe rror
+.BR EINVAL.
+.IP
+Furtheremore this speculation feature can also be controlled by the boot-time
+parameter of
+.B
+spec_store_bypass_disable=
+Which could enforce a read-only policy which will result in the call failure
+with the error
+.BR ENXIO .
+Consult the
+.B PR_GET_SPECULATION_CTRL
+for details on the possible enumerations.
+.TP
 .BR PR_SET_THP_DISABLE " (since Linux 3.15)"
 .\" commit a0715cc22601e8830ace98366c0c2bd8da52af52
 Set the state of the "THP disable" flag for the calling thread.
@@ -1501,6 +1583,12 @@ and
 .IR arg3
 does not specify a valid capability.
 .TP
+.B ENODEV
+.I option
+was
+.BR PR_SET_SPECULATION_CTRL
+the kernel or CPU does not support the requested speculation misfeature.
+.TP
 .B ENXIO
 .I option
 was
@@ -1510,6 +1598,15 @@ or
 and the kernel or the CPU does not support MPX management.
 Check that the kernel and processor have MPX support.
 .TP
+.B ENXIO
+.I option
+was
+.BR PR_SET_SPECULATION_CTRL
+implies that the control of the selected speculation misfeature is not possible.
+See
+.BR PR_GET_SPECULATION_CTRL
+for the bit fields to determine which option is available.
+.TP
 .B EOPNOTSUPP
 .I option
 is
@@ -1570,6 +1667,28 @@ is not present in the process's permitted and inheritable capability sets,
 or the
 .B PR_CAP_AMBIENT_LOWER
 securebit has been set.
+.TP
+.B ERANGE
+.I option
+was
+.BR PR_SET_SPECULATION_CTRL
+and
+.IR arg3
+is incorrect - neither
+.B PR_SPEC_ENABLE
+nor
+.B PR_SPEC_DISABLE
+.
+.TP
+.B EINVAL
+.I option
+was
+.BR PR_GET_SPECULATION_CTRL
+or
+.BR PR_SET_SPECULATION_CTRL
+and unused arguments to
+.B prctl()
+are not 0.
 .SH VERSIONS
 The
 .BR prctl ()
-- 
2.13.4


[-- Attachment #2: 0001-SSB-MANPAGE-1.patch --]
[-- Type: text/plain, Size: 4047 bytes --]

From 12805eeaf003b99993805c01613a5897355ee12a Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 30 Apr 2018 13:25:20 -0400
Subject: [PATCH] SSB MANPAGE #1

prctl.2: PR_[SET|GET]_SPECULATION_CTRL

field.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v8: New patch
v9: s/EUCLEAN/EINVAL/
   Also add section in PR_SET_SPECULATION_CTRL about arg[4,5] being zero.
---
 man2/prctl.2 | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/man2/prctl.2 b/man2/prctl.2
index 54764d881..3614ff4b6 100644
--- a/man2/prctl.2
+++ b/man2/prctl.2
@@ -1008,6 +1008,88 @@ the "securebits" flags of the calling thread.
 See
 .BR capabilities (7).
 .TP
+.BR PR_GET_SPECULATION_CTRL
+Returns the state of the speculation misfeature which is selected with
+the value of
+.IR arg2 ,
+which must be
+.B PR_SPEC_STORE_BYPASS.
+Otherwise the call fails with the error
+.BR ENODEV .
+The return value uses bit 0-2 with the following meaning:
+.RS
+.TP
+.BR PR_SPEC_PRCTL
+Mitigation can be controlled per task by
+.B PR_SET_SPECULATION_CTRL
+.TP
+.BR PR_SPEC_ENABLE
+The speculation feature is enabled, mitigation is disabled.
+.TP
+.BR PR_SPEC_DISABLE
+The speculation feature is disabled, mitigation is enabled
+.RE
+.IP
+If all bits are
+.B 0
+then the CPU is not affected by the speculation misfeature.
+.IP
+If
+.B PR_SPEC_PRCTL
+is set, then the per task control of the mitigation is available. If not set,
+.B prctl()
+for the speculation misfeature will fail.
+In the above operation
+.I arg3
+,
+.I arg4,
+and
+.I arg5
+must be specified as 0, otherwise the call fails with the error
+.BR EINVAL.
+.TP
+.BR PR_SET_SPECULATION_CTRL
+Sets the state of the speculation misfeature which is selected with
+the value of
+.IR arg2 ,
+which must be
+.B PR_SPEC_STORE_BYPASS.
+Otherwise the call fails with the error
+.BR ENODEV .
+This control is per task. The
+.IR arg3
+is used to hand in the control value, which can be either:
+.RS
+.TP
+.BR PR_SPEC_ENABLE
+The speculation feature is enabled, mitigation is disabled.
+.TP
+.BR PR_SPEC_DISABLE
+The speculation feature is disabled, mitigation is enabled
+.RE
+.IP
+Any other value in
+.IR arg3
+will result in the call failure with the error
+.BR ERANGE .
+Also
+.I arg4,
+and
+.I arg5
+must be specified as 0, otherwise the call fails with ethe rror
+.BR EINVAL.
+.IP
+Furtheremore this speculation feature can also be controlled by the boot-time
+parameter of
+.B
+spec_store_bypass_disable=
+Which could enforce a read-only policy which will result in the call failure
+with the error
+.BR ENXIO .
+Consult the
+.B PR_GET_SPECULATION_CTRL
+for details on the possible enumerations.
+.TP
 .BR PR_SET_THP_DISABLE " (since Linux 3.15)"
 .\" commit a0715cc22601e8830ace98366c0c2bd8da52af52
 Set the state of the "THP disable" flag for the calling thread.
@@ -1501,6 +1583,12 @@ and
 .IR arg3
 does not specify a valid capability.
 .TP
+.B ENODEV
+.I option
+was
+.B PR_GET_SPECULATION_CTRL
+or
+.B PR_SET_SPECULATION_CTRL
+and the kernel or CPU does not support the requested speculation misfeature.
+.TP
 .B ENXIO
 .I option
 was
@@ -1510,6 +1598,15 @@ or
 and the kernel or the CPU does not support MPX management.
 Check that the kernel and processor have MPX support.
 .TP
+.B ENXIO
+.I option
+was
+.B PR_SET_SPECULATION_CTRL
+and control of the selected speculation misfeature is not possible.
+See
+.BR PR_GET_SPECULATION_CTRL
+for the bit fields to determine which option is available.
+.TP
 .B EOPNOTSUPP
 .I option
 is
@@ -1570,6 +1667,28 @@ is not present in the process's permitted and inheritable capability sets,
 or the
 .B PR_CAP_AMBIENT_LOWER
 securebit has been set.
+.TP
+.B ERANGE
+.I option
+was
+.BR PR_SET_SPECULATION_CTRL
+and
+.IR arg3
+is invalid: neither
+.B PR_SPEC_ENABLE
+nor
+.B PR_SPEC_DISABLE
+was specified.
+.TP
+.B EINVAL
+.I option
+was
+.BR PR_GET_SPECULATION_CTRL
+or
+.BR PR_SET_SPECULATION_CTRL
+and unused arguments to
+.B prctl()
+are not 0.
 .SH VERSIONS
 The
 .BR prctl ()
-- 
2.13.4
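
A minimal userspace sketch of the interface documented above, for illustration
only (not part of the patch). It assumes <linux/prctl.h> is new enough to carry
the PR_SPEC_* constants from this series; otherwise they would have to be
defined locally:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	/* Query the Speculative Store Bypass misfeature for this task. */
	int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);

	if (state < 0) {
		/* ENODEV: the kernel/CPU does not know this misfeature. */
		fprintf(stderr, "PR_GET_SPECULATION_CTRL: %s\n", strerror(errno));
		return 1;
	}
	if (state == 0) {
		puts("CPU not affected by Speculative Store Bypass");
		return 0;
	}
	if (!(state & PR_SPEC_PRCTL)) {
		puts("per-task control not available (boot-time policy)");
		return 0;
	}

	/*
	 * Disable the misfeature, i.e. enable the mitigation, for this task.
	 * ERANGE: arg3 invalid; ENXIO: control forced read-only by the boot
	 * parameter; EINVAL: unused arguments were not 0.
	 */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
		  PR_SPEC_DISABLE, 0, 0) < 0)
		fprintf(stderr, "PR_SET_SPECULATION_CTRL: %s\n", strerror(errno));

	return 0;
}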


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 15/16] SSB 15
  2018-05-02 21:51 ` [patch V11 15/16] SSB 15 Thomas Gleixner
@ 2018-05-03  7:21   ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-03  7:21 UTC (permalink / raw)
  To: speck

On Wed, May 02, 2018 at 11:51:17PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V11 15/16] x86/process: Allow runtime control of Speculative Store Bypass
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> The Speculative Store Bypass vulnerability can be mitigated with the
> Reduced Data Speculation (RDS) feature. To allow finer grained control of
> this eventually expensive mitigation a per task mitigation control is
> required.
> 
> Add a new TIF_RDS flag and put it into the group of TIF flags which are
> evaluated for mismatch in switch_to(). If these bits differ in the previous
> and the next task, then the slow path function __switch_to_xtra() is
> invoked. Implement the TIF_RDS dependent mitigation control in the slow
> path.
> 
> If the prctl for controlling Speculative Store Bypass is disabled or no
> task uses the prctl then there is no overhead in the switch_to() fast
> path.
> 
> Update the KVM related speculation control functions to take TIF_RDS into
> account as well.
> 
> Based on a patch from Tim Chen. Completely rewritten.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

In case you want to add that ..
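
A rough sketch of the per-task switching that the quoted changelog describes,
for illustration only; the function name and the x86_spec_ctrl_base /
SPEC_CTRL_RDS identifiers are assumptions, not the literal patch:

/*
 * Called from the __switch_to_xtra() slow path, which is only entered when
 * the TIF bits of the previous and the next task differ, so tasks that never
 * touch the prctl never pay for this.
 */
static void spec_ctrl_update_msr(unsigned long tifn)
{
	u64 msr = x86_spec_ctrl_base;		/* host baseline value */

	/* The next task requested the mitigation via PR_SPEC_DISABLE. */
	if (tifn & _TIF_RDS)
		msr |= SPEC_CTRL_RDS;

	wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}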

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 14/16] SSB 14
  2018-05-02 21:51 ` [patch V11 14/16] SSB 14 Thomas Gleixner
  2018-05-03  7:19   ` [MODERATED] " Konrad Rzeszutek Wilk
@ 2018-05-03  7:22   ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-03  7:22 UTC (permalink / raw)
  To: speck

On Wed, May 02, 2018 at 11:51:16PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V11 14/16] prctl: Add speculation control prctls
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> Add two new prctls to control aspects of speculation related vulnerabilities
> and their mitigations to provide finer grained control over performance
> impacting mitigations.
> 
> PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature
> which is selected with arg2 of prctl(2). The return value uses bit 0-2 with
> the following meaning:
> 
> Bit  Define           Description
> 0    PR_SPEC_PRCTL    Mitigation can be controlled per task by
>                       PR_SET_SPECULATION_CTRL
> 1    PR_SPEC_ENABLE   The speculation feature is enabled, mitigation is
>                       disabled
> 2    PR_SPEC_DISABLE  The speculation feature is disabled, mitigation is
>                       enabled
> 
> If all bits are 0 the CPU is not affected by the speculation misfeature.
> 
> If PR_SPEC_PRCTL is set, then the per task control of the mitigation is
> available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation
> misfeature will fail.
> 
> PR_SET_SPECULATION_CTRL allows to control the speculation misfeature, which
> is selected by arg2 of prctl(2) per task. arg3 is used to hand in the
> control value, i.e. either PR_SPEC_ENABLE or PR_SPEC_DISABLE.
> 
> The common return values are:
> 
> EINVAL  prctl is not implemented by the architecture or the unused prctl()
>         arguments are not 0
> ENODEV  arg2 is selecting a not supported speculation misfeature
> 
> PR_SET_SPECULATION_CTRL has these additional return values:
> 
> ERANGE  arg3 is incorrect, i.e. it's not either PR_SPEC_ENABLE or PR_SPEC_DISABLE
> ENXIO   prctl control of the selected speculation misfeature is disabled
> 
> The first supported controllable speculation misfeature is
> PR_SPEC_STORE_BYPASS. Add the define so this can be shared between
> architectures.
> 
> TODO: Write a man prctl(2) patch.
> 
> Based on an initial patch from Tim Chen and mostly rewritten.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Reviewed-by: Ingo Molnar <mingo@kernel.org>


Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

with the "TODO: Write a man ..." removed from it.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [patch V11 14/16] SSB 14
  2018-05-03  7:19   ` [MODERATED] " Konrad Rzeszutek Wilk
@ 2018-05-03  7:31     ` Thomas Gleixner
  0 siblings, 0 replies; 35+ messages in thread
From: Thomas Gleixner @ 2018-05-03  7:31 UTC (permalink / raw)
  To: speck

On Thu, 3 May 2018, speck for Konrad Rzeszutek Wilk wrote:
> > The first supported controllable speculation misfeature is
> > PR_SPEC_STORE_BYPASS. Add the define so this can be shared between
> > architectures.
> > 
> > TODO: Write a man prctl(2) patch.
> 
> I think you can delete that? I wrote the patch to it - do you want to include it
> in your patchset?

Delete yes, but it needs to go through Michael Kerrisk once the mess gets public.
Thanks for writing that up!

Btw, a German news site runs an article today saying that there is more
shit going to hit the fan soon.

https://www.heise.de/security/meldung/Spectre-NG-Intel-Prozessoren-von-neuen-hochriskanten-Sicherheitsluecken-betroffen-4039302.html

Sorry, German only.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 11/16] SSB 11
  2018-05-02 21:51 ` [patch V11 11/16] SSB 11 Thomas Gleixner
@ 2018-05-04 20:58   ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-04 20:58 UTC (permalink / raw)
  To: speck

..snip..
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -215,6 +215,7 @@
>  #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
>  #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
>  #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
> +#define X86_FEATURE_AMD_RDS		(7*32+24)  /* "" AMD RDS implementation */
                                         ^
That is missing a space.

Of course please feel free to ignore this, it is such a tiny thing.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-02 21:51 ` [patch V11 05/16] SSB 5 Thomas Gleixner
@ 2018-05-10 17:52   ` Andi Kleen
  2018-05-10 18:30     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2018-05-10 17:52 UTC (permalink / raw)
  To: speck

Hi,

I went over this patch again and I'm not sure I understand
how it works.


On Wed, May 02, 2018 at 11:51:07PM +0200, speck for Thomas Gleixner wrote:
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -9720,8 +9720,7 @@ static void __noclone vmx_vcpu_run(struc
>  	 * is no need to worry about the conditional branch over the wrmsr
>  	 * being speculatively taken.
>  	 */
> -	if (vmx->spec_ctrl)
> -		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +	x86_spec_ctrl_set_guest(vmx->spec_ctrl);
>  
>  	vmx->__launched = vmx->loaded_vmcs->launched;
>  
> @@ -9869,8 +9868,7 @@ static void __noclone vmx_vcpu_run(struc
>  	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
>  		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
>  
> -	if (vmx->spec_ctrl)
> -		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> +	x86_spec_ctrl_restore_host(vmx->spec_ctrl);

So we assume that vmx->spec_ctrl is always the correct value of the guest.


But vmx_set_msr does

	        vmx->spec_ctrl = data;

                if (!data)
                        break;

                /*
                 * For non-nested:
                 * When it's written (to non-zero) for the first time, pass
                 * it through.
                 *
                 * For nested:
                 * The handling of the MSR bitmap for L2 guests is done in
                 * nested_vmx_merge_msr_bitmap. We should not touch the
                 * vmcs02.msr_bitmap here since it gets completely overwritten
                 * in the merging. We update the vmcs01 here for L1 as well
                 * since it will end up touching the MSR anyway now.
                 */
                vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
                                              MSR_IA32_SPEC_CTRL,
                                              MSR_TYPE_RW);

That means only the value of the first write in the guest is saved in vmx->spec_ctrl.

But what happens when a later write is different from the first write?

The host wouldn't notice it and force it back to the original value on 
the next VMENTRY, silently losing the right state.

Now, with three different bits that can be set dynamically, it is highly likely
that the first write is different from later writes.

On Linux it was OK before SSBD because the only bit that could be set was IBRS
or IBRS_ALL, but on other OSes that use STIBP for only some processes it could also
lose the value. With MDD in the picture it can happen on Linux too.

Do I miss something here?

-Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-10 17:52   ` [MODERATED] " Andi Kleen
@ 2018-05-10 18:30     ` Konrad Rzeszutek Wilk
  2018-05-10 19:08       ` Andi Kleen
  0 siblings, 1 reply; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-10 18:30 UTC (permalink / raw)
  To: speck

On Thu, May 10, 2018 at 10:52:57AM -0700, speck for Andi Kleen wrote:
> Hi,
> 
> I went over this patch again and I'm not sure i understand
> how it works.
> 
> 
> On Wed, May 02, 2018 at 11:51:07PM +0200, speck for Thomas Gleixner wrote:
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -9720,8 +9720,7 @@ static void __noclone vmx_vcpu_run(struc
> >  	 * is no need to worry about the conditional branch over the wrmsr
> >  	 * being speculatively taken.
> >  	 */
> > -	if (vmx->spec_ctrl)
> > -		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> > +	x86_spec_ctrl_set_guest(vmx->spec_ctrl);
> >  
> >  	vmx->__launched = vmx->loaded_vmcs->launched;
> >  
> > @@ -9869,8 +9868,7 @@ static void __noclone vmx_vcpu_run(struc
> >  	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
> >  		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
> >  
> > -	if (vmx->spec_ctrl)
> > -		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> > +	x86_spec_ctrl_restore_host(vmx->spec_ctrl);
> 
> So we assume that vmx->spec_ctrl is always the correct value of the guest.
> 
> 
> But vmx_set_msr does

.. only if we intercept it.
> 
> 	        vmx->spec_ctrl = data;
> 
>                 if (!data)
>                         break;
> 
>                 /*
>                  * For non-nested:
>                  * When it's written (to non-zero) for the first time, pass
>                  * it through.
>                  *
>                  * For nested:
>                  * The handling of the MSR bitmap for L2 guests is done in
>                  * nested_vmx_merge_msr_bitmap. We should not touch the
>                  * vmcs02.msr_bitmap here since it gets completely overwritten
>                  * in the merging. We update the vmcs01 here for L1 as well
>                  * since it will end up touching the MSR anyway now.
>                  */
>                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
>                                               MSR_IA32_SPEC_CTRL,
>                                               MSR_TYPE_RW);
> 
> That means only the value of the first write in the guest is saved in vmx->spec_ctrl.
> 
> But what happens when a later write is different from the first write?

This code:

  	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
  		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

Would read it on the VMEXITs.
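
Condensed from the hunks quoted above, the flow being discussed looks roughly
like this (annotated sketch, not literal kernel code):

	/* Before VMENTER: install the guest's last known SPEC_CTRL value. */
	x86_spec_ctrl_set_guest(vmx->spec_ctrl);

	/*
	 * ... VMLAUNCH/VMRESUME; once the write intercept has been dropped
	 * after the guest's first non-zero write, the guest can change the
	 * MSR without exiting ...
	 */

	/*
	 * After VMEXIT: if writes are no longer intercepted, the cached value
	 * may be stale, so re-read the MSR to pick up whatever the guest
	 * wrote last.
	 */
	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

	/* Finally restore the host value. */
	x86_spec_ctrl_restore_host(vmx->spec_ctrl);
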
> 
> The host wouldn't notice it and force it back to the original value on 
> the next VMENTRY, silently losing the right state.
> 
> Now with three different bits that can be set dynamically this is highly likely
> that the first write is different than later writes.
> 
> On Linux it was ok before SSBD because the only bit that could be set was IBRS
> or IBRS_ALL, but on other OS that use STIBP for only some processes it could also
> lose the value. With MDD in the picture it can happen on Linux too.
> 
> Do I miss something here?
> 
> -Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-10 18:30     ` Konrad Rzeszutek Wilk
@ 2018-05-10 19:08       ` Andi Kleen
  2018-05-10 21:22         ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2018-05-10 19:08 UTC (permalink / raw)
  To: speck

On Thu, May 10, 2018 at 02:30:58PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Thu, May 10, 2018 at 10:52:57AM -0700, speck for Andi Kleen wrote:
> > Hi,
> > 
> > I went over this patch again and I'm not sure i understand
> > how it works.
> > 
> > 
> > On Wed, May 02, 2018 at 11:51:07PM +0200, speck for Thomas Gleixner wrote:
> > > --- a/arch/x86/kvm/vmx.c
> > > +++ b/arch/x86/kvm/vmx.c
> > > @@ -9720,8 +9720,7 @@ static void __noclone vmx_vcpu_run(struc
> > >  	 * is no need to worry about the conditional branch over the wrmsr
> > >  	 * being speculatively taken.
> > >  	 */
> > > -	if (vmx->spec_ctrl)
> > > -		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> > > +	x86_spec_ctrl_set_guest(vmx->spec_ctrl);
> > >  
> > >  	vmx->__launched = vmx->loaded_vmcs->launched;
> > >  
> > > @@ -9869,8 +9868,7 @@ static void __noclone vmx_vcpu_run(struc
> > >  	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
> > >  		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
> > >  
> > > -	if (vmx->spec_ctrl)
> > > -		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> > > +	x86_spec_ctrl_restore_host(vmx->spec_ctrl);
> > 
> > So we assume that vmx->spec_ctrl is always the correct value of the guest.
> > 
> > 
> > But vmx_set_msr does
> 
> .. only if we intercept it.
> > 
> > 	        vmx->spec_ctrl = data;
> > 
> >                 if (!data)
> >                         break;
> > 
> >                 /*
> >                  * For non-nested:
> >                  * When it's written (to non-zero) for the first time, pass
> >                  * it through.
> >                  *
> >                  * For nested:
> >                  * The handling of the MSR bitmap for L2 guests is done in
> >                  * nested_vmx_merge_msr_bitmap. We should not touch the
> >                  * vmcs02.msr_bitmap here since it gets completely overwritten
> >                  * in the merging. We update the vmcs01 here for L1 as well
> >                  * since it will end up touching the MSR anyway now.
> >                  */
> >                 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
> >                                               MSR_IA32_SPEC_CTRL,
> >                                               MSR_TYPE_RW);
> > 
> > That means only the value of the first write in the guest is saved in vmx->spec_ctrl.
> > 
> > But what happens when a later write is different from the first write?
> 
> This code:
> 
>   	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
>   		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
> 
> Would read it on the VMEXITs.

Thanks. I missed that. Good that it works.

Why don't we use the MSR save/restore lists for that? That would likely
be faster than manual MSR accesses.

-Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-10 19:08       ` Andi Kleen
@ 2018-05-10 21:22         ` Konrad Rzeszutek Wilk
  2018-05-10 22:25           ` Andi Kleen
  0 siblings, 1 reply; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-10 21:22 UTC (permalink / raw)
  To: speck

..snip..
> > > That means only the value of the first write in the guest is saved in vmx->spec_ctrl.
> > > 
> > > But what happens when a later write is different from the first write?
> > 
> > This code:
> > 
> >   	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
> >   		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
> > 
> > Would read it on the VMEXITs.
> 
> Thanks. I missed that. Good that it works.
> 
> Why don't we use the msr save/restore lists for that? That would likely
> be faster than a manual msr accesses.

It is actually slower. I tried doing it last time, for Spectre/Meltdown,
and the performance was way worse than doing it this way. I can dig up the patches
- I think we did the tests on Broadwell but hadn't tried Skylake or such
(or maybe it was the other way around).

> 
> -Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-10 21:22         ` Konrad Rzeszutek Wilk
@ 2018-05-10 22:25           ` Andi Kleen
  2018-05-10 23:50             ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2018-05-10 22:25 UTC (permalink / raw)
  To: speck

> It is actually slower. I tried doing it last time with the spectre/meltdown
> and the performance was way slower than doing it this way. I can dig up the patches
>  - as I think we did the tests on Broadwell but hadn't tried Skylake or such
> (or maybe it was the other way around).

Was this with MSR lists unconditionally, or with MSR list combined with 
the "wait for the first write" approach?

-Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-10 22:25           ` Andi Kleen
@ 2018-05-10 23:50             ` Konrad Rzeszutek Wilk
  2018-05-11 16:11               ` Andi Kleen
  2018-05-16  7:55               ` Paolo Bonzini
  0 siblings, 2 replies; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-10 23:50 UTC (permalink / raw)
  To: speck

On Thu, May 10, 2018 at 03:25:33PM -0700, speck for Andi Kleen wrote:
> > It is actually slower. I tried doing it last time with the spectre/meltdown
> > and the performance was way slower than doing it this way. I can dig up the patches
> >  - as I think we did the tests on Broadwell but hadn't tried Skylake or such
> > (or maybe it was the other way around).
> 
> Was this with MSR lists unconditionally, or with MSR list combined with 
> the "wait for the first write" approach?

It was the unconditional. The patch went through a bunch of iterations and this
is what we ended up testing - but it showed around 2-3% performance degradation
when doing TPCC-like workloads. I am trying to track down exactly what hardware
that was done against.

It won't apply to 'master' as this was against our heavily backported 4.1 kernel
but you get the idea. Also ignore some of the commit description as we did
eventually figure out the 'very fragile' case.


From: Daniel Kiper <daniel.kiper@oracle.com>

x86/kvm: Initialize guest MSR_IA32_SPEC_CTRL from VMCS VM-entry MSR load area

The current solution, wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl) before VMLAUNCH,
is very fragile and may not restore the guest MSR_IA32_SPEC_CTRL value in some
cases, e.g. when processes are rescheduled (it looks like that is a separate bug
which has to be fixed; anyway...). So, let's initialize the guest MSR_IA32_SPEC_CTRL
from the VMCS VM-entry MSR load area, which is designed for such things. Additionally,
save the guest MSR_IA32_SPEC_CTRL into the VM-exit MSR store area on VM-exit instead
of reading it with rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl) after VM-exit. The
latter may not be reliable either.

This patch replaces commit bc5d49f (x86/spec: Always set IBRS to guest value on
VMENTER and host on VMEXIT) and 51b5f53 (x86/spec: Always set IBRS to guest value
on VMENTER and host on VMEXIT (redux)).

Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>

---
 arch/x86/kvm/vmx.c |   56 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aa9bc4f..e7c0f8b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -162,8 +162,10 @@ module_param(ple_window_max, int, S_IRUGO);
 
 extern const ulong vmx_return;
 
-#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
+#define NR_AUTOLOAD_MSRS	8
+#define NR_AUTOSTORE_MSRS	NR_AUTOLOAD_MSRS
+
+#define VMCS02_POOL_SIZE	1
 
 struct vmcs {
 	u32 revision_id;
@@ -504,6 +506,10 @@ struct vcpu_vmx {
 		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
 		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
 	} msr_autoload;
+	struct msr_autostore {
+		unsigned nr;
+		struct vmx_msr_entry guest[NR_AUTOSTORE_MSRS];
+	} msr_autostore;
 	struct {
 		int           loaded;
 		u16           fs_sel, gs_sel, ldt_sel;
@@ -1704,6 +1710,37 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 	m->host[i].value = host_val;
 }
 
+static void add_atomic_store_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+	unsigned i;
+	struct msr_autostore *m = &vmx->msr_autostore;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			return;
+
+	if (i == NR_AUTOSTORE_MSRS) {
+		pr_err("Not enough msr store entries. Can't add msr %x\n", msr);
+		BUG();
+	}
+
+	m->guest[i].index = msr;
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, ++m->nr);
+}
+
+static u64 get_msr_vmcs_store(struct vcpu_vmx *vmx, unsigned msr)
+{
+	unsigned i;
+	struct msr_autostore *m = &vmx->msr_autostore;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			return m->guest[i].value;
+
+	pr_err("Can't find msr %x in VMCS store\n", msr);
+	BUG();
+}
+
 static void reload_tss(void)
 {
 	/*
@@ -4716,6 +4753,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 #endif
 
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest));
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
@@ -8192,8 +8230,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
 
-	if (ibrs_inuse)
-		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	if (ibrs_inuse) {
+		add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, vmx->spec_ctrl,
+				      SPEC_CTRL_FEATURE_ENABLE_IBRS);
+		add_atomic_store_msr(vmx, MSR_IA32_SPEC_CTRL);
+	}
 
 	asm(
 		/* Store host registers */
@@ -8317,12 +8358,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	      );
 
-	if (ibrs_inuse) {
-		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
-		wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_FEATURE_ENABLE_IBRS);
-	}
 	stuff_RSB();
 
+	if (ibrs_inuse)
+		vmx->spec_ctrl = get_msr_vmcs_store(vmx, MSR_IA32_SPEC_CTRL);
+
 	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
 	if (debugctlmsr)
 		update_debugctlmsr(debugctlmsr);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-10 23:50             ` Konrad Rzeszutek Wilk
@ 2018-05-11 16:11               ` Andi Kleen
  2018-05-16  7:55               ` Paolo Bonzini
  1 sibling, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2018-05-11 16:11 UTC (permalink / raw)
  To: speck

On Thu, May 10, 2018 at 07:50:56PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Thu, May 10, 2018 at 03:25:33PM -0700, speck for Andi Kleen wrote:
> > > It is actually slower. I tried doing it last time with the spectre/meltdown
> > > and the performance was way slower than doing it this way. I can dig up the patches
> > >  - as I think we did the tests on Broadwell but hadn't tried Skylake or such
> > > (or maybe it was the other way around).
> > 
> > Was this with MSR lists unconditionally, or with MSR list combined with 
> > the "wait for the first write" approach?
> 
> It was the unconditional. The patch went through a bunch of iterations and this

My suspicion is that a combination of the two would be best:

first wait for the first intercept
	so no overhead for the case of the guest not using IBRS at all
then use the msr save/restore list
	this is the most efficient way to do save/restore

With the new "rare MSR use" pattern we may end up with for SSB on Linux, it may
also be worth timing out the save/restore and going back to intercept, so that if
SSB is only used temporarily the exits wouldn't stay slower forever just because
someone used it once.
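
A hypothetical sketch of that combined scheme, purely to illustrate the idea:
vmx_spec_ctrl_first_write() does not exist, and the host value handed to the
switch list is an assumption; only the intercept and list helpers are taken
from the patches quoted earlier in the thread:

static void vmx_spec_ctrl_first_write(struct vcpu_vmx *vmx, u64 data)
{
	vmx->spec_ctrl = data;

	/*
	 * First intercepted guest write: stop intercepting and let the
	 * VM-entry/VM-exit MSR lists save and restore the value from now on.
	 */
	vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
				      MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
	add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, data, x86_spec_ctrl_base);
	add_atomic_store_msr(vmx, MSR_IA32_SPEC_CTRL);

	/*
	 * A periodic check (not shown) could re-arm the intercept and drop
	 * the list entries again if the guest stops writing the MSR, so a
	 * one-off use does not slow down every exit forever.
	 */
}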

> It won't apply to 'master' as this was against our heavily backported 4.1 kernel
> but you get the idea. Also ignore some of the commit description as we did
> eventually figure out the 'very fragile' case.

Right, that's unconditional.

So the "never use MSR" case is definitely slower, which is likely what you saw.

-Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-10 23:50             ` Konrad Rzeszutek Wilk
  2018-05-11 16:11               ` Andi Kleen
@ 2018-05-16  7:55               ` Paolo Bonzini
  2018-05-16 13:52                 ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 35+ messages in thread
From: Paolo Bonzini @ 2018-05-16  7:55 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 727 bytes --]

On 11/05/2018 01:50, speck for Konrad Rzeszutek Wilk wrote:
>> Was this with MSR lists unconditionally, or with MSR list combined with 
>> the "wait for the first write" approach?
> It was the unconditional. The patch went through a bunch of iterations and this
> is what we ended up testing - but it showed around 2-3% performance degradation
> when doing TPCC-like workloads. I am trying to track down exactly what hardware
> that was done against.

FWIW the Xen people found the same.  Isn't there some magic in how
SPEC_CTRL is implemented, to avoid serialization?

Also we don't use MSR save lists at all, so we might get some fixed
overhead when we change the MSR save count from zero to nonzero.

Paolo


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [MODERATED] Re: [patch V11 05/16] SSB 5
  2018-05-16  7:55               ` Paolo Bonzini
@ 2018-05-16 13:52                 ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 35+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-05-16 13:52 UTC (permalink / raw)
  To: speck

On Wed, May 16, 2018 at 09:55:34AM +0200, speck for Paolo Bonzini wrote:
> On 11/05/2018 01:50, speck for Konrad Rzeszutek Wilk wrote:
> >> Was this with MSR lists unconditionally, or with MSR list combined with 
> >> the "wait for the first write" approach?
> > It was the unconditional. The patch went through a bunch of iterations and this
> > is what we ended up testing - but it showed around 2-3% performance degradation
> > when doing TPCC-like workloads. I am trying to track down exactly what hardware
> > that was done against.
> 
> FWIW the Xen people found the same.  Isn't there some magic in how

Oh. I didn't know anybody else cared about TPCC except Oracle.

> SPEC_CTRL is implemented, to avoid serialization?

As I understand it, it depends on the CPU family - some are latched, while others
have it more like a reset button. And some probably do nothing.

> 
> Also we don't use MSR save lists at all, so we might get some fixed
> overhead when we change the MSR save count from zero to nonzero.
> 
> Paolo
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2018-05-16 13:52 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-02 21:51 [patch V11 00/16] SSB 0 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 01/16] SSB 1 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 02/16] SSB 2 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 03/16] SSB 3 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 04/16] SSB 4 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 05/16] SSB 5 Thomas Gleixner
2018-05-10 17:52   ` [MODERATED] " Andi Kleen
2018-05-10 18:30     ` Konrad Rzeszutek Wilk
2018-05-10 19:08       ` Andi Kleen
2018-05-10 21:22         ` Konrad Rzeszutek Wilk
2018-05-10 22:25           ` Andi Kleen
2018-05-10 23:50             ` Konrad Rzeszutek Wilk
2018-05-11 16:11               ` Andi Kleen
2018-05-16  7:55               ` Paolo Bonzini
2018-05-16 13:52                 ` Konrad Rzeszutek Wilk
2018-05-02 21:51 ` [patch V11 06/16] SSB 6 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 07/16] SSB 7 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 08/16] SSB 8 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 09/16] SSB 9 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 10/16] SSB 10 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 11/16] SSB 11 Thomas Gleixner
2018-05-04 20:58   ` [MODERATED] " Konrad Rzeszutek Wilk
2018-05-02 21:51 ` [patch V11 12/16] SSB 12 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 13/16] SSB 13 Thomas Gleixner
2018-05-02 21:51 ` [patch V11 14/16] SSB 14 Thomas Gleixner
2018-05-03  7:19   ` [MODERATED] " Konrad Rzeszutek Wilk
2018-05-03  7:31     ` Thomas Gleixner
2018-05-03  7:22   ` [MODERATED] " Konrad Rzeszutek Wilk
2018-05-02 21:51 ` [patch V11 15/16] SSB 15 Thomas Gleixner
2018-05-03  7:21   ` [MODERATED] " Konrad Rzeszutek Wilk
2018-05-02 21:51 ` [patch V11 16/16] SSB 16 Thomas Gleixner
2018-05-02 23:21 ` [patch V11 00/16] SSB 0 Thomas Gleixner
2018-05-03  4:27 ` [MODERATED] Encrypted Message Tim Chen
2018-05-03  6:10   ` [MODERATED] Re: [patch V11 00/16] SSB 0 Ingo Molnar
2018-05-03  6:30   ` Thomas Gleixner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.