All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch V10 00/10] Control knobs and Documentation 0
@ 2018-07-12 14:19 Thomas Gleixner
  2018-07-12 14:19 ` [patch V10 01/10] Control knobs and Documentation 1 Thomas Gleixner
                   ` (13 more replies)
  0 siblings, 14 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

The following series provides the following changes:

  - Fix EPT=off handling so it avoids flushing
  
  - Expose proper VMX mitigation information in sysfs

  - Drop the MSR list mechanism for flush 'always' to prepare for runtime
    control. The default flush mechanism is conditional anyway and the MSR
    list is set up at guest init time, which is nasty to switch at run time,
    especially because the static key is a global control which can be
    flipped by an update.

  - Make the flush always/conditional static key based.

  - Serialize the kvm parameter setter function

  - Enable runtime control for the kvm parameter

  - Add the l1tf command line option. It's not run time controllable as it
    does not make sense to have 3 knobs at runtime. For the command line,
    a combo knob which sets the default is convenient.

  - Documentation update

This takes the review comments into account as much as still applicable.

Thanks to Jiri for testing the lot and debugging and fixing my brainfarts!

Git bundle follows in separate mail.

Thanks,

	tglx

8<-------------
 Documentation/admin-guide/index.rst             |    9 
 Documentation/admin-guide/kernel-parameters.txt |   65 ++
 arch/x86/include/asm/processor.h                |   12 
 arch/x86/include/asm/vmx.h                      |   10 
 arch/x86/kernel/cpu/bugs.c                      |   81 +++
 arch/x86/kvm/vmx.c                              |  302 +++++++-----
 b/Documentation/admin-guide/l1tf.rst            |  572 ++++++++++++++++++++++++
 include/linux/cpu.h                             |    2 
 kernel/cpu.c                                    |   12 
 9 files changed, 930 insertions(+), 135 deletions(-)

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 01/10] Control knobs and Documentation 1
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 15:34   ` [MODERATED] " Greg KH
  2018-07-12 14:19 ` [patch V10 02/10] Control knobs and Documentation 2 Thomas Gleixner
                   ` (12 subsequent siblings)
  13 siblings, 1 reply; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 01/10] x86/l1tf: Introduce vmx status variable
From: Thomas Gleixner <tglx@linutronix.de>

Store the effective mitigation of VMX in a status variable and use it to
report the VMX state in the l1tf sysfs file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/vmx.h |    9 +++++++++
 arch/x86/kernel/cpu/bugs.c |   36 ++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx.c         |   22 +++++++++++-----------
 3 files changed, 54 insertions(+), 13 deletions(-)

--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -573,4 +573,13 @@ enum vm_instruction_error_number {
 	VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
 };
 
+enum vmx_l1d_flush_state {
+	VMENTER_L1D_FLUSH_AUTO,
+	VMENTER_L1D_FLUSH_NEVER,
+	VMENTER_L1D_FLUSH_COND,
+	VMENTER_L1D_FLUSH_ALWAYS,
+};
+
+extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+
 #endif
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -22,6 +22,7 @@
 #include <asm/processor-flags.h>
 #include <asm/fpu/internal.h>
 #include <asm/msr.h>
+#include <asm/vmx.h>
 #include <asm/paravirt.h>
 #include <asm/alternative.h>
 #include <asm/pgtable.h>
@@ -657,6 +658,12 @@ void x86_spec_ctrl_setup_ap(void)
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"L1TF: " fmt
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+enum vmx_l1d_flush_state l1tf_vmx_mitigation __ro_after_init = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+#endif
+
 static void __init l1tf_select_mitigation(void)
 {
 	u64 half_pa;
@@ -686,6 +693,32 @@ static void __init l1tf_select_mitigatio
 
 #ifdef CONFIG_SYSFS
 
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static const char *l1tf_vmx_states[] = {
+	[VMENTER_L1D_FLUSH_AUTO]	= "auto",
+	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
+	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
+	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
+};
+
+static ssize_t l1tf_show_state(char *buf)
+{
+	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+		return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+	return sprintf(buf, "%s; VMX: SMT %s, L1D %s\n", L1TF_DEFAULT_MSG,
+		       cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled",
+		       l1tf_vmx_states[l1tf_vmx_mitigation]);
+}
+#else
+static ssize_t l1tf_show_state(char *buf)
+{
+	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+#endif
+
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
 			       char *buf, unsigned int bug)
 {
@@ -713,9 +746,8 @@ static ssize_t cpu_show_common(struct de
 
 	case X86_BUG_L1TF:
 		if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
-			return sprintf(buf, "Mitigation: Page Table Inversion\n");
+			return l1tf_show_state(buf);
 		break;
-
 	default:
 		break;
 	}
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -193,19 +193,13 @@ extern const ulong vmx_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 
-/* These MUST be in sync with vmentry_l1d_param order. */
-enum vmx_l1d_flush_state {
-	VMENTER_L1D_FLUSH_NEVER,
-	VMENTER_L1D_FLUSH_COND,
-	VMENTER_L1D_FLUSH_ALWAYS,
-};
-
 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush = VMENTER_L1D_FLUSH_COND;
 
 static const struct {
 	const char *option;
 	enum vmx_l1d_flush_state cmd;
 } vmentry_l1d_param[] = {
+	{"auto",	VMENTER_L1D_FLUSH_AUTO},
 	{"never",	VMENTER_L1D_FLUSH_NEVER},
 	{"cond",	VMENTER_L1D_FLUSH_COND},
 	{"always",	VMENTER_L1D_FLUSH_ALWAYS},
@@ -13235,8 +13229,12 @@ static int __init vmx_setup_l1d_flush(vo
 {
 	struct page *page;
 
+	if (!boot_cpu_has_bug(X86_BUG_L1TF))
+		return 0;
+
+	l1tf_vmx_mitigation = vmentry_l1d_flush;
+
 	if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER ||
-	    !boot_cpu_has_bug(X86_BUG_L1TF) ||
 	    vmx_l1d_use_msr_save_list())
 		return 0;
 
@@ -13251,12 +13249,14 @@ static int __init vmx_setup_l1d_flush(vo
 	return 0;
 }
 
-static void vmx_free_l1d_flush_pages(void)
+static void vmx_cleanup_l1d_flush(void)
 {
 	if (vmx_l1d_flush_pages) {
 		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
 		vmx_l1d_flush_pages = NULL;
 	}
+	/* Restore state so sysfs ignores VMX */
+	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
 }
 
 static int __init vmx_init(void)
@@ -13299,7 +13299,7 @@ static int __init vmx_init(void)
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r) {
-		vmx_free_l1d_flush_pages();
+		vmx_cleanup_l1d_flush();
 		return r;
 	}
 
@@ -13343,7 +13343,7 @@ static void __exit vmx_exit(void)
 		static_branch_disable(&enable_evmcs);
 	}
 #endif
-	vmx_free_l1d_flush_pages();
+	vmx_cleanup_l1d_flush();
 }
 
 module_init(vmx_init)

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 02/10] Control knobs and Documentation 2
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
  2018-07-12 14:19 ` [patch V10 01/10] Control knobs and Documentation 1 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 17:09   ` [MODERATED] " Greg KH
  2018-07-12 14:19 ` [patch V10 03/10] Control knobs and Documentation 3 Thomas Gleixner
                   ` (11 subsequent siblings)
  13 siblings, 1 reply; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 02/10] x86/kvm: Drop L1TF MSR list approach
From: Thomas Gleixner <tglx@linutronix.de>

The VMX module parameter to control the L1D flush should become
writeable.

The MSR list is set up at VM init per guest VCPU, but the run time
switching is based on a static key which is global. Toggling the MSR list
at run time might be feasible, but for now drop this optimization and use
the regular MSR write to make run-time switching possible.

The default mitigation is the conditional flush anyway, so for extra
paranoid setups this will add some small overhead, but the extra code
executed is in the noise compared to the flush itself.

Aside from that, the EPT disabled case is not handled correctly at the moment,
and the MSR list magic is in the way of fixing that as well.

If it's really providing a significant advantage, then this needs to be
revisited after the code is correct and the control is writable.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kvm/vmx.c |   43 +++++++------------------------------------
 1 file changed, 7 insertions(+), 36 deletions(-)

--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6231,16 +6231,6 @@ static void ept_set_mmio_spte_mask(void)
 				   VMX_EPT_MISCONFIG_WX_VALUE);
 }
 
-static bool vmx_l1d_use_msr_save_list(void)
-{
-	if (!enable_ept || !boot_cpu_has_bug(X86_BUG_L1TF) ||
-	    static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-	    !static_cpu_has(X86_FEATURE_FLUSH_L1D))
-		return false;
-
-	return vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
-}
-
 #define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
@@ -6362,12 +6352,6 @@ static void vmx_vcpu_setup(struct vcpu_v
 		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 	}
-	/*
-	 * If flushing the L1D cache on every VMENTER is enforced and the
-	 * MSR is available, use the MSR save list.
-	 */
-	if (vmx_l1d_use_msr_save_list())
-		add_atomic_switch_msr(vmx, MSR_IA32_FLUSH_CMD, L1D_FLUSH, 0, true);
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9617,26 +9601,14 @@ static void vmx_l1d_flush(struct kvm_vcp
 	bool always;
 
 	/*
-	 * This code is only executed when:
-	 * - the flush mode is 'cond'
-	 * - the flush mode is 'always' and the flush MSR is not
-	 *   available
-	 *
-	 * If the CPU has the flush MSR then clear the flush bit because
-	 * 'always' mode is handled via the MSR save list.
-	 *
-	 * If the MSR is not avaibable then act depending on the mitigation
-	 * mode: If 'flush always', keep the flush bit set, otherwise clear
-	 * it.
+	 * This code is only executed when the the flush mode is 'cond' or
+	 * 'always'
 	 *
-	 * The flush bit gets set again either from vcpu_run() or from one
-	 * of the unsafe VMEXIT handlers.
+	 * If 'flush always', keep the flush bit set, otherwise clear
+	 * it. The flush bit gets set again either from vcpu_run() or from
+	 * one of the unsafe VMEXIT handlers.
 	 */
-	if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
-		always = false;
-	else
-		always = vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
-
+	always = vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
 	vcpu->arch.l1tf_flush_l1d = always;
 
 	vcpu->stat.l1d_flush++;
@@ -13234,8 +13206,7 @@ static int __init vmx_setup_l1d_flush(vo
 
 	l1tf_vmx_mitigation = vmentry_l1d_flush;
 
-	if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER ||
-	    vmx_l1d_use_msr_save_list())
+	if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER)
 		return 0;
 
 	if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 03/10] Control knobs and Documentation 3
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
  2018-07-12 14:19 ` [patch V10 01/10] Control knobs and Documentation 1 Thomas Gleixner
  2018-07-12 14:19 ` [patch V10 02/10] Control knobs and Documentation 2 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 16:13   ` [MODERATED] " Josh Poimboeuf
  2018-07-12 17:09   ` [MODERATED] " Greg KH
  2018-07-12 14:19 ` [patch V10 04/10] Control knobs and Documentation 4 Thomas Gleixner
                   ` (10 subsequent siblings)
  13 siblings, 2 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 03/10] x86/l1tf: Handle EPT disabled state proper
From: Thomas Gleixner <tglx@linutronix.de>

If Extended Page Tables (EPT) are disabled or not supported, no L1D
flushing is required. The setup function can just avoid setting up the L1D
flush for the EPT=n case.

Invoke it after the hardware setup has been done and enable_ept has the
correct state, and expose the EPT disabled state in the mitigation status as
well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/vmx.h |    1 +
 arch/x86/kernel/cpu/bugs.c |    1 +
 arch/x86/kvm/vmx.c         |   18 ++++++++++++++----
 3 files changed, 16 insertions(+), 4 deletions(-)

--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -578,6 +578,7 @@ enum vmx_l1d_flush_state {
 	VMENTER_L1D_FLUSH_NEVER,
 	VMENTER_L1D_FLUSH_COND,
 	VMENTER_L1D_FLUSH_ALWAYS,
+	VMENTER_L1D_FLUSH_EPT_DISABLED,
 };
 
 extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -701,6 +701,7 @@ static const char *l1tf_vmx_states[] = {
 	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
 	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
 	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
+	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"
 };
 
 static ssize_t l1tf_show_state(char *buf)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -13204,6 +13204,11 @@ static int __init vmx_setup_l1d_flush(vo
 	if (!boot_cpu_has_bug(X86_BUG_L1TF))
 		return 0;
 
+	if (!enable_ept) {
+		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+		return 0;
+	}
+
 	l1tf_vmx_mitigation = vmentry_l1d_flush;
 
 	if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER)
@@ -13230,6 +13235,8 @@ static void vmx_cleanup_l1d_flush(void)
 	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
 }
 
+static void __exit vmx_exit(void);
+
 static int __init vmx_init(void)
 {
 	int r;
@@ -13263,14 +13270,17 @@ static int __init vmx_init(void)
 	}
 #endif
 
-	r = vmx_setup_l1d_flush();
+	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
 		return r;
 
-	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-		     __alignof__(struct vcpu_vmx), THIS_MODULE);
+	/*
+	 * Must be called after kvm_init() so enable_ept is properly set up
+	 */
+	r = vmx_setup_l1d_flush();
 	if (r) {
-		vmx_cleanup_l1d_flush();
+		vmx_exit();
 		return r;
 	}
 

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 04/10] Control knobs and Documentation 4
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (2 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 03/10] Control knobs and Documentation 3 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 17:10   ` [MODERATED] " Greg KH
  2018-07-12 14:19 ` [patch V10 05/10] Control knobs and Documentation 5 Thomas Gleixner
                   ` (9 subsequent siblings)
  13 siblings, 1 reply; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 04/10] x86/kvm: Move l1tf setup function
From: Thomas Gleixner <tglx@linutronix.de>

In preparation for allowing run time control for L1D flushing, move the
setup code to the module parameter handler.

In case of pre module init parsing, just store the value and let vmx_init()
do the actual setup after running kvm_init() so that enable_ept has
the correct state.

During run-time invoke it directly from the parameter setter to prepare for
run-time control.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kvm/vmx.c |  198 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 113 insertions(+), 85 deletions(-)

--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -193,7 +193,8 @@ extern const ulong vmx_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 
-static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush = VMENTER_L1D_FLUSH_COND;
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 
 static const struct {
 	const char *option;
@@ -205,33 +206,85 @@ static const struct {
 	{"always",	VMENTER_L1D_FLUSH_ALWAYS},
 };
 
-static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 {
-	unsigned int i;
+	struct page *page;
 
-	if (!s)
-		return -EINVAL;
+	/* If set to 'auto' select 'cond' */
+	if (l1tf == VMENTER_L1D_FLUSH_AUTO)
+		l1tf = VMENTER_L1D_FLUSH_COND;
 
-	for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
-		if (!strcmp(s, vmentry_l1d_param[i].option)) {
-			vmentry_l1d_flush = vmentry_l1d_param[i].cmd;
-			return 0;
-		}
+	if (!enable_ept) {
+		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+		return 0;
 	}
 
+	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+		if (!page)
+			return -ENOMEM;
+		vmx_l1d_flush_pages = page_address(page);
+	}
+
+	l1tf_vmx_mitigation = l1tf;
+
+	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+		static_branch_enable(&vmx_l1d_should_flush);
+	return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+	unsigned int i;
+
+	if (s) {
+		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+			if (!strcmp(s, vmentry_l1d_param[i].option))
+				return vmentry_l1d_param[i].cmd;
+		}
+	}
 	return -EINVAL;
 }
 
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+	int l1tf;
+
+	if (!boot_cpu_has(X86_BUG_L1TF))
+		return 0;
+
+	l1tf = vmentry_l1d_flush_parse(s);
+	if (l1tf < 0)
+		return l1tf;
+
+	/*
+	 * Has vmx_init() run already? If not then this is the pre init
+	 * parameter parsing. In that case just store the value and let
+	 * vmx_init() do the proper setup after enable_ept has been
+	 * established.
+	 */
+	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+		vmentry_l1d_flush_param = l1tf;
+		return 0;
+	}
+
+	return vmx_setup_l1d_flush(l1tf);
+}
+
 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 {
-	return sprintf(s, "%s\n", vmentry_l1d_param[vmentry_l1d_flush].option);
+	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 }
 
 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 	.set = vmentry_l1d_flush_set,
 	.get = vmentry_l1d_flush_get,
 };
-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, &vmentry_l1d_flush, S_IRUGO);
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, S_IRUGO);
 
 struct kvm_vmx {
 	struct kvm kvm;
@@ -9608,7 +9661,7 @@ static void vmx_l1d_flush(struct kvm_vcp
 	 * it. The flush bit gets set again either from vcpu_run() or from
 	 * one of the unsafe VMEXIT handlers.
 	 */
-	always = vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
+	always = l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_ALWAYS;
 	vcpu->arch.l1tf_flush_l1d = always;
 
 	vcpu->stat.l1d_flush++;
@@ -13197,34 +13250,6 @@ static struct kvm_x86_ops vmx_x86_ops __
 	.enable_smi_window = enable_smi_window,
 };
 
-static int __init vmx_setup_l1d_flush(void)
-{
-	struct page *page;
-
-	if (!boot_cpu_has_bug(X86_BUG_L1TF))
-		return 0;
-
-	if (!enable_ept) {
-		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
-		return 0;
-	}
-
-	l1tf_vmx_mitigation = vmentry_l1d_flush;
-
-	if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER)
-		return 0;
-
-	if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
-		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
-		if (!page)
-			return -ENOMEM;
-		vmx_l1d_flush_pages = page_address(page);
-	}
-
-	static_branch_enable(&vmx_l1d_should_flush);
-	return 0;
-}
-
 static void vmx_cleanup_l1d_flush(void)
 {
 	if (vmx_l1d_flush_pages) {
@@ -13235,7 +13260,40 @@ static void vmx_cleanup_l1d_flush(void)
 	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
 }
 
-static void __exit vmx_exit(void);
+static void vmx_exit(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+	synchronize_rcu();
+#endif
+
+	kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+	if (static_branch_unlikely(&enable_evmcs)) {
+		int cpu;
+		struct hv_vp_assist_page *vp_ap;
+		/*
+		 * Reset everything to support using non-enlightened VMCS
+		 * access later (e.g. when we reload the module with
+		 * enlightened_vmcs=0)
+		 */
+		for_each_online_cpu(cpu) {
+			vp_ap =	hv_get_vp_assist_page(cpu);
+
+			if (!vp_ap)
+				continue;
+
+			vp_ap->current_nested_vmcs = 0;
+			vp_ap->enlighten_vmentry = 0;
+		}
+
+		static_branch_disable(&enable_evmcs);
+	}
+#endif
+	vmx_cleanup_l1d_flush();
+}
+module_exit(vmx_exit);
 
 static int __init vmx_init(void)
 {
@@ -13276,12 +13334,18 @@ static int __init vmx_init(void)
 		return r;
 
 	/*
-	 * Must be called after kvm_init() so enable_ept is properly set up
-	 */
-	r = vmx_setup_l1d_flush();
-	if (r) {
-		vmx_exit();
-		return r;
+	 * Must be called after kvm_init() so enable_ept is properly set
+	 * up. Hand the parameter mitigation value in which was stored in
+	 * the pre module init parser. If no parameter was given, it will
+	 * contain 'auto' which will be turned into the default 'cond'
+	 * mitigation mode.
+	 */
+	if (boot_cpu_has(X86_BUG_L1TF)) {
+		r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+		if (r) {
+			vmx_exit();
+			return r;
+		}
 	}
 
 #ifdef CONFIG_KEXEC_CORE
@@ -13292,40 +13356,4 @@ static int __init vmx_init(void)
 
 	return 0;
 }
-
-static void __exit vmx_exit(void)
-{
-#ifdef CONFIG_KEXEC_CORE
-	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
-	synchronize_rcu();
-#endif
-
-	kvm_exit();
-
-#if IS_ENABLED(CONFIG_HYPERV)
-	if (static_branch_unlikely(&enable_evmcs)) {
-		int cpu;
-		struct hv_vp_assist_page *vp_ap;
-		/*
-		 * Reset everything to support using non-enlightened VMCS
-		 * access later (e.g. when we reload the module with
-		 * enlightened_vmcs=0)
-		 */
-		for_each_online_cpu(cpu) {
-			vp_ap =	hv_get_vp_assist_page(cpu);
-
-			if (!vp_ap)
-				continue;
-
-			vp_ap->current_nested_vmcs = 0;
-			vp_ap->enlighten_vmentry = 0;
-		}
-
-		static_branch_disable(&enable_evmcs);
-	}
-#endif
-	vmx_cleanup_l1d_flush();
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
+module_init(vmx_init);

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 05/10] Control knobs and Documentation 5
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (3 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 04/10] Control knobs and Documentation 4 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 17:10   ` [MODERATED] " Greg KH
  2018-07-12 14:19 ` [patch V10 06/10] Control knobs and Documentation 6 Thomas Gleixner
                   ` (8 subsequent siblings)
  13 siblings, 1 reply; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 05/10] x86/kvm: Add static key for flush always
From: Thomas Gleixner <tglx@linutronix.de>

Avoid the conditional in the L1D flush control path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kvm/vmx.c |   16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -192,6 +192,7 @@ module_param(ple_window_max, uint, 0444)
 extern const ulong vmx_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_always);
 
 /* Storage for pre module init parameter parsing */
 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
@@ -232,8 +233,12 @@ static int vmx_setup_l1d_flush(enum vmx_
 
 	l1tf_vmx_mitigation = l1tf;
 
-	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
-		static_branch_enable(&vmx_l1d_should_flush);
+	if (l1tf == VMENTER_L1D_FLUSH_NEVER)
+		return 0;
+
+	static_branch_enable(&vmx_l1d_should_flush);
+	if (l1tf == VMENTER_L1D_FLUSH_ALWAYS)
+		static_branch_enable(&vmx_l1d_flush_always);
 	return 0;
 }
 
@@ -9651,7 +9656,6 @@ static void *vmx_l1d_flush_pages;
 static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 {
 	int size = PAGE_SIZE << L1D_CACHE_ORDER;
-	bool always;
 
 	/*
 	 * This code is only executed when the the flush mode is 'cond' or
@@ -9661,8 +9665,10 @@ static void vmx_l1d_flush(struct kvm_vcp
 	 * it. The flush bit gets set again either from vcpu_run() or from
 	 * one of the unsafe VMEXIT handlers.
 	 */
-	always = l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_ALWAYS;
-	vcpu->arch.l1tf_flush_l1d = always;
+	if (static_branch_unlikely(&vmx_l1d_flush_always))
+		vcpu->arch.l1tf_flush_l1d = true;
+	else
+		vcpu->arch.l1tf_flush_l1d = false;
 
 	vcpu->stat.l1d_flush++;
 

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 06/10] Control knobs and Documentation 6
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (4 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 05/10] Control knobs and Documentation 5 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 16:14   ` [MODERATED] " Josh Poimboeuf
  2018-07-12 17:10   ` Greg KH
  2018-07-12 14:19 ` [patch V10 07/10] Control knobs and Documentation 7 Thomas Gleixner
                   ` (7 subsequent siblings)
  13 siblings, 2 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 06/10] x86/kvm: Serialize L1D flush parameter setter
From: Thomas Gleixner <tglx@linutronix.de>

Writes to the parameter files are not serialized at the sysfs core
level, so local serialization is required.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kvm/vmx.c |    8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -193,6 +193,7 @@ extern const ulong vmx_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_always);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 
 /* Storage for pre module init parameter parsing */
 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
@@ -257,7 +258,7 @@ static int vmentry_l1d_flush_parse(const
 
 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 {
-	int l1tf;
+	int l1tf, ret;
 
 	if (!boot_cpu_has(X86_BUG_L1TF))
 		return 0;
@@ -277,7 +278,10 @@ static int vmentry_l1d_flush_set(const c
 		return 0;
 	}
 
-	return vmx_setup_l1d_flush(l1tf);
+	mutex_lock(&vmx_l1d_flush_mutex);
+	ret = vmx_setup_l1d_flush(l1tf);
+	mutex_unlock(&vmx_l1d_flush_mutex);
+	return ret;
 }
 
 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 07/10] Control knobs and Documentation 7
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (5 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 06/10] Control knobs and Documentation 6 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 17:11   ` [MODERATED] " Greg KH
  2018-07-12 14:19 ` [patch V10 08/10] Control knobs and Documentation 8 Thomas Gleixner
                   ` (6 subsequent siblings)
  13 siblings, 1 reply; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 07/10] x86/kvm: Allow runtime control of L1D flush
From: Thomas Gleixner <tglx@linutronix.de>

All mitigation modes can be switched at run time with a static key now:

 - Use sysfs_streq() instead of strcmp() to handle the trailing new line
   from sysfs writes correctly.
 - Make the static key management handle multiple invocations properly.
 - Set the module parameter file to RW

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c |    2 +-
 arch/x86/kvm/vmx.c         |   13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -661,7 +661,7 @@ void x86_spec_ctrl_setup_ap(void)
 #define pr_fmt(fmt)	"L1TF: " fmt
 
 #if IS_ENABLED(CONFIG_KVM_INTEL)
-enum vmx_l1d_flush_state l1tf_vmx_mitigation __ro_after_init = VMENTER_L1D_FLUSH_AUTO;
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
 EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
 #endif
 
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -234,12 +234,15 @@ static int vmx_setup_l1d_flush(enum vmx_
 
 	l1tf_vmx_mitigation = l1tf;
 
-	if (l1tf == VMENTER_L1D_FLUSH_NEVER)
-		return 0;
+	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+		static_branch_enable(&vmx_l1d_should_flush);
+	else
+		static_branch_disable(&vmx_l1d_should_flush);
 
-	static_branch_enable(&vmx_l1d_should_flush);
 	if (l1tf == VMENTER_L1D_FLUSH_ALWAYS)
 		static_branch_enable(&vmx_l1d_flush_always);
+	else
+		static_branch_disable(&vmx_l1d_flush_always);
 	return 0;
 }
 
@@ -249,7 +252,7 @@ static int vmentry_l1d_flush_parse(const
 
 	if (s) {
 		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
-			if (!strcmp(s, vmentry_l1d_param[i].option))
+			if (sysfs_streq(s, vmentry_l1d_param[i].option))
 				return vmentry_l1d_param[i].cmd;
 		}
 	}
@@ -293,7 +296,7 @@ static const struct kernel_param_ops vme
 	.set = vmentry_l1d_flush_set,
 	.get = vmentry_l1d_flush_get,
 };
-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, S_IRUGO);
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 
 struct kvm_vmx {
 	struct kvm kvm;

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 08/10] Control knobs and Documentation 8
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (6 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 07/10] Control knobs and Documentation 7 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 16:22   ` [MODERATED] " Josh Poimboeuf
  2018-07-12 17:17   ` [MODERATED] " Greg KH
  2018-07-12 14:19 ` [patch V10 09/10] Control knobs and Documentation 9 Thomas Gleixner
                   ` (5 subsequent siblings)
  13 siblings, 2 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 08/10] cpu/hotplug: Expose SMT control init function
From: Jiri Kosina <jkosina@suse.cz>

The L1TF mitigation will gain a command line parameter which allows setting
a combination of hypervisor mitigation and SMT control.

Expose cpu_smt_disable() so the command line parser can tweak SMT settings.

[ tglx: Split out of larger patch and made it preserve an already existing
  	force off state ]

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h |    2 ++
 kernel/cpu.c        |   12 ++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -177,8 +177,10 @@ enum cpuhp_smt_control {
 
 #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
 extern enum cpuhp_smt_control cpu_smt_control;
+extern void cpu_smt_disable(bool force);
 #else
 # define cpu_smt_control		(CPU_SMT_ENABLED)
+static inline void cpu_smt_disable(bool force) { }
 #endif
 
 #endif /* _LINUX_CPU_H_ */
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -347,13 +347,21 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
 EXPORT_SYMBOL_GPL(cpu_smt_control);
 
-static int __init smt_cmdline_disable(char *str)
+void __init cpu_smt_disable(bool force)
 {
+	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
+		return;
+
 	cpu_smt_control = CPU_SMT_DISABLED;
-	if (str && !strcmp(str, "force")) {
+	if (force) {
 		pr_info("SMT: Force disabled\n");
 		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
 	}
+}
+
+static int __init smt_cmdline_disable(char *str)
+{
+	cpu_smt_disable(str && !strcmp(str, "force"));
 	return 0;
 }
 early_param("nosmt", smt_cmdline_disable);

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 09/10] Control knobs and Documentation 9
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (7 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 08/10] Control knobs and Documentation 8 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 16:24   ` [MODERATED] " Josh Poimboeuf
                     ` (2 more replies)
  2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
                   ` (4 subsequent siblings)
  13 siblings, 3 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

From: Jiri Kosina <jkosina@suse.cz>
Subject: [patch V10 09/10] x86/bugs, kvm: introduce boot-time control of L1TF mitigations

Introduce the 'l1tf=' kernel command line option to allow for boot-time
switching of mitigation that is used on processors affected by L1TF.

The possible values are:

  full
	Provides all available mitigations for the L1TF vulnerability. Disables
	SMT and enables all mitigations in the hypervisors. SMT control via
	/sys/devices/system/cpu/smt/control is still possible after boot.
	Hypervisors will issue a warning when the first VM is started in
	a potentially insecure configuration, i.e. SMT enabled or L1D flush
	disabled.

  full,force
	Same as 'full', but disables SMT control. Implies the 'nosmt=force'
	command line option. sysfs control of SMT and the hypervisor flush
	control is disabled.

  flush
	Leaves SMT enabled and enables the conditional hypervisor mitigation.
	Hypervisors will issue a warning when the first VM is started in a
	potentially insecure configuration, i.e. SMT enabled or L1D flush
	disabled.

  flush,nosmt
	Disables SMT and enables the conditional hypervisor mitigation. SMT
	control via /sys/devices/system/cpu/smt/control is still possible
	after boot. If SMT is reenabled or flushing disabled at runtime
	hypervisors will issue a warning.

  flush,nowarn
	Same as 'flush', but hypervisors will not warn when
	a VM is started in a potentially insecure configuration.

  off
	Disables hypervisor mitigations and doesn't emit any warnings.

Default is 'flush'.

Let KVM adhere to these semantics, which means:

  - 'l1tf=full,force'	: Perform L1D flushes. No runtime control
    			  possible.

  - 'l1tf=full'
  - 'l1tf=flush'
  - 'l1tf=flush,nosmt'	: Perform L1D flushes and warn on VM start if
			  SMT has been runtime enabled or L1D flushing
			  has been run-time disabled
			  
  - 'l1tf=flush,nowarn'	: Perform L1D flushes and no warnings are emitted.
  
  - 'l1tf=off'		: L1D flushes are not performed and no warnings
			  are emitted.

KVM can always override the L1D flushing behavior using its 'vmentry_l1d_flush'
module parameter except when l1tf=full,force is set.

This makes KVM's private 'nosmt' option redundant, and as it is a bit
non-systematic anyway (this is something to control globally, not on
hypervisor level), remove that option.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---

v9->v10:
	- Add flush,nosmt option to make it complete
	- Adapt to the KVM runtime control changes
	- Address review comments

v8->v9:
	- after a bit of discussion, switch to L1D flushes being the 
	  default
	- rename the option names (novirt -> flush) (Josh)
	- introduce 'off' option that turns the flushes off
	- remove the Reviewed-by: gathered so far, as we're changing
	  semantics

v7->v8:

        - adjusted the sysfs output for l1tf_vmx_states to avoid 
          potentially confusing "mostly protected"
        - L1TF_MSG_SMT is now printed _once()
        - fixed typos and stylistic issues pointed out by Josh and Ingo 
          both in commitlog and documentation
        - fixed code stylistic issues (ordering of the enum constants)
          pointed out by Ingo
        - fixed comment typo in vmx_l1d_flush()
        - added Ingo's Reviewed-by:

v6->v7:
        - Fixed the CVE number
        - Slightly reworded the parameter description
        - Take the l1tf command line parameter into account when
          initializing the VMX L1TF mitigation and expose the
          vmx mitigation state to the core.
        - Make the sysfs l1tf file show the VMX mitigation state
          in detail.

v5->v6:
        - 'full' implies 'nosmt', 'full,force' implies nosmt=force;
          print KVM warnings accordingly (one state more, and having
          bitflags would be needed for clarity)
        - now that we have full and full,force, drop KVM's private
          nosmt option
        - drop compile-time option to chose the default default :)
        - typo/grammar fixes

v4->v5:
        - rebase on top of KVM bundle

v3->v4:
        - unconfuse the meaning of 'off', both in the documentation and in 
          the code (spotted by Josh)

v2->v3:
        - provide l1tf=[full,novirt,off]
        - provide config option to chose the default
        - let KVM warn in novirt case

v1->v2
	- add forgotten dependency on X86_BUG_L1TF

 Documentation/admin-guide/kernel-parameters.txt |   65 +++++++++++++++++++++---
 arch/x86/include/asm/processor.h                |   12 ++++
 arch/x86/kernel/cpu/bugs.c                      |   44 ++++++++++++++++
 arch/x86/kvm/vmx.c                              |   56 +++++++++++++++-----
 4 files changed, 158 insertions(+), 19 deletions(-)

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1946,12 +1946,6 @@
 			[KVM,ARM] Allow use of GICv4 for direct injection of
 			LPIs.
 
-	kvm-intel.nosmt=[KVM,Intel] If the L1TF CPU bug is present (CVE-2018-3620)
-			and the system has SMT (aka Hyper-Threading) enabled then
-			don't allow guests to be created.
-
-			Default is 0 (allow guests to be created).
-
 	kvm-intel.ept=	[KVM,Intel] Disable extended page tables
 			(virtualized MMU) support on capable Intel chips.
 			Default is 1 (enabled)
@@ -1989,6 +1983,65 @@
 			feature (tagged TLBs) on capable Intel chips.
 			Default is 1 (enabled)
 
+	l1tf=           [X86] Control mitigation of the L1TF vulnerability on
+			      affected CPUs
+
+			The kernel PTE inversion protection is unconditionally
+			enabled and cannot be disabled.
+
+			full
+				Provides all available mitigations for the
+				L1TF vulnerability. Disables SMT and
+				enables all mitigations in the
+				hypervisors.
+
+				SMT control and L1D flush control via the
+				sysfs interface is still possible after
+				boot.  Hypervisors will issue a warning
+				when the first VM is started in a
+				potentially insecure configuration,
+				i.e. SMT enabled or L1D flush disabled.
+
+			full,force
+				Same as 'full', but disables SMT and L1D
+				flush runtime control. Implies the
+				'nosmt=force' command line option.
+				(i.e. sysfs control of SMT is disabled.)
+
+			flush
+				Leaves SMT enabled and enables the default
+				hypervisor mitigation.
+
+				SMT control and L1D flush control via the
+				sysfs interface is still possible after
+				boot.  Hypervisors will issue a warning
+				when the first VM is started in a
+				potentially insecure configuration,
+				i.e. SMT enabled or L1D flush disabled.
+
+			flush,nosmt
+
+				Disables SMT and enables the default
+				hypervisor mitigation.
+
+				SMT control and L1D flush control via the
+				sysfs interface is still possible after
+				boot.  Hypervisors will issue a warning
+				when the first VM is started in a
+				potentially insecure configuration,
+				i.e. SMT enabled or L1D flush disabled.
+
+			flush,nowarn
+				Same as 'flush', but hypervisors will not
+				warn when a VM is started in a potentially
+				insecure configuration.
+
+			off
+				Disables hypervisor mitigations and doesn't
+				emit any warnings.
+
+			Default is 'flush'.
+
 	l2cr=		[PPC]
 
 	l3cr=		[PPC]
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -982,4 +982,16 @@ bool xen_set_default_idle(void);
 void stop_this_cpu(void *dummy);
 void df_debug(struct pt_regs *regs, long error_code);
 void microcode_check(void);
+
+enum l1tf_mitigations {
+	L1TF_MITIGATION_OFF,
+	L1TF_MITIGATION_FLUSH_NOWARN,
+	L1TF_MITIGATION_FLUSH,
+	L1TF_MITIGATION_FLUSH_NOSMT,
+	L1TF_MITIGATION_FULL,
+	L1TF_MITIGATION_FULL_FORCE
+};
+
+extern enum l1tf_mitigations l1tf_mitigation;
+
 #endif /* _ASM_X86_PROCESSOR_H */
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -660,7 +660,11 @@ void x86_spec_ctrl_setup_ap(void)
 #undef pr_fmt
 #define pr_fmt(fmt)	"L1TF: " fmt
 
+/* Default mitigation for L1TF-affected CPUs */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
 #if IS_ENABLED(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(l1tf_mitigation);
+
 enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
 EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
 #endif
@@ -672,6 +676,20 @@ static void __init l1tf_select_mitigatio
 	if (!boot_cpu_has_bug(X86_BUG_L1TF))
 		return;
 
+	switch (l1tf_mitigation) {
+	case L1TF_MITIGATION_OFF:
+	case L1TF_MITIGATION_FLUSH_NOWARN:
+	case L1TF_MITIGATION_FLUSH:
+		break;
+	case L1TF_MITIGATION_FLUSH_NOSMT:
+	case L1TF_MITIGATION_FULL:
+		cpu_smt_disable(false);
+		break;
+	case L1TF_MITIGATION_FULL_FORCE:
+		cpu_smt_disable(true);
+		break;
+	}
+
 #if CONFIG_PGTABLE_LEVELS == 2
 	pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
 	return;
@@ -690,6 +708,32 @@ static void __init l1tf_select_mitigatio
 
 	setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
 }
+
+static int __init l1tf_cmdline(char *str)
+{
+	if (!boot_cpu_has_bug(X86_BUG_L1TF))
+		return 0;
+
+	if (!str)
+		return -EINVAL;
+
+	if (!strcmp(str, "off"))
+		l1tf_mitigation = L1TF_MITIGATION_OFF;
+	else if (!strcmp(str, "flush,nowarn"))
+		l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+	else if (!strcmp(str, "flush"))
+		l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+	else if (!strcmp(str, "flush,nosmt"))
+		l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+	else if (!strcmp(str, "full"))
+		l1tf_mitigation = L1TF_MITIGATION_FULL;
+	else if (!strcmp(str, "full,force"))
+		l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+	return 0;
+}
+early_param("l1tf", l1tf_cmdline);
+
 #undef pr_fmt
 
 #ifdef CONFIG_SYSFS
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,9 +71,6 @@ static const struct x86_cpu_id vmx_cpu_i
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
-static bool __read_mostly nosmt;
-module_param(nosmt, bool, S_IRUGO);
-
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -215,15 +212,31 @@ static int vmx_setup_l1d_flush(enum vmx_
 {
 	struct page *page;
 
-	/* If set to 'auto' select 'cond' */
-	if (l1tf == VMENTER_L1D_FLUSH_AUTO)
-		l1tf = VMENTER_L1D_FLUSH_COND;
-
 	if (!enable_ept) {
 		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 		return 0;
 	}
 
+	/* If set to auto use the default l1tf mitigation method */
+	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+		switch (l1tf_mitigation) {
+		case L1TF_MITIGATION_OFF:
+			l1tf = VMENTER_L1D_FLUSH_NEVER;
+			break;
+		case L1TF_MITIGATION_FLUSH_NOWARN:
+		case L1TF_MITIGATION_FLUSH:
+		case L1TF_MITIGATION_FLUSH_NOSMT:
+			l1tf = VMENTER_L1D_FLUSH_COND;
+			break;
+		case L1TF_MITIGATION_FULL:
+		case L1TF_MITIGATION_FULL_FORCE:
+			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+			break;
+		}
+	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+	}
+
 	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
 		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
@@ -10582,19 +10595,36 @@ static struct kvm_vcpu *vmx_create_vcpu(
 	return ERR_PTR(err);
 }
 
-#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3620 for details.\n"
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
 
 static int vmx_vm_init(struct kvm *kvm)
 {
 	if (!ple_gap)
 		kvm->arch.pause_in_guest = true;
 
-	if (boot_cpu_has(X86_BUG_L1TF) && cpu_smt_control == CPU_SMT_ENABLED) {
-		if (nosmt) {
-			pr_err(L1TF_MSG);
-			return -EOPNOTSUPP;
+	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+		switch (l1tf_mitigation) {
+		case L1TF_MITIGATION_OFF:
+		case L1TF_MITIGATION_FLUSH_NOWARN:
+			/* 'I explicitly don't care' is set */
+			break;
+		case L1TF_MITIGATION_FLUSH:
+		case L1TF_MITIGATION_FLUSH_NOSMT:
+		case L1TF_MITIGATION_FULL:
+			/*
+			 * Warn upon starting the first VM in a potentially
+			 * insecure environment.
+			 */
+			if (cpu_smt_control == CPU_SMT_ENABLED)
+				pr_warn_once(L1TF_MSG_SMT);
+			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+				pr_warn_once(L1TF_MSG_L1D);
+			break;
+		case L1TF_MITIGATION_FULL_FORCE:
+			/* Flush is enforced */
+			break;
 		}
-		pr_warn(L1TF_MSG);
 	}
 	return 0;
 }

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (8 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 09/10] Control knobs and Documentation 9 Thomas Gleixner
@ 2018-07-12 14:19 ` Thomas Gleixner
  2018-07-12 16:03   ` [MODERATED] " Linus Torvalds
                     ` (4 more replies)
  2018-07-12 14:54 ` [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (3 subsequent siblings)
  13 siblings, 5 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:19 UTC (permalink / raw)
  To: speck

Subject: [patch V10 10/10] Documentation: Add section about CPU vulnerabilities
From: Thomas Gleixner <tglx@linutronix.de>

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---

V1 -> V2:

 - Address review comments
 - Reword some paragraphs substantially
 - Add EPT disabling as possible mitigation and document that
   non EPT capable systems are protected
 - Formatting fixes (removes the weird pipes)
 

 Documentation/admin-guide/index.rst |    9 
 Documentation/admin-guide/l1tf.rst  |  572 ++++++++++++++++++++++++++++++++++++
 2 files changed, 581 insertions(+)

--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -17,6 +17,15 @@ etc.
    kernel-parameters
    devices
 
+This section describes CPU vulnerabilities and provides an overview of the
+possible mitigations along with guidance for selecting mitigations if they
+are configurable at compile, boot or run time.
+
+.. toctree::
+   :maxdepth: 1
+
+   l1tf
+
 Here is a set of documents aimed at users who are trying to track down
 problems and bugs in particular.
 
--- /dev/null
+++ b/Documentation/admin-guide/l1tf.rst
@@ -0,0 +1,572 @@
+L1TF - L1 Terminal Fault
+========================
+
+L1 Terminal Fault is a hardware vulnerability which allows unprivileged
+speculative access to data which is available in the Level 1 Data Cache
+when the page table entry controlling the virtual address, which is used
+for the access, has the Present bit cleared or other reserved bits set.
+
+Affected processors
+-------------------
+
+This vulnerability affects a wide range of Intel processors. The
+vulnerability is not present on:
+
+   - Processors from AMD, Centaur and other non Intel vendors
+
+   - Older processor models, where the CPU family is < 6
+
+   - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
+     Penwell, Pineview, Silvermont, Airmont, Merrifield)
+
+   - The Intel Core Duo Yonah variants (2006 - 2008)
+
+   - The Intel XEON PHI family
+
+   - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
+     IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
+     by the Meltdown vulnerability either. These CPUs should become
+     available by end of 2018.
+
+Whether a processor is affected or not can be read out from the L1TF
+vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entries are related to the L1TF vulnerability:
+
+   =============  =================  ==============================
+   CVE-2018-3615  L1 Terminal Fault  SGX related aspects
+   CVE-2018-3620  L1 Terminal Fault  OS, SMM related aspects
+   CVE-2018-3646  L1 Terminal Fault  Virtualization related aspects
+   =============  =================  ==============================
+
+Problem
+-------
+
+If an instruction accesses a virtual address for which the relevant page
+table entry (PTE) has the Present bit cleared or other reserved bits set,
+then speculative execution ignores the invalid PTE and loads the referenced
+data if it is present in the Level 1 Data Cache, as if the page referenced
+by the address bits in the PTE was still present and accessible.
+
+While this is a purely speculative mechanism and the instruction will raise
+a page fault when it is retired eventually, the pure act of loading the
+data and making it available to other speculative instructions opens up the
+opportunity for side channel attacks to unprivileged malicious code,
+similar to the Meltdown attack.
+
+While Meltdown breaks the user space to kernel space protection, L1TF
+allows to attack any physical memory address in the system and the attack
+works across all protection domains. It allows an attack of SGX and also
+works from inside virtual machines because the speculation bypasses the
+extended page table (EPT) protection mechanism.
+
+
+Attack scenarios
+----------------
+
+1. Malicious user space
+^^^^^^^^^^^^^^^^^^^^^^^
+
+   Operating Systems store arbitrary information in the address bits of a
+   PTE which is marked non present. This allows a malicious user space
+   application to attack the physical memory to which these PTEs resolve.
+   In some cases user-space can maliciously influence the information
+   encoded in the address bits of the PTE, thus making attacks more
+   deterministic and more practical.
+
+   The Linux kernel contains a mitigation for this attack vector, PTE
+   inversion, which is permanently enabled and has no performance
+   impact. The kernel ensures that the address bits of PTEs, which are not
+   marked present, never point to cacheable physical memory space.
+
+   A system with an up to date kernel is protected against attacks from
+   malicious user space applications.
+
+2. Malicious guest in a virtual machine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The fact that L1TF breaks all domain protections allows malicious guest
+   OSes, which can control the PTEs directly, and malicious guest user
+   space applications, which run on an unprotected guest kernel lacking the
+   PTE inversion mitigation for L1TF, to attack physical host memory.
+
+   A special aspect of L1TF in the context of virtualization is symmetric
+   multi threading (SMT). The Intel implementation of SMT is called
+   HyperThreading. The fact that Hyperthreads on the affected processors
+   share the L1 Data Cache (L1D) is important for this. As the flaw allows
+   only to attack data which is present in L1D, a malicious guest running
+   on one Hyperthread can attack the data which is brought into the L1D by
+   the context which runs on the sibling Hyperthread of the same physical
+   core. This context can be host OS, host user space or a different guest.
+
+   If the processor does not support Extended Page Tables, the attack is
+   only possible, when the hypervisor does not sanitize the content of the
+   effective (shadow) page tables.
+
+   While solutions exist to mitigate these attack vectors fully, these
+   mitigations are not enabled by default in the Linux kernel because they
+   can affect performance significantly. The kernel provides several
+   mechanisms which can be utilized to address the problem depending on the
+   deployment scenario. The mitigations, their protection scope and impact
+   are described in the next sections.
+
+   The default mitigations and the rationale for choosing them are explained
+   at the end of this document. See :ref:`default_mitigations`.
+
+.. _l1tf_sys_info:
+
+L1TF system information
+-----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current L1TF
+status of the system: whether the system is vulnerable, and which
+mitigations are active. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/l1tf
+
+The possible values in this file are:
+
+  ===========================   ===============================
+  'Not affected'		The processor is not vulnerable
+  'Mitigation: PTE Inversion'	The host protection is active
+  ===========================   ===============================
+
+If KVM/VMX is enabled and the processor is vulnerable then the following
+information is appended to the 'Mitigation: PTE Inversion' part:
+
+  - SMT status:
+
+    =====================  ================
+    'VMX: SMT vulnerable'  SMT is enabled
+    'VMX: SMT disabled'    SMT is disabled
+    =====================  ================
+
+  - L1D Flush mode:
+
+    ================================  ===================================
+    'L1D vulnerable'		      L1D flushing is disabled
+
+    'L1D conditional cache flushes'   L1D flush is conditionally enabled
+
+    'L1D cache flushes'		      L1D flush is unconditionally enabled
+    ================================  ===================================
+
+The resulting grade of protection is discussed in the following sections.
+
+
+Host mitigation mechanism
+-------------------------
+
+The kernel is unconditionally protected against L1TF attacks from malicious
+user space running on the host.
+
+
+Guest mitigation mechanisms
+---------------------------
+
+1. L1D flush on VMENTER
+^^^^^^^^^^^^^^^^^^^^^^^
+
+   To make sure that a guest cannot attack data which is present in the L1D
+   the hypervisor flushes the L1D before entering the guest.
+
+   Flushing the L1D evicts not only the data which should not be accessed
+   by a potentially malicious guest, it also flushes the guest
+   data. Flushing the L1D has a performance impact as the processor has to
+   bring the flushed guest data back into the L1D. Depending on the
+   frequency of VMEXIT/VMENTER and the type of computations in the guest
+   performance degradation in the range of 1% to 50% has been observed. For
+   scenarios where guest VMEXIT/VMENTER are rare the performance impact is
+   minimal. Virtio and mechanisms like posted interrupts are designed to
+   confine the VMEXITs to a bare minimum, but specific configurations and
+   application scenarios might still suffer from a high VMEXIT rate.
+
+   The general recommendation is to enable L1D flush on VMENTER.
+
+   Note, that L1D flush does not prevent the SMT problem because the
+   sibling thread will also bring back its data into the L1D which makes it
+   attackable again.
+
+   L1D flush can be controlled by the administrator via the kernel command
+   line and sysfs control files. See :ref:`mitigation_control_command_line`
+   and :ref:`mitigation_control_kvm`.
+
+.. _guest_confinement:
+
+2. Guest VCPU confinement to dedicated physical cores
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   To address the SMT problem, it is possible to make a guest or a group of
+   guests affine to one or more physical cores. The proper mechanism for
+   that is to utilize exclusive cpusets to ensure that no other guest or
+   host tasks can run on these cores.
+
+   If only a single guest or related guests run on sibling SMT threads on
+   the same physical core then they can only attack their own memory and
+   restricted parts of the host memory.
+
+   Host memory is attackable, when one of the sibling SMT threads runs in
+   host OS (hypervisor) context and the other in guest context. The amount
+   of valuable information from the host OS context depends on the context
+   which the host OS executes, i.e. interrupts, soft interrupts and kernel
+   threads. The amount of valuable data from these contexts cannot be
+   declared as non-interesting for an attacker without deep inspection of
+   the code.
+
+   Note, that assigning guests to a fixed set of physical cores affects the
+   ability of the scheduler to do load balancing and might have negative
+   effects on CPU utilization depending on the hosting scenario. Disabling
+   SMT might be a viable alternative for particular scenarios.
+
+   For further information about confining guests to a single or to a group
+   of cores consult the cpusets documentation:
+
+   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+
+.. _interrupt_isolation:
+
+3. Interrupt affinity
+^^^^^^^^^^^^^^^^^^^^^
+
+   Interrupts can be made affine to logical CPUs. This is not universally
+   true because there are types of interrupts which are truly per CPU
+   interrupts, e.g. the local timer interrupt. Aside of that multi queue
+   devices affine their interrupts to single CPUs or groups of CPUs per
+   queue without allowing the administrator to control the affinities.
+
+   Moving the interrupts, which can be affinity controlled, away from CPUs
+   which run untrusted guests, reduces the attack vector space.
+
+   Whether the interrupts which are affine to CPUs, which run untrusted
+   guests, provide interesting data for an attacker depends on the system
+   configuration and the scenarios which run on the system. While for some
+   of the interrupts it can be assumed that they won't expose interesting
+   information beyond exposing hints about the host OS memory layout, there
+   is no way to make general assumptions.
+
+   Interrupt affinity can be controlled by the administrator via the
+   /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
+   available at:
+
+   https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
+
+.. _smt_control:
+
+4. SMT control
+^^^^^^^^^^^^^^
+
+   To prevent the SMT issues of L1TF it might be necessary to disable SMT
+   completely. Disabling SMT can have a significant performance impact, but
+   the impact depends on the hosting scenario and the type of workloads.
+   The impact of disabling SMT needs also to be weighed against the impact
+   of other mitigation solutions like confining guests to dedicated cores.
+
+   The kernel provides a sysfs interface to retrieve the status of SMT and
+   to control it. It also provides a kernel command line interface to
+   control SMT.
+
+   The kernel command line interface consists of the following options:
+
+     =========== ==========================================================
+     nosmt	 Affects the bring up of the secondary CPUs during boot. The
+		 kernel tries to bring all present CPUs online during the
+		 boot process. "nosmt" makes sure that from each physical
+		 core only one - the so called primary (hyper) thread is
+		 activated. Due to a design flaw of Intel processors related
+	 	 to Machine Check Exceptions the non primary siblings have
+		 to be brought up at least partially and are then shut down
+		 again.  "nosmt" can be undone via the sysfs interface.
+
+     nosmt=force Has the same effect as "nosmt" but it does not allow to
+		 undo the SMT disable via the sysfs interface.
+     =========== ==========================================================
+
+   The sysfs interface provides two files:
+
+   - /sys/devices/system/cpu/smt/control
+   - /sys/devices/system/cpu/smt/active
+
+   /sys/devices/system/cpu/smt/control:
+
+     This file allows to read out the SMT control state and provides the
+     ability to disable or (re)enable SMT. The possible states are:
+
+	==============  ===================================================
+	on		SMT is supported by the CPU and enabled. All
+			logical CPUs can be onlined and offlined without
+			restrictions.
+
+	off		SMT is supported by the CPU and disabled. Only
+			the so called primary SMT threads can be onlined
+			and offlined without restrictions. An attempt to
+			online a non-primary sibling is rejected
+
+	forceoff	Same as 'off' but the state cannot be controlled.
+			Attempts to write to the control file are rejected.
+
+	notsupported	The processor does not support SMT. It's therefore
+			not affected by the SMT implications of L1TF.
+			Attempts to write to the control file are rejected.
+	==============  ===================================================
+
+     The possible states which can be written into this file to control SMT
+     state are:
+
+     - on
+     - off
+     - forceoff
+
+   /sys/devices/system/cpu/smt/active:
+
+     This file reports whether SMT is enabled and active, i.e. if on any
+     physical core two or more sibling threads are online.
+
+   SMT control is also possible at boot time via the l1tf kernel command
+   line parameter in combination with L1D flush control. See
+   :ref:`mitigation_control_command_line`.
+
+5. Disabling EPT
+^^^^^^^^^^^^^^^^
+
+  Disabling EPT for virtual machines provides full mitigation for L1TF even
+  with SMT enabled, because the effective page tables for guests are
+  managed and sanitized by the hypervisor. Though disabling EPT has a
+  significant performance impact especially when the Meltdown mitigation
+  KPTI is enabled.
+
+  EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+There is ongoing research and development for new mitigation mechanisms to
+address the performance impact of disabling SMT or EPT.
+
+.. _mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line allows to control the L1TF mitigations at boot
+time with the option "l1tf=". The valid arguments for this option are:
+
+  ============  ===================================================
+  full		Provides all available mitigations for the L1TF
+		vulnerability. Disables SMT and enables all mitigations in
+		the hypervisors.
+
+		SMT control and L1D flush control via the sysfs interface
+		is still possible after boot.  Hypervisors will issue a
+		warning when the first VM is started in a potentially
+		insecure configuration, i.e. SMT enabled or L1D flush
+		disabled.
+
+  full,force	Same as 'full', but disables SMT and L1D flush runtime
+		control. Implies the 'nosmt=force' command line option.
+		(i.e. sysfs control of SMT is disabled.)
+
+  flush		Leaves SMT enabled and enables the default hypervisor
+		mitigation.
+
+		SMT control and L1D flush control via the sysfs interface
+		is still possible after boot.  Hypervisors will issue a
+		warning when the first VM is started in a potentially
+		insecure configuration, i.e. SMT enabled or L1D flush
+		disabled.
+
+  flush,nosmt	Disables SMT and enables the default hypervisor mitigation.
+
+		SMT control and L1D flush control via the sysfs interface
+		is still possible after boot.  Hypervisors will issue a
+		warning when the first VM is started in a potentially
+		insecure configuration, i.e. SMT enabled or L1D flush
+		disabled.
+
+  flush,nowarn	Same as 'flush', but hypervisors will not warn when a VM is
+		started in a potentially insecure configuration.
+
+  off		Disables hypervisor mitigations and doesn't emit any
+		warnings.
+  ============  ===================================================
+
+The default is 'flush'.
+
+
+.. _mitigation_control_kvm:
+
+Mitigation control for KVM - module parameter
+-------------------------------------------------------------
+
+The KVM hypervisor mitigation mechanism, flushing the L1D cache when
+entering a guest, can be controlled with a module parameter.
+
+The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
+following arguments:
+
+  ============  ==============================================================
+  always	L1D cache flush on every VMENTER.
+
+  cond		Flush L1D on VMENTER only when the code between VMEXIT and
+		VMENTER can leak host memory which is considered
+		interesting for an attacker. This still can leak host memory
+		which allows e.g. to determine the host's address space layout.
+
+  never		Disables the mitigation
+  ============  ==============================================================
+
+The parameter can be provided on the kernel command line, as a module
+parameter when loading the modules, and modified at runtime via the sysfs
+file:
+
+ /sys/module/kvm_intel/parameters/vmentry_l1d_flush
+
+The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
+line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
+module parameter is ignored and writes to the sysfs file are rejected.
+
+
+Mitigation selection guide
+--------------------------
+
+1. No virtualization in use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The system is protected by the kernel unconditionally and no further
+   action is required.
+
+2. Virtualization with trusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   If the guest comes from a trusted source and the guest OS kernel is
+   guaranteed to have the L1TF mitigations in place the system is fully
+   protected against L1TF and no further action is required.
+
+   To avoid the overhead of the default L1D flushing on VMENTER the
+   administrator can disable the flushing via the kernel command line and
+   sysfs control files. See :ref:`mitigation_control_command_line` and
+   :ref:`mitigation_control_kvm`.
+
+
+3. Virtualization with untrusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+3.1. SMT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+  If SMT is not supported by the processor or disabled in the BIOS or by
+  the kernel, it's only required to enforce L1D flushing on VMENTER.
+
+  Conditional L1D flushing is the default behaviour and can be tuned. See
+  :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+3.2. EPT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+  If EPT is not supported by the processor or disabled in the hypervisor,
+  the system is fully protected. SMT can stay enabled and L1D flushing on
+  VMENTER is not required.
+
+  EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+3.3. SMT and EPT supported and active
+"""""""""""""""""""""""""""""""""""""
+
+  If SMT and EPT are supported and active then various degrees of
+  mitigations can be employed:
+
+  - L1D flushing on VMENTER:
+
+    L1D flushing on VMENTER is the minimal protection requirement, but it
+    is only potent in combination with other mitigation methods.
+
+    Conditional L1D flushing is the default behaviour and can be tuned. See
+    :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+  - Guest confinement:
+
+    Confinement of guests to a single or a group of physical cores which
+    are not running any other processes, can reduce the attack surface
+    significantly, but interrupts, soft interrupts and kernel threads can
+    still expose valuable data to a potential attacker. See
+    :ref:`guest_confinement`.
+
+  - Interrupt isolation:
+
+    Isolating the guest CPUs from interrupts can reduce the attack surface
+    further, but still allows a malicious guest to explore a limited amount
+    of host physical memory. This can at least be used to gain knowledge
+    about the host address space layout. The interrupts which have a fixed
+    affinity to the CPUs which run the untrusted guests can, depending on
+    the scenario, still trigger soft interrupts and schedule kernel threads
+    which might expose valuable information. See
+    :ref:`interrupt_isolation`.
+
+The above three mitigation methods combined can provide protection to a
+certain degree, but the risk of the remaining attack surface has to be
+carefully analyzed. For full protection the following methods are
+available:
+
+  - Disabling SMT:
+
+    Disabling SMT and enforcing the L1D flushing provides the maximum
+    amount of protection. This mitigation does not depend on any of the
+    above mitigation methods.
+
+    SMT control and L1D flushing can be tuned by the command line
+    parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
+    time with the matching sysfs control files. See :ref:`smt_control`,
+    :ref:`mitigation_control_command_line` and
+    :ref:`mitigation_control_kvm`.
+
+  - Disabling EPT:
+
+    Disabling EPT provides the maximum amount of protection as well. It does
+    not depend on any of the above mitigation methods. SMT can stay
+    enabled and L1D flushing is not required, but the performance impact is
+    significant.
+
+    EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
+    parameter.
+
+
+.. _default_mitigations:
+
+Default mitigations
+-------------------
+
+  The kernel default mitigations for vulnerable processors are:
+
+  - PTE inversion to protect against malicious user space. This is done
+    unconditionally and cannot be controlled.
+
+  - L1D conditional flushing on VMENTER when EPT is enabled for
+    a guest.
+
+  The kernel does not by default enforce the disabling of SMT, which leaves
+  SMT systems vulnerable when running untrusted guests with EPT enabled.
+
+  The rationale for this choice is:
+
+  - Force disabling SMT can break existing setups, especially with
+    unattended updates.
+
+  - If regular users run untrusted guests on their machine, then L1TF is
+    just an add on to other malware which might be embedded in an untrusted
+    guest, e.g. spam-bots or attacks on the local network.
+
+    There is no technical way to prevent a user from running untrusted code
+    on their machines blindly.
+
+  - It's technically extremely unlikely and from today's knowledge even
+    impossible that L1TF can be exploited via the most popular attack
+    mechanisms like JavaScript because these mechanisms have no way to
+    control PTEs. If this were possible and no other mitigation were
+    possible, then the default might be different.
+
+  - The administrators of cloud and hosting setups have to carefully
+    analyze the risk for their scenarios and make the appropriate
+    mitigation choices, which might even vary across their deployed
+    machines and also result in other changes of their overall setup.
+    There is no way for the kernel to provide a sensible default for this
+    kind of scenario.
+

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (9 preceding siblings ...)
  2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
@ 2018-07-12 14:54 ` Thomas Gleixner
  2018-07-12 19:30 ` [MODERATED] " Josh Poimboeuf
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 14:54 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 119 bytes --]

On Thu, 12 Jul 2018, speck for Thomas Gleixner wrote:
> Git bundle follows in separate mail.

Attached.

Thanks,

	tglx

[-- Attachment #2: Type: application/octet-stream, Size: 75741 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 01/10] Control knobs and Documentation 1
  2018-07-12 14:19 ` [patch V10 01/10] Control knobs and Documentation 1 Thomas Gleixner
@ 2018-07-12 15:34   ` Greg KH
  2018-07-12 15:38     ` Thomas Gleixner
  0 siblings, 1 reply; 49+ messages in thread
From: Greg KH @ 2018-07-12 15:34 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:03PM +0200, speck for Thomas Gleixner wrote:
> +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
> +
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +static const char *l1tf_vmx_states[] = {
> +	[VMENTER_L1D_FLUSH_AUTO]	= "auto",
> +	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
> +	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
> +	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
> +};
> +
> +static ssize_t l1tf_show_state(char *buf)
> +{
> +	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
> +		return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
> +
> +	return sprintf(buf, "%s; VMX: SMT %s, L1D %s\n", L1TF_DEFAULT_MSG,
> +		       cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled",
> +		       l1tf_vmx_states[l1tf_vmx_mitigation]);
> +}
> +#else
> +static ssize_t l1tf_show_state(char *buf)
> +{
> +	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
> +}
> +#endif

I don't see any Documentation/ABI/ update for this new sysfs file in
the patch series before this, or in this one.  Did I miss it?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [patch V10 01/10] Control knobs and Documentation 1
  2018-07-12 15:34   ` [MODERATED] " Greg KH
@ 2018-07-12 15:38     ` Thomas Gleixner
  2018-07-12 15:46       ` Thomas Gleixner
  0 siblings, 1 reply; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 15:38 UTC (permalink / raw)
  To: speck

On Thu, 12 Jul 2018, speck for Greg KH wrote:

> On Thu, Jul 12, 2018 at 04:19:03PM +0200, speck for Thomas Gleixner wrote:
> > +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
> > +
> > +#if IS_ENABLED(CONFIG_KVM_INTEL)
> > +static const char *l1tf_vmx_states[] = {
> > +	[VMENTER_L1D_FLUSH_AUTO]	= "auto",
> > +	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
> > +	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
> > +	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
> > +};
> > +
> > +static ssize_t l1tf_show_state(char *buf)
> > +{
> > +	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
> > +		return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
> > +
> > +	return sprintf(buf, "%s; VMX: SMT %s, L1D %s\n", L1TF_DEFAULT_MSG,
> > +		       cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled",
> > +		       l1tf_vmx_states[l1tf_vmx_mitigation]);
> > +}
> > +#else
> > +static ssize_t l1tf_show_state(char *buf)
> > +{
> > +	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
> > +}
> > +#endif
> 
> I don't see any Documenatation/ABI/ update for this new sysfs file in
> the patch series before this, or in this one.  Did I miss it?

Indeed. Missed that completely.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [patch V10 01/10] Control knobs and Documentation 1
  2018-07-12 15:38     ` Thomas Gleixner
@ 2018-07-12 15:46       ` Thomas Gleixner
  2018-07-12 17:08         ` [MODERATED] " Greg KH
  0 siblings, 1 reply; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-12 15:46 UTC (permalink / raw)
  To: speck


On Thu, 12 Jul 2018, speck for Thomas Gleixner wrote:
> On Thu, 12 Jul 2018, speck for Greg KH wrote:
> > > +static ssize_t l1tf_show_state(char *buf)
> > > +{
> > > +	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
> > > +}
> > > +#endif
> > 
> > I don't see any Documenatation/ABI/ update for this new sysfs file in
> > the patch series before this, or in this one.  Did I miss it?
> 
> Indeed. Missed that completely.

Delta patch below.

Thanks,

	tglx

8<-----------------------

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 65d9b844ecfd..73318225a368 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -476,6 +476,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/spectre_v1
 		/sys/devices/system/cpu/vulnerabilities/spectre_v2
 		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
+		/sys/devices/system/cpu/vulnerabilities/l1tf
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
@@ -488,6 +489,9 @@ Description:	Information about CPU vulnerabilities
 		"Vulnerable"	  CPU is affected and no mitigation in effect
 		"Mitigation: $M"  CPU is affected and mitigation $M is in effect
 
+		Details about the l1tf file can be found in
+		Documentation/admin-guide/l1tf.rst
+
 What:		/sys/devices/system/cpu/smt
 		/sys/devices/system/cpu/smt/active
 		/sys/devices/system/cpu/smt/control

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
@ 2018-07-12 16:03   ` Linus Torvalds
  2018-07-12 16:31     ` Peter Zijlstra
  2018-07-12 16:13   ` Josh Poimboeuf
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 49+ messages in thread
From: Linus Torvalds @ 2018-07-12 16:03 UTC (permalink / raw)
  To: speck



On Thu, 12 Jul 2018, speck for Thomas Gleixner wrote:
> +
> +Default mitigations
> +-------------------
> +
> +  The kernel default mitigations for vulnerable processors are:

Ack.

> +  The kernel does not by default enforce the disabling of SMT, which leaves
> +  SMT systems vulnerable when running untrusted guests with EPT enabled.
> +
> +  The rationale for this choice is:

and good to have this explicitly mentioned.

I doubt distros want to disable SMT by default, but maybe some OpenBSD- 
like "security is everything" distro wants to. I guess that's their 
choice.

                Linus

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
  2018-07-12 16:03   ` [MODERATED] " Linus Torvalds
@ 2018-07-12 16:13   ` Josh Poimboeuf
  2018-07-12 16:26     ` Josh Poimboeuf
  2018-07-13  9:09     ` Thomas Gleixner
  2018-07-12 17:18   ` [MODERATED] " Greg KH
                     ` (2 subsequent siblings)
  4 siblings, 2 replies; 49+ messages in thread
From: Josh Poimboeuf @ 2018-07-12 16:13 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:12PM +0200, speck for Thomas Gleixner wrote:
> +  - L1D Flush mode:
> +
> +    ================================  ===================================
> +    'L1D vulnerable'		      L1D flushing is disabled
> +
> +    'L1D conditional cache flushes'   L1D flush is conditionally enabled
> +
> +    'L1D cache flushes'		      SMT is disabled and L1D flush

Typo in the description:

  s/SMT is disabled and L1D flush/L1D flushing is unconditionally enabled/

?

> +Mitigation control on the kernel command line
> +---------------------------------------------
> +
> +The kernel command line allows to control the L1TF mitigations at boot
> +time with the option "l1tf=". The valid arguments for this option are:
> +
> +  ============  ===================================================
> +  full		Provides all available mitigations for the L1TF
> +		vulnerability. Disables SMT and enables all mitigations in
> +		the hypervisors.
> +
> +		SMT control and L1D flush control via the sysfs interface
> +		is still possible after boot.  Hypervisors will issue a
> +		warning when the first VM is started in a potentially
> +		insecure configuration, i.e. SMT enabled or L1D flush
> +		disabled.
> +
> +  full,force	Same as 'full', but disables SMT and L1D flush runtime
> +		control. Implies the 'nosmt=force' command line option.
> +		(i.e. sysfs control of SMT is disabled.)
> +
> +  flush		Leaves SMT enabled and enables the default hypervisor
> +		mitigation.
> +
> +		SMT control and L1D flush control via the sysfs interface
> +		is still possible after boot.  Hypervisors will issue a
> +		warning when the first VM is started in a potentially
> +		insecure configuration, i.e. SMT enabled or L1D flush
> +		disabled.

The difference between 'flush' and 'full' is quite vague here (and in
kernel-parameters.txt).  It might be a good idea to give a little more
detail.

It also might be helpful to add a pointer to this document in
kernel-parameters.txt, if the user needs more detail.

> +Mitigation control for KVM - module parameter
> +-------------------------------------------------------------
> +
> +The KVM hypervisor mitigation mechanism, flushing the L1D cache when
> +entering a guest, can be controlled with a module parameter.
> +
> +The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
> +following arguments:
> +
> +  ============  ==============================================================
> +  always	L1D cache flush on every VMENTER.
> +
> +  cond		Flush L1D on VMENTER only when the code between VMEXIT and
> +		VMENTER can leak host memory which is considered
> +		interesting for an attacker. This still can leak host memory
> +		which allows e.g. to determine the hosts address space layout.
> +
> +  never		Disables the mitigation
> +  ============  ==============================================================
> +
> +The parameter can be provided on the kernel command line, as a module
> +parameter when loading the modules and at runtime modified via the sysfs
> +file:
> +
> + /sys/module/kvm_intel/parameters/vmentry_l1d_flush
> +
> +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
> +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
> +module parameter is ignored and writes to the sysfs file are rejected.

I didn't see the disadvantage of 'always' (performance) described
anywhere -- here or in the "L1D flush on VMENTER section".  Though maybe
that's obvious...

-- 
Josh

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 03/10] Control knobs and Documentation 3
  2018-07-12 14:19 ` [patch V10 03/10] Control knobs and Documentation 3 Thomas Gleixner
@ 2018-07-12 16:13   ` Josh Poimboeuf
  2018-07-13  9:10     ` Thomas Gleixner
  2018-07-12 17:09   ` [MODERATED] " Greg KH
  1 sibling, 1 reply; 49+ messages in thread
From: Josh Poimboeuf @ 2018-07-12 16:13 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:05PM +0200, speck for Thomas Gleixner wrote:
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -701,6 +701,7 @@ static const char *l1tf_vmx_states[] = {
>  	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
>  	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
>  	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
> +	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"

The missing space looks a bit wonky, add an extra tab for all the
entries instead?  There's also a missing comma in the last entry.

-- 
Josh

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 06/10] Control knobs and Documentation 6
  2018-07-12 14:19 ` [patch V10 06/10] Control knobs and Documentation 6 Thomas Gleixner
@ 2018-07-12 16:14   ` Josh Poimboeuf
  2018-07-12 17:10   ` Greg KH
  1 sibling, 0 replies; 49+ messages in thread
From: Josh Poimboeuf @ 2018-07-12 16:14 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:08PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 06/10] x86/kvm: Serialize L!D flush parameter setter

"L1D"

-- 
Josh

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 08/10] Control knobs and Documentation 8
  2018-07-12 14:19 ` [patch V10 08/10] Control knobs and Documentation 8 Thomas Gleixner
@ 2018-07-12 16:22   ` Josh Poimboeuf
  2018-07-12 17:12     ` Greg KH
  2018-07-13  9:18     ` Thomas Gleixner
  2018-07-12 17:17   ` [MODERATED] " Greg KH
  1 sibling, 2 replies; 49+ messages in thread
From: Josh Poimboeuf @ 2018-07-12 16:22 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:10PM +0200, speck for Thomas Gleixner wrote:
> -static int __init smt_cmdline_disable(char *str)
> +void __init cpu_smt_disable(bool force)
>  {
> +	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)

Also needs to check for CPU_SMT_NOT_SUPPORTED.

> +		return;
> +
>  	cpu_smt_control = CPU_SMT_DISABLED;
> -	if (str && !strcmp(str, "force")) {
> +	if (force) {
>  		pr_info("SMT: Force disabled\n");
>  		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
>  	}
> +}

A bit weird that cpu_smt_control is written to twice in the force case.
An if-else would be clearer.

-- 
Josh

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 09/10] Control knobs and Documentation 9
  2018-07-12 14:19 ` [patch V10 09/10] Control knobs and Documentation 9 Thomas Gleixner
@ 2018-07-12 16:24   ` Josh Poimboeuf
  2018-07-12 17:17     ` Greg KH
  2018-07-12 17:16   ` Greg KH
  2018-07-15  3:12   ` Kees Cook
  2 siblings, 1 reply; 49+ messages in thread
From: Josh Poimboeuf @ 2018-07-12 16:24 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:11PM +0200, speck for Thomas Gleixner wrote:
> -#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3620 for details.\n"
> +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
> +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"

These printks are very long, maybe split them into two lines, both with
the same "L1TF: " pr_fmt prefix?

-- 
Josh

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 16:13   ` Josh Poimboeuf
@ 2018-07-12 16:26     ` Josh Poimboeuf
  2018-07-13  9:09     ` Thomas Gleixner
  1 sibling, 0 replies; 49+ messages in thread
From: Josh Poimboeuf @ 2018-07-12 16:26 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 11:13:30AM -0500, Josh Poimboeuf wrote:
> > +Mitigation control on the kernel command line
> > +---------------------------------------------
> > +
> > +The kernel command line allows to control the L1TF mitigations at boot
> > +time with the option "l1tf=". The valid arguments for this option are:
> > +
> > +  ============  ===================================================
> > +  full		Provides all available mitigations for the L1TF
> > +		vulnerability. Disables SMT and enables all mitigations in
> > +		the hypervisors.
> > +
> > +		SMT control and L1D flush control via the sysfs interface
> > +		is still possible after boot.  Hypervisors will issue a
> > +		warning when the first VM is started in a potentially
> > +		insecure configuration, i.e. SMT enabled or L1D flush
> > +		disabled.
> > +
> > +  full,force	Same as 'full', but disables SMT and L1D flush runtime
> > +		control. Implies the 'nosmt=force' command line option.
> > +		(i.e. sysfs control of SMT is disabled.)
> > +
> > +  flush		Leaves SMT enabled and enables the default hypervisor
> > +		mitigation.
> > +
> > +		SMT control and L1D flush control via the sysfs interface
> > +		is still possible after boot.  Hypervisors will issue a
> > +		warning when the first VM is started in a potentially
> > +		insecure configuration, i.e. SMT enabled or L1D flush
> > +		disabled.
> 
> The difference is between 'flush' and 'full' is quite vague here (and in
> kernel-parameters.txt).  It might be a good idea to give a little more
> detail.

I meant to say between 'flush,nosmt' and 'full'...

-- 
Josh

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 16:03   ` [MODERATED] " Linus Torvalds
@ 2018-07-12 16:31     ` Peter Zijlstra
  0 siblings, 0 replies; 49+ messages in thread
From: Peter Zijlstra @ 2018-07-12 16:31 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 09:03:40AM -0700, speck for Linus Torvalds wrote:

> I doubt distros want to disable SMT by default, but maybe some OpenBSD- 
> like "security is everything" distro wants to. I guess that's their 
> choice.

https://www.theregister.co.uk/2018/06/20/openbsd_disables_intels_hyperthreading/

And they don't even know about this one yet ;-)

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 01/10] Control knobs and Documentation 1
  2018-07-12 15:46       ` Thomas Gleixner
@ 2018-07-12 17:08         ` Greg KH
  0 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:08 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 05:46:38PM +0200, speck for Thomas Gleixner wrote:
> 
> On Thu, 12 Jul 2018, speck for Thomas Gleixner wrote:
> > On Thu, 12 Jul 2018, speck for Greg KH wrote:
> > > > +static ssize_t l1tf_show_state(char *buf)
> > > > +{
> > > > +	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
> > > > +}
> > > > +#endif
> > > 
> > > I don't see any Documenatation/ABI/ update for this new sysfs file in
> > > the patch series before this, or in this one.  Did I miss it?
> > 
> > Indeed. Missed that completely.
> 
> Delta patch below.

Looks good, with that merged in:

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 02/10] Control knobs and Documentation 2
  2018-07-12 14:19 ` [patch V10 02/10] Control knobs and Documentation 2 Thomas Gleixner
@ 2018-07-12 17:09   ` Greg KH
  0 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:09 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:04PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 02/10] x86/kvm: Drop L1TF MSR list approach
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> The VMX module parameter to control the L1D flush should become
> writeable.
> 
> The MSR list is set up at VM init per guest VCPU, but the run time
> switching is based on a static key which is global. Toggling the MSR list
> at run time might be feasible, but for now drop this optimization and use
> the regular MSR write to make run-time switching possible.
> 
> The default mitigation is the conditional flush anyway, so for extra
> paranoid setups this will add some small overhead, but the extra code
> executed is in the noise compared to the flush itself.
> 
> Aside of that the EPT disabled case is not handled correctly at the moment
> and the MSR list magic is in the way for fixing that as well.
> 
> If it's really providing a significant advantage, then this needs to be
> revisited after the code is correct and the control is writable.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 03/10] Control knobs and Documentation 3
  2018-07-12 14:19 ` [patch V10 03/10] Control knobs and Documentation 3 Thomas Gleixner
  2018-07-12 16:13   ` [MODERATED] " Josh Poimboeuf
@ 2018-07-12 17:09   ` Greg KH
  1 sibling, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:09 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:05PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 03/10] x86/l1tf: Handle EPT disabled state proper
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> If Extended Page Tables (EPT) are disabled or not supported, no L1D
> flushing is required. The setup function can just avoid setting up the L1D
> flush for the EPT=n case.
> 
> Invoke it after the hardware setup has been done and enable_ept has the
> correct state and expose the EPT disabled state in the mitigation status as
> well.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 04/10] Control knobs and Documentation 4
  2018-07-12 14:19 ` [patch V10 04/10] Control knobs and Documentation 4 Thomas Gleixner
@ 2018-07-12 17:10   ` Greg KH
  0 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:10 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:06PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 04/10] x86/kvm: Move l1tf setup function
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> In preparation of allowing run time control for L1D flushing, move the
> setup code to the module parameter handler.
> 
> In case of pre module init parsing, just store the value and let vmx_init()
> do the actual setup after running kvm_init() so that enable_ept is having
> the correct state.
> 
> During run-time invoke it directly from the parameter setter to prepare for
> run-time control.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 05/10] Control knobs and Documentation 5
  2018-07-12 14:19 ` [patch V10 05/10] Control knobs and Documentation 5 Thomas Gleixner
@ 2018-07-12 17:10   ` Greg KH
  0 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:10 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:07PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 05/10] x86/kvm: Add static key for flush always
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> Avoid the conditional in the L1D flush control path.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 06/10] Control knobs and Documentation 6
  2018-07-12 14:19 ` [patch V10 06/10] Control knobs and Documentation 6 Thomas Gleixner
  2018-07-12 16:14   ` [MODERATED] " Josh Poimboeuf
@ 2018-07-12 17:10   ` Greg KH
  1 sibling, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:10 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:08PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 06/10] x86/kvm: Serialize L!D flush parameter setter
> From: Thomas Gleixner <tglx@linutronix.de>
> 

I like the '!' :)

> Writes to the parameter files are not serialized at the sysfs core
> level, so local serialization is required.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 07/10] Control knobs and Documentation 7
  2018-07-12 14:19 ` [patch V10 07/10] Control knobs and Documentation 7 Thomas Gleixner
@ 2018-07-12 17:11   ` Greg KH
  0 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:11 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:09PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 07/10] x86/kvm: Allow runtime control of L1D flush
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> All mitigation modes can be switched at run time with a static key now:
> 
>  - Use sysfs_streq() instead of strcmp() to handle the trailing new line
>    from sysfs writes correctly.
>  - Make the static key management handle multiple invocations properly.
>  - Set the module parameter file to RW
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  arch/x86/kernel/cpu/bugs.c |    2 +-
>  arch/x86/kvm/vmx.c         |   13 ++++++++-----
>  2 files changed, 9 insertions(+), 6 deletions(-)
> 
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -661,7 +661,7 @@ void x86_spec_ctrl_setup_ap(void)
>  #define pr_fmt(fmt)	"L1TF: " fmt
>  
>  #if IS_ENABLED(CONFIG_KVM_INTEL)
> -enum vmx_l1d_flush_state l1tf_vmx_mitigation __ro_after_init = VMENTER_L1D_FLUSH_AUTO;
> +enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
>  EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
>  #endif
>  
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -234,12 +234,15 @@ static int vmx_setup_l1d_flush(enum vmx_
>  
>  	l1tf_vmx_mitigation = l1tf;
>  
> -	if (l1tf == VMENTER_L1D_FLUSH_NEVER)
> -		return 0;
> +	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
> +		static_branch_enable(&vmx_l1d_should_flush);
> +	else
> +		static_branch_disable(&vmx_l1d_should_flush);
>  
> -	static_branch_enable(&vmx_l1d_should_flush);
>  	if (l1tf == VMENTER_L1D_FLUSH_ALWAYS)
>  		static_branch_enable(&vmx_l1d_flush_always);
> +	else
> +		static_branch_disable(&vmx_l1d_flush_always);
>  	return 0;
>  }
>  
> @@ -249,7 +252,7 @@ static int vmentry_l1d_flush_parse(const
>  
>  	if (s) {
>  		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
> -			if (!strcmp(s, vmentry_l1d_param[i].option))
> +			if (sysfs_streq(s, vmentry_l1d_param[i].option))
>  				return vmentry_l1d_param[i].cmd;
>  		}
>  	}
> @@ -293,7 +296,7 @@ static const struct kernel_param_ops vme
>  	.set = vmentry_l1d_flush_set,
>  	.get = vmentry_l1d_flush_get,
>  };
> -module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, S_IRUGO);
> +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

Thanks for this last change, that was bothering me...

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 08/10] Control knobs and Documentation 8
  2018-07-12 16:22   ` [MODERATED] " Josh Poimboeuf
@ 2018-07-12 17:12     ` Greg KH
  2018-07-13  9:18     ` Thomas Gleixner
  1 sibling, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:12 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 11:22:57AM -0500, speck for Josh Poimboeuf wrote:
> On Thu, Jul 12, 2018 at 04:19:10PM +0200, speck for Thomas Gleixner wrote:
> > -static int __init smt_cmdline_disable(char *str)
> > +void __init cpu_smt_disable(bool force)
> >  {
> > +	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
> 
> Also needs to check for CPU_SMT_NOT_SUPPORTED.
> 
> > +		return;
> > +
> >  	cpu_smt_control = CPU_SMT_DISABLED;
> > -	if (str && !strcmp(str, "force")) {
> > +	if (force) {
> >  		pr_info("SMT: Force disabled\n");
> >  		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
> >  	}
> > +}
> 
> A bit weird that cpu_smt_control is written to twice in the force case.
> An if-else would be clearer.

It's trying really really hard to force it :)

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 09/10] Control knobs and Documentation 9
  2018-07-12 14:19 ` [patch V10 09/10] Control knobs and Documentation 9 Thomas Gleixner
  2018-07-12 16:24   ` [MODERATED] " Josh Poimboeuf
@ 2018-07-12 17:16   ` Greg KH
  2018-07-15  3:12   ` Kees Cook
  2 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:16 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:11PM +0200, speck for Thomas Gleixner wrote:
> From: Jiri Kosina <jkosina@suse.cz>
> Subject: [patch V10 09/10] x86/bugs, kvm: introduce boot-time control of L1TF mitigations
> 
> Introduce the 'l1tf=' kernel command line option to allow for boot-time
> switching of mitigation that is used on processors affected by L1TF.
> 
> The possible values are:
> 
>   full
> 	Provides all available mitigations for the L1TF vulnerability. Disables
> 	SMT and enables all mitigations in the hypervisors. SMT control via
> 	/sys/devices/system/cpu/smt/control is still possible after boot.
> 	Hypervisors will issue a warning when the first VM is started in
> 	a potentially insecure configuration, i.e. SMT enabled or L1D flush
> 	disabled.
> 
>   full,force
> 	Same as 'full', but disables SMT control. Implies the 'nosmt=force'
> 	command line option. sysfs control of SMT and the hypervisor flush
> 	control is disabled.
> 
>   flush
> 	Leaves SMT enabled and enables the conditional hypervisor mitigation.
> 	Hypervisors will issue a warning when the first VM is started in a
> 	potentially insecure configuration, i.e. SMT enabled or L1D flush
> 	disabled.
> 
>   flush,nosmt
> 	Disables SMT and enables the conditional hypervisor mitigation. SMT
> 	control via /sys/devices/system/cpu/smt/control is still possible
> 	after boot. If SMT is reenabled or flushing disabled at runtime
> 	hypervisors will issue a warning.
> 
>   flush,nowarn
> 	Same as 'flush', but hypervisors will not warn when
> 	a VM is started in a potentially insecure configuration.
> 
>   off
> 	Disables hypervisor mitigations and doesn't emit any warnings.
> 
> Default is 'flush'.
> 
> Let KVM adhere to these semantics, which means:
> 
> >   - 'l1tf=full,force'	: Perform L1D flushes. No runtime control
>     			  possible.
> 
>   - 'l1tf=full'
> >   - 'l1tf=flush'
>   - 'l1tf=flush,nosmt'	: Perform L1D flushes and warn on VM start if
> 			  SMT has been runtime enabled or L1D flushing
> > 			  has been run-time disabled
> 			  
>   - 'l1tf=flush,nowarn'	: Perform L1D flushes and no warnings are emitted.
>   
>   - 'l1tf=off'		: L1D flushes are not performed and no warnings
> 			  are emitted.
> 
> KVM can always override the L1D flushing behavior using its 'vmentry_l1d_flush'
> > module parameter except when l1tf=full,force is set.
> 
> This makes KVM's private 'nosmt' option redundant, and as it is a bit
> non-systematic anyway (this is something to control globally, not on
> hypervisor level), remove that option.
> 
> Signed-off-by: Jiri Kosina <jkosina@suse.cz>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 09/10] Control knobs and Documentation 9
  2018-07-12 16:24   ` [MODERATED] " Josh Poimboeuf
@ 2018-07-12 17:17     ` Greg KH
  0 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:17 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 11:24:50AM -0500, speck for Josh Poimboeuf wrote:
> On Thu, Jul 12, 2018 at 04:19:11PM +0200, speck for Thomas Gleixner wrote:
> > -#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3620 for details.\n"
> > +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
> > +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
> 
> These printks are very long, maybe split them into two lines, both with
> the same "L1TF: " pr_fmt prefix?

Ugh, why?  It doesn't really matter, it's just a really long kernel log
line, we have not put length limits on it before, right?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 08/10] Control knobs and Documentation 8
  2018-07-12 14:19 ` [patch V10 08/10] Control knobs and Documentation 8 Thomas Gleixner
  2018-07-12 16:22   ` [MODERATED] " Josh Poimboeuf
@ 2018-07-12 17:17   ` Greg KH
  1 sibling, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:17 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:10PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 08/10] cpu/hotplug: Expose SMT control init function
> From: Jiri Kosina <jkosina@suse.cz>
> 
> > The L1TF mitigation will gain a command line parameter which allows to set
> a combination of hypervisor mitigation and SMT control.
> 
> Expose cpu_smt_disable() so the command line parser can tweak SMT settings.
> 
> [ tglx: Split out of larger patch and made it preserve an already existing
>   	force off state ]
> 
> Signed-off-by: Jiri Kosina <jkosina@suse.cz>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
  2018-07-12 16:03   ` [MODERATED] " Linus Torvalds
  2018-07-12 16:13   ` Josh Poimboeuf
@ 2018-07-12 17:18   ` Greg KH
  2018-07-15  7:30   ` Borislav Petkov
  2018-07-27 16:41   ` Dave Hansen
  4 siblings, 0 replies; 49+ messages in thread
From: Greg KH @ 2018-07-12 17:18 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:12PM +0200, speck for Thomas Gleixner wrote:
> Subject: [patch V10 10/10] Documentation: Add section about CPU vulnerabilities
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

No changelog text?  :)

Anyway, text for the file looks good to me, nice work with it.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (10 preceding siblings ...)
  2018-07-12 14:54 ` [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
@ 2018-07-12 19:30 ` Josh Poimboeuf
  2018-07-13 15:03   ` Thomas Gleixner
  2018-07-13  8:30 ` [MODERATED] " Jiri Kosina
  2018-07-13 16:22 ` Paolo Bonzini
  13 siblings, 1 reply; 49+ messages in thread
From: Josh Poimboeuf @ 2018-07-12 19:30 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:02PM +0200, speck for Thomas Gleixner wrote:
> The following series provides the following changes:
> 
>   - Fix EPT=off handling so it avoids flushing
>   
>   - Expose proper VMX mitigation information in sysfs
> 
>   - Drops the MSR list mechanism for flush 'always' to prepare for runtime
>     control. The default flush mechanism is conditional anyway and the MSR
>     list is set up at guest init time, which is nasty to run time switch
>     especially because the static key is a global control which can be
>     flipped by an update.
> 
>   - Make the flush always/conditional static key based.
> 
>   - Serialize the kvm parameter setter function
> 
>   - Enable runtime control for the kvm parameter
> 
>   - Add the l1tf command line option. It's not run time controllable as it
>     does not make sense to have 3 knobs at runtime. For the command line
>     the combo knob setting the default is convenient
> 
>   - Documentation update
> 
> This takes the review comments into account as much as still applicable.
> 
> Thanks to Jiri for testing the lot and debugging and fixing my brainfarts!
> 
> Git bundle follows in separate mail.

It looks great to me (other than the few minor issues I already pointed
out).

Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>

-- 
Josh

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (11 preceding siblings ...)
  2018-07-12 19:30 ` [MODERATED] " Josh Poimboeuf
@ 2018-07-13  8:30 ` Jiri Kosina
  2018-07-13 16:22 ` Paolo Bonzini
  13 siblings, 0 replies; 49+ messages in thread
From: Jiri Kosina @ 2018-07-13  8:30 UTC (permalink / raw)
  To: speck

On Thu, 12 Jul 2018, speck for Thomas Gleixner wrote:

> The following series provides the following changes:
> 
>   - Fix EPT=off handling so it avoids flushing
>   
>   - Expose proper VMX mitigation information in sysfs
> 
>   - Drops the MSR list mechanism for flush 'always' to prepare for runtime
>     control. The default flush mechanism is conditional anyway and the MSR
>     list is set up at guest init time, which is nasty to run time switch
>     especially because the static key is a global control which can be
>     flipped by an update.
> 
>   - Make the flush always/conditional static key based.
> 
>   - Serialize the kvm parameter setter function
> 
>   - Enable runtime control for the kvm parameter
> 
>   - Add the l1tf command line option. It's not run time controllable as it
>     does not make sense to have 3 knobs at runtime. For the command line
>     the combo knob setting the default is convenient
> 
>   - Documentation update
> 
> This takes the review comments into account as much as still applicable.
> 
> Thanks to Jiri for testing the lot and debugging and fixing my brainfarts!

Yup, I've been playing with it yesterday, so this is

	Tested-by: Jiri Kosina <jkosina@suse.cz>

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 16:13   ` Josh Poimboeuf
  2018-07-12 16:26     ` Josh Poimboeuf
@ 2018-07-13  9:09     ` Thomas Gleixner
  1 sibling, 0 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-13  9:09 UTC (permalink / raw)
  To: speck

On Thu, 12 Jul 2018, speck for Josh Poimboeuf wrote:
> On Thu, Jul 12, 2018 at 04:19:12PM +0200, speck for Thomas Gleixner wrote:
> > +  - L1D Flush mode:
> > +
> > +    ================================  ===================================
> > +    'L1D vulnerable'		      L1D flushing is disabled
> > +
> > +    'L1D conditional cache flushes'   L1D flush is conditionally enabled
> > +
> > +    'L1D cache flushes'		      SMT is disabled and L1D flush
> 
> Typo in the description:
> 
>   s/SMT is disabled and L1D flush/L1D flushing is unconditionally enabled/

Yeah. Copy and paste is a wonderful tool if used with brain enabled.
 
> 
> The difference between 'flush' and 'full' is quite vague here (and in
> kernel-parameters.txt).  It might be a good idea to give a little more
> detail.

Done.

> It also might be helpful to add a pointer to this document in
> kernel-parameters.txt, if the user needs more detail.

Done.

> > +
> > +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
> > +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
> > +module parameter is ignored and writes to the sysfs file are rejected.
> 
> I didn't see the disadvantage of 'always' (performance) described
> anywhere -- here or in the "L1D flush on VMENTER section".  Though maybe
> that's obvious...

Added a paragraph to the L1D flush section and a link to that section here.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [patch V10 03/10] Control knobs and Documentation 3
  2018-07-12 16:13   ` [MODERATED] " Josh Poimboeuf
@ 2018-07-13  9:10     ` Thomas Gleixner
  0 siblings, 0 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-13  9:10 UTC (permalink / raw)
  To: speck

On Thu, 12 Jul 2018, speck for Josh Poimboeuf wrote:

> On Thu, Jul 12, 2018 at 04:19:05PM +0200, speck for Thomas Gleixner wrote:
> > --- a/arch/x86/kernel/cpu/bugs.c
> > +++ b/arch/x86/kernel/cpu/bugs.c
> > @@ -701,6 +701,7 @@ static const char *l1tf_vmx_states[] = {
> >  	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
> >  	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
> >  	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
> > +	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"
> 
> The missing space looks a bit wonky, add an extra tab for all the
> entries instead?  There's also a missing comma in the last entry.

Fixed.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [patch V10 08/10] Control knobs and Documentation 8
  2018-07-12 16:22   ` [MODERATED] " Josh Poimboeuf
  2018-07-12 17:12     ` Greg KH
@ 2018-07-13  9:18     ` Thomas Gleixner
  1 sibling, 0 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-13  9:18 UTC (permalink / raw)
  To: speck

On Thu, 12 Jul 2018, speck for Josh Poimboeuf wrote:

> On Thu, Jul 12, 2018 at 04:19:10PM +0200, speck for Thomas Gleixner wrote:
> > -static int __init smt_cmdline_disable(char *str)
> > +void __init cpu_smt_disable(bool force)
> >  {
> > +	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
> 
> Also needs to check for CPU_SMT_NOT_SUPPORTED.

For clarity's sake, yes. Functionally no, as that is before the final decision
can be made which will set NOT_SUPPORTED.

But that made me look deeper and NOT_SUPPORTED is actually set too late
with those changes. Until now it was sufficient just to set it before the
sysfs control file is set up.

/me fixes.

> > +		return;
> > +
> >  	cpu_smt_control = CPU_SMT_DISABLED;
> > -	if (str && !strcmp(str, "force")) {
> > +	if (force) {
> >  		pr_info("SMT: Force disabled\n");
> >  		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
> >  	}
> > +}
> 
> A bit weird that cpu_smt_control is written to twice in the force case.

That's to put more emphasis on it :) Fixed it up.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-12 19:30 ` [MODERATED] " Josh Poimboeuf
@ 2018-07-13 15:03   ` Thomas Gleixner
  0 siblings, 0 replies; 49+ messages in thread
From: Thomas Gleixner @ 2018-07-13 15:03 UTC (permalink / raw)
  To: speck

On Thu, 12 Jul 2018, speck for Josh Poimboeuf wrote:
> On Thu, Jul 12, 2018 at 04:19:02PM +0200, speck for Thomas Gleixner wrote:
> > Git bundle follows in separate mail.
> 
> It looks great to me (other than the few minor issues I already pointed
> out).
> 
> Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>

Thanks Josh! The delta patch against the V10 lot with the last fixups is
below.

Thanks,

	tglx
8<-------------
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1993,7 +1993,7 @@
 				Provides all available mitigations for the
 				L1TF vulnerability. Disables SMT and
 				enables all mitigations in the
-				hypervisors.
+				hypervisors, i.e. unconditional L1D flush.
 
 				SMT control and L1D flush control via the
 				sysfs interface is still possible after
@@ -2010,7 +2010,8 @@
 
 			flush
 				Leaves SMT enabled and enables the default
-				hypervisor mitigation.
+				hypervisor mitigation, i.e. conditional
+				L1D flush.
 
 				SMT control and L1D flush control via the
 				sysfs interface is still possible after
@@ -2042,6 +2043,8 @@
 
 			Default is 'flush'.
 
+			For details see: Documentation/admin-guide/l1tf.rst
+
 	l2cr=		[PPC]
 
 	l3cr=		[PPC]
--- a/Documentation/admin-guide/l1tf.rst
+++ b/Documentation/admin-guide/l1tf.rst
@@ -146,13 +146,13 @@ If KVM/VMX is enabled and the processor
 
   - L1D Flush mode:
 
-    ================================  ===================================
+    ================================  ====================================
     'L1D vulnerable'		      L1D flushing is disabled
 
     'L1D conditional cache flushes'   L1D flush is conditionally enabled
 
-    'L1D cache flushes'		      SMT is disabled and L1D flush
-    ================================  ===================================
+    'L1D cache flushes'		      L1D flush is unconditionally enabled
+    ================================  ====================================
 
 The resulting grade of protection is discussed in the following sections.
 
@@ -167,6 +167,8 @@ user space running on the host.
 Guest mitigation mechanisms
 ---------------------------
 
+.. _l1d_flush:
+
 1. L1D flush on VMENTER
 ^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -184,9 +186,25 @@ Guest mitigation mechanisms
    confine the VMEXITs to a bare minimum, but specific configurations and
    application scenarios might still suffer from a high VMEXIT rate.
 
-   The general recommendation is to enable L1D flush on VMENTER.
+   The kernel provides two L1D flush modes:
+    - conditional ('cond')
+    - unconditional ('always')
+
+   The conditional mode avoids L1D flushing after VMEXITs which execute
+   only audited code paths before the corresponding VMENTER. These code
+   paths have been verified that they cannot expose secrets or other
+   interesting data to an attacker, but they can leak information about the
+   address space layout of the hypervisor.
+
+   Unconditional mode flushes L1D on all VMENTER invocations and provides
+   maximum protection. It has a higher overhead than the conditional
+   mode. The overhead cannot be quantified correctly as it depends on the
+   work load scenario and the resulting number of VMEXITs.
+
+   The general recommendation is to enable L1D flush on VMENTER. The kernel
+   defaults to conditional mode on affected processors.
 
-   Note, that L1D flush does not prevent the SMT problem because the
+   **Note**, that L1D flush does not prevent the SMT problem because the
    sibling thread will also bring back its data into the L1D which makes it
    attackable again.
 
@@ -216,10 +234,11 @@ Guest mitigation mechanisms
    declared as non-interesting for an attacker without deep inspection of
    the code.
 
-   Note, that assigning guests to a fixed set of physical cores affects the
-   ability of the scheduler to do load balancing and might have negative
-   effects on CPU utilization depending on the hosting scenario. Disabling
-   SMT might be a viable alternative for particular scenarios.
+   **Note**, that assigning guests to a fixed set of physical cores affects
+   the ability of the scheduler to do load balancing and might have
+   negative effects on CPU utilization depending on the hosting
+   scenario. Disabling SMT might be a viable alternative for particular
+   scenarios.
 
    For further information about confining guests to a single or to a group
    of cores consult the cpusets documentation:
@@ -276,7 +295,7 @@ Guest mitigation mechanisms
 		 boot process. "nosmt" makes sure that from each physical
 		 core only one - the so called primary (hyper) thread is
 		 activated. Due to a design flaw of Intel processors related
-	 	 to Machine Check Exceptions the non primary siblings have
+		 to Machine Check Exceptions the non primary siblings have
 		 to be brought up at least partially and are then shut down
 		 again.  "nosmt" can be undone via the sysfs interface.
 
@@ -350,10 +369,10 @@ Mitigation control on the kernel command
 The kernel command line allows to control the L1TF mitigations at boot
 time with the option "l1tf=". The valid arguments for this option are:
 
-  ============  ===================================================
+  ============  =============================================================
   full		Provides all available mitigations for the L1TF
 		vulnerability. Disables SMT and enables all mitigations in
-		the hypervisors.
+		the hypervisors, i.e. unconditional L1D flushing
 
 		SMT control and L1D flush control via the sysfs interface
 		is still possible after boot.  Hypervisors will issue a
@@ -366,7 +385,7 @@ The kernel command line allows to contro
 		(i.e. sysfs control of SMT is disabled.)
 
   flush		Leaves SMT enabled and enables the default hypervisor
-		mitigation.
+		mitigation, i.e. conditional L1D flushing
 
 		SMT control and L1D flush control via the sysfs interface
 		is still possible after boot.  Hypervisors will issue a
@@ -374,7 +393,8 @@ The kernel command line allows to contro
 		insecure configuration, i.e. SMT enabled or L1D flush
 		disabled.
 
-  flush,nosmt	Disables SMT and enables the default hypervisor mitigation.
+  flush,nosmt	Disables SMT and enables the default hypervisor mitigation,
+		i.e. conditional L1D flushing.
 
 		SMT control and L1D flush control via the sysfs interface
 		is still possible after boot.  Hypervisors will issue a
@@ -387,9 +407,9 @@ The kernel command line allows to contro
 
   off		Disables hypervisor mitigations and doesn't emit any
 		warnings.
-  ============  ===================================================
+  ============  =============================================================
 
-The default is 'flush'.
+The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
 
 
 .. _mitigation_control_kvm:
@@ -418,7 +438,7 @@ The parameter can be provided on the ker
 parameter when loading the modules and at runtime modified via the sysfs
 file:
 
- /sys/module/kvm_intel/parameters/vmentry_l1d_flush
+/sys/module/kvm_intel/parameters/vmentry_l1d_flush
 
 The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
 line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
@@ -569,4 +589,3 @@ Default mitigations
     machines and also result in other changes of their overall setup.
     There is no way for the kernel to provide a sensible default for this
     kind of scenarios.
-
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -59,6 +59,12 @@ void __init check_bugs(void)
 {
 	identify_boot_cpu();
 
+	/*
+	 * identify_boot_cpu() initialized SMT support information, let the
+	 * core code know.
+	 */
+	cpu_smt_check_topology();
+
 	if (!IS_ENABLED(CONFIG_SMP)) {
 		pr_info("CPU: ");
 		print_cpu_info(&boot_cpu_data);
@@ -742,11 +748,11 @@ early_param("l1tf", l1tf_cmdline);
 
 #if IS_ENABLED(CONFIG_KVM_INTEL)
 static const char *l1tf_vmx_states[] = {
-	[VMENTER_L1D_FLUSH_AUTO]	= "auto",
-	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
-	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
-	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
-	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"
+	[VMENTER_L1D_FLUSH_AUTO]		= "auto",
+	[VMENTER_L1D_FLUSH_NEVER]		= "vulnerable",
+	[VMENTER_L1D_FLUSH_COND]		= "conditional cache flushes",
+	[VMENTER_L1D_FLUSH_ALWAYS]		= "cache flushes",
+	[VMENTER_L1D_FLUSH_EPT_DISABLED]	= "EPT disabled",
 };
 
 static ssize_t l1tf_show_state(char *buf)
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -178,9 +178,11 @@ enum cpuhp_smt_control {
 #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
 extern enum cpuhp_smt_control cpu_smt_control;
 extern void cpu_smt_disable(bool force);
+extern void cpu_smt_check_topology(void);
 #else
 # define cpu_smt_control		(CPU_SMT_ENABLED)
 static inline void cpu_smt_disable(bool force) { }
+static inline void cpu_smt_check_topology(void) { }
 #endif
 
 #endif /* _LINUX_CPU_H_ */
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -349,16 +349,28 @@ EXPORT_SYMBOL_GPL(cpu_smt_control);
 
 void __init cpu_smt_disable(bool force)
 {
-	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
+	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+		cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
 		return;
 
-	cpu_smt_control = CPU_SMT_DISABLED;
 	if (force) {
 		pr_info("SMT: Force disabled\n");
 		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
+	} else {
+		cpu_smt_control = CPU_SMT_DISABLED;
 	}
 }
 
+/*
+ * The decision whether SMT is supported can only be done after the full
+ * CPU identification. Called from architecture code.
+ */
+void __init cpu_smt_check_topology(void)
+{
+	if (!topology_smt_supported())
+		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+}
+
 static int __init smt_cmdline_disable(char *str)
 {
 	cpu_smt_disable(str && !strcmp(str, "force"));
@@ -2113,9 +2125,6 @@ static const struct attribute_group cpuh
 
 static int __init cpu_smt_state_init(void)
 {
-	if (!topology_smt_supported())
-		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
-
 	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
 				  &cpuhp_smt_attr_group);
 }
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -476,6 +476,7 @@ What:		/sys/devices/system/cpu/vulnerabi
 		/sys/devices/system/cpu/vulnerabilities/spectre_v1
 		/sys/devices/system/cpu/vulnerabilities/spectre_v2
 		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
+		/sys/devices/system/cpu/vulnerabilities/l1tf
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
@@ -488,6 +489,9 @@ Description:	Information about CPU vulne
 		"Vulnerable"	  CPU is affected and no mitigation in effect
 		"Mitigation: $M"  CPU is affected and mitigation $M is in effect
 
+		Details about the l1tf file can be found in
+		Documentation/admin-guide/l1tf.rst
+
 What:		/sys/devices/system/cpu/smt
 		/sys/devices/system/cpu/smt/active
 		/sys/devices/system/cpu/smt/control

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
                   ` (12 preceding siblings ...)
  2018-07-13  8:30 ` [MODERATED] " Jiri Kosina
@ 2018-07-13 16:22 ` Paolo Bonzini
  2018-07-13 16:56   ` Andrew Cooper
  2018-07-13 17:28   ` Konrad Rzeszutek Wilk
  13 siblings, 2 replies; 49+ messages in thread
From: Paolo Bonzini @ 2018-07-13 16:22 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 5676 bytes --]

On 12/07/2018 16:19, speck for Thomas Gleixner wrote:
> The following series provides the following changes:
> 
>   - Fix EPT=off handling so it avoids flushing
>   
>   - Expose proper VMX mitigation information in sysfs
> 
>   - Drops the MSR list mechanism for flush 'always' to prepare for runtime
>     control. The default flush mechanism is conditional anyway and the MSR
>     list is set up at guest init time, which is nasty to run time switch
>     especially because the static key is a global control which can be
>     flipped by an update.
> 
>   - Make the flush always/conditional static key based.
> 
>   - Serialize the kvm parameter setter function
> 
>   - Enable runtime control for the kvm parameter
> 
>   - Add the l1tf command line option. It's not run time controllable as it
>     does not make sense to have 3 knobs at runtime. For the command line
>     the combo knob setting the default is convenient
> 
>   - Documentation update
> 
> This takes the review comments into account as much as still applicable.
> 
> Thanks to Jiri for testing the lot and debugging and fixing my brainfarts!
> 
> Git bundle follows in separate mail.

Another case on top of this series...

---------------------- 8< --------------------
From a0f605fed99cf1623f8716b22c11113653c258a3 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 13 Jul 2018 18:15:29 +0200
Subject: [PATCH] kvm: vmx: disable L1D flush when running as a nested
 hypervisor

VMENTER operations from the nested hypervisor into the nested guest
will always be processed by the bare metal hypervisor.  Therefore,
when running as a nested hypervisor, doing L1D cache flushes on vmentry
will result in twice the work and twice the slowdown, for no benefit.

Special case this situation and report it in sysfs.

(The three levels involved are usually called L0/L1/L2 in KVM slang.  I'm
avoiding that naming because of the confusion with cache levels).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/admin-guide/l1tf.rst | 23 ++++++++++++++++++++++-
 arch/x86/include/asm/vmx.h         |  1 +
 arch/x86/kernel/cpu/bugs.c         |  3 -++
 arch/x86/kvm/vmx.c                 |  5 +++++
 4 files changed, 30 insertions(+), 2 deletion(-)

diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
index 5adf7d7c2b4e..a962afbce156 100644
--- a/Documentation/admin-guide/l1tf.rst
+++ b/Documentation/admin-guide/l1tf.rst
@@ -528,6 +528,27 @@ available:
     EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
     parameter.
 
+3.4. Nested virtual machines
+""""""""""""""""""""""""""""
+
+When nested virtualization is in use, three operating systems are involved:
+the bare metal hypervisor, the nested hypervisor, and the nested virtual
+machine.  VMENTER operations from the nested hypervisor into the nested
+guest will always be processed by the bare metal hypervisor.  Therefore,
+when running as a nested hypervisor, KVM will not perform any L1D cache
+flush, assuming instead that the "outermost" hypervisor takes care of
+flushing the L1D cache on VMENTER to nested guests.
+
+When running as a bare metal hypervisor, instead, KVM will:
+
+ - flush the L1D cache on every switch from nested hypervisor to
+   nested virtual machine, so that the nested hypervisor's secrets
+   are not exposed to the nested virtual machine;
+
+ - flush the L1D cache on every switch from nested virtual machine to
+   nested hypervisor; this is a complex operation, and flushing the L1D
+   cache avoids that the bare metal hypervisor's secrets be exposed
+   to the nested virtual machine.
 
 .. _default_mitigations:
 
@@ -540,7 +561,7 @@ Default mitigations
     unconditionally and cannot be controlled.
 
   - L1D conditional flushing on VMENTER when EPT is enabled for
-    a guest.
+    a guest, and the guest is not a nested virtual machine.
 
   The kernel does not by default enforce the disabling of SMT, which leaves
   SMT systems vulnerable when running untrusted guests with EPT enabled.
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 94a8547d915b..7c0438751fa5 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -579,6 +579,7 @@ enum vmx_l1d_flush_state {
 	VMENTER_L1D_FLUSH_COND,
 	VMENTER_L1D_FLUSH_ALWAYS,
 	VMENTER_L1D_FLUSH_EPT_DISABLED,
+	VMENTER_L1D_FLUSH_NESTED_VM,
 };
 
 extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d63cb1501784..87828f2f64a5 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -745,7 +745,8 @@ static const char *l1tf_vmx_states[] = {
 	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
 	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
 	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
-	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"
+	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled",
+	[VMENTER_L1D_FLUSH_NESTED_VM]   = "nested virtual machine",
 };
 
 static ssize_t l1tf_show_state(char *buf)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c5c0118b126d..a7e41ac4256f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -212,6 +212,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 {
 	struct page *page;
 
+	if (static_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NESTED_VM;
+		return 0;
+	}
+
 	if (!enable_ept) {
 		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 		return 0;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-13 16:22 ` Paolo Bonzini
@ 2018-07-13 16:56   ` Andrew Cooper
  2018-07-13 17:01     ` Paolo Bonzini
  2018-07-13 17:28   ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 49+ messages in thread
From: Andrew Cooper @ 2018-07-13 16:56 UTC (permalink / raw)
  To: speck

On 13/07/18 17:22, speck for Paolo Bonzini wrote:
> On 12/07/2018 16:19, speck for Thomas Gleixner wrote:
>> The following series provides the following changes:
>>
>>   - Fix EPT=off handling so it avoids flushing
>>   
>>   - Expose proper VMX mitigation information in sysfs
>>
>>   - Drops the MSR list mechanism for flush 'always' to prepare for runtime
>>     control. The default flush mechanism is conditional anyway and the MSR
>>     list is set up at guest init time, which is nasty to run time switch
>>     especially because the static key is a global control which can be
>>     flipped by an update.
>>
>>   - Make the flush always/conditional static key based.
>>
>>   - Serialize the kvm parameter setter function
>>
>>   - Enable runtime control for the kvm parameter
>>
>>   - Add the l1tf command line option. It's not run time controllable as it
>>     does not make sense to have 3 knobs at runtime. For the command line
>>     the combo knob setting the default is convenient
>>
>>   - Documentation update
>>
>> This takes the review comments into account as much as still applicable.
>>
>> Thanks to Jiri for testing the lot and debugging and fixing my brainfarts!
>>
>> Git bundle follows in separate mail.
> Another case on top of this series...
>
> ---------------------- 8< --------------------
> From a0f605fed99cf1623f8716b22c11113653c258a3 Mon Sep 17 00:00:00 2001
> From: Paolo Bonzini <pbonzini@redhat.com>
> Date: Fri, 13 Jul 2018 18:15:29 +0200
> Subject: [PATCH] kvm: vmx: disable L1D flush when running as a nested
>  hypervisor
>
> VMENTER operations from the nested hypervisor into the nested guest
> will always be processed by the bare metal hypervisor.  Therefore,
> when running as a nested hypervisor, doing L1D cache flushes on vmentry
> will result in twice the work and twice the slowdown, for no benefit.

Only if your outer hypervisor says so by setting MSR_ARCH_CAPS.VMENTRY_NO

In all other circumstances, it is not safe to make this assumption.

~Andrew

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-13 16:56   ` Andrew Cooper
@ 2018-07-13 17:01     ` Paolo Bonzini
  0 siblings, 0 replies; 49+ messages in thread
From: Paolo Bonzini @ 2018-07-13 17:01 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 752 bytes --]

On 13/07/2018 18:56, speck for Andrew Cooper wrote:
>> VMENTER operations from the nested hypervisor into the nested guest
>> will always be processed by the bare metal hypervisor.  Therefore,
>> when running as a nested hypervisor, doing L1D cache flushes on vmentry
>> will result in twice the work and twice the slowdown, for no benefit.
> Only if your outer hypervisor says so by setting MSR_ARCH_CAPS.VMENTRY_NO
> 
> In all other circumstances, it is not safe to make this assumption.

That is indeed yet another case to check.

However, for the nested case specifically, wouldn't the nested
hypervisor be doomed anyway, if the bare metal hypervisor can do all
kinds of stuff between your L1D flush and the actual vmentry?

Paolo


^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-13 16:22 ` Paolo Bonzini
  2018-07-13 16:56   ` Andrew Cooper
@ 2018-07-13 17:28   ` Konrad Rzeszutek Wilk
  2018-07-15 13:58     ` Paolo Bonzini
  1 sibling, 1 reply; 49+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-07-13 17:28 UTC (permalink / raw)
  To: speck

On Fri, Jul 13, 2018 at 06:22:47PM +0200, speck for Paolo Bonzini wrote:
> On 12/07/2018 16:19, speck for Thomas Gleixner wrote:
> > The following series provides the following changes:
> > 
> >   - Fix EPT=off handling so it avoids flushing
> >   
> >   - Expose proper VMX mitigation information in sysfs
> > 
> >   - Drops the MSR list mechanism for flush 'always' to prepare for runtime
> >     control. The default flush mechanism is conditional anyway and the MSR
> >     list is set up at guest init time, which is nasty to run time switch
> >     especially because the static key is a global control which can be
> >     flipped by an update.
> > 
> >   - Make the flush always/conditional static key based.
> > 
> >   - Serialize the kvm parameter setter function
> > 
> >   - Enable runtime control for the kvm parameter
> > 
> >   - Add the l1tf command line option. It's not run time controllable as it
> >     does not make sense to have 3 knobs at runtime. For the command line
> >     the combo knob setting the default is convenient
> > 
> >   - Documentation update
> > 
> > This takes the review comments into account as much as still applicable.
> > 
> > Thanks to Jiri for testing the lot and debugging and fixing my brainfarts!
> > 
> > Git bundle follows in separate mail.
> 
> Another case on top of this series...
> 
> ---------------------- 8< --------------------
> From a0f605fed99cf1623f8716b22c11113653c258a3 Mon Sep 17 00:00:00 2001
> From: Paolo Bonzini <pbonzini@redhat.com>
> Date: Fri, 13 Jul 2018 18:15:29 +0200
> Subject: [PATCH] kvm: vmx: disable L1D flush when running as a nested
>  hypervisor
> 
> VMENTER operations from the nested hypervisor into the nested guest
> will always be processed by the bare metal hypervisor.  Therefore,
> when running as a nested hypervisor, doing L1D cache flushes on vmentry
> will result in twice the work and twice the slowdown, for no benefit.
> 
> Special case this situation and report it in sysfs.
> 
> (The three levels involved are usually called L0/L1/L2 in KVM slang.  I'm
> avoiding that naming because of the confusion with cache levels).
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  Documentation/admin-guide/l1tf.rst | 23 ++++++++++++++++++++++-
>  arch/x86/include/asm/vmx.h         |  1 +
>  arch/x86/kernel/cpu/bugs.c         |  3 -++
>  arch/x86/kvm/vmx.c                 |  5 +++++
>  4 files changed, 30 insertions(+), 2 deletion(-)
> 
> diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
> index 5adf7d7c2b4e..a962afbce156 100644
> --- a/Documentation/admin-guide/l1tf.rst
> +++ b/Documentation/admin-guide/l1tf.rst
> @@ -528,6 +528,27 @@ available:
>      EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
>      parameter.
>  
> +3.4. Nested virtual machines
> +""""""""""""""""""""""""""""
> +
> +When nested virtualization is in use, three operating systems are involved:
> +the bare metal hypervisor, the nested hypervisor, and the nested virtual
> +machine.  VMENTER operations from the nested hypervisor into the nested
> +guest will always be processed by the bare metal hypervisor.  Therefore,
> +when running as a nested hypervisor, KVM will not perform any L1D cache
> +flush, assuming instead that the "outermost" hypervisor takes care of
> +flushing the L1D cache on VMENTER to nested guests.
> +
> +When running as a bare metal hypervisor, instead, KVM will:
> +
> + - flush the L1D cache on every switch from nested hypervisor to
> +   nested virtual machine, so that the nested hypervisor's secrets
> +   are not exposed to the nested virtual machine;
> +
> + - flush the L1D cache on every switch from nested virtual machine to
> +   nested hypervisor; this is a complex operation, and flushing the L1D
> +   cache avoids that the bare metal hypervisor's secrets be exposed
> +   to the nested virtual machine.
>  
>  .. _default_mitigations:
>  
> @@ -540,7 +561,7 @@ Default mitigations
>      unconditionally and cannot be controlled.
>  
>    - L1D conditional flushing on VMENTER when EPT is enabled for
> -    a guest.
> +    a guest, and the guest is not a nested virtual machine.
>  
>    The kernel does not by default enforce the disabling of SMT, which leaves
>    SMT systems vulnerable when running untrusted guests with EPT enabled.
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 94a8547d915b..7c0438751fa5 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -579,6 +579,7 @@ enum vmx_l1d_flush_state {
>  	VMENTER_L1D_FLUSH_COND,
>  	VMENTER_L1D_FLUSH_ALWAYS,
>  	VMENTER_L1D_FLUSH_EPT_DISABLED,
> +	VMENTER_L1D_FLUSH_NESTED_VM,
>  };
>  
>  extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index d63cb1501784..87828f2f64a5 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -745,7 +745,8 @@ static const char *l1tf_vmx_states[] = {
>  	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
>  	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
>  	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
> -	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"
> +	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled",
> +	[VMENTER_L1D_FLUSH_NESTED_VM]   = "nested virtual machine",
>  };
>  
>  static ssize_t l1tf_show_state(char *buf)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index c5c0118b126d..a7e41ac4256f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -212,6 +212,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
>  {
>  	struct page *page;
>  
> +	if (static_cpu_has(X86_FEATURE_HYPERVISOR)) {
> +		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NESTED_VM;
> +		return 0;
> +	}

Perhaps:

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 0e75170..f03ec33 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -70,6 +70,7 @@
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
+#define ARCH_CAP_SKIP_L1DFL_VMENTRY	(1 << 3)   /* Skip L1DF on VMENTRY */
 #define ARCH_CAP_SSB_NO			(1 << 4)   /*
 						    * Not susceptible to Speculative Store Bypass
 						    * attack, so no Speculative Store Bypass
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c5c0118..5209252 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -216,6 +216,15 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 		return 0;
 	}
+	if (static_cpu_has(X86_FEATURE_HYPERVISOR) &&
+	    static_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+	    boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+		u64 msr;
+
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+		if (msr & ARCH_CAP_SKIP_L1DFL_VMENTRY)
+			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NESTED_VM;
+	}
 
 	/* If set to auto use the default l1tf mitigation method */
 	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {

^ permalink raw reply related	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 09/10] Control knobs and Documentation 9
  2018-07-12 14:19 ` [patch V10 09/10] Control knobs and Documentation 9 Thomas Gleixner
  2018-07-12 16:24   ` [MODERATED] " Josh Poimboeuf
  2018-07-12 17:16   ` Greg KH
@ 2018-07-15  3:12   ` Kees Cook
  2 siblings, 0 replies; 49+ messages in thread
From: Kees Cook @ 2018-07-15  3:12 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:11PM +0200, speck for Thomas Gleixner wrote:
> From: Jiri Kosina <jkosina@suse.cz>
> Subject: [patch V10 09/10] x86/bugs, kvm: introduce boot-time control of L1TF mitigations
> 
> Introduce the 'l1tf=' kernel command line option to allow for boot-time
> switching of mitigation that is used on processors affected by L1TF.
> 
> The possible values are:
> 
>   full
> 	Provides all available mitigations for the L1TF vulnerability. Disables
> 	SMT and enables all mitigations in the hypervisors. SMT control via
> 	/sys/devices/system/cpu/smt/control is still possible after boot.
> 	Hypervisors will issue a warning when the first VM is started in
> 	a potentially insecure configuration, i.e. SMT enabled or L1D flush
> 	disabled.
> 
>   full,force
> 	Same as 'full', but disables SMT control. Implies the 'nosmt=force'
> 	command line option. sysfs control of SMT and the hypervisor flush
> 	control is disabled.
> 
>   flush
> 	Leaves SMT enabled and enables the conditional hypervisor mitigation.
> 	Hypervisors will issue a warning when the first VM is started in a
> 	potentially insecure configuration, i.e. SMT enabled or L1D flush
> 	disabled.
> 
>   flush,nosmt
> 	Disables SMT and enables the conditional hypervisor mitigation. SMT
> 	control via /sys/devices/system/cpu/smt/control is still possible
> 	after boot. If SMT is reenabled or flushing disabled at runtime
> 	hypervisors will issue a warning.
> 
>   flush,nowarn
> 	Same as 'flush', but hypervisors will not warn when
> 	a VM is started in a potentially insecure configuration.
> 
>   off
> 	Disables hypervisor mitigations and doesn't emit any warnings.
> 
> Default is 'flush'.

I think this reads very well and I think it's a reasonable default choice.
The most sensitive workloads will be under VMs.

Reviewed-by: Kees Cook <keescook@chromium.org>

Thanks!

-Kees

-- 
Kees Cook                                            @outflux.net

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
                     ` (2 preceding siblings ...)
  2018-07-12 17:18   ` [MODERATED] " Greg KH
@ 2018-07-15  7:30   ` Borislav Petkov
  2018-07-27 16:41   ` Dave Hansen
  4 siblings, 0 replies; 49+ messages in thread
From: Borislav Petkov @ 2018-07-15  7:30 UTC (permalink / raw)
  To: speck

On Thu, Jul 12, 2018 at 04:19:12PM +0200, speck for Thomas Gleixner wrote:
>  Documentation/admin-guide/index.rst |    9 
>  Documentation/admin-guide/l1tf.rst  |  572 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 581 insertions(+)

Reads nicely, just a couple of minor things which sprang out at me while
reading, below:

...

> +2. Malicious guest in a virtual machine
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +   The fact that L1TF breaks all domain protections allows malicious guest
> +   OSes, which can control the PTEs directly, and malicious guest user
> +   space applications, which run on an unprotected guest kernel lacking the
> +   PTE inversion mitigation for L1TF, to attack physical host memory.
> +
> +   A special aspect of L1TF in the context of virtualization is symmetric
> +   multi threading (SMT). The Intel implementation of SMT is called
> +   HyperThreading. The fact that Hyperthreads on the affected processors
> +   share the L1 Data Cache (L1D) is important for this. As the flaw allows
> +   only to attack data which is present in L1D, a malicious guest running
> +   on one Hyperthread can attack the data which is brought into the L1D by
> +   the context which runs on the sibling Hyperthread of the same physical
> +   core. This context can be host OS, host user space or a different guest.
> +
> +   If the processor does not support Extended Page Tables, the attack is
> +   only possible, when the hypervisor does not sanitize the content of the
> +   effective (shadow) page tables.
> +
> +   While solutions exist to mitigate these attack vectors fully, these
> +   mitigations are not enabled by default in the Linux kernel because they
> +   can affect performance significantly. The kernel provides several
> +   mechanisms which can be utilized to address the problem depending on the
> +   deployment scenario. The mitigations, their protection scope and impact
> +   are described in the next sections.
> +
> +   The default mitigations and the rationale for chosing them are explained

choosing

> +   at the end of this document. See :ref:`default_mitigations`.
> +
> +.. _l1tf_sys_info:
> +

...

> +1. L1D flush on VMENTER
> +^^^^^^^^^^^^^^^^^^^^^^^
> +
> +   To make sure that a guest cannot attack data which is present in the L1D
> +   the hypervisor flushes the L1D before entering the guest.
> +
> +   Flushing the L1D evicts not only the data which should not be accessed
> +   by a potentially malicious guest, it also flushes the guest
> +   data. Flushing the L1D has a performance impact as the processor has to

s/Flushing the L1D/Therefore it/

> +   bring the flushed guest data back into the L1D. Depending on the
> +   frequency of VMEXIT/VMENTER and the type of computations in the guest
> +   performance degradation in the range of 1% to 50% has been observed. For
> +   scenarios where guest VMEXIT/VMENTER are rare the performance impact is
> +   minimal. Virtio and mechanisms like posted interrupts are designed to
> +   confine the VMEXITs to a bare minimum, but specific configurations and
> +   application scenarios might still suffer from a high VMEXIT rate.
> +
> +   The general recommendation is to enable L1D flush on VMENTER.
> +
> +   Note, that L1D flush does not prevent the SMT problem because the

s/,//

> +   sibling thread will also bring back its data into the L1D which makes it
> +   attackable again.
> +
> +   L1D flush can be controlled by the administrator via the kernel command
> +   line and sysfs control files. See :ref:`mitigation_control_command_line`
> +   and :ref:`mitigation_control_kvm`.
> +
> +.. _guest_confinement:
> +
> +2. Guest VCPU confinement to dedicated physical cores
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +   To address the SMT problem, it is possible to make a guest or a group of
> +   guests affine to one or more physical cores. The proper mechanism for
> +   that is to utilize exclusive cpusets to ensure that no other guest or
> +   host tasks can run on these cores.
> +
> +   If only a single guest or related guests run on sibling SMT threads on
> +   the same physical core then they can only attack their own memory and
> +   restricted parts of the host memory.
> +
> +   Host memory is attackable, when one of the sibling SMT threads runs in
> +   host OS (hypervisor) context and the other in guest context. The amount
> +   of valuable information from the host OS context depends on the context
> +   which the host OS executes, i.e. interrupts, soft interrupts and kernel
> +   threads. The amount of valuable data from these contexts cannot be
> +   declared as non-interesting for an attacker without deep inspection of
> +   the code.
> +
> +   Note, that assigning guests to a fixed set of physical cores affects the

s/,//

> +   ability of the scheduler to do load balancing and might have negative
> +   effects on CPU utilization depending on the hosting scenario. Disabling
> +   SMT might be a viable alternative for particular scenarios.
> +
> +   For further information about confining guests to a single or to a group
> +   of cores consult the cpusets documentation:
> +
> +   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt

Should this reference be relative to our Documentation/ tree instead, i.e.,

       ../cgroup-v1/cpusets.txt

and the doc system will resolve it to the respective absolute URL where
it is displayed?

> +
> +.. _interrupt_isolation:
> +
> +3. Interrupt affinity
> +^^^^^^^^^^^^^^^^^^^^^
> +
> +   Interrupts can be made affine to logical CPUs. This is not universally
> +   true because there are types of interrupts which are truly per CPU
> +   interrupts, e.g. the local timer interrupt. Aside of that multi queue
							       ^
							       ,

> +   devices affine their interrupts to single CPUs or groups of CPUs per

s/affine/assign/

> +   queue without allowing the administrator to control the affinities.
> +
> +   Moving the interrupts, which can be affinity controlled, away from CPUs
> +   which run untrusted guests, reduces the attack vector space.
> +
> +   Whether the interrupts with are affine to CPUs, which run untrusted

s/with //

> +   guests, provide interesting data for an attacker depends on the system

	      provides

> +   configuration and the scenarios which run on the system. While for some
> +   of the interrupts it can be assumed that they wont expose interesting
> +   information beyond exposing hints about the host OS memory layout, there

s/exposing //

> +   is no way to make general assumptions.
> +
> +   Interrupt affinity can be controlled by the administrator via the
> +   /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
> +   available at:
> +
> +   https://www.kernel.org/doc/Documentation/IRQ-affinity.txt

Same comment as above.

> +
> +.. _smt_control:
> +
> +4. SMT control
> +^^^^^^^^^^^^^^
> +
> +   To prevent the SMT issues of L1TF it might be necessary to disable SMT
> +   completely. Disabling SMT can have a significant performance impact, but
> +   the impact depends on the hosting scenario and the type of workloads.
> +   The impact of disabling SMT needs also to be weighted against the impact

s/weighted/weighed/

> +   of other mitigation solutions like confining guests to dedicated cores.
> +
> +   The kernel provides a sysfs interface to retrieve the status of SMT and
> +   to control it. It also provides a kernel command line interface to
> +   control SMT.
> +
> +   The kernel command line interface consists of the following options:
> +

...

> +5. Disabling EPT
> +^^^^^^^^^^^^^^^^
> +
> +  Disabling EPT for virtual machines provides full mitigation for L1TF even
> +  with SMT enabled, because the effective page tables for guests are
> +  managed and sanitized by the hypervisor. Though disabling EPT has a

s/Though/However/

> +  significant performance impact especially when the Meltdown mitigation
> +  KPTI is enabled.
> +
> +  EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
> +
> +There is ongoing research and development for new mitigation mechanisms to
> +address the performance impact of disabling SMT or EPT.
> +
> +.. _mitigation_control_command_line:
> +
> +Mitigation control on the kernel command line
> +---------------------------------------------
> +
> +The kernel command line allows to control the L1TF mitigations at boot

Passive:

"L1TF mitigations are controlled on the kernel command line with the
option ..."

> +time with the option "l1tf=". The valid arguments for this option are:
> +
> +  ============  ===================================================
> +  full		Provides all available mitigations for the L1TF
> +		vulnerability. Disables SMT and enables all mitigations in
> +		the hypervisors.
> +
> +		SMT control and L1D flush control via the sysfs interface
> +		is still possible after boot.  Hypervisors will issue a
> +		warning when the first VM is started in a potentially
> +		insecure configuration, i.e. SMT enabled or L1D flush
> +		disabled.
> +

...

> +
> +  - Interrupt isolation:
> +
> +    Isolating the guest CPUs from interrupts can reduce the attack surface
> +    further, but still allows a malicious guest to explore a limited amount
> +    of host physical memory. This can at least be used to gain knowledge
> +    about the host address space layout. The interrupts which have a fixed
> +    affinity to the CPUs which run the untrusted guests can depending on
> +    the scenario still trigger soft interrupts and schedule kernel threads

"... which run the untrusted guests can - depending on the scenario - still
trigger... "

> +    which might expose valuable information. See
> +    :ref:`interrupt_isolation`.
> +

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 00/10] Control knobs and Documentation 0
  2018-07-13 17:28   ` Konrad Rzeszutek Wilk
@ 2018-07-15 13:58     ` Paolo Bonzini
  0 siblings, 0 replies; 49+ messages in thread
From: Paolo Bonzini @ 2018-07-15 13:58 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 1665 bytes --]

On 13/07/2018 19:28, speck for Konrad Rzeszutek Wilk wrote:
> Perhaps:
> 
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 0e75170..f03ec33 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -70,6 +70,7 @@
>  #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
>  #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
>  #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
> +#define ARCH_CAP_SKIP_L1DFL_VMENTRY	(1 << 3)   /* Skip L1DF on VMENTRY */

If this bit is set, KVM is effectively not vulnerable.  I just sent a
more complete follow-up, replacing what I had cooked up last Friday.

Thanks,

Paolo

>  #define ARCH_CAP_SSB_NO			(1 << 4)   /*
>  						    * Not susceptible to Speculative Store Bypass
>  						    * attack, so no Speculative Store Bypass
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index c5c0118..5209252 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -216,6 +216,15 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
>  		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
>  		return 0;
>  	}
> +	if (static_cpu_has(X86_FEATURE_HYPERVISOR) &&
> +	    static_cpu_has(X86_FEATURE_FLUSH_L1D) &&
> +	    boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
> +		u64 msr;
> +
> +		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
> +		if (msr & ARCH_CAP_SKIP_L1DFL_VMENTRY)
> +			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NESTED_VM;
> +	}
>  
>  	/* If set to auto use the default l1tf mitigation method */
>  	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
> 



^ permalink raw reply	[flat|nested] 49+ messages in thread

* [MODERATED] Re: [patch V10 10/10] Control knobs and Documentation 10
  2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
                     ` (3 preceding siblings ...)
  2018-07-15  7:30   ` Borislav Petkov
@ 2018-07-27 16:41   ` Dave Hansen
  4 siblings, 0 replies; 49+ messages in thread
From: Dave Hansen @ 2018-07-27 16:41 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 552 bytes --]

On 07/12/2018 07:19 AM, speck for Thomas Gleixner wrote:
> +This vulnerability affects a wide range of Intel processors. The
> +vulnerability is not present on:
...
> +
> +   - The Intel Core Duo Yonah variants (2006 - 2008)

One bit of data: According to Intel folks that were reading your
documentation, Intel has not evaluated these, so it might be best to
leave them off the list of non-vulnerable ones.

We could call them out as "not evaluated" instead of classifying them as
vulnerable or not vulnerable, or list them as "presumed not vulnerable,
but unconfirmed".


^ permalink raw reply	[flat|nested] 49+ messages in thread

end of thread, other threads:[~2018-07-27 16:41 UTC | newest]

Thread overview: 49+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-12 14:19 [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
2018-07-12 14:19 ` [patch V10 01/10] Control knobs and Documentation 1 Thomas Gleixner
2018-07-12 15:34   ` [MODERATED] " Greg KH
2018-07-12 15:38     ` Thomas Gleixner
2018-07-12 15:46       ` Thomas Gleixner
2018-07-12 17:08         ` [MODERATED] " Greg KH
2018-07-12 14:19 ` [patch V10 02/10] Control knobs and Documentation 2 Thomas Gleixner
2018-07-12 17:09   ` [MODERATED] " Greg KH
2018-07-12 14:19 ` [patch V10 03/10] Control knobs and Documentation 3 Thomas Gleixner
2018-07-12 16:13   ` [MODERATED] " Josh Poimboeuf
2018-07-13  9:10     ` Thomas Gleixner
2018-07-12 17:09   ` [MODERATED] " Greg KH
2018-07-12 14:19 ` [patch V10 04/10] Control knobs and Documentation 4 Thomas Gleixner
2018-07-12 17:10   ` [MODERATED] " Greg KH
2018-07-12 14:19 ` [patch V10 05/10] Control knobs and Documentation 5 Thomas Gleixner
2018-07-12 17:10   ` [MODERATED] " Greg KH
2018-07-12 14:19 ` [patch V10 06/10] Control knobs and Documentation 6 Thomas Gleixner
2018-07-12 16:14   ` [MODERATED] " Josh Poimboeuf
2018-07-12 17:10   ` Greg KH
2018-07-12 14:19 ` [patch V10 07/10] Control knobs and Documentation 7 Thomas Gleixner
2018-07-12 17:11   ` [MODERATED] " Greg KH
2018-07-12 14:19 ` [patch V10 08/10] Control knobs and Documentation 8 Thomas Gleixner
2018-07-12 16:22   ` [MODERATED] " Josh Poimboeuf
2018-07-12 17:12     ` Greg KH
2018-07-13  9:18     ` Thomas Gleixner
2018-07-12 17:17   ` [MODERATED] " Greg KH
2018-07-12 14:19 ` [patch V10 09/10] Control knobs and Documentation 9 Thomas Gleixner
2018-07-12 16:24   ` [MODERATED] " Josh Poimboeuf
2018-07-12 17:17     ` Greg KH
2018-07-12 17:16   ` Greg KH
2018-07-15  3:12   ` Kees Cook
2018-07-12 14:19 ` [patch V10 10/10] Control knobs and Documentation 10 Thomas Gleixner
2018-07-12 16:03   ` [MODERATED] " Linus Torvalds
2018-07-12 16:31     ` Peter Zijlstra
2018-07-12 16:13   ` Josh Poimboeuf
2018-07-12 16:26     ` Josh Poimboeuf
2018-07-13  9:09     ` Thomas Gleixner
2018-07-12 17:18   ` [MODERATED] " Greg KH
2018-07-15  7:30   ` Borislav Petkov
2018-07-27 16:41   ` Dave Hansen
2018-07-12 14:54 ` [patch V10 00/10] Control knobs and Documentation 0 Thomas Gleixner
2018-07-12 19:30 ` [MODERATED] " Josh Poimboeuf
2018-07-13 15:03   ` Thomas Gleixner
2018-07-13  8:30 ` [MODERATED] " Jiri Kosina
2018-07-13 16:22 ` Paolo Bonzini
2018-07-13 16:56   ` Andrew Cooper
2018-07-13 17:01     ` Paolo Bonzini
2018-07-13 17:28   ` Konrad Rzeszutek Wilk
2018-07-15 13:58     ` Paolo Bonzini

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.