All of lore.kernel.org
 help / color / mirror / Atom feed
* [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #0
@ 2018-07-15 13:57 Paolo Bonzini
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #1 Paolo Bonzini
                   ` (4 more replies)
  0 siblings, 5 replies; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-15 13:57 UTC (permalink / raw)
  To: speck

Support for ARCH_CAPABILITIES bit 3, which can already be used to disable
the mitigations on a nested hypervisor.  Patch 3 will shortly go to Linus's
tree.

Paolo Bonzini (4):
  KVM: test L1TF EPT bug separately from X86_BUG_L1TF
  x86: use ARCH_CAPABILITIES to skip L1D flush on vmentry
  KVM: VMX: support MSR_IA32_ARCH_CAPABILITIES as a feature MSR
  KVM: VMX: tell nested hypervisor to skip L1D flush on vmentry

 Documentation/admin-guide/l1tf.rst | 24 ++++++++++++++++++++++++
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/include/asm/msr-index.h   |  1 +
 arch/x86/include/asm/vmx.h         |  1 +
 arch/x86/kernel/cpu/bugs.c         | 26 ++++++++++++++++++++------
 arch/x86/kvm/vmx.c                 | 35 ++++++++++++++++++++++++-----------
 arch/x86/kvm/x86.c                 | 17 ++++++++++++++++-
 7 files changed, 87 insertions(+), 18 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #1
  2018-07-15 13:57 [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Paolo Bonzini
@ 2018-07-15 13:57 ` Paolo Bonzini
  2018-07-16 14:36   ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-07-16 20:02   ` Thomas Gleixner
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #2 Paolo Bonzini
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-15 13:57 UTC (permalink / raw)
  To: speck

If bit 3 of ARCH_CAPABILITIES is set, the L1TF bug is present but the hypervisor
does not need to flush the L1D cache on vmentry.  To prepare for that, use
l1tf_vmx_mitigation to test for the EPT variant of the bug in KVM.
boot_cpu_has(X86_BUG_L1TF) is only used for the non-hypervisor case.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/vmx.h |  1 +
 arch/x86/kernel/cpu/bugs.c | 18 ++++++++++++------
 arch/x86/kvm/vmx.c         | 33 +++++++++++++++++++++++----------
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 94a8547d915b..3beb2f6f1a94 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -579,6 +579,7 @@ enum vmx_l1d_flush_state {
 	VMENTER_L1D_FLUSH_COND,
 	VMENTER_L1D_FLUSH_ALWAYS,
 	VMENTER_L1D_FLUSH_EPT_DISABLED,
+	VMENTER_L1D_FLUSH_NOT_VULNERABLE,
 };
 
 extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d63cb1501784..4e06eee987d4 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -672,8 +672,10 @@ static void __init l1tf_select_mitigation(void)
 {
 	u64 half_pa;
 
-	if (!boot_cpu_has_bug(X86_BUG_L1TF))
+	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_VULNERABLE;
 		return;
+	}
 
 	switch (l1tf_mitigation) {
 	case L1TF_MITIGATION_OFF:
@@ -741,15 +743,19 @@ static int __init l1tf_cmdline(char *str)
 
 #if IS_ENABLED(CONFIG_KVM_INTEL)
 static const char *l1tf_vmx_states[] = {
-	[VMENTER_L1D_FLUSH_AUTO]	= "auto",
-	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
-	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
-	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
-	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"
+	[VMENTER_L1D_FLUSH_AUTO]	   = "auto",
+	[VMENTER_L1D_FLUSH_NEVER]	   = "vulnerable",
+	[VMENTER_L1D_FLUSH_COND]	   = "conditional cache flushes",
+	[VMENTER_L1D_FLUSH_ALWAYS]	   = "cache flushes",
+	[VMENTER_L1D_FLUSH_EPT_DISABLED]   = "EPT disabled",
+	[VMENTER_L1D_FLUSH_NOT_VULNERABLE] = "not vulnerable",
 };
 
 static ssize_t l1tf_show_state(char *buf)
 {
+	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NOT_VULNERABLE)
+		return sprintf(buf, "%s; VMX: not vulnerable\n", L1TF_DEFAULT_MSG);
+
 	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
 		return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c5c0118b126d..bf0801399cb4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -208,14 +208,17 @@
 #define L1D_CACHE_ORDER 4
 static void *vmx_l1d_flush_pages;
 
-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+static bool vmx_l1tf_vulnerable(void)
+{
+	return l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NOT_VULNERABLE &&
+		l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_EPT_DISABLED;
+}
+
+static int vmx_update_l1d_flush(enum vmx_l1d_flush_state l1tf)
 {
 	struct page *page;
 
-	if (!enable_ept) {
-		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
-		return 0;
-	}
+	WARN_ON(!vmx_l1tf_vulnerable());
 
 	/* If set to auto use the default l1tf mitigation method */
 	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
@@ -259,6 +262,16 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 	return 0;
 }
 
+static int vmx_setup_l1d_flush(void)
+{
+	if (!enable_ept) {
+		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+		return 0;
+	}
+
+	return vmx_update_l1d_flush(vmentry_l1d_flush_param);
+}
+
 static int vmentry_l1d_flush_parse(const char *s)
 {
 	unsigned int i;
@@ -276,7 +289,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 {
 	int l1tf, ret;
 
-	if (!boot_cpu_has(X86_BUG_L1TF))
+	if (!vmx_l1tf_vulnerable())
 		return 0;
 
 	l1tf = vmentry_l1d_flush_parse(s);
@@ -295,7 +308,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 	}
 
 	mutex_lock(&vmx_l1d_flush_mutex);
-	ret = vmx_setup_l1d_flush(l1tf);
+	ret = vmx_update_l1d_flush(l1tf);
 	mutex_unlock(&vmx_l1d_flush_mutex);
 	return ret;
 }
@@ -10592,7 +10605,7 @@ static int vmx_vm_init(struct kvm *kvm)
 	if (!ple_gap)
 		kvm->arch.pause_in_guest = true;
 
-	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+	if (vmx_l1tf_vulnerable()) {
 		switch (l1tf_mitigation) {
 		case L1TF_MITIGATION_OFF:
 		case L1TF_MITIGATION_FLUSH_NOWARN:
@@ -13383,8 +13396,8 @@ static int __init vmx_init(void)
 	 * contain 'auto' which will be turned into the default 'cond'
 	 * mitigation mode.
 	 */
-	if (boot_cpu_has(X86_BUG_L1TF)) {
-		r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NOT_VULNERABLE) {
+		r = vmx_setup_l1d_flush();
 		if (r) {
 			vmx_exit();
 			return r;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #2
  2018-07-15 13:57 [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Paolo Bonzini
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #1 Paolo Bonzini
@ 2018-07-15 13:57 ` Paolo Bonzini
  2018-07-16 20:04   ` Thomas Gleixner
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #3 Paolo Bonzini
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-15 13:57 UTC (permalink / raw)
  To: speck

Bit 3 of ARCH_CAPABILITIES tells a hypervisor that L1D flush on vmentry is
not needed.  In that case, KVM can behave as if there is no L1TF bug at all.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/msr-index.h | 1 +
 arch/x86/kernel/cpu/bugs.c       | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 0e7517089b80..f03ec33878c3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -70,6 +70,7 @@
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
+#define ARCH_CAP_SKIP_L1DFL_VMENTRY	(1 << 3)   /* Skip L1DF on VMENTRY */
 #define ARCH_CAP_SSB_NO			(1 << 4)   /*
 						    * Not susceptible to Speculative Store Bypass
 						    * attack, so no Speculative Store Bypass
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 4e06eee987d4..a8bddb8b42ba 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -677,6 +677,14 @@ static void __init l1tf_select_mitigation(void)
 		return;
 	}
 
+	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+		u64 msr;
+
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+		if (msr & ARCH_CAP_SKIP_L1DFL_VMENTRY)
+			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_VULNERABLE;
+	}
+
 	switch (l1tf_mitigation) {
 	case L1TF_MITIGATION_OFF:
 	case L1TF_MITIGATION_FLUSH_NOWARN:
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #3
  2018-07-15 13:57 [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Paolo Bonzini
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #1 Paolo Bonzini
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #2 Paolo Bonzini
@ 2018-07-15 13:57 ` Paolo Bonzini
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #4 Paolo Bonzini
  2018-07-16 20:44 ` [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Konrad Rzeszutek Wilk
  4 siblings, 0 replies; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-15 13:57 UTC (permalink / raw)
  To: speck

This lets userspace read the MSR_IA32_ARCH_CAPABILITIES and check that all
requested features are available on the host.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 902d535dff8f..79c8ca2c2ad9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1098,6 +1098,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 
 	MSR_F10H_DECFG,
 	MSR_IA32_UCODE_REV,
+	MSR_IA32_ARCH_CAPABILITIES,
 };
 
 static unsigned int num_msr_based_features;
@@ -1106,7 +1107,8 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 {
 	switch (msr->index) {
 	case MSR_IA32_UCODE_REV:
-		rdmsrl(msr->index, msr->data);
+	case MSR_IA32_ARCH_CAPABILITIES:
+		rdmsrl_safe(msr->index, &msr->data);
 		break;
 	default:
 		if (kvm_x86_ops->get_msr_feature(msr))
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #4
  2018-07-15 13:57 [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Paolo Bonzini
                   ` (2 preceding siblings ...)
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #3 Paolo Bonzini
@ 2018-07-15 13:57 ` Paolo Bonzini
  2018-07-16 14:58   ` [MODERATED] " Konrad Rzeszutek Wilk
  2018-07-16 20:06   ` Thomas Gleixner
  2018-07-16 20:44 ` [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Konrad Rzeszutek Wilk
  4 siblings, 2 replies; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-15 13:57 UTC (permalink / raw)
  To: speck

When nested virtualization is in use, VMENTER operations from the
nested hypervisor into the nested guest will always be processed by
the bare metal hypervisor, and KVM's "conditional cache flushes"
mode in particular does a flush on nested vmentry.  Therefore,
include the "skip L1D flush on vmentry" bit in KVM's suggested
ARCH_CAPABILITIES setting.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/admin-guide/l1tf.rst | 24 ++++++++++++++++++++++++
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/kvm/vmx.c                 |  2 +-
 arch/x86/kvm/x86.c                 | 15 ++++++++++++++-
 4 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
index 5adf7d7c2b4e..3b22561af512 100644
--- a/Documentation/admin-guide/l1tf.rst
+++ b/Documentation/admin-guide/l1tf.rst
@@ -528,6 +528,30 @@ available:
     EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
     parameter.
 
+3.4. Nested virtual machines
+""""""""""""""""""""""""""""
+
+When nested virtualization is in use, three operating systems are involved:
+the bare metal hypervisor, the nested hypervisor, and the nested virtual
+machine.  VMENTER operations from the nested hypervisor into the nested
+guest will always be processed by the bare metal hypervisor.  Therefore,
+when running as a bare metal hypervisor, instead, KVM will:
+
+ - flush the L1D cache on every switch from nested hypervisor to
+   nested virtual machine, so that the nested hypervisor's secrets
+   are not exposed to the nested virtual machine;
+
+ - flush the L1D cache on every switch from nested virtual machine to
+   nested hypervisor; this is a complex operation, and flushing the L1D
+   cache avoids that the bare metal hypervisor's secrets be exposed
+   to the nested virtual machine;
+
+ - instruct the nested hypervisor to not perform any L1D cache flush
+   on VMENTER.
+
+This behavior is disabled when KVM is loaded with the module option
+"kvm-intel.vmentry_l1d_flush=never".
+
 
 .. _default_mitigations:
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 57d418061c55..e375011108d1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1417,6 +1417,7 @@ enum {
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 
+u64 kvm_get_arch_capabilities(void);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf0801399cb4..e2692920053d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6423,7 +6423,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	}
 
 	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+		vmx->arch_capabilities = kvm_get_arch_capabilities();
 
 	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 79c8ca2c2ad9..442d16228e29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1103,11 +1103,24 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 
 static unsigned int num_msr_based_features;
 
+u64 kvm_get_arch_capabilities(void)
+{
+	u64 data;
+
+	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+		data |= ARCH_CAP_SKIP_L1DFL_VMENTRY;
+
+	return data;
+}
+
 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 {
 	switch (msr->index) {
-	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_ARCH_CAPABILITIES:
+		return kvm_get_arch_capabilities();
+		break;
+	case MSR_IA32_UCODE_REV:
 		rdmsrl_safe(msr->index, &msr->data);
 		break;
 	default:
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #1
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #1 Paolo Bonzini
@ 2018-07-16 14:36   ` Konrad Rzeszutek Wilk
  2018-07-16 20:02   ` Thomas Gleixner
  1 sibling, 0 replies; 20+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-07-16 14:36 UTC (permalink / raw)
  To: speck

On Sun, Jul 15, 2018 at 03:57:13PM +0200, speck for Paolo Bonzini wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH 1/4] KVM: test L1TF EPT bug separately from X86_BUG_L1TF
> 
> If bit 3 of ARCH_CAPABILITIES is set, the L1TF bug is present but the hypervisor
> does not need to flush the L1D cache on vmentry.  To prepare for that, use
> l1tf_vmx_mitigation to test for the EPT variant of the bug in KVM.
> boot_cpu_has(X86_BUG_L1TF) is only used for the non-hypervisor case.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/vmx.h |  1 +
>  arch/x86/kernel/cpu/bugs.c | 18 ++++++++++++------
>  arch/x86/kvm/vmx.c         | 33 +++++++++++++++++++++++----------
>  3 files changed, 36 insertions(+), 16 deletions(-)

patching file arch/x86/include/asm/vmx.h
patching file arch/x86/kernel/cpu/bugs.c
Hunk #1 succeeded at 678 (offset 6 lines).
Hunk #2 FAILED at 743.
1 out of 2 hunks FAILED -- saving rejects to file arch/x86/kernel/cpu/bugs.c.rej
patching file arch/x86/kvm/vmx.c
[konrad@char linux]$ 
Which seems to be related to the whitespace in l1tf_vmx_states?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #4
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #4 Paolo Bonzini
@ 2018-07-16 14:58   ` Konrad Rzeszutek Wilk
  2018-07-16 20:06   ` Thomas Gleixner
  1 sibling, 0 replies; 20+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-07-16 14:58 UTC (permalink / raw)
  To: speck

On Sun, Jul 15, 2018 at 03:57:16PM +0200, speck for Paolo Bonzini wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH 4/4] KVM: VMX: tell nested hypervisor to skip L1D flush on
>  vmentry
> 
> When nested virtualization is in use, VMENTER operations from the
> nested hypervisor into the nested guest will always be processed by
> the bare metal hypervisor, and KVM's "conditional cache flushes"
> mode in particular does a flush on nested vmentry.  Therefore,
> include the "skip L1D flush on vmentry" bit in KVM's suggested
> ARCH_CAPABILITIES setting.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  Documentation/admin-guide/l1tf.rst | 24 ++++++++++++++++++++++++
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/kvm/vmx.c                 |  2 +-
>  arch/x86/kvm/x86.c                 | 15 ++++++++++++++-
>  4 files changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
> index 5adf7d7c2b4e..3b22561af512 100644
> --- a/Documentation/admin-guide/l1tf.rst
> +++ b/Documentation/admin-guide/l1tf.rst
> @@ -528,6 +528,30 @@ available:
>      EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
>      parameter.
>  
> +3.4. Nested virtual machines
> +""""""""""""""""""""""""""""
> +
> +When nested virtualization is in use, three operating systems are involved:
> +the bare metal hypervisor, the nested hypervisor, and the nested virtual
> +machine.  VMENTER operations from the nested hypervisor into the nested
> +guest will always be processed by the bare metal hypervisor.  Therefore,
> +when running as a bare metal hypervisor, instead, KVM will:
> +
> + - flush the L1D cache on every switch from nested hypervisor to
> +   nested virtual machine, so that the nested hypervisor's secrets
> +   are not exposed to the nested virtual machine;
> +
> + - flush the L1D cache on every switch from nested virtual machine to
> +   nested hypervisor; this is a complex operation, and flushing the L1D
> +   cache avoids that the bare metal hypervisor's secrets be exposed
> +   to the nested virtual machine;
> +
> + - instruct the nested hypervisor to not perform any L1D cache flush
> +   on VMENTER.
> +
> +This behavior is disabled when KVM is loaded with the module option
> +"kvm-intel.vmentry_l1d_flush=never".
> +
>  
>  .. _default_mitigations:
>  
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 57d418061c55..e375011108d1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1417,6 +1417,7 @@ enum {
>  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
>  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
>  
> +u64 kvm_get_arch_capabilities(void);
>  void kvm_define_shared_msr(unsigned index, u32 msr);
>  int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index bf0801399cb4..e2692920053d 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -6423,7 +6423,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
>  	}
>  
>  	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
> -		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
> +		vmx->arch_capabilities = kvm_get_arch_capabilities();
>  
>  	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 79c8ca2c2ad9..442d16228e29 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1103,11 +1103,24 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
>  
>  static unsigned int num_msr_based_features;
>  
> +u64 kvm_get_arch_capabilities(void)
> +{
> +	u64 data;
> +
> +	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
> +	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
> +		data |= ARCH_CAP_SKIP_L1DFL_VMENTRY;
> +
> +	return data;
> +}


MODPOST 262 modules
ERROR: "kvm_get_arch_capabilities" [arch/x86/kvm/kvm-intel.ko] undefined!
make[2]: *** [/home/konrad/linux/scripts/Makefile.modpost:92: __modpost] Error 1
make[1]: *** [/home/konrad/linux/Makefile:1213: modules] Error 2
make[1]: *** Waiting for unfinished jobs....

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #1
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #1 Paolo Bonzini
  2018-07-16 14:36   ` [MODERATED] " Konrad Rzeszutek Wilk
@ 2018-07-16 20:02   ` Thomas Gleixner
  2018-07-17 11:20     ` [MODERATED] " Paolo Bonzini
  1 sibling, 1 reply; 20+ messages in thread
From: Thomas Gleixner @ 2018-07-16 20:02 UTC (permalink / raw)
  To: speck

On Sun, 15 Jul 2018, speck for Paolo Bonzini wrote:
>  extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index d63cb1501784..4e06eee987d4 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -672,8 +672,10 @@ static void __init l1tf_select_mitigation(void)
>  {
>  	u64 half_pa;
>  
> -	if (!boot_cpu_has_bug(X86_BUG_L1TF))
> +	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
> +		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_VULNERABLE;
>  		return;

What's the point of this? If X86_BUG_L1TF is not set, then the system is
not vulnerable and l1tf_vmx_mitigation is not used in l1tf_show_state() at
all because that code is not reached.

l1tf_vmx_mitigation is supposed to be storage for VMX to tell the core code
what kind of mitigation is in place and not for the core code to set VMX
state.

And what's worse is that if KVM is not active then the status will
now show "VMX: not vulnerable", while right now it does not say anything about VMX
because the bugs code does not even know if VMX exists or not.

>  #if IS_ENABLED(CONFIG_KVM_INTEL)
>  static const char *l1tf_vmx_states[] = {
> -	[VMENTER_L1D_FLUSH_AUTO]	= "auto",
> -	[VMENTER_L1D_FLUSH_NEVER]	= "vulnerable",
> -	[VMENTER_L1D_FLUSH_COND]	= "conditional cache flushes",
> -	[VMENTER_L1D_FLUSH_ALWAYS]	= "cache flushes",
> -	[VMENTER_L1D_FLUSH_EPT_DISABLED]= "EPT disabled"
> +	[VMENTER_L1D_FLUSH_AUTO]	   = "auto",
> +	[VMENTER_L1D_FLUSH_NEVER]	   = "vulnerable",
> +	[VMENTER_L1D_FLUSH_COND]	   = "conditional cache flushes",
> +	[VMENTER_L1D_FLUSH_ALWAYS]	   = "cache flushes",
> +	[VMENTER_L1D_FLUSH_EPT_DISABLED]   = "EPT disabled",
> +	[VMENTER_L1D_FLUSH_NOT_VULNERABLE] = "not vulnerable",

First of all we use tabs not a random number of spaces, secondly this does
not apply against the latest bundle / repo which already inserted another tab.

>  static ssize_t l1tf_show_state(char *buf)
>  {
> +	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NOT_VULNERABLE)
> +		return sprintf(buf, "%s; VMX: not vulnerable\n", L1TF_DEFAULT_MSG);
> +
>  	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
>  		return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index c5c0118b126d..bf0801399cb4 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -208,14 +208,17 @@
>  #define L1D_CACHE_ORDER 4
>  static void *vmx_l1d_flush_pages;
>  
> -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
> +static bool vmx_l1tf_vulnerable(void)
> +{
> +	return l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NOT_VULNERABLE &&
> +		l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_EPT_DISABLED;

That's really a bad choice.

Not vulnerable is when the CPU does not have X86_BUG_L1TF. If EPT=n then
the CPU is vulnerable, but EPT disabled mitigates the issue.

> +}
> +
> +static int vmx_update_l1d_flush(enum vmx_l1d_flush_state l1tf)
>  {
>  	struct page *page;
>  
> -	if (!enable_ept) {
> -		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
> -		return 0;
> -	}
> +	WARN_ON(!vmx_l1tf_vulnerable());
>  
>  	/* If set to auto use the default l1tf mitigation method */
>  	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
> @@ -259,6 +262,16 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
>  	return 0;
>  }
>  
> +static int vmx_setup_l1d_flush(void)
> +{
> +	if (!enable_ept) {
> +		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
> +		return 0;
> +	}
> +
> +	return vmx_update_l1d_flush(vmentry_l1d_flush_param);
> +}
> +
>  static int vmentry_l1d_flush_parse(const char *s)
>  {
>  	unsigned int i;
> @@ -276,7 +289,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
>  {
>  	int l1tf, ret;
>  
> -	if (!boot_cpu_has(X86_BUG_L1TF))
> +	if (!vmx_l1tf_vulnerable())
>  		return 0;

So you check for !vulnerable here and return. If vulnerable you call
vmx_update_l1d_flush() which has another check with a warnon. That's
convoluted, really.

>  	l1tf = vmentry_l1d_flush_parse(s);
> @@ -295,7 +308,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
>  	}
>  
>  	mutex_lock(&vmx_l1d_flush_mutex);
> -	ret = vmx_setup_l1d_flush(l1tf);
> +	ret = vmx_update_l1d_flush(l1tf);
>  	mutex_unlock(&vmx_l1d_flush_mutex);
>  	return ret;
>  }
> @@ -10592,7 +10605,7 @@ static int vmx_vm_init(struct kvm *kvm)
>  	if (!ple_gap)
>  		kvm->arch.pause_in_guest = true;
>  
> -	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
> +	if (vmx_l1tf_vulnerable()) {

This should be a check for vmx_l1tf_flush_required() which checks for

     BUG_L1TF, enable_ept and then for that magic MSR bit.

>  		switch (l1tf_mitigation) {
>  		case L1TF_MITIGATION_OFF:
>  		case L1TF_MITIGATION_FLUSH_NOWARN:
> @@ -13383,8 +13396,8 @@ static int __init vmx_init(void)
>  	 * contain 'auto' which will be turned into the default 'cond'
>  	 * mitigation mode.
>  	 */
> -	if (boot_cpu_has(X86_BUG_L1TF)) {
> -		r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
> +	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NOT_VULNERABLE) {
> +		r = vmx_setup_l1d_flush();
>  		if (r) {
>  			vmx_exit();
>  			return r;

So this code should check for the magic MSR and act accordingly.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #2
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #2 Paolo Bonzini
@ 2018-07-16 20:04   ` Thomas Gleixner
  2018-07-16 20:31     ` Thomas Gleixner
  0 siblings, 1 reply; 20+ messages in thread
From: Thomas Gleixner @ 2018-07-16 20:04 UTC (permalink / raw)
  To: speck

On Sun, 15 Jul 2018, speck for Paolo Bonzini wrote:

> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH 2/4] x86: use ARCH_CAPABILITIES to skip L1D flush on vmentry
> 
> Bit 3 of ARCH_CAPABILITIES tells a hypervisor that L1D flush on vmentry is
> not needed.  In that case, KVM can behave as if there is no L1TF bug at all.

Are you sure? It just says that you can skip the L1D flush, but it does not
say that this is protected completely also against all the SMP issues

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #4
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #4 Paolo Bonzini
  2018-07-16 14:58   ` [MODERATED] " Konrad Rzeszutek Wilk
@ 2018-07-16 20:06   ` Thomas Gleixner
  2018-07-17 11:14     ` [MODERATED] " Paolo Bonzini
  1 sibling, 1 reply; 20+ messages in thread
From: Thomas Gleixner @ 2018-07-16 20:06 UTC (permalink / raw)
  To: speck

On Sun, 15 Jul 2018, speck for Paolo Bonzini wrote:
>  
> +u64 kvm_get_arch_capabilities(void)
> +{
> +	u64 data;
> +
> +	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
> +	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
> +		data |= ARCH_CAP_SKIP_L1DFL_VMENTRY;

Why is this not checking for BUG_L1TF? If the host does not have it then
the guest should not have it either, right?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #2
  2018-07-16 20:04   ` Thomas Gleixner
@ 2018-07-16 20:31     ` Thomas Gleixner
  2018-07-16 20:41       ` [MODERATED] " Luck, Tony
  0 siblings, 1 reply; 20+ messages in thread
From: Thomas Gleixner @ 2018-07-16 20:31 UTC (permalink / raw)
  To: speck

On Mon, 16 Jul 2018, speck for Thomas Gleixner wrote:
> On Sun, 15 Jul 2018, speck for Paolo Bonzini wrote:
> 
> > From: Paolo Bonzini <pbonzini@redhat.com>
> > Subject: [PATCH 2/4] x86: use ARCH_CAPABILITIES to skip L1D flush on vmentry
> > 
> > Bit 3 of ARCH_CAPABILITIES tells a hypervisor that L1D flush on vmentry is
> > not needed.  In that case, KVM can behave as if there is no L1TF bug at all.
> 
> Are you sure? It just says that you can skip the L1D flush, but it does not
> say that this is protected completely also against all the SMP issues

The magic PDF says:

   A nested VMM that finds IA32_FLUSH_CMD is enumerated should check whether
   IA32_ARCH_CAPABILITIES bit 3 (SKIP_L1DFL_VMENTRY) is set, which indicates
   that it is not required to flush L1D on VMENTER.
   
   First-level VMMs which perform an L1D flush before VMENTER may set
   SKIP_L1DFL_VMENTRY in the IA32_ARCH_CAPABILITIES value exposed to guests.
   These VMMs should set SKIP_L1DFL_VMENTRY in any case where a nested VMM may
   be present.

I'm not seeing how that prevents the SMP crap. And also the information in
the nested VMM should be:

    ..... VMX ... L1D Flush: Host VMM

or such. The mitigation really depends on what the Host VMM does and just
claiming !vulnerable might be horribly misleading.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #2
  2018-07-16 20:31     ` Thomas Gleixner
@ 2018-07-16 20:41       ` Luck, Tony
  2018-07-16 21:13         ` Thomas Gleixner
  0 siblings, 1 reply; 20+ messages in thread
From: Luck, Tony @ 2018-07-16 20:41 UTC (permalink / raw)
  To: speck

On Mon, Jul 16, 2018 at 10:31:31PM +0200, speck for Thomas Gleixner wrote:
> The magic PDF says:
> 
>    A nested VMM that finds IA32_FLUSH_CMD is enumerated should check whether
>    IA32_ARCH_CAPABILITIES bit 3 9 (SKIP_L1DFL_VMENTRY) is set, which indicates
>    that it is not required to flush L1D on VMENTER.
>    
>    First-level VMMs which perform an L1D flush before VMENTER may set
>    SKIP_L1DFL_VMENTRY in the IA32_ARCH_CAPABILITIES value exposed to guests.
>    These VMMs should set SKIP_L1DFL_VMENTRY in any case where a nested VMM may
>    be present.

This is intended as an optimization to stop uselessly re-flushing L1D on
entry to a stack of nested VMMs.

E.g. if you are running a VMM "A", which is running a nested VMM "B", which
is running a guest "C".  When C does something to cause a VMEXIT we will pop
all the way out to "A". It can take whatever action it needs, and then do
a flush L1D before the VMENTER back into "B".

Now "B" won't need to flush L1D again before doing the VMENTER to get
back into "C".

So it is just a s/w convention to let nested VMMs on systems vulnerable
to L1TF know that they *are* nested VMMs and they can skip just this one
part of mitigation.

-Tony

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #0
  2018-07-15 13:57 [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Paolo Bonzini
                   ` (3 preceding siblings ...)
  2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #4 Paolo Bonzini
@ 2018-07-16 20:44 ` Konrad Rzeszutek Wilk
  2018-07-17 11:22   ` Paolo Bonzini
  4 siblings, 1 reply; 20+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-07-16 20:44 UTC (permalink / raw)
  To: speck

On Sun, Jul 15, 2018 at 03:57:12PM +0200, speck for Paolo Bonzini wrote:
> Support for ARCH_CAPABILITIES bit 3, which can already be used to disable
> the mitigations on a nested hypervisor.  Patch 3 will shortly go to Linus's
> tree.

Paolo,

I am thinking I am missing something here. I was expecting if I do
rdmsr 0x10a inside the guest I would get '8' but instead I got '0'?

The guest does see CPUID.7.EDX. Bit 29 so the guest does expose 'arch_capability'
in the /proc/cpuinfo flags.

And the data |= ARCH_CAP_SKIP.. is certainly run (added a printk there just
in case).

Thoughts?

(I wrote the patch before I saw the Intel's patch). This is on top of yesterday
QEMU's master tree.

From 09742087d36c6b786c10cc2bc0539aaed0f80e96 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 13 Jul 2018 13:57:31 +0000
Subject: [PATCH] i386: Define the IA32_ARCH_CAPABILITIES capability.

GeminiLake CPUs have this and exposing this to the guest
allows kernels to sample the Enhanced IBRS support.

The definition of this MSR and detection (CPUID.(EAX=7H,ECX=0):EDX[29])
is nicely described in the Intel whitepaper titled:
336996-Speculative-Execution-Side-Channel-Mitigations.pdf

A copy of this document is available at
   https://bugzilla.kernel.org/show_bug.cgi?id=199511

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 target/i386/cpu.c     |  2 +-
 target/i386/cpu.h     |  3 ++-
 target/i386/kvm.c     | 13 +++++++++++++
 target/i386/machine.c | 20 ++++++++++++++++++++
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index e0e2f2e..070cb7c 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1000,7 +1000,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
             NULL, NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
             NULL, NULL, "spec-ctrl", NULL,
-            NULL, NULL, NULL, "ssbd",
+            NULL, "arch-cap", NULL, "ssbd",
         },
         .cpuid_eax = 7,
         .cpuid_needs_ecx = true, .cpuid_ecx = 0,
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 2c5a0d9..2211b01 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -355,7 +355,7 @@ typedef enum X86Seg {
 #define MSR_IA32_SPEC_CTRL              0x48
 #define MSR_VIRT_SSBD                   0xc001011f
 #define MSR_IA32_TSCDEADLINE            0x6e0
-
+#define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define FEATURE_CONTROL_LOCKED                    (1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
 #define FEATURE_CONTROL_LMCE                      (1<<20)
@@ -1212,6 +1212,7 @@ typedef struct CPUX86State {
 
     uint64_t spec_ctrl;
     uint64_t virt_ssbd;
+    uint64_t arch_cap;
 
     /* End of state preserved by INIT (dummy marker).  */
     struct {} end_init_save;
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index ebb2d23..b96d9f6 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -93,6 +93,7 @@ static bool has_msr_hv_reenlightenment;
 static bool has_msr_xss;
 static bool has_msr_spec_ctrl;
 static bool has_msr_virt_ssbd;
+static bool has_msr_arch_cap;
 static bool has_msr_smi_count;
 
 static uint32_t has_architectural_pmu_version;
@@ -1288,6 +1289,9 @@ static int kvm_get_supported_msrs(KVMState *s)
                 case MSR_VIRT_SSBD:
                     has_msr_virt_ssbd = true;
                     break;
+                case MSR_IA32_ARCH_CAPABILITIES:
+                    has_msr_arch_cap = true;
+                    break;
                 }
             }
         }
@@ -1802,6 +1806,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
     if (has_msr_virt_ssbd) {
         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
     }
+    if (has_msr_arch_cap) {
+        kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, env->arch_cap);
+    }
 
 #ifdef TARGET_X86_64
     if (lm_capable_kernel) {
@@ -2185,6 +2192,9 @@ static int kvm_get_msrs(X86CPU *cpu)
     if (has_msr_virt_ssbd) {
         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
     }
+    if (has_msr_arch_cap) {
+        kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, 0);
+    }
     if (!env->tsc_valid) {
         kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
         env->tsc_valid = !runstate_is_running();
@@ -2567,6 +2577,9 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_VIRT_SSBD:
             env->virt_ssbd = msrs[i].data;
             break;
+        case MSR_IA32_ARCH_CAPABILITIES:
+            env->arch_cap = msrs[i].data;
+            break;
         case MSR_IA32_RTIT_CTL:
             env->msr_rtit_ctrl = msrs[i].data;
             break;
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 8b64dff..963cfc0 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -955,6 +955,25 @@ static const VMStateDescription vmstate_svm_npt = {
     }
 };
 
+static bool arch_cap_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return env->arch_cap != 0;
+}
+
+static const VMStateDescription vmstate_msr_arch_cap = {
+    .name = "cpu/arch_cap",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = arch_cap_needed,
+    .fields = (VMStateField[]){
+        VMSTATE_UINT64(env.arch_cap, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 VMStateDescription vmstate_x86_cpu = {
     .name = "cpu",
     .version_id = 12,
@@ -1080,6 +1099,7 @@ VMStateDescription vmstate_x86_cpu = {
         &vmstate_msr_intel_pt,
         &vmstate_msr_virt_ssbd,
         &vmstate_svm_npt,
+        &vmstate_msr_arch_cap,
         NULL
     }
 };
-- 
1.8.3.1


> 
> Paolo Bonzini (4):
>   KVM: test L1TF EPT bug separately from X86_BUG_L1TF
>   x86: use ARCH_CAPABILITIES to skip L1D flush on vmentry
>   KVM: VMX: support MSR_IA32_ARCH_CAPABILITIES as a feature MSR
>   KVM: VMX: tell nested hypervisor to skip L1D flush on vmentry
> 
>  Documentation/admin-guide/l1tf.rst | 24 ++++++++++++++++++++++++
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/include/asm/msr-index.h   |  1 +
>  arch/x86/include/asm/vmx.h         |  1 +
>  arch/x86/kernel/cpu/bugs.c         | 26 ++++++++++++++++++++------
>  arch/x86/kvm/vmx.c                 | 35 ++++++++++++++++++++++++-----------
>  arch/x86/kvm/x86.c                 | 17 ++++++++++++++++-
>  7 files changed, 87 insertions(+), 18 deletions(-)
> 
> -- 
> 1.8.3.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #2
  2018-07-16 20:41       ` [MODERATED] " Luck, Tony
@ 2018-07-16 21:13         ` Thomas Gleixner
  0 siblings, 0 replies; 20+ messages in thread
From: Thomas Gleixner @ 2018-07-16 21:13 UTC (permalink / raw)
  To: speck

On Mon, 16 Jul 2018, speck for Luck, Tony wrote:

> On Mon, Jul 16, 2018 at 10:31:31PM +0200, speck for Thomas Gleixner wrote:
> > The magic PDF says:
> > 
> >    A nested VMM that finds IA32_FLUSH_CMD is enumerated should check whether
> >    IA32_ARCH_CAPABILITIES bit 3 (SKIP_L1DFL_VMENTRY) is set, which indicates
> >    that it is not required to flush L1D on VMENTER.
> >    
> >    First-level VMMs which perform an L1D flush before VMENTER may set
> >    SKIP_L1DFL_VMENTRY in the IA32_ARCH_CAPABILITIES value exposed to guests.
> >    These VMMs should set SKIP_L1DFL_VMENTRY in any case where a nested VMM may
> >    be present.
> 
> This is intended as an optimization to stop uselessly re-flushing L1D on
> entry to a stack of nested VMMs.
> 
> E.g. if you are running a VMM "A", which is running a nested VMM "B", which
> is running a guest "C".  When C does something to cause a VMEXIT we will pop
> all the way out to "A". It can take whatever action it needs, and then do
> a flush L1D before the VMENTER back into "B".
> 
> Now "B" won't need to flush L1D again before doing the VMENTER to get
> back into "C".
> 
> So it is just a s/w convention to let nested VMMs on systems vulnerable
> to L1TF know that they *are* nested VMMs and they can skip just this one
> part of mitigation.

Yes, that's how I understood it. But that means only that the optimization
is in effect and that the nested VMM does not exactly know what the
mitigation state is because it does not know whether the host VMM flushes
conditionally or unconditionally.

It neither knows about SMT state because the topology exposed to the L1
guest does not necessarily reflect the host situation.

So claiming !vulnerable because that optimization bit is set is not really
a brilliant idea.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #4
  2018-07-16 20:06   ` Thomas Gleixner
@ 2018-07-17 11:14     ` Paolo Bonzini
  0 siblings, 0 replies; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-17 11:14 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 573 bytes --]

On 16/07/2018 22:06, speck for Thomas Gleixner wrote:
>> +
>> +	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
>> +	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
>> +		data |= ARCH_CAP_SKIP_L1DFL_VMENTRY;
> Why is this not checking for BUG_L1TF? If the host does not have it then
> the guest should not have it either, right?

The CPU model of the host can also be different from the guest (of
course the latter should be older).  Always setting the bit avoids the
L1D flushes even if the CPU model does not match one of those in
cpu_no_l1tf.

Paolo


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #1
  2018-07-16 20:02   ` Thomas Gleixner
@ 2018-07-17 11:20     ` Paolo Bonzini
  2018-07-17 11:28       ` Thomas Gleixner
  0 siblings, 1 reply; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-17 11:20 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 3676 bytes --]

I'll send more patches today or tomorrow, in the meanwhile a couple
clarifications.

On 16/07/2018 22:02, speck for Thomas Gleixner wrote:
> On Sun, 15 Jul 2018, speck for Paolo Bonzini wrote:
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index c5c0118b126d..bf0801399cb4 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -208,14 +208,17 @@
>>  #define L1D_CACHE_ORDER 4
>>  static void *vmx_l1d_flush_pages;
>>  
>> -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
>> +static bool vmx_l1tf_vulnerable(void)
>> +{
>> +	return l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NOT_VULNERABLE &&
>> +		l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_EPT_DISABLED;
> 
> That's really a bad choice.
> 
> Not vulnerable is when the CPU does not have X86_BUG_L1TF.

It is a bit more complicated, if SMT is disabled and L1D flush on entry
is not required, then effectively _as far as VMX is concerned_ the
processor is not vulnerable.  I'm not sure if we want this to affect
disable_smt.

> If EPT=n then
> the CPU is vulnerable, but EPT disabled mitigates the issue.
> 
>> +}
>> +
>> +static int vmx_update_l1d_flush(enum vmx_l1d_flush_state l1tf)
>>  {
>>  	struct page *page;
>>  
>> -	if (!enable_ept) {
>> -		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
>> -		return 0;
>> -	}
>> +	WARN_ON(!vmx_l1tf_vulnerable());
>>  
>>  	/* If set to auto use the default l1tf mitigation method */
>>  	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
>> @@ -276,7 +289,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
>>  {
>>  	int l1tf, ret;
>>  
>> -	if (!boot_cpu_has(X86_BUG_L1TF))
>> +	if (!vmx_l1tf_vulnerable())
>>  		return 0;
> 
> So you check for !vulnerable here and return. If vulnerable you call
> vmx_update_l1d_flush() which has another check with a warnon. That's
> convoluted, really.

There are other calls to vmx_update_l1d_flush, the WARN_ON checks a
precondition that should be true for all such calls.  The precondition
can be true because the caller checks it, because of where the call is, etc.

I don't see what's convoluted about it, but actually there's a race
between two calls to vmentry_l1d_flush_set that could trigger the
WARN_ON, so this will have to be changed as well.

>>  	l1tf = vmentry_l1d_flush_parse(s);
>> @@ -295,7 +308,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
>>  	}
>>  
>>  	mutex_lock(&vmx_l1d_flush_mutex);
>> -	ret = vmx_setup_l1d_flush(l1tf);
>> +	ret = vmx_update_l1d_flush(l1tf);
>>  	mutex_unlock(&vmx_l1d_flush_mutex);
>>  	return ret;
>>  }
>> @@ -10592,7 +10605,7 @@ static int vmx_vm_init(struct kvm *kvm)
>>  	if (!ple_gap)
>>  		kvm->arch.pause_in_guest = true;
>>  
>> -	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
>> +	if (vmx_l1tf_vulnerable()) {
> 
> This should be a check for vmx_l1tf_flush_required() which checks for
> 
>      BUG_L1TF, enable_ept and then for that magic MSR bit.

Ok.

>>  		switch (l1tf_mitigation) {
>>  		case L1TF_MITIGATION_OFF:
>>  		case L1TF_MITIGATION_FLUSH_NOWARN:
>> @@ -13383,8 +13396,8 @@ static int __init vmx_init(void)
>>  	 * contain 'auto' which will be turned into the default 'cond'
>>  	 * mitigation mode.
>>  	 */
>> -	if (boot_cpu_has(X86_BUG_L1TF)) {
>> -		r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
>> +	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NOT_VULNERABLE) {
>> +		r = vmx_setup_l1d_flush();
>>  		if (r) {
>>  			vmx_exit();
>>  			return r;
> 
> So this code should check for the magic MSR and act accordingly.
> 
> Thanks,
> 
> 	tglx
> 



^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #0
  2018-07-16 20:44 ` [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Konrad Rzeszutek Wilk
@ 2018-07-17 11:22   ` Paolo Bonzini
  0 siblings, 0 replies; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-17 11:22 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 525 bytes --]

On 16/07/2018 22:44, speck for Konrad Rzeszutek Wilk wrote:
> I am thinking I am missing something here. I was expecting if I do
> rdmsr 0x10a inside the guest I would get '8' but instead I got '0'?
> 
> The guest does see CPUID.7.EDX. Bit 29 so the guest does expose 'arch_capability'
> in the /proc/cpuinfo flags.
> 
> And the data |= ARCH_CAP_SKIP.. is certainly run (added a printk there just
> in case).
> 
> Thoughts?

I tested a similar patch and it worked, so I'll try yours and let you know.

Paolo


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #1
  2018-07-17 11:20     ` [MODERATED] " Paolo Bonzini
@ 2018-07-17 11:28       ` Thomas Gleixner
  2018-07-17 17:11         ` [MODERATED] " Paolo Bonzini
  0 siblings, 1 reply; 20+ messages in thread
From: Thomas Gleixner @ 2018-07-17 11:28 UTC (permalink / raw)
  To: speck

On Tue, 17 Jul 2018, speck for Paolo Bonzini wrote:
> On 16/07/2018 22:02, speck for Thomas Gleixner wrote:
> >> +static bool vmx_l1tf_vulnerable(void)
> >> +{
> >> +	return l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NOT_VULNERABLE &&
> >> +		l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_EPT_DISABLED;
> > 
> > That's really a bad choice.
> > 
> > Not vulnerable is when the CPU does not have X86_BUG_L1TF.
> 
> It is a bit more complicated, if SMT is disabled and L1D flush on entry
> is not required, then effectively _as far as VMX is concerned_ the
> processor is not vulnerable.  I'm not sure if we want this to affect
> disable_smt.

The important part is that the host has SMT disabled, but the L1 guest,
i.e. nested VMM, does not know that at all.

So for the L1 guest even if the host sets the SKIP FLUSH bit, it's not
clear whether it might still be vulnerable via SMT.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #1
  2018-07-17 11:28       ` Thomas Gleixner
@ 2018-07-17 17:11         ` Paolo Bonzini
  2018-07-17 19:23           ` Thomas Gleixner
  0 siblings, 1 reply; 20+ messages in thread
From: Paolo Bonzini @ 2018-07-17 17:11 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 679 bytes --]

On 17/07/2018 13:28, speck for Thomas Gleixner wrote:
>> It is a bit more complicated, if SMT is disabled and L1D flush on entry
>> is not required, then effectively _as far as VMX is concerned_ the
>> processor is not vulnerable.  I'm not sure if we want this to affect
>> disable_smt.
> The important part is that the host has SMT disabled, but the L1 guest,
> i.e. nested VMM, does not know that at all.
> 
> So for the L1 guest even if the host sets the SKIP FLUSH bit, it's not
> clear whether it might still be vulnerable via SMT.

Yes, the SMT configuration of the host and the nested VMM must be the
same, otherwise things become too complicated.

Paolo


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #1
  2018-07-17 17:11         ` [MODERATED] " Paolo Bonzini
@ 2018-07-17 19:23           ` Thomas Gleixner
  0 siblings, 0 replies; 20+ messages in thread
From: Thomas Gleixner @ 2018-07-17 19:23 UTC (permalink / raw)
  To: speck

On Tue, 17 Jul 2018, speck for Paolo Bonzini wrote:
> On 17/07/2018 13:28, speck for Thomas Gleixner wrote:
> >> It is a bit more complicated, if SMT is disabled and L1D flush on entry
> >> is not required, then effectively _as far as VMX is concerned_ the
> >> processor is not vulnerable.  I'm not sure if we want this to affect
> >> disable_smt.
> > The important part is that the host has SMT disabled, but the L1 guest,
> > i.e. nested VMM, does not know that at all.
> > 
> > So for the L1 guest even if the host sets the SKIP FLUSH bit, it's not
> > clear whether it might still be vulnerable via SMT.
> 
> Yes, the SMT configuration of the host and the nested VMM must be the
> same, otherwise things become too complicated.

But what guarantees that?

And how is that information transported to the L1 guest (nested VMM) so
that you can provided useful information, e.g. in the sysfs vulnerability
file?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2018-07-17 19:23 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-15 13:57 [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Paolo Bonzini
2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #1 Paolo Bonzini
2018-07-16 14:36   ` [MODERATED] " Konrad Rzeszutek Wilk
2018-07-16 20:02   ` Thomas Gleixner
2018-07-17 11:20     ` [MODERATED] " Paolo Bonzini
2018-07-17 11:28       ` Thomas Gleixner
2018-07-17 17:11         ` [MODERATED] " Paolo Bonzini
2018-07-17 19:23           ` Thomas Gleixner
2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #2 Paolo Bonzini
2018-07-16 20:04   ` Thomas Gleixner
2018-07-16 20:31     ` Thomas Gleixner
2018-07-16 20:41       ` [MODERATED] " Luck, Tony
2018-07-16 21:13         ` Thomas Gleixner
2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #3 Paolo Bonzini
2018-07-15 13:57 ` [MODERATED] [PATCH] L1TF KVM ARCH_CAPABILITIES #4 Paolo Bonzini
2018-07-16 14:58   ` [MODERATED] " Konrad Rzeszutek Wilk
2018-07-16 20:06   ` Thomas Gleixner
2018-07-17 11:14     ` [MODERATED] " Paolo Bonzini
2018-07-16 20:44 ` [MODERATED] Re: [PATCH] L1TF KVM ARCH_CAPABILITIES #0 Konrad Rzeszutek Wilk
2018-07-17 11:22   ` Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.