* [MODERATED] [PATCH 0/2] L1TF KVM 0
@ 2018-05-29 19:42 Paolo Bonzini
  2018-05-29 19:42 ` [MODERATED] [PATCH 1/2] L1TF KVM 1 Paolo Bonzini
                   ` (6 more replies)
  0 siblings, 7 replies; 38+ messages in thread
From: Paolo Bonzini @ 2018-05-29 19:42 UTC (permalink / raw)
  To: speck

Here is the first version of the L1 terminal fault KVM mitigation patches,
adding an L1D cache flush on vmentry.

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] [PATCH 1/2] L1TF KVM 1
  2018-05-29 19:42 [MODERATED] [PATCH 0/2] L1TF KVM 0 Paolo Bonzini
@ 2018-05-29 19:42 ` Paolo Bonzini
  2018-05-29 19:42 ` [MODERATED] [PATCH 2/2] L1TF KVM 2 Paolo Bonzini
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Paolo Bonzini @ 2018-05-29 19:42 UTC (permalink / raw)
  To: speck

This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".

"vmexit_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
"vmexit_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
in the kind of code that they execute.  The idea is based on Intel's
patches, but the final list of "confined" vmexits isn't quite.

Notably, L1 cache flushes are performed on EPT violations (which are
basically KVM-level page faults), vmexits that involve the emulator,
and on every KVM_RUN invocation (so each userspace exit).  However,
most vmexits are considered safe.  I singled out the emulator because
it may be a good target for other speculative execution-based threats,
and the MMU because it can bring host page tables in the L1 cache.

The mitigation does not in any way try to do anything about hyperthreading;
it is possible for a sibling thread to read data from the cache during a
vmexit, before the host completes the flush, or to read data from the cache
while a sibling runs.  This part of the work is not ready yet.

For now I'm leaving the default to "vmexit_l1d_flush=2", in case we need
to push out the patches in an emergency embargo break, but I don't think
it's the best setting.  The cost is up to 2.5x more expensive vmexits
on Haswell processors, and 30% on Coffee Lake (for the latter, this is
independent of whether microcode or the generic flush code are used).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  7 ++++-
 arch/x86/kvm/mmu.c              |  1 +
 arch/x86/kvm/svm.c              |  3 +-
 arch/x86/kvm/vmx.c              | 62 ++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              | 54 +++++++++++++++++++++++++++++++++--
 5 files changed, 122 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c25775fad4ed..ae4bab8b1f8a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -711,6 +711,9 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+	
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
 };
 
 struct kvm_lpage_info {
@@ -879,6 +882,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -937,7 +941,7 @@ struct kvm_x86_ops {
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
-	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
@@ -1446,6 +1450,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8494dbae41b9..3b1140b156b2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3836,6 +3836,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1fc05e428aba..849edcd31aad 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5404,8 +5404,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 		svm->asid_generation--;
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
 {
+	*need_l1d_flush = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3f1696570b41..b90ba122e73a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmexit_l1d_flush = 2;
+module_param_named(vmexit_l1d_flush, vmexit_l1d_flush, int, 0644);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -938,6 +941,8 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit
 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
 
+static DEFINE_PER_CPU(int, last_vector);
+
 enum {
 	VMX_VMREAD_BITMAP,
 	VMX_VMWRITE_BITMAP,
@@ -2423,6 +2428,59 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static inline bool vmx_handling_confined(int reason)
+{
+	switch (reason) {
+	case EXIT_REASON_EXCEPTION_NMI:
+	case EXIT_REASON_HLT:
+	case EXIT_REASON_PAUSE_INSTRUCTION:
+	case EXIT_REASON_APIC_WRITE:
+	case EXIT_REASON_MSR_WRITE:
+	case EXIT_REASON_VMCALL:
+	case EXIT_REASON_CR_ACCESS:
+	case EXIT_REASON_DR_ACCESS:
+	case EXIT_REASON_CPUID:
+	case EXIT_REASON_PREEMPTION_TIMER:
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_EOI_INDUCED:
+	case EXIT_REASON_WBINVD:
+	case EXIT_REASON_XSETBV:
+		/*
+		 * The next three set vcpu->arch.vcpu_unconfined themselves, so
+		 * we consider them confined here.
+		 */
+	case EXIT_REASON_EPT_VIOLATION:
+	case EXIT_REASON_EPT_MISCONFIG:
+	case EXIT_REASON_IO_INSTRUCTION:
+		return true;
+	case EXIT_REASON_EXTERNAL_INTERRUPT: {
+		int cpu = raw_smp_processor_id();
+		int vector = per_cpu(last_vector, cpu);
+		return vector == LOCAL_TIMER_VECTOR || vector == RESCHEDULE_VECTOR;
+	}
+	default:
+		return false;
+	}
+}
+
+static bool vmx_core_confined(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	return vmx_handling_confined(vmx->exit_reason);
+}
+
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
+{
+	vmx_save_host_state(vcpu);
+	if (vmexit_l1d_flush == 0 || !enable_ept)
+		*need_l1d_flush = false;
+	else if (vmexit_l1d_flush == 1)
+		*need_l1d_flush |= !vmx_core_confined(vcpu);
+	else
+		*need_l1d_flush = true;
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9457,11 +9515,13 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 		unsigned long entry;
 		gate_desc *desc;
 		struct vcpu_vmx *vmx = to_vmx(vcpu);
+		int cpu = raw_smp_processor_id();
 #ifdef CONFIG_X86_64
 		unsigned long tmp;
 #endif
 
 		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
+		per_cpu(last_vector, cpu) = vector;
 		desc = (gate_desc *)vmx->host_idt_base + vector;
 		entry = gate_offset(desc);
 		asm volatile(
@@ -12642,7 +12702,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 59371de5d722..ada9e55fc871 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -194,6 +194,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6026,6 +6027,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6509,10 +6512,40 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 };
 #endif
 
+
+#define L1D_CACHE_ORDER 3
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	asm volatile(
+		"movq %0, %%rax\n\t"
+		"leaq 65536(%0), %%rdx\n\t"
+		"11: \n\t"
+		"movzbl (%%rax), %%ecx\n\t"
+		"addq $4096, %%rax\n\t"
+		"cmpq %%rax, %%rdx\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzwl %%ax, %%edx\n\t"
+		"addl $64, %%eax\n\t"
+		"movzbl (%%rdx, %0), %%ecx\n\t"
+		"cmpl $65536, %%eax\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		:
+		: "r" (empty_zero_pages)
+		: "rax", "rbx", "rcx", "rdx");
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6532,10 +6565,15 @@ int kvm_arch_init(void *opaque)
 	}
 
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6566,6 +6604,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6590,6 +6630,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7233,6 +7274,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_cpu_accept_dm_intr(vcpu);
 
 	bool req_immediate_exit = false;
+	bool need_l1d_flush;
 
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -7372,7 +7414,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
-	kvm_x86_ops->prepare_guest_switch(vcpu);
+	need_l1d_flush = vcpu->arch.vcpu_unconfined;
+	vcpu->arch.vcpu_unconfined = false;
+	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
+	if (need_l1d_flush) {
+		vcpu->stat.l1d_flush++;
+		kvm_l1d_flush();
+	}
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7559,6 +7607,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8675,6 +8724,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [MODERATED] [PATCH 2/2] L1TF KVM 2
  2018-05-29 19:42 [MODERATED] [PATCH 0/2] L1TF KVM 0 Paolo Bonzini
  2018-05-29 19:42 ` [MODERATED] [PATCH 1/2] L1TF KVM 1 Paolo Bonzini
@ 2018-05-29 19:42 ` Paolo Bonzini
       [not found] ` <20180529194240.7F1336110A@crypto-ml.lab.linutronix.de>
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Paolo Bonzini @ 2018-05-29 19:42 UTC (permalink / raw)
  To: speck

Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
If it is active, the displacement flush is unnecessary.  Tested on
a Coffee Lake machine.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/include/asm/msr-index.h   | 3 +++
 arch/x86/kvm/x86.c                 | 4 ++++
 3 files changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 578793e97431..aebf89c4175d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -333,6 +333,7 @@
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D		(18*32+28) /* IA32_FLUSH_L1D MSR */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
 
 /*
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53d5b1b9255e..f43bd9f23053 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -65,6 +65,9 @@
 
 #define MSR_MTRRcap			0x000000fe
 
+#define MSR_IA32_FLUSH_L1D             0x10b
+#define MSR_IA32_FLUSH_L1D_VALUE       0x00000001
+
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ada9e55fc871..43738283aa2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6518,6 +6518,10 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 
 void kvm_l1d_flush(void)
 {
+	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+		wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE);
+		return;
+	}
 	asm volatile(
 		"movq %0, %%rax\n\t"
 		"leaq 65536(%0), %%rdx\n\t"
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 1/2] L1TF KVM 1
       [not found] ` <20180529194240.7F1336110A@crypto-ml.lab.linutronix.de>
@ 2018-05-29 22:49   ` Thomas Gleixner
  2018-05-29 23:54     ` [MODERATED] " Andrew Cooper
                       ` (3 more replies)
  0 siblings, 4 replies; 38+ messages in thread
From: Thomas Gleixner @ 2018-05-29 22:49 UTC (permalink / raw)
  To: speck

On Tue, 29 May 2018, speck for Paolo Bonzini wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH 1/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> 
> This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
> fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".

This is confusing at best. Why is this vmexit_l1d_flush? You flush on
VMENTER not on VMEXIT.

What you are doing is to decide whether the last exit reason requires a
flush or not. But that decision happens on VMENTER.

> Notably, L1 cache flushes are performed on EPT violations (which are
> basically KVM-level page faults), vmexits that involve the emulator,
> and on every KVM_RUN invocation (so each userspace exit).  However,
> most vmexits are considered safe.  I singled out the emulator because
> it may be a good target for other speculative execution-based threats,
> and the MMU because it can bring host page tables in the L1 cache.

What about interrupts?

> @@ -2423,6 +2428,59 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  				   vmx->guest_msrs[i].mask);
>  }
>  
> +static inline bool vmx_handling_confined(int reason)
> +{
> +	switch (reason) {
> +	case EXIT_REASON_EXCEPTION_NMI:
> +	case EXIT_REASON_HLT:
> +	case EXIT_REASON_PAUSE_INSTRUCTION:
> +	case EXIT_REASON_APIC_WRITE:
> +	case EXIT_REASON_MSR_WRITE:
> +	case EXIT_REASON_VMCALL:
> +	case EXIT_REASON_CR_ACCESS:
> +	case EXIT_REASON_DR_ACCESS:
> +	case EXIT_REASON_CPUID:
> +	case EXIT_REASON_PREEMPTION_TIMER:
> +	case EXIT_REASON_MSR_READ:
> +	case EXIT_REASON_EOI_INDUCED:
> +	case EXIT_REASON_WBINVD:
> +	case EXIT_REASON_XSETBV:
> +		/*
> +		 * The next three set vcpu->arch.vcpu_unconfined themselves, so
> +		 * we consider them confined here.

What's the logic behind that?

> +		 */
> +	case EXIT_REASON_EPT_VIOLATION:
> +	case EXIT_REASON_EPT_MISCONFIG:
> +	case EXIT_REASON_IO_INSTRUCTION:
> +		return true;
> +	case EXIT_REASON_EXTERNAL_INTERRUPT: {
> +		int cpu = raw_smp_processor_id();
> +		int vector = per_cpu(last_vector, cpu);
> +		return vector == LOCAL_TIMER_VECTOR || vector == RESCHEDULE_VECTOR;

That wants a comment why these two are considered safe.

The timer vector is a doubtful one. It does not necessarily cause a
reschedule and it can run arbitrary softirq code on interrupt return. I
wouldn't be that sure that it's safe.

> +	}
> +	default:
> +		return false;
> +	}
> +}
> +
> +static bool vmx_core_confined(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	return vmx_handling_confined(vmx->exit_reason);
> +}
> +
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> +{
> +	vmx_save_host_state(vcpu);
> +	if (vmexit_l1d_flush == 0 || !enable_ept)
> +		*need_l1d_flush = false;
> +	else if (vmexit_l1d_flush == 1)
> +		*need_l1d_flush |= !vmx_core_confined(vcpu);

This inverted logic does not make the code more readable. It's obfuscation
for no value.

> +	else
> +		*need_l1d_flush = true;
> +}

> @@ -9457,11 +9515,13 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
> 		unsigned long entry;
> 		gate_desc *desc;
>		struct vcpu_vmx *vmx = to_vmx(vcpu);
> +		int cpu = raw_smp_processor_id();
>  #ifdef CONFIG_X86_64
>  		unsigned long tmp;
>  #endif
>  
> 		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
> +		per_cpu(last_vector, cpu) = vector;

Why aren't you doing the evaluation of the vector right here and set the
unconfined bit instead of having yet another indirection and probably
another cache line for that per_cpu() storage? That does not make any
sense at all.
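
Something along these lines right in vmx_handle_external_intr() would do
(sketch only, keeping the same two vectors the patch treats as safe --
see above for whether the timer vector really belongs there):

	vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
	/* Mark the vCPU unconfined here instead of stashing the vector. */
	if (vector != LOCAL_TIMER_VECTOR && vector != RESCHEDULE_VECTOR)
		vcpu->arch.vcpu_unconfined = true;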

>		desc = (gate_desc *)vmx->host_idt_base + vector;
>		entry = gate_offset(desc);

> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6509,10 +6512,40 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
>  };
>  #endif
>  
> +
> +#define L1D_CACHE_ORDER 3

This should use the cache size information and not a hard coded value I think. 
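
Rough sketch of what that could look like instead of the #define (untested,
helper name made up, CPUID leaf 4 decoding from memory; the "2 *" only
mirrors the 64K-vs-32K fudge discussed elsewhere in this thread):

	static unsigned int l1d_flush_order(void)
	{
		unsigned int eax, ebx, ecx, edx, size;

		/* CPUID leaf 4, index 0: the L1 data cache on current Intel parts. */
		cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
		size = ((ebx >> 22) + 1) *		/* ways */
		       (((ebx >> 12) & 0x3ff) + 1) *	/* partitions */
		       ((ebx & 0xfff) + 1) *		/* line size */
		       (ecx + 1);			/* sets */

		/* Twice the detected L1D size, rounded up to a page order. */
		return get_order(2 * size);
	}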

> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> +	asm volatile(
> +		"movq %0, %%rax\n\t"
> +		"leaq 65536(%0), %%rdx\n\t"

Why 64K?

> +		"11: \n\t"
> +		"movzbl (%%rax), %%ecx\n\t"
> +		"addq $4096, %%rax\n\t"
> +		"cmpq %%rax, %%rdx\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"

What's the cpuid invocation for?

> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzwl %%ax, %%edx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"movzbl (%%rdx, %0), %%ecx\n\t"
> +		"cmpl $65536, %%eax\n\t"

And this whole magic should be documented.

> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		:
> +		: "r" (empty_zero_pages)
> +		: "rax", "rbx", "rcx", "rdx");

How is that supposed to compile on 32bit?

> +}

Aside of that do we really need that manual flush thingy? Is that ucode
update going to take forever?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-05-29 22:49   ` [PATCH 1/2] L1TF KVM 1 Thomas Gleixner
@ 2018-05-29 23:54     ` Andrew Cooper
  2018-05-30  9:01       ` Paolo Bonzini
  2018-05-30  8:55     ` [MODERATED] " Peter Zijlstra
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 38+ messages in thread
From: Andrew Cooper @ 2018-05-29 23:54 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 1698 bytes --]

On 29/05/2018 23:49, speck for Thomas Gleixner wrote:
> On Tue, 29 May 2018, speck for Paolo Bonzini wrote:
>> +static void *__read_mostly empty_zero_pages;
>> +
>> +void kvm_l1d_flush(void)
>> +{
>> +	asm volatile(
>> +		"movq %0, %%rax\n\t"
>> +		"leaq 65536(%0), %%rdx\n\t"
> Why 64K?
>
>> +		"11: \n\t"
>> +		"movzbl (%%rax), %%ecx\n\t"
>> +		"addq $4096, %%rax\n\t"
>> +		"cmpq %%rax, %%rdx\n\t"
>> +		"jne 11b\n\t"
>> +		"xorl %%eax, %%eax\n\t"
>> +		"cpuid\n\t"
> What's the cpuid invocation for?
>
>> +		"xorl %%eax, %%eax\n\t"
>> +		"12:\n\t"
>> +		"movzwl %%ax, %%edx\n\t"
>> +		"addl $64, %%eax\n\t"
>> +		"movzbl (%%rdx, %0), %%ecx\n\t"
>> +		"cmpl $65536, %%eax\n\t"
> And this whole magic should be documented.
>
>> +		"jne 12b\n\t"
>> +		"lfence\n\t"
>> +		:
>> +		: "r" (empty_zero_pages)
>> +		: "rax", "rbx", "rcx", "rdx");
> How is that supposed to compile on 32bit?
>
>> +}
> Aside of that do we really need that manual flush thingy? Is that ucode
> update going to take forever?

I already provided feedback on this software loop, but have had radio
silence as a result.  The CPUID is serialisation (best as I can guess)
to terminate any WC buffer which got hit, but this is going to truly
suck inside a VM.  If it is for full serialisation properties, the least
overhead option would be a write to %cr2, which is serialising on all
affected parts.
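
I.e. something like the below (sketch only; it relies on the behaviour
described above, not on anything the SDM guarantees):

	/* Rewrite %cr2 with its current value purely for the serialising effect. */
	write_cr2(read_cr2());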

Other bits I don't understand are the 64k limit in the first place, why
it gets walked over in 4k strides to begin with (I'm not aware of any
prefetching which would benefit that...) and why a particularly
obfuscated piece of magic is used for the 64byte strides.

~Andrew


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 2/2] L1TF KVM 2
       [not found] ` <20180529194322.8B56F610F8@crypto-ml.lab.linutronix.de>
@ 2018-05-29 23:59   ` Andrew Cooper
  2018-05-30  8:38     ` Thomas Gleixner
  0 siblings, 1 reply; 38+ messages in thread
From: Andrew Cooper @ 2018-05-29 23:59 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 3014 bytes --]

On 29/05/2018 20:42, speck for Paolo Bonzini wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH 2/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
>
> Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> If it is active, the displacement flush is unnecessary.  Tested on
> a Coffee Lake machine.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/cpufeatures.h | 1 +
>  arch/x86/include/asm/msr-index.h   | 3 +++
>  arch/x86/kvm/x86.c                 | 4 ++++
>  3 files changed, 8 insertions(+)
>
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 578793e97431..aebf89c4175d 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -333,6 +333,7 @@
>  #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
>  #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
>  #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
> +#define X86_FEATURE_FLUSH_L1D		(18*32+28) /* IA32_FLUSH_L1D MSR */
>  #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
>  
>  /*
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 53d5b1b9255e..f43bd9f23053 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -65,6 +65,9 @@
>  
>  #define MSR_MTRRcap			0x000000fe
>  
> +#define MSR_IA32_FLUSH_L1D             0x10b
> +#define MSR_IA32_FLUSH_L1D_VALUE       0x00000001
> +
>  #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
>  #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
>  #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ada9e55fc871..43738283aa2a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6518,6 +6518,10 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
>  
>  void kvm_l1d_flush(void)
>  {
> +	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> +		wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE);
> +		return;
> +	}

What is this supposed to achieve?  Sure, most of the cache content has
disappeared, but some of the most interesting parts are still left
available to the guest.

In Xen, I've come to the conclusion that the only viable option here is
for a guest load-only MSR entry.  Without this, the GPRs at the most
recent vmentry are still available in the cache (as there is no way for
the hypervisor to rationally zero them), which results in a guest kernel
=> guest user leak if a vmentry occurs late in the guest kernel's
return-to-userspace path.

A guest kernel which is implementing the PTE mitigation is immune to
this attack, but the hypervisor does not know a priori whether this is
the case or not.

~Andrew


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 2/2] L1TF KVM 2
  2018-05-29 23:59   ` [MODERATED] Re: [PATCH 2/2] L1TF KVM 2 Andrew Cooper
@ 2018-05-30  8:38     ` Thomas Gleixner
  2018-05-30 16:57       ` [MODERATED] " Andrew Cooper
  0 siblings, 1 reply; 38+ messages in thread
From: Thomas Gleixner @ 2018-05-30  8:38 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 812 bytes --]

On Wed, 30 May 2018, speck for Andrew Cooper wrote:
> On 29/05/2018 20:42, speck for Paolo Bonzini wrote:
> In Xen, I've come to the conclusion that the only viable option here is
> for a guest load-only MSR entry.  Without this, the GPRs at the most
> recent vmentry are still available in the cache (as there is no way for
> the hypervisor to rationally zero them), which results in a guest kenrel
> => guest user leak if a vmentry occurs late in the guest kernels
> return-to-userspace path.
> 
> A guest kernel which is implementing the PTE mitigation is immune to
> this attack, but the hypervisor does not know a priori whether this is
> the case or not.

And why would you care about some outdated guest kernel, which is
vulnerable against lots of other stuff as well?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-05-29 22:49   ` [PATCH 1/2] L1TF KVM 1 Thomas Gleixner
  2018-05-29 23:54     ` [MODERATED] " Andrew Cooper
@ 2018-05-30  8:55     ` Peter Zijlstra
  2018-05-30  9:02     ` Paolo Bonzini
  2018-05-31 19:00     ` Jon Masters
  3 siblings, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2018-05-30  8:55 UTC (permalink / raw)
  To: speck

On Wed, May 30, 2018 at 12:49:55AM +0200, speck for Thomas Gleixner wrote:
> > +	case EXIT_REASON_EXTERNAL_INTERRUPT: {
> > +		int cpu = raw_smp_processor_id();
> > +		int vector = per_cpu(last_vector, cpu);
> > +		return vector == LOCAL_TIMER_VECTOR || vector == RESCHEDULE_VECTOR;
> 
> That wants a comment why these two are considered safe.
> 
> The timer vector is a doubtful one. It does not necessarily cause a
> reschedule and it can run arbitrary softirq code on interrupt return. I
> wouldn't be that sure that it's safe.

reschedule can also cause softirq to run.

And there's just no way we can guarantee softirq doesn't do something
sensitive, like IPsec processing or whatever.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-05-29 23:54     ` [MODERATED] " Andrew Cooper
@ 2018-05-30  9:01       ` Paolo Bonzini
  2018-05-30 11:58         ` Martin Pohlack
  2018-06-04  8:24         ` [MODERATED] " Martin Pohlack
  0 siblings, 2 replies; 38+ messages in thread
From: Paolo Bonzini @ 2018-05-30  9:01 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 488 bytes --]

On 30/05/2018 01:54, speck for Andrew Cooper wrote:
> Other bits I don't understand are the 64k limit in the first place, why
> it gets walked over in 4k strides to begin with (I'm not aware of any
> prefetching which would benefit that...) and why a particularly
> obfuscated piece of magic is used for the 64byte strides.

That is the only part I understood, :) the 4k strides ensure that the
source data is in the TLB.  Why that is needed is still a mystery though.

Paolo


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-05-29 22:49   ` [PATCH 1/2] L1TF KVM 1 Thomas Gleixner
  2018-05-29 23:54     ` [MODERATED] " Andrew Cooper
  2018-05-30  8:55     ` [MODERATED] " Peter Zijlstra
@ 2018-05-30  9:02     ` Paolo Bonzini
  2018-05-31 19:00     ` Jon Masters
  3 siblings, 0 replies; 38+ messages in thread
From: Paolo Bonzini @ 2018-05-30  9:02 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 3853 bytes --]

On 30/05/2018 00:49, speck for Thomas Gleixner wrote:
> On Tue, 29 May 2018, speck for Paolo Bonzini wrote:
>> From: Paolo Bonzini <pbonzini@redhat.com>
>> Subject: [PATCH 1/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
>>
>> This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
>> fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".
> 
> This is confusing at best. Why is this vmexit_l1d_flush? You flush on
> VMENTER not on VMEXIT.

Yes, that makes sense.

>> Notably, L1 cache flushes are performed on EPT violations (which are
>> basically KVM-level page faults), vmexits that involve the emulator,
>> and on every KVM_RUN invocation (so each userspace exit).  However,
>> most vmexits are considered safe.  I singled out the emulator because
>> it may be a good target for other speculative execution-based threats,
>> and the MMU because it can bring host page tables in the L1 cache.
> 
> What about interrupts?

Will fix the commit message and rework the patch to set vcpu_unconfined
at vmexit time.

>> +		/*
>> +		 * The next three set vcpu->arch.vcpu_unconfined themselves, so
>> +		 * we consider them confined here.
> 
> What's the logic behind that?
> 
>> +		 */
>> +	case EXIT_REASON_EPT_VIOLATION:
>> +	case EXIT_REASON_EPT_MISCONFIG:
>> +	case EXIT_REASON_IO_INSTRUCTION:
>> +		return true;

EPT misconfig and I/O instruction are very frequent, and most of the
time they run just a small fast path.  EPT violation can be put together
with the "always slow" ones.

>> +	case EXIT_REASON_EXTERNAL_INTERRUPT: {
>> +		int cpu = raw_smp_processor_id();
>> +		int vector = per_cpu(last_vector, cpu);
>> +		return vector == LOCAL_TIMER_VECTOR || vector == RESCHEDULE_VECTOR;
> 
> That wants a comment why these two are considered safe.
> 
> The timer vector is a doubtful one. It does not necessarily cause a
> reschedule and it can run arbitrary softirq code on interrupt return. I
> wouldn't be that sure that it's safe.

It's also the most frequent one. :(  But I see your and Peter's point,
I'll drop it and consider all external interrupts to be unconfined.

Maybe there could be some kind of "ran softirq" generation count.
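
Roughly this (just a sketch of the idea; softirq_gen and the snapshot
field are made-up names):

	/* Bumped by the softirq code whenever softirq work actually runs. */
	DEFINE_PER_CPU(unsigned long, softirq_gen);

	/* At vmexit time: remember what the counter was. */
	vcpu->arch.softirq_gen_snapshot = this_cpu_read(softirq_gen);

	/* Before the next vmentry: if softirq ran in between, flush. */
	if (this_cpu_read(softirq_gen) != vcpu->arch.softirq_gen_snapshot)
		vcpu->arch.vcpu_unconfined = true;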

>> @@ -9457,11 +9515,13 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>> 		unsigned long entry;
>> 		gate_desc *desc;
>> 		struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +		int cpu = raw_smp_processor_id();
>>  #ifdef CONFIG_X86_64
>>  		unsigned long tmp;
>>  #endif
>>  
>> 		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
>> +		per_cpu(last_vector, cpu) = vector;
> 
> Why aren't you doing the evaluation of the vector right here and set the
> unconfined bit instead of having yet another indirection and probably
> another cache line for that per_cpu() storage? That does not make any
> sense at all.

Because that's how the patches I got from Intel did it, and I kind of
liked keeping all the logic in one function.  But it's going to go away,
it's not safe.

>> 		desc = (gate_desc *)vmx->host_idt_base + vector;
>> 		entry = gate_offset(desc);
> 
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -6509,10 +6512,40 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
>>  };
>>  #endif
>>  
>> +
>> +#define L1D_CACHE_ORDER 3
> 
> This should use the cache size information and not a hard coded value I think. 

Plus it seems wrong.  Order 3 is 32K, not 64K, isn't it? :/

> And this whole magic should be documented.
> 
> Aside of that do we really need that manual flush thingy? Is that ucode
> update going to take forever?

As Andrew said, this was also copied from Intel's patch, assuming they
knew what they were doing.  I'll just drop it and just use the microcode
MSR.

Paolo


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-05-30  9:01       ` Paolo Bonzini
@ 2018-05-30 11:58         ` Martin Pohlack
  2018-05-30 12:25           ` Thomas Gleixner
  2018-06-04  8:24         ` [MODERATED] " Martin Pohlack
  1 sibling, 1 reply; 38+ messages in thread
From: Martin Pohlack @ 2018-05-30 11:58 UTC (permalink / raw)
  To: speck

On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>> Other bits I don't understand are the 64k limit in the first place, why
>> it gets walked over in 4k strides to begin with (I'm not aware of any
>> prefetching which would benefit that...) and why a particularly
>> obfuscated piece of magic is used for the 64byte strides.
> 
> That is the only part I understood, :) the 4k strides ensure that the
> source data is in the TLB.  Why that is needed is still a mystery though.

I think the reasoning is that you first want to populate the TLB for the
whole flush array, then fence, to make sure TLB walks do not interfere
with the actual flushing later, either for performance reasons or for
preventing leakage of partial walk results.

Not sure about the 64K, it likely is about the LRU implementation for L1
replacement not being perfect (but pseudo LRU), so you need to flush
more than the L1 size (32K) in software.  But I have also seen smaller
recommendations for that (52K).
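
In C the asm amounts to roughly this (sketch only, same 64K buffer and
constants as the patch; the serialising cpuid and the final lfence are
inline asm in the real thing and only noted as comments here):

	static void l1d_flush_sw(const char *buf)	/* buf = empty_zero_pages, 64K */
	{
		int i;

		/* Pass 1: touch one byte per 4K page so all 16 pages are in the TLB. */
		for (i = 0; i < 16 * 4096; i += 4096)
			(void)READ_ONCE(buf[i]);

		/* cpuid(0) here: serialise before the displacement pass. */

		/*
		 * Pass 2: read one byte per 64-byte line over the whole 64K,
		 * displacing the (32K, pseudo-LRU) L1D contents.
		 */
		for (i = 0; i < 16 * 4096; i += 64)
			(void)READ_ONCE(buf[i]);

		/* lfence here: wait for the loads to complete. */
	}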

Martin

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 1/2] L1TF KVM 1
  2018-05-30 11:58         ` Martin Pohlack
@ 2018-05-30 12:25           ` Thomas Gleixner
  2018-05-30 14:31             ` Thomas Gleixner
  0 siblings, 1 reply; 38+ messages in thread
From: Thomas Gleixner @ 2018-05-30 12:25 UTC (permalink / raw)
  To: speck

On Wed, 30 May 2018, speck for Martin Pohlack wrote:

> -----BEGIN PGP MESSAGE-----
> Charset: windows-1252
> Version: GnuPG v2

Sorry the remailer failed to decrypt that message. Investigating.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 1/2] L1TF KVM 1
  2018-05-30 12:25           ` Thomas Gleixner
@ 2018-05-30 14:31             ` Thomas Gleixner
  0 siblings, 0 replies; 38+ messages in thread
From: Thomas Gleixner @ 2018-05-30 14:31 UTC (permalink / raw)
  To: speck

On Wed, 30 May 2018, speck for Thomas Gleixner wrote:
> On Wed, 30 May 2018, speck for Martin Pohlack wrote:
> 
> > -----BEGIN PGP MESSAGE-----
> > Charset: windows-1252
> > Version: GnuPG v2
> 
> Sorry the remailer failed to decrypt that message. Investigating.

It was missing a decoding from quoted-printable which is required due to
charset = windows-1252. Fixed and replayed.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 2/2] L1TF KVM 2
  2018-05-30  8:38     ` Thomas Gleixner
@ 2018-05-30 16:57       ` Andrew Cooper
  2018-05-30 19:11         ` Thomas Gleixner
  0 siblings, 1 reply; 38+ messages in thread
From: Andrew Cooper @ 2018-05-30 16:57 UTC (permalink / raw)
  To: speck

On 30/05/18 09:38, speck for Thomas Gleixner wrote:
> On Wed, 30 May 2018, speck for Andrew Cooper wrote:
>> On 29/05/2018 20:42, speck for Paolo Bonzini wrote:
>> In Xen, I've come to the conclusion that the only viable option here is
>> for a guest load-only MSR entry.  Without this, the GPRs at the most
>> recent vmentry are still available in the cache (as there is no way for
>> the hypervisor to rationally zero them), which results in a guest kernel
>> => guest user leak if a vmentry occurs late in the guest kernel's
>> return-to-userspace path.
>>
>> A guest kernel which is implementing the PTE mitigation is immune to
>> this attack, but the hypervisor does not know a priori whether this is
>> the case or not.
> And why would you care about some outdated guest kernel, which is
> vulnerable against lots of other stuff as well?

That's a very good point.  It need not matter for the guest kernel =>
user leak.

However, to avoid leaking other hypervisor data into guest context, you
need to account for the possibility of interrupts/exceptions late in the
vmentry path, which includes an NMI hitting on the instruction boundary
before vmresume.  Failure to account for this case will, most easily,
leak hypervisor GPRs into guest context.

Unlike MSR_SPEC_CTRL (which can be written "shortly before the
iret/vmentry"), issuing the flush before restoring GPRs is ineffective
at preventing leakage, and a write to MSR_FLUSH_CMD cannot be performed
after restoring GPRs, other than with a MSR guest load list entry.
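
For KVM that would be an entry in the VM-entry MSR-load area, roughly
(sketch only, reusing the MSR names from patch 2/2; a real version needs
a persistent, 16-byte-aligned allocation rather than a local):

	struct vmx_msr_entry flush_entry = {
		.index = MSR_IA32_FLUSH_L1D,		/* 0x10b */
		.value = MSR_IA32_FLUSH_L1D_VALUE,	/* write 1: flush L1D */
	};

	/*
	 * Loaded by the CPU itself on VM entry, after the host GPRs have
	 * already been replaced by guest values.
	 */
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(&flush_entry));
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 1);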

~Andrew

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 2/2] L1TF KVM 2
  2018-05-30 16:57       ` [MODERATED] " Andrew Cooper
@ 2018-05-30 19:11         ` Thomas Gleixner
  2018-05-30 21:10           ` [MODERATED] " Andi Kleen
  0 siblings, 1 reply; 38+ messages in thread
From: Thomas Gleixner @ 2018-05-30 19:11 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 1646 bytes --]

On Wed, 30 May 2018, speck for Andrew Cooper wrote:

> On 30/05/18 09:38, speck for Thomas Gleixner wrote:
> > On Wed, 30 May 2018, speck for Andrew Cooper wrote:
> >> On 29/05/2018 20:42, speck for Paolo Bonzini wrote:
> >> In Xen, I've come to the conclusion that the only viable option here is
> >> for a guest load-only MSR entry.  Without this, the GPRs at the most
> >> recent vmentry are still available in the cache (as there is no way for
> >> the hypervisor to rationally zero them), which results in a guest kernel
> >> => guest user leak if a vmentry occurs late in the guest kernel's
> >> return-to-userspace path.
> >>
> >> A guest kernel which is implementing the PTE mitigation is immune to
> >> this attack, but the hypervisor does not know a priori whether this is
> >> the case or not.
> > And why would you care about some outdated guest kernel, which is
> > vulnerable against lots of other stuff as well?
> 
> That's a very good point.  It need not matter for the guest kernel =>
> user leak.
> 
> However, to avoid leaking other hypervisor data into guest context, you
> need to account for the possibility of interrupts/exceptions late in the
> vmentry path, which includes an NMI hitting on the instruction boundary
> before vmresume.  Failure to account for this case will, most easily,
> leak hypervisor GPRs into guest context.

Right, but you really have to make a judgement whether this leak is useful
and can be orchestrated by the attacker in the guest. If it's just the
theoretical cornercase with no practical attack surface then you really can
leave that as an exercise for ivory tower people.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 2/2] L1TF KVM 2
  2018-05-30 19:11         ` Thomas Gleixner
@ 2018-05-30 21:10           ` Andi Kleen
  2018-05-30 23:19             ` Andrew Cooper
  0 siblings, 1 reply; 38+ messages in thread
From: Andi Kleen @ 2018-05-30 21:10 UTC (permalink / raw)
  To: speck

> Right, but you really have to make a judgement whether this leak is useful
> and can be orchestrated by the attacker in the guest. If it's just the
> theoretical cornercase with no practical attack surface then you really can
> leave that as an exercise for ivory tower people.

We care about user data and kernel secrets like keys.

NMIs don't have either.

(except for profile NMIs, but they only have the data of what
you just interrupted)

Same for machine checks etc. 

In fact most hard interrupts are likely uninteresting (except those
that copy user data), although soft interrupts may not be.

-Andi

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 2/2] L1TF KVM 2
  2018-05-30 21:10           ` [MODERATED] " Andi Kleen
@ 2018-05-30 23:19             ` Andrew Cooper
  0 siblings, 0 replies; 38+ messages in thread
From: Andrew Cooper @ 2018-05-30 23:19 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 1581 bytes --]

On 30/05/2018 22:10, speck for Andi Kleen wrote:
>> Right, but you really have to make a judgement whether this leak is useful
>> and can be orchestrated by the attacker in the guest. If it's just the
>> theoretical cornercase with no practical attack surface then you really can
>> leave that as an exercise for ivory tower people.
> We care about user data and kernel secrets like keys.
>
> NMIs don't have either.
>
> (except for profile NMIs, but they only have the data of what
> you just interrupted)
>
> Same for machine checks etc. 
>
> In fact most hard interrupts are likely uninteresting (except those
> that copy user data), although soft interrupts may not be.

Until we have instructions for how to turn off next-page prefetch
(Haswell and later, I believe), *any* memory access is liable to pull in
unrelated data from adjacent pages.  Accesses into the directmap are
liable to pull in data from a completely unrelated context, which in
Xen's case has a reasonable chance of being data from another VM, and in
Linux's case, data from another process.

Even with regular prefetching, you're highly likely to pull in a cache
line or two from an adjacent heap object.

Maybe I am being too pessimistic, but at this point I don't buy the
argument of "architecturally, we don't touch any sensitive data,
therefore it won't be in the cache".  Whether said data is usefully
extractable by an attacker is a different matter, but I don't feel
comfortable betting peoples data isolation on the expectation that it
isn't extractable.

~Andrew


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-05-29 22:49   ` [PATCH 1/2] L1TF KVM 1 Thomas Gleixner
                       ` (2 preceding siblings ...)
  2018-05-30  9:02     ` Paolo Bonzini
@ 2018-05-31 19:00     ` Jon Masters
  3 siblings, 0 replies; 38+ messages in thread
From: Jon Masters @ 2018-05-31 19:00 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 1145 bytes --]

On 05/29/2018 06:49 PM, speck for Thomas Gleixner wrote:
> On Tue, 29 May 2018, speck for Paolo Bonzini wrote:

>> +void kvm_l1d_flush(void)
>> +{
>> +	asm volatile(
>> +		"movq %0, %%rax\n\t"
>> +		"leaq 65536(%0), %%rdx\n\t"
> 
> Why 64K?
> 
>> +		"11: \n\t"
>> +		"movzbl (%%rax), %%ecx\n\t"
>> +		"addq $4096, %%rax\n\t"
>> +		"cmpq %%rax, %%rdx\n\t"
>> +		"jne 11b\n\t"
>> +		"xorl %%eax, %%eax\n\t"
>> +		"cpuid\n\t"

My guess is they're saying that the maximum L1D$ size is 64K so they
want to stride it 1 4K page at a time to get the prefetchers going ahead
of the next loop...this in theory will make the following loop "faster".

> What's the cpuid invocation for?
> 
>> +		"xorl %%eax, %%eax\n\t"
>> +		"12:\n\t"
>> +		"movzwl %%ax, %%edx\n\t"
>> +		"addl $64, %%eax\n\t"
>> +		"movzbl (%%rdx, %0), %%ecx\n\t"
>> +		"cmpl $65536, %%eax\n\t"

...which then tries to do 64 bytes (Intel cache line) at a time.

They use the CPUID as a serializing instruction to ensure the store has
been observed; others have commented on that.

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
       [not found] ` <20180529194239.768D561107@crypto-ml.lab.linutronix.de>
@ 2018-06-01 16:48   ` Konrad Rzeszutek Wilk
  2018-06-04 14:56     ` Paolo Bonzini
  0 siblings, 1 reply; 38+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-01 16:48 UTC (permalink / raw)
  To: speck

On Tue, May 29, 2018 at 09:42:13PM +0200, speck for Paolo Bonzini wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH 1/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> 
> This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
> fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".
> 
> "vmexit_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
> "vmexit_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
> in the kind of code that they execute.  The idea is based on Intel's
> patches, but the final list of "confined" vmexits isn't quite.

s/quite/quite fleshed out/
> 
> Notably, L1 cache flushes are performed on EPT violations (which are
> basically KVM-level page faults), vmexits that involve the emulator,
> and on every KVM_RUN invocation (so each userspace exit).  However,
> most vmexits are considered safe.  I singled out the emulator because
> it may be a good target for other speculative execution-based threats,
> and the MMU because it can bring host page tables in the L1 cache.
> 
> The mitigation does not in any way try to do anything about hyperthreading;
> it is possible for a sibling thread to read data from the cache during a
> vmexit, before the host completes the flush, or to read data from the cache
> while a sibling runs.  This part of the work is not ready yet.
> 
> For now I'm leaving the default to "vmexit_l1d_flush=2", in case we need
> to push out the patches in an emergency embargo break, but I don't think
> it's the best setting.  The cost is up to 2.5x more expensive vmexits
> on Haswell processors, and 30% on Coffee Lake (for the latter, this is
> independent of whether microcode or the generic flush code are used).
> 

This looks very much like what Jun had. If this was based off Intel code would
it make sense to give credit to him by saying something along:

"Ideas and some code based off from Jun .." ?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] [PATCH 1/2] L1TF KVM 1
  2018-05-30  9:01       ` Paolo Bonzini
  2018-05-30 11:58         ` Martin Pohlack
@ 2018-06-04  8:24         ` Martin Pohlack
  2018-06-04 13:11           ` [MODERATED] Is: Tim, Q to you. Was:Re: " Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 38+ messages in thread
From: Martin Pohlack @ 2018-06-04  8:24 UTC (permalink / raw)
  To: speck

[resending as new message as the replay seems to have been lost on at
least some mail paths]

On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>> Other bits I don't understand are the 64k limit in the first place, why
>> it gets walked over in 4k strides to begin with (I'm not aware of any
>> prefetching which would benefit that...) and why a particularly
>> obfuscated piece of magic is used for the 64byte strides.
> 
> That is the only part I understood, :) the 4k strides ensure that the
> source data is in the TLB.  Why that is needed is still a mystery though.

I think the reasoning is that you first want to populate the TLB for the
whole flush array, then fence, to make sure TLB walks do not interfere
with the actual flushing later, either for performance reasons or for
preventing leakage of partial walk results.

Not sure about the 64K, it likely is about the LRU implementation for L1
replacement not being perfect (but pseudo LRU), so you need to flush
more than the L1 size (32K) in software.  But I have also seen smaller
recommendations for that (52K).

Martin

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-04  8:24         ` [MODERATED] " Martin Pohlack
@ 2018-06-04 13:11           ` Konrad Rzeszutek Wilk
  2018-06-04 17:59             ` [MODERATED] Encrypted Message Tim Chen
                               ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-04 13:11 UTC (permalink / raw)
  To: speck

On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
> [resending as new message as the replay seems to have been lost on at
> least some mail paths]
> 
> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
> > On 30/05/2018 01:54, speck for Andrew Cooper wrote:
> >> Other bits I don't understand are the 64k limit in the first place, why
> >> it gets walked over in 4k strides to begin with (I'm not aware of any
> >> prefetching which would benefit that...) and why a particularly
> >> obfuscated piece of magic is used for the 64byte strides.
> >
> > That is the only part I understood, :) the 4k strides ensure that the
> > source data is in the TLB.  Why that is needed is still a mystery though.
> 
> I think the reasoning is that you first want to populate the TLB for the
> whole flush array, then fence, to make sure TLB walks do not interfere
> with the actual flushing later, either for performance reasons or for
> preventing leakage of partial walk results.
> 
> Not sure about the 64K, it likely is about the LRU implementation for L1
> replacement not being perfect (but pseudo LRU), so you need to flush
> more than the L1 size (32K) in software.  But I have also seen smaller
> recommendations for that (52K).

Isn't Tim Chen from Intel on this mailing list? Tim, could you find out
please?

> 
> Martin

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-06-01 16:48   ` [MODERATED] Re: [PATCH 1/2] L1TF KVM 1 Konrad Rzeszutek Wilk
@ 2018-06-04 14:56     ` Paolo Bonzini
  0 siblings, 0 replies; 38+ messages in thread
From: Paolo Bonzini @ 2018-06-04 14:56 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 2491 bytes --]

On 01/06/2018 18:48, speck for Konrad Rzeszutek Wilk wrote:
> On Tue, May 29, 2018 at 09:42:13PM +0200, speck for Paolo Bonzini wrote:
>> From: Paolo Bonzini <pbonzini@redhat.com>
>> Subject: [PATCH 1/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
>>
>> This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
>> fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".
>>
>> "vmexit_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
>> "vmexit_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
>> in the kind of code that they execute.  The idea is based on Intel's
>> patches, but the final list of "confined" vmexits isn't quite.
> 
> s/quite/quite fleshed out/

No, not quite based on Intel's patches. :)

>>
>> Notably, L1 cache flushes are performed on EPT violations (which are
>> basically KVM-level page faults), vmexits that involve the emulator,
>> and on every KVM_RUN invocation (so each userspace exit).  However,
>> most vmexits are considered safe.  I singled out the emulator because
>> it may be a good target for other speculative execution-based threats,
>> and the MMU because it can bring host page tables in the L1 cache.
>>
>> The mitigation does not in any way try to do anything about hyperthreading;
>> it is possible for a sibling thread to read data from the cache during a
>> vmexit, before the host completes the flush, or to read data from the cache
>> while a sibling runs.  This part of the work is not ready yet.
>>
>> For now I'm leaving the default to "vmexit_l1d_flush=2", in case we need
>> to push out the patches in an emergency embargo break, but I don't think
>> it's the best setting.  The cost is up to 2.5x more expensive vmexits
>> on Haswell processors, and 30% on Coffee Lake (for the latter, this is
>> independent of whether microcode or the generic flush code are used).
>>
> 
> This looks very much like what Jun had. If this was based off Intel code would
> it make sense to give credit to him by saying something along:
> 
> "Ideas and some code based off from Jun .." ?

I was not sure who the author of Intel's code was, so I left in the
references above.  Really the only thing I lifted from there was the L1
flush; the "confined" moniker got stuck in my brain, but all the other
code was rewritten from scratch.  I'll resend a new version with review
comments applied tomorrow or Wednesday.

Paolo


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Encrypted Message
  2018-06-04 13:11           ` [MODERATED] Is: Tim, Q to you. Was:Re: " Konrad Rzeszutek Wilk
@ 2018-06-04 17:59             ` Tim Chen
  2018-06-05  1:25             ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Jon Masters
  2018-06-05 23:34             ` [MODERATED] Encrypted Message Tim Chen
  2 siblings, 0 replies; 38+ messages in thread
From: Tim Chen @ 2018-06-04 17:59 UTC (permalink / raw)
  To: speck

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/rfc822-headers; protected-headers="v1", Size: 165 bytes --]

From: Tim Chen <tim.c.chen@linux.intel.com>
To: speck for Konrad Rzeszutek Wilk <speck@linutronix.de>
Subject: Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1

[-- Attachment #2: Type: text/plain, Size: 1464 bytes --]

On 06/04/2018 06:11 AM, speck for Konrad Rzeszutek Wilk wrote:
> On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
>> [resending as new message as the replay seems to have been lost on at
>> least some mail paths]
>>
>> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
>>> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>>>> Other bits I don't understand are the 64k limit in the first place, why
>>>> it gets walked over in 4k strides to begin with (I'm not aware of any
>>>> prefetching which would benefit that...) and why a particularly
>>>> obfuscated piece of magic is used for the 64byte strides.
>>>
>>> That is the only part I understood, :) the 4k strides ensure that the
>>> source data is in the TLB.  Why that is needed is still a mystery though.
>>
>> I think the reasoning is that you first want to populate the TLB for the
>> whole flush array, then fence, to make sure TLB walks do not interfere
>> with the actual flushing later, either for performance reasons or for
>> preventing leakage of partial walk results.
>>
>> Not sure about the 64K, it likely is about the LRU implementation for L1
>> replacement not being perfect (but pseudo LRU), so you need to flush
>> more than the L1 size (32K) in software.  But I have also seen smaller
>> recommendations for that (52K).
> 
> Isn't Tim Chen from Intel on this mailing list? Tim, could you find out
> please?
> 

Will do.

Tim


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-04 13:11           ` [MODERATED] Is: Tim, Q to you. Was:Re: " Konrad Rzeszutek Wilk
  2018-06-04 17:59             ` [MODERATED] Encrypted Message Tim Chen
@ 2018-06-05  1:25             ` Jon Masters
  2018-06-05  1:30               ` Linus Torvalds
  2018-06-05  7:10               ` Martin Pohlack
  2018-06-05 23:34             ` [MODERATED] Encrypted Message Tim Chen
  2 siblings, 2 replies; 38+ messages in thread
From: Jon Masters @ 2018-06-05  1:25 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 1848 bytes --]

On 06/04/2018 09:11 AM, speck for Konrad Rzeszutek Wilk wrote:
> On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
>> [resending as new message as the replay seems to have been lost on at
>> least some mail paths]
>>
>> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
>>> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>>>> Other bits I don't understand are the 64k limit in the first place, why
>>>> it gets walked over in 4k strides to begin with (I'm not aware of any
>>>> prefetching which would benefit that...) and why a particularly
>>>> obfuscated piece of magic is used for the 64byte strides.
>>>
>>> That is the only part I understood, :) the 4k strides ensure that the
>>> source data is in the TLB.  Why that is needed is still a mystery though.
>>
>> I think the reasoning is that you first want to populate the TLB for the
>> whole flush array, then fence, to make sure TLB walks do not interfere
>> with the actual flushing later, either for performance reasons or for
>> preventing leakage of partial walk results.
>>
>> Not sure about the 64K, it likely is about the LRU implementation for L1
>> replacement not being perfect (but pseudo LRU), so you need to flush
>> more than the L1 size (32K) in software.  But I have also seen smaller
>> recommendations for that (52K).
> 
> Isn't Tim Chen from Intel on this mailing list? Tim, could you find out
> please?

I had assumed it was for the more straightforward reason that $future
uarch has a 64K L1D$, at least according to "The Internet" (TM):

https://en.wikichip.org/wiki/intel/core_i3/i3-8121u

It ought to be associativity that impacts the displacement itself, not
the LRU policy since you still end up with the L1D being updated.

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-05  1:25             ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Jon Masters
@ 2018-06-05  1:30               ` Linus Torvalds
  2018-06-05  7:10               ` Martin Pohlack
  1 sibling, 0 replies; 38+ messages in thread
From: Linus Torvalds @ 2018-06-05  1:30 UTC (permalink / raw)
  To: speck



On Mon, 4 Jun 2018, speck for Jon Masters wrote:
> 
> I had assumed it was for the more straightforward reason that $future
> uarch has a 64K L1D$, at least according to "The Internet" (TM):
> 
> https://en.wikichip.org/wiki/intel/core_i3/i3-8121u

No, that 64kB is just 2x32kB due to two cores.

It's still 8-way and 32kB per core, from what I can tell.

              Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-05  1:25             ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Jon Masters
  2018-06-05  1:30               ` Linus Torvalds
@ 2018-06-05  7:10               ` Martin Pohlack
  1 sibling, 0 replies; 38+ messages in thread
From: Martin Pohlack @ 2018-06-05  7:10 UTC (permalink / raw)
  To: speck

On 05.06.2018 03:25, speck for Jon Masters wrote:
> On 06/04/2018 09:11 AM, speck for Konrad Rzeszutek Wilk wrote:
>> On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
>>> [resending as new message as the replay seems to have been lost on at
>>> least some mail paths]
>>>
>>> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
>>>> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>>>>> Other bits I don't understand are the 64k limit in the first place, why
>>>>> it gets walked over in 4k strides to begin with (I'm not aware of any
>>>>> prefetching which would benefit that...) and why a particularly
>>>>> obfuscated piece of magic is used for the 64byte strides.
>>>>
>>>> That is the only part I understood, :) the 4k strides ensure that the
>>>> source data is in the TLB.  Why that is needed is still a mystery though.
>>>
>>> I think the reasoning is that you first want to populate the TLB for the
>>> whole flush array, then fence, to make sure TLB walks do not interfere
>>> with the actual flushing later, either for performance reasons or for
>>> preventing leakage of partial walk results.
>>>
>>> Not sure about the 64K, it likely is about the LRU implementation for L1
>>> replacement not being perfect (but pseudo LRU), so you need to flush
>>> more than the L1 size (32K) in software.  But I have also seen smaller
>>> recommendations for that (52K).
>>
>> Isn't Tim Chen from Intel on this mailing list? Tim, could you find out
>> please?
> 
> I had assumed it was for the more straightforward reason that $future
> uarch has a 64K L1D$, at least according to "The Internet" (TM):
> 
> https://en.wikichip.org/wiki/intel/core_i3/i3-8121u
> 
> It ought to be associativity that impacts the displacement itself, not
> the LRU policy since you still end up with the L1D being updated.

Associativity should already be well covered as the flush array is laid
out contiguously, so reading 32K should be enough assuming a perfect LRU
implementation and no interference from the page table walker.
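
(For reference, with a 32K, 8-way L1D and 64-byte lines that is
32K / 64 / 8 = 64 sets; a contiguous 32K buffer maps exactly eight lines,
one per way, to each set, which is why 32K would suffice under true LRU.)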

Martin

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Encrypted Message
  2018-06-04 13:11           ` [MODERATED] Is: Tim, Q to you. Was:Re: " Konrad Rzeszutek Wilk
  2018-06-04 17:59             ` [MODERATED] Encrypted Message Tim Chen
  2018-06-05  1:25             ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Jon Masters
@ 2018-06-05 23:34             ` Tim Chen
  2018-06-05 23:37               ` Tim Chen
  2 siblings, 1 reply; 38+ messages in thread
From: Tim Chen @ 2018-06-05 23:34 UTC (permalink / raw)
  To: speck

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/rfc822-headers; protected-headers="v1", Size: 165 bytes --]

From: Tim Chen <tim.c.chen@linux.intel.com>
To: speck for Konrad Rzeszutek Wilk <speck@linutronix.de>
Subject: Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1

[-- Attachment #2: Type: text/plain, Size: 1779 bytes --]

On 06/04/2018 06:11 AM, speck for Konrad Rzeszutek Wilk wrote:
> On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
>> [resending as new message as the replay seems to have been lost on at
>> least some mail paths]
>>
>> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
>>> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>>>> Other bits I don't understand are the 64k limit in the first place, why
>>>> it gets walked over in 4k strides to begin with (I'm not aware of any
>>>> prefetching which would benefit that...) and why a particularly
>>>> obfuscated piece of magic is used for the 64byte strides.
>>>
>>> That is the only part I understood, :) the 4k strides ensure that the
>>> source data is in the TLB.  Why that is needed is still a mystery though.
>>
>> I think the reasoning is that you first want to populate the TLB for the
>> whole flush array, then fence, to make sure TLB walks do not interfere
>> with the actual flushing later, either for performance reasons or for
>> preventing leakage of partial walk results.
>>
>> Not sure about the 64K, it likely is about the LRU implementation for L1
>> replacement not being perfect (but pseudo LRU), so you need to flush
>> more than the L1 size (32K) in software.  But I have also seen smaller
>> recommendations for that (52K).
> 

Had some discussions with other Intel folks.

Our recommendation is not to use the software sequence for L1 clear but
use wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE).
We expect that all affected systems will be receiving a ucode update
to provide L1 clearing capability.

Yes, the 4k stride is for getting TLB walks out of the way and
the 64kB replacement is to accommodate pseudo LRU.
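
For reference, that recommendation boils down to something like the sketch
below on the KVM side (illustrative only: the MSR and value names are the
ones used in this thread, and X86_FEATURE_FLUSH_L1D stands in for whatever
enumeration bit the ucode update ends up advertising):

	/* Minimal sketch, not the posted patch. */
	static inline void vmx_flush_l1d(void)
	{
		if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
			wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE);
	}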

Thanks.

Tim


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Encrypted Message
  2018-06-05 23:34             ` [MODERATED] Encrypted Message Tim Chen
@ 2018-06-05 23:37               ` Tim Chen
  2018-06-07 19:11                 ` Tim Chen
  0 siblings, 1 reply; 38+ messages in thread
From: Tim Chen @ 2018-06-05 23:37 UTC (permalink / raw)
  To: speck

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/rfc822-headers; protected-headers="v1", Size: 165 bytes --]

From: Tim Chen <tim.c.chen@linux.intel.com>
To: speck for Konrad Rzeszutek Wilk <speck@linutronix.de>
Subject: Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1

[-- Attachment #2: Type: text/plain, Size: 1939 bytes --]

On 06/05/2018 04:34 PM, Tim Chen wrote:
> On 06/04/2018 06:11 AM, speck for Konrad Rzeszutek Wilk wrote:
>> On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
>>> [resending as new message as the replay seems to have been lost on at
>>> least some mail paths]
>>>
>>> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
>>>> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>>>>> Other bits I don't understand are the 64k limit in the first place, why
>>>>> it gets walked over in 4k strides to begin with (I'm not aware of any
>>>>> prefetching which would benefit that...) and why a particularly
>>>>> obfuscated piece of magic is used for the 64byte strides.
>>>>
>>>> That is the only part I understood, :) the 4k strides ensure that the
>>>> source data is in the TLB.  Why that is needed is still a mystery though.
>>>
>>> I think the reasoning is that you first want to populate the TLB for the
>>> whole flush array, then fence, to make sure TLB walks do not interfere
>>> with the actual flushing later, either for performance reasons or for
>>> preventing leakage of partial walk results.
>>>
>>> Not sure about the 64K, it likely is about the LRU implementation for L1
>>> replacement not being perfect (but pseudo LRU), so you need to flush
>>> more than the L1 size (32K) in software.  But I have also seen smaller
>>> recommendations for that (52K).
>>
> 
> Had some discussions with other Intel folks.
> 
> Our recommendation is not to use the software sequence for L1 clear but
> use wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE).
> We expect that all affected systems will be receiving a ucode update
> to provide L1 clearing capability.
> 
> Yes, the 4k stride is for getting TLB walks out of the way and
> the 64kB replacement is to accommodate pseudo LRU.

I will try to see if I can get hold of the relevant documentation
on pseudo LRU.

Tim


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
       [not found] ` <20180529194236.EDB8561100@crypto-ml.lab.linutronix.de>
@ 2018-06-06  0:34   ` Dave Hansen
  2018-06-06 14:15     ` Dave Hansen
  0 siblings, 1 reply; 38+ messages in thread
From: Dave Hansen @ 2018-06-06  0:34 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 809 bytes --]

On 05/29/2018 12:42 PM, speck for Paolo Bonzini wrote:
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);

There is also an Intel suggestion to have guard pages before and after
the L1D flush buffer.  As it stands, the prefetchers might pull data
into the cache from pages adjacent to the allocation you have there.

You can use vmalloc(), where we get (unmapped) guard pages already.  Or,
you can just oversize the allocation using:

	buf = alloc_pages_exact((PAGE_SIZE << L1D_CACHE_ORDER) + 2 * PAGE_SIZE,
				GFP_ATOMIC);

and just point empty_zero_pages to the second page in the buffer:

	empty_zero_pages = buf + PAGE_SIZE;

I'd suggest the alloc_pages_exact() version.  It will chew up fewer TLB
entries.


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-06-06  0:34   ` Dave Hansen
@ 2018-06-06 14:15     ` Dave Hansen
  0 siblings, 0 replies; 38+ messages in thread
From: Dave Hansen @ 2018-06-06 14:15 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 729 bytes --]

>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +

Couple more things:

PAGE_SIZE << L1D_CACHE_ORDER is only 32k, right?  The assembly sequence
clears 64k IIRC:

> +void kvm_l1d_flush(void)
> +{
> +	asm volatile(
> +		"movq %0, %%rax\n\t"
> +		"leaq 65536(%0), %%rdx\n\t"
> +		"11: \n\t"
...

So we probably want the allocation to be something like:

	l1d_clear_buf_size = (PAGE_SIZE << (L1D_CACHE_ORDER + 1)) +
			     2 * PAGE_SIZE;
	alloc_pages_exact(l1d_clear_buf_size, GFP_ATOMIC | __GFP_ZERO);

Note that we need __GFP_ZERO because we could theoretically get a page
that already had kernel secrets in it.
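
Putting the guard pages, the 64k sizing and the zeroing together, the
allocation could look roughly like this (sketch only; l1d_flush_pages is a
placeholder name and the exact GFP flags depend on the calling context):

	/* 64k of flush data plus one guard page on each side. */
	size_t size = (PAGE_SIZE << (L1D_CACHE_ORDER + 1)) + 2 * PAGE_SIZE;
	void *buf;

	buf = alloc_pages_exact(size, GFP_ATOMIC | __GFP_ZERO);
	if (!buf)
		goto out;
	/* Skip the leading guard page; the trailing page pads the end. */
	l1d_flush_pages = buf + PAGE_SIZE;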


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Encrypted Message
  2018-06-05 23:37               ` Tim Chen
@ 2018-06-07 19:11                 ` Tim Chen
  2018-06-07 23:24                   ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Andi Kleen
  0 siblings, 1 reply; 38+ messages in thread
From: Tim Chen @ 2018-06-07 19:11 UTC (permalink / raw)
  To: speck

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/rfc822-headers; protected-headers="v1", Size: 165 bytes --]

From: Tim Chen <tim.c.chen@linux.intel.com>
To: speck for Konrad Rzeszutek Wilk <speck@linutronix.de>
Subject: Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1

[-- Attachment #2: Type: text/plain, Size: 2489 bytes --]

On 06/05/2018 04:37 PM, Tim Chen wrote:
> On 06/05/2018 04:34 PM, Tim Chen wrote:
>> On 06/04/2018 06:11 AM, speck for Konrad Rzeszutek Wilk wrote:
>>> On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
>>>> [resending as new message as the replay seems to have been lost on at
>>>> least some mail paths]
>>>>
>>>> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
>>>>> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
>>>>>> Other bits I don't understand are the 64k limit in the first place, why
>>>>>> it gets walked over in 4k strides to begin with (I'm not aware of any
>>>>>> prefetching which would benefit that...) and why a particularly
>>>>>> obfuscated piece of magic is used for the 64byte strides.
>>>>>
>>>>> That is the only part I understood, :) the 4k strides ensure that the
>>>>> source data is in the TLB.  Why that is needed is still a mystery though.
>>>>
>>>> I think the reasoning is that you first want to populate the TLB for the
>>>> whole flush array, then fence, to make sure TLB walks do not interfere
>>>> with the actual flushing later, either for performance reasons or for
>>>> preventing leakage of partial walk results.
>>>>
>>>> Not sure about the 64K, it likely is about the LRU implementation for L1
>>>> replacement not being perfect (but pseudo LRU), so you need to flush
>>>> more than the L1 size (32K) in software.  But I have also seen smaller
>>>> recommendations for that (52K).
>>>
>>
>> Had some discussions with other Intel folks.
>>
>> Our recommendation is not to use the software sequence for L1 clear but
>> use wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE).
>> We expect that all affected systems will be receiving a ucode update
>> to provide L1 clearing capability.
>>
>> Yes, the 4k stride is for getting TLB walks out of the way and
>> the 64kB replacement is to accommodate pseudo LRU.
> 
> I will try to see if I can get hold of the relevant documentation
> on pseudo LRU.
> 

The HW folks mentioned that if we have nothing from the flush buffer in
L1, then 32 KB would be sufficient (if we load miss for everything).

However, that's not the case. If some data from the flush buffer is
already in L1, it could protect an unrelated line that's considered
"near" by the LRU from getting flushed.  To make sure that does not
happen, we go through 64 KB of data to guarantee every line in L1 will
encounter a load miss and is flushed.
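
In C, the two passes being described look roughly like this (rough
illustration only, not the actual patch; the real thing is hand-written
assembly, and the recommended route remains the MSR write above):

	/* Sketch: flush_pages is assumed to be a 64 KB, page-aligned buffer. */
	static void l1d_flush_sw(const char *flush_pages)
	{
		volatile char sink;
		int i, j;

		/* Pass 1: one load per 4 KB page so the TLB is populated up front. */
		for (i = 0; i < 64 * 1024; i += 4096)
			sink = flush_pages[i];

		/* Fence so the TLB fills cannot interleave with the flush loads. */
		asm volatile("mfence" ::: "memory");

		/* Pass 2: load every 64-byte line; 64 KB (2x L1D) covers pseudo-LRU. */
		for (i = 0; i < 64 * 1024; i += 4096)
			for (j = 0; j < 4096; j += 64)
				sink = flush_pages[i + j];
	}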

Tim


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-07 19:11                 ` Tim Chen
@ 2018-06-07 23:24                   ` Andi Kleen
  2018-06-08 16:29                     ` Thomas Gleixner
  0 siblings, 1 reply; 38+ messages in thread
From: Andi Kleen @ 2018-06-07 23:24 UTC (permalink / raw)
  To: speck

On Thu, Jun 07, 2018 at 12:11:21PM -0700, speck for Tim Chen wrote:
> From: Tim Chen <tim.c.chen@linux.intel.com>
> To: speck for Konrad Rzeszutek Wilk <speck@linutronix.de>
> Subject: Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1

> On 06/05/2018 04:37 PM, Tim Chen wrote:
> > On 06/05/2018 04:34 PM, Tim Chen wrote:
> >> On 06/04/2018 06:11 AM, speck for Konrad Rzeszutek Wilk wrote:
> >>> On Mon, Jun 04, 2018 at 10:24:59AM +0200, speck for Martin Pohlack wrote:
> >>>> [resending as new message as the replay seems to have been lost on at
> >>>> least some mail paths]
> >>>>
> >>>> On 30.05.2018 11:01, speck for Paolo Bonzini wrote:
> >>>>> On 30/05/2018 01:54, speck for Andrew Cooper wrote:
> >>>>>> Other bits I don't understand are the 64k limit in the first place, why
> >>>>>> it gets walked over in 4k strides to begin with (I'm not aware of any
> >>>>>> prefetching which would benefit that...) and why a particularly
> >>>>>> obfuscated piece of magic is used for the 64byte strides.
> >>>>>
> >>>>> That is the only part I understood, :) the 4k strides ensure that the
> >>>>> source data is in the TLB.  Why that is needed is still a mystery though.
> >>>>
> >>>> I think the reasoning is that you first want to populate the TLB for the
> >>>> whole flush array, then fence, to make sure TLB walks do not interfere
> >>>> with the actual flushing later, either for performance reasons or for
> >>>> preventing leakage of partial walk results.
> >>>>
> >>>> Not sure about the 64K, it likely is about the LRU implementation for L1
> >>>> replacement not being perfect (but pseudo LRU), so you need to flush
> >>>> more than the L1 size (32K) in software.  But I have also seen smaller
> >>>> recommendations for that (52K).
> >>>
> >>
> >> Had some discussions with other Intel folks.
> >>
> >> Our recommendation is not to use the software sequence for L1 clear but
> >> use wrmsrl(MSR_IA32_FLUSH_L1D, MSR_IA32_FLUSH_L1D_VALUE).
> >> We expect that all affected systems will be receiving a ucode update
> >> to provide L1 clearing capability.
> >>
> >> Yes, the 4k stride is for getting TLB walks out of the way and
> >> the 64kB replacement is to accommodate pseudo LRU.
> > 
> > I will try to see if I can get hold of the relevant documentation
> > on pseudo LRU.
> > 
> 
> The HW folks mentioned that if we have nothing from the flush buffer in
> L1, then 32 KB would be sufficient (if we load miss for everything).
> 
> However, that's not the case. If some data from the flush buffer is
> already in L1, it could protect an unrelated line that's considered
> "near" by the LRU from getting flushed.  To make sure that does not
> happen, we go through 64 KB of data to guarantee every line in L1 will
> encounter a load miss and is flushed.

Also the recommended mitigation is really to use the MSR write instead
of the magic software sequence. Perhaps it would be best to 
just remove the software sequence. Updated microcode is needed in
any case, it doesn't make sense to try to support partially updated systems.

-Andi

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-07 23:24                   ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Andi Kleen
@ 2018-06-08 16:29                     ` Thomas Gleixner
  2018-06-08 17:51                       ` [MODERATED] " Josh Poimboeuf
  0 siblings, 1 reply; 38+ messages in thread
From: Thomas Gleixner @ 2018-06-08 16:29 UTC (permalink / raw)
  To: speck

On Thu, 7 Jun 2018, speck for Andi Kleen wrote:
> Also the recommended mitigation is really to use the MSR write instead
> of the magic software sequence. Perhaps it would be best to 
> just remove the software sequence. Updated microcode is needed in
> any case, it doesn't make sense to try to support partially updated systems.

ACK. That makes sense.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
       [not found] ` <20180529194240.5654A61109@crypto-ml.lab.linutronix.de>
@ 2018-06-08 17:49   ` Josh Poimboeuf
  2018-06-08 20:49     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 38+ messages in thread
From: Josh Poimboeuf @ 2018-06-08 17:49 UTC (permalink / raw)
  To: speck

On Tue, May 29, 2018 at 09:42:13PM +0200, speck for Paolo Bonzini wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH 1/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> 
> This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
> fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".
> 
> "vmexit_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
> "vmexit_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
> in the kind of code that they execute.  The idea is based on Intel's
> patches, but the final list of "confined" vmexits isn't quite.
> 
> Notably, L1 cache flushes are performed on EPT violations (which are
> basically KVM-level page faults), vmexits that involve the emulator,
> and on every KVM_RUN invocation (so each userspace exit).  However,
> most vmexits are considered safe.  I singled out the emulator because
> it may be a good target for other speculative execution-based threats,
> and the MMU because it can bring host page tables in the L1 cache.
> 
> The mitigation does not in any way try to do anything about hyperthreading;
> it is possible for a sibling thread to read data from the cache during a
> vmexit, before the host completes the flush, or to read data from the cache
> while a sibling runs.  This part of the work is not ready yet.
> 
> For now I'm leaving the default to "vmexit_l1d_flush=2", in case we need
> to push out the patches in an emergency embargo break, but I don't think
> it's the best setting.  The cost is up to 2.5x more expensive vmexits
> on Haswell processors, and 30% on Coffee Lake (for the latter, this is
> independent of whether microcode or the generic flush code are used).
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

I think we should report the L1 flushing mitigation value (i.e., 0, 1 or
2) in the 'l1tf' vulnerabilities sysfs file.

Also, it would be clearer to name them something like

  vmexit_l1d_flush=off
  vmexit_l1d_flush=confined
  vmexit_l1d_flush=on

etc.
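
On the KVM module side that is just a string parameter with a small
parser, e.g. (sketch only; the enum and handlers below are made-up names
to illustrate the shape, not code from the posted patch):

	enum vmx_l1d_flush_mode {
		VMX_L1D_FLUSH_OFF,
		VMX_L1D_FLUSH_CONFINED,
		VMX_L1D_FLUSH_ON,
	};

	static enum vmx_l1d_flush_mode vmexit_l1d_flush = VMX_L1D_FLUSH_ON;

	static int vmexit_l1d_flush_set(const char *s, const struct kernel_param *kp)
	{
		if (sysfs_streq(s, "off"))
			vmexit_l1d_flush = VMX_L1D_FLUSH_OFF;
		else if (sysfs_streq(s, "confined"))
			vmexit_l1d_flush = VMX_L1D_FLUSH_CONFINED;
		else if (sysfs_streq(s, "on"))
			vmexit_l1d_flush = VMX_L1D_FLUSH_ON;
		else
			return -EINVAL;
		return 0;
	}

	static int vmexit_l1d_flush_get(char *buf, const struct kernel_param *kp)
	{
		static const char * const names[] = { "off", "confined", "on" };

		return sprintf(buf, "%s\n", names[vmexit_l1d_flush]);
	}

	static const struct kernel_param_ops vmexit_l1d_flush_ops = {
		.set = vmexit_l1d_flush_set,
		.get = vmexit_l1d_flush_get,
	};
	module_param_cb(vmexit_l1d_flush, &vmexit_l1d_flush_ops, NULL, 0644);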

-- 
Josh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-08 16:29                     ` Thomas Gleixner
@ 2018-06-08 17:51                       ` Josh Poimboeuf
  2018-06-11 14:50                         ` Paolo Bonzini
  0 siblings, 1 reply; 38+ messages in thread
From: Josh Poimboeuf @ 2018-06-08 17:51 UTC (permalink / raw)
  To: speck

On Fri, Jun 08, 2018 at 06:29:16PM +0200, speck for Thomas Gleixner wrote:
> On Thu, 7 Jun 2018, speck for Andi Kleen wrote:
> > Also the recommended mitigation is really to use the MSR write instead
> > of the magic software sequence. Perhaps it would be best to 
> > just remove the software sequence. Updated microcode is needed in
> > any case, it doesn't make sense to try to support partially updated systems.
> 
> ACK. That makes sense.

In that case do we need plumbing to expose the L1 cache flush MSR to the
guest, for nested virt support?

-- 
Josh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-06-08 17:49   ` Josh Poimboeuf
@ 2018-06-08 20:49     ` Konrad Rzeszutek Wilk
  2018-06-08 22:13       ` Josh Poimboeuf
  0 siblings, 1 reply; 38+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-08 20:49 UTC (permalink / raw)
  To: speck

On Fri, Jun 08, 2018 at 12:49:04PM -0500, speck for Josh Poimboeuf wrote:
> On Tue, May 29, 2018 at 09:42:13PM +0200, speck for Paolo Bonzini wrote:
> > From: Paolo Bonzini <pbonzini@redhat.com>
> > Subject: [PATCH 1/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> > 
> > This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
> > fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".
> > 
> > "vmexit_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
> > "vmexit_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
> > in the kind of code that they execute.  The idea is based on Intel's
> > patches, but the final list of "confined" vmexits isn't quite.
> > 
> > Notably, L1 cache flushes are performed on EPT violations (which are
> > basically KVM-level page faults), vmexits that involve the emulator,
> > and on every KVM_RUN invocation (so each userspace exit).  However,
> > most vmexits are considered safe.  I singled out the emulator because
> > it may be a good target for other speculative execution-based threats,
> > and the MMU because it can bring host page tables in the L1 cache.
> > 
> > The mitigation does not in any way try to do anything about hyperthreading;
> > it is possible for a sibling thread to read data from the cache during a
> > vmexit, before the host completes the flush, or to read data from the cache
> > while a sibling runs.  This part of the work is not ready yet.
> > 
> > For now I'm leaving the default to "vmexit_l1d_flush=2", in case we need
> > to push out the patches in an emergency embargo break, but I don't think
> > it's the best setting.  The cost is up to 2.5x more expensive vmexits
> > on Haswell processors, and 30% on Coffee Lake (for the latter, this is
> > independent of whether microcode or the generic flush code are used).
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> I think we should report the L1 flushing mitigation value (i.e., 0, 1 or
> 2) in the 'l1tf' vulnerabilities sysfs file.

But this is KVM module, not the generic code (bugs.c).

Or did you have in mind some sub-registration code so that the KVM module
can register its state of mind with bugs.c reporting?

Like 'L1D flush on VMENTER','IPI on VMEXIT', etc?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: [PATCH 1/2] L1TF KVM 1
  2018-06-08 20:49     ` Konrad Rzeszutek Wilk
@ 2018-06-08 22:13       ` Josh Poimboeuf
  0 siblings, 0 replies; 38+ messages in thread
From: Josh Poimboeuf @ 2018-06-08 22:13 UTC (permalink / raw)
  To: speck

On Fri, Jun 08, 2018 at 04:49:07PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Fri, Jun 08, 2018 at 12:49:04PM -0500, speck for Josh Poimboeuf wrote:
> > On Tue, May 29, 2018 at 09:42:13PM +0200, speck for Paolo Bonzini wrote:
> > > From: Paolo Bonzini <pbonzini@redhat.com>
> > > Subject: [PATCH 1/2] kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> > > 
> > > This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
> > > fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".
> > > 
> > > "vmexit_l1d_flush=2" is simply doing an L1 cache flush on every vmexit.
> > > "vmexit_l1d_flush=1" is trying to avoid so on vmexits that are "confined"
> > > in the kind of code that they execute.  The idea is based on Intel's
> > > patches, but the final list of "confined" vmexits isn't quite.
> > > 
> > > Notably, L1 cache flushes are performed on EPT violations (which are
> > > basically KVM-level page faults), vmexits that involve the emulator,
> > > and on every KVM_RUN invocation (so each userspace exit).  However,
> > > most vmexits are considered safe.  I singled out the emulator because
> > > it may be a good target for other speculative execution-based threats,
> > > and the MMU because it can bring host page tables in the L1 cache.
> > > 
> > > The mitigation does not in any way try to do anything about hyperthreading;
> > > it is possible for a sibling thread to read data from the cache during a
> > > vmexit, before the host completes the flush, or to read data from the cache
> > > while a sibling runs.  This part of the work is not ready yet.
> > > 
> > > For now I'm leaving the default to "vmexit_l1d_flush=2", in case we need
> > > to push out the patches in an emergency embargo break, but I don't think
> > > it's the best setting.  The cost is up to 2.5x more expensive vmexits
> > > on Haswell processors, and 30% on Coffee Lake (for the latter, this is
> > > independent of whether microcode or the generic flush code are used).
> > > 
> > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > 
> > I think we should report the L1 flushing mitigation value (i.e., 0, 1 or
> > 2) in the 'l1tf' vulnerabilities sysfs file.
> 
> But this is KVM module, not the generic code (bugs.c).
> 
> Or did you have in mind some sub-registration code so that the KVM module
> can register its state of mind with bugs.c reporting?
> 
> Like 'L1D flush on VMENTER','IPI on VMEXIT', etc?

Right, just something as simple as setting a global variable (which
lives in bugs.c) would probably be good enough.
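
Something along these lines, presumably (hand-wavy sketch; none of the
names below exist in bugs.c today, they are only meant to illustrate the
registration idea):

	/* arch/x86/kernel/cpu/bugs.c */
	enum l1tf_vmx_mitigation {
		L1TF_VMX_FLUSH_NOT_LOADED,
		L1TF_VMX_FLUSH_NEVER,
		L1TF_VMX_FLUSH_COND,
		L1TF_VMX_FLUSH_ALWAYS,
	};

	enum l1tf_vmx_mitigation l1tf_vmx_mitigation = L1TF_VMX_FLUSH_NOT_LOADED;
	EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);

	static const char * const l1tf_vmx_states[] = {
		[L1TF_VMX_FLUSH_NOT_LOADED]	= "KVM not loaded",
		[L1TF_VMX_FLUSH_NEVER]		= "vmexit L1D flush never",
		[L1TF_VMX_FLUSH_COND]		= "conditional vmexit L1D flush",
		[L1TF_VMX_FLUSH_ALWAYS]		= "vmexit L1D flush always",
	};

	static ssize_t l1tf_show_state(char *buf)
	{
		return sprintf(buf, "VMX: %s\n",
			       l1tf_vmx_states[l1tf_vmx_mitigation]);
	}

	/* and in kvm/vmx.c, once the module has picked a mode: */
	l1tf_vmx_mitigation = L1TF_VMX_FLUSH_COND;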

-- 
Josh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1
  2018-06-08 17:51                       ` [MODERATED] " Josh Poimboeuf
@ 2018-06-11 14:50                         ` Paolo Bonzini
  0 siblings, 0 replies; 38+ messages in thread
From: Paolo Bonzini @ 2018-06-11 14:50 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 877 bytes --]

On 08/06/2018 19:51, speck for Josh Poimboeuf wrote:
> On Fri, Jun 08, 2018 at 06:29:16PM +0200, speck for Thomas Gleixner wrote:
>> On Thu, 7 Jun 2018, speck for Andi Kleen wrote:
>>> Also the recommended mitigation is really to use the MSR write instead
>>> of the magic software sequence. Perhaps it would be best to 
>>> just remove the software sequence. Updated microcode is needed in
>>> any case, it doesn't make sense to try to support partially updated systems.
>> ACK. That makes sense.
> In that case do we need plumbing to expose the L1 cache flush MSR to the
> guest, for nested virt support?

No, because all L2->L1 exits go through L0 first.  So nested virt
systems are not vulnerable.

Is there a sanctioned way (in CPUID or elsewhere) to say the system does
not have the bug?

Paolo (back on track after moving to FPU land for a few days)


^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2018-06-11 14:51 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-29 19:42 [MODERATED] [PATCH 0/2] L1TF KVM 0 Paolo Bonzini
2018-05-29 19:42 ` [MODERATED] [PATCH 1/2] L1TF KVM 1 Paolo Bonzini
2018-05-29 19:42 ` [MODERATED] [PATCH 2/2] L1TF KVM 2 Paolo Bonzini
     [not found] ` <20180529194240.7F1336110A@crypto-ml.lab.linutronix.de>
2018-05-29 22:49   ` [PATCH 1/2] L1TF KVM 1 Thomas Gleixner
2018-05-29 23:54     ` [MODERATED] " Andrew Cooper
2018-05-30  9:01       ` Paolo Bonzini
2018-05-30 11:58         ` Martin Pohlack
2018-05-30 12:25           ` Thomas Gleixner
2018-05-30 14:31             ` Thomas Gleixner
2018-06-04  8:24         ` [MODERATED] " Martin Pohlack
2018-06-04 13:11           ` [MODERATED] Is: Tim, Q to you. Was:Re: " Konrad Rzeszutek Wilk
2018-06-04 17:59             ` [MODERATED] Encrypted Message Tim Chen
2018-06-05  1:25             ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Jon Masters
2018-06-05  1:30               ` Linus Torvalds
2018-06-05  7:10               ` Martin Pohlack
2018-06-05 23:34             ` [MODERATED] Encrypted Message Tim Chen
2018-06-05 23:37               ` Tim Chen
2018-06-07 19:11                 ` Tim Chen
2018-06-07 23:24                   ` [MODERATED] Re: Is: Tim, Q to you. Was:Re: [PATCH 1/2] L1TF KVM 1 Andi Kleen
2018-06-08 16:29                     ` Thomas Gleixner
2018-06-08 17:51                       ` [MODERATED] " Josh Poimboeuf
2018-06-11 14:50                         ` Paolo Bonzini
2018-05-30  8:55     ` [MODERATED] " Peter Zijlstra
2018-05-30  9:02     ` Paolo Bonzini
2018-05-31 19:00     ` Jon Masters
     [not found] ` <20180529194322.8B56F610F8@crypto-ml.lab.linutronix.de>
2018-05-29 23:59   ` [MODERATED] Re: [PATCH 2/2] L1TF KVM 2 Andrew Cooper
2018-05-30  8:38     ` Thomas Gleixner
2018-05-30 16:57       ` [MODERATED] " Andrew Cooper
2018-05-30 19:11         ` Thomas Gleixner
2018-05-30 21:10           ` [MODERATED] " Andi Kleen
2018-05-30 23:19             ` Andrew Cooper
     [not found] ` <20180529194239.768D561107@crypto-ml.lab.linutronix.de>
2018-06-01 16:48   ` [MODERATED] Re: [PATCH 1/2] L1TF KVM 1 Konrad Rzeszutek Wilk
2018-06-04 14:56     ` Paolo Bonzini
     [not found] ` <20180529194236.EDB8561100@crypto-ml.lab.linutronix.de>
2018-06-06  0:34   ` Dave Hansen
2018-06-06 14:15     ` Dave Hansen
     [not found] ` <20180529194240.5654A61109@crypto-ml.lab.linutronix.de>
2018-06-08 17:49   ` Josh Poimboeuf
2018-06-08 20:49     ` Konrad Rzeszutek Wilk
2018-06-08 22:13       ` Josh Poimboeuf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.