From: Paolo Bonzini <pbonzini@redhat.com>
Subject: [MODERATED] [PATCH 1/2] L1TF KVM 1
Date: Tue, 29 May 2018 21:42:13 +0200
Message-Id: <20180529194214.2600-2-pbonzini@redhat.com>
In-Reply-To: <20180529194214.2600-1-pbonzini@redhat.com>
References: <20180529194214.2600-1-pbonzini@redhat.com>
To: speck@linutronix.de

This patch adds two mitigation modes for CVE-2018-3620, aka L1 terminal
fault.  The two modes are "vmexit_l1d_flush=1" and "vmexit_l1d_flush=2".

"vmexit_l1d_flush=2" simply flushes the L1 data cache on every vmexit.
"vmexit_l1d_flush=1" tries to avoid the flush on vmexits that are
"confined" in the kind of code they execute.  The idea is based on
Intel's patches, but the final list of "confined" vmexits isn't quite
the same.  Notably, L1 cache flushes are performed on EPT violations
(which are basically KVM-level page faults), on vmexits that involve
the emulator, and on every KVM_RUN invocation (i.e. on each userspace
exit).  However, most vmexits are considered safe.  I singled out the
emulator because it may be a good target for other speculative
execution-based threats, and the MMU because it can bring host page
tables into the L1 cache.

The mitigation does not try to do anything about hyperthreading: it is
still possible for a sibling thread to read data from the cache during
a vmexit, before the host completes the flush, or to read data from the
cache while a sibling runs.  This part of the work is not ready yet.

For now I'm leaving the default at "vmexit_l1d_flush=2", in case we
need to push out the patches in an emergency embargo break, but I don't
think it's the best setting.  The cost is up to 2.5x more expensive
vmexits on Haswell processors, and 30% on Coffee Lake (for the latter,
this is independent of whether the microcode or the generic flush code
is used).
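In pseudo-code, the per-vmexit decision implemented below boils down to
the following sketch (the enum and the want_l1d_flush() name are for
illustration only; the actual code works on the raw 0/1/2 module
parameter value and the new vcpu_unconfined flag):

#include <stdbool.h>

/* Illustrative model only; not part of the patch. */
enum l1d_flush_mode { L1D_NEVER = 0, L1D_CONFINED_ONLY = 1, L1D_ALWAYS = 2 };

static bool want_l1d_flush(enum l1d_flush_mode mode, bool enable_ept,
                           bool vcpu_unconfined, bool exit_is_confined)
{
        if (mode == L1D_NEVER || !enable_ept)
                return false;           /* mitigation off, never flush */
        if (mode == L1D_CONFINED_ONLY)
                /*
                 * Flush if the MMU, the emulator or KVM_RUN flagged the
                 * vcpu, or if the exit reason is not in the "confined" list.
                 */
                return vcpu_unconfined || !exit_is_confined;
        return true;                    /* flush on every vmexit */
}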
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  7 ++++-
 arch/x86/kvm/mmu.c              |  1 +
 arch/x86/kvm/svm.c              |  3 +-
 arch/x86/kvm/vmx.c              | 62 ++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              | 54 +++++++++++++++++++++++++++++++++--
 5 files changed, 122 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c25775fad4ed..ae4bab8b1f8a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -711,6 +711,9 @@ struct kvm_vcpu_arch {
 
         /* be preempted when it's in kernel-mode(cpl=0) */
         bool preempted_in_kernel;
+
+        /* for L1 terminal fault vulnerability */
+        bool vcpu_unconfined;
 };
 
 struct kvm_lpage_info {
@@ -879,6 +882,7 @@ struct kvm_vcpu_stat {
         u64 signal_exits;
         u64 irq_window_exits;
         u64 nmi_window_exits;
+        u64 l1d_flush;
         u64 halt_exits;
         u64 halt_successful_poll;
         u64 halt_attempted_poll;
@@ -937,7 +941,7 @@ struct kvm_x86_ops {
         void (*vcpu_free)(struct kvm_vcpu *vcpu);
         void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
-        void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+        void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
         void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
         void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
@@ -1446,6 +1450,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
                      struct kvm_lapic_irq *irq);
 
+void kvm_l1d_flush(void);
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8494dbae41b9..3b1140b156b2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3836,6 +3836,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
         int r = 1;
 
+        vcpu->arch.vcpu_unconfined = true;
         switch (vcpu->arch.apf.host_apf_reason) {
         default:
                 trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1fc05e428aba..849edcd31aad 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5404,8 +5404,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
         svm->asid_generation--;
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
 {
+        *need_l1d_flush = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3f1696570b41..b90ba122e73a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmexit_l1d_flush = 2;
+module_param_named(vmexit_l1d_flush, vmexit_l1d_flush, int, 0644);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -938,6 +941,8 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit
 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
 
+static DEFINE_PER_CPU(int, last_vector);
+
 enum {
         VMX_VMREAD_BITMAP,
         VMX_VMWRITE_BITMAP,
@@ -2423,6 +2428,59 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
                                    vmx->guest_msrs[i].mask);
 }
 
+static inline bool vmx_handling_confined(int reason)
+{
+        switch (reason) {
+        case EXIT_REASON_EXCEPTION_NMI:
+        case EXIT_REASON_HLT:
+        case EXIT_REASON_PAUSE_INSTRUCTION:
+        case EXIT_REASON_APIC_WRITE:
+        case EXIT_REASON_MSR_WRITE:
+        case EXIT_REASON_VMCALL:
+        case EXIT_REASON_CR_ACCESS:
+        case EXIT_REASON_DR_ACCESS:
+        case EXIT_REASON_CPUID:
+        case EXIT_REASON_PREEMPTION_TIMER:
+        case EXIT_REASON_MSR_READ:
+        case EXIT_REASON_EOI_INDUCED:
+        case EXIT_REASON_WBINVD:
+        case EXIT_REASON_XSETBV:
+                /*
+                 * The next three set vcpu->arch.vcpu_unconfined themselves, so
+                 * we consider them confined here.
+                 */
+        case EXIT_REASON_EPT_VIOLATION:
+        case EXIT_REASON_EPT_MISCONFIG:
+        case EXIT_REASON_IO_INSTRUCTION:
+                return true;
+        case EXIT_REASON_EXTERNAL_INTERRUPT: {
+                int cpu = raw_smp_processor_id();
+                int vector = per_cpu(last_vector, cpu);
+
+                return vector == LOCAL_TIMER_VECTOR || vector == RESCHEDULE_VECTOR;
+        }
+        default:
+                return false;
+        }
+}
+
+static bool vmx_core_confined(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        return vmx_handling_confined(vmx->exit_reason);
+}
+
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
+{
+        vmx_save_host_state(vcpu);
+        if (vmexit_l1d_flush == 0 || !enable_ept)
+                *need_l1d_flush = false;
+        else if (vmexit_l1d_flush == 1)
+                *need_l1d_flush |= !vmx_core_confined(vcpu);
+        else
+                *need_l1d_flush = true;
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
         if (!vmx->host_state.loaded)
@@ -9457,11 +9515,13 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                 unsigned long entry;
                 gate_desc *desc;
                 struct vcpu_vmx *vmx = to_vmx(vcpu);
+                int cpu = raw_smp_processor_id();
 #ifdef CONFIG_X86_64
                 unsigned long tmp;
 #endif
 
                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+                per_cpu(last_vector, cpu) = vector;
                 desc = (gate_desc *)vmx->host_idt_base + vector;
                 entry = gate_offset(desc);
                 asm volatile(
@@ -12642,7 +12702,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
         .vcpu_free = vmx_free_vcpu,
         .vcpu_reset = vmx_vcpu_reset,
 
-        .prepare_guest_switch = vmx_save_host_state,
+        .prepare_guest_switch = vmx_prepare_guest_switch,
         .vcpu_load = vmx_vcpu_load,
         .vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 59371de5d722..ada9e55fc871 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -194,6 +194,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "irq_injections", VCPU_STAT(irq_injections) },
         { "nmi_injections", VCPU_STAT(nmi_injections) },
         { "req_event", VCPU_STAT(req_event) },
+        { "l1d_flush", VCPU_STAT(l1d_flush) },
         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6026,6 +6027,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
         bool writeback = true;
         bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+        vcpu->arch.vcpu_unconfined = true;
+
         /*
          * Clear write_fault_to_shadow_pgtable here to ensure it is
          * never reused.
@@ -6509,10 +6512,40 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 };
 #endif
 
+
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+        asm volatile(
+                "movq %0, %%rax\n\t"
+                "leaq 65536(%0), %%rdx\n\t"
+                "11: \n\t"
+                "movzbl (%%rax), %%ecx\n\t"
+                "addq $4096, %%rax\n\t"
+                "cmpq %%rax, %%rdx\n\t"
+                "jne 11b\n\t"
+                "xorl %%eax, %%eax\n\t"
+                "cpuid\n\t"
+                "xorl %%eax, %%eax\n\t"
+                "12:\n\t"
+                "movzwl %%ax, %%edx\n\t"
+                "addl $64, %%eax\n\t"
+                "movzbl (%%rdx, %0), %%ecx\n\t"
+                "cmpl $65536, %%eax\n\t"
+                "jne 12b\n\t"
+                "lfence\n\t"
+                :
+                : "r" (empty_zero_pages)
+                : "rax", "rbx", "rcx", "rdx");
+}
+
 int kvm_arch_init(void *opaque)
 {
         int r;
         struct kvm_x86_ops *ops = opaque;
+        struct page *page;
 
         if (kvm_x86_ops) {
                 printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6532,10 +6565,15 @@ int kvm_arch_init(void *opaque)
         }
 
         r = -ENOMEM;
+        page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+        if (!page)
+                goto out;
+        empty_zero_pages = page_address(page);
+
         shared_msrs = alloc_percpu(struct kvm_shared_msrs);
         if (!shared_msrs) {
                 printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-                goto out;
+                goto out_free_zero_pages;
         }
 
         r = kvm_mmu_module_init();
@@ -6566,6 +6604,8 @@ int kvm_arch_init(void *opaque)
 
         return 0;
 
+out_free_zero_pages:
+        free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
         free_percpu(shared_msrs);
 out:
@@ -6590,6 +6630,7 @@ void kvm_arch_exit(void)
 #endif
         kvm_x86_ops = NULL;
         kvm_mmu_module_exit();
+        free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
         free_percpu(shared_msrs);
 }
 
@@ -7233,6 +7274,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                 kvm_cpu_accept_dm_intr(vcpu);
 
         bool req_immediate_exit = false;
+        bool need_l1d_flush;
 
         if (kvm_request_pending(vcpu)) {
                 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -7372,7 +7414,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
         preempt_disable();
 
-        kvm_x86_ops->prepare_guest_switch(vcpu);
+        need_l1d_flush = vcpu->arch.vcpu_unconfined;
+        vcpu->arch.vcpu_unconfined = false;
+        kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
+        if (need_l1d_flush) {
+                vcpu->stat.l1d_flush++;
+                kvm_l1d_flush();
+        }
 
         /*
          * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7559,6 +7607,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
         struct kvm *kvm = vcpu->kvm;
 
         vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+        vcpu->arch.vcpu_unconfined = true;
 
         for (;;) {
                 if (kvm_vcpu_running(vcpu)) {
@@ -8675,6 +8724,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+        vcpu->arch.vcpu_unconfined = true;
         kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
1.8.3.1
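For reference, the generic flush sequence in kvm_l1d_flush() amounts to
two passes over the 64 KiB empty_zero_pages buffer.  Below is a rough
user-space model, illustrative only: the function and macro names are
not from the patch, and the cpuid serialization and trailing lfence of
the real asm are only noted in comments.

#include <stdlib.h>

#define L1D_FILL_SIZE   (64 * 1024)     /* 16 pages of 4 KiB, as in the patch */

static void l1d_fill_model(void)
{
        unsigned char *buf = aligned_alloc(4096, L1D_FILL_SIZE);
        volatile const unsigned char *p = buf;
        unsigned char sink = 0;
        size_t off;

        if (!buf)
                return;
        /* First pass: touch one byte in each 4 KiB page so the TLB entries
           are populated before the cache is filled. */
        for (off = 0; off < L1D_FILL_SIZE; off += 4096)
                sink ^= p[off];
        /* (The real code serializes here with cpuid.) */
        /* Second pass: read one byte per 64-byte cache line, displacing the
           current L1D contents with the buffer. */
        for (off = 0; off < L1D_FILL_SIZE; off += 64)
                sink ^= p[off];
        /* (The real code then issues lfence to wait for the loads.) */
        (void)sink;
        free(buf);
}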