* [MODERATED] [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
@ 2018-06-23 13:54 konrad.wilk
  2018-06-27 10:20 ` Thomas Gleixner
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: konrad.wilk @ 2018-06-23 13:54 UTC (permalink / raw)
  To: speck

We add two mitigation modes for CVE-2018-3620, aka L1 terminal
fault: "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".

"vmentry_l1d_flush=2" simply does an L1 cache flush on every VMENTER.
"vmentry_l1d_flush=1" tries to avoid unnecessary L1 cache flushes on
VMENTER and flushes only when the reason for entering the hypervisor
indicates that potentially unsafe code was executed.  The idea is based
on Intel's patches, but inverted: there is no hardcoded list of "safe"
exit reasons; instead, all vmexits are treated as safe unless they:
 - trigger the emulator, which could be a good target for other
   speculative execution-based threats,
 - or the MMU, which can bring host page tables into the L1 cache;
 - return to userspace or schedule another process, which triggers
   a flush;
 - handle external interrupts;
 - handle nested operations that require the MMU (see above), i.e.
   vmptrld, vmptrst, vmclear, vmwrite and vmread;
 - handle invept or invvpid.

The default is "vmentry_l1d_flush=1".  "vmentry_l1d_flush=2" makes
vmexits up to 2.5x more expensive on Haswell processors and about 30%
more expensive on Coffee Lake (for the latter, this is independent of
whether the microcode-based or the generic flush code is used).

The mitigation does not attempt to address hyperthreading in any way;
it is possible for a sibling thread to read data from the cache during a
vmexit, before the host completes the flush, or to read data from the cache
while a sibling runs.  The suggestion there is to disable hyperthreading
unless you've configured your system to dedicate each core to a specific
guest.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v2: Add checks for X86_BUG_L1TF
   - Rework the commit description.
v3: change module parameter to S_IRUGO
    move the kvm_l1d_flush closer to VMENTER
    move the module parameter so it is in alphabetical order-ish.
    add two extra places that are used by handle_[vmptrld, vmptrst, vmclear, vmwrite, vmread, invept, invvpid].
    Can be changed to be only specific handlers.
---
 Documentation/admin-guide/kernel-parameters.txt | 11 +++++
 arch/x86/include/asm/kvm_host.h                 |  8 ++++
 arch/x86/kvm/mmu.c                              |  1 +
 arch/x86/kvm/svm.c                              |  1 +
 arch/x86/kvm/vmx.c                              | 51 +++++++++++++++++++-
 arch/x86/kvm/x86.c                              | 63 ++++++++++++++++++++++++-
 6 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d59b34d4e62a..b8f7a4ab693a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1919,6 +1919,17 @@
 	kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
 				   Default is false (don't support).
 
+	kvm.vmentry_l1d_flush=[KVM] Mitigation for L1 Terminal Fault CVE.
+			Valid arguments: 0, 1, 2
+
+			2 does an L1 cache flush on every VMENTER.
+			1 tries to avoid unnecessary L1 cache flushes on VMENTER and
+			instead does one only if the kind of code that was executed
+			could leak host memory.
+			0 disables the mitigation.
+
+			Default is 1 (do L1 cache flush in specific instances)
+
 	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
 			KVM MMU at runtime.
 			Default is 0 (off)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..78748925a370 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -713,6 +713,12 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
+
+	/* must flush the L1 Data cache */
+	bool flush_cache_req;
 };
 
 struct kvm_lpage_info {
@@ -881,6 +887,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -1449,6 +1456,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..4d4e3dc2494e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..fffc447f5410 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5437,6 +5437,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.flush_cache_req = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f08e33fc28ac..a51418429165 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -117,6 +117,9 @@ static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+static int __read_mostly vmentry_l1d_flush = 1;
+module_param(vmentry_l1d_flush, int, S_IRUGO);
+
 #define MSR_TYPE_R	1
 #define MSR_TYPE_W	2
 #define MSR_TYPE_RW	3
@@ -2621,6 +2624,45 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+	vmx_save_host_state(vcpu);
+
+	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+	    !static_cpu_has(X86_BUG_L1TF)) {
+		vcpu->arch.flush_cache_req = false;
+		return;
+	}
+
+	switch (vmentry_l1d_flush) {
+	case 0:
+		vcpu->arch.flush_cache_req = false;
+		break;
+	case 1:
+		/*
+		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
+		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
+		 * following cases:
+		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
+		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
+		 *   on every nested entry.  Nested hypervisors do not bother clearing
+		 *   the cache.
+		 * - anything that runs the emulator (the slow paths for EPT misconfig
+		 *   or I/O instruction)
+		 * - anything that can cause get_user_pages (EPT violation, and again
+		 *   the slow paths for EPT misconfig or I/O instruction)
+		 * - anything that can run code outside KVM (external interrupt,
+		 *   which can run interrupt handlers or irqs; or the sched_in
+		 *   preempt notifier)
+		 */
+		break;
+	case 2:
+	default:
+		vcpu->arch.flush_cache_req = true;
+		break;
+	}
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9754,6 +9796,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		vcpu->arch.vcpu_unconfined = true;
 	}
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -10011,6 +10054,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
 		(unsigned long)&current_evmcs->host_rsp : 0;
 
+	if (vcpu->arch.flush_cache_req)
+		kvm_l1d_flush();
+
 	asm(
 		/* Store host registers */
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -11824,6 +11870,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return ret;
 	}
 
+	/* Hide L1D cache contents from the nested guest.  */
+	vmx->vcpu.arch.vcpu_unconfined = true;
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12941,7 +12990,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0046aa70205a..4d2e4975f91d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -4799,6 +4800,8 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 
+	/* The gva_to_gpa walker can pull in tons of pages. */
+	vcpu->arch.vcpu_unconfined = true;
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
 					  exception);
 }
@@ -4874,6 +4877,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 				unsigned int bytes, struct x86_exception *exception)
 {
+	/* kvm_write_guest_virt_system can pull in tons of pages. */
+	vcpu->arch.vcpu_unconfined = true;
+
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 					   PFERR_WRITE_MASK, exception);
 }
@@ -6050,6 +6056,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6533,10 +6541,49 @@ static struct notifier_block pvclock_gtod_notifier = {
 };
 #endif
 
+
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
+	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+	ASSERT(boot_cpu_has(X86_BUG_L1TF));
+
+	asm volatile(
+		/* First ensure the pages are in the TLB */
+		"xorl %%eax, %%eax\n\t"
+		"11: \n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $4096, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		/* Now fill the cache */
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $64, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		: : "r" (empty_zero_pages), "r" (size)
+		: "eax", "ebx", "ecx", "edx");
+}
+EXPORT_SYMBOL_GPL(kvm_l1d_flush);
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6556,10 +6603,15 @@ int kvm_arch_init(void *opaque)
 	}
 
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6590,6 +6642,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6614,6 +6668,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7392,7 +7447,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
+	vcpu->arch.flush_cache_req = vcpu->arch.vcpu_unconfined;
 	kvm_x86_ops->prepare_guest_switch(vcpu);
+	vcpu->arch.vcpu_unconfined = false;
+	if (vcpu->arch.flush_cache_req)
+		vcpu->stat.l1d_flush++;
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7579,6 +7638,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8698,6 +8758,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
2.14.3

* Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-23 13:54 [MODERATED] [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2 konrad.wilk
@ 2018-06-27 10:20 ` Thomas Gleixner
  2018-06-27 15:54 ` [MODERATED] " Borislav Petkov
  2018-07-01 16:14 ` Josh Poimboeuf
  2 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2018-06-27 10:20 UTC (permalink / raw)
  To: speck

On Sat, 23 Jun 2018, speck for konrad.wilk_at_oracle.com wrote:
>  
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6590,6 +6642,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);

That's the wrong ordering. out_free_zero_pages: goes here because otherwise
an error exit after kvm_mmu_module_init() will leak the pages. I'll fix it
up.

Thanks,

	tglx

* [MODERATED] Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-23 13:54 [MODERATED] [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2 konrad.wilk
  2018-06-27 10:20 ` Thomas Gleixner
@ 2018-06-27 15:54 ` Borislav Petkov
  2018-06-27 16:21   ` Josh Poimboeuf
                     ` (3 more replies)
  2018-07-01 16:14 ` Josh Poimboeuf
  2 siblings, 4 replies; 9+ messages in thread
From: Borislav Petkov @ 2018-06-27 15:54 UTC (permalink / raw)
  To: speck

On Sat, Jun 23, 2018 at 09:54:16AM -0400, speck for konrad.wilk_at_oracle.com wrote:
> kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> 
> We add two mitigation modes for CVE-2018-3620, aka L1 terminal
> fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".
> 
> "vmentry_l1d_flush=2" is simply doing an L1 cache flush on every VMENTER.
> "vmentry_l1d_flush=1" is trying to avoid so many L1 cache flueshes on

It goes without saying that "1" and "2" are not as good a choice as
words with meaning.
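
Something with module_param_cb() would do.  Rough sketch only -- the
names "never"/"cond"/"always" below are made up, not something that has
been agreed on:

enum vmentry_l1d_flush_state {
	VMENTER_L1D_FLUSH_NEVER,
	VMENTER_L1D_FLUSH_COND,
	VMENTER_L1D_FLUSH_ALWAYS,
};

static enum vmentry_l1d_flush_state __read_mostly vmentry_l1d_flush =
	VMENTER_L1D_FLUSH_COND;

static const char * const vmentry_l1d_flush_names[] = {
	[VMENTER_L1D_FLUSH_NEVER]  = "never",
	[VMENTER_L1D_FLUSH_COND]   = "cond",
	[VMENTER_L1D_FLUSH_ALWAYS] = "always",
};

/* Parse the parameter string into one of the enum values above. */
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vmentry_l1d_flush_names); i++) {
		if (sysfs_streq(s, vmentry_l1d_flush_names[i])) {
			vmentry_l1d_flush = i;
			return 0;
		}
	}
	return -EINVAL;
}

/* Report the current mode by name when the parameter is read back. */
static int vmentry_l1d_flush_get(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", vmentry_l1d_flush_names[vmentry_l1d_flush]);
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, S_IRUGO);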

> VMENTER and instead only does if the reason for entering the hypervisor
> is based on the type of code that is executed. The idea is based on Intel's
> patches, but we treat all vmexits as safe unless they execute
> specific code that is considered unsafe. There is no hardcoded list of
> "safe" exit reasons; but vmexits are considered safe unless:
>  - They trigger the emulator, which could be a good target for
>    other speculative execution-based threats,
>  - or the MMU, which can bring host page tables in the L1 cache.
>  - In addition, executing userspace or another process will trigger a flush.
>  - external interrupts
>  - nested operations that require the MMU (see above). That is
>    vmptrld, vmptrst, vmclear,vmwrite,vmread.
>  - Also when handling invept,invvpid
> 
> The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
> is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
> Coffee Lake (for the latter, this is independent of whether microcode
> or the generic flush code are used).
> 
> The mitigation does not in any way try to do anything about hyperthreading;
> it is possible for a sibling thread to read data from the cache during a
> vmexit, before the host completes the flush, or to read data from the cache
> while a sibling runs.  The suggestion there is to disable hyperthreading
> unless you've configured your system to dedicate each core to a specific
> guest.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

Looks like Paolo is the author here - this patch needs a proper From: or
authorship fixed when applying.

> ---
> v2: Add checks for X86_BUG_L1TF
>    - Rework the commit description.
> v3: change module parameter to S_IRUGO
>     move the kvm_l1d_flush closer to VMENTER
>     move the module parameter so it is in alphabetical order-ish.
>     add two extra places that are used by handle_[vmptrld, vmptrst,mclear,vmwrite,vmread,invept,invvpid].
>     Can be changed to be only specific handlers.
> ---
>  Documentation/admin-guide/kernel-parameters.txt | 11 +++++
>  arch/x86/include/asm/kvm_host.h                 |  8 ++++
>  arch/x86/kvm/mmu.c                              |  1 +
>  arch/x86/kvm/svm.c                              |  1 +
>  arch/x86/kvm/vmx.c                              | 51 +++++++++++++++++++-
>  arch/x86/kvm/x86.c                              | 63 ++++++++++++++++++++++++-
>  6 files changed, 133 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index d59b34d4e62a..b8f7a4ab693a 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1919,6 +1919,17 @@
>  	kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
>  				   Default is false (don't support).
>  
> +	kvm.vmentry_l1d_flush=[KVM] Mitigation for L1 Terminal Fault CVE.

Let's put the CVE number here - stuff will leave our L1s with time and
having a number there will be helpful to remember in the future.

> +			Valid arguments: 0, 1, 2
> +
> +			2 does an L1 cache flush on every VMENTER.
> +			1 tries to avoid so many L1 cache flush on VMENTERs and instead
> +			do it only if the kind of code that is executed would lead to
> +			leaking host memory.
> +			0 disables the mitigation
> +
> +			Default is 1 (do L1 cache flush in specific instances)
> +
>  	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
>  			KVM MMU at runtime.
>  			Default is 0 (off)

...

> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +{
> +	vmx_save_host_state(vcpu);
> +
> +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
> +	    !static_cpu_has(X86_BUG_L1TF)) {
> +		vcpu->arch.flush_cache_req = false;
> +		return;
> +	}
> +
> +	switch (vmentry_l1d_flush) {
> +	case 0:
> +		vcpu->arch.flush_cache_req = false;
> +		break;
> +	case 1:
> +		/*
> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> +		 * following cases:
> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> +		 *   the cache.
> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> +		 *   or I/O instruction)
> +		 * - anything that can cause get_user_pages (EPT violation, and again
> +		 *   the slow paths for EPT misconfig or I/O instruction)
> +		 * - anything that can run code outside KVM (external interrupt,
> +		 *   which can run interrupt handlers or irqs; or the sched_in
> +		 *   preempt notifier)
> +		 */

I sure hope we won't miss a case in the future... :-\

> +		break;
> +	case 2:
> +	default:
> +		vcpu->arch.flush_cache_req = true;
> +		break;
> +	}
> +}

...

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0046aa70205a..4d2e4975f91d 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c

...

> @@ -6533,10 +6541,49 @@ static struct notifier_block pvclock_gtod_notifier = {
>  };
>  #endif
>  
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in

Skylake? I think most if not all have a 32K L1D$...

> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)

Shouldn't this function be in vmx.c ?

> +{
> +	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */

Doesn't look like it, for example here I have:

model name      : Intel(R) Xeon(R) CPU E7-8891 v4 @ 2.80GHz
...
cache size      : 61440 KB

From looking at the end of init_intel_cacheinfo(), we put there the LLC
size ... or so...

> +	int size = PAGE_SIZE << L1D_CACHE_ORDER;
> +
> +	ASSERT(boot_cpu_has(X86_BUG_L1TF));

This needs to be boot_cpu_has_bug().

And then what's wrong with:

	if (!boot_cpu_has_bug(X86_BUG_L1TF))
		return;

> +
> +	asm volatile(
> +		/* First ensure the pages are in the TLB */
> +		"xorl %%eax, %%eax\n\t"
> +		"11: \n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"

Named asm params would make it more readable:

		"movzbl (%[empty_zp], %% ..."

> +		"addl $4096, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"
> +		/* Now fill the cache */
> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		: : "r" (empty_zero_pages), "r" (size)
> +		: "eax", "ebx", "ecx", "edx");
> +}
> +EXPORT_SYMBOL_GPL(kvm_l1d_flush);

and then you won't need the symbol export either.
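
Putting the above together, a completely untested sketch of how it could
look once moved into vmx.c (made static, named asm operands, and the
X86_BUG_L1TF check done with boot_cpu_has_bug(); empty_zero_pages and its
allocation would have to move along with it):

static void vmx_l1d_flush(void)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	if (!boot_cpu_has_bug(X86_BUG_L1TF))
		return;

	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl %%eax, %%eax\n\t"
		"11: \n\t"
		"movzbl (%[pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $4096, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne 11b\n\t"
		"xorl %%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl %%eax, %%eax\n\t"
		"12:\n\t"
		"movzbl (%[pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $64, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne 12b\n\t"
		"lfence\n\t"
		: : [pages] "r" (empty_zero_pages), [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}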

>  int kvm_arch_init(void *opaque)
>  {
>  	int r;
>  	struct kvm_x86_ops *ops = opaque;
> +	struct page *page;
>  
>  	if (kvm_x86_ops) {
>  		printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6556,10 +6603,15 @@ int kvm_arch_init(void *opaque)
>  	}
>  
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);

GFP_ATOMIC, huh? That important?

And why isn't this allocation happening in vmx.c?

> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6590,6 +6642,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);
>  out:
> @@ -6614,6 +6668,7 @@ void kvm_arch_exit(void)
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  	free_percpu(shared_msrs);
>  }
>  
> @@ -7392,7 +7447,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> +	vcpu->arch.flush_cache_req = vcpu->arch.vcpu_unconfined;

Huh, why do we need them both then? ->flush_cache_req should be enough.

>  	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	vcpu->arch.vcpu_unconfined = false;
> +	if (vcpu->arch.flush_cache_req)
> +		vcpu->stat.l1d_flush++;
>  
>  	/*
>  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

* [MODERATED] Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-27 15:54 ` [MODERATED] " Borislav Petkov
@ 2018-06-27 16:21   ` Josh Poimboeuf
  2018-06-28  7:25   ` Thomas Gleixner
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Josh Poimboeuf @ 2018-06-27 16:21 UTC (permalink / raw)
  To: speck

On Wed, Jun 27, 2018 at 05:54:48PM +0200, speck for Borislav Petkov wrote:
> On Sat, Jun 23, 2018 at 09:54:16AM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> > 
> > We add two mitigation modes for CVE-2018-3620, aka L1 terminal
> > fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".
> > 
> > "vmentry_l1d_flush=2" is simply doing an L1 cache flush on every VMENTER.
> > "vmentry_l1d_flush=1" is trying to avoid so many L1 cache flueshes on
> 
> It goes without saying that "1" and "2" are not as good a choice as
> words with meaning.

+1, I also suggested this before.

-- 
Josh

* Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-27 15:54 ` [MODERATED] " Borislav Petkov
  2018-06-27 16:21   ` Josh Poimboeuf
@ 2018-06-28  7:25   ` Thomas Gleixner
  2018-06-28 10:44     ` Thomas Gleixner
  2018-06-28 15:13   ` Thomas Gleixner
  2018-06-29 12:31   ` [MODERATED] " Paolo Bonzini
  3 siblings, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2018-06-28  7:25 UTC (permalink / raw)
  To: speck

On Wed, 27 Jun 2018, speck for Borislav Petkov wrote:
> On Sat, Jun 23, 2018 at 09:54:16AM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> 
> Looks like Paolo is the author here - this patch needs a proper From: or
> authorship fixed when applying.

The ideal solution is to have:

From: Joe Hacker <joe@hacker.com>
Subject: foo: Do frotz

in the changelog itself. That works with git, and my scripts which grab the
patches from the list fix up the author and subject from it as well.

Having a

References: message-id-of-cover-letter@joehackers-box

makes the series threaded.

Thanks,

	tglx

* Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-28  7:25   ` Thomas Gleixner
@ 2018-06-28 10:44     ` Thomas Gleixner
  0 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2018-06-28 10:44 UTC (permalink / raw)
  To: speck

On Thu, 28 Jun 2018, speck for Thomas Gleixner wrote:
> On Wed, 27 Jun 2018, speck for Borislav Petkov wrote:
> > On Sat, Jun 23, 2018 at 09:54:16AM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > > 
> > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> > 
> > Looks like Paolo is the author here - this patch needs a proper From: or
> > authorship fixed when applying.
> 
> The ideal solution is to have:
> 
> From: Joe Hacker <joe@hacker.com>
> Subject: foo: Do frotz
> 
> in the changelog itself. That works with git and my scripts which grab the
> patches from the list fixup the author and subject from that as well.
> 
> Having a
> 
> References: message-id-of-cover-letter@joehackers-box
> 
> makes the series threaded.

That's not the changelog of course. That's a mail header.

* Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-27 15:54 ` [MODERATED] " Borislav Petkov
  2018-06-27 16:21   ` Josh Poimboeuf
  2018-06-28  7:25   ` Thomas Gleixner
@ 2018-06-28 15:13   ` Thomas Gleixner
  2018-06-29 12:31   ` [MODERATED] " Paolo Bonzini
  3 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2018-06-28 15:13 UTC (permalink / raw)
  To: speck

On Wed, 27 Jun 2018, speck for Borislav Petkov wrote:
> On Sat, Jun 23, 2018 at 09:54:16AM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
> > +{
> > +	vmx_save_host_state(vcpu);
> > +
> > +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
> > +	    !static_cpu_has(X86_BUG_L1TF)) {
> > +		vcpu->arch.flush_cache_req = false;
> > +		return;
> > +	}
> > +
> > +	switch (vmentry_l1d_flush) {
> > +	case 0:
> > +		vcpu->arch.flush_cache_req = false;
> > +		break;
> > +	case 1:
> > +		/*
> > +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> > +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> > +		 * following cases:
> > +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> > +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> > +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> > +		 *   the cache.
> > +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> > +		 *   or I/O instruction)
> > +		 * - anything that can cause get_user_pages (EPT violation, and again
> > +		 *   the slow paths for EPT misconfig or I/O instruction)
> > +		 * - anything that can run code outside KVM (external interrupt,
> > +		 *   which can run interrupt handlers or irqs; or the sched_in
> > +		 *   preempt notifier)
> > +		 */
> 
> I sure hope we won't miss a case in the future... :-\

The switch case can go away when using a static key.
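
Something like this, perhaps (sketch only; the keys would be flipped once
at module init, based on the parameter and after the enable_ept /
X86_FEATURE_HYPERVISOR / X86_BUG_L1TF checks):

/* Enabled for modes 1 and 2; the second key only for mode 2. */
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_always);

static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
	vmx_save_host_state(vcpu);

	if (!static_branch_unlikely(&vmx_l1d_should_flush)) {
		vcpu->arch.flush_cache_req = false;
		return;
	}

	vcpu->arch.flush_cache_req =
		static_branch_unlikely(&vmx_l1d_flush_always) ||
		vcpu->arch.vcpu_unconfined;
}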

> > +/*
> > + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> 
> Skylake? I think most if not all have 32K I$...
> 
> > + * 64 KiB because the replacement algorithm is not exactly LRU.
> > + */

I'd suggest the following wording:

/*
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
 * flushing it requires reading in 64 KiB because the replacement
 * algorithm is not exactly LRU. This could be sized at runtime via
 * cache topology information, but as all relevant affected CPUs have a
 * 32 KiB L1D cache there is no point in doing so.
 */

> > +#define L1D_CACHE_ORDER 4
> > +static void *__read_mostly empty_zero_pages;
> > +
> > +void kvm_l1d_flush(void)
> 
> Shouldn't this function be in vmx.c ?

Indeed, missed that.

> > +	int size = PAGE_SIZE << L1D_CACHE_ORDER;
> > +
> > +	ASSERT(boot_cpu_has(X86_BUG_L1TF));
> 
> This needs to be boot_cpu_has_bug().
> 
> And then what's wrong with:
> 
> 	if (!boot_cpu_has_bug(X86_BUG_L1TF))
> 		return;

You should not get there then.

> >  
> >  	r = -ENOMEM;
> > +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> 
> GFP_ATOMIC huh? That important.

Oops, missed that one.

Plus the allocation should only be done when the magic MSR is not available.
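
I.e. roughly this (sketch; X86_FEATURE_FLUSH_L1D is only a placeholder
name for a synthetic feature bit for that MSR, and GFP_KERNEL instead of
GFP_ATOMIC while at it):

	r = -ENOMEM;
	/* Only needed on affected CPUs that lack the dedicated flush MSR. */
	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			goto out;
		empty_zero_pages = page_address(page);
	}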

Thanks,

	tglx

* [MODERATED] Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-27 15:54 ` [MODERATED] " Borislav Petkov
                     ` (2 preceding siblings ...)
  2018-06-28 15:13   ` Thomas Gleixner
@ 2018-06-29 12:31   ` Paolo Bonzini
  3 siblings, 0 replies; 9+ messages in thread
From: Paolo Bonzini @ 2018-06-29 12:31 UTC (permalink / raw)
  To: speck

On 27/06/2018 17:54, speck for Borislav Petkov wrote:
>> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
>> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
>> +		 * following cases:
>> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
>> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
>> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
>> +		 *   the cache.
>> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
>> +		 *   or I/O instruction)
>> +		 * - anything that can cause get_user_pages (EPT violation, and again
>> +		 *   the slow paths for EPT misconfig or I/O instruction)
>> +		 * - anything that can run code outside KVM (external interrupt,
>> +		 *   which can run interrupt handlers or irqs; or the sched_in
>> +		 *   preempt notifier)
>> +		 */
> 
> I sure hope we won't miss a case in the future... :-\

We're already missing one; we cannot detect external interrupts that
happen between the vmexit and vmentry.  We can do two things about it:

1) run most vmexit handlers (the list would be basically the same as
above) with interrupts disabled, and change vmentry_l1d_flush=1 to
whitelist those vmexit handlers.  The interrupts would be acknowledged
after vmentry, adding about 2000 clock cycles latency to interrupts
delivered during vmexit handling.

2) handle at least the common case of softirqs by adding some kind of "#
of times softirqs were run" percpu variable.

Or (I should probably feel dirty for suggesting it) use rdtsc to find
vmexits that take suspiciously long (>2500 clock cycles?) and flush the
cache unconditionally.
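
For (2), the rough shape would be something like this; kvm_softirq_runs
is entirely made up and the softirq entry path would have to increment it:

DECLARE_PER_CPU(unsigned long, kvm_softirq_runs);

/* Snapshot taken right after vmexit ... */
static inline unsigned long kvm_softirq_snapshot(void)
{
	return this_cpu_read(kvm_softirq_runs);
}

/* ... and rechecked before vmentry; if it moved, force an L1D flush. */
static inline bool kvm_softirq_ran_since(unsigned long snap)
{
	return this_cpu_read(kvm_softirq_runs) != snap;
}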

Paolo


* [MODERATED] Re: [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2
  2018-06-23 13:54 [MODERATED] [PATCH v4 2/8] [PATCH v4 2/8] Linux Patch #2 konrad.wilk
  2018-06-27 10:20 ` Thomas Gleixner
  2018-06-27 15:54 ` [MODERATED] " Borislav Petkov
@ 2018-07-01 16:14 ` Josh Poimboeuf
  2 siblings, 0 replies; 9+ messages in thread
From: Josh Poimboeuf @ 2018-07-01 16:14 UTC (permalink / raw)
  To: speck

On Sat, Jun 23, 2018 at 09:54:16AM -0400, speck for konrad.wilk_at_oracle.com wrote:
> @@ -7392,7 +7447,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> +	vcpu->arch.flush_cache_req = vcpu->arch.vcpu_unconfined;
>  	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	vcpu->arch.vcpu_unconfined = false;
> +	if (vcpu->arch.flush_cache_req)
> +		vcpu->stat.l1d_flush++;

As Boris said, I don't see the need for the two different variables
which basically track the same thing.  And why is the stat updated in a
separate location from the actual flush?
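
E.g. (untested sketch) keep only ->flush_cache_req: the vmexit handlers
set it directly, prepare_guest_switch() forces or clears it depending on
the mode, and the flush site both consumes it and accounts for it:

	/* in vmx_vcpu_run(), right before the vmentry asm: */
	if (vcpu->arch.flush_cache_req) {
		vcpu->arch.flush_cache_req = false;
		vcpu->stat.l1d_flush++;
		kvm_l1d_flush();
	}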

-- 
Josh

