All of lore.kernel.org
 help / color / mirror / Atom feed
* [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
@ 2018-06-20 20:43 konrad.wilk
  2018-06-20 20:57 ` [MODERATED] " Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 5+ messages in thread
From: konrad.wilk @ 2018-06-20 20:43 UTC (permalink / raw)
  To: speck

Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
If it is active, the displacement flush is unnecessary.  Tested on
a Coffee Lake machine.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |  7 ++++-
 arch/x86/kvm/mmu.c              |  1 +
 arch/x86/kvm/svm.c              |  3 ++-
 arch/x86/kvm/vmx.c              | 46 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              | 59 +++++++++++++++++++++++++++++++++++++++--
 5 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..7d7626cffc21 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+	
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
 };
 
 struct kvm_lpage_info {
@@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -939,7 +943,7 @@ struct kvm_x86_ops {
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
 	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
-	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
@@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..4d4e3dc2494e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..302afb2643fa 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 		svm->asid_generation--;
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
 {
+	*need_l1d_flush = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 559a12b6184d..ba83d0cb9b03 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmentry_l1d_flush = 1;
+module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
+{
+	vmx_save_host_state(vcpu);
+	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		*need_l1d_flush = false;
+		return;
+	}
+
+	switch (vmentry_l1d_flush) {
+	case 0:
+		*need_l1d_flush = false;
+		break;
+	case 1:
+		/*
+		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
+		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
+		 * following cases:
+		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
+		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
+		 *   on every nested entry.  Nested hypervisors do not bother clearing
+		 *   the cache.
+		 * - anything that runs the emulator (the slow paths for EPT misconfig
+		 *   or I/O instruction)
+		 * - anything that can cause get_user_pages (EPT violation, and again
+		 *   the slow paths for EPT misconfig or I/O instruction)
+		 * - anything that can run code outside KVM (external interrupt,
+		 *   which can run interrupt handlers or irqs; or the sched_in
+		 *   preempt notifier)
+		 */
+		break;
+	case 2:
+	default:
+		*need_l1d_flush = true;
+		break;
+	}
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		vcpu->arch.vcpu_unconfined = true;
 	}
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return ret;
 	}
 
+	/* Hide L1D cache contents from the nested guest.  */
+	vmx->vcpu.arch.vcpu_unconfined = true;
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1065d4e7c5fd..c0079aae38ba 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = {
 };
 #endif
 
+
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
+	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+	asm volatile(
+		/* First ensure the pages are in the TLB */
+		"xorl %%eax, %%eax\n\t"
+		"11: \n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $4096, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		/* Now fill the cache */
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $64, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		: : "r" (empty_zero_pages), "r" (size)
+		: "eax", "ebx", "ecx", "edx");
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque)
 				"being able to snoop the host memory!");
 	}
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6627,6 +6672,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_cpu_accept_dm_intr(vcpu);
 
 	bool req_immediate_exit = false;
+	bool need_l1d_flush;
 
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
-	kvm_x86_ops->prepare_guest_switch(vcpu);
+	need_l1d_flush = vcpu->arch.vcpu_unconfined;
+	vcpu->arch.vcpu_unconfined = false;
+	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
+	if (need_l1d_flush) {
+		vcpu->stat.l1d_flush++;
+		kvm_l1d_flush();
+	}
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-20 20:43 [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5 konrad.wilk
@ 2018-06-20 20:57 ` Konrad Rzeszutek Wilk
  2018-06-21  3:17   ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 5+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-20 20:57 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> 
> Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> If it is active, the displacement flush is unnecessary.  Tested on
> a Coffee Lake machine.

Oops, that nice commit description you wrote, Paolo, got messed up. Sorry!

> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 ++++-
>  arch/x86/kvm/mmu.c              |  1 +
>  arch/x86/kvm/svm.c              |  3 ++-
>  arch/x86/kvm/vmx.c              | 46 +++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              | 59 +++++++++++++++++++++++++++++++++++++++--
>  5 files changed, 111 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c13cd28d9d1b..7d7626cffc21 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
>  
>  	/* be preempted when it's in kernel-mode(cpl=0) */
>  	bool preempted_in_kernel;
> +	

There is a stray trailing whitespace right above here... And I believe Paolo's original patch had it too.

> +	/* for L1 terminal fault vulnerability */
> +	bool vcpu_unconfined;
>  };
>  
>  struct kvm_lpage_info {
> @@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
>  	u64 signal_exits;
>  	u64 irq_window_exits;
>  	u64 nmi_window_exits;
> +	u64 l1d_flush;
>  	u64 halt_exits;
>  	u64 halt_successful_poll;
>  	u64 halt_attempted_poll;
> @@ -939,7 +943,7 @@ struct kvm_x86_ops {
>  	void (*vcpu_free)(struct kvm_vcpu *vcpu);
>  	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
>  
> -	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
> +	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
>  	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
>  	void (*vcpu_put)(struct kvm_vcpu *vcpu);
>  
> @@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  
>  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>  		     struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>  
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d594690d8b95..4d4e3dc2494e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
>  {
>  	int r = 1;
>  
> +	vcpu->arch.vcpu_unconfined = true;
>  	switch (vcpu->arch.apf.host_apf_reason) {
>  	default:
>  		trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f059a73f0fd0..302afb2643fa 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
>  		svm->asid_generation--;
>  }
>  
> -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
>  {
> +	*need_l1d_flush = false;
>  }
>  
>  static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..ba83d0cb9b03 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
>  };
>  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>  
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
> +
>  static bool __read_mostly enable_vpid = 1;
>  module_param_named(vpid, enable_vpid, bool, 0444);
>  
> @@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  				   vmx->guest_msrs[i].mask);
>  }
>  
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> +{
> +	vmx_save_host_state(vcpu);
> +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {

and 
	|| !boot_cpu_has(X86_BUG_L1TF)

> +		*need_l1d_flush = false;
> +		return;
> +	}
> +

> +	switch (vmentry_l1d_flush) {
> +	case 0:
> +		*need_l1d_flush = false;
> +		break;
> +	case 1:
> +		/*
> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> +		 * following cases:
> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> +		 *   the cache.
> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> +		 *   or I/O instruction)
> +		 * - anything that can cause get_user_pages (EPT violation, and again
> +		 *   the slow paths for EPT misconfig or I/O instruction)
> +		 * - anything that can run code outside KVM (external interrupt,
> +		 *   which can run interrupt handlers or irqs; or the sched_in
> +		 *   preempt notifier)
> +		 */
> +		break;
> +	case 2:
> +	default:
> +		*need_l1d_flush = true;
> +		break;
> +	}
> +}
> +
>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
>  {
>  	if (!vmx->host_state.loaded)
> @@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  			[ss]"i"(__KERNEL_DS),
>  			[cs]"i"(__KERNEL_CS)
>  			);
> +		vcpu->arch.vcpu_unconfined = true;
>  	}
>  }
>  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  		return ret;
>  	}
>  
> +	/* Hide L1D cache contents from the nested guest.  */
> +	vmx->vcpu.arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>  	 * by event injection, halt vcpu.
> @@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.vcpu_free = vmx_free_vcpu,
>  	.vcpu_reset = vmx_vcpu_reset,
>  
> -	.prepare_guest_switch = vmx_save_host_state,
> +	.prepare_guest_switch = vmx_prepare_guest_switch,
>  	.vcpu_load = vmx_vcpu_load,
>  	.vcpu_put = vmx_vcpu_put,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1065d4e7c5fd..c0079aae38ba 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  	{ "irq_injections", VCPU_STAT(irq_injections) },
>  	{ "nmi_injections", VCPU_STAT(nmi_injections) },
>  	{ "req_event", VCPU_STAT(req_event) },
> +	{ "l1d_flush", VCPU_STAT(l1d_flush) },
>  	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
>  	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
>  	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
>  	bool writeback = true;
>  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>  
> +	vcpu->arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * Clear write_fault_to_shadow_pgtable here to ensure it is
>  	 * never reused.
> @@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = {
>  };
>  #endif
>  
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> +	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
> +	int size = PAGE_SIZE << L1D_CACHE_ORDER;

I am thinking we should have:

	ASSERT(boot_cpu_has(X86_BUG_L1TF));

or such?

> +	asm volatile(
> +		/* First ensure the pages are in the TLB */
> +		"xorl %%eax, %%eax\n\t"
> +		"11: \n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $4096, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"
> +		/* Now fill the cache */
> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		: : "r" (empty_zero_pages), "r" (size)
> +		: "eax", "ebx", "ecx", "edx");
> +}
> +
>  int kvm_arch_init(void *opaque)
>  {
>  	int r;
>  	struct kvm_x86_ops *ops = opaque;
> +	struct page *page;
>  
>  	if (kvm_x86_ops) {
>  		printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque)
>  				"being able to snoop the host memory!");
>  	}
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);
>  out:
> @@ -6627,6 +6672,7 @@ void kvm_arch_exit(void)
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  	free_percpu(shared_msrs);
>  }
>  
> @@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  		kvm_cpu_accept_dm_intr(vcpu);
>  
>  	bool req_immediate_exit = false;
> +	bool need_l1d_flush;
>  
>  	if (kvm_request_pending(vcpu)) {
>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> @@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> -	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	need_l1d_flush = vcpu->arch.vcpu_unconfined;
> +	vcpu->arch.vcpu_unconfined = false;
> +	kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
> +	if (need_l1d_flush) {
> +		vcpu->stat.l1d_flush++;
> +		kvm_l1d_flush();
> +	}

I was thinking that this kvm_l1d_flush should really happen right before
you do a VMENTER. I was thinking of changing your patch for this exact reason,
but I am curious why you put it here?

>  
>  	/*
>  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
> @@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>  	struct kvm *kvm = vcpu->kvm;
>  
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> +	vcpu->arch.vcpu_unconfined = true;
>  
>  	for (;;) {
>  		if (kvm_vcpu_running(vcpu)) {
> @@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  
>  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  {
> +	vcpu->arch.vcpu_unconfined = true;
>  	kvm_x86_ops->sched_in(vcpu, cpu);
>  }
>  
> -- 
> 2.14.3
> 
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-20 20:57 ` [MODERATED] " Konrad Rzeszutek Wilk
@ 2018-06-21  3:17   ` Konrad Rzeszutek Wilk
  2018-06-21 14:04     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 5+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-21  3:17 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 04:57:02PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> > 
> > Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> > If it is active, the displacement flush is unnecessary.  Tested on
> > a Coffee Lake machine.
> 
> ooops, that nice commit description you wrote Paolo got messed up. Sorry!

Ended up fixing this up to be:

From a2029479d4c83c8e3476d8b37d288fb441576b43 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Jun 2018 15:49:44 -0400
Subject: [PATCH v2.2 05/12] kvm: x86: mitigation for L1 cache terminal fault
 vulnerabilities

We add two mitigation modes for CVE-2018-3620, aka L1 terminal
fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".

"vmentry_l1d_flush=2" is simply doing an L1 cache flush on every VMENTER.
"vmentry_l1d_flush=1" is trying to avoid so many L1 cache flushes on
VMENTER and instead only does so if the reason for entering the hypervisor
is based on the type of code that is executed. The idea is based on Intel's
patches, but we treat all vmexits as safe unless they execute
specific code that is considered unsafe. There is no hardcoded list of
"safe" exit reasons; but vmexits are considered safe unless:
 - They trigger the emulator, which could be a good target for
   other speculative execution-based threats,
 - or the MMU, which can bring host page tables in the L1 cache.
 - In addition, executing userspace or another process will trigger a flush.

The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
Coffee Lake (for the latter, this is independent of whether microcode
or the generic flush code are used).

The mitigation does not in any way try to do anything about hyperthreading;
it is possible for a sibling thread to read data from the cache during a
vmexit, before the host completes the flush, or to read data from the cache
while a sibling runs.  The suggestion there is to disable hyperthreading
unless you've configured your system to dedicate each core to a specific
guest.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
v2: Add checks for X86_BUG_L1TF
   - Rework the commit description.
v3: change module parameter to S_IRUGO
    move the kvm_l1d_flush closer to VMENTER
---
 Documentation/admin-guide/kernel-parameters.txt | 11 +++++
 arch/x86/include/asm/kvm_host.h                 |  8 ++++
 arch/x86/kvm/mmu.c                              |  1 +
 arch/x86/kvm/svm.c                              |  1 +
 arch/x86/kvm/vmx.c                              | 51 +++++++++++++++++++++-
 arch/x86/kvm/x86.c                              | 58 ++++++++++++++++++++++++-
 6 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4e56e6d0f124..dd1db796a463 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1923,6 +1923,17 @@
 			SMT (aka Hyper-Threading) enabled then don't load KVM module.
 			Default is 0 (allow module to be loaded).
 
+	kvm.vmentry_l1d_flush=[KVM] Mitigation for L1 Terminal Fault CVE.
+			Valid arguments: 0, 1, 2
+
+			2 does an L1 cache flush on every VMENTER.
+			1 tries to avoid so many L1 cache flushes on VMENTERs and instead
+			does it only if the kind of code that is executed would lead to
+			leaking host memory.
+			0 disables the mitigation.
+
+			Default is 1 (do L1 cache flush in specific instances)
+
 	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
 			KVM MMU at runtime.
 			Default is 0 (off)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..78748925a370 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -713,6 +713,12 @@ struct kvm_vcpu_arch {
 
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
+
+	/* for L1 terminal fault vulnerability */
+	bool vcpu_unconfined;
+
+	/* must flush the L1 Data cache */
+	bool flush_cache_req;
 };
 
 struct kvm_lpage_info {
@@ -881,6 +887,7 @@ struct kvm_vcpu_stat {
 	u64 signal_exits;
 	u64 irq_window_exits;
 	u64 nmi_window_exits;
+	u64 l1d_flush;
 	u64 halt_exits;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
@@ -1449,6 +1456,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 
 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
+void kvm_l1d_flush(void);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..4d4e3dc2494e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 {
 	int r = 1;
 
+	vcpu->arch.vcpu_unconfined = true;
 	switch (vcpu->arch.apf.host_apf_reason) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..fffc447f5410 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5437,6 +5437,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.flush_cache_req = false;
 }
 
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 559a12b6184d..d8ef111b8f5f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static int __read_mostly vmentry_l1d_flush = 1;
+module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, S_IRUGO);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -2618,6 +2621,45 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 				   vmx->guest_msrs[i].mask);
 }
 
+static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+	vmx_save_host_state(vcpu);
+
+	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+	    !static_cpu_has(X86_BUG_L1TF)) {
+		vcpu->arch.flush_cache_req = false;
+		return;
+	}
+
+	switch (vmentry_l1d_flush) {
+	case 0:
+		vcpu->arch.flush_cache_req = false;
+		break;
+	case 1:
+		/*
+		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
+		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
+		 * following cases:
+		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
+		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
+		 *   on every nested entry.  Nested hypervisors do not bother clearing
+		 *   the cache.
+		 * - anything that runs the emulator (the slow paths for EPT misconfig
+		 *   or I/O instruction)
+		 * - anything that can cause get_user_pages (EPT violation, and again
+		 *   the slow paths for EPT misconfig or I/O instruction)
+		 * - anything that can run code outside KVM (external interrupt,
+		 *   which can run interrupt handlers or irqs; or the sched_in
+		 *   preempt notifier)
+		 */
+		break;
+	case 2:
+	default:
+		vcpu->arch.flush_cache_req = true;
+		break;
+	}
+}
+
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
 	if (!vmx->host_state.loaded)
@@ -9751,6 +9793,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
+		vcpu->arch.vcpu_unconfined = true;
 	}
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -10008,6 +10051,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
 		(unsigned long)&current_evmcs->host_rsp : 0;
 
+	if (vcpu->arch.flush_cache_req)
+		kvm_l1d_flush();
+
 	asm(
 		/* Store host registers */
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -11811,6 +11857,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return ret;
 	}
 
+	/* Hide L1D cache contents from the nested guest.  */
+	vmx->vcpu.arch.vcpu_unconfined = true;
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12928,7 +12977,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_guest_switch,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1065d4e7c5fd..b3365883ccaf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "req_event", VCPU_STAT(req_event) },
+	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	bool writeback = true;
 	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	vcpu->arch.vcpu_unconfined = true;
+
 	/*
 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
 	 * never reused.
@@ -6537,10 +6540,49 @@ static struct notifier_block pvclock_gtod_notifier = {
 };
 #endif
 
+
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define L1D_CACHE_ORDER 4
+static void *__read_mostly empty_zero_pages;
+
+void kvm_l1d_flush(void)
+{
+	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
+	int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+	ASSERT(boot_cpu_has(X86_BUG_L1TF));
+
+	asm volatile(
+		/* First ensure the pages are in the TLB */
+		"xorl %%eax, %%eax\n\t"
+		"11: \n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $4096, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 11b\n\t"
+		"xorl %%eax, %%eax\n\t"
+		"cpuid\n\t"
+		/* Now fill the cache */
+		"xorl %%eax, %%eax\n\t"
+		"12:\n\t"
+		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+		"addl $64, %%eax\n\t"
+		"cmpl %%eax, %1\n\t"
+		"jne 12b\n\t"
+		"lfence\n\t"
+		: : "r" (empty_zero_pages), "r" (size)
+		: "eax", "ebx", "ecx", "edx");
+}
+EXPORT_SYMBOL_GPL(kvm_l1d_flush);
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
 	struct kvm_x86_ops *ops = opaque;
+	struct page *page;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -6569,10 +6611,15 @@ int kvm_arch_init(void *opaque)
 				"being able to snoop the host memory!");
 	}
 	r = -ENOMEM;
+	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
+	if (!page)
+		goto out;
+	empty_zero_pages = page_address(page);
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out;
+		goto out_free_zero_pages;
 	}
 
 	r = kvm_mmu_module_init();
@@ -6603,6 +6650,8 @@ int kvm_arch_init(void *opaque)
 
 	return 0;
 
+out_free_zero_pages:
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 out_free_percpu:
 	free_percpu(shared_msrs);
 out:
@@ -6627,6 +6676,7 @@ void kvm_arch_exit(void)
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
+	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
 	free_percpu(shared_msrs);
 }
 
@@ -7405,7 +7455,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
+	vcpu->arch.flush_cache_req = vcpu->arch.vcpu_unconfined;
 	kvm_x86_ops->prepare_guest_switch(vcpu);
+	vcpu->arch.vcpu_unconfined = false;
+	if (vcpu->arch.flush_cache_req)
+		vcpu->stat.l1d_flush++;
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7592,6 +7646,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+	vcpu->arch.vcpu_unconfined = true;
 
 	for (;;) {
 		if (kvm_vcpu_running(vcpu)) {
@@ -8711,6 +8766,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->arch.vcpu_unconfined = true;
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
-- 
2.13.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-21  3:17   ` Konrad Rzeszutek Wilk
@ 2018-06-21 14:04     ` Konrad Rzeszutek Wilk
  2018-06-21 17:29       ` Paolo Bonzini
  0 siblings, 1 reply; 5+ messages in thread
From: Konrad Rzeszutek Wilk @ 2018-06-21 14:04 UTC (permalink / raw)
  To: speck

On Wed, Jun 20, 2018 at 11:17:59PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> On Wed, Jun 20, 2018 at 04:57:02PM -0400, speck for Konrad Rzeszutek Wilk wrote:
> > On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> > > kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
> > > 
> > > Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> > > If it is active, the displacement flush is unnecessary.  Tested on
> > > a Coffee Lake machine.
> > 
> > ooops, that nice commit description you wrote Paolo got messed up. Sorry!
> 
> Ended up fixing this up to be:
> 
> >From a2029479d4c83c8e3476d8b37d288fb441576b43 Mon Sep 17 00:00:00 2001
> From: Paolo Bonzini <pbonzini@redhat.com>
> Date: Wed, 20 Jun 2018 15:49:44 -0400
> Subject: [PATCH v2.2 05/12] kvm: x86: mitigation for L1 cache terminal fault
>  vulnerabilities
> 
> We add two mitigation modes for CVE-2018-3620, aka L1 terminal
> fault.  The two modes are "vmentry_l1d_flush=1" and "vmentry_l1d_flush=2".
> 
> "vmentry_l1d_flush=2" is simply doing an L1 cache flush on every VMENTER.
> "vmentry_l1d_flush=1" is trying to avoid so many L1 cache flushes on
> VMENTER and instead only does if the reason for entering the hypervisor
> is based on the type of code that is executed. The idea is based on Intel's
> patches, but we treat all vmexits as safe unless they execute
> specific code that is considered unsafe. There is no hardcoded list of
> "safe" exit reasons; but vmexits are considered safe unless:
>  - They trigger the emulator, which could be a good target for
>    other speculative execution-based threats,
>  - or the MMU, which can bring host page tables in the L1 cache.
>  - In addition, executing userspace or another process will trigger a flush.

What about softirqs? Won't that happen on every VMEXIT? And how about
interrupts? They can happen too, any way to figure out whether an interrupt
happened between VMEXIT -> VMENTER? 

Also would it make sense to flip this around - that is everything is considered
a blacklist (aka always flush), except those routines which we believe
are OK and hence can be whitelisted?

And then what about the situation where say in a month somebody fixes a bug
or adds a new feature that brings in the host page table (alloc_page?)
and now we have to move it from whitelist to blacklist - there is not much
of an automatic way to detect this I think?

Or maybe there is? What are some of the calls that KVM can make that would
bring in the host page tables? I am assuming any call that hits 'page_alloc'
or 'kzalloc', 'vzalloc' or such? But others?

Is there a good way to figure this out except running trace on an VMEXIT->VMENTER
using the different kvm-unit tests and seeing this?


> 
> The default is "vmentry_l1d_flush=1".  The cost of "vmentry_l1d_flush=2"
> is up to 2.5x more expensive vmexits on Haswell processors, and 30% on
> Coffee Lake (for the latter, this is independent of whether microcode
> or the generic flush code are used).
> 
> The mitigation does not in any way try to do anything about hyperthreading;
> it is possible for a sibling thread to read data from the cache during a
> vmexit, before the host completes the flush, or to read data from the cache
> while a sibling runs.  The suggestion there is to disable hyperthreading
> unless you've configured your system to dedicate each core to a specific
> guest.

I vaguely recall folks saying that they tried the "trigger IPI on VMEXIT" - was
there any performance metrics compared to say having SMT disabled/enabled?
And are there any patches for this floating around that have been ported against
upstream kernel?

(Asking as I am thinking to throw to the performance team an obfuscated kernel
with various knobs and see which sucks less).

> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
> v2: Add checks for X86_BUG_L1TF
>    - Rework the commit description.
> v3: change module parameter to S_IRUGO
>     move the kvm_l1d_flush closer to VMENTER
> ---
>  Documentation/admin-guide/kernel-parameters.txt | 11 +++++
>  arch/x86/include/asm/kvm_host.h                 |  8 ++++
>  arch/x86/kvm/mmu.c                              |  1 +
>  arch/x86/kvm/svm.c                              |  1 +
>  arch/x86/kvm/vmx.c                              | 51 +++++++++++++++++++++-
>  arch/x86/kvm/x86.c                              | 58 ++++++++++++++++++++++++-
>  6 files changed, 128 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 4e56e6d0f124..dd1db796a463 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1923,6 +1923,17 @@
>  			SMT (aka Hyper-Threading) enabled then don't load KVM module.
>  			Default is 0 (allow module to be loaded).
>  
> +	kvm.vmentry_l1d_flush=[KVM] Mitigation for L1 Terminal Fault CVE.
> +			Valid arguments: 0, 1, 2
> +
> +			2 does an L1 cache flush on every VMENTER.
> +			1 tries to avoid so many L1 cache flush on VMENTERs and instead
> +			do it only if the kind of code that is executed would lead to
> +			leaking host memory.
> +			0 disables the mitigation
> +
> +			Default is 1 (do L1 cache flush in specific instances)
> +
>  	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
>  			KVM MMU at runtime.
>  			Default is 0 (off)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c13cd28d9d1b..78748925a370 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,12 @@ struct kvm_vcpu_arch {
>  
>  	/* be preempted when it's in kernel-mode(cpl=0) */
>  	bool preempted_in_kernel;
> +
> +	/* for L1 terminal fault vulnerability */
> +	bool vcpu_unconfined;
> +
> +	/* must flush the L1 Data cache */
> +	bool flush_cache_req;
>  };
>  
>  struct kvm_lpage_info {
> @@ -881,6 +887,7 @@ struct kvm_vcpu_stat {
>  	u64 signal_exits;
>  	u64 irq_window_exits;
>  	u64 nmi_window_exits;
> +	u64 l1d_flush;
>  	u64 halt_exits;
>  	u64 halt_successful_poll;
>  	u64 halt_attempted_poll;
> @@ -1449,6 +1456,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  
>  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>  		     struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>  
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d594690d8b95..4d4e3dc2494e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
>  {
>  	int r = 1;
>  
> +	vcpu->arch.vcpu_unconfined = true;
>  	switch (vcpu->arch.apf.host_apf_reason) {
>  	default:
>  		trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f059a73f0fd0..fffc447f5410 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5437,6 +5437,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
>  
>  static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
>  {
> +	vcpu->arch.flush_cache_req = false;
>  }
>  
>  static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..d8ef111b8f5f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
>  };
>  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>  
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, S_IRUGO);
> +
>  static bool __read_mostly enable_vpid = 1;
>  module_param_named(vpid, enable_vpid, bool, 0444);
>  
> @@ -2618,6 +2621,45 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>  				   vmx->guest_msrs[i].mask);
>  }
>  
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +{
> +	vmx_save_host_state(vcpu);
> +
> +	if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR) ||
> +	    !static_cpu_has(X86_BUG_L1TF)) {
> +		vcpu->arch.flush_cache_req = false;
> +		return;
> +	}
> +
> +	switch (vmentry_l1d_flush) {
> +	case 0:
> +		vcpu->arch.flush_cache_req = false;
> +		break;
> +	case 1:
> +		/*
> +		 * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> +		 * setting vcpu->arch.vcpu_unconfined.  Currently this happens in the
> +		 * following cases:
> +		 * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> +		 *   nested hypervisor *and* by KVM on bare metal, so we just do it
> +		 *   on every nested entry.  Nested hypervisors do not bother clearing
> +		 *   the cache.
> +		 * - anything that runs the emulator (the slow paths for EPT misconfig
> +		 *   or I/O instruction)
> +		 * - anything that can cause get_user_pages (EPT violation, and again
> +		 *   the slow paths for EPT misconfig or I/O instruction)
> +		 * - anything that can run code outside KVM (external interrupt,
> +		 *   which can run interrupt handlers or irqs; or the sched_in
> +		 *   preempt notifier)
> +		 */
> +		break;
> +	case 2:
> +	default:
> +		vcpu->arch.flush_cache_req = true;
> +		break;
> +	}
> +}
> +
>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
>  {
>  	if (!vmx->host_state.loaded)
> @@ -9751,6 +9793,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
>  			[ss]"i"(__KERNEL_DS),
>  			[cs]"i"(__KERNEL_CS)
>  			);
> +		vcpu->arch.vcpu_unconfined = true;
>  	}
>  }
>  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -10008,6 +10051,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
>  		(unsigned long)&current_evmcs->host_rsp : 0;
>  
> +	if (vcpu->arch.flush_cache_req)
> +		kvm_l1d_flush();
> +
>  	asm(
>  		/* Store host registers */
>  		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
> @@ -11811,6 +11857,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  		return ret;
>  	}
>  
> +	/* Hide L1D cache contents from the nested guest.  */
> +	vmx->vcpu.arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>  	 * by event injection, halt vcpu.
> @@ -12928,7 +12977,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.vcpu_free = vmx_free_vcpu,
>  	.vcpu_reset = vmx_vcpu_reset,
>  
> -	.prepare_guest_switch = vmx_save_host_state,
> +	.prepare_guest_switch = vmx_prepare_guest_switch,
>  	.vcpu_load = vmx_vcpu_load,
>  	.vcpu_put = vmx_vcpu_put,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1065d4e7c5fd..b3365883ccaf 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
>  	{ "irq_injections", VCPU_STAT(irq_injections) },
>  	{ "nmi_injections", VCPU_STAT(nmi_injections) },
>  	{ "req_event", VCPU_STAT(req_event) },
> +	{ "l1d_flush", VCPU_STAT(l1d_flush) },
>  	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
>  	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
>  	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
>  	bool writeback = true;
>  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>  
> +	vcpu->arch.vcpu_unconfined = true;
> +
>  	/*
>  	 * Clear write_fault_to_shadow_pgtable here to ensure it is
>  	 * never reused.
> @@ -6537,10 +6540,49 @@ static struct notifier_block pvclock_gtod_notifier = {
>  };
>  #endif
>  
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> +	/* FIXME: could this be boot_cpu_data.x86_cache_size * 2?  */
> +	int size = PAGE_SIZE << L1D_CACHE_ORDER;
> +
> +	ASSERT(boot_cpu_has(X86_BUG_L1TF));
> +
> +	asm volatile(
> +		/* First ensure the pages are in the TLB */
> +		"xorl %%eax, %%eax\n\t"
> +		"11: \n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $4096, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 11b\n\t"
> +		"xorl %%eax, %%eax\n\t"
> +		"cpuid\n\t"
> +		/* Now fill the cache */
> +		"xorl %%eax, %%eax\n\t"
> +		"12:\n\t"
> +		"movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> +		"addl $64, %%eax\n\t"
> +		"cmpl %%eax, %1\n\t"
> +		"jne 12b\n\t"
> +		"lfence\n\t"
> +		: : "r" (empty_zero_pages), "r" (size)
> +		: "eax", "ebx", "ecx", "edx");
> +}
> +EXPORT_SYMBOL_GPL(kvm_l1d_flush);
> +
>  int kvm_arch_init(void *opaque)
>  {
>  	int r;
>  	struct kvm_x86_ops *ops = opaque;
> +	struct page *page;
>  
>  	if (kvm_x86_ops) {
>  		printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6569,10 +6611,15 @@ int kvm_arch_init(void *opaque)
>  				"being able to snoop the host memory!");
>  	}
>  	r = -ENOMEM;
> +	page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> +	if (!page)
> +		goto out;
> +	empty_zero_pages = page_address(page);
> +
>  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
>  	if (!shared_msrs) {
>  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> -		goto out;
> +		goto out_free_zero_pages;
>  	}
>  
>  	r = kvm_mmu_module_init();
> @@ -6603,6 +6650,8 @@ int kvm_arch_init(void *opaque)
>  
>  	return 0;
>  
> +out_free_zero_pages:
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  out_free_percpu:
>  	free_percpu(shared_msrs);
>  out:
> @@ -6627,6 +6676,7 @@ void kvm_arch_exit(void)
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> +	free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
>  	free_percpu(shared_msrs);
>  }
>  
> @@ -7405,7 +7455,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	preempt_disable();
>  
> +	vcpu->arch.flush_cache_req = vcpu->arch.vcpu_unconfined;
>  	kvm_x86_ops->prepare_guest_switch(vcpu);
> +	vcpu->arch.vcpu_unconfined = false;
> +	if (vcpu->arch.flush_cache_req)
> +		vcpu->stat.l1d_flush++;
>  
>  	/*
>  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
> @@ -7592,6 +7646,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>  	struct kvm *kvm = vcpu->kvm;
>  
>  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> +	vcpu->arch.vcpu_unconfined = true;
>  
>  	for (;;) {
>  		if (kvm_vcpu_running(vcpu)) {
> @@ -8711,6 +8766,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  
>  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  {
> +	vcpu->arch.vcpu_unconfined = true;
>  	kvm_x86_ops->sched_in(vcpu, cpu);
>  }
>  
> -- 
> 2.13.4
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
  2018-06-21 14:04     ` Konrad Rzeszutek Wilk
@ 2018-06-21 17:29       ` Paolo Bonzini
  0 siblings, 0 replies; 5+ messages in thread
From: Paolo Bonzini @ 2018-06-21 17:29 UTC (permalink / raw)
  To: speck

[-- Attachment #1: Type: text/plain, Size: 2651 bytes --]

On 21/06/2018 16:04, speck for Konrad Rzeszutek Wilk wrote:
>>  - They trigger the emulator, which could be a good target for
>>    other speculative execution-based threats,
>>  - or the MMU, which can bring host page tables in the L1 cache.
>>  - In addition, executing userspace or another process will trigger a flush.
> What about softirqs? Won't that happen on every VMEXIT?

Not on every, but this is covered.  The code has a more verbose comment

	* - anything that can run code outside KVM (external interrupt,
	*   which can run interrupt handlers or irqs; or the sched_in
	*   preempt notifier)

> And how about
> interrupts? They can happen too, any way to figure out whether an interrupt
> happend between VMEXIT -> VMENTER? 

Nope.  A kernel-only vmexit (of the kind that isn't considered
"unconfined" anyway) is less than a microsecond so it's not that likely,
but it is going to happen if the guest does a busy cpuid loop and uses
rdtsc to detect outliers.

We could add a generation count for softirqs, and possibly blacklist HLT
vmexits in case it does kvm->idle->kvm with no actual context switch.

> Also would it make sense to flip this around - that is everything is considered
> a blacklist (aka always flush), except those routines which we believe
> are OK and hence can be whitelisted?

I did this in v1, but in the end when reviewing the list I ended up
putting everything on the whitelist.

vmexit is a very fast path.  This means that 1) it's unlikely to do
expensive things 2) you don't really want to add overhead there.  I'm
okay with blacklisting more nested virt operations (vmclear, vmptrld);
that would hit nested VMware Workstation and possibly VirtualBox pretty
badly, but honestly who cares.

> And then what about the situation where say in a month somebody fixes a bug
> or adds a new feature that brings in the host page table (alloc_page?)
> and now we have to move it from whitelist to blacklist - there is not much
> of an automatic way to detect this I think?
> 
> Or maybe there is? What are some of the calls that KVM can make that would
> bring in the host page tables? I am assuming any call that hits 'page_alloc'
> or 'kzalloc', 'vzalloc' or such? But others?

That's what I did in this patch.

> I vaguely recall folks saying that they tried the "trigger IPI on VMEXIT" - was
> there any performance metrix compared to say having SMT disabled/enabled?
> And are there any patches for this floating around that have been ported against
> upstream kernel?

No, they were just too ugly, and the benefit of hyperthreading is debatable.

Paolo


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-06-21 17:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-06-20 20:43 [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5 konrad.wilk
2018-06-20 20:57 ` [MODERATED] " Konrad Rzeszutek Wilk
2018-06-21  3:17   ` Konrad Rzeszutek Wilk
2018-06-21 14:04     ` Konrad Rzeszutek Wilk
2018-06-21 17:29       ` Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.