kvm.vger.kernel.org archive mirror
* [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
@ 2021-01-12  6:37 Wei Huang
  2021-01-12  6:37 ` [PATCH 2/2] KVM: SVM: Add support for VMCB address check change Wei Huang
                   ` (6 more replies)
  0 siblings, 7 replies; 33+ messages in thread
From: Wei Huang @ 2021-01-12  6:37 UTC (permalink / raw)
  To: kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, wei.huang2, mlevitsk

From: Bandan Das <bsd@redhat.com>

While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
before checking VMCB's instruction intercept. If EAX falls into such
memory areas, #GP is triggered before VMEXIT. This causes problems under
nested virtualization. To solve this problem, KVM needs to trap #GP and
check the instructions triggering #GP. For VM execution instructions,
KVM emulates these instructions; otherwise it re-injects #GP back to
guest VMs.

Signed-off-by: Bandan Das <bsd@redhat.com>
Co-developed-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
---
 arch/x86/include/asm/kvm_host.h |   8 +-
 arch/x86/kvm/mmu.h              |   1 +
 arch/x86/kvm/mmu/mmu.c          |   7 ++
 arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
 arch/x86/kvm/svm/svm.h          |   8 ++
 arch/x86/kvm/vmx/vmx.c          |   2 +-
 arch/x86/kvm/x86.c              |  37 +++++++-
 7 files changed, 146 insertions(+), 74 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..0ddc309f5a14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
  *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
  *			     Used to test the full emulator from userspace.
  *
- * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
  *			backdoor emulation, which is opt in via module param.
  *			VMware backoor emulation handles select instructions
- *			and reinjects the #GP for all other cases.
+ *			and reinjects #GP for all other cases. This also
+ *			handles other cases where #GP condition needs to be
+ *			handled and emulated appropriately
  *
  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
  *		 case the CR2/GPA value pass on the stack is valid.
@@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
 #define EMULTYPE_SKIP		    (1 << 2)
 #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
 #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
-#define EMULTYPE_VMWARE_GP	    (1 << 5)
+#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
 #define EMULTYPE_PF		    (1 << 6)
 
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 581925e476d6..1a2fff4e7140 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+bool kvm_is_host_reserved_region(u64 gpa);
 
 #endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..c5c4aaf01a1a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -50,6 +50,7 @@
 #include <asm/io.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
+#include <asm/e820/api.h>
 #include "trace.h"
 
 extern bool itlb_multihit_kvm_mitigation;
@@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
+bool kvm_is_host_reserved_region(u64 gpa)
+{
+	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
+}
+EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
+
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7ef171790d02..74620d32aa82 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 		if (!(efer & EFER_SVME)) {
 			svm_leave_nested(svm);
 			svm_set_gif(svm, true);
+			clr_exception_intercept(svm, GP_VECTOR);
 
 			/*
 			 * Free the nested guest state, unless we are in SMM.
@@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	svm->vmcb->save.efer = efer | EFER_SVME;
 	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+	/* Enable GP interception for SVM instructions if needed */
+	if (efer & EFER_SVME)
+		set_exception_intercept(svm, GP_VECTOR);
+
 	return 0;
 }
 
@@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int vmload_interception(struct vcpu_svm *svm)
+{
+	struct vmcb *nested_vmcb;
+	struct kvm_host_map map;
+	int ret;
+
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+	if (ret) {
+		if (ret == -EINVAL)
+			kvm_inject_gp(&svm->vcpu, 0);
+		return 1;
+	}
+
+	nested_vmcb = map.hva;
+
+	ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+	kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+	return ret;
+}
+
+static int vmsave_interception(struct vcpu_svm *svm)
+{
+	struct vmcb *nested_vmcb;
+	struct kvm_host_map map;
+	int ret;
+
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+	if (ret) {
+		if (ret == -EINVAL)
+			kvm_inject_gp(&svm->vcpu, 0);
+		return 1;
+	}
+
+	nested_vmcb = map.hva;
+
+	ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+	kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+	return ret;
+}
+
+static int vmrun_interception(struct vcpu_svm *svm)
+{
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	return nested_svm_vmrun(svm);
+}
+
+/* Emulate SVM VM execution instructions */
+static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	switch (modrm) {
+	case 0xd8: /* VMRUN */
+		return vmrun_interception(svm);
+	case 0xda: /* VMLOAD */
+		return vmload_interception(svm);
+	case 0xdb: /* VMSAVE */
+		return vmsave_interception(svm);
+	default:
+		/* inject a #GP for all other cases */
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
+}
+
 static int gp_interception(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	u32 error_code = svm->vmcb->control.exit_info_1;
-
-	WARN_ON_ONCE(!enable_vmware_backdoor);
+	int rc;
 
 	/*
-	 * VMware backdoor emulation on #GP interception only handles IN{S},
-	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
+	 * Only VMware backdoor and SVM VME errata are handled. Neither of
+	 * them has non-zero error codes.
 	 */
 	if (error_code) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 		return 1;
 	}
-	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
+
+	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
+	if (rc > 1)
+		rc = svm_emulate_vm_instr(vcpu, rc);
+	return rc;
 }
 
 static bool is_erratum_383(void)
@@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
 	return kvm_emulate_hypercall(&svm->vcpu);
 }
 
-static int vmload_interception(struct vcpu_svm *svm)
-{
-	struct vmcb *nested_vmcb;
-	struct kvm_host_map map;
-	int ret;
-
-	if (nested_svm_check_permissions(svm))
-		return 1;
-
-	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-	if (ret) {
-		if (ret == -EINVAL)
-			kvm_inject_gp(&svm->vcpu, 0);
-		return 1;
-	}
-
-	nested_vmcb = map.hva;
-
-	ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-	kvm_vcpu_unmap(&svm->vcpu, &map, true);
-
-	return ret;
-}
-
-static int vmsave_interception(struct vcpu_svm *svm)
-{
-	struct vmcb *nested_vmcb;
-	struct kvm_host_map map;
-	int ret;
-
-	if (nested_svm_check_permissions(svm))
-		return 1;
-
-	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-	if (ret) {
-		if (ret == -EINVAL)
-			kvm_inject_gp(&svm->vcpu, 0);
-		return 1;
-	}
-
-	nested_vmcb = map.hva;
-
-	ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-	kvm_vcpu_unmap(&svm->vcpu, &map, true);
-
-	return ret;
-}
-
-static int vmrun_interception(struct vcpu_svm *svm)
-{
-	if (nested_svm_check_permissions(svm))
-		return 1;
-
-	return nested_svm_vmrun(svm);
-}
-
 void svm_set_gif(struct vcpu_svm *svm, bool value)
 {
 	if (value) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 0fe874ae5498..d5dffcf59afa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 	recalc_intercepts(svm);
 }
 
+static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	WARN_ON_ONCE(bit >= 32);
+	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+}
+
 static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 {
 	struct vmcb *vmcb = get_host_vmcb(svm);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2af05d3b0590..5fac2f7cba24 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 			return 1;
 		}
-		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
+		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
 	}
 
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a8969a6dd06..c3662fc3b1bc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 	++vcpu->stat.insn_emulation_fail;
 	trace_kvm_emulate_insn_failed(vcpu);
 
-	if (emulation_type & EMULTYPE_VMWARE_GP) {
+	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 		return 1;
 	}
@@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 	return false;
 }
 
+static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned long rax;
+
+	if (ctxt->b != 0x1)
+		return 0;
+
+	switch (ctxt->modrm) {
+	case 0xd8: /* VMRUN */
+	case 0xda: /* VMLOAD */
+	case 0xdb: /* VMSAVE */
+		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
+		if (!kvm_is_host_reserved_region(rax))
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return ctxt->modrm;
+}
+
 static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
 {
 	switch (ctxt->opcode_len) {
@@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 	bool writeback = true;
 	bool write_fault_to_spt;
+	int vminstr;
 
 	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
 		return 1;
@@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		}
 	}
 
-	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
-	    !is_vmware_backdoor_opcode(ctxt)) {
-		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-		return 1;
+	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
+		vminstr = is_vm_instr_opcode(ctxt);
+		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
+			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+			return 1;
+		}
+		if (vminstr)
+			return vminstr;
 	}
 
 	/*
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/2] KVM: SVM: Add support for VMCB address check change
  2021-01-12  6:37 [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Wei Huang
@ 2021-01-12  6:37 ` Wei Huang
  2021-01-12 19:18   ` Sean Christopherson
  2021-01-14 12:04   ` Maxim Levitsky
  2021-01-12 11:09 ` [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Maxim Levitsky
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 33+ messages in thread
From: Wei Huang @ 2021-01-12  6:37 UTC (permalink / raw)
  To: kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, wei.huang2, mlevitsk

Newer AMD CPUs check the VMEXIT intercept on special SVM instructions
before checking EAX against reserved memory regions. This behavior is
indicated by CPUID_0x8000000A_EDX[28]: if the bit is 1, KVM doesn't need
to intercept and emulate #GP faults for such instructions because #GP
isn't supposed to be triggered.

Co-developed-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/kvm/svm/svm.c             | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 84b887825f12..ea89d6fdd79a 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -337,6 +337,7 @@
 #define X86_FEATURE_AVIC		(15*32+13) /* Virtual Interrupt Controller */
 #define X86_FEATURE_V_VMSAVE_VMLOAD	(15*32+15) /* Virtual VMSAVE VMLOAD */
 #define X86_FEATURE_VGIF		(15*32+16) /* Virtual GIF */
+#define X86_FEATURE_SVME_ADDR_CHK	(15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
 #define X86_FEATURE_AVX512VBMI		(16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 74620d32aa82..451b82df2eab 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -311,7 +311,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	svm->vmcb->save.efer = efer | EFER_SVME;
 	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 	/* Enable GP interception for SVM instructions if needed */
-	if (efer & EFER_SVME)
+	if ((efer & EFER_SVME) && !boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
 		set_exception_intercept(svm, GP_VECTOR);
 
 	return 0;
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12  6:37 [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Wei Huang
  2021-01-12  6:37 ` [PATCH 2/2] KVM: SVM: Add support for VMCB address check change Wei Huang
@ 2021-01-12 11:09 ` Maxim Levitsky
  2021-01-12 21:05   ` Wei Huang
  2021-01-12 12:15 ` Vitaly Kuznetsov
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 33+ messages in thread
From: Maxim Levitsky @ 2021-01-12 11:09 UTC (permalink / raw)
  To: Wei Huang, kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert

On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
> From: Bandan Das <bsd@redhat.com>
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problems under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
> 
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Co-developed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>

This is the ultimate fix for this bug, the one I had in mind
but didn't dare to develop, thinking it wouldn't be accepted
due to the added complexity.

From a cursory look this looks all right, and I will review
and test it either today or tomorrow.

Thank you very much for doing the right fix for this bug.

Best regards,
	Maxim Levitsky


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12  6:37 [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Wei Huang
  2021-01-12  6:37 ` [PATCH 2/2] KVM: SVM: Add support for VMCB address check change Wei Huang
  2021-01-12 11:09 ` [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Maxim Levitsky
@ 2021-01-12 12:15 ` Vitaly Kuznetsov
  2021-01-12 15:11   ` Andy Lutomirski
  2021-01-12 21:50   ` Wei Huang
  2021-01-12 14:01 ` Paolo Bonzini
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 33+ messages in thread
From: Vitaly Kuznetsov @ 2021-01-12 12:15 UTC (permalink / raw)
  To: Wei Huang, kvm
  Cc: linux-kernel, pbonzini, seanjc, joro, bp, tglx, mingo, x86,
	jmattson, wanpengli, bsd, dgilbert, wei.huang2, mlevitsk

Wei Huang <wei.huang2@amd.com> writes:

> From: Bandan Das <bsd@redhat.com>
>
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problems under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Co-developed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h              |   1 +
>  arch/x86/kvm/mmu/mmu.c          |   7 ++
>  arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>  arch/x86/kvm/svm/svm.h          |   8 ++
>  arch/x86/kvm/vmx/vmx.c          |   2 +-
>  arch/x86/kvm/x86.c              |  37 +++++++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *			     Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>   *			backdoor emulation, which is opt in via module param.
>   *			VMware backoor emulation handles select instructions
> - *			and reinjects the #GP for all other cases.
> + *			and reinjects #GP for all other cases. This also
> + *			handles other cases where #GP condition needs to be
> + *			handled and emulated appropriately
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>   *		 case the CR2/GPA value pass on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP		    (1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
> -#define EMULTYPE_VMWARE_GP	    (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
>  #define EMULTYPE_PF		    (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);

Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 

>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include <asm/io.h>
>  #include <asm/vmx.h>
>  #include <asm/kvm_page_track.h>
> +#include <asm/e820/api.h>
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}

While _e820__mapped_any()'s doc says '..  checks if any part of the
range <start,end> is mapped ..' it seems to me that the real check is
[start, end) so we should use 'gpa' instead of 'gpa-1', no?
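
With [start, end) semantics, probing the single byte at 'gpa' would
look something like this (untested, just to illustrate the boundary):

	return e820__mapped_raw_any(gpa, gpa + 1, E820_TYPE_RESERVED);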

> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  		if (!(efer & EFER_SVME)) {
>  			svm_leave_nested(svm);
>  			svm_set_gif(svm, true);
> +			clr_exception_intercept(svm, GP_VECTOR);
>  
>  			/*
>  			 * Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>  	svm->vmcb->save.efer = efer | EFER_SVME;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> +	/* Enable GP interception for SVM instructions if needed */
> +	if (efer & EFER_SVME)
> +		set_exception_intercept(svm, GP_VECTOR);
> +
>  	return 0;
>  }
>  
> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>  	return 1;
>  }
>  
> +static int vmload_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmsave_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmrun_interception(struct vcpu_svm *svm)
> +{
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	return nested_svm_vmrun(svm);
> +}
> +
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	switch (modrm) {
> +	case 0xd8: /* VMRUN */
> +		return vmrun_interception(svm);
> +	case 0xda: /* VMLOAD */
> +		return vmload_interception(svm);
> +	case 0xdb: /* VMSAVE */
> +		return vmsave_interception(svm);
> +	default:
> +		/* inject a #GP for all other cases */
> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +		return 1;
> +	}
> +}
> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>  	struct kvm_vcpu *vcpu = &svm->vcpu;
>  	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
> +	int rc;
>  
>  	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
>  	 */
>  	if (error_code) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  		return 1;
>  	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
>  }
>  
>  static bool is_erratum_383(void)
> @@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
>  	return kvm_emulate_hypercall(&svm->vcpu);
>  }
>  
> -static int vmload_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmsave_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmrun_interception(struct vcpu_svm *svm)
> -{
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	return nested_svm_vmrun(svm);
> -}
> -

Maybe if you'd do it the other way around and put gp_interception()
after vm{load,save,run}_interception(), the diff (and code churn)
would've been smaller? 

>  void svm_set_gif(struct vcpu_svm *svm, bool value)
>  {
>  	if (value) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 0fe874ae5498..d5dffcf59afa 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
>  	recalc_intercepts(svm);
>  }
>  
> +static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
> +{
> +	struct vmcb *vmcb = get_host_vmcb(svm);
> +
> +	WARN_ON_ONCE(bit >= 32);
> +	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
> +}
> +
>  static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
>  {
>  	struct vmcb *vmcb = get_host_vmcb(svm);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 2af05d3b0590..5fac2f7cba24 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  			return 1;
>  		}
> -		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>  	}
>  
>  	/*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a8969a6dd06..c3662fc3b1bc 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>  	++vcpu->stat.insn_emulation_fail;
>  	trace_kvm_emulate_insn_failed(vcpu);
>  
> -	if (emulation_type & EMULTYPE_VMWARE_GP) {
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>  		return 1;
>  	}
> @@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
>  	return false;
>  }
>  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)

Nit: it seems we either return '0' or 'ctxt->modrm' which is 'u8', so
'u8' instead of 'int' maybe?

> +{
> +	unsigned long rax;
> +
> +	if (ctxt->b != 0x1)
> +		return 0;
> +
> +	switch (ctxt->modrm) {
> +	case 0xd8: /* VMRUN */
> +	case 0xda: /* VMLOAD */
> +	case 0xdb: /* VMSAVE */
> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> +		if (!kvm_is_host_reserved_region(rax))
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>  	switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	bool writeback = true;
>  	bool write_fault_to_spt;
> +	int vminstr;
>  
>  	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>  		return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		}
>  	}
>  
> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> -	    !is_vmware_backdoor_opcode(ctxt)) {
> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> -		return 1;
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> +		vminstr = is_vm_instr_opcode(ctxt);
> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +			return 1;
> +		}
> +		if (vminstr)
> +			return vminstr;
>  	}
>  
>  	/*

-- 
Vitaly


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12  6:37 [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Wei Huang
                   ` (2 preceding siblings ...)
  2021-01-12 12:15 ` Vitaly Kuznetsov
@ 2021-01-12 14:01 ` Paolo Bonzini
  2021-01-12 17:42   ` Sean Christopherson
  2021-01-15  7:00   ` Wei Huang
  2021-01-12 17:36 ` Sean Christopherson
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 33+ messages in thread
From: Paolo Bonzini @ 2021-01-12 14:01 UTC (permalink / raw)
  To: Wei Huang, kvm
  Cc: linux-kernel, vkuznets, seanjc, joro, bp, tglx, mingo, x86,
	jmattson, wanpengli, bsd, dgilbert, mlevitsk

On 12/01/21 07:37, Wei Huang wrote:
>   static int gp_interception(struct vcpu_svm *svm)
>   {
>   	struct kvm_vcpu *vcpu = &svm->vcpu;
>   	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
> +	int rc;
>   
>   	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
>   	 */
>   	if (error_code) {
>   		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>   		return 1;
>   	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
>   }
>   

Passing back the third byte is quite hacky.  Instead of this change to 
kvm_emulate_instruction, I'd rather check the instruction bytes in 
gp_interception before calling kvm_emulate_instruction.  That would be 
something like:

- move "kvm_clear_exception_queue(vcpu);" inside the "if 
(!(emulation_type & EMULTYPE_NO_DECODE))".  It doesn't apply when you 
are coming back from userspace.

- extract the "if (!(emulation_type & EMULTYPE_NO_DECODE))" body to a 
new function x86_emulate_decoded_instruction.  Call it from 
gp_interception; we know this is not a page fault and therefore 
vcpu->arch.write_fault_to_shadow_pgtable must be false.

- check ctxt->insn_bytes for an SVM instruction

- if not an SVM instruction, call kvm_emulate_instruction(vcpu, 
EMULTYPE_VMWARE_GP|EMULTYPE_NO_DECODE).
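
Roughly something like this (rough, untested sketch; the signature and
return value of the new x86_emulate_decoded_instruction() helper are
just guesses here, everything else is what the patch already has):

static int gp_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
	u32 error_code = svm->vmcb->control.exit_info_1;

	if (error_code)
		goto reinject;

	/* Decode only; nothing has been emulated at this point. */
	if (!x86_emulate_decoded_instruction(vcpu, NULL, 0))
		goto reinject;

	/* 0f 01 /3: VMRUN (0xd8), VMLOAD (0xda), VMSAVE (0xdb) */
	if (ctxt->opcode_len == 2 && ctxt->b == 0x01 &&
	    (ctxt->modrm == 0xd8 || ctxt->modrm == 0xda ||
	     ctxt->modrm == 0xdb))
		return svm_emulate_vm_instr(vcpu, ctxt->modrm);

	/* Not an SVM instruction: existing VMware backdoor path. */
	return kvm_emulate_instruction(vcpu,
				       EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);

reinject:
	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
	return 1;
}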

Thanks,

Paolo


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 12:15 ` Vitaly Kuznetsov
@ 2021-01-12 15:11   ` Andy Lutomirski
  2021-01-12 15:17     ` Maxim Levitsky
  2021-01-12 21:50   ` Wei Huang
  1 sibling, 1 reply; 33+ messages in thread
From: Andy Lutomirski @ 2021-01-12 15:11 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: Wei Huang, kvm, linux-kernel, pbonzini, seanjc, joro, bp, tglx,
	mingo, x86, jmattson, wanpengli, bsd, dgilbert, wei.huang2,
	mlevitsk


> On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> 
> Wei Huang <wei.huang2@amd.com> writes:
> 
>> From: Bandan Das <bsd@redhat.com>
>> 
>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>> before checking VMCB's instruction intercept. If EAX falls into such
>> memory areas, #GP is triggered before VMEXIT. This causes problems under
>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>> check the instructions triggering #GP. For VM execution instructions,
>> KVM emulates these instructions; otherwise it re-injects #GP back to
>> guest VMs.
>> 
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>> ---
>> arch/x86/include/asm/kvm_host.h |   8 +-
>> arch/x86/kvm/mmu.h              |   1 +
>> arch/x86/kvm/mmu/mmu.c          |   7 ++
>> arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>> arch/x86/kvm/svm/svm.h          |   8 ++
>> arch/x86/kvm/vmx/vmx.c          |   2 +-
>> arch/x86/kvm/x86.c              |  37 +++++++-
>> 7 files changed, 146 insertions(+), 74 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 3d6616f6f6ef..0ddc309f5a14 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>>  *                 due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>>  *                 Used to test the full emulator from userspace.
>>  *
>> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
>> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>>  *            backdoor emulation, which is opt in via module param.
>>  *            VMware backoor emulation handles select instructions
>> - *            and reinjects the #GP for all other cases.
>> + *            and reinjects #GP for all other cases. This also
>> + *            handles other cases where #GP condition needs to be
>> + *            handled and emulated appropriately
>>  *
>>  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>>  *         case the CR2/GPA value pass on the stack is valid.
>> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>> #define EMULTYPE_SKIP            (1 << 2)
>> #define EMULTYPE_ALLOW_RETRY_PF        (1 << 3)
>> #define EMULTYPE_TRAP_UD_FORCED        (1 << 4)
>> -#define EMULTYPE_VMWARE_GP        (1 << 5)
>> +#define EMULTYPE_PARAVIRT_GP        (1 << 5)
>> #define EMULTYPE_PF            (1 << 6)
>> 
>> int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 581925e476d6..1a2fff4e7140 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>> 
>> int kvm_mmu_post_init_vm(struct kvm *kvm);
>> void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>> +bool kvm_is_host_reserved_region(u64 gpa);
> 
> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
> 
>> 
>> #endif
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 6d16481aa29d..c5c4aaf01a1a 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -50,6 +50,7 @@
>> #include <asm/io.h>
>> #include <asm/vmx.h>
>> #include <asm/kvm_page_track.h>
>> +#include <asm/e820/api.h>
>> #include "trace.h"
>> 
>> extern bool itlb_multihit_kvm_mitigation;
>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>> }
>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>> 
>> +bool kvm_is_host_reserved_region(u64 gpa)
>> +{
>> +    return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>> +}
> 
> While _e820__mapped_any()'s doc says '..  checks if any part of the
> range <start,end> is mapped ..' it seems to me that the real check is
> [start, end) so we should use 'gpa' instead of 'gpa-1', no?

Why do you need to check GPA at all?



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 15:11   ` Andy Lutomirski
@ 2021-01-12 15:17     ` Maxim Levitsky
  2021-01-12 15:22       ` Andy Lutomirski
  0 siblings, 1 reply; 33+ messages in thread
From: Maxim Levitsky @ 2021-01-12 15:17 UTC (permalink / raw)
  To: Andy Lutomirski, Vitaly Kuznetsov
  Cc: Wei Huang, kvm, linux-kernel, pbonzini, seanjc, joro, bp, tglx,
	mingo, x86, jmattson, wanpengli, bsd, dgilbert

On Tue, 2021-01-12 at 07:11 -0800, Andy Lutomirski wrote:
> > On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> > 
> > Wei Huang <wei.huang2@amd.com> writes:
> > 
> > > From: Bandan Das <bsd@redhat.com>
> > > 
> > > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > > before checking VMCB's instruction intercept. If EAX falls into such
> > > memory areas, #GP is triggered before VMEXIT. This causes problems under
> > > nested virtualization. To solve this problem, KVM needs to trap #GP and
> > > check the instructions triggering #GP. For VM execution instructions,
> > > KVM emulates these instructions; otherwise it re-injects #GP back to
> > > guest VMs.
> > > 
> > > Signed-off-by: Bandan Das <bsd@redhat.com>
> > > Co-developed-by: Wei Huang <wei.huang2@amd.com>
> > > Signed-off-by: Wei Huang <wei.huang2@amd.com>
> > > ---
> > > arch/x86/include/asm/kvm_host.h |   8 +-
> > > arch/x86/kvm/mmu.h              |   1 +
> > > arch/x86/kvm/mmu/mmu.c          |   7 ++
> > > arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
> > > arch/x86/kvm/svm/svm.h          |   8 ++
> > > arch/x86/kvm/vmx/vmx.c          |   2 +-
> > > arch/x86/kvm/x86.c              |  37 +++++++-
> > > 7 files changed, 146 insertions(+), 74 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 3d6616f6f6ef..0ddc309f5a14 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
> > >  *                 due to an intercepted #UD (see EMULTYPE_TRAP_UD).
> > >  *                 Used to test the full emulator from userspace.
> > >  *
> > > - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> > > + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
> > >  *            backdoor emulation, which is opt in via module param.
> > >  *            VMware backoor emulation handles select instructions
> > > - *            and reinjects the #GP for all other cases.
> > > + *            and reinjects #GP for all other cases. This also
> > > + *            handles other cases where #GP condition needs to be
> > > + *            handled and emulated appropriately
> > >  *
> > >  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
> > >  *         case the CR2/GPA value pass on the stack is valid.
> > > @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
> > > #define EMULTYPE_SKIP            (1 << 2)
> > > #define EMULTYPE_ALLOW_RETRY_PF        (1 << 3)
> > > #define EMULTYPE_TRAP_UD_FORCED        (1 << 4)
> > > -#define EMULTYPE_VMWARE_GP        (1 << 5)
> > > +#define EMULTYPE_PARAVIRT_GP        (1 << 5)
> > > #define EMULTYPE_PF            (1 << 6)
> > > 
> > > int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> > > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > > index 581925e476d6..1a2fff4e7140 100644
> > > --- a/arch/x86/kvm/mmu.h
> > > +++ b/arch/x86/kvm/mmu.h
> > > @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
> > > 
> > > int kvm_mmu_post_init_vm(struct kvm *kvm);
> > > void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> > > +bool kvm_is_host_reserved_region(u64 gpa);
> > 
> > Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
> > 
> > > #endif
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 6d16481aa29d..c5c4aaf01a1a 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -50,6 +50,7 @@
> > > #include <asm/io.h>
> > > #include <asm/vmx.h>
> > > #include <asm/kvm_page_track.h>
> > > +#include <asm/e820/api.h>
> > > #include "trace.h"
> > > 
> > > extern bool itlb_multihit_kvm_mitigation;
> > > @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
> > > }
> > > EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
> > > 
> > > +bool kvm_is_host_reserved_region(u64 gpa)
> > > +{
> > > +    return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> > > +}
> > 
> > While _e820__mapped_any()'s doc says '..  checks if any part of the
> > range <start,end> is mapped ..' it seems to me that the real check is
> > [start, end) so we should use 'gpa' instead of 'gpa-1', no?
> 
> Why do you need to check GPA at all?
> 
To reduce the scope of the workaround.

The erratum only happens when the guest executes one of the SVM
instructions with an EAX value that happens to fall inside one
of the host's reserved memory regions (for example SMM).

So an SVM instruction whose EAX is a valid host physical address
is not expected to get a #GP due to this erratum.
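
Concretely, that is what the kvm_is_host_reserved_region() check in
is_vm_instr_opcode() is for; roughly (just restating the hunk quoted
above):

	rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
	if (!kvm_is_host_reserved_region(rax))
		return 0;	/* not the erratum: the #GP gets re-injected */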

Best regards,
	Maxim Levitsky

> 



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 15:17     ` Maxim Levitsky
@ 2021-01-12 15:22       ` Andy Lutomirski
  2021-01-12 15:46         ` Bandan Das
  0 siblings, 1 reply; 33+ messages in thread
From: Andy Lutomirski @ 2021-01-12 15:22 UTC (permalink / raw)
  To: Maxim Levitsky
  Cc: Vitaly Kuznetsov, Wei Huang, kvm, linux-kernel, pbonzini, seanjc,
	joro, bp, tglx, mingo, x86, jmattson, wanpengli, bsd, dgilbert



> On Jan 12, 2021, at 7:17 AM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
> 
> On Tue, 2021-01-12 at 07:11 -0800, Andy Lutomirski wrote:
>>>> On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>> 
>>> Wei Huang <wei.huang2@amd.com> writes:
>>> 
>>>> From: Bandan Das <bsd@redhat.com>
>>>> 
>>>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>>>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>>>> before checking VMCB's instruction intercept. If EAX falls into such
>>>> memory areas, #GP is triggered before VMEXIT. This causes problems under
>>>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>>>> check the instructions triggering #GP. For VM execution instructions,
>>>> KVM emulates these instructions; otherwise it re-injects #GP back to
>>>> guest VMs.
>>>> 
>>>> Signed-off-by: Bandan Das <bsd@redhat.com>
>>>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>>>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>>>> ---
>>>> arch/x86/include/asm/kvm_host.h |   8 +-
>>>> arch/x86/kvm/mmu.h              |   1 +
>>>> arch/x86/kvm/mmu/mmu.c          |   7 ++
>>>> arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>>>> arch/x86/kvm/svm/svm.h          |   8 ++
>>>> arch/x86/kvm/vmx/vmx.c          |   2 +-
>>>> arch/x86/kvm/x86.c              |  37 +++++++-
>>>> 7 files changed, 146 insertions(+), 74 deletions(-)
>>>> 
>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>> index 3d6616f6f6ef..0ddc309f5a14 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>>>> *                 due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>>>> *                 Used to test the full emulator from userspace.
>>>> *
>>>> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
>>>> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>>>> *            backdoor emulation, which is opt in via module param.
>>>> *            VMware backoor emulation handles select instructions
>>>> - *            and reinjects the #GP for all other cases.
>>>> + *            and reinjects #GP for all other cases. This also
>>>> + *            handles other cases where #GP condition needs to be
>>>> + *            handled and emulated appropriately
>>>> *
>>>> * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>>>> *         case the CR2/GPA value pass on the stack is valid.
>>>> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>>>> #define EMULTYPE_SKIP            (1 << 2)
>>>> #define EMULTYPE_ALLOW_RETRY_PF        (1 << 3)
>>>> #define EMULTYPE_TRAP_UD_FORCED        (1 << 4)
>>>> -#define EMULTYPE_VMWARE_GP        (1 << 5)
>>>> +#define EMULTYPE_PARAVIRT_GP        (1 << 5)
>>>> #define EMULTYPE_PF            (1 << 6)
>>>> 
>>>> int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>>>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>>>> index 581925e476d6..1a2fff4e7140 100644
>>>> --- a/arch/x86/kvm/mmu.h
>>>> +++ b/arch/x86/kvm/mmu.h
>>>> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>>>> 
>>>> int kvm_mmu_post_init_vm(struct kvm *kvm);
>>>> void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>>>> +bool kvm_is_host_reserved_region(u64 gpa);
>>> 
>>> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
>>> 
>>>> #endif
>>>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>>>> index 6d16481aa29d..c5c4aaf01a1a 100644
>>>> --- a/arch/x86/kvm/mmu/mmu.c
>>>> +++ b/arch/x86/kvm/mmu/mmu.c
>>>> @@ -50,6 +50,7 @@
>>>> #include <asm/io.h>
>>>> #include <asm/vmx.h>
>>>> #include <asm/kvm_page_track.h>
>>>> +#include <asm/e820/api.h>
>>>> #include "trace.h"
>>>> 
>>>> extern bool itlb_multihit_kvm_mitigation;
>>>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>>>> }
>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>> 
>>>> +bool kvm_is_host_reserved_region(u64 gpa)
>>>> +{
>>>> +    return e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>>>> +}
>>> 
>>> While _e820__mapped_any()'s doc says '..  checks if any part of the
>>> range <start,end> is mapped ..' it seems to me that the real check is
>>> [start, end) so we should use 'gpa' instead of 'gpa-1', no?
>> 
>> Why do you need to check GPA at all?
>> 
> To reduce the scope of the workaround.
> 
> The errata only happens when you use one of SVM instructions
> in the guest with EAX that happens to be inside one
> of the host reserved memory regions (for example SMM).

This code reduces the scope of the workaround at the cost of increasing the complexity of the workaround, adding a nonsensical coupling between KVM and host details, and adding an export that really doesn’t deserve to be exported.

Is there an actual concrete benefit to this check?


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 15:22       ` Andy Lutomirski
@ 2021-01-12 15:46         ` Bandan Das
  2021-01-12 15:51           ` Andy Lutomirski
  0 siblings, 1 reply; 33+ messages in thread
From: Bandan Das @ 2021-01-12 15:46 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Maxim Levitsky, Vitaly Kuznetsov, Wei Huang, kvm, linux-kernel,
	pbonzini, seanjc, joro, bp, tglx, mingo, x86, jmattson,
	wanpengli, dgilbert

Andy Lutomirski <luto@amacapital.net> writes:
...
>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
>>>>> "trace.h"
>>>>> 
>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>>> 
>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
>>>>  While _e820__mapped_any()'s doc says '..  checks if any part of
>>>> the range <start,end> is mapped ..' it seems to me that the real
>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
>>>> no?
>>>  Why do you need to check GPA at all?
>>> 
>> To reduce the scope of the workaround.
>> 
>> The errata only happens when you use one of SVM instructions in the
>> guest with EAX that happens to be inside one of the host reserved
>> memory regions (for example SMM).
>
> This code reduces the scope of the workaround at the cost of
> increasing the complexity of the workaround and adding a nonsensical
> coupling between KVM and host details and adding an export that really
> doesn’t deserve to be exported.
>
> Is there an actual concrete benefit to this check?

Besides reducing the scope, my intention for the check was that we should
know if such exceptions occur for any other undiscovered reasons with other
memory types rather than hiding them under this workaround.

Bandan




^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 15:46         ` Bandan Das
@ 2021-01-12 15:51           ` Andy Lutomirski
  2021-01-12 17:56             ` Sean Christopherson
  0 siblings, 1 reply; 33+ messages in thread
From: Andy Lutomirski @ 2021-01-12 15:51 UTC (permalink / raw)
  To: Bandan Das
  Cc: Maxim Levitsky, Vitaly Kuznetsov, Wei Huang, kvm, linux-kernel,
	pbonzini, seanjc, joro, bp, tglx, mingo, x86, jmattson,
	wanpengli, dgilbert


> On Jan 12, 2021, at 7:46 AM, Bandan Das <bsd@redhat.com> wrote:
> 
> Andy Lutomirski <luto@amacapital.net> writes:
> ...
>>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
>>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
>>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
>>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
>>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
>>>>>> "trace.h"
>>>>>> 
>>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
>>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
>>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>>>> 
>>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
>>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
>>>>> While _e820__mapped_any()'s doc says '..  checks if any part of
>>>>> the range <start,end> is mapped ..' it seems to me that the real
>>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
>>>>> no?
>>>> Why do you need to check GPA at all?
>>>> 
>>> To reduce the scope of the workaround.
>>> 
>>> The errata only happens when you use one of SVM instructions in the
>>> guest with EAX that happens to be inside one of the host reserved
>>> memory regions (for example SMM).
>> 
>> This code reduces the scope of the workaround at the cost of
>> increasing the complexity of the workaround and adding a nonsensical
>> coupling between KVM and host details and adding an export that really
>> doesn’t deserve to be exported.
>> 
>> Is there an actual concrete benefit to this check?
> 
> Besides reducing the scope, my intention for the check was that we should
> know if such exceptions occur for any other undiscovered reasons with other
> memory types rather than hiding them under this workaround.

Ask AMD?

I would also believe that someone somewhere has a firmware that simply omits the problematic region instead of listing it as reserved.

> 
> Bandan
> 
> 
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12  6:37 [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Wei Huang
                   ` (3 preceding siblings ...)
  2021-01-12 14:01 ` Paolo Bonzini
@ 2021-01-12 17:36 ` Sean Christopherson
  2021-01-12 17:59   ` Sean Christopherson
  2021-01-12 19:40 ` Sean Christopherson
  2021-01-14 11:55 ` Maxim Levitsky
  6 siblings, 1 reply; 33+ messages in thread
From: Sean Christopherson @ 2021-01-12 17:36 UTC (permalink / raw)
  To: Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, mlevitsk

On Tue, Jan 12, 2021, Wei Huang wrote:
> From: Bandan Das <bsd@redhat.com>
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept.

It would be very helpful to list exactly which CPUs are/aren't affected, even if
that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
I assume it's all CPUs without the new CPUID flag?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 14:01 ` Paolo Bonzini
@ 2021-01-12 17:42   ` Sean Christopherson
  2021-01-13 12:35     ` Paolo Bonzini
  2021-01-15  7:00   ` Wei Huang
  1 sibling, 1 reply; 33+ messages in thread
From: Sean Christopherson @ 2021-01-12 17:42 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Wei Huang, kvm, linux-kernel, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, mlevitsk

On Tue, Jan 12, 2021, Paolo Bonzini wrote:
> On 12/01/21 07:37, Wei Huang wrote:
> >   static int gp_interception(struct vcpu_svm *svm)
> >   {
> >   	struct kvm_vcpu *vcpu = &svm->vcpu;
> >   	u32 error_code = svm->vmcb->control.exit_info_1;
> > -
> > -	WARN_ON_ONCE(!enable_vmware_backdoor);
> > +	int rc;
> >   	/*
> > -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> > -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> > +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> > +	 * them has non-zero error codes.
> >   	 */
> >   	if (error_code) {
> >   		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
> >   		return 1;
> >   	}
> > -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> > +
> > +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> > +	if (rc > 1)
> > +		rc = svm_emulate_vm_instr(vcpu, rc);
> > +	return rc;
> >   }
> 
> Passing back the third byte is quite hacky.  Instead of this change to
> kvm_emulate_instruction, I'd rather check the instruction bytes in
> gp_interception before calling kvm_emulate_instruction.

Agreed.  And I'd also prefer that any pure refactoring is done in separate
patch(es) so that the actual functional change is better isolated.
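
For illustration only (not the actual patch), that alternative could look like
the helper below, called from gp_interception() before kvm_emulate_instruction().
It ignores instruction prefixes and segmentation for brevity, and the helper
name is made up:

/*
 * Sketch only: recognize VMRUN/VMLOAD/VMSAVE (0F 01 with ModRM D8/DA/DB)
 * by peeking at the bytes at RIP.
 */
static bool svm_instr_opcode(struct kvm_vcpu *vcpu, u8 *modrm)
{
	struct x86_exception e;
	u8 insn[3];

	if (kvm_read_guest_virt(vcpu, kvm_rip_read(vcpu), insn,
				sizeof(insn), &e))
		return false;

	if (insn[0] != 0x0f || insn[1] != 0x01)
		return false;

	if (insn[2] != 0xd8 && insn[2] != 0xda && insn[2] != 0xdb)
		return false;

	*modrm = insn[2];
	return true;
}

A real version would have to go through the emulator's fetch path so that
segment bases and fetch permissions are honored.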

On a related topic, it feels like nested should be disabled by default on SVM
until it's truly ready for primetime, with the patch tagged for stable.  That
way we don't have to worry about crafting non-trivial fixes (like this one) to
make them backport-friendly.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 15:51           ` Andy Lutomirski
@ 2021-01-12 17:56             ` Sean Christopherson
  2021-01-13  4:55               ` Wei Huang
  0 siblings, 1 reply; 33+ messages in thread
From: Sean Christopherson @ 2021-01-12 17:56 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Bandan Das, Maxim Levitsky, Vitaly Kuznetsov, Wei Huang, kvm,
	linux-kernel, pbonzini, joro, bp, tglx, mingo, x86, jmattson,
	wanpengli, dgilbert

On Tue, Jan 12, 2021, Andy Lutomirski wrote:
> 
> > On Jan 12, 2021, at 7:46 AM, Bandan Das <bsd@redhat.com> wrote:
> > 
> > Andy Lutomirski <luto@amacapital.net> writes:
> > ...
> >>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
> >>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
> >>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
> >>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
> >>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
> >>>>>> "trace.h"
> >>>>>> 
> >>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
> >>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
> >>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
> >>>>>> 
> >>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
> >>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
> >>>>> While _e820__mapped_any()'s doc says '..  checks if any part of
> >>>>> the range <start,end> is mapped ..' it seems to me that the real
> >>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
> >>>>> no?
> >>>> Why do you need to check GPA at all?
> >>>> 
> >>> To reduce the scope of the workaround.
> >>> 
> >>> The errata only happens when you use one of SVM instructions in the
> >>> guest with EAX that happens to be inside one of the host reserved
> >>> memory regions (for example SMM).
> >> 
> >> This code reduces the scope of the workaround at the cost of
> >> increasing the complexity of the workaround and adding a nonsensical
> >> coupling between KVM and host details and adding an export that really
> >> doesn’t deserve to be exported.
> >> 
> >> Is there an actual concrete benefit to this check?
> > 
> > Besides reducing the scope, my intention for the check was that we should
> > know if such exceptions occur for any other undiscovered reasons with other
> > memory types rather than hiding them under this workaround.
> 
> Ask AMD?
> 
> I would also believe that someone somewhere has a firmware that simply omits
> the problematic region instead of listing it as reserved.

I agree with Andy, odds are very good that attempting to be precise will lead to
pain due to false negatives.

And, KVM's SVM instruction emulation needs to be rock solid regardless of
this behavior since KVM unconditionally intercepts the instruction, i.e. there's
basically zero risk to KVM.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 17:36 ` Sean Christopherson
@ 2021-01-12 17:59   ` Sean Christopherson
  2021-01-12 18:58     ` Andy Lutomirski
                       ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Sean Christopherson @ 2021-01-12 17:59 UTC (permalink / raw)
  To: Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, mlevitsk

On Tue, Jan 12, 2021, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Wei Huang wrote:
> > From: Bandan Das <bsd@redhat.com>
> > 
> > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > before checking VMCB's instruction intercept.
> 
> It would be very helpful to list exactly which CPUs are/aren't affected, even if
> that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
> I assume it's all CPUs without the new CPUID flag?

Ah, despite calling this an 'errata', the bad behavior is explicitly documented
in the APM, i.e. it's an architecture bug, not a silicon bug.

Can you reword the changelog to make it clear that the premature #GP is the
correct architectural behavior for CPUs without the new CPUID flag?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 17:59   ` Sean Christopherson
@ 2021-01-12 18:58     ` Andy Lutomirski
  2021-01-13  5:15       ` Wei Huang
  2021-01-13  5:03     ` Wei Huang
  2021-01-13 12:40     ` Paolo Bonzini
  2 siblings, 1 reply; 33+ messages in thread
From: Andy Lutomirski @ 2021-01-12 18:58 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Wei Huang, kvm list, LKML, Paolo Bonzini, Vitaly Kuznetsov,
	Joerg Roedel, Borislav Petkov, Thomas Gleixner, Ingo Molnar,
	X86 ML, Jim Mattson, Wanpeng Li, Bandan Das,
	Dr. David Alan Gilbert, Maxim Levitsky

On Tue, Jan 12, 2021 at 9:59 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jan 12, 2021, Sean Christopherson wrote:
> > On Tue, Jan 12, 2021, Wei Huang wrote:
> > > From: Bandan Das <bsd@redhat.com>
> > >
> > > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > > before checking VMCB's instruction intercept.
> >
> > It would be very helpful to list exactly which CPUs are/aren't affected, even if
> > that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
> > I assume it's all CPUs without the new CPUID flag?
>
> Ah, despite calling this an 'errata', the bad behavior is explicitly documented
> in the APM, i.e. it's an architecture bug, not a silicon bug.
>
> Can you reword the changelog to make it clear that the premature #GP is the
> correct architectural behavior for CPUs without the new CPUID flag?

Andrew Cooper points out that there may be a nicer workaround.  Make
sure that the SMRAM and HT region (FFFD00000000 - FFFFFFFFFFFF) are
marked as reserved in the guest, too.
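
For reference, a sketch of what that extra guest-side reservation amounts to,
expressed as an e820 entry (how the entry actually reaches the guest, via
firmware tables or the VMM, is left open; the HT range
FFFD00000000 - FFFFFFFFFFFF is 12 GiB):

/*
 * Sketch only: keep the guest from ever placing rAX (the VMCB address) in
 * the HT hole by marking it reserved in the guest's own memory map.
 * Definitions come from asm/e820/types.h.
 */
static const struct e820_entry ht_hole_reservation = {
	.addr = 0xfffd00000000ULL,
	.size = 0x300000000ULL,		/* 12 GiB */
	.type = E820_TYPE_RESERVED,
};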

--Andy

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] KVM: SVM: Add support for VMCB address check change
  2021-01-12  6:37 ` [PATCH 2/2] KVM: SVM: Add support for VMCB address check change Wei Huang
@ 2021-01-12 19:18   ` Sean Christopherson
  2021-01-14 11:39     ` Maxim Levitsky
  2021-01-14 12:04   ` Maxim Levitsky
  1 sibling, 1 reply; 33+ messages in thread
From: Sean Christopherson @ 2021-01-12 19:18 UTC (permalink / raw)
  To: Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, mlevitsk

On Tue, Jan 12, 2021, Wei Huang wrote:
> New AMD CPUs have a change that checks VMEXIT intercept on special SVM
> instructions before checking their EAX against reserved memory region.
> This change is indicated by CPUID_0x8000000A_EDX[28]. If it is 1, KVM
> doesn't need to intercept and emulate #GP faults for such instructions
> because #GP isn't supposed to be triggered.
> 
> Co-developed-by: Bandan Das <bsd@redhat.com>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/cpufeatures.h | 1 +
>  arch/x86/kvm/svm/svm.c             | 2 +-
>  2 files changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 84b887825f12..ea89d6fdd79a 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -337,6 +337,7 @@
>  #define X86_FEATURE_AVIC		(15*32+13) /* Virtual Interrupt Controller */
>  #define X86_FEATURE_V_VMSAVE_VMLOAD	(15*32+15) /* Virtual VMSAVE VMLOAD */
>  #define X86_FEATURE_VGIF		(15*32+16) /* Virtual GIF */
> +#define X86_FEATURE_SVME_ADDR_CHK	(15*32+28) /* "" SVME addr check */

Heh, KVM should advertise this to userspace by setting the kvm_cpu_cap bit.
KVM forwards relevant VM-Exits to L1 without checking if rAX points at an
invalid L1 GPA.
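
As a sketch, that could be a one-liner in svm_set_cpu_caps(), mirroring how
the other SVM feature bits are advertised (whether to gate it on the nested
module parameter is a judgment call):

	/*
	 * Sketch only: advertise the address-check fix to L1 whenever nested
	 * SVM is enabled, since KVM forwards the VMRUN/VMLOAD/VMSAVE exits to
	 * L1 without checking rAX against host-reserved regions.
	 */
	if (nested)
		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);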

>  /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
>  #define X86_FEATURE_AVX512VBMI		(16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 74620d32aa82..451b82df2eab 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -311,7 +311,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  	svm->vmcb->save.efer = efer | EFER_SVME;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
>  	/* Enable GP interception for SVM instructions if needed */
> -	if (efer & EFER_SVME)
> +	if ((efer & EFER_SVME) && !boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
>  		set_exception_intercept(svm, GP_VECTOR);
>  
>  	return 0;
> -- 
> 2.27.0
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12  6:37 [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Wei Huang
                   ` (4 preceding siblings ...)
  2021-01-12 17:36 ` Sean Christopherson
@ 2021-01-12 19:40 ` Sean Christopherson
  2021-01-12 20:00   ` Bandan Das
  2021-01-14 11:55 ` Maxim Levitsky
  6 siblings, 1 reply; 33+ messages in thread
From: Sean Christopherson @ 2021-01-12 19:40 UTC (permalink / raw)
  To: Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, mlevitsk

On Tue, Jan 12, 2021, Wei Huang wrote:
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	switch (modrm) {
> +	case 0xd8: /* VMRUN */
> +		return vmrun_interception(svm);
> +	case 0xda: /* VMLOAD */
> +		return vmload_interception(svm);
> +	case 0xdb: /* VMSAVE */
> +		return vmsave_interception(svm);
> +	default:
> +		/* inject a #GP for all other cases */
> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +		return 1;
> +	}
> +}
> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>  	struct kvm_vcpu *vcpu = &svm->vcpu;
>  	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
> +	int rc;
>  
>  	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
>  	 */
>  	if (error_code) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  		return 1;
>  	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
>  }
 
...
  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> +{
> +	unsigned long rax;
> +
> +	if (ctxt->b != 0x1)
> +		return 0;
> +
> +	switch (ctxt->modrm) {
> +	case 0xd8: /* VMRUN */
> +	case 0xda: /* VMLOAD */
> +	case 0xdb: /* VMSAVE */
> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> +		if (!kvm_is_host_reserved_region(rax))
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>  	switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	bool writeback = true;
>  	bool write_fault_to_spt;
> +	int vminstr;
>  
>  	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>  		return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		}
>  	}
>  
> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> -	    !is_vmware_backdoor_opcode(ctxt)) {
> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> -		return 1;
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> +		vminstr = is_vm_instr_opcode(ctxt);
> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +			return 1;
> +		}
> +		if (vminstr)
> +			return vminstr;

I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
be handled further up the stack.

>  	}
>  
>  	/*
> -- 
> 2.27.0
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 19:40 ` Sean Christopherson
@ 2021-01-12 20:00   ` Bandan Das
  2021-01-14 11:47     ` Maxim Levitsky
  0 siblings, 1 reply; 33+ messages in thread
From: Bandan Das @ 2021-01-12 20:00 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Wei Huang, kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx,
	mingo, x86, jmattson, wanpengli, dgilbert, mlevitsk

Sean Christopherson <seanjc@google.com> writes:
...
>> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
>> -	    !is_vmware_backdoor_opcode(ctxt)) {
>> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> -		return 1;
>> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>> +		vminstr = is_vm_instr_opcode(ctxt);
>> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
>> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> +			return 1;
>> +		}
>> +		if (vminstr)
>> +			return vminstr;
>
> I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
> L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
> x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
> be handled further up the stack.
>
So, the condition is that L2 executes a vmload and #GPs on a reserved address and exits to L0,
and L0 doesn't check whether L1 has asked for the instruction to be intercepted and goes on
with emulating vmload and returning back to L2?

>>  	}
>>  
>>  	/*
>> -- 
>> 2.27.0
>> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 11:09 ` [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Maxim Levitsky
@ 2021-01-12 21:05   ` Wei Huang
  0 siblings, 0 replies; 33+ messages in thread
From: Wei Huang @ 2021-01-12 21:05 UTC (permalink / raw)
  To: Maxim Levitsky, kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert



On 1/12/21 5:09 AM, Maxim Levitsky wrote:
> On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
>> From: Bandan Das <bsd@redhat.com>
>>
>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>> before checking VMCB's instruction intercept. If EAX falls into such
>> memory areas, #GP is triggered before VMEXIT. This causes problem under
>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>> check the instructions triggering #GP. For VM execution instructions,
>> KVM emulates these instructions; otherwise it re-injects #GP back to
>> guest VMs.
>>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> 
> This is the ultimate fix for this bug that I had in mind,
> but I didn't dare to develop it, thinking it won't be accepted
> due to the added complexity.
> 
>  From a cursory look this look all right, and I will review
> and test this either today or tomorrow.

My tests mainly relied on the kvm-unit-test you developed (thanks BTW), 
on machines w/ and w/o CPUID_0x8000000A_EDX[28]=1. Both cases passed.

>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 12:15 ` Vitaly Kuznetsov
  2021-01-12 15:11   ` Andy Lutomirski
@ 2021-01-12 21:50   ` Wei Huang
  1 sibling, 0 replies; 33+ messages in thread
From: Wei Huang @ 2021-01-12 21:50 UTC (permalink / raw)
  To: Vitaly Kuznetsov, kvm
  Cc: linux-kernel, pbonzini, seanjc, joro, bp, tglx, mingo, x86,
	jmattson, wanpengli, bsd, dgilbert, mlevitsk



On 1/12/21 6:15 AM, Vitaly Kuznetsov wrote:
> Wei Huang <wei.huang2@amd.com> writes:
> 
>> From: Bandan Das <bsd@redhat.com>
>>
>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>> before checking VMCB's instruction intercept. If EAX falls into such
>> memory areas, #GP is triggered before VMEXIT. This causes problem under
>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>> check the instructions triggering #GP. For VM execution instructions,
>> KVM emulates these instructions; otherwise it re-injects #GP back to
>> guest VMs.
>>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>> ---
>>   arch/x86/include/asm/kvm_host.h |   8 +-
>>   arch/x86/kvm/mmu.h              |   1 +
>>   arch/x86/kvm/mmu/mmu.c          |   7 ++
>>   arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>>   arch/x86/kvm/svm/svm.h          |   8 ++
>>   arch/x86/kvm/vmx/vmx.c          |   2 +-
>>   arch/x86/kvm/x86.c              |  37 +++++++-
>>   7 files changed, 146 insertions(+), 74 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 3d6616f6f6ef..0ddc309f5a14 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>>    *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>>    *			     Used to test the full emulator from userspace.
>>    *
>> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
>> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>>    *			backdoor emulation, which is opt in via module param.
>>    *			VMware backoor emulation handles select instructions
>> - *			and reinjects the #GP for all other cases.
>> + *			and reinjects #GP for all other cases. This also
>> + *			handles other cases where #GP condition needs to be
>> + *			handled and emulated appropriately
>>    *
>>    * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>>    *		 case the CR2/GPA value pass on the stack is valid.
>> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>>   #define EMULTYPE_SKIP		    (1 << 2)
>>   #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
>>   #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
>> -#define EMULTYPE_VMWARE_GP	    (1 << 5)
>> +#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
>>   #define EMULTYPE_PF		    (1 << 6)
>>   
>>   int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 581925e476d6..1a2fff4e7140 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>>   
>>   int kvm_mmu_post_init_vm(struct kvm *kvm);
>>   void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>> +bool kvm_is_host_reserved_region(u64 gpa);
> 
> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe?

Will do in v2.

> 
>>   
>>   #endif
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 6d16481aa29d..c5c4aaf01a1a 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -50,6 +50,7 @@
>>   #include <asm/io.h>
>>   #include <asm/vmx.h>
>>   #include <asm/kvm_page_track.h>
>> +#include <asm/e820/api.h>
>>   #include "trace.h"
>>   
>>   extern bool itlb_multihit_kvm_mitigation;
>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>>   }
>>   EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>   
>> +bool kvm_is_host_reserved_region(u64 gpa)
>> +{
>> +	return e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>> +}
> 
> While _e820__mapped_any()'s doc says '..  checks if any part of the
> range <start,end> is mapped ..' it seems to me that the real check is
> [start, end) so we should use 'gpa' instead of 'gpa-1', no?

I think you are right. The statement of "entry->addr >= end || 
entry->addr + entry->size <= start" shows the check is against the
range [start, end).
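
A corrected sketch of the helper with the half-open range, keeping the current
name (Vitaly also suggested renaming it to kvm_gpa_in_host_reserved()), would
be something like:

bool kvm_is_host_reserved_region(u64 gpa)
{
	/*
	 * e820__mapped_raw_any() checks the half-open range [start, end),
	 * so probing the single byte at @gpa means [gpa, gpa + 1).
	 */
	return e820__mapped_raw_any(gpa, gpa + 1, E820_TYPE_RESERVED);
}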

> 
>> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
>> +
>>   void kvm_mmu_zap_all(struct kvm *kvm)
>>   {
>>   	struct kvm_mmu_page *sp, *node;
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 7ef171790d02..74620d32aa82 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>>   		if (!(efer & EFER_SVME)) {
>>   			svm_leave_nested(svm);
>>   			svm_set_gif(svm, true);
>> +			clr_exception_intercept(svm, GP_VECTOR);
>>   
>>   			/*
>>   			 * Free the nested guest state, unless we are in SMM.
>> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>>   
>>   	svm->vmcb->save.efer = efer | EFER_SVME;
>>   	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
>> +	/* Enable GP interception for SVM instructions if needed */
>> +	if (efer & EFER_SVME)
>> +		set_exception_intercept(svm, GP_VECTOR);
>> +
>>   	return 0;
>>   }
>>   
>> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>>   	return 1;
>>   }
>>   
>> +static int vmload_interception(struct vcpu_svm *svm)
>> +{
>> +	struct vmcb *nested_vmcb;
>> +	struct kvm_host_map map;
>> +	int ret;
>> +
>> +	if (nested_svm_check_permissions(svm))
>> +		return 1;
>> +
>> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> +	if (ret) {
>> +		if (ret == -EINVAL)
>> +			kvm_inject_gp(&svm->vcpu, 0);
>> +		return 1;
>> +	}
>> +
>> +	nested_vmcb = map.hva;
>> +
>> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> +
>> +	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
>> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> +
>> +	return ret;
>> +}
>> +
>> +static int vmsave_interception(struct vcpu_svm *svm)
>> +{
>> +	struct vmcb *nested_vmcb;
>> +	struct kvm_host_map map;
>> +	int ret;
>> +
>> +	if (nested_svm_check_permissions(svm))
>> +		return 1;
>> +
>> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> +	if (ret) {
>> +		if (ret == -EINVAL)
>> +			kvm_inject_gp(&svm->vcpu, 0);
>> +		return 1;
>> +	}
>> +
>> +	nested_vmcb = map.hva;
>> +
>> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> +
>> +	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
>> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> +
>> +	return ret;
>> +}
>> +
>> +static int vmrun_interception(struct vcpu_svm *svm)
>> +{
>> +	if (nested_svm_check_permissions(svm))
>> +		return 1;
>> +
>> +	return nested_svm_vmrun(svm);
>> +}
>> +
>> +/* Emulate SVM VM execution instructions */
>> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
>> +{
>> +	struct vcpu_svm *svm = to_svm(vcpu);
>> +
>> +	switch (modrm) {
>> +	case 0xd8: /* VMRUN */
>> +		return vmrun_interception(svm);
>> +	case 0xda: /* VMLOAD */
>> +		return vmload_interception(svm);
>> +	case 0xdb: /* VMSAVE */
>> +		return vmsave_interception(svm);
>> +	default:
>> +		/* inject a #GP for all other cases */
>> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> +		return 1;
>> +	}
>> +}
>> +
>>   static int gp_interception(struct vcpu_svm *svm)
>>   {
>>   	struct kvm_vcpu *vcpu = &svm->vcpu;
>>   	u32 error_code = svm->vmcb->control.exit_info_1;
>> -
>> -	WARN_ON_ONCE(!enable_vmware_backdoor);
>> +	int rc;
>>   
>>   	/*
>> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
>> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
>> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
>> +	 * them has non-zero error codes.
>>   	 */
>>   	if (error_code) {
>>   		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>>   		return 1;
>>   	}
>> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
>> +
>> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>> +	if (rc > 1)
>> +		rc = svm_emulate_vm_instr(vcpu, rc);
>> +	return rc;
>>   }
>>   
>>   static bool is_erratum_383(void)
>> @@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
>>   	return kvm_emulate_hypercall(&svm->vcpu);
>>   }
>>   
>> -static int vmload_interception(struct vcpu_svm *svm)
>> -{
>> -	struct vmcb *nested_vmcb;
>> -	struct kvm_host_map map;
>> -	int ret;
>> -
>> -	if (nested_svm_check_permissions(svm))
>> -		return 1;
>> -
>> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> -	if (ret) {
>> -		if (ret == -EINVAL)
>> -			kvm_inject_gp(&svm->vcpu, 0);
>> -		return 1;
>> -	}
>> -
>> -	nested_vmcb = map.hva;
>> -
>> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> -
>> -	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
>> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> -
>> -	return ret;
>> -}
>> -
>> -static int vmsave_interception(struct vcpu_svm *svm)
>> -{
>> -	struct vmcb *nested_vmcb;
>> -	struct kvm_host_map map;
>> -	int ret;
>> -
>> -	if (nested_svm_check_permissions(svm))
>> -		return 1;
>> -
>> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
>> -	if (ret) {
>> -		if (ret == -EINVAL)
>> -			kvm_inject_gp(&svm->vcpu, 0);
>> -		return 1;
>> -	}
>> -
>> -	nested_vmcb = map.hva;
>> -
>> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
>> -
>> -	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
>> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
>> -
>> -	return ret;
>> -}
>> -
>> -static int vmrun_interception(struct vcpu_svm *svm)
>> -{
>> -	if (nested_svm_check_permissions(svm))
>> -		return 1;
>> -
>> -	return nested_svm_vmrun(svm);
>> -}
>> -
> 
> Maybe if you'd do it the other way around and put gp_interception()
> after vm{load,save,run}_interception(), the diff (and code churn)
> would've been smaller?

Agreed.

> 
>>   void svm_set_gif(struct vcpu_svm *svm, bool value)
>>   {
>>   	if (value) {
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index 0fe874ae5498..d5dffcf59afa 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
>>   	recalc_intercepts(svm);
>>   }
>>   
>> +static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
>> +{
>> +	struct vmcb *vmcb = get_host_vmcb(svm);
>> +
>> +	WARN_ON_ONCE(bit >= 32);
>> +	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
>> +}
>> +
>>   static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
>>   {
>>   	struct vmcb *vmcb = get_host_vmcb(svm);
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 2af05d3b0590..5fac2f7cba24 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>>   			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>>   			return 1;
>>   		}
>> -		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
>> +		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>>   	}
>>   
>>   	/*
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 9a8969a6dd06..c3662fc3b1bc 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>>   	++vcpu->stat.insn_emulation_fail;
>>   	trace_kvm_emulate_insn_failed(vcpu);
>>   
>> -	if (emulation_type & EMULTYPE_VMWARE_GP) {
>> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>>   		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>>   		return 1;
>>   	}
>> @@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
>>   	return false;
>>   }
>>   
>> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> 
> Nit: it seems we either return '0' or 'ctxt->modrm' which is 'u8', so
> 'u8' instead of 'int' maybe?

Agreed. Paolo also has some comments around this area. We will take these
comments into consideration in v2.

> 
>> +{
>> +	unsigned long rax;
>> +
>> +	if (ctxt->b != 0x1)
>> +		return 0;
>> +
>> +	switch (ctxt->modrm) {
>> +	case 0xd8: /* VMRUN */
>> +	case 0xda: /* VMLOAD */
>> +	case 0xdb: /* VMSAVE */
>> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
>> +		if (!kvm_is_host_reserved_region(rax))
>> +			return 0;
>> +		break;
>> +	default:
>> +		return 0;
>> +	}
>> +
>> +	return ctxt->modrm;
>> +}
>> +
>>   static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>>   {
>>   	switch (ctxt->opcode_len) {
>> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>>   	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>>   	bool writeback = true;
>>   	bool write_fault_to_spt;
>> +	int vminstr;
>>   
>>   	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>>   		return 1;
>> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>>   		}
>>   	}
>>   
>> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
>> -	    !is_vmware_backdoor_opcode(ctxt)) {
>> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> -		return 1;
>> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>> +		vminstr = is_vm_instr_opcode(ctxt);
>> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
>> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> +			return 1;
>> +		}
>> +		if (vminstr)
>> +			return vminstr;
>>   	}
>>   
>>   	/*
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 17:56             ` Sean Christopherson
@ 2021-01-13  4:55               ` Wei Huang
  0 siblings, 0 replies; 33+ messages in thread
From: Wei Huang @ 2021-01-13  4:55 UTC (permalink / raw)
  To: Sean Christopherson, Andy Lutomirski
  Cc: Bandan Das, Maxim Levitsky, Vitaly Kuznetsov, kvm, linux-kernel,
	pbonzini, joro, bp, tglx, mingo, x86, jmattson, wanpengli,
	dgilbert



On 1/12/21 11:56 AM, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Andy Lutomirski wrote:
>>
>>> On Jan 12, 2021, at 7:46 AM, Bandan Das <bsd@redhat.com> wrote:
>>>
>>> Andy Lutomirski <luto@amacapital.net> writes:
>>> ...
>>>>>>>> #endif diff --git a/arch/x86/kvm/mmu/mmu.c
>>>>>>>> b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..c5c4aaf01a1a 100644
>>>>>>>> --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@
>>>>>>>> -50,6 +50,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include
>>>>>>>> <asm/kvm_page_track.h> +#include <asm/e820/api.h> #include
>>>>>>>> "trace.h"
>>>>>>>>
>>>>>>>> extern bool itlb_multihit_kvm_mitigation; @@ -5675,6 +5676,12 @@
>>>>>>>> void kvm_mmu_slot_set_dirty(struct kvm *kvm, }
>>>>>>>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>>>>>>>>
>>>>>>>> +bool kvm_is_host_reserved_region(u64 gpa) +{ + return
>>>>>>>> e820__mbapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED); +}
>>>>>>> While _e820__mapped_any()'s doc says '..  checks if any part of
>>>>>>> the range <start,end> is mapped ..' it seems to me that the real
>>>>>>> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
>>>>>>> no?
>>>>>> Why do you need to check GPA at all?
>>>>>>
>>>>> To reduce the scope of the workaround.
>>>>>
>>>>> The errata only happens when you use one of SVM instructions in the
>>>>> guest with EAX that happens to be inside one of the host reserved
>>>>> memory regions (for example SMM).
>>>>
>>>> This code reduces the scope of the workaround at the cost of
>>>> increasing the complexity of the workaround and adding a nonsensical
>>>> coupling between KVM and host details and adding an export that really
>>>> doesn’t deserve to be exported.
>>>>
>>>> Is there an actual concrete benefit to this check?
>>>
>>> Besides reducing the scope, my intention for the check was that we should
>>> know if such exceptions occur for any other undiscovered reasons with other
>>> memory types rather than hiding them under this workaround.
>>
>> Ask AMD?

There are several checks before VMRUN launch. The function, 
e820__mapped_raw_any(), was definitely one of the easiest ways to figure 
out the problematic regions we had.

>>
>> I would also believe that someone somewhere has a firmware that simply omits
>> the problematic region instead of listing it as reserved.
> 
> I agree with Andy, odds are very good that attempting to be precise will lead to
> pain due to false negatives.
> 
> And, KVM's SVM instruction emulation needs to be rock solid regardless of
> this behavior since KVM unconditionally intercepts the instruction, i.e. there's
> basically zero risk to KVM.
> 

Are you saying that the instruction decode before 
kvm_is_host_reserved_region() already guarantees that the instructions 
hitting #GP are SVM execution instructions (see below)? If so, I think this 
argument is fair.

+	switch (ctxt->modrm) {
+	case 0xd8: /* VMRUN */
+	case 0xda: /* VMLOAD */
+	case 0xdb: /* VMSAVE */

Bandan: What are your thoughts about removing kvm_is_host_reserved_region()?


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 17:59   ` Sean Christopherson
  2021-01-12 18:58     ` Andy Lutomirski
@ 2021-01-13  5:03     ` Wei Huang
  2021-01-13 12:40     ` Paolo Bonzini
  2 siblings, 0 replies; 33+ messages in thread
From: Wei Huang @ 2021-01-13  5:03 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, mlevitsk



On 1/12/21 11:59 AM, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Sean Christopherson wrote:
>> On Tue, Jan 12, 2021, Wei Huang wrote:
>>> From: Bandan Das <bsd@redhat.com>
>>>
>>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>>> before checking VMCB's instruction intercept.
>>
>> It would be very helpful to list exactly which CPUs are/aren't affected, even if
>> that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
>> I assume it's all CPUs without the new CPUID flag?

This behavior dates back to fairly old CPUs. It is fair to assume 
that _most_ CPUs without this CPUID bit can demonstrate such behavior.

> 
> Ah, despite calling this an 'errata', the bad behavior is explicitly documented
> in the APM, i.e. it's an architecture bug, not a silicon bug.
> 
> Can you reword the changelog to make it clear that the premature #GP is the
> correct architectural behavior for CPUs without the new CPUID flag?

Sure, will do in the next version.

> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 18:58     ` Andy Lutomirski
@ 2021-01-13  5:15       ` Wei Huang
  2021-01-14 11:42         ` Maxim Levitsky
  0 siblings, 1 reply; 33+ messages in thread
From: Wei Huang @ 2021-01-13  5:15 UTC (permalink / raw)
  To: Andy Lutomirski, Sean Christopherson
  Cc: kvm list, LKML, Paolo Bonzini, Vitaly Kuznetsov, Joerg Roedel,
	Borislav Petkov, Thomas Gleixner, Ingo Molnar, X86 ML,
	Jim Mattson, Wanpeng Li, Bandan Das, Dr. David Alan Gilbert,
	Maxim Levitsky



On 1/12/21 12:58 PM, Andy Lutomirski wrote:
> Andrew Cooper points out that there may be a nicer workaround.  Make
> sure that the SMRAM and HT region (FFFD00000000 - FFFFFFFFFFFF) are
> marked as reserved in the guest, too.

In theory this proposed solution can avoid intercepting #GP. But in 
reality SMRAM regions can be different on different machines. So this 
solution can break after VM migration.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 17:42   ` Sean Christopherson
@ 2021-01-13 12:35     ` Paolo Bonzini
  0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2021-01-13 12:35 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Wei Huang, kvm, linux-kernel, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert, mlevitsk

On 12/01/21 18:42, Sean Christopherson wrote:
> On a related topic, it feels like nested should be disabled by default on SVM
> until it's truly ready for primetime, with the patch tagged for stable.  That
> way we don't have to worry about crafting non-trivial fixes (like this one) to
> make them backport-friendly.

Well, that's historical; I wish it had been disabled by default back in 
the day.

However, after 10 years and after the shakedown last year, it's hard to 
justify breaking backwards compatibility.  Nested SVM is not any less 
ready than nested VMX---just a little less optimized for things such as 
TLB flushes and ASID/VPID---even without this fix.  The erratum has 
visible effects only on a minority of AMD systems (it depends on an 
unlucky placement of TSEG on L0), and it is easy to work around it by 
lowering the amount of <4G memory in L1.

Paolo


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 17:59   ` Sean Christopherson
  2021-01-12 18:58     ` Andy Lutomirski
  2021-01-13  5:03     ` Wei Huang
@ 2021-01-13 12:40     ` Paolo Bonzini
  2 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2021-01-13 12:40 UTC (permalink / raw)
  To: Sean Christopherson, Wei Huang
  Cc: kvm, linux-kernel, vkuznets, joro, bp, tglx, mingo, x86,
	jmattson, wanpengli, bsd, dgilbert, mlevitsk

On 12/01/21 18:59, Sean Christopherson wrote:
>> It would be very helpful to list exactly which CPUs are/aren't affected, even if
>> that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
>> I assume it's all CPUs without the new CPUID flag?
> Ah, despite calling this an 'errata', the bad behavior is explicitly documented
> in the APM, i.e. it's an architecture bug, not a silicon bug.

I would still call it an errata for the case when virtualized 
VMSAVE/VMLOAD is enabled (and therefore VMLOAD intercepts are disabled). 
  In that case, the problem is that the GPA does not go through NPT 
before it is checked against *host* reserved memory regions.

In fact I hope that,  on processors that have the fix, VMSAVE/VMLOAD 
from guest mode _does_ check the GPA after it's been translated!

Paolo


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] KVM: SVM: Add support for VMCB address check change
  2021-01-12 19:18   ` Sean Christopherson
@ 2021-01-14 11:39     ` Maxim Levitsky
  0 siblings, 0 replies; 33+ messages in thread
From: Maxim Levitsky @ 2021-01-14 11:39 UTC (permalink / raw)
  To: Sean Christopherson, Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert

[-- Attachment #1: Type: text/plain, Size: 2888 bytes --]

On Tue, 2021-01-12 at 11:18 -0800, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Wei Huang wrote:
> > New AMD CPUs have a change that checks VMEXIT intercept on special SVM
> > instructions before checking their EAX against reserved memory region.
> > This change is indicated by CPUID_0x8000000A_EDX[28]. If it is 1, KVM
> > doesn't need to intercept and emulate #GP faults for such instructions
> > because #GP isn't supposed to be triggered.
> > 
> > Co-developed-by: Bandan Das <bsd@redhat.com>
> > Signed-off-by: Bandan Das <bsd@redhat.com>
> > Signed-off-by: Wei Huang <wei.huang2@amd.com>
> > ---
> >  arch/x86/include/asm/cpufeatures.h | 1 +
> >  arch/x86/kvm/svm/svm.c             | 2 +-
> >  2 files changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> > index 84b887825f12..ea89d6fdd79a 100644
> > --- a/arch/x86/include/asm/cpufeatures.h
> > +++ b/arch/x86/include/asm/cpufeatures.h
> > @@ -337,6 +337,7 @@
> >  #define X86_FEATURE_AVIC		(15*32+13) /* Virtual Interrupt Controller */
> >  #define X86_FEATURE_V_VMSAVE_VMLOAD	(15*32+15) /* Virtual VMSAVE VMLOAD */
> >  #define X86_FEATURE_VGIF		(15*32+16) /* Virtual GIF */
> > +#define X86_FEATURE_SVME_ADDR_CHK	(15*32+28) /* "" SVME addr check */
> 
> Heh, KVM should advertise this to userspace by setting the kvm_cpu_cap bit.
> KVM forwards relevant VM-Exits to L1 without checking if rAX points at an
> invalid L1 GPA.


I agree that we should be able to fix/hide the errata from L1,
and expose this bit to L1 so that it doesn't try to apply this workaround
itself when it runs nested guests of its own.

Note that there is currently a bug in this patch series that prevents
this workaround from working for a guest that runs nested guests itself
(e.g. L3): when we intercept the #GP while running a nested guest, we should
do a vmexit with the SVM_EXIT_VMRUN/VMSAVE/etc exit reason instead of
emulating the instruction. This can be fixed; I did it locally and it works.

A (lightly tested) patch for that is attached.

Best regards,
	Maxim Levitsky
> 
> >  /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
> >  #define X86_FEATURE_AVX512VBMI		(16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 74620d32aa82..451b82df2eab 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -311,7 +311,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
> >  	svm->vmcb->save.efer = efer | EFER_SVME;
> >  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> >  	/* Enable GP interception for SVM instructions if needed */
> > -	if (efer & EFER_SVME)
> > +	if ((efer & EFER_SVME) && !boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
> >  		set_exception_intercept(svm, GP_VECTOR);
> >  
> >  	return 0;
> > -- 
> > 2.27.0
> > 






[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1315 bytes --]

commit 28ab89aaa11380306bafbf49265222f2a2da71da
Author: Maxim Levitsky <mlevitsk@redhat.com>
Date:   Thu Jan 14 10:53:25 2021 +0200

    kvm: x86: fix that errata for nested guests

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c31e005252d69..9cfa5946fac69 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2027,6 +2027,26 @@ static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_guest_mode(vcpu)) {
+		switch (modrm) {
+		case 0xd8: /* VMRUN */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMRUN;
+			break;
+		case 0xda: /* VMLOAD */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMLOAD;
+			break;
+		case 0xdb: /* VMSAVE */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMSAVE;
+			break;
+		default:
+			goto inject_exception;
+		}
+
+		svm->vmcb->control.exit_info_1 = 0;
+		svm->vmcb->control.exit_info_2 = 0;
+		return nested_svm_vmexit(svm);
+	}
+
 	switch (modrm) {
 	case 0xd8: /* VMRUN */
 		return vmrun_interception(svm);
@@ -2035,6 +2055,7 @@ static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
 	case 0xdb: /* VMSAVE */
 		return vmsave_interception(svm);
 	default:
+inject_exception:
 		/* inject a #GP for all other cases */
 		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 		return 1;

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-13  5:15       ` Wei Huang
@ 2021-01-14 11:42         ` Maxim Levitsky
  0 siblings, 0 replies; 33+ messages in thread
From: Maxim Levitsky @ 2021-01-14 11:42 UTC (permalink / raw)
  To: Wei Huang, Andy Lutomirski, Sean Christopherson
  Cc: kvm list, LKML, Paolo Bonzini, Vitaly Kuznetsov, Joerg Roedel,
	Borislav Petkov, Thomas Gleixner, Ingo Molnar, X86 ML,
	Jim Mattson, Wanpeng Li, Bandan Das, Dr. David Alan Gilbert

On Tue, 2021-01-12 at 23:15 -0600, Wei Huang wrote:
> 
> On 1/12/21 12:58 PM, Andy Lutomirski wrote:
> > Andrew Cooper points out that there may be a nicer workaround.  Make
> > sure that the SMRAM and HT region (FFFD00000000 - FFFFFFFFFFFF) are
> > marked as reserved in the guest, too.
> 
> In theory this proposed solution can avoid intercepting #GP. But in 
> reality SMRAM regions can be different on different machines. So this 
> solution can break after VM migration.
> 
I should add that on my 3970X, I just noticed that the problematic SMRAM
region moved on its own (likely because I moved some PCIe cards around
recently).

Best regards,
	Maxim Levitsky


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 20:00   ` Bandan Das
@ 2021-01-14 11:47     ` Maxim Levitsky
  2021-01-14 17:19       ` Sean Christopherson
  0 siblings, 1 reply; 33+ messages in thread
From: Maxim Levitsky @ 2021-01-14 11:47 UTC (permalink / raw)
  To: Bandan Das, Sean Christopherson
  Cc: Wei Huang, kvm, linux-kernel, pbonzini, vkuznets, joro, bp, tglx,
	mingo, x86, jmattson, wanpengli, dgilbert

[-- Attachment #1: Type: text/plain, Size: 1852 bytes --]

On Tue, 2021-01-12 at 15:00 -0500, Bandan Das wrote:
> Sean Christopherson <seanjc@google.com> writes:
> ...
> > > -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> > > -	    !is_vmware_backdoor_opcode(ctxt)) {
> > > -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > -		return 1;
> > > +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> > > +		vminstr = is_vm_instr_opcode(ctxt);
> > > +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> > > +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > +			return 1;
> > > +		}
> > > +		if (vminstr)
> > > +			return vminstr;
> > 
> > I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
> > L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
> > x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
> > be handled further up the stack.

Actually IMHO this is exactly what we want. We want L0 to always intercept
these #GPs, and hide them from the guest.

What we do need to do (and I prepared and attached a patch for that) is that if we run
a nested guest, we want to inject the corresponding vmexit (like SVM_EXIT_VMRUN) to L1
instead of emulating the instruction.
The attached patch does this, and it made my kvm unit test pass,
even if the test was run in a VM (with an unpatched kernel).

This together with setting that X86_FEATURE_SVME_ADDR_CHK bit for
the guest will allow us to hide that errata completely from the guest
which is a very good thing.
(for example for guests that we can't modify)
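
(As a rough, untested sketch only: advertising that bit could presumably go
through the kvm_cpu_cap machinery, plus whatever CPUID plumbing leaf
0x8000000A needs, e.g. in svm_set_cpu_caps():

	/* Tell the guest that intercepts are checked before the EAX address. */
	kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);

but the exact placement is just an assumption on my side.)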


Best regards,
	Maxim Levitsky



> > 
> So, the condition is that L2 executes a vmload and #GPs on a reserved address, jumps to L0 - L0 doesn't
> check whether L1 has asked for the instruction to be intercepted, and goes on emulating
> vmload and returning back to L2?



> 
> > >  	}
> > >  
> > >  	/*
> > > -- 
> > > 2.27.0
> > > 


[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1315 bytes --]

commit 28ab89aaa11380306bafbf49265222f2a2da71da
Author: Maxim Levitsky <mlevitsk@redhat.com>
Date:   Thu Jan 14 10:53:25 2021 +0200

    kvm: x86: fix that errata for nested guests

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c31e005252d69..9cfa5946fac69 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2027,6 +2027,26 @@ static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_guest_mode(vcpu)) {
+		switch (modrm) {
+		case 0xd8: /* VMRUN */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMRUN;
+			break;
+		case 0xda: /* VMLOAD */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMLOAD;
+			break;
+		case 0xdb: /* VMSAVE */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMSAVE;
+			break;
+		default:
+			goto inject_exception;
+		}
+
+		svm->vmcb->control.exit_info_1 = 0;
+		svm->vmcb->control.exit_info_2 = 0;
+		return nested_svm_vmexit(svm);
+	}
+
 	switch (modrm) {
 	case 0xd8: /* VMRUN */
 		return vmrun_interception(svm);
@@ -2035,6 +2055,7 @@ static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
 	case 0xdb: /* VMSAVE */
 		return vmsave_interception(svm);
 	default:
+inject_exception:
 		/* inject a #GP for all other cases */
 		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 		return 1;


* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12  6:37 [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions Wei Huang
                   ` (5 preceding siblings ...)
  2021-01-12 19:40 ` Sean Christopherson
@ 2021-01-14 11:55 ` Maxim Levitsky
  6 siblings, 0 replies; 33+ messages in thread
From: Maxim Levitsky @ 2021-01-14 11:55 UTC (permalink / raw)
  To: Wei Huang, kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert

On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
> From: Bandan Das <bsd@redhat.com>
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problem under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
> 
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Co-developed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h              |   1 +
>  arch/x86/kvm/mmu/mmu.c          |   7 ++
>  arch/x86/kvm/svm/svm.c          | 157 +++++++++++++++++++-------------
>  arch/x86/kvm/svm/svm.h          |   8 ++
>  arch/x86/kvm/vmx/vmx.c          |   2 +-
>  arch/x86/kvm/x86.c              |  37 +++++++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *			     Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
I would prefer to see this change in a separate patch.


>   *			backdoor emulation, which is opt in via module param.
>   *			VMware backoor emulation handles select instructions
> - *			and reinjects the #GP for all other cases.
> + *			and reinjects #GP for all other cases. This also
> + *			handles other cases where #GP condition needs to be
> + *			handled and emulated appropriately
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
>   *		 case the CR2/GPA value pass on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP		    (1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
> -#define EMULTYPE_VMWARE_GP	    (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP	    (1 << 5)
>  #define EMULTYPE_PF		    (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);
>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include <asm/io.h>
>  #include <asm/vmx.h>
>  #include <asm/kvm_page_track.h>
> +#include <asm/e820/api.h>
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}
> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  		if (!(efer & EFER_SVME)) {
>  			svm_leave_nested(svm);
>  			svm_set_gif(svm, true);
> +			clr_exception_intercept(svm, GP_VECTOR);
Wouldn't that be wrong if we intercept #GP due to the VMware backdoor?

I would add a flag that is true when the workaround for the errata is enabled,
and use it together with the flag that enables the VMware backdoor for
decisions such as the above.

The flag could even be a module param to allow users to disable it if they
really want to.
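
(A minimal sketch of what I mean, with purely hypothetical names:

	/* Hypothetical knob for the #GP-interception errata workaround. */
	static bool svm_gp_erratum_intercept = true;
	module_param(svm_gp_erratum_intercept, bool, 0444);

and then the EFER_SVME paths would only set/clear the #GP intercept when
this flag and/or enable_vmware_backdoor actually require it.)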

>  
>  			/*
>  			 * Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>  	svm->vmcb->save.efer = efer | EFER_SVME;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> +	/* Enable GP interception for SVM instructions if needed */
> +	if (efer & EFER_SVME)
> +		set_exception_intercept(svm, GP_VECTOR);
> +
>  	return 0;
>  }
>  
> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>  	return 1;
>  }
>  
> +static int vmload_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmsave_interception(struct vcpu_svm *svm)
> +{
> +	struct vmcb *nested_vmcb;
> +	struct kvm_host_map map;
> +	int ret;
> +
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> +	if (ret) {
> +		if (ret == -EINVAL)
> +			kvm_inject_gp(&svm->vcpu, 0);
> +		return 1;
> +	}
> +
> +	nested_vmcb = map.hva;
> +
> +	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> +
> +	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> +	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> +
> +	return ret;
> +}
> +
> +static int vmrun_interception(struct vcpu_svm *svm)
> +{
> +	if (nested_svm_check_permissions(svm))
> +		return 1;
> +
> +	return nested_svm_vmrun(svm);
> +}

I would prefer the move of these functions (I didn't check
if you changed them as well) to be done in a separate patch.

> +
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	switch (modrm) {
> +	case 0xd8: /* VMRUN */
> +		return vmrun_interception(svm);
> +	case 0xda: /* VMLOAD */
> +		return vmload_interception(svm);
> +	case 0xdb: /* VMSAVE */
> +		return vmsave_interception(svm);
> +	default:
> +		/* inject a #GP for all other cases */
> +		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +		return 1;
> +	}
> +}
> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>  	struct kvm_vcpu *vcpu = &svm->vcpu;
>  	u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -	WARN_ON_ONCE(!enable_vmware_backdoor);
The warning could be kept with an extended condition.

> +	int rc;
>  
>  	/*
> -	 * VMware backdoor emulation on #GP interception only handles IN{S},
> -	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +	 * Only VMware backdoor and SVM VME errata are handled. Neither of
> +	 * them has non-zero error codes.
It would be great to mention (once published) a link to the errata
here or somewhere nearby, so readers understand what this is all about.

>  	 */
>  	if (error_code) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  		return 1;
>  	}
> -	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> +	rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> +	if (rc > 1)
> +		rc = svm_emulate_vm_instr(vcpu, rc);
> +	return rc;
As others mentioned in the review, I would also try to use something
other than passing the mod/reg/rm byte in the return value.
Otherwise it will backfire sooner or later.
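
(Purely as an illustration of an alternative, not a concrete API proposal:
let the emulator path return a dedicated code and have the caller re-read
the decoded byte from the emulation context, e.g.

	/* Hypothetical: EMULATION_VM_INSTR and this ctxt access are made up. */
	if (rc == EMULATION_VM_INSTR)
		rc = svm_emulate_vm_instr(vcpu, vcpu->arch.emulate_ctxt->modrm);

the exact plumbing would of course depend on what x86.c is willing to expose.)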


>  }
>  
>  static bool is_erratum_383(void)
> @@ -2113,66 +2200,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
>  	return kvm_emulate_hypercall(&svm->vcpu);
>  }
>  
> -static int vmload_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmsave_interception(struct vcpu_svm *svm)
> -{
> -	struct vmcb *nested_vmcb;
> -	struct kvm_host_map map;
> -	int ret;
> -
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
> -	if (ret) {
> -		if (ret == -EINVAL)
> -			kvm_inject_gp(&svm->vcpu, 0);
> -		return 1;
> -	}
> -
> -	nested_vmcb = map.hva;
> -
> -	ret = kvm_skip_emulated_instruction(&svm->vcpu);
> -
> -	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
> -	kvm_vcpu_unmap(&svm->vcpu, &map, true);
> -
> -	return ret;
> -}
> -
> -static int vmrun_interception(struct vcpu_svm *svm)
> -{
> -	if (nested_svm_check_permissions(svm))
> -		return 1;
> -
> -	return nested_svm_vmrun(svm);
> -}
> -
>  void svm_set_gif(struct vcpu_svm *svm, bool value)
>  {
>  	if (value) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 0fe874ae5498..d5dffcf59afa 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -350,6 +350,14 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
>  	recalc_intercepts(svm);
>  }
>  
> +static inline bool is_exception_intercept(struct vcpu_svm *svm, u32 bit)
> +{
> +	struct vmcb *vmcb = get_host_vmcb(svm);
> +
> +	WARN_ON_ONCE(bit >= 32);
> +	return vmcb_is_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
> +}
This function doesn't seem to be used anywhere.

> +
>  static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
>  {
>  	struct vmcb *vmcb = get_host_vmcb(svm);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 2af05d3b0590..5fac2f7cba24 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -4774,7 +4774,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>  			return 1;
>  		}
> -		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +		return kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>  	}
>  
>  	/*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a8969a6dd06..c3662fc3b1bc 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7014,7 +7014,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>  	++vcpu->stat.insn_emulation_fail;
>  	trace_kvm_emulate_insn_failed(vcpu);
>  
> -	if (emulation_type & EMULTYPE_VMWARE_GP) {
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>  		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>  		return 1;
>  	}
> @@ -7267,6 +7267,28 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
>  	return false;
>  }
>  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> +{
> +	unsigned long rax;
> +
> +	if (ctxt->b != 0x1)
> +		return 0;
Don't you also want to check that 'ctxt->opcode_len == 2'?
What about prefixes? AMD's PRM doesn't mention whether these are allowed/ignored/etc.

I guess they are ignored, but this should be double-checked against real hardware.
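
(For illustration, the extra check being suggested would presumably just be

	if (ctxt->opcode_len != 2 || ctxt->b != 0x1)
		return 0;

at the top of is_vm_instr_opcode(); prefix handling is a separate question.)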

> +
> +	switch (ctxt->modrm) {
> +	case 0xd8: /* VMRUN */
> +	case 0xda: /* VMLOAD */
> +	case 0xdb: /* VMSAVE */
> +		rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> +		if (!kvm_is_host_reserved_region(rax))
> +			return 0;
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>  	switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	bool writeback = true;
>  	bool write_fault_to_spt;
> +	int vminstr;
>  
>  	if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
>  		return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  		}
>  	}
>  
> -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> -	    !is_vmware_backdoor_opcode(ctxt)) {
> -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> -		return 1;
> +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> +		vminstr = is_vm_instr_opcode(ctxt);

As I said above, I would add a flag indicating whether the workaround for the
errata is in use, and use it here.

> +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> +			return 1;
> +		}
> +		if (vminstr)
> +			return vminstr;
>  	}
>  
>  	/*

Best regards,
	Maxim Levitsky



* Re: [PATCH 2/2] KVM: SVM: Add support for VMCB address check change
  2021-01-12  6:37 ` [PATCH 2/2] KVM: SVM: Add support for VMCB address check change Wei Huang
  2021-01-12 19:18   ` Sean Christopherson
@ 2021-01-14 12:04   ` Maxim Levitsky
  1 sibling, 0 replies; 33+ messages in thread
From: Maxim Levitsky @ 2021-01-14 12:04 UTC (permalink / raw)
  To: Wei Huang, kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, joro, bp, tglx, mingo,
	x86, jmattson, wanpengli, bsd, dgilbert

On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
> New AMD CPUs have a change that checks VMEXIT intercept on special SVM
> instructions before checking their EAX against reserved memory region.
> This change is indicated by CPUID_0x8000000A_EDX[28]. If it is 1, KVM
> doesn't need to intercept and emulate #GP faults for such instructions
> because #GP isn't supposed to be triggered.
> 
> Co-developed-by: Bandan Das <bsd@redhat.com>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/cpufeatures.h | 1 +
>  arch/x86/kvm/svm/svm.c             | 2 +-
>  2 files changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 84b887825f12..ea89d6fdd79a 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -337,6 +337,7 @@
>  #define X86_FEATURE_AVIC		(15*32+13) /* Virtual Interrupt Controller */
>  #define X86_FEATURE_V_VMSAVE_VMLOAD	(15*32+15) /* Virtual VMSAVE VMLOAD */
>  #define X86_FEATURE_VGIF		(15*32+16) /* Virtual GIF */
> +#define X86_FEATURE_SVME_ADDR_CHK	(15*32+28) /* "" SVME addr check */
Why ""?

>  
>  /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
>  #define X86_FEATURE_AVX512VBMI		(16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 74620d32aa82..451b82df2eab 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -311,7 +311,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  	svm->vmcb->save.efer = efer | EFER_SVME;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
>  	/* Enable GP interception for SVM instructions if needed */
> -	if (efer & EFER_SVME)
> +	if ((efer & EFER_SVME) && !boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
>  		set_exception_intercept(svm, GP_VECTOR);

As mentioned in the review for the other patch, I would add a flag that
enables the workaround for the errata, and I would force that flag disabled
if X86_FEATURE_SVME_ADDR_CHK is set in CPUID, somewhere early in
KVM initialization.

And finally, that new flag can be used here to enable the #GP interception
in the above code.
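
(Sketch only, reusing the hypothetical svm_gp_erratum_intercept flag from my
other reply; the exact placement is an assumption:

	/* e.g. during hardware setup: new CPUs check intercepts first. */
	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
		svm_gp_erratum_intercept = false;

	/* and in svm_set_efer(): */
	if ((efer & EFER_SVME) && svm_gp_erratum_intercept)
		set_exception_intercept(svm, GP_VECTOR);
)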

>  
>  	return 0;


Best regards,
	Maxim Levitsky



* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-14 11:47     ` Maxim Levitsky
@ 2021-01-14 17:19       ` Sean Christopherson
  0 siblings, 0 replies; 33+ messages in thread
From: Sean Christopherson @ 2021-01-14 17:19 UTC (permalink / raw)
  To: Maxim Levitsky
  Cc: Bandan Das, Wei Huang, kvm, linux-kernel, pbonzini, vkuznets,
	joro, bp, tglx, mingo, x86, jmattson, wanpengli, dgilbert

On Thu, Jan 14, 2021, Maxim Levitsky wrote:
> On Tue, 2021-01-12 at 15:00 -0500, Bandan Das wrote:
> > Sean Christopherson <seanjc@google.com> writes:
> > ...
> > > > -	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> > > > -	    !is_vmware_backdoor_opcode(ctxt)) {
> > > > -		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > > -		return 1;
> > > > +	if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> > > > +		vminstr = is_vm_instr_opcode(ctxt);
> > > > +		if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> > > > +			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > > +			return 1;
> > > > +		}
> > > > +		if (vminstr)
> > > > +			return vminstr;
> > > 
> > > I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
> > > L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
> > > x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
> > > be handled further up the stack.
> 
> Actually IMHO this is exactly what we want. We want L0 to always intercept
> these #GPs and hide them from the guest.
> 
> What we do need to do (and I prepared and attached a patch for that) is that
> when we run a nested guest, we inject the corresponding VMEXIT (like
> SVM_EXIT_VMRUN) instead of emulating the instruction.

Yes, lack of forwarding to L1 as a nested exit is what I meant by "doesn't
correctly handle".


* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-12 14:01 ` Paolo Bonzini
  2021-01-12 17:42   ` Sean Christopherson
@ 2021-01-15  7:00   ` Wei Huang
  2021-01-17 18:20     ` Paolo Bonzini
  1 sibling, 1 reply; 33+ messages in thread
From: Wei Huang @ 2021-01-15  7:00 UTC (permalink / raw)
  To: Paolo Bonzini, Wei Huang, kvm
  Cc: linux-kernel, vkuznets, seanjc, joro, bp, tglx, mingo, x86,
	jmattson, wanpengli, bsd, dgilbert, mlevitsk



On 1/12/21 8:01 AM, Paolo Bonzini wrote:
> On 12/01/21 07:37, Wei Huang wrote:
>>   static int gp_interception(struct vcpu_svm *svm)
>>   {
>>       struct kvm_vcpu *vcpu = &svm->vcpu;
>>       u32 error_code = svm->vmcb->control.exit_info_1;
>> -
>> -    WARN_ON_ONCE(!enable_vmware_backdoor);
>> +    int rc;
>>         /*
>> -     * VMware backdoor emulation on #GP interception only handles IN{S},
>> -     * OUT{S}, and RDPMC, none of which generate a non-zero error code.
>> +     * Only VMware backdoor and SVM VME errata are handled. Neither of
>> +     * them has non-zero error codes.
>>        */
>>       if (error_code) {
>>           kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>>           return 1;
>>       }
>> -    return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
>> +
>> +    rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>> +    if (rc > 1)
>> +        rc = svm_emulate_vm_instr(vcpu, rc);
>> +    return rc;
>>   }
>>   
> 
> Passing back the third byte is quite hacky.  Instead of this change to
> kvm_emulate_instruction, I'd rather check the instruction bytes in
> gp_interception before calling kvm_emulate_instruction.  That would be
> something like:
> 
> - move "kvm_clear_exception_queue(vcpu);" inside the "if
> (!(emulation_type & EMULTYPE_NO_DECODE))".  It doesn't apply when you
> are coming back from userspace.
> 
> - extract the "if (!(emulation_type & EMULTYPE_NO_DECODE))" body to a
> new function x86_emulate_decoded_instruction.  Call it from
> gp_interception, we know this is not a pagefault and therefore
> vcpu->arch.write_fault_to_shadow_pgtable must be false.

If the whole body inside the if-statement is moved out, do you expect the
interface of x86_emulate_decoded_instruction to be something like:

int x86_emulate_decoded_instruction(struct kvm_vcpu *vcpu,
                                    gpa_t cr2_or_gpa,
                                    int emulation_type, void *insn,
                                    int insn_len,
                                    bool write_fault_to_spt)

And if so, what is the emulation type to use when calling this function
from svm.c? EMULTYPE_VMWARE_GP?

> 
> - check ctxt->insn_bytes for an SVM instruction
> 
> - if not an SVM instruction, call kvm_emulate_instruction(vcpu,
> EMULTYPE_VMWARE_GP|EMULTYPE_NO_DECODE).
> 
> Thanks,
> 
> Paolo
> 


* Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions
  2021-01-15  7:00   ` Wei Huang
@ 2021-01-17 18:20     ` Paolo Bonzini
  0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2021-01-17 18:20 UTC (permalink / raw)
  To: Wei Huang, Wei Huang, kvm
  Cc: linux-kernel, vkuznets, seanjc, joro, bp, tglx, mingo, x86,
	jmattson, wanpengli, bsd, dgilbert, mlevitsk

On 15/01/21 08:00, Wei Huang wrote:
> If the whole body inside the if-statement is moved out, do you expect the
> interface of x86_emulate_decoded_instruction to be something like:
> 
> int x86_emulate_decoded_instruction(struct kvm_vcpu *vcpu,
>                                      gpa_t cr2_or_gpa,
>                                      int emulation_type, void *insn,
>                                      int insn_len,
>                                      bool write_fault_to_spt)

An idea is to make the body of the new function just

         init_emulate_ctxt(vcpu);

         /*
          * We will reenter on the same instruction since
          * we do not set complete_userspace_io.  This does not
          * handle watchpoints yet, those would be handled in
          * the emulate_ops.
          */
         if (!(emulation_type & EMULTYPE_SKIP) &&
             kvm_vcpu_check_breakpoint(vcpu, &r))
                 return r;

         ctxt->interruptibility = 0;
         ctxt->have_exception = false;
         ctxt->exception.vector = -1;
         ctxt->exception.error_code_valid = false;

         ctxt->perm_ok = false;

         ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;

         r = x86_decode_insn(ctxt, insn, insn_len);

         trace_kvm_emulate_insn_start(vcpu);
         ++vcpu->stat.insn_emulation;
         return r;

because for the new caller, on EMULATION_FAILED you can just re-enter 
the guest.
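
A very rough sketch of how the SVM side could then look (all names and
arguments below are illustrative placeholders, not an actual proposal):

	/* decode only; bail back to the guest on decode failure */
	if (x86_emulate_decoded_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
		return 1;

	if (is_svm_vm_instr(vcpu->arch.emulate_ctxt))	/* made-up helper */
		return svm_emulate_vm_instr(vcpu, vcpu->arch.emulate_ctxt->modrm);

	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);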

> And if so, what is the emulation type to use when calling this function
> from svm.c? EMULTYPE_VMWARE_GP?

Just 0 I think.

Paolo


