All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 0/3] Expose VMFUNC to the nested hypervisor
@ 2017-07-10 20:49 Bandan Das
  2017-07-10 20:49 ` [PATCH v4 1/3] KVM: vmx: Enable VMFUNCs Bandan Das
                   ` (2 more replies)
  0 siblings, 3 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-10 20:49 UTC (permalink / raw)
  To: kvm; +Cc: pbonzini, david, linux-kernel

v4:
 2/3:  Use WARN_ONCE to avoid logging dos

v3:
 https://lkml.org/lkml/2017/7/10/684
 3/3: Add missing nested_release_page_clean() and check the
 eptp as mentioned in SDM 24.6.14
 
v2:
 https://lkml.org/lkml/2017/7/6/813
 1/3: Patch to enable vmfunc on the host but cause a #UD if
      L1 tries to use it directly. (new)
 2/3: Expose vmfunc to the nested hypervisor, but no vm functions
      are exposed and L0 emulates a vmfunc vmexit to L1. 
 3/3: Force a vmfunc vmexit when L2 tries to use vmfunc and emulate
      eptp switching. Unconditionally expose EPTP switching to the
      L1 hypervisor since L0 fakes eptp switching via a mmu reload.

These patches expose eptp switching/vmfunc to the nested hypervisor.
vmfunc is enabled in the secondary controls for the host and is
exposed to the nested hypervisor. However, if the nested hypervisor
decides to use eptp switching, L0 emulates it.

v1:
 https://lkml.org/lkml/2017/6/29/958

Bandan Das (3):
  KVM: vmx: Enable VMFUNCs
  KVM: nVMX: Enable VMFUNC for the L1 hypervisor
  KVM: nVMX: Emulate EPTP switching for the L1 hypervisor

 arch/x86/include/asm/vmx.h |   9 ++++
 arch/x86/kvm/vmx.c         | 125 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 132 insertions(+), 2 deletions(-)

-- 
2.9.4

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v4 1/3] KVM: vmx: Enable VMFUNCs
  2017-07-10 20:49 [PATCH v4 0/3] Expose VMFUNC to the nested hypervisor Bandan Das
@ 2017-07-10 20:49 ` Bandan Das
  2017-07-10 20:49 ` [PATCH v4 2/3] KVM: nVMX: Enable VMFUNC for the L1 hypervisor Bandan Das
  2017-07-10 20:49 ` [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching " Bandan Das
  2 siblings, 0 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-10 20:49 UTC (permalink / raw)
  To: kvm; +Cc: pbonzini, david, linux-kernel

Enable VMFUNC in the secondary execution controls.  This simplifies the
changes necessary to expose it to nested hypervisors.  VMFUNCs still
cause #UD when invoked.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Bandan Das <bsd@redhat.com>
---
 arch/x86/include/asm/vmx.h |  3 +++
 arch/x86/kvm/vmx.c         | 22 +++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 35cd06f..da5375e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,6 +72,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_RDRAND			0x00000800
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC            0x00002000
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_RDSEED			0x00010000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
@@ -187,6 +188,8 @@ enum vmcs_field {
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
 	POSTED_INTR_DESC_ADDR           = 0x00002016,
 	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
+	VM_FUNCTION_CONTROL             = 0x00002018,
+	VM_FUNCTION_CONTROL_HIGH        = 0x00002019,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
 	EOI_EXIT_BITMAP0                = 0x0000201c,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca5d2b9..a483b49 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1314,6 +1314,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
 		SECONDARY_EXEC_TSC_SCALING;
 }
 
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -3575,7 +3581,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_SHADOW_VMCS |
 			SECONDARY_EXEC_XSAVES |
 			SECONDARY_EXEC_ENABLE_PML |
-			SECONDARY_EXEC_TSC_SCALING;
+			SECONDARY_EXEC_TSC_SCALING |
+			SECONDARY_EXEC_ENABLE_VMFUNC;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -5233,6 +5240,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 #endif
 
+	if (cpu_has_vmx_vmfunc())
+		vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
@@ -7740,6 +7750,12 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+	kvm_queue_exception(vcpu, UD_VECTOR);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7790,6 +7806,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
+	[EXIT_REASON_VMFUNC]                  = handle_vmfunc,
 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
 
@@ -8111,6 +8128,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_PML_FULL:
 		/* We emulate PML support to L1. */
 		return false;
+	case EXIT_REASON_VMFUNC:
+		/* VM functions are emulated through L2->L0 vmexits. */
+		return false;
 	default:
 		return true;
 	}
-- 
2.9.4

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v4 2/3] KVM: nVMX: Enable VMFUNC for the L1 hypervisor
  2017-07-10 20:49 [PATCH v4 0/3] Expose VMFUNC to the nested hypervisor Bandan Das
  2017-07-10 20:49 ` [PATCH v4 1/3] KVM: vmx: Enable VMFUNCs Bandan Das
@ 2017-07-10 20:49 ` Bandan Das
  2017-07-10 20:49 ` [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching " Bandan Das
  2 siblings, 0 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-10 20:49 UTC (permalink / raw)
  To: kvm; +Cc: pbonzini, david, linux-kernel

Expose VMFUNC in MSRs and VMCS fields. No actual VMFUNCs are enabled.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Bandan Das <bsd@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a483b49..fe8f5fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -240,6 +240,7 @@ struct __packed vmcs12 {
 	u64 virtual_apic_page_addr;
 	u64 apic_access_addr;
 	u64 posted_intr_desc_addr;
+	u64 vm_function_control;
 	u64 ept_pointer;
 	u64 eoi_exit_bitmap0;
 	u64 eoi_exit_bitmap1;
@@ -481,6 +482,7 @@ struct nested_vmx {
 	u64 nested_vmx_cr4_fixed0;
 	u64 nested_vmx_cr4_fixed1;
 	u64 nested_vmx_vmcs_enum;
+	u64 nested_vmx_vmfunc_controls;
 };
 
 #define POSTED_INTR_ON  0
@@ -763,6 +765,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
 	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
 	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+	FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
 	FIELD64(EPT_POINTER, ept_pointer),
 	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
 	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
@@ -1394,6 +1397,11 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
 }
 
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
 static inline bool is_nmi(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2780,6 +2788,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
 
+	if (cpu_has_vmx_vmfunc()) {
+		vmx->nested.nested_vmx_secondary_ctls_high |=
+			SECONDARY_EXEC_ENABLE_VMFUNC;
+		vmx->nested.nested_vmx_vmfunc_controls = 0;
+	}
+
 	/*
 	 * Old versions of KVM use the single-context version without
 	 * checking for support, so declare that it is supported even
@@ -3149,6 +3163,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = vmx->nested.nested_vmx_ept_caps |
 			((u64)vmx->nested.nested_vmx_vpid_caps << 32);
 		break;
+	case MSR_IA32_VMX_VMFUNC:
+		*pdata = vmx->nested.nested_vmx_vmfunc_controls;
+		break;
 	default:
 		return 1;
 	}
@@ -7752,7 +7769,29 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 
 static int handle_vmfunc(struct kvm_vcpu *vcpu)
 {
-	kvm_queue_exception(vcpu, UD_VECTOR);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12;
+	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+	/*
+	 * VMFUNC is only supported for nested guests, but we always enable the
+	 * secondary control for simplicity; for non-nested mode, fake that we
+	 * didn't by injecting #UD.
+	 */
+	if (!is_guest_mode(vcpu)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmcs12 = get_vmcs12(vcpu);
+	if ((vmcs12->vm_function_control & (1 << function)) == 0)
+		goto fail;
+	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
+
+fail:
+	nested_vmx_vmexit(vcpu, vmx->exit_reason,
+			  vmcs_read32(VM_EXIT_INTR_INFO),
+			  vmcs_readl(EXIT_QUALIFICATION));
 	return 1;
 }
 
@@ -10053,7 +10092,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 				  SECONDARY_EXEC_RDTSCP |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-				  SECONDARY_EXEC_APIC_REGISTER_VIRT);
+				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				  SECONDARY_EXEC_ENABLE_VMFUNC);
 		if (nested_cpu_has(vmcs12,
 				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
 			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
@@ -10061,6 +10101,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			exec_control |= vmcs12_exec_ctrl;
 		}
 
+		/* All VMFUNCs are currently emulated through L0 vmexits.  */
+		if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
+			vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
 		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
 			vmcs_write64(EOI_EXIT_BITMAP0,
 				vmcs12->eoi_exit_bitmap0);
@@ -10310,6 +10354,11 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				vmx->nested.nested_vmx_entry_ctls_high))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+	if (nested_cpu_has_vmfunc(vmcs12) &&
+	    (vmcs12->vm_function_control &
+	     ~vmx->nested.nested_vmx_vmfunc_controls))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-- 
2.9.4

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-10 20:49 [PATCH v4 0/3] Expose VMFUNC to the nested hypervisor Bandan Das
  2017-07-10 20:49 ` [PATCH v4 1/3] KVM: vmx: Enable VMFUNCs Bandan Das
  2017-07-10 20:49 ` [PATCH v4 2/3] KVM: nVMX: Enable VMFUNC for the L1 hypervisor Bandan Das
@ 2017-07-10 20:49 ` Bandan Das
  2017-07-11  7:51   ` David Hildenbrand
  2 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-10 20:49 UTC (permalink / raw)
  To: kvm; +Cc: pbonzini, david, linux-kernel

When L2 uses vmfunc, L0 utilizes the associated vmexit to
emulate a switching of the ept pointer by reloading the
guest MMU.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Bandan Das <bsd@redhat.com>
---
 arch/x86/include/asm/vmx.h |  6 +++++
 arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da5375e..5f63a2e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -115,6 +115,10 @@
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
+#define VMFUNC_EPTP_ENTRIES  512
+
 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
 {
 	return vmx_basic & GENMASK_ULL(30, 0);
@@ -200,6 +204,8 @@ enum vmcs_field {
 	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
 	EOI_EXIT_BITMAP3                = 0x00002022,
 	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+	EPTP_LIST_ADDRESS               = 0x00002024,
+	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
 	VMREAD_BITMAP                   = 0x00002026,
 	VMWRITE_BITMAP                  = 0x00002028,
 	XSS_EXIT_BITMAP                 = 0x0000202C,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe8f5fc..0a969fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -246,6 +246,7 @@ struct __packed vmcs12 {
 	u64 eoi_exit_bitmap1;
 	u64 eoi_exit_bitmap2;
 	u64 eoi_exit_bitmap3;
+	u64 eptp_list_address;
 	u64 xss_exit_bitmap;
 	u64 guest_physical_address;
 	u64 vmcs_link_pointer;
@@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
 	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
 	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
 }
 
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has_vmfunc(vmcs12) &&
+		(vmcs12->vm_function_control &
+		 VMX_VMFUNC_EPTP_SWITCHING);
+}
+
 static inline bool is_nmi(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	if (cpu_has_vmx_vmfunc()) {
 		vmx->nested.nested_vmx_secondary_ctls_high |=
 			SECONDARY_EXEC_ENABLE_VMFUNC;
-		vmx->nested.nested_vmx_vmfunc_controls = 0;
+		/*
+		 * Advertise EPTP switching unconditionally
+		 * since we emulate it
+		 */
+		vmx->nested.nested_vmx_vmfunc_controls =
+			VMX_VMFUNC_EPTP_SWITCHING;
 	}
 
 	/*
@@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12;
 	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+	struct page *page = NULL;
+	u64 *l1_eptp_list, address;
 
 	/*
 	 * VMFUNC is only supported for nested guests, but we always enable the
@@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
 	}
 
 	vmcs12 = get_vmcs12(vcpu);
-	if ((vmcs12->vm_function_control & (1 << function)) == 0)
+	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
+	    WARN_ON_ONCE(function))
+		goto fail;
+
+	if (!nested_cpu_has_ept(vmcs12) ||
+	    !nested_cpu_has_eptp_switching(vmcs12))
+		goto fail;
+
+	if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
+		goto fail;
+
+	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
+	if (!page)
 		goto fail;
-	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
+
+	l1_eptp_list = kmap(page);
+	address = l1_eptp_list[index];
+	if (!address)
+		goto fail;
+	/*
+	 * If the (L2) guest does a vmfunc to the currently
+	 * active ept pointer, we don't have to do anything else
+	 */
+	if (vmcs12->ept_pointer != address) {
+		if (address >> cpuid_maxphyaddr(vcpu) ||
+		    !IS_ALIGNED(address, 4096))
+			goto fail;
+		kvm_mmu_unload(vcpu);
+		vmcs12->ept_pointer = address;
+		kvm_mmu_reload(vcpu);
+		kunmap(page);
+		nested_release_page_clean(page);
+	}
+	return kvm_skip_emulated_instruction(vcpu);
 
 fail:
+	if (page) {
+		kunmap(page);
+		nested_release_page_clean(page);
+	}
 	nested_vmx_vmexit(vcpu, vmx->exit_reason,
 			  vmcs_read32(VM_EXIT_INTR_INFO),
 			  vmcs_readl(EXIT_QUALIFICATION));
-- 
2.9.4

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-10 20:49 ` [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching " Bandan Das
@ 2017-07-11  7:51   ` David Hildenbrand
  2017-07-11  8:39     ` Paolo Bonzini
                       ` (2 more replies)
  0 siblings, 3 replies; 34+ messages in thread
From: David Hildenbrand @ 2017-07-11  7:51 UTC (permalink / raw)
  To: Bandan Das, kvm; +Cc: pbonzini, linux-kernel

On 10.07.2017 22:49, Bandan Das wrote:
> When L2 uses vmfunc, L0 utilizes the associated vmexit to
> emulate a switching of the ept pointer by reloading the
> guest MMU.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> ---
>  arch/x86/include/asm/vmx.h |  6 +++++
>  arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 61 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index da5375e..5f63a2e 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -115,6 +115,10 @@
>  #define VMX_MISC_SAVE_EFER_LMA			0x00000020
>  #define VMX_MISC_ACTIVITY_HLT			0x00000040
>  
> +/* VMFUNC functions */
> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
> +#define VMFUNC_EPTP_ENTRIES  512
> +
>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>  {
>  	return vmx_basic & GENMASK_ULL(30, 0);
> @@ -200,6 +204,8 @@ enum vmcs_field {
>  	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>  	EOI_EXIT_BITMAP3                = 0x00002022,
>  	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
> +	EPTP_LIST_ADDRESS               = 0x00002024,
> +	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
>  	VMREAD_BITMAP                   = 0x00002026,
>  	VMWRITE_BITMAP                  = 0x00002028,
>  	XSS_EXIT_BITMAP                 = 0x0000202C,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index fe8f5fc..0a969fb 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>  	u64 eoi_exit_bitmap1;
>  	u64 eoi_exit_bitmap2;
>  	u64 eoi_exit_bitmap3;
> +	u64 eptp_list_address;
>  	u64 xss_exit_bitmap;
>  	u64 guest_physical_address;
>  	u64 vmcs_link_pointer;
> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>  	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>  	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>  	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
> +	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>  	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>  	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>  	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>  	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>  }
>  
> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
> +{
> +	return nested_cpu_has_vmfunc(vmcs12) &&
> +		(vmcs12->vm_function_control &

I wonder if it makes sense to rename vm_function_control to
- vmfunc_control
- vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
- vmfunc_ctrl

> +		 VMX_VMFUNC_EPTP_SWITCHING);
> +}
> +
>  static inline bool is_nmi(u32 intr_info)
>  {
>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>  	if (cpu_has_vmx_vmfunc()) {
>  		vmx->nested.nested_vmx_secondary_ctls_high |=
>  			SECONDARY_EXEC_ENABLE_VMFUNC;
> -		vmx->nested.nested_vmx_vmfunc_controls = 0;
> +		/*
> +		 * Advertise EPTP switching unconditionally
> +		 * since we emulate it
> +		 */
> +		vmx->nested.nested_vmx_vmfunc_controls =
> +			VMX_VMFUNC_EPTP_SWITCHING;>  	}
>  
>  	/*
> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	struct vmcs12 *vmcs12;
>  	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
> +	struct page *page = NULL;
> +	u64 *l1_eptp_list, address;
>  
>  	/*
>  	 * VMFUNC is only supported for nested guests, but we always enable the
> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>  	}
>  
>  	vmcs12 = get_vmcs12(vcpu);
> -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
> +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> +	    WARN_ON_ONCE(function))

"... instruction causes a VM exit if the bit at position EAX is 0 in the
VM-function controls (the selected VM function is
not enabled)."

So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
completely.

> +		goto fail;
> +
> +	if (!nested_cpu_has_ept(vmcs12) ||
> +	    !nested_cpu_has_eptp_switching(vmcs12))
> +		goto fail;
> +
> +	if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
> +		goto fail;

I can find the definition for an vmexit in case of index >=
VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.

Can you give me a hint?

> +
> +	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
> +	if (!page)
>  		goto fail;
> -	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
> +
> +	l1_eptp_list = kmap(page);
> +	address = l1_eptp_list[index];
> +	if (!address)
> +		goto fail;

Can you move that check to the other address checks below? (or rework if
this make sense, see below)

> +	/*
> +	 * If the (L2) guest does a vmfunc to the currently
> +	 * active ept pointer, we don't have to do anything else
> +	 */
> +	if (vmcs12->ept_pointer != address) {
> +		if (address >> cpuid_maxphyaddr(vcpu) ||
> +		    !IS_ALIGNED(address, 4096))

Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
(triggering a KVM_REQ_TRIPLE_FAULT)

> +			goto fail;
> +		kvm_mmu_unload(vcpu);
> +		vmcs12->ept_pointer = address;
> +		kvm_mmu_reload(vcpu);

I was thinking about something like this:

kvm_mmu_unload(vcpu);
old = vmcs12->ept_pointer;
vmcs12->ept_pointer = address;
if (kvm_mmu_reload(vcpu)) {
	/* pointer invalid, restore previous state */
	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	vmcs12->ept_pointer = old;
	kvm_mmu_reload(vcpu);
	goto fail;
}

The you can inherit the checks from mmu_check_root().


Wonder why I can't spot checks for cpuid_maxphyaddr() /
IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
checks should be identical.


> +		kunmap(page);
> +		nested_release_page_clean(page);

shouldn't the kunmap + nested_release_page_clean go outside the if clause?

> +	}
> +	return kvm_skip_emulated_instruction(vcpu);
>  
>  fail:
> +	if (page) {
> +		kunmap(page);
> +		nested_release_page_clean(page);
> +	}
>  	nested_vmx_vmexit(vcpu, vmx->exit_reason,
>  			  vmcs_read32(VM_EXIT_INTR_INFO),
>  			  vmcs_readl(EXIT_QUALIFICATION));
> 

David and mmu code are not yet best friends. So sorry if I am missing
something.

-- 

Thanks,

David

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11  7:51   ` David Hildenbrand
@ 2017-07-11  8:39     ` Paolo Bonzini
  2017-07-11 13:52     ` Radim Krčmář
  2017-07-11 17:58     ` Bandan Das
  2 siblings, 0 replies; 34+ messages in thread
From: Paolo Bonzini @ 2017-07-11  8:39 UTC (permalink / raw)
  To: David Hildenbrand, Bandan Das, kvm; +Cc: linux-kernel

On 11/07/2017 09:51, David Hildenbrand wrote:
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>> +		(vmcs12->vm_function_control &
>
> I wonder if it makes sense to rename vm_function_control to
> - vmfunc_control
> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
> - vmfunc_ctrl

Blame Intel for this. :) They use full English names for VMCS fields (so
"VM-function control") and uppercase names for MSRs (so "IA32_VMX_VMFUNC").

"nested_vmx_vmfunc_controls" could be renamed to "nested_vmx_vmfunc" for
consistency, but I like the longer name too.

Paolo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11  7:51   ` David Hildenbrand
  2017-07-11  8:39     ` Paolo Bonzini
@ 2017-07-11 13:52     ` Radim Krčmář
  2017-07-11 18:05       ` Bandan Das
  2017-07-11 17:58     ` Bandan Das
  2 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-11 13:52 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Bandan Das, kvm, pbonzini, linux-kernel

[David did a great review, so I'll just point out things I noticed.]

2017-07-11 09:51+0200, David Hildenbrand:
> On 10.07.2017 22:49, Bandan Das wrote:
> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
> > emulate a switching of the ept pointer by reloading the
> > guest MMU.
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: Bandan Das <bsd@redhat.com>
> > ---
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> >  	}
> >  
> >  	vmcs12 = get_vmcs12(vcpu);
> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> > +	    WARN_ON_ONCE(function))
> 
> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> VM-function controls (the selected VM function is
> not enabled)."
> 
> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> completely.

It assumes that vm_function_control is not > 1, which is (should be)
guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
is 1.

> > +		goto fail;

The rest of the code assumes that the function is
VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.

Writing it as

  WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)

would be cleared and I'd prefer to move the part that handles
VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
going to add more than one VM FUNC. :])

> > +	if (!nested_cpu_has_ept(vmcs12) ||
> > +	    !nested_cpu_has_eptp_switching(vmcs12))
> > +		goto fail;

This brings me to a missing vm-entry check:

 If “EPTP switching” VM-function control is 1, the “enable EPT”
 VM-execution control must also be 1. In addition, the EPTP-list address
 must satisfy the following checks:
 • Bits 11:0 of the address must be 0.
 • The address must not set any bits beyond the processor’s
   physical-address width.

so this one could be

  if (!nested_cpu_has_eptp_switching(vmcs12) ||
      WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))

after adding the check.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11  7:51   ` David Hildenbrand
  2017-07-11  8:39     ` Paolo Bonzini
  2017-07-11 13:52     ` Radim Krčmář
@ 2017-07-11 17:58     ` Bandan Das
  2017-07-11 18:22       ` Jim Mattson
                         ` (2 more replies)
  2 siblings, 3 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-11 17:58 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: kvm, pbonzini, linux-kernel

David Hildenbrand <david@redhat.com> writes:

> On 10.07.2017 22:49, Bandan Das wrote:
>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> emulate a switching of the ept pointer by reloading the
>> guest MMU.
>> 
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> Signed-off-by: Bandan Das <bsd@redhat.com>
>> ---
>>  arch/x86/include/asm/vmx.h |  6 +++++
>>  arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
>>  2 files changed, 61 insertions(+), 3 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index da5375e..5f63a2e 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -115,6 +115,10 @@
>>  #define VMX_MISC_SAVE_EFER_LMA			0x00000020
>>  #define VMX_MISC_ACTIVITY_HLT			0x00000040
>>  
>> +/* VMFUNC functions */
>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>> +#define VMFUNC_EPTP_ENTRIES  512
>> +
>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>  {
>>  	return vmx_basic & GENMASK_ULL(30, 0);
>> @@ -200,6 +204,8 @@ enum vmcs_field {
>>  	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>>  	EOI_EXIT_BITMAP3                = 0x00002022,
>>  	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
>> +	EPTP_LIST_ADDRESS               = 0x00002024,
>> +	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
>>  	VMREAD_BITMAP                   = 0x00002026,
>>  	VMWRITE_BITMAP                  = 0x00002028,
>>  	XSS_EXIT_BITMAP                 = 0x0000202C,
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index fe8f5fc..0a969fb 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>>  	u64 eoi_exit_bitmap1;
>>  	u64 eoi_exit_bitmap2;
>>  	u64 eoi_exit_bitmap3;
>> +	u64 eptp_list_address;
>>  	u64 xss_exit_bitmap;
>>  	u64 guest_physical_address;
>>  	u64 vmcs_link_pointer;
>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>>  	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>>  	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>>  	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
>> +	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>>  	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>>  	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>>  	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>>  	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>>  }
>>  
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>> +		(vmcs12->vm_function_control &
>
> I wonder if it makes sense to rename vm_function_control to
> - vmfunc_control
> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
> - vmfunc_ctrl

I tend to follow the SDM names because it's easy to look for them.

>> +		 VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>>  static inline bool is_nmi(u32 intr_info)
>>  {
>>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>  	if (cpu_has_vmx_vmfunc()) {
>>  		vmx->nested.nested_vmx_secondary_ctls_high |=
>>  			SECONDARY_EXEC_ENABLE_VMFUNC;
>> -		vmx->nested.nested_vmx_vmfunc_controls = 0;
>> +		/*
>> +		 * Advertise EPTP switching unconditionally
>> +		 * since we emulate it
>> +		 */
>> +		vmx->nested.nested_vmx_vmfunc_controls =
>> +			VMX_VMFUNC_EPTP_SWITCHING;>  	}
>>  
>>  	/*
>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>  	struct vmcs12 *vmcs12;
>>  	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> +	struct page *page = NULL;
>> +	u64 *l1_eptp_list, address;
>>  
>>  	/*
>>  	 * VMFUNC is only supported for nested guests, but we always enable the
>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>  	}
>>  
>>  	vmcs12 = get_vmcs12(vcpu);
>> -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> +	    WARN_ON_ONCE(function))
>
> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> VM-function controls (the selected VM function is
> not enabled)."
>
> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> completely.

It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's
not misused.

>> +		goto fail;
>> +
>> +	if (!nested_cpu_has_ept(vmcs12) ||
>> +	    !nested_cpu_has_eptp_switching(vmcs12))
>> +		goto fail;
>> +
>> +	if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
>> +		goto fail;
>
> I can find the definition for an vmexit in case of index >=
> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>
> Can you give me a hint?

I don't think there is. Since, we are basically emulating eptp switching
for L2, this is a good check to have.

>> +
>> +	page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> +	if (!page)
>>  		goto fail;
>> -	WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>> +
>> +	l1_eptp_list = kmap(page);
>> +	address = l1_eptp_list[index];
>> +	if (!address)
>> +		goto fail;
>
> Can you move that check to the other address checks below? (or rework if
> this make sense, see below)
>
>> +	/*
>> +	 * If the (L2) guest does a vmfunc to the currently
>> +	 * active ept pointer, we don't have to do anything else
>> +	 */
>> +	if (vmcs12->ept_pointer != address) {
>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>> +		    !IS_ALIGNED(address, 4096))
>
> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
> (triggering a KVM_REQ_TRIPLE_FAULT)

If there's a triple fault, I think it's a good idea to inject it
back. Basically, there's no need to take care of damage control
that L1 is intentionally doing.

>> +			goto fail;
>> +		kvm_mmu_unload(vcpu);
>> +		vmcs12->ept_pointer = address;
>> +		kvm_mmu_reload(vcpu);
>
> I was thinking about something like this:
>
> kvm_mmu_unload(vcpu);
> old = vmcs12->ept_pointer;
> vmcs12->ept_pointer = address;
> if (kvm_mmu_reload(vcpu)) {
> 	/* pointer invalid, restore previous state */
> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> 	vmcs12->ept_pointer = old;
> 	kvm_mmu_reload(vcpu);
> 	goto fail;
> }
>
> Then you can inherit the checks from mmu_check_root().
>
>
> Wonder why I can't spot checks for cpuid_maxphyaddr() /
> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
> checks should be identical.

I think the reason is vmcs12->ept_pointer is never used directly. It's
used to create a shadow table but nevertheless, the check should be there.

>
>> +		kunmap(page);
>> +		nested_release_page_clean(page);
>
> shouldn't the kunmap + nested_release_page_clean go outside the if clause?

:) Indeed, thanks for the catch.

Bandan

>> +	}
>> +	return kvm_skip_emulated_instruction(vcpu);
>>  
>>  fail:
>> +	if (page) {
>> +		kunmap(page);
>> +		nested_release_page_clean(page);
>> +	}
>>  	nested_vmx_vmexit(vcpu, vmx->exit_reason,
>>  			  vmcs_read32(VM_EXIT_INTR_INFO),
>>  			  vmcs_readl(EXIT_QUALIFICATION));
>> 
>
> David and mmu code are not yet best friends. So sorry if I am missing
> something.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 13:52     ` Radim Krčmář
@ 2017-07-11 18:05       ` Bandan Das
  2017-07-11 19:12         ` Radim Krčmář
  0 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 18:05 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:

> [David did a great review, so I'll just point out things I noticed.]
>
> 2017-07-11 09:51+0200, David Hildenbrand:
>> On 10.07.2017 22:49, Bandan Das wrote:
>> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> > emulate a switching of the ept pointer by reloading the
>> > guest MMU.
>> > 
>> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> > Signed-off-by: Bandan Das <bsd@redhat.com>
>> > ---
>> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> >  	}
>> >  
>> >  	vmcs12 = get_vmcs12(vcpu);
>> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> > +	    WARN_ON_ONCE(function))
>> 
>> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> VM-function controls (the selected VM function is
>> not enabled)."
>> 
>> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> completely.
>
> It assumes that vm_function_control is not > 1, which is (should be)
> guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
> is 1.
>
>> > +		goto fail;
>
> The rest of the code assumes that the function is
> VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
>
> Writing it as
>
>   WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
>
> would be clearer and I'd prefer to move the part that handles
> VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
> going to add more than one VM FUNC. :])

IMO, for now, this should be fine because we are not even passing through the
hardware's eptp switching. Even if there are other vm functions, they
won't be available for the nested case and cause any conflict.

>> > +	if (!nested_cpu_has_ept(vmcs12) ||
>> > +	    !nested_cpu_has_eptp_switching(vmcs12))
>> > +		goto fail;
>
> This brings me to a missing vm-entry check:
>
>  If “EPTP switching” VM-function control is 1, the “enable EPT”
>  VM-execution control must also be 1. In addition, the EPTP-list address
>  must satisfy the following checks:
>  • Bits 11:0 of the address must be 0.
>  • The address must not set any bits beyond the processor’s
>    physical-address width.
>
> so this one could be
>
>   if (!nested_cpu_has_eptp_switching(vmcs12) ||
>       WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))

I will reverse the order here but the vm entry check is unnecessary because
the check on the list address is already being done in this function.

> after adding the check.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 17:58     ` Bandan Das
@ 2017-07-11 18:22       ` Jim Mattson
  2017-07-11 18:35         ` Bandan Das
  2017-07-11 18:24       ` Bandan Das
  2017-07-13 15:39       ` David Hildenbrand
  2 siblings, 1 reply; 34+ messages in thread
From: Jim Mattson @ 2017-07-11 18:22 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm list, Paolo Bonzini, LKML

On Tue, Jul 11, 2017 at 10:58 AM, Bandan Das <bsd@redhat.com> wrote:
> David Hildenbrand <david@redhat.com> writes:
>
>> On 10.07.2017 22:49, Bandan Das wrote:
>>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>>> emulate a switching of the ept pointer by reloading the
>>> guest MMU.
>>>
>>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>>> Signed-off-by: Bandan Das <bsd@redhat.com>
>>> ---
>>>  arch/x86/include/asm/vmx.h |  6 +++++
>>>  arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
>>>  2 files changed, 61 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>> index da5375e..5f63a2e 100644
>>> --- a/arch/x86/include/asm/vmx.h
>>> +++ b/arch/x86/include/asm/vmx.h
>>> @@ -115,6 +115,10 @@
>>>  #define VMX_MISC_SAVE_EFER_LMA                      0x00000020
>>>  #define VMX_MISC_ACTIVITY_HLT                       0x00000040
>>>
>>> +/* VMFUNC functions */
>>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>>> +#define VMFUNC_EPTP_ENTRIES  512
>>> +
>>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>>  {
>>>      return vmx_basic & GENMASK_ULL(30, 0);
>>> @@ -200,6 +204,8 @@ enum vmcs_field {
>>>      EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>>>      EOI_EXIT_BITMAP3                = 0x00002022,
>>>      EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
>>> +    EPTP_LIST_ADDRESS               = 0x00002024,
>>> +    EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
>>>      VMREAD_BITMAP                   = 0x00002026,
>>>      VMWRITE_BITMAP                  = 0x00002028,
>>>      XSS_EXIT_BITMAP                 = 0x0000202C,
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index fe8f5fc..0a969fb 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>>>      u64 eoi_exit_bitmap1;
>>>      u64 eoi_exit_bitmap2;
>>>      u64 eoi_exit_bitmap3;
>>> +    u64 eptp_list_address;
>>>      u64 xss_exit_bitmap;
>>>      u64 guest_physical_address;
>>>      u64 vmcs_link_pointer;
>>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>>>      FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>>>      FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>>>      FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
>>> +    FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>>>      FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>>>      FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>>>      FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
>>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>>>      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>>>  }
>>>
>>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>>> +{
>>> +    return nested_cpu_has_vmfunc(vmcs12) &&
>>> +            (vmcs12->vm_function_control &
>>
>> I wonder if it makes sense to rename vm_function_control to
>> - vmfunc_control
>> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
>> - vmfunc_ctrl
>
> I tend to follow the SDM names because it's easy to look for them.
>
>>> +             VMX_VMFUNC_EPTP_SWITCHING);
>>> +}
>>> +
>>>  static inline bool is_nmi(u32 intr_info)
>>>  {
>>>      return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>>      if (cpu_has_vmx_vmfunc()) {
>>>              vmx->nested.nested_vmx_secondary_ctls_high |=
>>>                      SECONDARY_EXEC_ENABLE_VMFUNC;
>>> -            vmx->nested.nested_vmx_vmfunc_controls = 0;
>>> +            /*
>>> +             * Advertise EPTP switching unconditionally
>>> +             * since we emulate it
>>> +             */
>>> +            vmx->nested.nested_vmx_vmfunc_controls =
>>> +                    VMX_VMFUNC_EPTP_SWITCHING;>     }
>>>
>>>      /*
>>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>>      struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>      struct vmcs12 *vmcs12;
>>>      u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>>> +    u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>>> +    struct page *page = NULL;
>>> +    u64 *l1_eptp_list, address;
>>>
>>>      /*
>>>       * VMFUNC is only supported for nested guests, but we always enable the
>>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>>      }
>>>
>>>      vmcs12 = get_vmcs12(vcpu);
>>> -    if ((vmcs12->vm_function_control & (1 << function)) == 0)
>>> +    if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>>> +        WARN_ON_ONCE(function))
>>
>> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> VM-function controls (the selected VM function is
>> not enabled)."
>>
>> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> completely.
>
> It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's
> not misused.
>
>>> +            goto fail;
>>> +
>>> +    if (!nested_cpu_has_ept(vmcs12) ||
>>> +        !nested_cpu_has_eptp_switching(vmcs12))
>>> +            goto fail;
>>> +
>>> +    if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
>>> +            goto fail;
>>
>> I can find the definition for an vmexit in case of index >=
>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>>
>> Can you give me a hint?
>
> I don't think there is. Since, we are basically emulating eptp switching
> for L2, this is a good check to have.

There is nothing wrong with a hypervisor using physical page 0 for
whatever purpose it likes, including an EPTP list.

>>> +
>>> +    page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>>> +    if (!page)
>>>              goto fail;
>>> -    WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>>> +
>>> +    l1_eptp_list = kmap(page);
>>> +    address = l1_eptp_list[index];
>>> +    if (!address)
>>> +            goto fail;
>>
>> Can you move that check to the other address checks below? (or rework if
>> this make sense, see below)
>>
>>> +    /*
>>> +     * If the (L2) guest does a vmfunc to the currently
>>> +     * active ept pointer, we don't have to do anything else
>>> +     */
>>> +    if (vmcs12->ept_pointer != address) {
>>> +            if (address >> cpuid_maxphyaddr(vcpu) ||
>>> +                !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
>
>>> +                    goto fail;
>>> +            kvm_mmu_unload(vcpu);
>>> +            vmcs12->ept_pointer = address;
>>> +            kvm_mmu_reload(vcpu);
>>
>> I was thinking about something like this:
>>
>> kvm_mmu_unload(vcpu);
>> old = vmcs12->ept_pointer;
>> vmcs12->ept_pointer = address;
>> if (kvm_mmu_reload(vcpu)) {
>>       /* pointer invalid, restore previous state */
>>       kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>>       vmcs12->ept_pointer = old;
>>       kvm_mmu_reload(vcpu);
>>       goto fail;
>> }
>>
>> Then you can inherit the checks from mmu_check_root().
>>
>>
>> Wonder why I can't spot checks for cpuid_maxphyaddr() /
>> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
>> checks should be identical.
>
> I think the reason is vmcs12->ept_pointer is never used directly. It's
> used to create a shadow table but nevertheless, the check should be there.
>
>>
>>> +            kunmap(page);
>>> +            nested_release_page_clean(page);
>>
>> shouldn't the kunmap + nested_release_page_clean go outside the if clause?
>
> :) Indeed, thanks for the catch.
>
> Bandan
>
>>> +    }
>>> +    return kvm_skip_emulated_instruction(vcpu);
>>>
>>>  fail:
>>> +    if (page) {
>>> +            kunmap(page);
>>> +            nested_release_page_clean(page);
>>> +    }
>>>      nested_vmx_vmexit(vcpu, vmx->exit_reason,
>>>                        vmcs_read32(VM_EXIT_INTR_INFO),
>>>                        vmcs_readl(EXIT_QUALIFICATION));
>>>
>>
>> David and mmu code are not yet best friends. So sorry if I am missing
>> something.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 17:58     ` Bandan Das
  2017-07-11 18:22       ` Jim Mattson
@ 2017-07-11 18:24       ` Bandan Das
  2017-07-11 19:32         ` Radim Krčmář
  2017-07-13 15:39       ` David Hildenbrand
  2 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 18:24 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: kvm, pbonzini, linux-kernel

Bandan Das <bsd@redhat.com> writes:
....
>>> +	/*
>>> +	 * If the (L2) guest does a vmfunc to the currently
>>> +	 * active ept pointer, we don't have to do anything else
>>> +	 */
>>> +	if (vmcs12->ept_pointer != address) {
>>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>>> +		    !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
>
>>> +			goto fail;
>>> +		kvm_mmu_unload(vcpu);
>>> +		vmcs12->ept_pointer = address;
>>> +		kvm_mmu_reload(vcpu);
>>
>> I was thinking about something like this:
>>
>> kvm_mmu_unload(vcpu);
>> old = vmcs12->ept_pointer;
>> vmcs12->ept_pointer = address;
>> if (kvm_mmu_reload(vcpu)) {
>> 	/* pointer invalid, restore previous state */
>> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> 	vmcs12->ept_pointer = old;
>> 	kvm_mmu_reload(vcpu);
>> 	goto fail;
>> }
>>
> >> Then you can inherit the checks from mmu_check_root().

Actually, thinking about this a bit more, I agree with you. Any fault
with a vmfunc operation should end with a vmfunc vmexit, so this
is a good thing to have. Thank you for this idea! :)

Bandan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 18:22       ` Jim Mattson
@ 2017-07-11 18:35         ` Bandan Das
  2017-07-11 19:13           ` Radim Krčmář
  0 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 18:35 UTC (permalink / raw)
  To: Jim Mattson; +Cc: David Hildenbrand, kvm list, Paolo Bonzini, LKML

Jim Mattson <jmattson@google.com> writes:
...
>>> I can find the definition for an vmexit in case of index >=
>>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>>>
>>> Can you give me a hint?
>>
>> I don't think there is. Since, we are basically emulating eptp switching
>> for L2, this is a good check to have.
>
> There is nothing wrong with a hypervisor using physical page 0 for
> whatever purpose it likes, including an EPTP list.

Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
address most likely means it forgot to initialize it. Whatever damage it does will
still end up with vmfunc vmexit anyway.

Bandan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 18:05       ` Bandan Das
@ 2017-07-11 19:12         ` Radim Krčmář
  2017-07-11 19:34           ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-11 19:12 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

2017-07-11 14:05-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > [David did a great review, so I'll just point out things I noticed.]
> >
> > 2017-07-11 09:51+0200, David Hildenbrand:
> >> On 10.07.2017 22:49, Bandan Das wrote:
> >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
> >> > emulate a switching of the ept pointer by reloading the
> >> > guest MMU.
> >> > 
> >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> >> > Signed-off-by: Bandan Das <bsd@redhat.com>
> >> > ---
> >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
> >> >  	}
> >> >  
> >> >  	vmcs12 = get_vmcs12(vcpu);
> >> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
> >> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
> >> > +	    WARN_ON_ONCE(function))
> >> 
> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the
> >> VM-function controls (the selected VM function is
> >> not enabled)."
> >> 
> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
> >> completely.
> >
> > It assumes that vm_function_control is not > 1, which is (should be)
> > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
> > is 1.
> >
> >> > +		goto fail;
> >
> > The rest of the code assumes that the function is
> > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
> >
> > Writing it as
> >
> >   WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
> >
> > would be cleared and I'd prefer to move the part that handles
> > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
> > going to add more than one VM FUNC. :])
> 
> IMO, for now, this should be fine because we are not even passing through the
> hardware's eptp switching. Even if there are other vm functions, they
> won't be available for the nested case and cause any conflict.

Yeah, it is fine function-wise, I was just pointing out that it looks
ugly to me.

Btw. have you looked what we'd need to do for the hardware pass-through?
I'd expect big changes to MMU. :)

> >> > +	if (!nested_cpu_has_ept(vmcs12) ||
> >> > +	    !nested_cpu_has_eptp_switching(vmcs12))
> >> > +		goto fail;
> >
> > This brings me to a missing vm-entry check:
> >
> >  If “EPTP switching” VM-function control is 1, the “enable EPT”
> >  VM-execution control must also be 1. In addition, the EPTP-list address
> >  must satisfy the following checks:
> >  • Bits 11:0 of the address must be 0.
> >  • The address must not set any bits beyond the processor’s
> >    physical-address width.
> >
> > so this one could be
> >
> >   if (!nested_cpu_has_eptp_switching(vmcs12) ||
> >       WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
> 
> I will reverse the order here but the vm entry check is unnecessary because
> the check on the list address is already being done in this function.

Here it is too late; the nested VM-entry should have failed, never letting
this situation happen.  We want an equivalent of

  if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12))
  	return VMXERR_ENTRY_INVALID_CONTROL_FIELD;

in nested controls checks, right next to the reserved fields check.
And then also the check EPTP-list check.  All of them only checked when
nested_cpu_has_vmfunc(vmcs12).

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 18:35         ` Bandan Das
@ 2017-07-11 19:13           ` Radim Krčmář
  2017-07-11 19:38             ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-11 19:13 UTC (permalink / raw)
  To: Bandan Das; +Cc: Jim Mattson, David Hildenbrand, kvm list, Paolo Bonzini, LKML

2017-07-11 14:35-0400, Bandan Das:
> Jim Mattson <jmattson@google.com> writes:
> ...
> >>> I can find the definition for an vmexit in case of index >=
> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >>>
> >>> Can you give me a hint?
> >>
> >> I don't think there is. Since, we are basically emulating eptp switching
> >> for L2, this is a good check to have.
> >
> > There is nothing wrong with a hypervisor using physical page 0 for
> > whatever purpose it likes, including an EPTP list.
> 
> Right, but of all the things, an L1 hypervisor wanting page 0 for an eptp list
> address most likely means it forgot to initialize it. Whatever damage it does will
> still end up with vmfunc vmexit anyway.

Most likely, but not certainly.  I also don't see a reason to diverge from the
spec here.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 18:24       ` Bandan Das
@ 2017-07-11 19:32         ` Radim Krčmář
  2017-07-11 19:50           ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-11 19:32 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

2017-07-11 14:24-0400, Bandan Das:
> Bandan Das <bsd@redhat.com> writes:
> > If there's a triple fault, I think it's a good idea to inject it
> > back. Basically, there's no need to take care of damage control
> > that L1 is intentionally doing.
> >
> >>> +			goto fail;
> >>> +		kvm_mmu_unload(vcpu);
> >>> +		vmcs12->ept_pointer = address;
> >>> +		kvm_mmu_reload(vcpu);
> >>
> >> I was thinking about something like this:
> >>
> >> kvm_mmu_unload(vcpu);
> >> old = vmcs12->ept_pointer;
> >> vmcs12->ept_pointer = address;
> >> if (kvm_mmu_reload(vcpu)) {
> >> 	/* pointer invalid, restore previous state */
> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> 	vmcs12->ept_pointer = old;
> >> 	kvm_mmu_reload(vcpu);
> >> 	goto fail;
> >> }
> >>
> >> The you can inherit the checks from mmu_check_root().
> 
> Actually, thinking about this a bit more, I agree with you. Any fault
> with a vmfunc operation should end with a vmfunc vmexit, so this
> is a good thing to have. Thank you for this idea! :)

SDM says

  IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
  if in EPTP) THEN VMexit;

and no other mentions of a VM exit, so I think that the VM exit happens
only under these conditions:

  — The EPT memory type (bits 2:0) must be a value supported by the
    processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
    Appendix A.10).
  — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
    an EPT page-walk length of 4; see Section 28.2.2.
  — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
    bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
    as 0, indicating that the processor does not support accessed and
    dirty flags for EPT.
  — Reserved bits 11:7 and 63:N (where N is the processor’s
    physical-address width) must all be 0.

And it looks like we need parts of nested_ept_init_mmu_context() to
properly handle VMX_EPT_AD_ENABLE_BIT.

The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if
we just invalidate the MMU.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 19:12         ` Radim Krčmář
@ 2017-07-11 19:34           ` Bandan Das
  0 siblings, 0 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-11 19:34 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 14:05-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> 
>> > [David did a great review, so I'll just point out things I noticed.]
>> >
>> > 2017-07-11 09:51+0200, David Hildenbrand:
>> >> On 10.07.2017 22:49, Bandan Das wrote:
>> >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to
>> >> > emulate a switching of the ept pointer by reloading the
>> >> > guest MMU.
>> >> > 
>> >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> >> > Signed-off-by: Bandan Das <bsd@redhat.com>
>> >> > ---
>> >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> >> >  	}
>> >> >  
>> >> >  	vmcs12 = get_vmcs12(vcpu);
>> >> > -	if ((vmcs12->vm_function_control & (1 << function)) == 0)
>> >> > +	if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>> >> > +	    WARN_ON_ONCE(function))
>> >> 
>> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> >> VM-function controls (the selected VM function is
>> >> not enabled)."
>> >> 
>> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> >> completely.
>> >
>> > It assumes that vm_function_control is not > 1, which is (should be)
>> > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR
>> > is 1.
>> >
>> >> > +		goto fail;
>> >
>> > The rest of the code assumes that the function is
>> > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable.
>> >
>> > Writing it as
>> >
>> >   WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING)
>> >
> >> would be clearer and I'd prefer to move the part that handles
>> > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is
>> > going to add more than one VM FUNC. :])
>> 
>> IMO, for now, this should be fine because we are not even passing through the
>> hardware's eptp switching. Even if there are other vm functions, they
>> won't be available for the nested case and cause any conflict.
>
> Yeah, it is fine function-wise, I was just pointing out that it looks
> ugly to me.

Ok, lemme switch this to a switch statement style handler function. That way,
future additions would be easier.

> Btw. have you looked what we'd need to do for the hardware pass-through?
> I'd expect big changes to MMU. :)

Yes, the first version was actually using vmfunc 0 directly, well not exactly, the
first time would go through this path and then the next time the processor would
handle it directly. Paolo pointed out an issue that I was ready to fix but he wasn't
comfortable with the idea. I actually agree with him, it's too much untested code
for something that would be rarely used.

>> >> > +	if (!nested_cpu_has_ept(vmcs12) ||
>> >> > +	    !nested_cpu_has_eptp_switching(vmcs12))
>> >> > +		goto fail;
>> >
>> > This brings me to a missing vm-entry check:
>> >
>> >  If “EPTP switching” VM-function control is 1, the “enable EPT”
>> >  VM-execution control must also be 1. In addition, the EPTP-list address
>> >  must satisfy the following checks:
>> >  • Bits 11:0 of the address must be 0.
>> >  • The address must not set any bits beyond the processor’s
>> >    physical-address width.
>> >
>> > so this one could be
>> >
>> >   if (!nested_cpu_has_eptp_switching(vmcs12) ||
>> >       WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
>> 
>> I will reverse the order here but the vm entry check is unnecessary because
>> the check on the list address is already being done in this function.
>
> Here is too late, the nested VM-entry should have failed, never letting
> this situation happen.  We want an equivalent of
>
>   if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12))
>   	return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
>
> in nested controls checks, right next to the reserved fields check.
> And then also the EPTP-list check.  All of them only checked when
> nested_cpu_has_vmfunc(vmcs12).

Actually, I misread 25.5.5.3. There are two checks. Here, the list entry
needs to be checked so that eptp won't cause a vmentry failure. The vmentry
needs to check the eptp list address itself. I will add that check for the
list address in the next version.

Bandan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 19:13           ` Radim Krčmář
@ 2017-07-11 19:38             ` Bandan Das
  2017-07-11 20:22               ` Radim Krčmář
  0 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 19:38 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Jim Mattson, David Hildenbrand, kvm list, Paolo Bonzini, LKML

Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 14:35-0400, Bandan Das:
>> Jim Mattson <jmattson@google.com> writes:
>> ...
>> >>> I can find the definition for an vmexit in case of index >=
>> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>> >>>
>> >>> Can you give me a hint?
>> >>
>> >> I don't think there is. Since, we are basically emulating eptp switching
>> >> for L2, this is a good check to have.
>> >
>> > There is nothing wrong with a hypervisor using physical page 0 for
>> > whatever purpose it likes, including an EPTP list.
>> 
>> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
>> address most likely means it forgot to initialize it. Whatever damage it does will
>> still end up with vmfunc vmexit anyway.
>
> Most likely, but not certainly.  I also don't see a reason to diverge from the
> spec here.

Actually, this is a specific case where I would like to diverge from the spec.
But then again, it's L1 shooting itself in the foot and this would be a rarely
used code path, so, I am fine removing it.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 19:32         ` Radim Krčmář
@ 2017-07-11 19:50           ` Bandan Das
  2017-07-11 20:21             ` Radim Krčmář
  0 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 19:50 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 14:24-0400, Bandan Das:
>> Bandan Das <bsd@redhat.com> writes:
>> > If there's a triple fault, I think it's a good idea to inject it
>> > back. Basically, there's no need to take care of damage control
>> > that L1 is intentionally doing.
>> >
>> >>> +			goto fail;
>> >>> +		kvm_mmu_unload(vcpu);
>> >>> +		vmcs12->ept_pointer = address;
>> >>> +		kvm_mmu_reload(vcpu);
>> >>
>> >> I was thinking about something like this:
>> >>
>> >> kvm_mmu_unload(vcpu);
>> >> old = vmcs12->ept_pointer;
>> >> vmcs12->ept_pointer = address;
>> >> if (kvm_mmu_reload(vcpu)) {
>> >> 	/* pointer invalid, restore previous state */
>> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> 	vmcs12->ept_pointer = old;
>> >> 	kvm_mmu_reload(vcpu);
>> >> 	goto fail;
>> >> }
>> >>
>> >> Then you can inherit the checks from mmu_check_root().
>> 
>> Actually, thinking about this a bit more, I agree with you. Any fault
>> with a vmfunc operation should end with a vmfunc vmexit, so this
>> is a good thing to have. Thank you for this idea! :)
>
> SDM says
>
>   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>   if in EPTP) THEN VMexit;

This section here:
As noted in Section 25.5.5.2, an execution of the
EPTP-switching VM function that causes a VM exit (as specified
above), uses the basic exit reason 59, indicating “VMFUNC”.
The length of the VMFUNC instruction is saved into the
VM-exit instruction-length field. No additional VM-exit
information is provided.

Although, it adds (as specified above), from testing, any vmexit that
happens as a result of the execution of the vmfunc instruction always
has exit reason 59.

IMO, the case David pointed out comes under "as a result of the
execution of the vmfunc instruction", so I would prefer exiting
with reason 59.

> and no other mentions of a VM exit, so I think that the VM exit happens
> only under these conditions:
>
>   — The EPT memory type (bits 2:0) must be a value supported by the
>     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>     Appendix A.10).
>   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>     an EPT page-walk length of 4; see Section 28.2.2.
>   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>     as 0, indicating that the processor does not support accessed and
>     dirty flags for EPT.
>   — Reserved bits 11:7 and 63:N (where N is the processor’s
>     physical-address width) must all be 0.
>
> And it looks like we need parts of nested_ept_init_mmu_context() to
> properly handle VMX_EPT_AD_ENABLE_BIT.

I completely ignored AD and the #VE sections. I will add a TODO item
in the comment section.

> The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if
> we just invalidate the MMU.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 19:50           ` Bandan Das
@ 2017-07-11 20:21             ` Radim Krčmář
  2017-07-11 20:34               ` Bandan Das
  2017-07-17 17:58               ` Bandan Das
  0 siblings, 2 replies; 34+ messages in thread
From: Radim Krčmář @ 2017-07-11 20:21 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

2017-07-11 15:50-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> > 2017-07-11 14:24-0400, Bandan Das:
> >> Bandan Das <bsd@redhat.com> writes:
> >> > If there's a triple fault, I think it's a good idea to inject it
> >> > back. Basically, there's no need to take care of damage control
> >> > that L1 is intentionally doing.
> >> >
> >> >>> +			goto fail;
> >> >>> +		kvm_mmu_unload(vcpu);
> >> >>> +		vmcs12->ept_pointer = address;
> >> >>> +		kvm_mmu_reload(vcpu);
> >> >>
> >> >> I was thinking about something like this:
> >> >>
> >> >> kvm_mmu_unload(vcpu);
> >> >> old = vmcs12->ept_pointer;
> >> >> vmcs12->ept_pointer = address;
> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> 	/* pointer invalid, restore previous state */
> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> 	vmcs12->ept_pointer = old;
> >> >> 	kvm_mmu_reload(vcpu);
> >> >> 	goto fail;
> >> >> }
> >> >>
> >> >> Then you can inherit the checks from mmu_check_root().
> >> 
> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> is a good thing to have. Thank you for this idea! :)
> >
> > SDM says
> >
> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >   if in EPTP) THEN VMexit;
> 
> This section here:
> As noted in Section 25.5.5.2, an execution of the
> EPTP-switching VM function that causes a VM exit (as specified
> above), uses the basic exit reason 59, indicating “VMFUNC”.
> The length of the VMFUNC instruction is saved into the
> VM-exit instruction-length field. No additional VM-exit
> information is provided.
> 
> Although, it adds (as specified above), from testing, any vmexit that
> happens as a result of the execution of the vmfunc instruction always
> has exit reason 59.
> 
> IMO, the case David pointed out comes under "as a result of the
> execution of the vmfunc instruction", so I would prefer exiting
> with reason 59.

Right, the exit reason is 59 for reasons that trigger a VM exit
(i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
unrelated stuff.

If the EPTP value is correct, then the switch should succeed.
If the EPTP is correct, but bogus, then the guest should get
EPT_MISCONFIG VM exit on its first access (when reading the
instruction).  Source: I added

  vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));

shortly before a VMLAUNCH on L0. :)

I think that we might be emulating this case incorrectly and throwing
triple faults when it should be VM exits in vcpu_run().

> > and no other mentions of a VM exit, so I think that the VM exit happens
> > only under these conditions:
> >
> >   — The EPT memory type (bits 2:0) must be a value supported by the
> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
> >     Appendix A.10).
> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
> >     an EPT page-walk length of 4; see Section 28.2.2.
> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
> >     as 0, indicating that the processor does not support accessed and
> >     dirty flags for EPT.
> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
> >     physical-address width) must all be 0.
> >
> > And it looks like we need parts of nested_ept_init_mmu_context() to
> > properly handle VMX_EPT_AD_ENABLE_BIT.
> 
> I completely ignored AD and the #VE sections. I will add a TODO item
> in the comment section.

AFAIK, we don't support #VE, but AD would be nice to handle from the
beginning.  (I think that caling nested_ept_init_mmu_context() as-is
isn't that bad.)

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 19:38             ` Bandan Das
@ 2017-07-11 20:22               ` Radim Krčmář
  2017-07-11 20:45                 ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-11 20:22 UTC (permalink / raw)
  To: Bandan Das; +Cc: Jim Mattson, David Hildenbrand, kvm list, Paolo Bonzini, LKML

2017-07-11 15:38-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 14:35-0400, Bandan Das:
> >> Jim Mattson <jmattson@google.com> writes:
> >> ...
> >> >>> I can find the definition for an vmexit in case of index >=
> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >> >>>
> >> >>> Can you give me a hint?
> >> >>
> >> >> I don't think there is. Since, we are basically emulating eptp switching
> >> >> for L2, this is a good check to have.
> >> >
> >> > There is nothing wrong with a hypervisor using physical page 0 for
> >> > whatever purpose it likes, including an EPTP list.
> >> 
> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
> >> address most likely means it forgot to initialize it. Whatever damage it does will
> >> still end up with vmfunc vmexit anyway.
> >
> > Most likely, but not certainly.  I also don't see a reason to diverge from the
> > spec here.
> 
> Actually, this is a specific case where I would like to diverge from the spec.
> But then again, it's L1 shooting itself in the foot and this would be a rarely
> used code path, so, I am fine removing it.

Thanks, we're not here to judge the guest, but to provide a bare-metal
experience. :)

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 20:21             ` Radim Krčmář
@ 2017-07-11 20:34               ` Bandan Das
  2017-07-11 20:45                 ` Radim Krčmář
  2017-07-17 17:58               ` Bandan Das
  1 sibling, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 20:34 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 15:50-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> > 2017-07-11 14:24-0400, Bandan Das:
>> >> Bandan Das <bsd@redhat.com> writes:
>> >> > If there's a triple fault, I think it's a good idea to inject it
>> >> > back. Basically, there's no need to take care of damage control
>> >> > that L1 is intentionally doing.
>> >> >
>> >> >>> +			goto fail;
>> >> >>> +		kvm_mmu_unload(vcpu);
>> >> >>> +		vmcs12->ept_pointer = address;
>> >> >>> +		kvm_mmu_reload(vcpu);
>> >> >>
>> >> >> I was thinking about something like this:
>> >> >>
>> >> >> kvm_mmu_unload(vcpu);
>> >> >> old = vmcs12->ept_pointer;
>> >> >> vmcs12->ept_pointer = address;
>> >> >> if (kvm_mmu_reload(vcpu)) {
>> >> >> 	/* pointer invalid, restore previous state */
>> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> >> 	vmcs12->ept_pointer = old;
>> >> >> 	kvm_mmu_reload(vcpu);
>> >> >> 	goto fail;
>> >> >> }
>> >> >>
>> >> >> Then you can inherit the checks from mmu_check_root().
>> >> 
>> >> Actually, thinking about this a bit more, I agree with you. Any fault
>> >> with a vmfunc operation should end with a vmfunc vmexit, so this
>> >> is a good thing to have. Thank you for this idea! :)
>> >
>> > SDM says
>> >
>> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>> >   if in EPTP) THEN VMexit;
>> 
>> This section here:
>> As noted in Section 25.5.5.2, an execution of the
>> EPTP-switching VM function that causes a VM exit (as specified
>> above), uses the basic exit reason 59, indicating “VMFUNC”.
>> The length of the VMFUNC instruction is saved into the
>> VM-exit instruction-length field. No additional VM-exit
>> information is provided.
>> 
>> Although, it adds (as specified above), from testing, any vmexit that
>> happens as a result of the execution of the vmfunc instruction always
>> has exit reason 59.
>> 
>> IMO, the case David pointed out comes under "as a result of the
>> execution of the vmfunc instruction", so I would prefer exiting
>> with reason 59.
>
> Right, the exit reason is 59 for reasons that trigger a VM exit
> (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> unrelated stuff.
>
> If the EPTP value is correct, then the switch should succeed.
> If the EPTP is correct, but bogus, then the guest should get
> EPT_MISCONFIG VM exit on its first access (when reading the
> instruction).  Source: I added

My point is that we are using kvm_mmu_reload() to emulate eptp
switching. If that emulation of vmfunc fails, it should exit with reason
59.

>   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>
> shortly before a VMLAUNCH on L0. :)

What happens if this ept pointer is actually in the eptp list and the guest
switches to it using vmfunc ? I think it will exit with reason 59.

> I think that we might be emulating this case incorrectly and throwing
> triple faults when it should be VM exits in vcpu_run().

No, I agree with not throwing a triple fault. We should clear it out.
But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.

>> > and no other mentions of a VM exit, so I think that the VM exit happens
>> > only under these conditions:
>> >
>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>> >     Appendix A.10).
>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>> >     an EPT page-walk length of 4; see Section 28.2.2.
>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>> >     as 0, indicating that the processor does not support accessed and
>> >     dirty flags for EPT.
>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>> >     physical-address width) must all be 0.
>> >
>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>> 
>> I completely ignored AD and the #VE sections. I will add a TODO item
>> in the comment section.
>
> AFAIK, we don't support #VE, but AD would be nice to handle from the

Nevertheless, it's good to have the nested hypervisor be able to use it
just like vmfunc.

> beginning.  (I think that caling nested_ept_init_mmu_context() as-is
> isn't that bad.)

Ok, I will take a look.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 20:34               ` Bandan Das
@ 2017-07-11 20:45                 ` Radim Krčmář
  2017-07-11 21:08                   ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-11 20:45 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

2017-07-11 16:34-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 15:50-0400, Bandan Das:
> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> > 2017-07-11 14:24-0400, Bandan Das:
> >> >> Bandan Das <bsd@redhat.com> writes:
> >> >> > If there's a triple fault, I think it's a good idea to inject it
> >> >> > back. Basically, there's no need to take care of damage control
> >> >> > that L1 is intentionally doing.
> >> >> >
> >> >> >>> +			goto fail;
> >> >> >>> +		kvm_mmu_unload(vcpu);
> >> >> >>> +		vmcs12->ept_pointer = address;
> >> >> >>> +		kvm_mmu_reload(vcpu);
> >> >> >>
> >> >> >> I was thinking about something like this:
> >> >> >>
> >> >> >> kvm_mmu_unload(vcpu);
> >> >> >> old = vmcs12->ept_pointer;
> >> >> >> vmcs12->ept_pointer = address;
> >> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> >> 	/* pointer invalid, restore previous state */
> >> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> >> 	vmcs12->ept_pointer = old;
> >> >> >> 	kvm_mmu_reload(vcpu);
> >> >> >> 	goto fail;
> >> >> >> }
> >> >> >>
> >> >> >> Then you can inherit the checks from mmu_check_root().
> >> >> 
> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> >> is a good thing to have. Thank you for this idea! :)
> >> >
> >> > SDM says
> >> >
> >> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >> >   if in EPTP) THEN VMexit;
> >> 
> >> This section here:
> >> As noted in Section 25.5.5.2, an execution of the
> >> EPTP-switching VM function that causes a VM exit (as specified
> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
> >> The length of the VMFUNC instruction is saved into the
> >> VM-exit instruction-length field. No additional VM-exit
> >> information is provided.
> >> 
> >> Although, it adds (as specified above), from testing, any vmexit that
> >> happens as a result of the execution of the vmfunc instruction always
> >> has exit reason 59.
> >> 
> >> IMO, the case David pointed out comes under "as a result of the
> >> execution of the vmfunc instruction", so I would prefer exiting
> >> with reason 59.
> >
> > Right, the exit reason is 59 for reasons that trigger a VM exit
> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> > unrelated stuff.
> >
> > If the EPTP value is correct, then the switch should succeed.
> > If the EPTP is correct, but bogus, then the guest should get
> > EPT_MISCONFIG VM exit on its first access (when reading the
> > instruction).  Source: I added
> 
> My point is that we are using kvm_mmu_reload() to emulate eptp
> switching. If that emulation of vmfunc fails, it should exit with reason
> 59.

Yeah, we just disagree on what is a vmfunc failure.

> >   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
> >
> > shortly before a VMLAUNCH on L0. :)
> 
> What happens if this ept pointer is actually in the eptp list and the guest
> switches to it using vmfunc ? I think it will exit with reason 59.

I think otherwise, because it doesn't cause a VM entry failure on
bare-metal (and SDM says that we get a VM exit only if there would be a
VM entry failure).
I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
(Experiment pending :])

> > I think that we might be emulating this case incorrectly and throwing
> > triple faults when it should be VM exits in vcpu_run().
> 
> No, I agree with not throwing a triple fault. We should clear it out.
> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.

Here we disagree.  I think that it's a bug to do the VM exit, so we can
just keep the original bug -- we want to eventually fix it and it's no
worse till then.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 20:22               ` Radim Krčmář
@ 2017-07-11 20:45                 ` Bandan Das
  2017-07-12 13:41                   ` Radim Krčmář
  0 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 20:45 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Jim Mattson, David Hildenbrand, kvm list, Paolo Bonzini, LKML

Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 15:38-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> 
>> > 2017-07-11 14:35-0400, Bandan Das:
>> >> Jim Mattson <jmattson@google.com> writes:
>> >> ...
>> >> >>> I can find the definition for an vmexit in case of index >=
>> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>> >> >>>
>> >> >>> Can you give me a hint?
>> >> >>
>> >> >> I don't think there is. Since, we are basically emulating eptp switching
>> >> >> for L2, this is a good check to have.
>> >> >
>> >> > There is nothing wrong with a hypervisor using physical page 0 for
>> >> > whatever purpose it likes, including an EPTP list.
>> >> 
>> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
>> >> address most likely means it forgot to initialize it. Whatever damage it does will
>> >> still end up with vmfunc vmexit anyway.
>> >
>> > Most likely, but not certainly.  I also don't see a reason to diverge from the
>> > spec here.
>> 
>> Actually, this is a specific case where I would like to diverge from the spec.
>> But then again, it's L1 shooting itself in the foot and this would be a rarely
>> used code path, so, I am fine removing it.
>
> Thanks, we're not here to judge the guest, but to provide a bare-metal
> experience. :)

There are certain cases where we do. For example, when L2 instruction emulation
fails we decide to kill L2 instead of injecting the error to L1 and letting it handle
that. Anyway, that's a different topic, I was just trying to point out there
are cases where kvm makes a somewhat policy-based decision...

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 20:45                 ` Radim Krčmář
@ 2017-07-11 21:08                   ` Bandan Das
  2017-07-12 13:24                     ` Radim Krčmář
  0 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-11 21:08 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-11 16:34-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> 
>> > 2017-07-11 15:50-0400, Bandan Das:
>> >> Radim Krčmář <rkrcmar@redhat.com> writes:
>> >> > 2017-07-11 14:24-0400, Bandan Das:
>> >> >> Bandan Das <bsd@redhat.com> writes:
>> >> >> > If there's a triple fault, I think it's a good idea to inject it
>> >> >> > back. Basically, there's no need to take care of damage control
>> >> >> > that L1 is intentionally doing.
>> >> >> >
>> >> >> >>> +			goto fail;
>> >> >> >>> +		kvm_mmu_unload(vcpu);
>> >> >> >>> +		vmcs12->ept_pointer = address;
>> >> >> >>> +		kvm_mmu_reload(vcpu);
>> >> >> >>
>> >> >> >> I was thinking about something like this:
>> >> >> >>
>> >> >> >> kvm_mmu_unload(vcpu);
>> >> >> >> old = vmcs12->ept_pointer;
>> >> >> >> vmcs12->ept_pointer = address;
>> >> >> >> if (kvm_mmu_reload(vcpu)) {
>> >> >> >> 	/* pointer invalid, restore previous state */
>> >> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>> >> >> >> 	vmcs12->ept_pointer = old;
>> >> >> >> 	kvm_mmu_reload(vcpu);
>> >> >> >> 	goto fail;
>> >> >> >> }
>> >> >> >>
>> >> >> >> Then you can inherit the checks from mmu_check_root().
>> >> >> 
>> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
>> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
>> >> >> is a good thing to have. Thank you for this idea! :)
>> >> >
>> >> > SDM says
>> >> >
>> >> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
>> >> >   if in EPTP) THEN VMexit;
>> >> 
>> >> This section here:
>> >> As noted in Section 25.5.5.2, an execution of the
>> >> EPTP-switching VM function that causes a VM exit (as specified
>> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
>> >> The length of the VMFUNC instruction is saved into the
>> >> VM-exit instruction-length field. No additional VM-exit
>> >> information is provided.
>> >> 
>> >> Although, it adds (as specified above), from testing, any vmexit that
>> >> happens as a result of the execution of the vmfunc instruction always
>> >> has exit reason 59.
>> >> 
>> >> IMO, the case David pointed out comes under "as a result of the
>> >> execution of the vmfunc instruction", so I would prefer exiting
>> >> with reason 59.
>> >
>> > Right, the exit reason is 59 for reasons that trigger a VM exit
>> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
>> > unrelated stuff.
>> >
>> > If the EPTP value is correct, then the switch should succeed.
>> > If the EPTP is correct, but bogus, then the guest should get
>> > EPT_MISCONFIG VM exit on its first access (when reading the
>> > instruction).  Source: I added
>> 
>> My point is that we are using kvm_mmu_reload() to emulate eptp
>> switching. If that emulation of vmfunc fails, it should exit with reason
>> 59.
>
> Yeah, we just disagree on what is a vmfunc failure.
>
>> >   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>> >
>> > shortly before a VMLAUNCH on L0. :)
>> 
>> What happens if this ept pointer is actually in the eptp list and the guest
>> switches to it using vmfunc ? I think it will exit with reason 59.
>
> I think otherwise, because it doesn't cause a VM entry failure on
> bare-metal (and SDM says that we get a VM exit only if there would be a
> VM entry failure).
> I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
> (Experiment pending :])
>
>> > I think that we might be emulating this case incorrectly and throwing
>> > triple faults when it should be VM exits in vcpu_run().
>> 
>> No, I agree with not throwing a triple fault. We should clear it out.
>> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
>
> Here we disagree.  I think that it's a bug to do the VM exit, so we can

Why do you think it's a bug ? The eptp switching function really didn't
succeed as far as our emulation goes when kvm_mmu_reload() fails.
And as such, the generic vmexit failure event should be a vmfunc vmexit.
We cannot strictly follow the spec here, the spec doesn't even mention a way
to emulate eptp switching. If setting up the switching succeeded and the
new root pointer is invalid or whatever, I really don't care what happens
next but this is not the case. We fail to get a new root pointer and without
that, we can't even make a switch!

> just keep the original bug -- we want to eventually fix it and it's no
> worse till then.

Anyway, can you please confirm again what is the behavior that you
are expecting if kvm_mmu_reload fails ? This would be a rarely used
branch and I am actually fine diverging from what I think is right if
I can get the reviewers to agree on a common thing.

(Thanks for giving this a closer look, Radim. I really appreciate it.)

Bandan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 21:08                   ` Bandan Das
@ 2017-07-12 13:24                     ` Radim Krčmář
  2017-07-12 18:11                       ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-12 13:24 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

2017-07-11 17:08-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 16:34-0400, Bandan Das:
> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> 
> >> > 2017-07-11 15:50-0400, Bandan Das:
> >> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> >> > 2017-07-11 14:24-0400, Bandan Das:
> >> >> >> Bandan Das <bsd@redhat.com> writes:
> >> >> >> > If there's a triple fault, I think it's a good idea to inject it
> >> >> >> > back. Basically, there's no need to take care of damage control
> >> >> >> > that L1 is intentionally doing.
> >> >> >> >
> >> >> >> >>> +			goto fail;
> >> >> >> >>> +		kvm_mmu_unload(vcpu);
> >> >> >> >>> +		vmcs12->ept_pointer = address;
> >> >> >> >>> +		kvm_mmu_reload(vcpu);
> >> >> >> >>
> >> >> >> >> I was thinking about something like this:
> >> >> >> >>
> >> >> >> >> kvm_mmu_unload(vcpu);
> >> >> >> >> old = vmcs12->ept_pointer;
> >> >> >> >> vmcs12->ept_pointer = address;
> >> >> >> >> if (kvm_mmu_reload(vcpu)) {
> >> >> >> >> 	/* pointer invalid, restore previous state */
> >> >> >> >> 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> >> >> >> >> 	vmcs12->ept_pointer = old;
> >> >> >> >> 	kvm_mmu_reload(vcpu);
> >> >> >> >> 	goto fail;
> >> >> >> >> }
> >> >> >> >>
> >> >> >> >> The you can inherit the checks from mmu_check_root().
> >> >> >> 
> >> >> >> Actually, thinking about this a bit more, I agree with you. Any fault
> >> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this
> >> >> >> is a good thing to have. Thank you for this idea! :)
> >> >> >
> >> >> > SDM says
> >> >> >
> >> >> >   IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail
> >> >> >   if in EPTP) THEN VMexit;
> >> >> 
> >> >> This section here:
> >> >> As noted in Section 25.5.5.2, an execution of the
> >> >> EPTP-switching VM function that causes a VM exit (as specified
> >> >> above), uses the basic exit reason 59, indicating “VMFUNC”.
> >> >> The length of the VMFUNC instruction is saved into the
> >> >> VM-exit instruction-length field. No additional VM-exit
> >> >> information is provided.
> >> >> 
> >> >> Although, it adds (as specified above), from testing, any vmexit that
> >> >> happens as a result of the execution of the vmfunc instruction always
> >> >> has exit reason 59.
> >> >> 
> >> >> IMO, the case David pointed out comes under "as a result of the
> >> >> execution of the vmfunc instruction", so I would prefer exiting
> >> >> with reason 59.
> >> >
> >> > Right, the exit reason is 59 for reasons that trigger a VM exit
> >> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks
> >> > unrelated stuff.
> >> >
> >> > If the EPTP value is correct, then the switch should succeed.
> >> > If the EPTP is correct, but bogus, then the guest should get
> >> > EPT_MISCONFIG VM exit on its first access (when reading the
> >> > instruction).  Source: I added
> >> 
> >> My point is that we are using kvm_mmu_reload() to emulate eptp
> >> switching. If that emulation of vmfunc fails, it should exit with reason
> >> 59.
>>
>> Yeah, we just disagree on what is a vmfunc failure.
>>
>>> >   vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40));
>>> >
>>> > shortly before a VMLAUNCH on L0. :)
>>> 
>>> What happens if this ept pointer is actually in the eptp list and the guest
>>> switches to it using vmfunc ? I think it will exit with reason 59.
>>
>> I think otherwise, because it doesn't cause a VM entry failure on
>> bare-metal (and SDM says that we get a VM exit only if there would be a
>> VM entry failure).
>> I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after.
>> (Experiment pending :])
>>
>>> > I think that we might be emulating this case incorrectly and throwing
>>> > triple faults when it should be VM exits in vcpu_run().
>>> 
>>> No, I agree with not throwing a triple fault. We should clear it out.
>>> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails.
>>
>> Here we disagree.  I think that it's a bug to do the VM exit, so we can
> 
> Why do you think it's a bug ?

SDM defines a different behavior and hardware doesn't do that either.
There are only two reasons for a VMFUNC VM exit from EPTP switching:

 1) ECX > 0
 2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER

KVM can fail for other reasons because of its bugs, but that should be
notified to the guest in another way.  Rebooting the guest is kind of
acceptable in that case.

>                               The eptp switching function really didn't
> succeed as far as our emulation goes when kvm_mmu_reload() fails.
> And as such, the generic vmexit failure event should be a vmfunc vmexit.

I interpret it as two separate events -- at first, the vmfunc succeeds
and when it later tries to access memory through the new EPTP (valid,
but not pointing into backed memory), it results in a EPT_MISCONFIG VM
exit.

> We cannot strictly follow the spec here, the spec doesn't even mention a way
> to emulate eptp switching.  If setting up the switching succeeded and the
> new root pointer is invalid or whatever, I really don't care what happens
> next but this is not the case. We fail to get a new root pointer and without
> that, we can't even make a switch!

We just make it behave exactly how the spec says that it behaves.  We do
have a value (we read 'address') to put into VMCS.EPT_POINTER, which is
all we need for the emulation.
The function doesn't dereference that pointer, it just looks at its
value to decide whether it is valid or not.  (btw. we should check that
properly, because we cannot depend on VM entry failure pass-through like
the normal case.)

The dereference done in kvm_mmu_reload() should happen after EPTP
switching finishes, because the spec doesn't mention a VM exit for other
reason than invalid EPT_POINTER value.

>> just keep the original bug -- we want to eventually fix it and it's no
>> worse till then.
> 
> Anyway, can you please confirm again what is the behavior that you
> are expecting if kvm_mmu_reload fails ? This would be a rarely used
> branch and I am actually fine diverging from what I think is right if
> I can get the reviewers to agree on a common thing.

kvm_mmu_reload() fails when mmu_check_root() is false, which means that
the pointed physical address is not backed.  We've hit this corner-case
in the past -- Jim said that the chipset returns all 1s if a read is not
claimed.

So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the
pointer pointed to a memory of all 1s, which would likely result in
EPT_MISCONFIG when the guest does a memory access.

It is a mishandled corner case, but turning it into VM exit would only
confuse an OS that receives the impossible VM exit and potentially
confuse reader of the KVM logic.

I think that not using kvm_mmu_reload() directly in EPTP switching is
best.  The bug is not really something we care about.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 20:45                 ` Bandan Das
@ 2017-07-12 13:41                   ` Radim Krčmář
  2017-07-12 18:04                     ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-12 13:41 UTC (permalink / raw)
  To: Bandan Das; +Cc: Jim Mattson, David Hildenbrand, kvm list, Paolo Bonzini, LKML

2017-07-11 16:45-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> 
> > 2017-07-11 15:38-0400, Bandan Das:
> >> Radim Krčmář <rkrcmar@redhat.com> writes:
> >> 
> >> > 2017-07-11 14:35-0400, Bandan Das:
> >> >> Jim Mattson <jmattson@google.com> writes:
> >> >> ...
> >> >> >>> I can find the definition for an vmexit in case of index >=
> >> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
> >> >> >>>
> >> >> >>> Can you give me a hint?
> >> >> >>
> >> >> >> I don't think there is. Since, we are basically emulating eptp switching
> >> >> >> for L2, this is a good check to have.
> >> >> >
> >> >> > There is nothing wrong with a hypervisor using physical page 0 for
> >> >> > whatever purpose it likes, including an EPTP list.
> >> >> 
> >> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list
> >> >> address most likely means it forgot to initialize it. Whatever damage it does will
> >> >> still end up with vmfunc vmexit anyway.
> >> >
> >> > Most likely, but not certainly.  I also don't see a reason to diverge from the
> >> > spec here.
> >> 
> >> Actually, this is a specific case where I would like to diverge from the spec.
> >> But then again, it's L1 shooting itself in the foot and this would be a rarely
> >> used code path, so, I am fine removing it.
> >
> > Thanks, we're not here to judge the guest, but to provide a bare-metal
> > experience. :)
> 
> There are certain cases where we do. For example, when L2 instruction emulation
> fails we decide to kill L2 instead of injecting the error to L1 and let it handle
> that. Anyway, that's a different topic, I was just trying to point out there
> are cases kvm does a somewhat policy decision...

Emulation failure is a KVM bug and we are too lazy to implement the
bare-metal behavior correctly, but avoiding the EPTP list bug is
actually easier than introducing it.  You can make KVM simpler and
improve bare-metal emulation at the same time.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-12 13:41                   ` Radim Krčmář
@ 2017-07-12 18:04                     ` Bandan Das
  0 siblings, 0 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-12 18:04 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Jim Mattson, David Hildenbrand, kvm list, Paolo Bonzini, LKML

Radim Krčmář <rkrcmar@redhat.com> writes:
...
>> > Thanks, we're not here to judge the guest, but to provide a bare-metal
>> > experience. :)
>> 
>> There are certain cases where we do. For example, when L2 instruction emulation
>> fails we decide to kill L2 instead of injecting the error to L1 and let it handle
>> that. Anyway, that's a different topic, I was just trying to point out there
>> are cases kvm does a somewhat policy decision...
>
> Emulation failure is a KVM bug and we are too lazy to implement the
> bare-metal behavior correctly, but avoiding the EPTP list bug is
> actually easier than introducing it.  You can make KVM simpler and
> improve bare-metal emulation at the same time.

We are just talking past each other here trying to impose point of views.
Checking for 0 makes KVM simpler. As I said before, a 0 list_address means
that the hypervisor forgot to initialize it. Feel free to show me examples
where the hypervisor does indeed use a 0 address for eptp list address or
anything vm specific. You disagreed and I am fine with it.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-12 13:24                     ` Radim Krčmář
@ 2017-07-12 18:11                       ` Bandan Das
  2017-07-12 19:18                         ` Radim Krčmář
  0 siblings, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-12 18:11 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:
...
>> Why do you think it's a bug ?
>
> SDM defines a different behavior and hardware doesn't do that either.
> There are only two reasons for a VMFUNC VM exit from EPTP switching:
>
>  1) ECX > 0
>  2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER
>
> KVM can fail for other reasons because of its bugs, but that should be
> notified to the guest in another way.  Rebooting the guest is kind of
> acceptable in that case.
>
>>                               The eptp switching function really didn't
>> succeed as far as our emulation goes when kvm_mmu_reload() fails.
>> And as such, the generic vmexit failure event should be a vmfunc vmexit.
>
> I interpret it as two separate events -- at first, the vmfunc succeeds
> and when it later tries to access memory through the new EPTP (valid,
> but not pointing into backed memory), it results in a EPT_MISCONFIG VM
> exit.
>
>> We cannot strictly follow the spec here, the spec doesn't even mention a way
>> to emulate eptp switching.  If setting up the switching succeeded and the
>> new root pointer is invalid or whatever, I really don't care what happens
>> next but this is not the case. We fail to get a new root pointer and without
>> that, we can't even make a switch!
>
> We just make it behave exactly how the spec says that it behaves.  We do
> have a value (we read 'address') to put into VMCS.EPT_POINTER, which is
> all we need for the emulation.
> The function doesn't dereference that pointer, it just looks at its
> value to decide whether it is valid or not.  (btw. we should check that
> properly, because we cannot depend on VM entry failure pass-through like
> the normal case.)
>
> The dereference done in kvm_mmu_reload() should happen after EPTP
> switching finishes, because the spec doesn't mention a VM exit for other
> reason than invalid EPT_POINTER value.
>
>>> just keep the original bug -- we want to eventually fix it and it's no
>>> worse till then.
>> 
>> Anyway, can you please confirm again what is the behavior that you
>> are expecting if kvm_mmu_reload fails ? This would be a rarely used
>> branch and I am actually fine diverging from what I think is right if
>> I can get the reviewers to agree on a common thing.
>
> kvm_mmu_reload() fails when mmu_check_root() is false, which means that
> the pointed physical address is not backed.  We've hit this corner-case
> in the past -- Jim said that the chipset returns all 1s if a read is not
> claimed.
>
> So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the
> pointer pointed to a memory of all 1s, which would likely result in
> EPT_MISCONFIG when the guest does a memory access.

As much as I would like to disagree with you, I have already spent way more
time on this than I want. Please let's just leave it here, then ? The mmu unload
will make sure there's an invalid root hpa and whatever happens next, happens.

> It is a mishandled corner case, but turning it into VM exit would only
> confuse an OS that receives the impossible VM exit and potentially
> confuse reader of the KVM logic.
>
> I think that not using kvm_mmu_reload() directly in EPTP switching is
> best.  The bug is not really something we care about.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-12 18:11                       ` Bandan Das
@ 2017-07-12 19:18                         ` Radim Krčmář
  0 siblings, 0 replies; 34+ messages in thread
From: Radim Krčmář @ 2017-07-12 19:18 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

2017-07-12 14:11-0400, Bandan Das:
> As much as I would like to disagree with you, I have already spent way more
> time on this then I want. Please let's just leave it here, then ? The mmu unload
> will make sure there's an invalid root hpa and whatever happens next, happens.

Sure;  let's discuss the subtleties of hardware emulation over a beer.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 17:58     ` Bandan Das
  2017-07-11 18:22       ` Jim Mattson
  2017-07-11 18:24       ` Bandan Das
@ 2017-07-13 15:39       ` David Hildenbrand
  2017-07-13 17:08         ` Bandan Das
  2 siblings, 1 reply; 34+ messages in thread
From: David Hildenbrand @ 2017-07-13 15:39 UTC (permalink / raw)
  To: Bandan Das; +Cc: kvm, pbonzini, linux-kernel

>>> +	/*
>>> +	 * If the (L2) guest does a vmfunc to the currently
>>> +	 * active ept pointer, we don't have to do anything else
>>> +	 */
>>> +	if (vmcs12->ept_pointer != address) {
>>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>>> +		    !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
> 
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.

I quickly rushed over the massive amount of comments. Sounds like you'll
be preparing a v5. Would be great if you could add some comments that
were the result of this discussion (for parts that are not that obvious
- triple faults) - thanks!

-- 

Thanks,

David

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-13 15:39       ` David Hildenbrand
@ 2017-07-13 17:08         ` Bandan Das
  0 siblings, 0 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-13 17:08 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: kvm, pbonzini, linux-kernel, Radim Krčmář

David Hildenbrand <david@redhat.com> writes:

>>>> +	/*
>>>> +	 * If the (L2) guest does a vmfunc to the currently
>>>> +	 * active ept pointer, we don't have to do anything else
>>>> +	 */
>>>> +	if (vmcs12->ept_pointer != address) {
>>>> +		if (address >> cpuid_maxphyaddr(vcpu) ||
>>>> +		    !IS_ALIGNED(address, 4096))
>>>
>>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>>> (triggering a KVM_REQ_TRIPLE_FAULT)
>> 
>> If there's a triple fault, I think it's a good idea to inject it
>> back. Basically, there's no need to take care of damage control
>> that L1 is intentionally doing.
>
> I quickly rushed over the massive amount of comments. Sounds like you'll
> be preparing a v5. Would be great if you could add some comments that
> were the result of this discussion (for parts that are not that obvious
> - triple faults) - thanks!

Will do. Basically, we agreed that we don't need to do anything with mmu_reload() failures
because the invalid eptp that mmu_unload will write to root_hpa will result in an ept
violation.

Bandan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-11 20:21             ` Radim Krčmář
  2017-07-11 20:34               ` Bandan Das
@ 2017-07-17 17:58               ` Bandan Das
  2017-07-19  9:30                 ` Radim Krčmář
  1 sibling, 1 reply; 34+ messages in thread
From: Bandan Das @ 2017-07-17 17:58 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:
...
>> > and no other mentions of a VM exit, so I think that the VM exit happens
>> > only under these conditions:
>> >
>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>> >     Appendix A.10).
>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>> >     an EPT page-walk length of 4; see Section 28.2.2.
>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>> >     as 0, indicating that the processor does not support accessed and
>> >     dirty flags for EPT.
>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>> >     physical-address width) must all be 0.
>> >
>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>> 
>> I completely ignored AD and the #VE sections. I will add a TODO item
>> in the comment section.
>
> AFAIK, we don't support #VE, but AD would be nice to handle from the
> beginning.  (I think that calling nested_ept_init_mmu_context() as-is
> isn't that bad.)

I went back to the spec to take a look at the AD handling. It doesn't look
like anything needs to be done since nested_ept_init_mmu_context() is already
being called with the correct eptp in prepare_vmcs02 ? Anything else that
needs to be done for AD handling in vmfunc context ?

Thanks,
Bandan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-17 17:58               ` Bandan Das
@ 2017-07-19  9:30                 ` Radim Krčmář
  2017-07-19 17:54                   ` Bandan Das
  0 siblings, 1 reply; 34+ messages in thread
From: Radim Krčmář @ 2017-07-19  9:30 UTC (permalink / raw)
  To: Bandan Das; +Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

2017-07-17 13:58-0400, Bandan Das:
> Radim Krčmář <rkrcmar@redhat.com> writes:
> ...
>>> > and no other mentions of a VM exit, so I think that the VM exit happens
>>> > only under these conditions:
>>> >
>>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>>> >     Appendix A.10).
>>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>>> >     an EPT page-walk length of 4; see Section 28.2.2.
>>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>>> >     as 0, indicating that the processor does not support accessed and
>>> >     dirty flags for EPT.
>>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>>> >     physical-address width) must all be 0.
>>> >
>>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>> 
>>> I completely ignored AD and the #VE sections. I will add a TODO item
>>> in the comment section.
>>
>> AFAIK, we don't support #VE, but AD would be nice to handle from the
>>> beginning.  (I think that calling nested_ept_init_mmu_context() as-is
>> isn't that bad.)
> 
> I went back to the spec to take a look at the AD handling. It doesn't look
> like anything needs to be done since nested_ept_init_mmu_context() is already
> being called with the correct eptp in prepare_vmcs02 ? Anything else that
> needs to be done for AD handling in vmfunc context ?

AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and
we don't call prepare_vmcs02() after emulating VMFUNC vm exit.
We want to forward the new AD configuration to KVM's MMU.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor
  2017-07-19  9:30                 ` Radim Krčmář
@ 2017-07-19 17:54                   ` Bandan Das
  0 siblings, 0 replies; 34+ messages in thread
From: Bandan Das @ 2017-07-19 17:54 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: David Hildenbrand, kvm, pbonzini, linux-kernel

Radim Krčmář <rkrcmar@redhat.com> writes:

> 2017-07-17 13:58-0400, Bandan Das:
>> Radim Krčmář <rkrcmar@redhat.com> writes:
>> ...
>>>> > and no other mentions of a VM exit, so I think that the VM exit happens
>>>> > only under these conditions:
>>>> >
>>>> >   — The EPT memory type (bits 2:0) must be a value supported by the
>>>> >     processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see
>>>> >     Appendix A.10).
>>>> >   — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating
>>>> >     an EPT page-walk length of 4; see Section 28.2.2.
>>>> >   — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if
>>>> >     bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read
>>>> >     as 0, indicating that the processor does not support accessed and
>>>> >     dirty flags for EPT.
>>>> >   — Reserved bits 11:7 and 63:N (where N is the processor’s
>>>> >     physical-address width) must all be 0.
>>>> >
>>>> > And it looks like we need parts of nested_ept_init_mmu_context() to
>>>> > properly handle VMX_EPT_AD_ENABLE_BIT.
>>>> 
>>>> I completely ignored AD and the #VE sections. I will add a TODO item
>>>> in the comment section.
>>>
>>> AFAIK, we don't support #VE, but AD would be nice to handle from the
>>> beginning.  (I think that caling nested_ept_init_mmu_context() as-is
>>> isn't that bad.)
>> 
>> I went back to the spec to take a look at the AD handling. It doesn't look
>> like anything needs to be done since nested_ept_init_mmu_context() is already
>> being called with the correct eptp in prepare_vmcs02 ? Anything else that
>> needs to be done for AD handling in vmfunc context ?
>
> AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and
> we don't call prepare_vmcs02() after emulating VMFUNC vm exit.
> We want to forward the new AD configuration to KVM's MMU.

Thanks, I had incorrectly assumed that prepare_vmcs02 will be called after
an exit unconditionally. I will work something up soon.

Bandan

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2017-07-19 17:54 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-10 20:49 [PATCH v4 0/3] Expose VMFUNC to the nested hypervisor Bandan Das
2017-07-10 20:49 ` [PATCH v4 1/3] KVM: vmx: Enable VMFUNCs Bandan Das
2017-07-10 20:49 ` [PATCH v4 2/3] KVM: nVMX: Enable VMFUNC for the L1 hypervisor Bandan Das
2017-07-10 20:49 ` [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching " Bandan Das
2017-07-11  7:51   ` David Hildenbrand
2017-07-11  8:39     ` Paolo Bonzini
2017-07-11 13:52     ` Radim Krčmář
2017-07-11 18:05       ` Bandan Das
2017-07-11 19:12         ` Radim Krčmář
2017-07-11 19:34           ` Bandan Das
2017-07-11 17:58     ` Bandan Das
2017-07-11 18:22       ` Jim Mattson
2017-07-11 18:35         ` Bandan Das
2017-07-11 19:13           ` Radim Krčmář
2017-07-11 19:38             ` Bandan Das
2017-07-11 20:22               ` Radim Krčmář
2017-07-11 20:45                 ` Bandan Das
2017-07-12 13:41                   ` Radim Krčmář
2017-07-12 18:04                     ` Bandan Das
2017-07-11 18:24       ` Bandan Das
2017-07-11 19:32         ` Radim Krčmář
2017-07-11 19:50           ` Bandan Das
2017-07-11 20:21             ` Radim Krčmář
2017-07-11 20:34               ` Bandan Das
2017-07-11 20:45                 ` Radim Krčmář
2017-07-11 21:08                   ` Bandan Das
2017-07-12 13:24                     ` Radim Krčmář
2017-07-12 18:11                       ` Bandan Das
2017-07-12 19:18                         ` Radim Krčmář
2017-07-17 17:58               ` Bandan Das
2017-07-19  9:30                 ` Radim Krčmář
2017-07-19 17:54                   ` Bandan Das
2017-07-13 15:39       ` David Hildenbrand
2017-07-13 17:08         ` Bandan Das

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.