* [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
@ 2016-11-30 20:03 Jim Mattson
  2016-11-30 20:03 ` [PATCH 1/8] kvm: nVMX: Prepare for checkpointing L2 state Jim Mattson
                   ` (9 more replies)
  0 siblings, 10 replies; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

This series of patches introduces checkpoint and restore operations
for the VMX state of a VM, so that VMX-capable VMs can be migrated.

Two new ioctls are introduced: KVM_GET_VMX_STATE and
KVM_SET_VMX_STATE. The VMX state that is checkpointed/restored
consists of the VMXON region address (if any), the current VMCS
address (if any), and the cached current VMCS contents (if any). One
piece of implementation-specific state that is also
checkpointed/restored is the nested_run_pending bit.

On live migration, our userspace process does not have access to guest
memory when the VM is set up on the target host, so some operations
that require a GPA->HPA mapping are deferred until the next vcpu_run
using a new vcpu request.
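
A rough sketch of the intended userspace flow (illustrative only;
vcpu_fd and dst_vcpu_fd are placeholders, and 8192 is the maximum
data size documented in patch 7):

	/* needs <sys/ioctl.h>, <linux/kvm.h>, <stdlib.h>, <err.h> */
	struct kvm_vmx_state *state = calloc(1, sizeof(*state) + 8192);

	state->data_size = 8192;
	/* Source host: snapshot the VMXON pointer, current VMCS and cache. */
	if (ioctl(vcpu_fd, KVM_GET_VMX_STATE, state) < 0)
		err(1, "KVM_GET_VMX_STATE");

	/* ... transfer the blob to the target host ... */

	/* Target host: restore VMX state before the first KVM_RUN. */
	if (ioctl(dst_vcpu_fd, KVM_SET_VMX_STATE, state) < 0)
		err(1, "KVM_SET_VMX_STATE");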


* [PATCH 1/8] kvm: nVMX: Prepare for checkpointing L2 state
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-11-30 20:03 ` [PATCH 2/8] kvm: nVMX: Refactor handle_vmon() Jim Mattson
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

Split prepare_vmcs12 into two parts: the part that stores the current L2
guest state and the part that sets up the exit information fields. The
former will be used when checkpointing the vCPU's VMX state.

Modify prepare_vmcs02 so that it can construct a vmcs02 midway through
L2 execution, using the checkpointed L2 guest state saved into the
cached vmcs12 above.

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/kvm/vmx.c | 73 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 46 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 34d3839..22c4a12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9829,21 +9829,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
 
-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+	if (vmx->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
 	} else {
 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
 	}
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-		vmcs12->vm_entry_intr_info_field);
-	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-		vmcs12->vm_entry_exception_error_code);
-	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-		vmcs12->vm_entry_instruction_len);
-	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-		vmcs12->guest_interruptibility_info);
+	if (vmx->nested.nested_run_pending) {
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     vmcs12->vm_entry_intr_info_field);
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+			     vmcs12->vm_entry_exception_error_code);
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmcs12->vm_entry_instruction_len);
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+			     vmcs12->guest_interruptibility_info);
+	} else {
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+	}
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
@@ -10026,16 +10031,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 			~VM_ENTRY_IA32E_MODE) |
 		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
+	if (vmx->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
 		vcpu->arch.pat = vmcs12->guest_ia32_pat;
-	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
-
+	}
 
 	set_cr4_guest_host_mask(vmx);
 
-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+	if (vmx->nested.nested_run_pending &&
+	    vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
 		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
 
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -10073,7 +10080,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		nested_ept_init_mmu_context(vcpu);
 	}
 
-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+	if (vmx->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
 		vcpu->arch.efer = vmcs12->guest_ia32_efer;
 	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
 		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
@@ -10448,21 +10456,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 }
 
 /*
- * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
- * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
- * and this function updates it to reflect the changes to the guest state while
- * L2 was running (and perhaps made some exits which were handled directly by L0
- * without going back to L1), and to reflect the exit reason.
- * Note that we do not have to copy here all VMCS fields, just those that
- * could have changed by the L2 guest or the exit - i.e., the guest-state and
- * exit-information fields only. Other fields are modified by L1 with VMWRITE,
- * which already writes to vmcs12 directly.
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
  */
-static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-			   u32 exit_reason, u32 exit_intr_info,
-			   unsigned long exit_qualification)
+static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
-	/* update guest state fields: */
 	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
 	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
 
@@ -10568,6 +10568,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 	if (nested_cpu_has_xsaves(vmcs12))
 		vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+			   u32 exit_reason, u32 exit_intr_info,
+			   unsigned long exit_qualification)
+{
+	/* update guest state fields: */
+	sync_vmcs12(vcpu, vmcs12);
 
 	/* update exit information fields: */
 
-- 
2.8.0.rc3.226.g39d4020



* [PATCH 2/8] kvm: nVMX: Refactor handle_vmon()
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
  2016-11-30 20:03 ` [PATCH 1/8] kvm: nVMX: Prepare for checkpointing L2 state Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-11-30 20:03 ` [PATCH 3/8] kvm: nVMX: Refactor handle_vmptrld() Jim Mattson
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

Handle_vmon is split into two parts: the part that handles the VMXON
instruction, and the part that modifies the vcpu state to transition
from legacy mode to VMX operation. The latter will be used when
restoring the checkpointed state of a vCPU that was in VMX operation
when a snapshot was taken.

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/kvm/vmx.c | 91 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 51 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 22c4a12..0169120 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6974,6 +6974,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
 	return 0;
 }
 
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs *shadow_vmcs;
+
+	if (cpu_has_vmx_msr_bitmap()) {
+		vmx->nested.msr_bitmap =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+		if (!vmx->nested.msr_bitmap)
+			goto out_msr_bitmap;
+	}
+
+	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+	if (!vmx->nested.cached_vmcs12)
+		goto out_cached_vmcs12;
+
+	if (enable_shadow_vmcs) {
+		shadow_vmcs = alloc_vmcs();
+		if (!shadow_vmcs)
+			goto out_shadow_vmcs;
+		/* mark vmcs as shadow */
+		shadow_vmcs->revision_id |= (1u << 31);
+		/* init shadow vmcs */
+		vmcs_clear(shadow_vmcs);
+		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
+	}
+
+	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+	vmx->nested.vmcs02_num = 0;
+
+	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL_PINNED);
+	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
+	vmx->nested.vmxon = true;
+	return 0;
+
+out_shadow_vmcs:
+	kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+	free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+	return -ENOMEM;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -6984,9 +7031,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
  */
 static int handle_vmon(struct kvm_vcpu *vcpu)
 {
+	int ret;
 	struct kvm_segment cs;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct vmcs *shadow_vmcs;
 	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
 		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
@@ -7028,49 +7075,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (cpu_has_vmx_msr_bitmap()) {
-		vmx->nested.msr_bitmap =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-		if (!vmx->nested.msr_bitmap)
-			goto out_msr_bitmap;
-	}
-
-	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
-	if (!vmx->nested.cached_vmcs12)
-		goto out_cached_vmcs12;
-
-	if (enable_shadow_vmcs) {
-		shadow_vmcs = alloc_vmcs();
-		if (!shadow_vmcs)
-			goto out_shadow_vmcs;
-		/* mark vmcs as shadow */
-		shadow_vmcs->revision_id |= (1u << 31);
-		/* init shadow vmcs */
-		vmcs_clear(shadow_vmcs);
-		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
-	}
-
-	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
-	vmx->nested.vmcs02_num = 0;
-
-	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL_PINNED);
-	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
-
-	vmx->nested.vmxon = true;
+	ret = enter_vmx_operation(vcpu);
+	if (ret)
+		return ret;
 
 	skip_emulated_instruction(vcpu);
 	nested_vmx_succeed(vcpu);
 	return 1;
-
-out_shadow_vmcs:
-	kfree(vmx->nested.cached_vmcs12);
-
-out_cached_vmcs12:
-	free_page((unsigned long)vmx->nested.msr_bitmap);
-
-out_msr_bitmap:
-	return -ENOMEM;
 }
 
 /*
-- 
2.8.0.rc3.226.g39d4020



* [PATCH 3/8] kvm: nVMX: Refactor handle_vmptrld()
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
  2016-11-30 20:03 ` [PATCH 1/8] kvm: nVMX: Prepare for checkpointing L2 state Jim Mattson
  2016-11-30 20:03 ` [PATCH 2/8] kvm: nVMX: Refactor handle_vmon() Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-11-30 20:03 ` [PATCH 4/8] kvm: nVMX: Refactor nested_get_vmcs12_pages() Jim Mattson
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

Handle_vmptrld is split into two parts: the part that handles the
VMPTRLD instruction, and the part that establishes the current VMCS
pointer.  The latter will be used when restoring the checkpointed state
of a vCPU that had a valid VMCS pointer when a snapshot was taken.

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/kvm/vmx.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0169120..14c3529 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7534,6 +7534,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+	vmx->nested.current_vmptr = vmptr;
+	if (enable_shadow_vmcs) {
+		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+			      SECONDARY_EXEC_SHADOW_VMCS);
+		vmcs_write64(VMCS_LINK_POINTER,
+			     __pa(vmx->vmcs01.shadow_vmcs));
+		vmx->nested.sync_shadow_vmcs = true;
+	}
+}
+
 /* Emulate the VMPTRLD instruction */
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
@@ -7566,7 +7578,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		}
 
 		nested_release_vmcs12(vmx);
-		vmx->nested.current_vmptr = vmptr;
 		vmx->nested.current_vmcs12 = new_vmcs12;
 		vmx->nested.current_vmcs12_page = page;
 		/*
@@ -7575,14 +7586,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		 */
 		memcpy(vmx->nested.cached_vmcs12,
 		       vmx->nested.current_vmcs12, VMCS12_SIZE);
-
-		if (enable_shadow_vmcs) {
-			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-				      SECONDARY_EXEC_SHADOW_VMCS);
-			vmcs_write64(VMCS_LINK_POINTER,
-				     __pa(vmx->vmcs01.shadow_vmcs));
-			vmx->nested.sync_shadow_vmcs = true;
-		}
+		set_current_vmptr(vmx, vmptr);
 	}
 
 	nested_vmx_succeed(vcpu);
-- 
2.8.0.rc3.226.g39d4020



* [PATCH 4/8] kvm: nVMX: Refactor nested_get_vmcs12_pages()
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
                   ` (2 preceding siblings ...)
  2016-11-30 20:03 ` [PATCH 3/8] kvm: nVMX: Refactor handle_vmptrld() Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-11-30 20:03 ` [PATCH 5/8] kvm: nVMX: Split VMCS checks from nested_vmx_run() Jim Mattson
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

Perform the checks on vmcs12 state early, but defer the gpa->hpa lookups
until after prepare_vmcs02. Later, when we restore the checkpointed
state of a vCPU in guest mode, the gpa->hpa lookups will not be
possible at the time the restore is performed.

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/kvm/vmx.c | 137 ++++++++++++++++++++++++++---------------------------
 1 file changed, 68 insertions(+), 69 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 14c3529..403c0f9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9414,17 +9414,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 		kvm_inject_page_fault(vcpu, fault);
 }
 
-static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+					       struct vmcs12 *vmcs12);
+
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 					struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+	u64 hpa;
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
-		    vmcs12->apic_access_addr >> maxphyaddr)
-			return false;
-
 		/*
 		 * Translate L1 physical address to host physical
 		 * address for vmcs02. Keep the page pinned, so this
@@ -9435,59 +9434,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 			nested_release_page(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page =
 			nested_get_page(vcpu, vmcs12->apic_access_addr);
+		/*
+		 * If translation failed, no matter: This feature asks
+		 * to exit when accessing the given address, and if it
+		 * can never be accessed, this feature won't do
+		 * anything anyway.
+		 */
+		if (vmx->nested.apic_access_page) {
+			hpa = page_to_phys(vmx->nested.apic_access_page);
+			vmcs_write64(APIC_ACCESS_ADDR, hpa);
+		} else {
+			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+		}
+	} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+		   cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
+		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+			      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+		kvm_vcpu_reload_apic_access_page(vcpu);
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
-		    vmcs12->virtual_apic_page_addr >> maxphyaddr)
-			return false;
-
 		if (vmx->nested.virtual_apic_page) /* shouldn't happen */
 			nested_release_page(vmx->nested.virtual_apic_page);
 		vmx->nested.virtual_apic_page =
 			nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
 
 		/*
-		 * Failing the vm entry is _not_ what the processor does
-		 * but it's basically the only possibility we have.
-		 * We could still enter the guest if CR8 load exits are
-		 * enabled, CR8 store exits are enabled, and virtualize APIC
-		 * access is disabled; in this case the processor would never
-		 * use the TPR shadow and we could simply clear the bit from
-		 * the execution control.  But such a configuration is useless,
-		 * so let's keep the code simple.
+		 * If translation failed, VM entry will fail because
+		 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
+		 * Failing the vm entry is _not_ what the processor
+		 * does but it's basically the only possibility we
+		 * have.  We could still enter the guest if CR8 load
+		 * exits are enabled, CR8 store exits are enabled, and
+		 * virtualize APIC access is disabled; in this case
+		 * the processor would never use the TPR shadow and we
+		 * could simply clear the bit from the execution
+		 * control.  But such a configuration is useless, so
+		 * let's keep the code simple.
 		 */
-		if (!vmx->nested.virtual_apic_page)
-			return false;
+		if (vmx->nested.virtual_apic_page) {
+			hpa = page_to_phys(vmx->nested.virtual_apic_page);
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+		}
 	}
 
 	if (nested_cpu_has_posted_intr(vmcs12)) {
-		if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
-		    vmcs12->posted_intr_desc_addr >> maxphyaddr)
-			return false;
-
 		if (vmx->nested.pi_desc_page) { /* shouldn't happen */
 			kunmap(vmx->nested.pi_desc_page);
 			nested_release_page(vmx->nested.pi_desc_page);
 		}
 		vmx->nested.pi_desc_page =
 			nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
-		if (!vmx->nested.pi_desc_page)
-			return false;
-
 		vmx->nested.pi_desc =
 			(struct pi_desc *)kmap(vmx->nested.pi_desc_page);
 		if (!vmx->nested.pi_desc) {
 			nested_release_page_clean(vmx->nested.pi_desc_page);
-			return false;
+			return;
 		}
 		vmx->nested.pi_desc =
 			(struct pi_desc *)((void *)vmx->nested.pi_desc +
 			(unsigned long)(vmcs12->posted_intr_desc_addr &
 			(PAGE_SIZE - 1)));
+		vmcs_write64(POSTED_INTR_DESC_ADDR,
+			page_to_phys(vmx->nested.pi_desc_page) +
+			(unsigned long)(vmcs12->posted_intr_desc_addr &
+			(PAGE_SIZE - 1)));
 	}
-
-	return true;
+	if (cpu_has_vmx_msr_bitmap() &&
+	    nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
+	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+		;
+	else
+		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+				CPU_BASED_USE_MSR_BITMAPS);
 }
 
 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
@@ -9892,12 +9912,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
 		vmx->nested.pi_pending = false;
 		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
-		vmcs_write64(POSTED_INTR_DESC_ADDR,
-			page_to_phys(vmx->nested.pi_desc_page) +
-			(unsigned long)(vmcs12->posted_intr_desc_addr &
-			(PAGE_SIZE - 1)));
-	} else
+	} else {
 		exec_control &= ~PIN_BASED_POSTED_INTR;
+	}
 
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
@@ -9942,26 +9959,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
 			exec_control |= vmcs12->secondary_vm_exec_control;
 
-		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
-			/*
-			 * If translation failed, no matter: This feature asks
-			 * to exit when accessing the given address, and if it
-			 * can never be accessed, this feature won't do
-			 * anything anyway.
-			 */
-			if (!vmx->nested.apic_access_page)
-				exec_control &=
-				  ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-			else
-				vmcs_write64(APIC_ACCESS_ADDR,
-				  page_to_phys(vmx->nested.apic_access_page));
-		} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
-			    cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
-			exec_control |=
-				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-			kvm_vcpu_reload_apic_access_page(vcpu);
-		}
-
 		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
 			vmcs_write64(EOI_EXIT_BITMAP0,
 				vmcs12->eoi_exit_bitmap0);
@@ -9975,6 +9972,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				vmcs12->guest_intr_status);
 		}
 
+		/*
+		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
+		 * nested_get_vmcs12_pages will either fix it up or
+		 * remove the VM execution control.
+		 */
+		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+			vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
 
@@ -10002,19 +10007,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	exec_control &= ~CPU_BASED_TPR_SHADOW;
 	exec_control |= vmcs12->cpu_based_vm_exec_control;
 
+	/*
+	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+	 * nested_get_vmcs12_pages can't fix it up, the illegal value
+	 * will result in a VM entry failure.
+	 */
 	if (exec_control & CPU_BASED_TPR_SHADOW) {
-		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-				page_to_phys(vmx->nested.virtual_apic_page));
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
 	}
 
-	if (cpu_has_vmx_msr_bitmap() &&
-	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
-	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
-		; /* MSR_BITMAP will be set by following vmx_set_efer. */
-	else
-		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
-
 	/*
 	 * Merging of IO bitmap not currently supported.
 	 * Rather, exit every time.
@@ -10186,11 +10188,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return 1;
 	}
 
-	if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
-	}
-
 	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
@@ -10311,6 +10308,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	prepare_vmcs02(vcpu, vmcs12);
 
+	nested_get_vmcs12_pages(vcpu, vmcs12);
+
 	msr_entry_idx = nested_vmx_load_msr(vcpu,
 					    vmcs12->vm_entry_msr_load_addr,
 					    vmcs12->vm_entry_msr_load_count);
-- 
2.8.0.rc3.226.g39d4020



* [PATCH 5/8] kvm: nVMX: Split VMCS checks from nested_vmx_run()
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
                   ` (3 preceding siblings ...)
  2016-11-30 20:03 ` [PATCH 4/8] kvm: nVMX: Refactor nested_get_vmcs12_pages() Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-11-30 20:03 ` [PATCH 6/8] kvm: nVMX: Refactor nested_vmx_run() Jim Mattson
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

The checks performed on the contents of the vmcs12 are extracted from
nested_vmx_run so that they can be used to validate a vmcs12 that has
been restored from a checkpoint.

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/kvm/vmx.c | 166 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 88 insertions(+), 78 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 403c0f9..8b12461 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10142,66 +10142,22 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
 
-/*
- * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
- * for running an L2 nested guest.
- */
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
-	struct vmcs12 *vmcs12;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int cpu;
-	struct loaded_vmcs *vmcs02;
-	bool ia32e;
-	u32 msr_entry_idx;
-
-	if (!nested_vmx_check_permission(vcpu) ||
-	    !nested_vmx_check_vmcs12(vcpu))
-		return 1;
-
-	skip_emulated_instruction(vcpu);
-	vmcs12 = get_vmcs12(vcpu);
-
-	if (enable_shadow_vmcs)
-		copy_shadow_to_vmcs12(vmx);
-
-	/*
-	 * The nested entry process starts with enforcing various prerequisites
-	 * on vmcs12 as required by the Intel SDM, and act appropriately when
-	 * they fail: As the SDM explains, some conditions should cause the
-	 * instruction to fail, while others will cause the instruction to seem
-	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
-	 * To speed up the normal (success) code path, we should avoid checking
-	 * for misconfigurations which will anyway be caught by the processor
-	 * when using the merged vmcs02.
-	 */
-	if (vmcs12->launch_state == launch) {
-		nested_vmx_failValid(vcpu,
-			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
-			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
-		return 1;
-	}
 
 	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
-	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
-		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
-	}
+	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
-		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
-	}
+	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-	if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
-		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
-	}
+	if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
-	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
-		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
-	}
+	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
 	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
 				vmx->nested.nested_vmx_true_procbased_ctls_low,
@@ -10218,27 +10174,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	    !vmx_control_verify(vmcs12->vm_entry_controls,
 				vmx->nested.nested_vmx_true_entry_ctls_low,
 				vmx->nested.nested_vmx_entry_ctls_high))
-	{
-		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
-	}
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
 	if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
-	    ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
-		nested_vmx_failValid(vcpu,
-			VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
-		return 1;
-	}
+	    ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
+		return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+
+	return 0;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+				  u32 *exit_qual)
+{
+	bool ia32e;
+
+	*exit_qual = ENTRY_FAIL_DEFAULT;
 
 	if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
-	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
-		nested_vmx_entry_failure(vcpu, vmcs12,
-			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
 		return 1;
-	}
-	if (vmcs12->vmcs_link_pointer != -1ull) {
-		nested_vmx_entry_failure(vcpu, vmcs12,
-			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+
+	if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
+	    vmcs12->vmcs_link_pointer != -1ull) {
+		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
 		return 1;
 	}
 
@@ -10251,16 +10209,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
 	 *   CR0.PG) is 1.
 	 */
-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
+	if (to_vmx(vcpu)->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
 		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
 		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
 		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
 		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
-		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
-			nested_vmx_entry_failure(vcpu, vmcs12,
-				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
 			return 1;
-		}
 	}
 
 	/*
@@ -10274,11 +10230,65 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
 		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
 		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
-		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
-			nested_vmx_entry_failure(vcpu, vmcs12,
-				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
 			return 1;
-		}
+	}
+
+	return 0;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+	struct vmcs12 *vmcs12;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+	struct loaded_vmcs *vmcs02;
+	u32 msr_entry_idx;
+	u32 exit_qual;
+	int ret;
+
+	if (!nested_vmx_check_permission(vcpu) ||
+	    !nested_vmx_check_vmcs12(vcpu))
+		return 1;
+
+	skip_emulated_instruction(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+
+	if (enable_shadow_vmcs)
+		copy_shadow_to_vmcs12(vmx);
+
+	/*
+	 * The nested entry process starts with enforcing various prerequisites
+	 * on vmcs12 as required by the Intel SDM, and act appropriately when
+	 * they fail: As the SDM explains, some conditions should cause the
+	 * instruction to fail, while others will cause the instruction to seem
+	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
+	 * To speed up the normal (success) code path, we should avoid checking
+	 * for misconfigurations which will anyway be caught by the processor
+	 * when using the merged vmcs02.
+	 */
+	if (vmcs12->launch_state == launch) {
+		nested_vmx_failValid(vcpu,
+			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+		return 1;
+	}
+
+	ret = check_vmentry_prereqs(vcpu, vmcs12);
+	if (ret) {
+		nested_vmx_failValid(vcpu, ret);
+		return 1;
+	}
+
+	ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
+	if (ret) {
+		nested_vmx_entry_failure(vcpu, vmcs12,
+					 EXIT_REASON_INVALID_STATE, exit_qual);
+		return 1;
 	}
 
 	/*
-- 
2.8.0.rc3.226.g39d4020



* [PATCH 6/8] kvm: nVMX: Refactor nested_vmx_run()
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
                   ` (4 preceding siblings ...)
  2016-11-30 20:03 ` [PATCH 5/8] kvm: nVMX: Split VMCS checks from nested_vmx_run() Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-11-30 20:03 ` [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE Jim Mattson
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

Nested_vmx_run is split into two parts: the part that handles the
VMLAUNCH/VMRESUME instruction, and the part that modifies the vcpu state
to transition from VMX root mode to VMX non-root mode. The latter will
be used when restoring the checkpointed state of a vCPU that was in VMX
operation when a snapshot was taken.

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/kvm/vmx.c | 98 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 55 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8b12461..9f0c747 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10237,6 +10237,58 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	return 0;
 }
 
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct loaded_vmcs *vmcs02;
+	int cpu;
+	u32 msr_entry_idx;
+
+	vmcs02 = nested_get_current_vmcs02(vmx);
+	if (!vmcs02)
+		return -ENOMEM;
+
+	enter_guest_mode(vcpu);
+
+	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = vmcs02;
+	vmx_vcpu_put(vcpu);
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+
+	vmx_segment_cache_clear(vmx);
+
+	prepare_vmcs02(vcpu, vmcs12);
+
+	nested_get_vmcs12_pages(vcpu, vmcs12);
+
+	msr_entry_idx = nested_vmx_load_msr(vcpu,
+					    vmcs12->vm_entry_msr_load_addr,
+					    vmcs12->vm_entry_msr_load_count);
+	if (msr_entry_idx) {
+		leave_guest_mode(vcpu);
+		vmx_load_vmcs01(vcpu);
+		nested_vmx_entry_failure(vcpu, vmcs12,
+				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
+		return 1;
+	}
+
+	vmcs12->launch_state = 1;
+
+	/*
+	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+	 * returned as far as L1 is concerned. It will only return (and set
+	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
+	 */
+	return 0;
+}
+
 /*
  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
  * for running an L2 nested guest.
@@ -10245,9 +10297,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 {
 	struct vmcs12 *vmcs12;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int cpu;
-	struct loaded_vmcs *vmcs02;
-	u32 msr_entry_idx;
 	u32 exit_qual;
 	int ret;
 
@@ -10296,54 +10345,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * the nested entry.
 	 */
 
-	vmcs02 = nested_get_current_vmcs02(vmx);
-	if (!vmcs02)
-		return -ENOMEM;
-
-	enter_guest_mode(vcpu);
-
 	vmx->nested.nested_run_pending = 1;
 
-	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
-		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-
-	cpu = get_cpu();
-	vmx->loaded_vmcs = vmcs02;
-	vmx_vcpu_put(vcpu);
-	vmx_vcpu_load(vcpu, cpu);
-	vcpu->cpu = cpu;
-	put_cpu();
-
-	vmx_segment_cache_clear(vmx);
-
-	prepare_vmcs02(vcpu, vmcs12);
-
-	nested_get_vmcs12_pages(vcpu, vmcs12);
-
-	msr_entry_idx = nested_vmx_load_msr(vcpu,
-					    vmcs12->vm_entry_msr_load_addr,
-					    vmcs12->vm_entry_msr_load_count);
-	if (msr_entry_idx) {
-		leave_guest_mode(vcpu);
-		vmx_load_vmcs01(vcpu);
-		nested_vmx_entry_failure(vcpu, vmcs12,
-				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
-		return 1;
-	}
-
-	vmcs12->launch_state = 1;
+	ret = enter_vmx_non_root_mode(vcpu);
+	if (ret)
+		return ret;
 
 	if (unlikely(vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)) {
 		vmx->nested.nested_run_pending = 0;
 		return kvm_vcpu_halt(vcpu);
 	}
 
-	/*
-	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
-	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
-	 * returned as far as L1 is concerned. It will only return (and set
-	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
-	 */
 	return 1;
 }
 
-- 
2.8.0.rc3.226.g39d4020



* [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
                   ` (5 preceding siblings ...)
  2016-11-30 20:03 ` [PATCH 6/8] kvm: nVMX: Refactor nested_vmx_run() Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-12-09 15:26   ` Paolo Bonzini
  2016-11-30 20:03 ` [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state Jim Mattson
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

With this capability, there are two new vcpu ioctls:
KVM_GET_VMX_STATE and KVM_SET_VMX_STATE. These can be used
for saving and restoring a VM that is in VMX operation.
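
A hedged sketch of the sizing handshake from the userspace side
(vcpu_fd is a placeholder): because the kernel writes the required
size back into data_size before failing with E2BIG, userspace can
probe with data_size = 0 and retry with a large-enough buffer:

	/* needs <sys/ioctl.h>, <linux/kvm.h>, <stdlib.h>, <errno.h>, <err.h> */
	struct kvm_vmx_state hdr = { .data_size = 0 };
	struct kvm_vmx_state *state;

	if (ioctl(vcpu_fd, KVM_GET_VMX_STATE, &hdr) < 0 && errno != E2BIG)
		err(1, "KVM_GET_VMX_STATE");
	/* hdr.data_size now holds the size the kernel needs. */
	state = calloc(1, sizeof(*state) + hdr.data_size);
	state->data_size = hdr.data_size;
	if (ioctl(vcpu_fd, KVM_GET_VMX_STATE, state) < 0)
		err(1, "KVM_GET_VMX_STATE");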

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 Documentation/virtual/kvm/api.txt |  44 ++++++++++++
 arch/x86/include/asm/kvm_host.h   |   5 ++
 arch/x86/include/uapi/asm/kvm.h   |  12 ++++
 arch/x86/kvm/vmx.c                | 138 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c                |  19 ++++++
 include/uapi/linux/kvm.h          |   4 ++
 6 files changed, 222 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6bbceb9..8694eb9 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3198,6 +3198,50 @@ struct kvm_reinject_control {
 pit_reinject = 0 (!reinject mode) is recommended, unless running an old
 operating system that uses the PIT for timing (e.g. Linux 2.4.x).
 
+4.99 KVM_GET_VMX_STATE
+
+Capability: KVM_CAP_VMX_STATE
+Architectures: x86/vmx
+Type: vcpu ioctl
+Parameters: struct kvm_vmx_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG:     the amount of state data exceeds the data_size specified
+             by the user (the required size is written back into data_size).
+
+The maximum data size is currently 8192.
+
+struct kvm_vmx_state {
+	__u64 vmxon_ptr;
+	__u64 current_vmptr;
+	__u32 flags;
+	__u32 data_size;
+	__u8 data[0];
+};
+
+This ioctl copies the vcpu's kvm_vmx_state struct from the kernel to
+userspace.
+
+
+4.100 KVM_SET_VMX_STATE
+
+Capability: KVM_CAP_VMX_STATE
+Architectures: x86/vmx
+Type: vcpu ioctl
+Parameters: struct kvm_vmx_state (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_vmx_state {
+	__u64 vmxon_ptr;
+	__u64 current_vmptr;
+	__u32 flags;
+	__u32 data_size;
+	__u8 data[0];
+};
+
+This copies the vcpu's kvm_vmx_state struct from userspace to the
+kernel.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdde807..d6be6f1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1020,6 +1020,11 @@ struct kvm_x86_ops {
 	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+	int (*get_vmx_state)(struct kvm_vcpu *vcpu,
+			     struct kvm_vmx_state __user *user_vmx_state);
+	int (*set_vmx_state)(struct kvm_vcpu *vcpu,
+			     struct kvm_vmx_state __user *user_vmx_state);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 739c0c5..5aaf8bb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -357,4 +357,16 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED	(1 << 1)
 
+#define KVM_VMX_STATE_GUEST_MODE	0x00000001
+#define KVM_VMX_STATE_RUN_PENDING	0x00000002
+
+/* for KVM_CAP_VMX_STATE */
+struct kvm_vmx_state {
+	__u64 vmxon_ptr;
+	__u64 current_vmptr;
+	__u32 flags;
+	__u32 data_size;
+	__u8 data[0];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9f0c747..d75c183 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11275,6 +11275,141 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 			~FEATURE_CONTROL_LMCE;
 }
 
+static int get_vmcs_cache(struct kvm_vcpu *vcpu,
+			  struct kvm_vmx_state __user *user_vmx_state,
+			  struct kvm_vmx_state vmx_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	/*
+	 * When running L2, the authoritative vmcs12 state is in the
+	 * vmcs02. When running L1, the authoritative vmcs12 state is
+	 * in the shadow vmcs linked to vmcs01, unless
+	 * sync_shadow_vmcs is set, in which case, the authoritative
+	 * vmcs12 state is in the vmcs12 already.
+	 */
+	if (is_guest_mode(vcpu))
+		sync_vmcs12(vcpu, vmcs12);
+	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+		copy_shadow_to_vmcs12(vmx);
+	if (copy_to_user(user_vmx_state->data, vmcs12, VMCS12_SIZE))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int get_vmx_state(struct kvm_vcpu *vcpu,
+			 struct kvm_vmx_state __user *user_vmx_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_vmx_state vmx_state = {
+		.vmxon_ptr = -1ull,
+		.current_vmptr = -1ull,
+		.flags = 0,
+		.data_size = 0
+	};
+	u32 user_data_size;
+
+	if (copy_from_user(&user_data_size, &user_vmx_state->data_size,
+			   sizeof(user_data_size)))
+		return -EFAULT;
+
+	if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {
+		vmx_state.vmxon_ptr = vmx->nested.vmxon_ptr;
+		vmx_state.current_vmptr = vmx->nested.current_vmptr;
+		if (vmx_state.current_vmptr != -1ull)
+			vmx_state.data_size += VMCS12_SIZE;
+		if (is_guest_mode(vcpu)) {
+			vmx_state.flags |= KVM_VMX_STATE_GUEST_MODE;
+			if (vmx->nested.nested_run_pending)
+				vmx_state.flags |= KVM_VMX_STATE_RUN_PENDING;
+		}
+	}
+
+	if (copy_to_user(user_vmx_state, &vmx_state, sizeof(vmx_state)))
+		return -EFAULT;
+
+	if (user_data_size < vmx_state.data_size)
+		return -E2BIG;
+
+	if (vmx_state.data_size > 0)
+		return get_vmcs_cache(vcpu, user_vmx_state, vmx_state);
+
+	return 0;
+}
+
+static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
+}
+
+static int set_vmcs_cache(struct kvm_vcpu *vcpu,
+			  struct kvm_vmx_state __user *user_vmx_state,
+			  struct kvm_vmx_state vmx_state)
+
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 exit_qual;
+
+	if (vmx_state.data_size < VMCS12_SIZE ||
+	    vmx_state.current_vmptr == vmx_state.vmxon_ptr ||
+	    !page_address_valid(vcpu, vmx_state.current_vmptr))
+		return -EINVAL;
+	if (copy_from_user(vmcs12, user_vmx_state->data, VMCS12_SIZE))
+		return -EFAULT;
+	if (vmcs12->revision_id != VMCS12_REVISION)
+		return -EINVAL;
+	set_current_vmptr(vmx, vmx_state.current_vmptr);
+	if (enable_shadow_vmcs)
+		vmx->nested.sync_shadow_vmcs = true;
+	if (!(vmx_state.flags & KVM_VMX_STATE_GUEST_MODE))
+		return 0;
+
+	if (check_vmentry_prereqs(vcpu, vmcs12) ||
+	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+		return -EINVAL;
+	return enter_vmx_non_root_mode(vcpu);
+}
+
+static int set_vmx_state(struct kvm_vcpu *vcpu,
+			 struct kvm_vmx_state __user *user_vmx_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_vmx_state vmx_state;
+	int ret;
+
+	if (copy_from_user(&vmx_state, user_vmx_state, sizeof(vmx_state)))
+		return -EFAULT;
+
+	if (vmx_state.flags &
+	    ~(KVM_VMX_STATE_RUN_PENDING | KVM_VMX_STATE_GUEST_MODE))
+		return -EINVAL;
+
+	if (!nested_vmx_allowed(vcpu))
+		return vmx_state.vmxon_ptr == -1ull ? 0 : -EINVAL;
+
+	vmx_leave_nested(vcpu);
+
+	vmx->nested.nested_run_pending =
+		!!(vmx_state.flags & KVM_VMX_STATE_RUN_PENDING);
+	if (vmx_state.vmxon_ptr == -1ull)
+		return 0;
+
+	if (!page_address_valid(vcpu, vmx_state.vmxon_ptr))
+		return -EINVAL;
+	vmx->nested.vmxon_ptr = vmx_state.vmxon_ptr;
+	ret = enter_vmx_operation(vcpu);
+	if (ret)
+		return ret;
+
+	if (vmx_state.current_vmptr == -1ull)
+		return 0;
+
+	return set_vmcs_cache(vcpu, user_vmx_state, vmx_state);
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -11403,6 +11538,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 #endif
 
 	.setup_mce = vmx_setup_mce,
+
+	.get_vmx_state = get_vmx_state,
+	.set_vmx_state = set_vmx_state,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04c5d96..e249215 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2685,6 +2685,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X2APIC_API:
 		r = KVM_X2APIC_API_VALID_FLAGS;
 		break;
+	case KVM_CAP_VMX_STATE:
+		r = !!kvm_x86_ops->get_vmx_state;
+		break;
 	default:
 		r = 0;
 		break;
@@ -3585,6 +3588,22 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
+	case KVM_GET_VMX_STATE: {
+		struct kvm_vmx_state __user *user_vmx_state = argp;
+
+		r = -EINVAL;
+		if (kvm_x86_ops->get_vmx_state)
+			r = kvm_x86_ops->get_vmx_state(vcpu, user_vmx_state);
+		goto out;
+	}
+	case KVM_SET_VMX_STATE: {
+		struct kvm_vmx_state __user *user_vmx_state = argp;
+
+		r = -EINVAL;
+		if (kvm_x86_ops->set_vmx_state)
+			r = kvm_x86_ops->set_vmx_state(vcpu, user_vmx_state);
+		goto out;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4ee67cb..ba3c586 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -870,6 +870,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
 #define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_VMX_STATE 133
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1280,6 +1281,9 @@ struct kvm_s390_ucas_mapping {
 #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
 /* Available with KVM_CAP_X86_SMM */
 #define KVM_SMI                   _IO(KVMIO,   0xb7)
+/* Available with KVM_CAP_VMX_STATE */
+#define KVM_GET_VMX_STATE         _IOWR(KVMIO, 0xb8, struct kvm_vmx_state)
+#define KVM_SET_VMX_STATE         _IOW(KVMIO,  0xb9, struct kvm_vmx_state)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-- 
2.8.0.rc3.226.g39d4020



* [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
                   ` (6 preceding siblings ...)
  2016-11-30 20:03 ` [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE Jim Mattson
@ 2016-11-30 20:03 ` Jim Mattson
  2016-12-09 15:35   ` Paolo Bonzini
  2016-12-06  9:04 ` [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state David Hildenbrand
  2017-02-08 16:24 ` Paolo Bonzini
  9 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2016-11-30 20:03 UTC (permalink / raw)
  To: kvm; +Cc: Jim Mattson

Prepare_vmcs02 needs to be able to translate some guest physical
addresses to host physical addresses, but this isn't possible until
the vcpu is running.

When entering VMX non-root operation from set_vmx_state, queue a request
to perform the gpa->hpa lookups at the next vcpu_run.
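
The deferral uses the standard vcpu request pattern; distilled from
the diff below, the two halves are:

	/* set_vmcs_cache(), ioctl context: queue the deferred work. */
	kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	/* vcpu_enter_guest(), vcpu thread: service it before VM-entry. */
	if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
		kvm_x86_ops->get_vmcs12_pages(vcpu);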

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/vmx.c              | 21 ++++++++++++++++-----
 arch/x86/kvm/x86.c              |  2 ++
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d6be6f1..0f54387 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -73,6 +73,7 @@
 #define KVM_REQ_HV_RESET          28
 #define KVM_REQ_HV_EXIT           29
 #define KVM_REQ_HV_STIMER         30
+#define KVM_REQ_GET_VMCS12_PAGES  31
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1025,6 +1026,7 @@ struct kvm_x86_ops {
 			     struct kvm_vmx_state __user *user_vmx_state);
 	int (*set_vmx_state)(struct kvm_vcpu *vcpu,
 			     struct kvm_vmx_state __user *user_vmx_state);
+	void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d75c183..5c459ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9417,10 +9417,10 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 					       struct vmcs12 *vmcs12);
 
-static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
-					struct vmcs12 *vmcs12)
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	u64 hpa;
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -10265,8 +10265,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
 
 	prepare_vmcs02(vcpu, vmcs12);
 
-	nested_get_vmcs12_pages(vcpu, vmcs12);
-
 	msr_entry_idx = nested_vmx_load_msr(vcpu,
 					    vmcs12->vm_entry_msr_load_addr,
 					    vmcs12->vm_entry_msr_load_count);
@@ -10351,6 +10349,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (ret)
 		return ret;
 
+	nested_get_vmcs12_pages(vcpu);
+
 	if (unlikely(vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)) {
 		vmx->nested.nested_run_pending = 0;
 		return kvm_vcpu_halt(vcpu);
@@ -11352,6 +11352,7 @@ static int set_vmcs_cache(struct kvm_vcpu *vcpu,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	u32 exit_qual;
+	int ret;
 
 	if (vmx_state.data_size < VMCS12_SIZE ||
 	    vmx_state.current_vmptr == vmx_state.vmxon_ptr ||
@@ -11370,7 +11371,16 @@ static int set_vmcs_cache(struct kvm_vcpu *vcpu,
 	if (check_vmentry_prereqs(vcpu, vmcs12) ||
 	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
 		return -EINVAL;
-	return enter_vmx_non_root_mode(vcpu);
+	ret = enter_vmx_non_root_mode(vcpu);
+	if (ret)
+		return ret;
+
+	/*
+	 * This request will result in a call to
+	 * nested_get_vmcs12_pages before the next VM-entry.
+	 */
+	kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+	return 0;
 }
 
 static int set_vmx_state(struct kvm_vcpu *vcpu,
@@ -11541,6 +11551,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.get_vmx_state = get_vmx_state,
 	.set_vmx_state = set_vmx_state,
+	.get_vmcs12_pages = nested_get_vmcs12_pages,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e249215..39c1517 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6551,6 +6551,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_immediate_exit = false;
 
 	if (vcpu->requests) {
+		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
+			kvm_x86_ops->get_vmcs12_pages(vcpu);
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
-- 
2.8.0.rc3.226.g39d4020



* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
                   ` (7 preceding siblings ...)
  2016-11-30 20:03 ` [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state Jim Mattson
@ 2016-12-06  9:04 ` David Hildenbrand
  2016-12-06 19:02   ` Jim Mattson
  2017-02-08 16:24 ` Paolo Bonzini
  9 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2016-12-06  9:04 UTC (permalink / raw)
  To: Jim Mattson, kvm

On 30.11.2016 21:03, Jim Mattson wrote:
> This series of patches introduces checkpoint and restore operations
> for the VMX state of a VM, so that VMX-capable VMs can be migrated.
>
> Two new ioctls are introduced: KVM_GET_VMX_STATE and
> KVM_SET_VMX_STATE. The VMX state that is checkpointed/restored
> consists of the VMXON region address (if any), the current VMCS
> address (if any), and the cached current VMCS contents (if any). One
> piece of implementation-specific state that is also
> checkpointed/restored is the nested_run_pending bit.
>
> On live migration, our userspace process does not have access to guest
> memory when the VM is set up on the target host, so some operations
> that require a GPA->HPA mapping are deferred until the next vcpu_run
> using a new vcpu request.

This somewhat smells like giving user space the ability to directly
control hardware virtualization. Will all security/filter actions be
performed on the passed-in data, just as when preparing the shadow
vmcs? IOW, could this interface allow user space to run anything
unfiltered using vmx?

Why exactly can't we simply copy/filter from guest memory? (You mention
that guest memory is not available when setting up the VM on the target -
why can't we defer that completely then?)



-- 

David


* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2016-12-06  9:04 ` [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state David Hildenbrand
@ 2016-12-06 19:02   ` Jim Mattson
  2016-12-09 15:28     ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2016-12-06 19:02 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: kvm

On Tue, Dec 6, 2016 at 1:04 AM, David Hildenbrand <david@redhat.com> wrote:
> On 30.11.2016 21:03, Jim Mattson wrote:
>>
>> This series of patches introduces checkpoint and restore operations
>> for the VMX state of a VM, so that VMX-capable VMs can be migrated.
>>
>> Two new ioctls are introduced: KVM_GET_VMX_STATE and
>> KVM_SET_VMX_STATE. The VMX state that is checkpointed/restored
>> consists of the VMXON region address (if any), the current VMCS
>> address (if any), and the cached current VMCS contents (if any). One
>> piece of implementation-specific state that is also
>> checkpointed/restored is the nested_run_pending bit.
>>
>> On live migration, our userspace process does not have access to guest
>> memory when the VM is set up on the target host, so some operations
>> that require a GPA->HPA mapping are deferred until the next vcpu_run
>> using a new vcpu request.
>
>
> This somewhat smells like giving user space the ability to directly
> control hardware virtualization. Will all security/filter actions be
> performed on the passed-in data, just as when preparing the shadow
> vmcs? IOW, could this interface allow user space to run anything
> unfiltered using vmx?

All of the validity checks are performed on the VMX settings provided
by userspace.
set_vmx_state() checks that the VMXON region address is valid.
Set_vmcs_cache() checks that the current VMCS address is valid and
doesn't match the VMXON region address, and it calls
check_vmentry_prereqs() and check_vmentry_postreqs() on the contents
of the VMCS cache.

> Why exactly can't we simply copy/filter from guest memory? (You mention
> that guest memory is not available when setting up the VM on the target -
> why can't we defer that completely then?)

After the L1 VM has executed VMPTRLD, the authoritative VMCS12 data is
in L0 kernel memory (the VMCS cache) rather than in L1 guest memory.
[See commit 4f2777bc9797 ("kvm: x86: nVMX: maintain internal copy of
current VMCS").] There are many reasons why userspace may want access
to the data in the vCPU's VMCS cache in addition to live migration.
For example, a userspace instruction emulator may need access to
several VMCS fields to determine how to emulate an instruction when
the vCPU is in virtualized VMX non-root mode.
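
For concreteness, the VMPTRLD emulation has done roughly the following
since that commit (a simplified sketch of the handle_vmptrld() path):

        struct vmcs12 *new_vmcs12;

        /* L0 caches the VMCS12 contents in kernel memory; from here
         * on, this copy -- not L1 guest memory -- is authoritative
         * until the next VMCLEAR or VMPTRLD.
         */
        new_vmcs12 = kmap(page);                /* L1's VMCS page */
        memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
        kunmap(page);
        vmx->nested.current_vmptr = vmptr;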

>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>
>
> --
>
> David

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2016-11-30 20:03 ` [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE Jim Mattson
@ 2016-12-09 15:26   ` Paolo Bonzini
  2016-12-09 15:55     ` David Hildenbrand
  2017-02-15 11:56     ` Paolo Bonzini
  0 siblings, 2 replies; 63+ messages in thread
From: Paolo Bonzini @ 2016-12-09 15:26 UTC (permalink / raw)
  To: Jim Mattson, kvm, David Hildenbrand



On 30/11/2016 21:03, Jim Mattson wrote:
> +#define KVM_VMX_STATE_GUEST_MODE	0x00000001
> +#define KVM_VMX_STATE_RUN_PENDING	0x00000002
> +
> +/* for KVM_CAP_VMX_STATE */
> +struct kvm_vmx_state {
> +	__u64 vmxon_ptr;
> +	__u64 current_vmptr;
> +	__u32 flags;
> +	__u32 data_size;
> +	__u8 data[0];
> +};

Let's prepare the API for nested SVM too.  Please rename it to
KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:

	/* In addition to guest mode and run pending, please
	 * add one for GIF.
	 */
	__u16 flags;
	/* 0 for VMX, 1 for SVM.  */
	__u16 format;
	/* 128 for SVM, 128 + VMCS size for VMX.  */
	__u32 size;
	/* Both structs padded to 120 bytes.  */
	union {
		/* VMXON, VMCS */
		struct kvm_vmx_state vmx;
		/* HSAVE_PA, VMCB */
		struct kvm_svm_state svm;
	}
	__u8 data[0];
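
Pulled together as one declaration (a consolidated sketch: the header
and padding are as proposed above, the sub-struct contents are only
illustrative):

        struct kvm_nested_state {
                __u16 flags;    /* guest mode, run pending, GIF */
                __u16 format;   /* 0 for VMX, 1 for SVM */
                __u32 size;     /* 128 for SVM, 128 + VMCS size for VMX */
                union {
                        struct kvm_vmx_state vmx; /* VMXON, VMCS */
                        struct kvm_svm_state svm; /* HSAVE_PA, VMCB */
                        __u8 pad[120];  /* 8-byte header + 120 = 128 */
                };
                __u8 data[0];
        };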

David, would the above make sense for s390 nested SIE too?

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2016-12-06 19:02   ` Jim Mattson
@ 2016-12-09 15:28     ` Paolo Bonzini
  2016-12-09 16:16       ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2016-12-09 15:28 UTC (permalink / raw)
  To: Jim Mattson, David Hildenbrand; +Cc: kvm



On 06/12/2016 20:02, Jim Mattson wrote:
> For example, a userspace instruction emulator may need access to
> several VMCS fields to determine how to emulate an instruction when
> the vCPU is in virtualized VMX non-root mode.

Why?  Wouldn't KVM_GET_SREGS for example provide values descriptor cache
from the active L2 guest (and thus, indirectly, from the VMCS12)?

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state
  2016-11-30 20:03 ` [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state Jim Mattson
@ 2016-12-09 15:35   ` Paolo Bonzini
  2016-12-09 16:26     ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2016-12-09 15:35 UTC (permalink / raw)
  To: Jim Mattson, kvm



On 30/11/2016 21:03, Jim Mattson wrote:
> Prepare_vmcs02 needs to be able to translate some guest physical
> addresses to host physical addresses, but this isn't possible until
> the vcpu is running.

Can you explain why in more detail?  Also, please just squash this in
patch 7.

Paolo

> When entering VMX non-root operation from set_vmx_state, queue a request
> to perform the gpa->hpa lookups at the next vcpu_run.
> 
> Signed-off-by: Jim Mattson <jmattson@google.com>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2016-12-09 15:26   ` Paolo Bonzini
@ 2016-12-09 15:55     ` David Hildenbrand
  2016-12-09 19:26       ` Paolo Bonzini
  2017-02-15 11:56     ` Paolo Bonzini
  1 sibling, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2016-12-09 15:55 UTC (permalink / raw)
  To: Paolo Bonzini, Jim Mattson, kvm, David Hildenbrand

Am 09.12.2016 um 16:26 schrieb Paolo Bonzini:
>
>
> On 30/11/2016 21:03, Jim Mattson wrote:
>> +#define KVM_VMX_STATE_GUEST_MODE	0x00000001
>> +#define KVM_VMX_STATE_RUN_PENDING	0x00000002
>> +
>> +/* for KVM_CAP_VMX_STATE */
>> +struct kvm_vmx_state {
>> +	__u64 vmxon_ptr;
>> +	__u64 current_vmptr;
>> +	__u32 flags;
>> +	__u32 data_size;
>> +	__u8 data[0];
>> +};
>
> Let's prepare the API for nested SVM too.  Please rename it to
> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
>
> 	/* In addition to guest mode and run pending, please
> 	 * add one for GIF.
> 	 */
> 	__u16 flags;
> 	/* 0 for VMX, 1 for SVM.  */
> 	__u16 format;
> 	/* 128 for SVM, 128 + VMCS size for VMX.  */
> 	__u32 size;
> 	/* Both structs padded to 120 bytes.  */
> 	union {
> 		/* VMXON, VMCS */
> 		struct kvm_vmx_state vmx;
> 		/* HSAVE_PA, VMCB */
> 		struct kvm_svm_state svm;
> 	}
> 	__u8 data[0];
>
> David, would the above make sense for s390 nested SIE too?
>

s390x doesn't have _any_ SIE state in the hardware. All g2 state is in
g1 memory and therefore migrated. So there is nothing needed at that
point for migration. It just works :)

shadowed SCBs (SIE control blocks) are simply recreated on the target
(after every VSIE execution, we write the data back into g1, so
whenever we leave the interception handler, we act according to the SIE
architecture).

I would have thought that migrating the current VMCS pointer and VMXON
pointer (+ VMX state) should also be enough for VMX, but I haven't
looked into the details of vmcs shadowing/caching yet that Jim
mentioned.

BTW, I assume we can't migrate a VM with a nested guest from Intel 
to AMD. Are there any checks in place for that? (Being new to x86, is 
it even possible at all to migrate a guest from AMD <-> Intel? I assume 
so with appropriate CPU models.)

-- 

David

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2016-12-09 15:28     ` Paolo Bonzini
@ 2016-12-09 16:16       ` Jim Mattson
  2016-12-09 19:15         ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2016-12-09 16:16 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: David Hildenbrand, kvm

At the very least, it needs to know the following to do a page walk:

a) Are the secondary processor-based VM-execution controls activated
in the primary processor-based execution controls?
b) If so, is EPT enabled?
c) If so, what is the current EPTP?

I don't believe these are available through any existing mechanism.
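
In terms of the cached VMCS12 fields, that first step is something
like this (sketch only; the field and flag names are the kernel's):

        bool ept_on = false;
        u64 eptp = 0;

        if (vmcs12->cpu_based_vm_exec_control &
            CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {        /* (a) */
                if (vmcs12->secondary_vm_exec_control &
                    SECONDARY_EXEC_ENABLE_EPT) {            /* (b) */
                        ept_on = true;
                        eptp = vmcs12->ept_pointer;         /* (c) */
                }
        }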

On Fri, Dec 9, 2016 at 7:28 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 06/12/2016 20:02, Jim Mattson wrote:
>> For example, a userspace instruction emulator may need access to
>> several VMCS fields to determine how to emulate an instruction when
>> the vCPU is in virtualized VMX non-root mode.
>
> Why?  Wouldn't KVM_GET_SREGS for example provide values descriptor cache
> from the active L2 guest (and thus, indirectly, from the VMCS12)?
>
> Thanks,
>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state
  2016-12-09 15:35   ` Paolo Bonzini
@ 2016-12-09 16:26     ` Jim Mattson
  2016-12-10  7:48       ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2016-12-09 16:26 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm

This may be an artifact of our userspace agent that isn't shared by
qemu. I can drop this from the upstream patch set, if you like.

On Fri, Dec 9, 2016 at 7:35 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 30/11/2016 21:03, Jim Mattson wrote:
>> Prepare_vmcs02 needs to be able to translate some guest physical
>> addresses to host physical addresses, but this isn't possible until
>> the vcpu is running.
>
> Can you explain why in more detail?  Also, please just squash this in
> patch 7.
>
> Paolo
>
>> When entering VMX non-root operation from set_vmx_state, queue a request
>> to perform the gpa->hpa lookups at the next vcpu_run.
>>
>> Signed-off-by: Jim Mattson <jmattson@google.com>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2016-12-09 16:16       ` Jim Mattson
@ 2016-12-09 19:15         ` Paolo Bonzini
  2017-02-15 19:30           ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2016-12-09 19:15 UTC (permalink / raw)
  To: Jim Mattson; +Cc: David Hildenbrand, kvm



On 09/12/2016 17:16, Jim Mattson wrote:
> At the very least, it needs to know the following to do a page walk:
> 
> a) Are the secondary processor-based VM-execution controls activated
> in the primary processor-based execution controls?
> b) If so, is EPT enabled?
> c) If so, what is the current EPTP?

Oh, I see; this is when MMIO is passed through from L1 to L2.  Though
there is also the KVM_TRANSLATE ioctl.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2016-12-09 15:55     ` David Hildenbrand
@ 2016-12-09 19:26       ` Paolo Bonzini
  2016-12-12 14:14         ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2016-12-09 19:26 UTC (permalink / raw)
  To: David Hildenbrand, Jim Mattson, kvm, David Hildenbrand



On 09/12/2016 16:55, David Hildenbrand wrote:
> Am 09.12.2016 um 16:26 schrieb Paolo Bonzini:
>>
>>
>> On 30/11/2016 21:03, Jim Mattson wrote:
>>> +#define KVM_VMX_STATE_GUEST_MODE    0x00000001
>>> +#define KVM_VMX_STATE_RUN_PENDING    0x00000002
>>> +
>>> +/* for KVM_CAP_VMX_STATE */
>>> +struct kvm_vmx_state {
>>> +    __u64 vmxon_ptr;
>>> +    __u64 current_vmptr;
>>> +    __u32 flags;
>>> +    __u32 data_size;
>>> +    __u8 data[0];
>>> +};
>>
>> Let's prepare the API for nested SVM too.  Please rename it to
>> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
>>
>>     /* In addition to guest mode and run pending, please
>>      * add one for GIF.
>>      */
>>     __u16 flags;
>>     /* 0 for VMX, 1 for SVM.  */
>>     __u16 format;
>>     /* 128 for SVM, 128 + VMCS size for VMX.  */
>>     __u32 size;
>>     /* Both structs padded to 120 bytes.  */
>>     union {
>>         /* VMXON, VMCS */
>>         struct kvm_vmx_state vmx;
>>         /* HSAVE_PA, VMCB */
>>         struct kvm_svm_state svm;
>>     }
>>     __u8 data[0];
>>
>> David, would the above make sense for s390 nested SIE too?
>>
> 
> s390x doesn't have _any_ SIE state in the hardware. All g2 state is in
> g1 memory and therefore migrated. So there is nothing needed at that
> point for migration. It just works :)
> 
> shadowed SCBs (SIE control blocks) are simply recreated on the target
> (after every VSIE execution, we write the data back into g1, so
> whenever we leave the interception handler, we act according to the SIE
> architecture).

If you get a userspace vmexit during vSIE, do you always exit the vSIE?
If not, the SIE operand is a hidden piece of processor state that you
need in order to restart execution of the vCPU.  This would be more or
less the same as what's needed for x86 AMD.

> BTW, I assume we can't migrate while having a nested guest from Intel
> to AMD. Are there any checks in place for that? (being new to x86, is
> it even possible at all to migrate a guest from AMD <-> Intel? I assume
> so with apropriate CPU models).

Yes, it's possible with appropriate CPU models, but VMX and SVM have
different bits in CPUID.  Therefore, such CPU models would never support
nested virtualization.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state
  2016-12-09 16:26     ` Jim Mattson
@ 2016-12-10  7:48       ` Paolo Bonzini
  0 siblings, 0 replies; 63+ messages in thread
From: Paolo Bonzini @ 2016-12-10  7:48 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm

> This may be an artifact of our userspace agent that isn't shared by
> qemu. I can drop this from the upstream patch set, if you like.

No, I think it makes sense, it just needs a better commit message.

Paolo

> On Fri, Dec 9, 2016 at 7:35 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> >
> > On 30/11/2016 21:03, Jim Mattson wrote:
> >> Prepare_vmcs02 needs to be able to translate some guest physical
> >> addresses to host physical addresses, but this isn't possible until
> >> the vcpu is running.
> >
> > Can you explain why in more detail?  Also, please just squash this in
> > patch 7.
> >
> > Paolo
> >
> >> When entering VMX non-root operation from set_vmx_state, queue a request
> >> to perform the gpa->hpa lookups at the next vcpu_run.
> >>
> >> Signed-off-by: Jim Mattson <jmattson@google.com>
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2016-12-09 19:26       ` Paolo Bonzini
@ 2016-12-12 14:14         ` David Hildenbrand
  0 siblings, 0 replies; 63+ messages in thread
From: David Hildenbrand @ 2016-12-12 14:14 UTC (permalink / raw)
  To: Paolo Bonzini, Jim Mattson, kvm, David Hildenbrand


>>
>> s390x doesn't have _any_ SIE state in the hardware. All g2 state is in
>> g1 memory and therefore migrated. So there is nothing needed at that
>> point for migration. It just works :)
>>
>> shadowed SCBs (SIE control blocks) are simply recreated on the target
>> (after every VSIE execution, we write the data back into g1, so
>> whenever we leave the interception handler, we act according to the SIE
>> architecture).
>
> If you get a userspace vmexit during vSIE, do you always exit the vSIE?
> If not, the SIE operand is a hidden piece of processor state that you
> need in order to restart execution of the vCPU.  This would be more or
> less the same as what's needed for x86 AMD.

For s390x, there are no instructions emulated for the vSIE, not even in
kernel space. So every time we leave our interception handler, the vSIE
has been fully handled and the next step would be to execute our g1 again.

Please note that (v)SIE and SIE don't share any code paths. vSIE is
really just an interception handler, emulating the SIE instruction (by
calling SIE in return).

So if we drop to user space (e.g. due to a signal), the vSIE has been
fully processed and the g2 SCB block has already been copied back to
g1. When re-entering kernel space, g1 is executed (continuing after the
SIE instruction), which will in return do a handful of checks and
execute g2 again (using SIE - intercept - vSIE).

So there is no hidden piece of processor state. Every time we drop to
user space, everything is properly stored in g1 memory and therefore
migrated. As these user space exits are really rare, we don't have to
optimize for that. And I think if that should ever be needed, this 
interface should be sufficient to extract/set hidden state.

-- 

David

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
                   ` (8 preceding siblings ...)
  2016-12-06  9:04 ` [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state David Hildenbrand
@ 2017-02-08 16:24 ` Paolo Bonzini
  2017-02-14 22:17   ` Jim Mattson
  9 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-08 16:24 UTC (permalink / raw)
  To: Jim Mattson, kvm



On 30/11/2016 21:03, Jim Mattson wrote:
> This series of patches introduces checkpoint and restore operations
> for the VMX state of a VM, so that VMX-capable VMs can be migrated.
> 
> Two new ioctls are introduced: KVM_GET_VMX_STATE and
> KVM_SET_VMX_STATE. The VMX state that is checkpointed/restored
> consists of the VMXON region address (if any), the current VMCS
> address (if any), and the cached current VMCS contents (if any). One
> piece of implementation-specific state that is also
> checkpointed/restored is the nested_run_pending bit.
> 
> On live migration, our userspace process does not have access to guest
> memory when the VM is set up on the target host, so some operations
> that require a GPA->HPA mapping are deferred until the next vcpu_run
> using a new vcpu request.

I merged patches 1-6 into kvm/queue.  Rebasing took a bit of work, so
please take a look.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-08 16:24 ` Paolo Bonzini
@ 2017-02-14 22:17   ` Jim Mattson
  2017-02-15 10:15     ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-14 22:17 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm

Yikes! Did I forget to include "kvm: nVMX: Set nested_run_pending
before prepare_vmcs02()"? Sorry. That obviates the need for
"from_vmentry," and is a little less awkward, I think. (The problem is
that kvm can exit to userspace with vmx->nested.nested_run_pending
set. If VMX state is saved at that time, then the restore code has to
behave as if "from_vmentry" is true.)  In any event, your version looks
fine, and I can always clean it up later (or not).

After commit 06750494993be18e ("kvm: nVMX: Refactor
nested_get_vmcs12_pages()"), vmx.c:9688 should read:

if (vmx->nested.virtual_apic_page) {

i.e., drop the '!'

On Wed, Feb 8, 2017 at 8:24 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 30/11/2016 21:03, Jim Mattson wrote:
>> This series of patches introduces checkpoint and restore operations
>> for the VMX state of a VM, so that VMX-capable VMs can be migrated.
>>
>> Two new ioctls are introduced: KVM_GET_VMX_STATE and
>> KVM_SET_VMX_STATE. The VMX state that is checkpointed/restored
>> consists of the VMXON region address (if any), the current VMCS
>> address (if any), and the cached current VMCS contents (if any). One
>> piece of implementation-specific state that is also
>> checkpointed/restored is the nested_run_pending bit.
>>
>> On live migration, our userspace process does not have access to guest
>> memory when the VM is set up on the target host, so some operations
>> that require a GPA->HPA mapping are deferred until the next vcpu_run
>> using a new vcpu request.
>
> I merged patches 1-6 into kvm/queue.  Rebasing took a bit of work, so
> please take a look.
>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-14 22:17   ` Jim Mattson
@ 2017-02-15 10:15     ` Paolo Bonzini
  2017-02-15 17:02       ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 10:15 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm



On 14/02/2017 23:17, Jim Mattson wrote:
> Yikes! Did I forget to include "kvm: nVMX: Set nested_run_pending
> before prepare_vmcs02()"? Sorry. That obviates the need for
> "from_vmentry," and is a little less awkward, I think. (The problem is
> that kvm can exit to userspace with vmx->nested.nested_run_pending
> set. If VMX state is saved at that time, then the restore code has to
> behave as if "from_vmentry" is true.)  In any event, your version looks
> fine, and I can always clean it up later (or not).

Looks like that, yes.  I went for "from_vmentry" because I wasn't sure
if your missing patch was just reverting this:

    commit 7af40ad37b3f097f367cbe9c0198caccce6fd83b
    Author: Jan Kiszka <jan.kiszka@siemens.com>
    Date:   Sat Jan 4 18:47:23 2014 +0100

    KVM: nVMX: Fix nested_run_pending on activity state HLT

    When we suspend the guest in HLT state, the nested run is no longer
    pending - we emulated it completely. So only set nested_run_pending
    after checking the activity state.

    Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

I guess it would be possible to reset nested_run_pending on activity
state HLT, too, but I didn't feel like mangling your patches even more.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2016-12-09 15:26   ` Paolo Bonzini
  2016-12-09 15:55     ` David Hildenbrand
@ 2017-02-15 11:56     ` Paolo Bonzini
  2017-02-15 16:06       ` Jim Mattson
  1 sibling, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 11:56 UTC (permalink / raw)
  To: Jim Mattson, kvm, David Hildenbrand



On 09/12/2016 16:26, Paolo Bonzini wrote:
> Let's prepare the API for nested SVM too.  Please rename it to
> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
> 
> 	/* In addition to guest mode and run pending, please
> 	 * add one for GIF.
> 	 */
> 	__u16 flags;
> 	/* 0 for VMX, 1 for SVM.  */
> 	__u16 format;
> 	/* 128 for SVM, 128 + VMCS size for VMX.  */
> 	__u32 size;
> 	/* Both structs padded to 120 bytes.  */
> 	union {
> 		/* VMXON, VMCS */
> 		struct kvm_vmx_state vmx;
> 		/* HSAVE_PA, VMCB */
> 		struct kvm_svm_state svm;
> 	}
> 	__u8 data[0];

Thinking again about this, do we really need to store the VMCS in the
ioctl?  What would be the issues if we just flushed it to memory and
reloaded it at KVM_REQ_GET_VMCS12_PAGES time?

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-02-15 11:56     ` Paolo Bonzini
@ 2017-02-15 16:06       ` Jim Mattson
  2017-02-15 16:14         ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-15 16:06 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, David Hildenbrand

The VMCS cache can be safely flushed to guest memory at any time.
However, I think your proposal has some unfortunate consequences:

1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
2. Guest memory (at least the cached VMCS page(s)) has to be saved
after KVM_GET_NESTED_STATE.
3. KVM_GET_NESTED_STATE is not transparent to the guest.


On Wed, Feb 15, 2017 at 3:56 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 09/12/2016 16:26, Paolo Bonzini wrote:
>> Let's prepare the API for nested SVM too.  Please rename it to
>> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
>>
>>       /* In addition to guest mode and run pending, please
>>        * add one for GIF.
>>        */
>>       __u16 flags;
>>       /* 0 for VMX, 1 for SVM.  */
>>       __u16 format;
>>       /* 128 for SVM, 128 + VMCS size for VMX.  */
>>       __u32 size;
>>       /* Both structs padded to 120 bytes.  */
>>       union {
>>               /* VMXON, VMCS */
>>               struct kvm_vmx_state vmx;
>>               /* HSAVE_PA, VMCB */
>>               struct kvm_svm_state svm;
>>       }
>>       __u8 data[0];
>
> Thinking again about this, do we really need to store the VMCS in the
> ioctl?  What would be the issues if we just flushed it to memory and
> reloaded it at KVM_REQ_GET_VMCS12_PAGES time?
>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-02-15 16:06       ` Jim Mattson
@ 2017-02-15 16:14         ` Paolo Bonzini
  2017-02-15 16:58           ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 16:14 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm, David Hildenbrand



On 15/02/2017 17:06, Jim Mattson wrote:
> The VMCS cache can be safely flushed to guest memory at any time.
> However, I think your proposal has some unfortunate consequences:
> 
> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
> after KVM_GET_NESTED_STATE.
> 3. KVM_GET_NESTED_STATE is not transparent to the guest.

I can't choose which is the worst of the three. :)

A better one perhaps is to flush the VMCS cache on L2->userspace exit,
since that should be pretty rare (suggested by David).  I think that
would help at least with (2) and (3).
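
The hook itself would be small (sketch; the helper name is
hypothetical):

        /* Before returning to userspace while L2 is active, push the
         * cached VMCS12 back to L1 memory, so that a later
         * KVM_GET_NESTED_STATE has nothing left to flush.
         */
        if (is_guest_mode(vcpu) && vmx->nested.current_vmptr != -1ull)
                flush_cached_vmcs12_to_guest(vcpu);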

As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
don't really need to reload all of the vmcs12 into vmcs02.  Only the
host state needs to be reloaded, while the guest state is set with
KVM_SET_SREGS and others.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-02-15 16:14         ` Paolo Bonzini
@ 2017-02-15 16:58           ` Jim Mattson
  2017-02-15 17:37             ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-15 16:58 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, David Hildenbrand

Flushing the VMCS cache on L2->userspace exit solves (2), but it means
that L2->userspace exit is not transparent to the guest. Is this
better than (3)?

There is some redundancy in the VMCS cache state currently saved by
GET_NESTED_STATE and the VCPU state saved by GET_SREGS and others, but
it does simplify the vmcs02 initialization. (As an aside, this might
be a lot easier if we eliminated the vmcs02 entirely, as I've
suggested in the past.)

To be honest, I'm not sure what it is that you are trying to optimize
for. Is your concern the extra 4K of VCPU state? I would argue that
the VMCS cache is part of the physical CPU state (though on a physical
CPU, not every field is cached), and that it therefore makes sense to
include the VMCS cache as part of the VCPU state, apart from guest
physical memory.

On Wed, Feb 15, 2017 at 8:14 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 17:06, Jim Mattson wrote:
>> The VMCS cache can be safely flushed to guest memory at any time.
>> However, I think your proposal has some unfortunate consequences:
>>
>> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
>> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
>> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
>> after KVM_GET_NESTED_STATE.
>> 3. KVM_GET_NESTED_STATE is not transparent to the guest.
>
> I can't choose which is the worst of the three. :)
>
> A better one perhaps is to flush the VMCS cache on L2->userspace exit,
> since that should be pretty rare (suggested by David).  I think that
> would help at least with (2) and (3).
>
> As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
> don't really need to reload all of the vmcs12 into vmcs02.  Only the
> host state needs to be reloaded, while the guest state is set with
> KVM_SET_SREGS and others.
>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-15 10:15     ` Paolo Bonzini
@ 2017-02-15 17:02       ` Jim Mattson
  2017-02-15 17:31         ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-15 17:02 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm

Should I send that missing patch? Rework the set? Do a cleanup after
you check in? I appreciate all the work that you've done, and I'd like
to do whatever makes the most sense to move forward.

On Wed, Feb 15, 2017 at 2:15 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 14/02/2017 23:17, Jim Mattson wrote:
>> Yikes! Did I forget to include "kvm: nVMX: Set nested_run_pending
>> before prepare_vmcs02()"? Sorry. That obviates the need for
>> "from_vmentry," and is a little less awkward, I think. (The problem is
>> that kvm can exit to userspace with vmx->nested.nested_run_pending
>> set. If VMX state is saved at that time, then the restore code has to
>> behave as if "from_vmentry" is true.)  In any event, your version looks
>> fine, and I can always clean it up later (or not).
>
> Looks like that, yes.  I went for "from_vmentry" because I wasn't sure
> if your missing patch was just reverting this:
>
>     commit 7af40ad37b3f097f367cbe9c0198caccce6fd83b
>     Author: Jan Kiszka <jan.kiszka@siemens.com>
>     Date:   Sat Jan 4 18:47:23 2014 +0100
>
>     KVM: nVMX: Fix nested_run_pending on activity state HLT
>
>     When we suspend the guest in HLT state, the nested run is no longer
>     pending - we emulated it completely. So only set nested_run_pending
>     after checking the activity state.
>
>     Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>     Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>
> I guess it would be possible to reset nested_run_pending on activity
> state HLT, too, but I didn't feel like mangling your patches even more.
>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-15 17:02       ` Jim Mattson
@ 2017-02-15 17:31         ` Paolo Bonzini
  0 siblings, 0 replies; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 17:31 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm



On 15/02/2017 18:02, Jim Mattson wrote:
> Should I send that missing patch? Rework the set? Do a cleanup after
> you check in? I appreciate all the work that you've done, and I'd like
> to do whatever makes the most sense to move forward.

Cleanup after I check in.  I will do some tests with KVM and/or Hyper-V
L1 and get the first six patches in 4.11.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-02-15 16:58           ` Jim Mattson
@ 2017-02-15 17:37             ` Paolo Bonzini
  2017-02-15 18:19               ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 17:37 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm, David Hildenbrand



On 15/02/2017 17:58, Jim Mattson wrote:
> Flushing the VMCS cache on L2->userspace exit solves (2), but it means
> that L2->userspace exit is not transparent to the guest. Is this
> better than (3)?

Yes, I think it is better.  The really nasty part of
"KVM_GET_NESTED_STATE is not transparent to the guest" is that
KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
userspace doesn't expect guest memory to change.  (2) is a special case
of this.

Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
which should not break any invariants that userspace relies on.

> There is some redundancy in the VMCS cache state currently saved by
> GET_NESTED_STATE and the VCPU state saved by GET_SREGS and others, but
> it does simplify the vmcs02 initialization. (As an aside, this might
> be a lot easier if we eliminated the vmcs02 entirely, as I've
> suggested in the past.)

Do you have patches to eliminate the vmcs02 or was it just a proposal?

As things stand now, almost all of the vmcs01 has to be reloaded anyway
from the vmcs12's host state.  But as I said earlier, it could be
different if we compared vmcs01 and vmcs12's state (especially the
descriptor caches) and optimized load_vmcs12_host_state accordingly.
Somebody needs to write the code and benchmark it with vmexit.flat...
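
Something along these lines, per field (purely illustrative; the
vmcs01_host_pat cache is a hypothetical name):

        /* Fast path in load_vmcs12_host_state(): skip the VMWRITE
         * when vmcs01 already holds the wanted value.
         */
        if (vmcs12->host_ia32_pat != vmx->vmcs01_host_pat)
                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);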

> To be honest, I'm not sure what it is that you are trying to optimize
> for. Is your concern the extra 4K of VCPU state? I would argue that
> the VMCS cache is part of the physical CPU state (though on a physical
> CPU, not every field is cached), and that it therefore makes sense to
> include the VMCS cache as part of the VCPU state, apart from guest
> physical memory.

It's not really the extra 4K, but rather the ugly "blob" aspect of it
and having to maintain compatibility for it.  Regarding the blobbiness,
see for example KVM_GET/SET_XSAVE, where you can actually marshal and
unmarshal the contents of the XSAVE area into the registers.  Regarding
compatibility, I'm worried that once someone looks at SVM more closely,
we will have to store the VMCB in GET_NESTED_STATE too.

Paolo

> On Wed, Feb 15, 2017 at 8:14 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 15/02/2017 17:06, Jim Mattson wrote:
>>> The VMCS cache can be safely flushed to guest memory at any time.
>>> However, I think your proposal has some unfortunate consequences:
>>>
>>> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
>>> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
>>> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
>>> after KVM_GET_NESTED_STATE.
>>> 3. KVM_GET_NESTED_STATE is not transparent to the guest.
>>
>> I can't choose which is the worst of the three. :)
>>
>> A better one perhaps is to flush the VMCS cache on L2->userspace exit,
>> since that should be pretty rare (suggested by David).  I think that
>> would help at least with (2) and (3).
>>
>> As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
>> don't really need to reload all of the vmcs12 into vmcs02.  Only the
>> host state needs to be reloaded, while the guest state is set with
>> KVM_SET_SREGS and others.
>>
>> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-02-15 17:37             ` Paolo Bonzini
@ 2017-02-15 18:19               ` Jim Mattson
  2017-02-15 21:28                 ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-15 18:19 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, David Hildenbrand

On Wed, Feb 15, 2017 at 9:37 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 17:58, Jim Mattson wrote:
>> Flushing the VMCS cache on L2->userspace exit solves (2), but it means
>> that L2->userspace exit is not transparent to the guest. Is this
>> better than (3)?
>
> Yes, I think it is better.  The really nasty part of
> "KVM_GET_NESTED_STATE is not transparent to the guest" is that
> KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
> userspace doesn't expect guest memory to change.  (2) is a special case
> of this.
>
> Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
> which should not break any invariants that userspace relies on.

It is certainly within the bounds of the Intel specification to flush
the VMCS cache at any time, but I think it would be better if the VCPU
behaved more deterministically (with respect to guest-visible events).

>> There is some redundancy in the VMCS cache state currently saved by
>> GET_NESTED_STATE and the VCPU state saved by GET_SREGS and others, but
>> it does simplify the vmcs02 initialization. (As an aside, this might
>> be a lot easier if we eliminated the vmcs02 entirely, as I've
>> suggested in the past.)
>
> Do you have patches to eliminate the vmcs02 or was it just a proposal?

Just a proposal. I think a separate vmcs02 has the potential for faster emulated
VM-entry/VM-exit, but at the cost of greater code complexity. A unified vmcs0X
could certainly be as fast as the existing implementation and a lot simpler.

>
> As things stand now, almost all of the vmcs01 has to be reloaded anyway
> from the vmcs12's host state.  But as I said earlier we could be
> different if we compared vmcs01 and vmcs12's state (especially the
> descriptor caches) and optimize load_vmcs12_host_state consequently.
> Somebody needs to write the code and benchmark it with vmexit.flat...
>
>> To be honest, I'm not sure what it is that you are trying to optimize
>> for. Is your concern the extra 4K of VCPU state? I would argue that
>> the VMCS cache is part of the physical CPU state (though on a physical
>> CPU, not every field is cached), and that it therefore makes sense to
>> include the VMCS cache as part of the VCPU state, apart from guest
>> physical memory.
>
> It's not really the extra 4K, but rather the ugly "blob" aspect of it
> and having to maintain compatibility for it.  Regarding the blobbiness,
> see for example KVM_GET/SET_XSAVE, where you can actually marshal and
> unmarshal the contents of the XSAVE area into the registers.  Regarding
> compatibility, I'm worried that once someone looks at SVM more closely,
> we will have to store the VMCB in GET_NESTED_STATE too.

Instead of a "blob," we could pass it out to userspace as a 'struct
vmcs12,' padded out to 4k for compatibility purposes. Then userspace
could marshal and unmarshal the contents, and it would be a little
easier than marshalling and unmarshalling the same contents in guest
physical memory.
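
I.e., something like this on the GET_NESTED_STATE side (sketch;
user_vmcs12 stands for wherever the ioctl argument points):

        /* Hand out the cache as a typed 'struct vmcs12', padded to a
         * fixed 4K so the ABI survives future field additions.
         */
        BUILD_BUG_ON(sizeof(struct vmcs12) > VMCS12_SIZE);
        if (copy_to_user(user_vmcs12, vmx->nested.cached_vmcs12,
                         VMCS12_SIZE))
                return -EFAULT;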

I don't know what to say about SVM. I think we should be prepared to
store the VMCB
at some point, even if we don't now.

BTW, eventually I'm going to want to have two entries in the VMCS
cache to improve the performance of virtualized VMCS shadowing.

>
> Paolo
>
>> On Wed, Feb 15, 2017 at 8:14 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>>
>>> On 15/02/2017 17:06, Jim Mattson wrote:
>>>> The VMCS cache can be safely flushed to guest memory at any time.
>>>> However, I think your proposal has some unfortunate consequences:
>>>>
>>>> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
>>>> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
>>>> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
>>>> after KVM_GET_NESTED_STATE.
>>>> 3. KVM_GET_NESTED_STATE is not transparent to the guest.
>>>
>>> I can't choose which is the worst of the three. :)
>>>
>>> A better one perhaps is to flush the VMCS cache on L2->userspace exit,
>>> since that should be pretty rare (suggested by David).  I think that
>>> would help at least with (2) and (3).
>>>
>>> As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
>>> don't really need to reload all of the vmcs12 into vmcs02.  Only the
>>> host state needs to be reloaded, while the guest state is set with
>>> KVM_SET_SREGS and others.
>>>
>>> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2016-12-09 19:15         ` Paolo Bonzini
@ 2017-02-15 19:30           ` Jim Mattson
  2017-02-15 21:26             ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-15 19:30 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: David Hildenbrand, kvm

On Fri, Dec 9, 2016 at 11:15 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 09/12/2016 17:16, Jim Mattson wrote:
>> At the very least, it needs to know the following to do a page walk:
>>
>> a) Are the secondary processor-based VM-execution controls activated
>> in the primary processor-based execution controls?
>> b) If so, is EPT enabled?
>> c) If so, what is the current EPTP?
>
> Oh, I see; this is when MMIO is passed through from L1 to L2.  Though
> there is also the KVM_TRANSLATE ioctl.

I like the idea of supporting just one guest page walker, but
KVM_TRANSLATE looks incomplete. For instance, it doesn't include the
access type, which makes me wonder how it deals with SMEP faults.
Also, it doesn't seem to have a way to return page fault information
to the caller, let alone EPT violation information if the VCPU is in
VMX non-root mode. (Is there currently a way for userspace to cause an
emulated EPT violation VM-exit from L2 to L1 as the result of
instruction emulation?)

>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-15 19:30           ` Jim Mattson
@ 2017-02-15 21:26             ` Paolo Bonzini
  2017-02-15 22:00               ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 21:26 UTC (permalink / raw)
  To: Jim Mattson; +Cc: David Hildenbrand, kvm



On 15/02/2017 20:30, Jim Mattson wrote:
> I like the idea of supporting just one guest page walker, but
> KVM_TRANSLATE looks incomplete. For instance, it doesn't include the
> access type, which makes me wonder how it deals with SMEP faults.
> Also, it doesn't seem to have a way to return page fault information
> to the caller, let alone EPT violation information if the VCPU is in
> VMX non-root mode.

Not surprising; I don't think anyone has actually used KVM_TRANSLATE in
years...

> (Is there currently a way for userspace to cause an
> emulated EPT violation VM-exit from L2 to L1 as the result of
> instruction emulation?)

Yes, of course.

There are two sets of function pointers for the MMU emulation while
doing L2 emulation: vcpu->arch.nested_mmu is L2's page tables, while
vcpu->arch.mmu is L1's EPT.  vcpu->arch.walk_mmu points to one of these
two, and it's where KVM starts walking page tables.

While doing L2 emulation, vcpu->arch.walk_mmu is set to
&vcpu->arch.nested_mmu (by nested_ept_init_mmu_context).

Each memory access goes through the MMU's translate_gpa function which
is dummy for arch.mmu and translate_nested_gpa for arch.nested_mmu.
translate_nested_gpa in turn is simply vcpu->arch.mmu.gva_to_gpa, and
that's how each L2 page table access causes a walk of the L1's EPT page
tables.

If something goes wrong, each MMU has its own inject_page_fault
callback.  For EPT page tables, vcpu->arch.mmu.inject_page_fault is
where an EPT vmexit is injected.  The emulator then just exits through
X86EMUL_PROPAGATE_FAULT.  It actually works, see commit ef54bcfeea6c
("KVM: x86: skip writeback on injection of nested exception",
2014-09-04) for an example of a bugfix in this area.

One thing where we're lacking a bit is that translate_nested_gpa should
have an argument for "translating translated guest address" vs.
"translating guest page structure address", in order to set EXITINFO or
exit qualification correctly.  This is incorrect right now.
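
Schematically, using the names above (simplified; the exact callback
names in mmu.c differ a little):

        /* Wiring for L2 emulation, as set up by
         * nested_ept_init_mmu_context() and friends:
         */
        vcpu->arch.mmu.gva_to_gpa = ept_gva_to_gpa;   /* walks L1's EPT */
        vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
        vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; /* walks start here */

        /* An L2 access then resolves in two stages:
         *   gva --(nested_mmu: L2 page tables)--> L2 gpa
         *       --(translate_nested_gpa: L1 EPT)--> L1 gpa
         * and a failure in the second stage lands in
         * vcpu->arch.mmu.inject_page_fault, i.e. the emulated EPT
         * vmexit.
         */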

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-02-15 18:19               ` Jim Mattson
@ 2017-02-15 21:28                 ` Paolo Bonzini
  2017-12-19  3:07                   ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 21:28 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm, David Hildenbrand



On 15/02/2017 19:19, Jim Mattson wrote:
> On Wed, Feb 15, 2017 at 9:37 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Yes, I think it is better.  The really nasty part of
>> "KVM_GET_NESTED_STATE is not transparent to the guest" is that
>> KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
>> userspace doesn't expect guest memory to change.  (2) is a special case
>> of this.
>>
>> Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
>> which should not break any invariants that userspace relies on.
> 
> It is certainly within the bounds of the Intel specification to flush the
> VMCS cache at any time, but I think it would be better if the VCPU behaved more
> deterministically (with respect to guest-visible events).

Yes, it's a trade-off.  Of course if you think of SMIs on physical
hardware, determinism is already thrown away.  But I understand that we
can do better than physical processors in this regard. :)

> Just a proposal. I think a separate vmcs02 has the potential for faster emulated
> VM-entry/VM-exit, but at the cost of greater code complexity. A unified vmcs0X
> could certainly be as fast as the existing implementation and a lot simpler.

I agree.  As I said, we need to benchmark it either way.  I honestly
have no idea where time is spent (on a processor with VMCS shadowing) in
a tight loop of L1->L2->L1 entries and exits.

>>> To be honest, I'm not sure what it is that you are trying to optimize
>>> for. Is your concern the extra 4K of VCPU state? I would argue that
>>> the VMCS cache is part of the physical CPU state (though on a physical
>>> CPU, not every field is cached), and that it therefore makes sense to
>>> include the VMCS cache as part of the VCPU state, apart from guest
>>> physical memory.
>>
>> It's not really the extra 4K, but rather the ugly "blob" aspect of it
>> and having to maintain compatibility for it.  Regarding the blobbiness,
>> see for example KVM_GET/SET_XSAVE, where you can actually marshal and
>> unmarshal the contents of the XSAVE area into the registers.  Regarding
>> compatibility, I'm worried that once someone looks at SVM more closely,
>> we will have to store the VMCB in GET_NESTED_STATE too.
> 
> Instead of a "blob," we could pass it out to userspace as a 'struct
> vmcs12,' padded out to 4k for compatibility purposes. Then userspace
> could marshal and unmarshal the contents, and it would be a little
> easier than marshalling and unmarshalling the same contents in guest
> physical memory.

You mean marshalling/unmarshalling it for stuff such as extracting the
EPTP?  Yeah, that's a good reason to keep it in GET_NESTED_STATE.

OTOH, it's more work to do in userspace.  Tradeoffs...

> I don't know what to say about SVM. I think we should be prepared to
> store the VMCB at some point, even if we don't now.

The in-memory VMCB is always consistent with processor state after
VMRUN, but not necessarily during it, so I agree.  Of course the same
solution of flushing it out on L2->userspace exit would work, though I
understand why you think it's a hack.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-15 21:26             ` Paolo Bonzini
@ 2017-02-15 22:00               ` Jim Mattson
  2017-02-15 22:05                 ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-15 22:00 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: David Hildenbrand, kvm

On Wed, Feb 15, 2017 at 1:26 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 20:30, Jim Mattson wrote:
>> I like the idea of supporting just one guest page walker, but
>> KVM_TRANSLATE looks incomplete. For instance, it doesn't include the
>> access type, which makes me wonder how it deals with SMEP faults.
>> Also, it doesn't seem to have a way to return page fault information
>> to the caller, let alone EPT violation information if the VCPU is in
>> VMX non-root mode.
>
> Not surprising; I don't think anyone has actually used KVM_TRANSLATE in
> years...
>
>> (Is there currently a way for userspace to cause an
>> emulated EPT violation VM-exit from L2 to L1 as the result of
>> instruction emulation?)
>
> Yes, of course.
>
> There are two sets of function pointers for the MMU emulation while
> doing L2 emulation: vcpu->arch.nested_mmu is L2's page tables, while
> vcpu->arch.mmu is L1's EPT.  vcpu->arch.walk_mmu points to one of these
> two, and it's where KVM starts walking page tables.
>
> While doing L2 emulation, vcpu->arch.walk_mmu is set to
> &vcpu->arch.nested_mmu (by nested_ept_init_mmu_context).
>
> Each memory access goes through the MMU's translate_gpa function which
> is dummy for arch.mmu and translate_nested_gpa for arch.nested_mmu.
> translate_nested_gpa in turn is simply vcpu->arch.mmu.gva_to_gpa, and
> that's how each L2 page table access causes a walk of the L1's EPT page
> tables.
>
> If something goes wrong, each MMU has its own inject_page_fault
> callback.  For EPT page tables, vcpu->arch.mmu.inject_page_fault is
> where an EPT vmexit is injected.  The emulator then just exits through
> X86EMUL_PROPAGATE_FAULT.  It actually works, see commit ef54bcfeea6c
> ("KVM: x86: skip writeback on injection of nested exception",
> 2014-09-04) for an example of a bugfix in this area.

Yes, I see that this works for the in-kernel instruction emulator. My
question was regarding a hypothetical user-space instruction emulator.
I can see how a #PF resulting from KVM_TRANSLATE could be injected with
KVM_SET_VCPU_EVENTS (if KVM_TRANSLATE was forthcoming with the error
code, anyway). However, what if the KVM_TRANSLATE ioctl were to fail
because of an EPT violation? I don't see an ioctl that would allow
userspace to inject a VM-exit event (complete with exit reason, exit
qualification, and all of the other VM-exit information fields that
might be relevant). Or are you saying that simply encountering the EPT
violation while trying to answer a KVM_TRANSLATE request would induce
the VM-exit?

> One thing where we're lacking a bit is that translate_nested_gpa should
> have an argument for "translating translated guest address" vs.
> "translating guest page structure address", in order to set EXITINFO or
> exit qualification correctly.  This is incorrect right now.
>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-15 22:00               ` Jim Mattson
@ 2017-02-15 22:05                 ` Paolo Bonzini
  2017-02-15 22:49                   ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 22:05 UTC (permalink / raw)
  To: Jim Mattson; +Cc: David Hildenbrand, kvm



On 15/02/2017 23:00, Jim Mattson wrote:
> Yes, I see that this works for the in-kernel instruction emulator. My
> question was regarding a hypothetical user-space instruction emulator. I
> can see how a #PF resulting from KVM_TRANSLATE could be injected with
> KVM_SET_VCPU_EVENTS (if KVM_TRANSLATE was forthcoming with the error code,
> anyway). However, what if the KVM_TRANSLATE ioctl were to fail because of
> an EPT violation? I don't see an ioctl that would allow userspace to inject
> a VM-exit event (complete with exit reason, exit qualification, and all of
> the other VM-exit information fields that might be relevant). Or are you
> saying that simply encountering the EPT violation while
> trying to answer a KVM_TRANSLATE request would induce the VM-exit?

I think it should, yes.  There are other limitations of KVM_TRANSLATE
(it always assumes CPL=0 for example) but the same logic would apply to
KVM_TRANSLATE and to the in-kernel emulator.

Paolo

>> One thing where we're lacking a bit is that translate_nested_gpa should
>> have an argument for "translating translated guest address" vs.
>> "translating guest page structure address", in order to set EXITINFO or
>> exit qualification correctly.  This is incorrect right now.
>>
>> Paolo
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-15 22:05                 ` Paolo Bonzini
@ 2017-02-15 22:49                   ` Jim Mattson
  2017-02-15 23:09                     ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-02-15 22:49 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: David Hildenbrand, kvm

On Wed, Feb 15, 2017 at 2:05 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 23:00, Jim Mattson wrote:
>> Yes, I see that this works for the in-kernel instruction emulator. My
>> question was regarding a hypothetical user-space instruction emulator. I
>> can see how a #PF resulting from KVM_TRANSLATE could be injected with
>> KVM_SET_VCPU_EVENTS (if KVM_TRANSLATE was forthcoming with the error code,
>> anyway). However, what if the KVM_TRANSLATE ioctl were to fail because of
>> an EPT violation? I don't see an ioctl that would allow userspace to inject
>> a VM-exit event (complete with exit reason, exit qualification, and all of
>> the other VM-exit information fields that might be relevant). Or are you
>> saying that simply encountering the EPT violation while
>> trying to answer a KVM_TRANSLATE request would induce the VM-exit?
>
> I think it should, yes.  There are other limitations of KVM_TRANSLATE
> (it always assumes CPL=0 for example) but the same logic would apply to
> KVM_TRANSLATE and to the in-kernel emulator.

Since KVM_TRANSLATE doesn't supply the access mode, the exit
qualification would be deficient in more than the final
access/paging-structure-entry bit you reference below.

Is it possible to fix this ioctl without breaking backwards
compatibility? Can existing userspace code be expected to fill the
pad[] bytes of the kvm_translation structure with 0? (There doesn't
appear to be any enforcement of this, so I'm assuming not.)

Between this and the CPL==0 assumption, KVM_TRANSLATE seems like a
non-starter. But perhaps a new and improved API in the same vein...

> Paolo
>
>>> One thing where we're lacking a bit is that translate_nested_gpa should
>>> have an argument for "translating translated guest address" vs.
>>> "translating guest page structure address", in order to set EXITINFO or
>>> exit qualification correctly.  This is incorrect right now.
>>>
>>> Paolo
>>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state
  2017-02-15 22:49                   ` Jim Mattson
@ 2017-02-15 23:09                     ` Paolo Bonzini
  0 siblings, 0 replies; 63+ messages in thread
From: Paolo Bonzini @ 2017-02-15 23:09 UTC (permalink / raw)
  To: Jim Mattson; +Cc: David Hildenbrand, kvm



On 15/02/2017 23:49, Jim Mattson wrote:
>> I think it should, yes.  There are other limitations of KVM_TRANSLATE
>> (it always assumes CPL=0 for example) but the same logic would apply to
>> KVM_TRANSLATE and to the in-kernel emulator.
> 
> Since KVM_TRANSLATE doesn't supply the access mode, the exit
> qualification would be deficient in more than the final
> access/paging-structure-entry bit you reference below.
> 
> Is it possible to fix this ioctl without breaking backwards
> compatibility? Can existing userspace code be expected to fill the
> pad[] bytes of the kvm_translation structure with 0? (There doesn't
> appear to be any enforcement of this, so I'm assuming not.)
> 
> Between this and the CPL==0 assumption, KVM_TRANSLATE seems like a
> non-starter. But perhaps a new and improved API in the same vein...

Either a new ioctl, or KVM_ENABLE_CAP to enable parsing the padding
bytes.  The format of struct kvm_translation is a bit inflexible, so I
think a new ioctl would be better.

BTW, have you seen this patent:
https://www.google.com/patents/US20150378633?  Implementing this for
nested EPT would complicate the MMU quite a lot.

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-02-15 21:28                 ` Paolo Bonzini
@ 2017-12-19  3:07                   ` Jim Mattson
  2017-12-19 10:35                     ` David Hildenbrand
  2017-12-19 12:45                     ` Paolo Bonzini
  0 siblings, 2 replies; 63+ messages in thread
From: Jim Mattson @ 2017-12-19  3:07 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm list, David Hildenbrand

The other unfortunate thing about flushing the "current" VMCS12 state
to guest memory on each L2->userspace transition is that much of this
state is in the VMCS02. So, it's not just a matter of writing a
VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
updated from the VMCS02 by calling sync_vmcs12(). This will be
particularly bad for double-nesting, where L1 kvm has to take all of
those VMREAD VM-exits.
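
That is, the flush is really two steps, and the first one is the
expensive one under double-nesting (sketch):

        /* Step 1: pull live guest state out of the vmcs02.  Each
         * vmcs_read*() is a VMREAD, and when L1 is itself KVM, each
         * unshadowed VMREAD is a VM-exit that L1 has to handle.
         */
        vmcs12->guest_interruptibility_info =
                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
        /* ... dozens more fields, see sync_vmcs12() ... */

        /* Step 2: only then can the blob go back to L1 memory. */
        kvm_write_guest(vcpu->kvm, vmx->nested.current_vmptr,
                        vmx->nested.cached_vmcs12, VMCS12_SIZE);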

If you still prefer this method, I will go ahead and do it, but I
remain opposed.

On Wed, Feb 15, 2017 at 1:28 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 19:19, Jim Mattson wrote:
>> On Wed, Feb 15, 2017 at 9:37 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> Yes, I think it is better.  The really nasty part of
>>> "KVM_GET_NESTED_STATE is not transparent to the guest" is that
>>> KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
>>> userspace doesn't expect guest memory to change.  (2) is a special case
>>> of this.
>>>
>>> Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
>>> which should not break any invariants that userspace relies on.
>>
>> It is certainly within the bounds of the Intel specification to flush the
>> VMCS cache at any time, but I think it would be better if the VCPU behaved more
>> deterministically (with respect to guest-visible events).
>
> Yes, it's a trade-off.  Of course if you think of SMIs on physical
> hardware, determinism is already thrown away.  But I understand that we
> can do better than physical processors in this regard. :)
>
>> Just a proposal. I think a separate vmcs02 has the potential for faster emulated
>> VM-entry/VM-exit, but at the cost of greater code complexity. A unified vmcs0X
>> could certainly be as fast as the existing implementation and a lot simpler.
>
> I agree.  As I said, we need to benchmark it either way.  I honestly
> have no idea where time is spent (on a processor with VMCS shadowing) in
> a tight loop of L1->L2->L1 entries and exits.
>
>>>> To be honest, I'm not sure what it is that you are trying to optimize
>>>> for. Is your concern the extra 4K of VCPU state? I would argue that
>>>> the VMCS cache is part of the physical CPU state (though on a physical
>>>> CPU, not every field is cached), and that it therefore makes sense to
>>>> include the VMCS cache as part of the VCPU state, apart from guest
>>>> physical memory.
>>>
>>> It's not really the extra 4K, but rather the ugly "blob" aspect of it
>>> and having to maintain compatibility for it.  Regarding the blobbiness,
>>> see for example KVM_GET/SET_XSAVE, where you can actually marshal and
>>> unmarshal the contents of the XSAVE area into the registers.  Regarding
>>> compatibility, I'm worried that once someone looks at SVM more closely,
>>> we will have to store the VMCB in GET_NESTED_STATE too.
>>
>> Instead of a "blob," we could pass it out to userspace as a 'struct
>> vmcs12,' padded out to 4k for compatibility purposes. Then userspace
>> could marshal and unmarshal the contents, and it would be a little
>> easier than marshalling and unmarshalling the same contents in guest
>> physical memory.
>
> You mean marshalling/unmarshalling it for stuff such as extracting the
> EPTP?  Yeah, that's a good reason to keep it in GET_NESTED_STATE.
>
> OTOH, it's more work to do in userspace.  Tradeoffs...
>
>> I don't know what to say about SVM. I think we should be prepared to
>> store the VMCB at some point, even if we don't now.
>
> The in-memory VMCB is always consistent with processor state after
VMRUN, but not necessarily during it, so I agree.  Of course the same
> solution of flushing it out on L2->userspace exit would work, though I
> understand why you think it's a hack.
>
> Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19  3:07                   ` Jim Mattson
@ 2017-12-19 10:35                     ` David Hildenbrand
  2017-12-19 17:26                       ` Jim Mattson
  2017-12-19 12:45                     ` Paolo Bonzini
  1 sibling, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2017-12-19 10:35 UTC (permalink / raw)
  To: Jim Mattson, Paolo Bonzini; +Cc: kvm list, David Hildenbrand

On 19.12.2017 04:07, Jim Mattson wrote:
> The other unfortunate thing about flushing the "current" VMCS12 state
> to guest memory on each L2->userspace transition is that much of this
> state is in the VMCS02. So, it's not just a matter of writing a
> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
> updated from the VMCS02 by calling sync_vmcs12(). This will be
> particularly bad for double-nesting, where L1 kvm has to take all of
> those VMREAD VM-exits.
> 
> If you still prefer this method, I will go ahead and do it, but I
> remain opposed.

This can be easily solved by letting QEMU trigger a pre-migration ioctl
(FLUSH_NESTED_VMCS), so that shouldn't really be a problem.
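
On the QEMU side that would just be one more vcpu ioctl in a pre-save
hook, roughly (the ioctl name is my suggestion above, not an existing
API):

    /* Hypothetical pre-migration flush; KVM_FLUSH_NESTED_VMCS is made up. */
    static int kvm_flush_nested_vmcs(CPUState *cs)
    {
        return kvm_vcpu_ioctl(cs, KVM_FLUSH_NESTED_VMCS, 0);
    }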

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19  3:07                   ` Jim Mattson
  2017-12-19 10:35                     ` David Hildenbrand
@ 2017-12-19 12:45                     ` Paolo Bonzini
  1 sibling, 0 replies; 63+ messages in thread
From: Paolo Bonzini @ 2017-12-19 12:45 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm list, David Hildenbrand

On 19/12/2017 04:07, Jim Mattson wrote:
> The other unfortunate thing about flushing the "current" VMCS12 state
> to guest memory on each L2->userspace transition is that much of this
> state is in the VMCS02. So, it's not just a matter of writing a
> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
> updated from the VMCS02 by calling sync_vmcs12(). This will be
> particularly bad for double-nesting, where L1 kvm has to take all of
> those VMREAD VM-exits.
> 
> If you still prefer this method, I will go ahead and do it, but I
> remain opposed.

I don't (for a different reason---SVM also has off-RAM state).

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19 10:35                     ` David Hildenbrand
@ 2017-12-19 17:26                       ` Jim Mattson
  2017-12-19 17:33                         ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2017-12-19 17:26 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

Yes, it can be done that way, but what makes this approach technically
superior to the original API?

On Tue, Dec 19, 2017 at 2:35 AM, David Hildenbrand <david@redhat.com> wrote:
> On 19.12.2017 04:07, Jim Mattson wrote:
>> The other unfortunate thing about flushing the "current" VMCS12 state
>> to guest memory on each L2->userspace transition is that much of this
>> state is in the VMCS02. So, it's not just a matter of writing a
>> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
>> updated from the VMCS02 by calling sync_vmcs12(). This will be
>> particularly bad for double-nesting, where L1 kvm has to take all of
>> those VMREAD VM-exits.
>>
>> If you still prefer this method, I will go ahead and do it, but I
>> remain opposed.
>
> This can be easily solved by letting QEMU trigger a pre-migration ioctl
> (FLUSH_NESTED_VMCS), so that shouldn't really be a problem.
>
> --
>
> Thanks,
>
> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19 17:26                       ` Jim Mattson
@ 2017-12-19 17:33                         ` David Hildenbrand
  2017-12-19 17:40                           ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2017-12-19 17:33 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On 19.12.2017 18:26, Jim Mattson wrote:
> Yes, it can be done that way, but what makes this approach technically
> superior to the original API?

a) not having to migrate data twice
b) not having to think about a proper API to get data in/out

All you need to know is whether the guest was in nested mode when migrating,
no? That would be a simple flag.

> 
> On Tue, Dec 19, 2017 at 2:35 AM, David Hildenbrand <david@redhat.com> wrote:
>> On 19.12.2017 04:07, Jim Mattson wrote:
>>> The other unfortunate thing about flushing the "current" VMCS12 state
>>> to guest memory on each L2->userspace transition is that much of this
>>> state is in the VMCS02. So,it's not just a matter of writing a
>>> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
>>> updated from the VMCS02 by calling sync_vmcs12(). This will be
>>> particularly bad for double-nesting, where L1 kvm has to take all of
>>> those VMREAD VM-exits.
>>>
>>> If you still prefer this method, I will go ahead and do it, but I
>>> remain opposed.
>>
>> This can be easily solved by letting QEMU trigger a pre-migration ioctl
>> (FLUSH_NESTED_VMCS), so that shouldn't really be a problem.
>>
>> --
>>
>> Thanks,
>>
>> David / dhildenb


-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19 17:33                         ` David Hildenbrand
@ 2017-12-19 17:40                           ` David Hildenbrand
  2017-12-19 19:21                             ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2017-12-19 17:40 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On 19.12.2017 18:33, David Hildenbrand wrote:
> On 19.12.2017 18:26, Jim Mattson wrote:
>> Yes, it can be done that way, but what makes this approach technically
>> superior to the original API?
> 
> a) not having to migrate data twice
> b) not having to think about a proper API to get data in/out
> 
> All you need to know is whether the guest was in nested mode when migrating,
> no? That would be a simple flag.
> 

(of course in addition, the vmcs pointer and whether vmxon has been called).

But anyhow, if you have good reasons why you want to introduce and
maintain a new API, feel free to do so. Most likely I am missing
something important here :)


-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19 17:40                           ` David Hildenbrand
@ 2017-12-19 19:21                             ` Jim Mattson
  2017-12-19 19:52                               ` David Hildenbrand
  2017-12-19 21:29                               ` Paolo Bonzini
  0 siblings, 2 replies; 63+ messages in thread
From: Jim Mattson @ 2017-12-19 19:21 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

Certain VMX state cannot be extracted from the kernel today. As you
point out, this includes the vCPU's VMX operating mode {legacy, VMX
root operation, VMX non-root operation}, the current VMCS GPA (if
any), and the VMXON region GPA (if any). Perhaps these could be
appended to the state(s) extracted by one or more existing APIs rather
than introducing a new API, but I think there's sufficient
justification here for a new GET/SET_NESTED_STATE API.
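
The payload itself could be fairly small, e.g. (an illustrative layout
only, not a proposed final ABI):

	struct kvm_vmx_state {
		__u64 vmxon_ptr;	/* VMXON region GPA, or -1ull */
		__u64 current_vmptr;	/* current VMCS GPA, or -1ull */
		__u32 flags;		/* e.g. implementation state bits */
		__u32 data_size;	/* bytes of VMCS12 data that follow */
		__u8  data[0];		/* cached VMCS12 contents */
	};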

Most L2 guest state can already be extracted by existing APIs, like
GET_SREGS. However, restoring it is a bit problematic today. SET_SREGS
will write into the current VMCS, but we have no existing mechanism
for transferring guest state from vmcs01 to vmcs02. On restore, do we
want to dictate that the vCPU's VMX operating mode has to be restored
before SET_SREGS is called, or do we provide a mechanism for
transferring vmcs01 guest state to vmcs02? If we do dictate that the
vCPU's operating mode has to be restored first, then SET_SREGS will
naturally write into vmcs02, but we'll have to create a mechanism for
building an initial vmcs02 out of nothing.

The only mechanism we have today for building a vmcs02 starts with a
vmcs12. Building on that mechanism, it is fairly straightforward to
write GET/SET_NESTED_STATE. Though there is quite a bit of redundancy
with GET/SET_SREGS, GET/SET_VCPU_EVENTS, etc., if you capture all of
the L2 state in VMCS12 format, you can restore it pretty easily using
the existing infrastructure, without worrying about the ordering of
the SET_* ioctls.

Today, the cached VMCS12 is loaded when the guest executes VMPTRLD,
primarily as a defense against the guest modifying VMCS12 fields in
memory after the hypervisor has checked their validity. There were a
lot of time-of-check to time-of-use security issues before the cached
VMCS12 was introduced. Conveniently, all but the host state of the
cached VMCS12 is dead once the vCPU enters L2, so it seemed like a
reasonable place to stuff the current L2 state for later restoration.
But why pass the cached VMCS12 as a separate vCPU state component
rather than writing it back to guest memory as part of the "save vCPU
state" sequence?

One reason is that it is a bit awkward for GET_NESTED_STATE to modify
guest memory. I don't know about qemu, but our userspace agent expects
guest memory to be quiesced by the time it starts going through its
sequence of GET_* ioctls. Sure, we could introduce a pre-migration
ioctl, but is that the best way to handle this? Another reason is that
it is a bit awkward for SET_NESTED_STATE to require guest memory.
Again, I don't know about qemu, but our userspace agent does not
expect any guest memory to be available when it starts going through
its sequence of SET_* ioctls. Sure, we could prefetch the guest page
containing the current VMCS12, but is that better than simply
including the current VMCS12 in the NESTED_STATE payload? Moreover,
these unpredictable (from the guest's point of view) updates to guest
memory leave a bad taste in my mouth (much like SMM).
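
With the VMCS12 carried in the payload, the restore side never has to
touch guest memory at all. A sketch (user_vmcs12 and vmx_state stand in
for the ioctl arguments, and the deferred-request name is illustrative):

	/* Sketch: SET_NESTED_STATE without touching guest memory. */
	if (copy_from_user(get_vmcs12(vcpu), user_vmcs12, VMCS12_SIZE))
		return -EFAULT;
	vmx->nested.current_vmptr = vmx_state.current_vmptr;
	/* Defer the GPA->HPA work (vmcs02 setup etc.) to the next KVM_RUN. */
	kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);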

Perhaps qemu doesn't have the same limitations that our userspace
agent has, and I can certainly see why you would dismiss my concerns
if you are only interested in qemu as a userspace agent for kvm. At
the same time, I hope you can understand why I am not excited to be
drawn down a path that's going to ultimately mean more headaches for
me in my environment. AFAICT, the proposed API doesn't introduce any
additional headaches for those that use qemu. The principal objections
appear to be the "blob" of data, completely unstructured in the eyes
of the userspace agent, and the redundancy with state already
extracted by existing APIs. Is that right?


On Tue, Dec 19, 2017 at 9:40 AM, David Hildenbrand <david@redhat.com> wrote:
> On 19.12.2017 18:33, David Hildenbrand wrote:
>> On 19.12.2017 18:26, Jim Mattson wrote:
>>> Yes, it can be done that way, but what makes this approach technically
>>> superior to the original API?
>>
>> a) not having to migrate data twice
>> b) not having to think about a proper API to get data in/out
>>
>> All you need to know is whether the guest was in nested mode when migrating,
>> no? That would be a simple flag.
>>
>
> (of course in addition, the vmcs pointer and whether vmxon has been called).
>
> But anyhow, if you have good reasons why you want to introduce and
> maintain a new API, feel free to do so. Most likely I am missing
> something important here :)
>
>
> --
>
> Thanks,
>
> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19 19:21                             ` Jim Mattson
@ 2017-12-19 19:52                               ` David Hildenbrand
  2017-12-19 21:29                               ` Paolo Bonzini
  1 sibling, 0 replies; 63+ messages in thread
From: David Hildenbrand @ 2017-12-19 19:52 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On 19.12.2017 20:21, Jim Mattson wrote:
> Certain VMX state cannot be extracted from the kernel today. As you
> point out, this includes the vCPU's VMX operating mode {legacy, VMX
> root operation, VMX non-root operation}, the current VMCS GPA (if
> any), and the VMXON region GPA (if any). Perhaps these could be
> appended to the state(s) extracted by one or more existing APIs rather
> than introducing a new API, but I think there's sufficient
> justification here for a new GET/SET_NESTED_STATE API.
> 
> Most L2 guest state can already be extracted by existing APIs, like
> GET_SREGS. However, restoring it is a bit problematic today. SET_SREGS
> will write into the current VMCS, but we have no existing mechanism
> for transferring guest state from vmcs01 to vmcs02. On restore, do we
> want to dictate that the vCPU's VMX operating mode has to be restored
> before SET_SREGS is called, or do we provide a mechanism for
> transferring vmcs01 guest state to vmcs02? If we do dictate that the
> vCPU's operating mode has to be restored first, then SET_SREGS will
> naturally write into vmcs02, but we'll have to create a mechanism for
> building an initial vmcs02 out of nothing.

A small rant before the Christmas holiday :)

The more I look at nested VMX, the more headaches I get. I know somebody
thought it was a good idea to reuse the same execution loop for the guest
and for a nested guest: simply swap some registers (introducing tons of
BUGs, e.g. related to interrupt injection, at least that's my impression)
and there you go.

I think this is a huge mess. As you say, state extraction/injection is a
mess. As user space, you don't really know what you're holding in your
hand. And from that stems the desire to just "dump out a huge blob of
state and feed it back into the migration target". I can understand
that. But I wonder if it has to be this way.

Sometimes nested VMX feels like a huge hack into the existing VMX
module. I once attempted to factor out certain pieces, and finally gave
up. Maybe it is me and not the code :)

> 
> The only mechanism we have today for building a vmcs02 starts with a
> vmcs12. Building on that mechanism, it is fairly straightforward to
> write GET/SET_NESTED_STATE. Though there is quite a bit of redundancy
> with GET/SET_SREGS, GET/SET_VCPU_EVENTS, etc., if you capture all of
> the L2 state in VMCS12 format, you can restore it pretty easily using
> the existing infrastructure, without worrying about the ordering of
> the SET_* ioctls.

And exactly the redundancy you are talking about is what I am afraid of.
Interfaces should be kept simple. If we can hide complexity, do so. But
it all stems from the fact that we turn nested guest state into guest
state and exit that way to user space.

> 
> Today, the cached VMCS12 is loaded when the guest executes VMPTRLD,
> primarily as a defense against the guest modifying VMCS12 fields in
> memory after the hypervisor has checked their validity. There were a
> lot of time-of-check to time-of-use security issues before the cached
> VMCS12 was introduced. Conveniently, all but the host state of the
> cached VMCS12 is dead once the vCPU enters L2, so it seemed like a
> reasonable place to stuff the current L2 state for later restoration.
> But why pass the cached VMCS12 as a separate vCPU state component
> rather than writing it back to guest memory as part of the "save vCPU
> state" sequence?
> 
> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
> guest memory. I don't know about qemu, but our userspace agent expects
> guest memory to be quiesced by the time it starts going through its
> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
> ioctl, but is that the best way to handle this? Another reason is that
> it is a bit awkward for SET_NESTED_STATE to require guest memory.
> Again, I don't know about qemu, but our userspace agent does not
> expect any guest memory to be available when it starts going through
> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
> containing the current VMCS12, but is that better than simply
> including the current VMCS12 in the NESTED_STATE payload? Moreover,
> these unpredictable (from the guest's point of view) updates to guest
> memory leave a bad taste in my mouth (much like SMM).

In my perfect world, we would never leave to user space with nested
guest registers. When migrating, all state is implicitly contained in guest
memory.

I might have a bit of a biased opinion here. On s390x we were able to make
nested virtualization work completely transparent to user space.

1. every time we exit to user space, guest registers are guest registers
2. we have a separate nested execution loop for nested virtualization.
3. nested guest state is completely contained in guest memory whenever
we stop executing the nested guest (to return to the guest or to user
space).

We are able to do that by not remembering if we were in nested mode or
not. When exiting to QEMU and reentering, we simply return to L1. L1
will figure out that there is nothing to do (a "0 intercept") and simply
rerun its guest, resulting in us running L2. But even if we wanted to
continue running L1, it would be as easy as rewinding the PC to point
again at the SIE instruction before exiting to user space. Or introduce
a separate flag. (at least I think it would be that easy)

Not saying the s390x implementation is perfect, but it seems to be way
easier to handle compared to what we have with VMX right now.

> 
> Perhaps qemu doesn't have the same limitations that our userspace
> agent has, and I can certainly see why you would dismiss my concerns
> if you are only interested in qemu as a userspace agent for kvm. At
> the same time, I hope you can understand why I am not excited to be
> drawn down a path that's going to ultimately mean more headaches for
> me in my environment. AFAICT, the proposed API doesn't introduce any
> additional headaches for those that use qemu. The principal objections
> appear to be the "blob" of data, completely unstructured in the eyes
> of the userspace agent, and the redundancy with state already
> extracted by existing APIs. Is that right?

I am wondering if we should rather try to redesign nVMX to work somewhat
like the s390x way of running nested guests before we introduce a new
API that makes the current design "fixed".

That said, I assume this is highly unlikely (and many people will
most likely object to what I have written in this mail). So please feel
free to ignore what I have said in this email and go forward with your
approach. Having migration is better than not having migration.

Yes, I am a dreamer. :D

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19 19:21                             ` Jim Mattson
  2017-12-19 19:52                               ` David Hildenbrand
@ 2017-12-19 21:29                               ` Paolo Bonzini
  2018-01-08 10:35                                 ` David Hildenbrand
  1 sibling, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2017-12-19 21:29 UTC (permalink / raw)
  To: Jim Mattson, David Hildenbrand; +Cc: kvm list, David Hildenbrand

On 19/12/2017 20:21, Jim Mattson wrote:
> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
> guest memory. I don't know about qemu, but our userspace agent expects
> guest memory to be quiesced by the time it starts going through its
> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
> ioctl, but is that the best way to handle this? Another reason is that
> it is a bit awkward for SET_NESTED_STATE to require guest memory.
> Again, I don't know about qemu, but our userspace agent does not
> expect any guest memory to be available when it starts going through
> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
> containing the current VMCS12, but is that better than simply
> including the current VMCS12 in the NESTED_STATE payload? Moreover,
> these unpredictable (from the guest's point of view) updates to guest
> memory leave a bad taste in my mouth (much like SMM).

IIRC QEMU has no problem with either, but I think your concerns are
valid.  The active VMCS is processor state, not memory state.  Same for
the host save data in SVM.

The unstructured "blob" of data is not an issue.  If it becomes a
problem, we can always document the structure...

Paolo

> 
> Perhaps qemu doesn't have the same limitations that our userspace
> agent has, and I can certainly see why you would dismiss my concerns
> if you are only interested in qemu as a userspace agent for kvm. At
> the same time, I hope you can understand why I am not excited to be
> drawn down a path that's going to ultimately mean more headaches for
> me in my environment. AFAICT, the proposed API doesn't introduce any
> additional headaches for those that use qemu. The principal objections
> appear to be the "blob" of data, completely unstructured in the eyes
> of the userspace agent, and the redundancy with state already
> extracted by existing APIs. Is that right?
> 
> 
> On Tue, Dec 19, 2017 at 9:40 AM, David Hildenbrand <david@redhat.com> wrote:
>> On 19.12.2017 18:33, David Hildenbrand wrote:
>>> On 19.12.2017 18:26, Jim Mattson wrote:
>>>> Yes, it can be done that way, but what makes this approach technically
>>>> superior to the original API?
>>>
>>> a) not having to migrate data twice
>>> b) not having to think about a proper API to get data in/out
>>>
>>> All you need to know is whether the guest was in nested mode when migrating,
>>> no? That would be a simple flag.
>>>
>>
>> (of course in addition, the vmcs pointer and whether vmxon has been called).
>>
>> But anyhow, if you have good reasons why you want to introduce and
>> maintain a new API, feel free to do so. Most likely I am missing
>> something important here :)
>>
>>
>> --
>>
>> Thanks,
>>
>> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2017-12-19 21:29                               ` Paolo Bonzini
@ 2018-01-08 10:35                                 ` David Hildenbrand
  2018-01-08 17:25                                   ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2018-01-08 10:35 UTC (permalink / raw)
  To: Paolo Bonzini, Jim Mattson; +Cc: kvm list, David Hildenbrand

On 19.12.2017 22:29, Paolo Bonzini wrote:
> On 19/12/2017 20:21, Jim Mattson wrote:
>> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
>> guest memory. I don't know about qemu, but our userspace agent expects
>> guest memory to be quiesced by the time it starts going through its
>> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
>> ioctl, but is that the best way to handle this? Another reason is that
>> it is a bit awkward for SET_NESTED_STATE to require guest memory.
>> Again, I don't know about qemu, but our userspace agent does not
>> expect any guest memory to be available when it starts going through
>> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
>> containing the current VMCS12, but is that better than simply
>> including the current VMCS12 in the NESTED_STATE payload? Moreover,
>> these unpredictable (from the guest's point of view) updates to guest
>> memory leave a bad taste in my mouth (much like SMM).
> 
> IIRC QEMU has no problem with either, but I think your concerns are
> valid.  The active VMCS is processor state, not memory state.  Same for
> the host save data in SVM.
> 
> The unstructured "blob" of data is not an issue.  If it becomes a
> problem, we can always document the structure...

Thinking about it, I agree. It might be simpler/cleaner to transfer the
"loaded" VMCS. But I think we should take care of only transferring data
that actually is CPU state and not special to our current
implementation (e.g. nested_run_pending, I would say, is special to our
current implementation, but we can discuss).

So what I would consider VMX state:
- vmxon
- vmxon_ptr
- vmptr
- cached_vmcs12
- ... ?

> 
> Paolo


-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 10:35                                 ` David Hildenbrand
@ 2018-01-08 17:25                                   ` Jim Mattson
  2018-01-08 17:36                                     ` Paolo Bonzini
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2018-01-08 17:25 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

How do we eliminate nested_run_pending? Do we enforce the invariant
that nested_run_pending is never set on return to userspace, or do we
return an error if GET_NESTED_STATE is called when nested_run_pending
is set?
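
The latter, at least, is cheap to implement (sketch):

	/* Sketch: refuse to checkpoint while an entry is half-committed. */
	if (vmx->nested.nested_run_pending)
		return -EBUSY;

But then userspace has to keep running the vCPU until the bit clears
before it can checkpoint, which is awkward in its own way.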

On Mon, Jan 8, 2018 at 2:35 AM, David Hildenbrand <david@redhat.com> wrote:
> On 19.12.2017 22:29, Paolo Bonzini wrote:
>> On 19/12/2017 20:21, Jim Mattson wrote:
>>> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
>>> guest memory. I don't know about qemu, but our userspace agent expects
>>> guest memory to be quiesced by the time it starts going through its
>>> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
>>> ioctl, but is that the best way to handle this? Another reason is that
>>> it is a bit awkward for SET_NESTED_STATE to require guest memory.
>>> Again, I don't know about qemu, but our userspace agent does not
>>> expect any guest memory to be available when it starts going through
>>> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
>>> containing the current VMCS12, but is that better than simply
>>> including the current VMCS12 in the NESTED_STATE payload? Moreover,
>>> these unpredictable (from the guest's point of view) updates to guest
>>> memory leave a bad taste in my mouth (much like SMM).
>>
>> IIRC QEMU has no problem with either, but I think your concerns are
>> valid.  The active VMCS is processor state, not memory state.  Same for
>> the host save data in SVM.
>>
>> The unstructured "blob" of data is not an issue.  If it becomes a
>> problem, we can always document the structure...
>
> Thinking about it, I agree. It might be simpler/cleaner to transfer the
> "loaded" VMCS. But I think we should take care of only transferring data
> that actually is CPU state and not special to our current
> implementation (e.g. nested_run_pending, I would say, is special to our
> current implementation, but we can discuss).
>
> So what I would consider VMX state:
> - vmxon
> - vmxon_ptr
> - vmptr
> - cached_vmcs12
> - ... ?
>
>>
>> Paolo
>
>
> --
>
> Thanks,
>
> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 17:25                                   ` Jim Mattson
@ 2018-01-08 17:36                                     ` Paolo Bonzini
  2018-01-08 17:59                                       ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2018-01-08 17:36 UTC (permalink / raw)
  To: Jim Mattson, David Hildenbrand; +Cc: kvm list, David Hildenbrand

On 08/01/2018 11:35, David Hildenbrand wrote:
> Thinking about it, I agree. It might be simpler/cleaner to transfer the
> "loaded" VMCS. But I think we should take care of only transferring data
> that actually is CPU state and not special to our current
> implementation (e.g. nested_run_pending, I would say, is special to our
> current implementation, but we can discuss).
> 
> So what I would consider VMX state:
> - vmxon
> - vmxon_ptr
> - vmptr
> - cached_vmcs12
> - ... ?

nested_run_pending is in the same boat as the various
KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
"architectural" state, but it's part of the state machine so it has to
be serialized.
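
In other words, just one more bit in whatever flags word the new state
structure carries, along the lines of (name illustrative):

	#define KVM_VMX_STATE_RUN_PENDING	(1 << 0)	/* illustrative */

	state.flags = 0;
	if (vmx->nested.nested_run_pending)
		state.flags |= KVM_VMX_STATE_RUN_PENDING;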

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 17:36                                     ` Paolo Bonzini
@ 2018-01-08 17:59                                       ` David Hildenbrand
  2018-01-08 18:11                                         ` Paolo Bonzini
  2018-01-08 18:17                                         ` Jim Mattson
  0 siblings, 2 replies; 63+ messages in thread
From: David Hildenbrand @ 2018-01-08 17:59 UTC (permalink / raw)
  To: Paolo Bonzini, Jim Mattson; +Cc: kvm list, David Hildenbrand

On 08.01.2018 18:36, Paolo Bonzini wrote:
> On 08/01/2018 11:35, David Hildenbrand wrote:
>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>> "loaded" VMCS. But I think we should take care of only transferring data
>> that actually is CPU state and not special to our current
>> implementation (e.g. nested_run_pending, I would say, is special to our
>> current implementation, but we can discuss).
>>
>> So what I would consider VMX state:
>> - vmxon
>> - vmxon_ptr
>> - vmptr
>> - cached_vmcs12
>> - ... ?
> 
> nested_run_pending is in the same boat as the various
> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
> "architectural" state, but it's part of the state machine so it has to
> be serialized.

I am wondering if we can get rid of it; in fact, whether we can go out of VMX
mode every time we go to user space. As soon as we put it into the
official VMX migration protocol, we have to support it. Now seems like
the last time we can change it.

I have the following things in mind (unfortunately I don't have time to
look into the details) to make nested_run_pending completely internal state.

1. When going into user space, if we have nested_run_pending=true, set
it to false and rewind the PSW to point again at the VMLAUNCH/VMRESUME
instruction. The next VCPU run will simply continue executing the nested
guest (trying to execute the VMLAUNCH/VMRESUME again). See the rough
sketch after point 2.

2. When going into user space, if we have nested_run_pending=true, set
it to false and fake another VMX exit that has no side effects (if
possible). Something like the NULL intercept we have on s390x.
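
Idea 1 in very rough pseudo-C (vmcs01_rip is hypothetical state that we
would have to start tracking; not claiming this is complete):

	/* Very rough sketch of idea 1. */
	if (vmx->nested.nested_run_pending) {
		vmx->nested.nested_run_pending = false;
		leave_guest_mode(vcpu);		/* back to the L1 context */
		kvm_rip_write(vcpu, vmx->nested.vmcs01_rip);
	}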

But I have no idea how that plays e.g. with KVM_GET_VCPU_EVENTS (again,
unfortunately no time to look into all the details). If we could get 1.
running it would be cool, but I am not sure if it is feasible.

If not, then we most likely will have to stick to it :( And as Paolo
said, migrate it.

Thanks

> 
> Thanks,
> 
> Paolo
> 


-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 17:59                                       ` David Hildenbrand
@ 2018-01-08 18:11                                         ` Paolo Bonzini
  2018-01-08 18:23                                           ` David Hildenbrand
  2018-01-08 18:17                                         ` Jim Mattson
  1 sibling, 1 reply; 63+ messages in thread
From: Paolo Bonzini @ 2018-01-08 18:11 UTC (permalink / raw)
  To: David Hildenbrand, Jim Mattson; +Cc: kvm list, David Hildenbrand

On 08/01/2018 18:59, David Hildenbrand wrote:
> On 08.01.2018 18:36, Paolo Bonzini wrote:
>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>> "loaded" VMCS. But I think we should take care of only transferring data
>>> that actually is CPU state and not special to our current
>>> implementation (e.g. nested_run_pending, I would say, is special to our
>>> current implementation, but we can discuss).
>>>
>>> So what I would consider VMX state:
>>> - vmxon
>>> - vmxon_ptr
>>> - vmptr
>>> - cached_vmcs12
>>> - ... ?
>>
>> nested_run_pending is in the same boat as the various
>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>> "architectural" state, but it's part of the state machine so it has to
>> be serialized.
> 
> I am wondering if we can get rid of it; in fact, whether we can go out of VMX
> mode every time we go to user space.

There are cases where this is not possible, for example if you have a
nested "assigned device" that is emulated by userspace.

Paolo

> As soon as we put it into the
> official VMX migration protocol, we have to support it. Now seems like
> the last time we can change it.
> 
> I have the following things in mind (unfortunately I don't have time to
> look into the details) to make nested_run_pending completely internal state.
> 
> 1. When going into user space, if we have nested_run_pending=true, set
> it to false and rewind the PSW to point again at the VMLAUNCH/VMRESUME
> instruction. The next VCPU run will simply continue executing the nested
> guest (trying to execute the VMLAUNCH/VMRESUME again).
> 
> 2. When going into user space, if we have nested_run_pending=true, set
> it to false and fake another VMX exit that has no side effects (if
> possible). Something like the NULL intercept we have on s390x.
> 
> But I have no idea how that plays e.g. with KVM_GET_VCPU_EVENTS (again,
> unfortunately no time to look into all the details). If we could get 1.
> running it would be cool, but I am not sure if it is feasible.
> 
> If not, then we most likely will have to stick to it :( And as Paolo
> said, migrate it.
> 
> Thanks
> 
>>
>> Thanks,
>>
>> Paolo
>>
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 17:59                                       ` David Hildenbrand
  2018-01-08 18:11                                         ` Paolo Bonzini
@ 2018-01-08 18:17                                         ` Jim Mattson
  1 sibling, 0 replies; 63+ messages in thread
From: Jim Mattson @ 2018-01-08 18:17 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On Mon, Jan 8, 2018 at 9:59 AM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 18:36, Paolo Bonzini wrote:
>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>> "loaded" VMCS. But I think we should take care of only transferring data
>>> that actually is CPU state and not special to our current
>>> implementation (e.g. nested_run_pending, I would say, is special to our
>>> current implementation, but we can discuss).
>>>
>>> So what I would consider VMX state:
>>> - vmxon
>>> - vmxon_ptr
>>> - vmptr
>>> - cached_vmcs12
>>> - ... ?
>>
>> nested_run_pending is in the same boat as the various
>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>> "architectural" state, but it's part of the state machine so it has to
>> be serialized.
>
> I am wondering if we can get rid of it; in fact, whether we can go out of VMX
> mode every time we go to user space. As soon as we put it into the
> official VMX migration protocol, we have to support it. Now seems like
> the last time we can change it.
>
> I have the following things in mind (unfortunately I don't have time to
> look into the details) to make nested_run_pending completely internal state.
>
> 1. When going into user space, if we have nested_run_pending=true, set
> it to false and rewind the PSW to point again at the VMLAUNCH/VMRESUME
> instruction. The next VCPU run will simply continue executing the nested
> guest (trying to execute the VMLAUNCH/VMRESUME again).

This would require some effort, since we've already committed some
state changes that we aren't currently prepared to roll back (e.g. the
VM-entry MSR load list, DR7, etc.).

We need to address these problems anyway, if we want to continue to
defer control field checks to the hardware (as we do now) and not be
completely broken (as we are now).

> 2. When going into user space, if we have nested_run_pending=true, set
> it to false and fake another VMX exit that has no side effects (if
> possible). Something like the NULL intercept we have on s390x.

I don't think this is feasible, basically because there is no such VM-exit.

> But I have no idea how that plays e.g. with KVM_GET_VCPU_EVENTS (again,
> unfortunately no time to look into all the details). If we could get 1.
> running it would be cool, but I am not sure if it is feasible.
>
> If not, then we most likely will have to stick to it :( And as Paolo
> said, migrate it.
>
> Thanks
>
>>
>> Thanks,
>>
>> Paolo
>>
>
>
> --
>
> Thanks,
>
> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 18:11                                         ` Paolo Bonzini
@ 2018-01-08 18:23                                           ` David Hildenbrand
  2018-01-08 20:19                                             ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2018-01-08 18:23 UTC (permalink / raw)
  To: Paolo Bonzini, Jim Mattson; +Cc: kvm list, David Hildenbrand

On 08.01.2018 19:11, Paolo Bonzini wrote:
> On 08/01/2018 18:59, David Hildenbrand wrote:
>> On 08.01.2018 18:36, Paolo Bonzini wrote:
>>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>>> "loaded" VMCS. But I think we should take care of only transferring data
>>>> that actually is CPU state and not special to our current
>>>> implementation (e.g. nested_run_pending, I would say, is special to our
>>>> current implementation, but we can discuss).
>>>>
>>>> So what I would consider VMX state:
>>>> - vmxon
>>>> - vmxon_ptr
>>>> - vmptr
>>>> - cached_vmcs12
>>>> - ... ?
>>>
>>> nested_run_pending is in the same boat as the various
>>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>>> "architectural" state, but it's part of the state machine so it has to
>>> be serialized.
>>
>> I am wondering if we can get rid of it; in fact, whether we can go out of VMX
>> mode every time we go to user space.
> 
> There are cases where this is not possible, for example if you have a
> nested "assigned device" that is emulated by userspace.

You mean L0 emulates a device for L1 (in userspace). L1 assigns this
device as "assigned device" to L2. Now if L2 tries to access the device,
we have to exit to userspace in L0 to handle the access, therefore
requiring us to stay in nested mode so we can properly process the
request when re-entering KVM from userspace?

Didn't know that was even possible.

> 
> Paolo


-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 18:23                                           ` David Hildenbrand
@ 2018-01-08 20:19                                             ` Jim Mattson
  2018-01-08 20:27                                               ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2018-01-08 20:19 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

Even more trivially, what if the L2 VM is configured never to leave
VMX non-root operation? Then we never exit to userspace?

On Mon, Jan 8, 2018 at 10:23 AM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 19:11, Paolo Bonzini wrote:
>> On 08/01/2018 18:59, David Hildenbrand wrote:
>>> On 08.01.2018 18:36, Paolo Bonzini wrote:
>>>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>>>> "loaded" VMCS. But I think we should take care of only transferring data
>>>>> that actually is CPU state and not special to our current
>>>>> implementation (e.g. nested_run_pending, I would say, is special to our
>>>>> current implementation, but we can discuss).
>>>>>
>>>>> So what I would consider VMX state:
>>>>> - vmxon
>>>>> - vmxon_ptr
>>>>> - vmptr
>>>>> - cached_vmcs12
>>>>> - ... ?
>>>>
>>>> nested_run_pending is in the same boat as the various
>>>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>>>> "architectural" state, but it's part of the state machine so it has to
>>>> be serialized.
>>>
>>> I am wondering if we can get rid of it; in fact, whether we can go out of VMX
>>> mode every time we go to user space.
>>
>> There are cases where this is not possible, for example if you have a
>> nested "assigned device" that is emulated by userspace.
>
> You mean L0 emulates a device for L1 (in userspace). L1 assigns this
> device as "assigned device" to L2. Now if L2 tries to access the device,
> we have to exit to userspace in L0 to handle the access, therefore
> requiring us to stay in nested mode so we can properly process the
> request when re-entering KVM from userspace?
>
> Didn't know that was even possible.
>
>>
>> Paolo
>
>
> --
>
> Thanks,
>
> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 20:19                                             ` Jim Mattson
@ 2018-01-08 20:27                                               ` David Hildenbrand
  2018-01-08 20:59                                                 ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2018-01-08 20:27 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On 08.01.2018 21:19, Jim Mattson wrote:
> Even more trivially, what if the L2 VM is configured never to leave
> VMX non-root operation? Then we never exit to userspace?

Well we would make the PC point at the VMLAUNCH. Then exit to userspace.

When returning from userspace, try to execute VMLAUNCH again, leading to
an intercept and us going back into nested mode.

A question would be what happens to interrupts. They could be
delivered, but it would look like the guest was not executed (as we are
pointing at the VMLAUNCH instruction). Not sure if this is a purely
theoretical problem.

But the "assigned devices" thing is a real difference to s390x.
Assigning devices requires special SIE extensions we don't implement for
vSIE. And there is no such thing as MMIO.

And if there is one case where we need it (assigned devices) - even
though we might find a way around it - it is very likely that there is more.

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 20:27                                               ` David Hildenbrand
@ 2018-01-08 20:59                                                 ` Jim Mattson
  2018-01-08 21:19                                                   ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2018-01-08 20:59 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On Mon, Jan 8, 2018 at 12:27 PM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 21:19, Jim Mattson wrote:
>> Even more trivially, what if the L2 VM is configured never to leave
>> VMX non-root operation? Then we never exit to userspace?
>
> Well we would make the PC point at the VMLAUNCH. Then exit to userspace.

That doesn't work, for so many reasons.

1. It's not sufficient to just roll back the instruction pointer. You
also need to roll back CS, CR0, CR3 (and possibly the PDPTEs), and CR4,
so that the virtual address of the instruction pointer will actually
map to the same physical address as it did the first time. If the page
tables have changed, or the L1 guest has overwritten the
VMLAUNCH/VMRESUME instruction, then you're out of luck.
2. As you point out, interrupts are a problem. Interrupts can't be
delivered in this context, because the vCPU shouldn't be in this
context (and the guest may have already observed the transition to
L2).
3. I'm assuming that you're planning to store most of the current L2
state in the cached VMCS12, at least where you can. Even so, the next
"VM-entry" can't perform any of the normal VM-entry actions that would
clobber the current L2 state that isn't in the cached VMCS12 (e.g.
processing the VM-entry MSR load list). So, you need to have a flag
indicating that this isn't a real VM-entry. That's no better than
carrying the nested_run_pending flag.

> When returning from userspace, try to execute VMLAUNCH again, leading to
> an intercept and us going back into nested mode.
>
> A question would be what happens to interrupts. They could be
> delivered, but it would look like the guest was not executed (as we are
> pointing at the VMLAUNCH instruction). Not sure if this is a purely
> theoretical problem.
>
> But the "assigned devices" thing is a real difference to s390x.
> Assigning devices requires special SIE extensions we don't implement for
> vSIE. And there is no such thing as MMIO.
>
> And if there is one case where we need it (assigned devices) - even
> though we might find a way around it - it is very likely that there is more.
>
> --
>
> Thanks,
>
> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 20:59                                                 ` Jim Mattson
@ 2018-01-08 21:19                                                   ` David Hildenbrand
  2018-01-08 21:40                                                     ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2018-01-08 21:19 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On 08.01.2018 21:59, Jim Mattson wrote:
> On Mon, Jan 8, 2018 at 12:27 PM, David Hildenbrand <david@redhat.com> wrote:
>> On 08.01.2018 21:19, Jim Mattson wrote:
>>> Even more trivially, what if the L2 VM is configured never to leave
>>> VMX non-root operation? Then we never exit to userspace?
>>
>> Well we would make the PC point at the VMLAUNCH. Then exit to userspace.
> 
> That doesn't work, for so many reasons.
> 
> 1. It's not sufficient to just roll back the instruction pointer. You
> also need to roll back CS, CR0, CR3 (and possibly the PDPTEs), and CR4,
> so that the virtual address of the instruction pointer will actually
> map to the same physical address as it did the first time.

I expect these values to be the same once leaving non-root mode (as the
CPU itself hasn't executed anything except the nested guest). But yes, it
could be tricky.

> If the page
> tables have changed, or the L1 guest has overwritten the
> VMLAUNCH/VMRESUME instruction, then you're out of luck.

Page tables getting changed by other CPUs is actually a good point. But
I would consider both as "theoretical" problems. At least compared to
the interrupt stuff, which could also happen on guests behaving in a
more sane way.

> 2. As you point out, interrupts are a problem. Interrupts can't be
> delivered in this context, because the vCPU shouldn't be in this
> context (and the guest may have already observed the transition to
> L2).

Yes, I also see this as the major problem.

> 3. I'm assuming that you're planning to store most of the current L2
> state in the cached VMCS12, at least where you can. Even so, the next
> "VM-entry" can't perform any of the normal VM-entry actions that would
> clobber the current L2 state that isn't in the cached VMCS12 (e.g.
> processing the VM-entry MSR load list). So, you need to have a flag
> indicating that this isn't a real VM-entry. That's no better than
> carrying the nested_run_pending flag.

Not sure if that would really be necessary (would have to look into the
details first). But it sounds like nested_run_pending is unavoidable on
x86. So I'd better get used to QEMU dealing with nested CPU state (which
is somehow scary to me - an emulator getting involved in nested
execution - what could go wrong :) )

Good we talked about it (and thanks for your time). I learned a lot today!

>>
>> --
>>
>> Thanks,
>>
>> David / dhildenb


-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 21:19                                                   ` David Hildenbrand
@ 2018-01-08 21:40                                                     ` Jim Mattson
  2018-01-08 21:46                                                       ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2018-01-08 21:40 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On Mon, Jan 8, 2018 at 1:19 PM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 21:59, Jim Mattson wrote:
>> On Mon, Jan 8, 2018 at 12:27 PM, David Hildenbrand <david@redhat.com> wrote:
>>> On 08.01.2018 21:19, Jim Mattson wrote:
>>>> Even more trivially, what if the L2 VM is configured never to leave
>>>> VMX non-root operation? Then we never exit to userspace?
>>>
>>> Well we would make the PC point at the VMLAUNCH. Then exit to userspace.
>>
>> That doesn't work, for so many reasons.
>>
>> 1. It's not sufficient to just roll back the instruction pointer. You
>> also need to roll back CS, CR0, CR3 (and possibly the PDPTEs), and CR4,
>> so that the virtual address of the instruction pointer will actually
>> map to the same physical address as it did the first time.
>
> I expect these values to be the same once leaving non-root mode (as the
> CPU itself hasn't executed anything except the nested guest). But yes, it
> could be tricky.

The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
so if we simply restored the vmcs01 state, that would take care of
most of the rollback issues, as long as we don't deliver any
interrupts in this context. However, I would like to see the
vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
with just one VMCB.

>> If the page
>> tables have changed, or the L1 guest has overwritten the
>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>
> Page tables getting changed by other CPUs is actually a good point. But
> I would consider both as "theoretical" problems. At least compared to
> the interrupt stuff, which could also happen on guests behaving in a
> more sane way.

My preference is for solutions that are architecturally correct,
thereby solving the theoretical problems as well as the empirical
ones. However, I grant that the Linux community leans the other way in
general.

>> 2. As you point out, interrupts are a problem. Interrupts can't be
>> delivered in this context, because the vCPU shouldn't be in this
>> context (and the guest may have already observed the transition to
>> L2).
>
> Yes, I also see this as the major problem.
>
>> 3. I'm assuming that you're planning to store most of the current L2
>> state in the cached VMCS12, at least where you can. Even so, the next
>> "VM-entry" can't perform any of the normal VM-entry actions that would
>> clobber the current L2 state that isn't in the cached VMCS12 (e.g.
>> processing the VM-entry MSR load list). So, you need to have a flag
>> indicating that this isn't a real VM-entry. That's no better than
>> carrying the nested_run_pending flag.
>
> Not sure if that would really be necessary (would have to look into the
> details first). But it sounds like nested_run_pending is unavoidable on
> x86. So I'd better get used to QEMU dealing with nested CPU state (which
> is somehow scary to me - an emulator getting involved in nested
> execution - what could go wrong :) )

For save/restore, QEMU doesn't actually have to know what the flag
means. It just has to pass it on. (Our userspace agent doesn't know
anything about the meaning of the flags in the kvm_vmx_state
structure.)

> Good we talked about it (and thanks for your time). I learned a lot today!
>
>>>
>>> --
>>>
>>> Thanks,
>>>
>>> David / dhildenb
>
>
> --
>
> Thanks,
>
> David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 21:40                                                     ` Jim Mattson
@ 2018-01-08 21:46                                                       ` David Hildenbrand
  2018-01-08 21:55                                                         ` Jim Mattson
  0 siblings, 1 reply; 63+ messages in thread
From: David Hildenbrand @ 2018-01-08 21:46 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

> The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
> so if we simply restored the vmcs01 state, that would take care of
> most of the rollback issues, as long as we don't deliver any
> interrupts in this context. However, I would like to see the
> vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
> with just one VMCB.

Interesting point, might make things easier for VMX.

> 
>>> If the page
>>> tables have changed, or the L1 guest has overwritten the
>>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>>
>> Page tables getting changed by other CPUs is actually a good point. But
>> I would consider both as "theoretical" problems. At least compared to
>> the interrupt stuff, which could also happen on guests behaving in a
>> more sane way.
> 
> My preference is for solutions that are architecturally correct,
> thereby solving the theoretical problems as well as the empirical
> ones. However, I grant that the Linux community leans the other way in
> general.

I usually agree, unless it makes the code horribly complicated without
any real benefit. (e.g. for corner cases like this one: a CPU modifying
instruction text of another CPU which is currently executing it)

>>> 3. I'm assuming that you're planning to store most of the current L2
>>> state in the cached VMCS12, at least where you can. Even so, the next
>>> "VM-entry" can't perform any of the normal VM-entry actions that would
>>> clobber the current L2 state that isn't in the cached VMCS12 (e.g.
>>> processing the VM-entry MSR load list). So, you need to have a flag
>>> indicating that this isn't a real VM-entry. That's no better than
>>> carrying the nested_run_pending flag.
>>
>> Not sure if that would really be necessary (would have to look into the
>> details first). But it sounds like nested_run_pending is unavoidable on
>> x86. So I'd better get used to QEMU dealing with nested CPU state (which
>> is somehow scary to me - an emulator getting involved in nested
>> execution - what could go wrong :) )
> 
> For save/restore, QEMU doesn't actually have to know what the flag
> means. It just has to pass it on. (Our userspace agent doesn't know
> anything about the meaning of the flags in the kvm_vmx_state
> structure.)

I agree on save/restore. My comment was rather about involving an L0
emulator when running an L2 guest. But as Paolo pointed out, this can't
really be avoided.

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 21:46                                                       ` David Hildenbrand
@ 2018-01-08 21:55                                                         ` Jim Mattson
  2018-01-09  8:19                                                           ` David Hildenbrand
  0 siblings, 1 reply; 63+ messages in thread
From: Jim Mattson @ 2018-01-08 21:55 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On Mon, Jan 8, 2018 at 1:46 PM, David Hildenbrand <david@redhat.com> wrote:
>> The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
>> so if we simply restored the vmcs01 state, that would take care of
>> most of the rollback issues, as long as we don't deliver any
>> interrupts in this context. However, I would like to see the
>> vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
>> with just one VMCB.
>
> Interesting point, might make things easier for VMX.
>
>>
>>>> If the page
>>>> tables have changed, or the L1 guest has overwritten the
>>>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>>>
>>> Page tables getting changed by other CPUs is actually a good point. But
>>> I would consider both as "theoretical" problems. At least compared to
>>> the interrupt stuff, which could also happen on guests behaving in a
>>> more sane way.
>>
>> My preference is for solutions that are architecturally correct,
>> thereby solving the theoretical problems as well as the empirical
>> ones. However, I grant that the Linux community leans the other way in
>> general.
>
> I usually agree, unless it makes the code horribly complicated without
> any real benefit (e.g. for corner cases like this one: a CPU modifying
> the instruction text that another CPU is currently executing).

I don't feel that one additional bit of serialized state is horribly
complicated. It's aesthetically unpleasant, to be sure, but not
horribly complicated. And it's considerably less complicated than your
proposal. :-)
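
For perspective, the serialized structure amounts to roughly this
(illustrative layout only; the authoritative definition is whatever the
patch carries, and the flag bit assignment here is an assumption):

#include <linux/types.h>

struct kvm_vmx_state_sketch {
	__u64 vmxon_ptr;      /* VMXON region GPA, or -1 if none */
	__u64 current_vmptr;  /* current VMCS GPA, or -1 if none */
	__u32 flags;          /* e.g. bit 0: nested_run_pending */
	__u32 data_size;
	__u8  data[];         /* cached current VMCS contents */
};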

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE
  2018-01-08 21:55                                                         ` Jim Mattson
@ 2018-01-09  8:19                                                           ` David Hildenbrand
  0 siblings, 0 replies; 63+ messages in thread
From: David Hildenbrand @ 2018-01-09  8:19 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Paolo Bonzini, kvm list, David Hildenbrand

On 08.01.2018 22:55, Jim Mattson wrote:
> On Mon, Jan 8, 2018 at 1:46 PM, David Hildenbrand <david@redhat.com> wrote:
>>> The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
>>> so if we simply restored the vmcs01 state, that would take care of
>>> most of the rollback issues, as long as we don't deliver any
>>> interrupts in this context. However, I would like to see the
>>> vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
>>> with just one VMCB.
>>
>> Interesting point, might make things easier for VMX.
>>
>>>
>>>>> If the page
>>>>> tables have changed, or the L1 guest has overwritten the
>>>>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>>>>
>>>> Page tables getting changed by other CPUs is actually a good point. But
>>>> I would consider both as "theoretical" problems. At least compared to
>>>> the interrupt stuff, which could also happen on guests behaving in a
>>>> more sane way.
>>>
>>> My preference is for solutions that are architecturally correct,
>>> thereby solving the theoretical problems as well as the empirical
>>> ones. However, I grant that the Linux community leans the other way in
>>> general.
>>
>> I usually agree, unless it makes the code horribly complicated without
>> any real benefit (e.g. for corner cases like this one: a CPU modifying
>> the instruction text that another CPU is currently executing).
> 
> I don't feel that one additional bit of serialized state is horribly
> complicated. It's aesthetically unpleasant, to be sure, but not
> horribly complicated. And it's considerably less complicated than your
> proposal. :-)

Well, nested_run_pending is just the tip of the iceberg of the (in my
opinion complicated) part: exiting to user space with nested state,
exposing all VCPU ioctls to it.

But as we learned, it's the little things that cause big problems :)

-- 

Thanks,

David / dhildenb

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2018-01-09  8:19 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-30 20:03 [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state Jim Mattson
2016-11-30 20:03 ` [PATCH 1/8] kvm: nVMX: Prepare for checkpointing L2 state Jim Mattson
2016-11-30 20:03 ` [PATCH 2/8] kvm: nVMX: Refactor handle_vmon() Jim Mattson
2016-11-30 20:03 ` [PATCH 3/8] kvm: nVMX: Refactor handle_vmptrld() Jim Mattson
2016-11-30 20:03 ` [PATCH 4/8] kvm: nVMX: Refactor nested_get_vmcs12_pages() Jim Mattson
2016-11-30 20:03 ` [PATCH 5/8] kvm: nVMX: Split VMCS checks from nested_vmx_run() Jim Mattson
2016-11-30 20:03 ` [PATCH 6/8] kvm: nVMX: Refactor nested_vmx_run() Jim Mattson
2016-11-30 20:03 ` [PATCH 7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE Jim Mattson
2016-12-09 15:26   ` Paolo Bonzini
2016-12-09 15:55     ` David Hildenbrand
2016-12-09 19:26       ` Paolo Bonzini
2016-12-12 14:14         ` David Hildenbrand
2017-02-15 11:56     ` Paolo Bonzini
2017-02-15 16:06       ` Jim Mattson
2017-02-15 16:14         ` Paolo Bonzini
2017-02-15 16:58           ` Jim Mattson
2017-02-15 17:37             ` Paolo Bonzini
2017-02-15 18:19               ` Jim Mattson
2017-02-15 21:28                 ` Paolo Bonzini
2017-12-19  3:07                   ` Jim Mattson
2017-12-19 10:35                     ` David Hildenbrand
2017-12-19 17:26                       ` Jim Mattson
2017-12-19 17:33                         ` David Hildenbrand
2017-12-19 17:40                           ` David Hildenbrand
2017-12-19 19:21                             ` Jim Mattson
2017-12-19 19:52                               ` David Hildenbrand
2017-12-19 21:29                               ` Paolo Bonzini
2018-01-08 10:35                                 ` David Hildenbrand
2018-01-08 17:25                                   ` Jim Mattson
2018-01-08 17:36                                     ` Paolo Bonzini
2018-01-08 17:59                                       ` David Hildenbrand
2018-01-08 18:11                                         ` Paolo Bonzini
2018-01-08 18:23                                           ` David Hildenbrand
2018-01-08 20:19                                             ` Jim Mattson
2018-01-08 20:27                                               ` David Hildenbrand
2018-01-08 20:59                                                 ` Jim Mattson
2018-01-08 21:19                                                   ` David Hildenbrand
2018-01-08 21:40                                                     ` Jim Mattson
2018-01-08 21:46                                                       ` David Hildenbrand
2018-01-08 21:55                                                         ` Jim Mattson
2018-01-09  8:19                                                           ` David Hildenbrand
2018-01-08 18:17                                         ` Jim Mattson
2017-12-19 12:45                     ` Paolo Bonzini
2016-11-30 20:03 ` [PATCH 8/8] kvm: nVMX: Defer gpa->hpa lookups for set_vmx_state Jim Mattson
2016-12-09 15:35   ` Paolo Bonzini
2016-12-09 16:26     ` Jim Mattson
2016-12-10  7:48       ` Paolo Bonzini
2016-12-06  9:04 ` [PATCH 0/8] kvm: nVMX: Checkpoint/restore support for VMX state David Hildenbrand
2016-12-06 19:02   ` Jim Mattson
2016-12-09 15:28     ` Paolo Bonzini
2016-12-09 16:16       ` Jim Mattson
2016-12-09 19:15         ` Paolo Bonzini
2017-02-15 19:30           ` Jim Mattson
2017-02-15 21:26             ` Paolo Bonzini
2017-02-15 22:00               ` Jim Mattson
2017-02-15 22:05                 ` Paolo Bonzini
2017-02-15 22:49                   ` Jim Mattson
2017-02-15 23:09                     ` Paolo Bonzini
2017-02-08 16:24 ` Paolo Bonzini
2017-02-14 22:17   ` Jim Mattson
2017-02-15 10:15     ` Paolo Bonzini
2017-02-15 17:02       ` Jim Mattson
2017-02-15 17:31         ` Paolo Bonzini
