All of lore.kernel.org
 help / color / mirror / Atom feed
* Nested VMX support v3
@ 2009-10-15 14:41 oritw
  2009-10-15 14:41 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
                   ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: oritw @ 2009-10-15 14:41 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, abelg, muli, aliguori, mdday

Avi,
We have addressed all of the comments, please apply.

The following patches implement nested VMX support. The patches enable a guest
to use the VMX APIs in order to run its own nested guest (i.e., enable running
other hypervisors which use VMX under KVM). The current patches support running
Linux under a nested KVM using shadow page table (with bypass_guest_pf
disabled). SMP support was fixed.  Reworking EPT support to mesh cleanly with
the current shadow paging design per Avi's comments is a work-in-progress.  

The current patches only support a single nested hypervisor, which can only run
a single guest (multiple guests are work in progress). Only 64-bit nested
hypervisors are supported.

Additional patches for running Windows under nested KVM, and Linux under nested
VMware server(!), are currently running in the lab. We are in the process of
forward-porting those patches to -tip.

This patches were written by:
     Orit Wasserman, oritw@il.ibm.com
     Ben-Ami Yassor, benami@il.ibm.com
     Abel Gordon, abelg@il.ibm.com
     Muli Ben-Yehuda, muli@il.ibm.com
     
With contributions by:
     Anthony Liguori, aliguori@us.ibm.com
     Mike Day, mdday@us.ibm.com

This work was inspired by the nested SVM support by Alexander Graf and Joerg
Roedel.

Changes since v2:
	Added check to nested_vmx_get_msr.
	Static initialization of the vmcs_field_to_offset_table array.
	Use the memory allocated by L1 for VMCS12 to store the shadow vmcs.
	Some optimization to the prepare_vmcs_12 function.

vpid allocation will be updated with the multiguest support (work in progress).
We are working on fixing the cr0.TS handling, it works for nested kvm by not 
for vmware server.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff
  2009-10-15 14:41 Nested VMX support v3 oritw
@ 2009-10-15 14:41 ` oritw
  2009-10-15 14:41   ` [PATCH 2/5] Nested VMX patch 2 implements vmclear oritw
  2009-10-20  4:00   ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
  2009-10-19 10:47 ` Nested VMX support v3 Gleb Natapov
  2009-10-20  3:30 ` Avi Kivity
  2 siblings, 2 replies; 35+ messages in thread
From: oritw @ 2009-10-15 14:41 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/svm.c |    3 -
 arch/x86/kvm/vmx.c |  217 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |    6 +-
 arch/x86/kvm/x86.h |    2 +
 4 files changed, 222 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2df9b45..3c1f22a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -124,9 +124,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 78101dd..71bd91a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -67,6 +67,11 @@ struct vmcs {
 	char data[0];
 };
 
+struct nested_vmx {
+	/* Has the level1 guest done vmxon? */
+	bool vmxon;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
@@ -114,6 +119,9 @@ struct vcpu_vmx {
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
+
+	/* Nested vmx */
+	struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -967,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 }
 
 /*
+ * Handles msr read for nested virtualization
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+			      u64 *pdata)
+{
+	u64 vmx_msr = 0;
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		*pdata = 0;
+		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
+		*pdata = (vmx_msr & 0x00ffffcfffffffff);
+		break;
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
+		*pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	{
+		u32 vmx_msr_high, vmx_msr_low;
+		u32 control = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+			CPU_BASED_CR8_LOAD_EXITING |
+			CPU_BASED_CR8_STORE_EXITING |
+#endif
+			CPU_BASED_CR3_LOAD_EXITING |
+			CPU_BASED_CR3_STORE_EXITING |
+			CPU_BASED_USE_IO_BITMAPS |
+			CPU_BASED_MOV_DR_EXITING |
+			CPU_BASED_USE_TSC_OFFSETING |
+			CPU_BASED_INVLPG_EXITING |
+			CPU_BASED_TPR_SHADOW |
+			CPU_BASED_USE_MSR_BITMAPS |
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+
+		control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+		control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+		*pdata = (CPU_BASED_HLT_EXITING & control) |
+#ifdef CONFIG_X86_64
+			(CPU_BASED_CR8_LOAD_EXITING & control) |
+			(CPU_BASED_CR8_STORE_EXITING & control) |
+#endif
+			(CPU_BASED_CR3_LOAD_EXITING & control) |
+			(CPU_BASED_CR3_STORE_EXITING & control) |
+			(CPU_BASED_USE_IO_BITMAPS & control) |
+			(CPU_BASED_MOV_DR_EXITING & control) |
+			(CPU_BASED_USE_TSC_OFFSETING & control) |
+			(CPU_BASED_INVLPG_EXITING & control) ;
+
+		if (cpu_has_secondary_exec_ctrls())
+			*pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		if (vm_need_tpr_shadow(vcpu->kvm))
+			*pdata |= CPU_BASED_TPR_SHADOW;
+		break;
+	}
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = 0;
+#ifdef CONFIG_X86_64
+		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = 0;
+		if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+			*pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		*pdata = 0;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1005,6 +1102,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	default:
+		if (nested &&
+		    !nested_vmx_get_msr(vcpu, msr_index, &data))
+			break;
 		vmx_load_host_state(to_vmx(vcpu));
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
@@ -1019,6 +1119,27 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 }
 
 /*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		if ((data & (FEATURE_CONTROL_LOCKED |
+			     FEATURE_CONTROL_VMXON_ENABLED))
+		    != (FEATURE_CONTROL_LOCKED |
+			FEATURE_CONTROL_VMXON_ENABLED))
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1064,6 +1185,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		}
 		/* Otherwise falls through to kvm_set_msr_common */
 	default:
+		if (nested &&
+		    !nested_vmx_set_msr(vcpu, msr_index, data))
+			break;
 		vmx_load_host_state(vmx);
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -3095,12 +3219,101 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Check to see if vcpu can execute vmx command
+ * Inject the corrseponding exception
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_msr_entry *msr;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!vmx->nested.vmxon) {
+		printk(KERN_DEBUG "%s: vmx not on\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	msr = find_msr_entry(vmx, MSR_EFER);
+
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+		 ((msr->data & EFER_LMA) && !cs.l)) {
+		printk(KERN_DEBUG "%s: invalid mode cs.l %d lma %llu\n",
+		       __func__, cs.l, msr->data & EFER_LMA);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
 
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmx->nested.vmxon = 0;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested) {
+		printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+#ifdef CONFIG_X86_64
+	if (((find_msr_entry(to_vmx(vcpu),
+			     MSR_EFER)->data & EFER_LMA) && !cs.l)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+#endif
+	if (vmx_get_cpl(vcpu)) {
+		printk(KERN_INFO "%s no permission\n", __func__);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	vmx->nested.vmxon = 1;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3376,8 +3589,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b3a169..9c39092 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,6 +87,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
 	{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -373,7 +377,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return;
 	}
 
-	if (cr4 & X86_CR4_VMXE) {
+	if (cr4 & X86_CR4_VMXE && !nested) {
 		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 		kvm_inject_gp(vcpu, 0);
 		return;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea5..57204cb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
 
+extern int nested;
+
 #endif
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 2/5] Nested VMX patch 2 implements vmclear
  2009-10-15 14:41 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
@ 2009-10-15 14:41   ` oritw
  2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
  2009-10-20  4:06     ` [PATCH 2/5] Nested VMX patch 2 implements vmclear Avi Kivity
  2009-10-20  4:00   ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
  1 sibling, 2 replies; 35+ messages in thread
From: oritw @ 2009-10-15 14:41 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 71bd91a..411cbdb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,15 +61,26 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
-struct vmcs {
-	u32 revision_id;
-	u32 abort;
-	char data[0];
+struct __attribute__ ((__packed__)) level_state {
+	/* Has the level1 guest done vmclear? */
+	bool vmclear;
 };
 
 struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
+
+	/*
+	 * Level 2 state : includes vmcs,registers and
+	 * a copy of vmcs12 for vmread/vmwrite
+	 */
+	struct level_state *l2_state;
+};
+
+struct vmcs {
+	u32 revision_id;
+	u32 abort;
+	char data[0];
 };
 
 struct vcpu_vmx {
@@ -186,6 +197,8 @@ static struct kvm_vmx_segment_field {
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
+static int create_l2_state(struct kvm_vcpu *vcpu);
+
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
  * away by decrementing the array size.
@@ -1293,6 +1306,30 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
+struct level_state *create_state(void)
+{
+	struct level_state *state = NULL;
+
+	state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+	if (!state) {
+		printk(KERN_INFO "Error create level state\n");
+		return NULL;
+	}
+	return state;
+}
+
+int create_l2_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state) {
+		vmx->nested.l2_state = create_state();
+		if (!vmx->nested.l2_state)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
 
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks.
@@ -3261,6 +3298,27 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+	vmx_set_rflags(vcpu, rflags);
+}
+
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	to_vmx(vcpu)->nested.l2_state->vmclear = 1;
+
+	skip_emulated_instruction(vcpu);
+	clear_rflags_cf_zf(vcpu);
+
+	return 1;
+}
+
 static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3310,6 +3368,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 
 	vmx->nested.vmxon = 1;
 
+	create_l2_state(vcpu);
+
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
@@ -3582,7 +3642,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
+	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
 	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-10-15 14:41   ` [PATCH 2/5] Nested VMX patch 2 implements vmclear oritw
@ 2009-10-15 14:41     ` oritw
  2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
                         ` (3 more replies)
  2009-10-20  4:06     ` [PATCH 2/5] Nested VMX patch 2 implements vmclear Avi Kivity
  1 sibling, 4 replies; 35+ messages in thread
From: oritw @ 2009-10-15 14:41 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  468 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/x86.c |    3 +-
 2 files changed, 459 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 411cbdb..8c186e0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,20 +61,168 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+
+struct __attribute__ ((__packed__)) shadow_vmcs {
+	u32 revision_id;
+	u32 abort;
+	u16 virtual_processor_id;
+	u16 guest_es_selector;
+	u16 guest_cs_selector;
+	u16 guest_ss_selector;
+	u16 guest_ds_selector;
+	u16 guest_fs_selector;
+	u16 guest_gs_selector;
+	u16 guest_ldtr_selector;
+	u16 guest_tr_selector;
+	u16 host_es_selector;
+	u16 host_cs_selector;
+	u16 host_ss_selector;
+	u16 host_ds_selector;
+	u16 host_fs_selector;
+	u16 host_gs_selector;
+	u16 host_tr_selector;
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+	u64 vm_exit_msr_store_addr;
+	u64 vm_exit_msr_load_addr;
+	u64 vm_entry_msr_load_addr;
+	u64 tsc_offset;
+	u64 virtual_apic_page_addr;
+	u64 apic_access_addr;
+	u64 ept_pointer;
+	u64 guest_physical_address;
+	u64 vmcs_link_pointer;
+	u64 guest_ia32_debugctl;
+	u64 guest_ia32_pat;
+	u64 guest_pdptr0;
+	u64 guest_pdptr1;
+	u64 guest_pdptr2;
+	u64 guest_pdptr3;
+	u64 host_ia32_pat;
+	u32 pin_based_vm_exec_control;
+	u32 cpu_based_vm_exec_control;
+	u32 exception_bitmap;
+	u32 page_fault_error_code_mask;
+	u32 page_fault_error_code_match;
+	u32 cr3_target_count;
+	u32 vm_exit_controls;
+	u32 vm_exit_msr_store_count;
+	u32 vm_exit_msr_load_count;
+	u32 vm_entry_controls;
+	u32 vm_entry_msr_load_count;
+	u32 vm_entry_intr_info_field;
+	u32 vm_entry_exception_error_code;
+	u32 vm_entry_instruction_len;
+	u32 tpr_threshold;
+	u32 secondary_vm_exec_control;
+	u32 vm_instruction_error;
+	u32 vm_exit_reason;
+	u32 vm_exit_intr_info;
+	u32 vm_exit_intr_error_code;
+	u32 idt_vectoring_info_field;
+	u32 idt_vectoring_error_code;
+	u32 vm_exit_instruction_len;
+	u32 vmx_instruction_info;
+	u32 guest_es_limit;
+	u32 guest_cs_limit;
+	u32 guest_ss_limit;
+	u32 guest_ds_limit;
+	u32 guest_fs_limit;
+	u32 guest_gs_limit;
+	u32 guest_ldtr_limit;
+	u32 guest_tr_limit;
+	u32 guest_gdtr_limit;
+	u32 guest_idtr_limit;
+	u32 guest_es_ar_bytes;
+	u32 guest_cs_ar_bytes;
+	u32 guest_ss_ar_bytes;
+	u32 guest_ds_ar_bytes;
+	u32 guest_fs_ar_bytes;
+	u32 guest_gs_ar_bytes;
+	u32 guest_ldtr_ar_bytes;
+	u32 guest_tr_ar_bytes;
+	u32 guest_interruptibility_info;
+	u32 guest_activity_state;
+	u32 guest_sysenter_cs;
+	u32 host_ia32_sysenter_cs;
+	unsigned long cr0_guest_host_mask;
+	unsigned long cr4_guest_host_mask;
+	unsigned long cr0_read_shadow;
+	unsigned long cr4_read_shadow;
+	unsigned long cr3_target_value0;
+	unsigned long cr3_target_value1;
+	unsigned long cr3_target_value2;
+	unsigned long cr3_target_value3;
+	unsigned long exit_qualification;
+	unsigned long guest_linear_address;
+	unsigned long guest_cr0;
+	unsigned long guest_cr3;
+	unsigned long guest_cr4;
+	unsigned long guest_es_base;
+	unsigned long guest_cs_base;
+	unsigned long guest_ss_base;
+	unsigned long guest_ds_base;
+	unsigned long guest_fs_base;
+	unsigned long guest_gs_base;
+	unsigned long guest_ldtr_base;
+	unsigned long guest_tr_base;
+	unsigned long guest_gdtr_base;
+	unsigned long guest_idtr_base;
+	unsigned long guest_dr7;
+	unsigned long guest_rsp;
+	unsigned long guest_rip;
+	unsigned long guest_rflags;
+	unsigned long guest_pending_dbg_exceptions;
+	unsigned long guest_sysenter_esp;
+	unsigned long guest_sysenter_eip;
+	unsigned long host_cr0;
+	unsigned long host_cr3;
+	unsigned long host_cr4;
+	unsigned long host_fs_base;
+	unsigned long host_gs_base;
+	unsigned long host_tr_base;
+	unsigned long host_gdtr_base;
+	unsigned long host_idtr_base;
+	unsigned long host_ia32_sysenter_esp;
+	unsigned long host_ia32_sysenter_eip;
+	unsigned long host_rsp;
+	unsigned long host_rip;
+};
+
 struct __attribute__ ((__packed__)) level_state {
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+	u16 vpid;
+	u64 shadow_efer;
+	unsigned long cr2;
+	unsigned long cr3;
+	unsigned long cr4;
+	unsigned long cr8;
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+
+	struct shadow_vmcs *shadow_vmcs;
+
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
 };
 
 struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
-
+	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
+	u64 vmptr;
 	/*
 	 * Level 2 state : includes vmcs,registers and
 	 * a copy of vmcs12 for vmread/vmwrite
 	 */
 	struct level_state *l2_state;
+	/* Level 1 state for switching to level 2 and back */
+	struct level_state *l1_state;
 };
 
 struct vmcs {
@@ -140,6 +288,25 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+				    u64 vmcs_addr)
+{
+	struct page *vmcs_page = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(vmcs_page)) {
+		printk(KERN_ERR "%s error allocating page \n", __func__);
+		kvm_release_page_clean(vmcs_page);
+		return NULL;
+	}
+
+	return vmcs_page;
+
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 
@@ -197,6 +364,8 @@ static struct kvm_vmx_segment_field {
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
+static int create_l1_state(struct kvm_vcpu *vcpu);
 static int create_l2_state(struct kvm_vcpu *vcpu);
 
 /*
@@ -715,6 +884,24 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 	preempt_enable();
 }
 
+
+static int vmptrld(struct kvm_vcpu *vcpu,
+		   u64 phys_addr)
+{
+	u8 error;
+
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "cc");
+	if (error) {
+		printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
+		       __func__, phys_addr);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -736,15 +923,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-			      : "cc");
-		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-			       vmx->vmcs, phys_addr);
+		vmptrld(vcpu, phys_addr);
 	}
 
 	if (vcpu->cpu != cpu) {
@@ -1318,6 +1498,28 @@ struct level_state *create_state(void)
 	return state;
 }
 
+int create_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state) {
+		vmx->nested.l1_state = create_state();
+		if (!vmx->nested.l1_state)
+			return -ENOMEM;
+	} else
+		return 0;
+
+	vmx->nested.l1_state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!vmx->nested.l1_state->shadow_vmcs) {
+		printk(KERN_INFO "%s error creating shadow vmcs\n",
+		       __func__);
+		kfree(vmx->nested.l1_state);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static struct vmcs *alloc_vmcs(void);
 int create_l2_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1326,11 +1528,53 @@ int create_l2_state(struct kvm_vcpu *vcpu)
 		vmx->nested.l2_state = create_state();
 		if (!vmx->nested.l2_state)
 			return -ENOMEM;
+	} else
+		return 0;
+
+	vmx->nested.l2_state->vmcs = alloc_vmcs();
+	if (!vmx->nested.l2_state->vmcs) {
+		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
+		kfree(vmx->nested.l2_state);
+		return -ENOMEM;
 	}
 
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l2_state->msr_bitmap = 0;
+
+	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+
 	return 0;
 }
 
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			struct kvm_vcpu *vcpu);
+
+int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
+{
+
+	int r = 0;
+
+	r = kvm_read_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX], gentry,
+				sizeof(u64), vcpu);
+	if (r) {
+		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
+		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
+		return r;
+	}
+
+	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+		printk(KERN_DEBUG "%s addr %llx not aligned\n",
+		       __func__, *gentry);
+		return 1;
+	}
+
+	return 0;
+}
+
+
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks.
  */
@@ -3374,6 +3618,66 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page;
+	u64 guest_vmcs_addr;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
+		return 1;
+
+	if (create_l1_state(vcpu)) {
+		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+		return 1;
+	}
+
+	if (create_l2_state(vcpu)) {
+		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
+		return 1;
+	}
+
+	if (vmx->nested.vmptr != guest_vmcs_addr) {
+		/* checking vmptr address */
+		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
+		if (vmcs_page == NULL)
+			return 1;
+
+		vmx->nested.vmptr = guest_vmcs_addr;
+
+		kvm_release_page_clean(vmcs_page);
+	}
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			 struct kvm_vcpu *vcpu);
+
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	int r = 0;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	r = kvm_write_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX],
+				 (void *)&to_vmx(vcpu)->nested.vmptr,
+				 sizeof(u64), vcpu);
+	if (r) {
+		printk(KERN_INFO "%s failed to write vmptr\n", __func__);
+		return 1;
+	}
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3644,8 +3948,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
@@ -4183,6 +4487,148 @@ static bool vmx_gb_page_enable(void)
 	return false;
 }
 
+void save_vmcs(struct shadow_vmcs *dst)
+{
+	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	if (cpu_has_vmx_msr_bitmap())
+		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
+
+	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
+	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
+	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
+	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
+	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
+	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
+	if (enable_ept)
+		dst->ept_pointer = vmcs_read64(EPT_POINTER);
+
+	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	if (enable_ept) {
+		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+	dst->page_fault_error_code_mask =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
+	dst->page_fault_error_code_match =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
+	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
+	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
+	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
+	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
+	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
+	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	dst->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
+	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	if (enable_vpid && dst->secondary_vm_exec_control &
+	    SECONDARY_EXEC_ENABLE_VPID)
+		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
+	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	dst->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
+	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
+	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
+	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
+	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
+	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
+	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
+	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
+	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
+	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
+	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
+	dst->guest_rsp = vmcs_readl(GUEST_RSP);
+	dst->guest_rip = vmcs_readl(GUEST_RIP);
+	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	dst->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+	dst->host_cr0 = vmcs_readl(HOST_CR0);
+	dst->host_cr3 = vmcs_readl(HOST_CR3);
+	dst->host_cr4 = vmcs_readl(HOST_CR4);
+	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	dst->host_rsp = vmcs_readl(HOST_RSP);
+	dst->host_rip = vmcs_readl(HOST_RIP);
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c39092..74eb888 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2473,6 +2473,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
 out:
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
 				struct kvm_vcpu *vcpu)
@@ -2503,7 +2504,7 @@ static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
 out:
 	return r;
 }
-
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt);
 
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite
  2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
@ 2009-10-15 14:41       ` oritw
  2009-10-15 14:41         ` [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume oritw
                           ` (2 more replies)
  2009-10-19 11:17       ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst Gleb Natapov
                         ` (2 subsequent siblings)
  3 siblings, 3 replies; 35+ messages in thread
From: oritw @ 2009-10-15 14:41 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  591 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 589 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8c186e0..6a4c252 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -225,6 +225,21 @@ struct nested_vmx {
 	struct level_state *l1_state;
 };
 
+enum vmcs_field_type {
+	VMCS_FIELD_TYPE_U16 = 0,
+	VMCS_FIELD_TYPE_U64 = 1,
+	VMCS_FIELD_TYPE_U32 = 2,
+	VMCS_FIELD_TYPE_ULONG = 3
+};
+
+#define VMCS_FIELD_LENGTH_OFFSET 13
+#define VMCS_FIELD_LENGTH_MASK 0x6000
+
+static inline int vmcs_field_length(unsigned long field)
+{
+	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
+}
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -288,6 +303,404 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+#define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
+
+static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
+
+	[VIRTUAL_PROCESSOR_ID] =
+		SHADOW_VMCS_OFFSET(virtual_processor_id),
+	[GUEST_ES_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_es_selector),
+	[GUEST_CS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_cs_selector),
+	[GUEST_SS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_ss_selector),
+	[GUEST_DS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_ds_selector),
+	[GUEST_FS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_fs_selector),
+	[GUEST_GS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_gs_selector),
+	[GUEST_LDTR_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_selector),
+	[GUEST_TR_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_tr_selector),
+	[HOST_ES_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_es_selector),
+	[HOST_CS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_cs_selector),
+	[HOST_SS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_ss_selector),
+	[HOST_DS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_ds_selector),
+	[HOST_FS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_fs_selector),
+	[HOST_GS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_gs_selector),
+	[HOST_TR_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_tr_selector),
+	[IO_BITMAP_A] =
+		SHADOW_VMCS_OFFSET(io_bitmap_a),
+	[IO_BITMAP_A_HIGH] =
+		SHADOW_VMCS_OFFSET(io_bitmap_a)+4,
+	[IO_BITMAP_B] =
+		SHADOW_VMCS_OFFSET(io_bitmap_b),
+	[IO_BITMAP_B_HIGH] =
+		SHADOW_VMCS_OFFSET(io_bitmap_b)+4,
+	[MSR_BITMAP] =
+		SHADOW_VMCS_OFFSET(msr_bitmap),
+	[MSR_BITMAP_HIGH] =
+		SHADOW_VMCS_OFFSET(msr_bitmap)+4,
+	[VM_EXIT_MSR_STORE_ADDR] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr),
+	[VM_EXIT_MSR_STORE_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr)+4,
+	[VM_EXIT_MSR_LOAD_ADDR] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr),
+	[VM_EXIT_MSR_LOAD_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr)+4,
+	[VM_ENTRY_MSR_LOAD_ADDR] =
+		SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr),
+	[VM_ENTRY_MSR_LOAD_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr)+4,
+	[TSC_OFFSET] =
+		SHADOW_VMCS_OFFSET(tsc_offset),
+	[TSC_OFFSET_HIGH] =
+		SHADOW_VMCS_OFFSET(tsc_offset)+4,
+	[VIRTUAL_APIC_PAGE_ADDR] =
+		SHADOW_VMCS_OFFSET(virtual_apic_page_addr),
+	[VIRTUAL_APIC_PAGE_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(virtual_apic_page_addr)+4,
+	[APIC_ACCESS_ADDR] =
+		SHADOW_VMCS_OFFSET(apic_access_addr),
+	[APIC_ACCESS_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(apic_access_addr)+4,
+	[EPT_POINTER] =
+		SHADOW_VMCS_OFFSET(ept_pointer),
+	[EPT_POINTER_HIGH] =
+		SHADOW_VMCS_OFFSET(ept_pointer)+4,
+	[GUEST_PHYSICAL_ADDRESS] =
+		SHADOW_VMCS_OFFSET(guest_physical_address),
+	[GUEST_PHYSICAL_ADDRESS_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_physical_address)+4,
+	[VMCS_LINK_POINTER] =
+		SHADOW_VMCS_OFFSET(vmcs_link_pointer),
+	[VMCS_LINK_POINTER_HIGH] =
+		SHADOW_VMCS_OFFSET(vmcs_link_pointer)+4,
+	[GUEST_IA32_DEBUGCTL] =
+		SHADOW_VMCS_OFFSET(guest_ia32_debugctl),
+	[GUEST_IA32_DEBUGCTL_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_ia32_debugctl)+4,
+	[GUEST_IA32_PAT] =
+		SHADOW_VMCS_OFFSET(guest_ia32_pat),
+	[GUEST_IA32_PAT_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_ia32_pat)+4,
+	[GUEST_PDPTR0] =
+		SHADOW_VMCS_OFFSET(guest_pdptr0),
+	[GUEST_PDPTR0_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr0)+4,
+	[GUEST_PDPTR1] =
+		SHADOW_VMCS_OFFSET(guest_pdptr1),
+	[GUEST_PDPTR1_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr1)+4,
+	[GUEST_PDPTR2] =
+		SHADOW_VMCS_OFFSET(guest_pdptr2),
+	[GUEST_PDPTR2_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr2)+4,
+	[GUEST_PDPTR3] =
+		SHADOW_VMCS_OFFSET(guest_pdptr3),
+	[GUEST_PDPTR3_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr3)+4,
+	[HOST_IA32_PAT] =
+		SHADOW_VMCS_OFFSET(host_ia32_pat),
+	[HOST_IA32_PAT_HIGH] =
+		SHADOW_VMCS_OFFSET(host_ia32_pat)+4,
+	[PIN_BASED_VM_EXEC_CONTROL] =
+		SHADOW_VMCS_OFFSET(pin_based_vm_exec_control),
+	[CPU_BASED_VM_EXEC_CONTROL] =
+		SHADOW_VMCS_OFFSET(cpu_based_vm_exec_control),
+	[EXCEPTION_BITMAP] =
+		SHADOW_VMCS_OFFSET(exception_bitmap),
+	[PAGE_FAULT_ERROR_CODE_MASK] =
+		SHADOW_VMCS_OFFSET(page_fault_error_code_mask),
+	[PAGE_FAULT_ERROR_CODE_MATCH] =
+		SHADOW_VMCS_OFFSET(page_fault_error_code_match),
+	[CR3_TARGET_COUNT] =
+		SHADOW_VMCS_OFFSET(cr3_target_count),
+	[VM_EXIT_CONTROLS] =
+		SHADOW_VMCS_OFFSET(vm_exit_controls),
+	[VM_EXIT_MSR_STORE_COUNT] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_store_count),
+	[VM_EXIT_MSR_LOAD_COUNT] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_load_count),
+	[VM_ENTRY_CONTROLS] =
+		SHADOW_VMCS_OFFSET(vm_entry_controls),
+	[VM_ENTRY_MSR_LOAD_COUNT] =
+		SHADOW_VMCS_OFFSET(vm_entry_msr_load_count),
+	[VM_ENTRY_INTR_INFO_FIELD] =
+		SHADOW_VMCS_OFFSET(vm_entry_intr_info_field),
+	[VM_ENTRY_EXCEPTION_ERROR_CODE] =
+		SHADOW_VMCS_OFFSET(vm_entry_exception_error_code),
+	[VM_ENTRY_INSTRUCTION_LEN] =
+		SHADOW_VMCS_OFFSET(vm_entry_instruction_len),
+	[TPR_THRESHOLD] =
+		SHADOW_VMCS_OFFSET(tpr_threshold),
+	[SECONDARY_VM_EXEC_CONTROL] =
+		SHADOW_VMCS_OFFSET(secondary_vm_exec_control),
+	[VM_INSTRUCTION_ERROR] =
+		SHADOW_VMCS_OFFSET(vm_instruction_error),
+	[VM_EXIT_REASON] =
+		SHADOW_VMCS_OFFSET(vm_exit_reason),
+	[VM_EXIT_INTR_INFO] =
+		SHADOW_VMCS_OFFSET(vm_exit_intr_info),
+	[VM_EXIT_INTR_ERROR_CODE] =
+		SHADOW_VMCS_OFFSET(vm_exit_intr_error_code),
+	[IDT_VECTORING_INFO_FIELD] =
+		SHADOW_VMCS_OFFSET(idt_vectoring_info_field),
+	[IDT_VECTORING_ERROR_CODE] =
+		SHADOW_VMCS_OFFSET(idt_vectoring_error_code),
+	[VM_EXIT_INSTRUCTION_LEN] =
+		SHADOW_VMCS_OFFSET(vm_exit_instruction_len),
+	[VMX_INSTRUCTION_INFO] =
+		SHADOW_VMCS_OFFSET(vmx_instruction_info),
+	[GUEST_ES_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_es_limit),
+	[GUEST_CS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_cs_limit),
+	[GUEST_SS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_ss_limit),
+	[GUEST_DS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_ds_limit),
+	[GUEST_FS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_fs_limit),
+	[GUEST_GS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_gs_limit),
+	[GUEST_LDTR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_limit),
+	[GUEST_TR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_tr_limit),
+	[GUEST_GDTR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_gdtr_limit),
+	[GUEST_IDTR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_idtr_limit),
+	[GUEST_ES_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_es_ar_bytes),
+	[GUEST_CS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_cs_ar_bytes),
+	[GUEST_SS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_ss_ar_bytes),
+	[GUEST_DS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_ds_ar_bytes),
+	[GUEST_FS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_fs_ar_bytes),
+	[GUEST_GS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_gs_ar_bytes),
+	[GUEST_LDTR_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_ar_bytes),
+	[GUEST_TR_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_tr_ar_bytes),
+	[GUEST_INTERRUPTIBILITY_INFO] =
+		SHADOW_VMCS_OFFSET(guest_interruptibility_info),
+	[GUEST_ACTIVITY_STATE] =
+		SHADOW_VMCS_OFFSET(guest_activity_state),
+	[GUEST_SYSENTER_CS] =
+		SHADOW_VMCS_OFFSET(guest_sysenter_cs),
+	[HOST_IA32_SYSENTER_CS] =
+		SHADOW_VMCS_OFFSET(host_ia32_sysenter_cs),
+	[CR0_GUEST_HOST_MASK] =
+		SHADOW_VMCS_OFFSET(cr0_guest_host_mask),
+	[CR4_GUEST_HOST_MASK] =
+		SHADOW_VMCS_OFFSET(cr4_guest_host_mask),
+	[CR0_READ_SHADOW] =
+		SHADOW_VMCS_OFFSET(cr0_read_shadow),
+	[CR4_READ_SHADOW] =
+		SHADOW_VMCS_OFFSET(cr4_read_shadow),
+	[CR3_TARGET_VALUE0] =
+		SHADOW_VMCS_OFFSET(cr3_target_value0),
+	[CR3_TARGET_VALUE1] =
+		SHADOW_VMCS_OFFSET(cr3_target_value1),
+	[CR3_TARGET_VALUE2] =
+		SHADOW_VMCS_OFFSET(cr3_target_value2),
+	[CR3_TARGET_VALUE3] =
+		SHADOW_VMCS_OFFSET(cr3_target_value3),
+	[EXIT_QUALIFICATION] =
+		SHADOW_VMCS_OFFSET(exit_qualification),
+	[GUEST_LINEAR_ADDRESS] =
+		SHADOW_VMCS_OFFSET(guest_linear_address),
+	[GUEST_CR0] =
+		SHADOW_VMCS_OFFSET(guest_cr0),
+	[GUEST_CR3] =
+		SHADOW_VMCS_OFFSET(guest_cr3),
+	[GUEST_CR4] =
+		SHADOW_VMCS_OFFSET(guest_cr4),
+	[GUEST_ES_BASE] =
+		SHADOW_VMCS_OFFSET(guest_es_base),
+	[GUEST_CS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_cs_base),
+	[GUEST_SS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_ss_base),
+	[GUEST_DS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_ds_base),
+	[GUEST_FS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_fs_base),
+	[GUEST_GS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_gs_base),
+	[GUEST_LDTR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_base),
+	[GUEST_TR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_tr_base),
+	[GUEST_GDTR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_gdtr_base),
+	[GUEST_IDTR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_idtr_base),
+	[GUEST_DR7] =
+		SHADOW_VMCS_OFFSET(guest_dr7),
+	[GUEST_RSP] =
+		SHADOW_VMCS_OFFSET(guest_rsp),
+	[GUEST_RIP] =
+		SHADOW_VMCS_OFFSET(guest_rip),
+	[GUEST_RFLAGS] =
+		SHADOW_VMCS_OFFSET(guest_rflags),
+	[GUEST_PENDING_DBG_EXCEPTIONS] =
+		SHADOW_VMCS_OFFSET(guest_pending_dbg_exceptions),
+	[GUEST_SYSENTER_ESP] =
+		SHADOW_VMCS_OFFSET(guest_sysenter_esp),
+	[GUEST_SYSENTER_EIP] =
+		SHADOW_VMCS_OFFSET(guest_sysenter_eip),
+	[HOST_CR0] =
+		SHADOW_VMCS_OFFSET(host_cr0),
+	[HOST_CR3] =
+		SHADOW_VMCS_OFFSET(host_cr3),
+	[HOST_CR4] =
+		SHADOW_VMCS_OFFSET(host_cr4),
+	[HOST_FS_BASE] =
+		SHADOW_VMCS_OFFSET(host_fs_base),
+	[HOST_GS_BASE] =
+		SHADOW_VMCS_OFFSET(host_gs_base),
+	[HOST_TR_BASE] =
+		SHADOW_VMCS_OFFSET(host_tr_base),
+	[HOST_GDTR_BASE] =
+		SHADOW_VMCS_OFFSET(host_gdtr_base),
+	[HOST_IDTR_BASE] =
+		SHADOW_VMCS_OFFSET(host_idtr_base),
+	[HOST_IA32_SYSENTER_ESP] =
+		SHADOW_VMCS_OFFSET(host_ia32_sysenter_esp),
+	[HOST_IA32_SYSENTER_EIP] =
+		SHADOW_VMCS_OFFSET(host_ia32_sysenter_eip),
+	[HOST_RSP] =
+		SHADOW_VMCS_OFFSET(host_rsp),
+	[HOST_RIP] =
+		SHADOW_VMCS_OFFSET(host_rip),
+};
+
+static inline unsigned short vmcs_field_to_offset(unsigned long field)
+{
+
+	if (field > HOST_RIP || vmcs_field_to_offset_table[field] == 0) {
+		printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
+		return -1;
+	}
+
+	return vmcs_field_to_offset_table[field];
+}
+
+static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
+					      unsigned long field)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long *entry;
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return -1;
+	}
+
+	entry = (unsigned long *)((char *)(vmx->nested.l2_state->shadow_vmcs) +
+				 vmcs_field_to_offset(field));
+	return *entry;
+}
+
+static inline u16 nested_vmcs_read16(struct kvm_vcpu *vcpu,
+				     unsigned long field)
+{
+	return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u32 nested_vmcs_read32(struct kvm_vcpu *vcpu, unsigned long field)
+{
+	return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu, unsigned long field)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 *entry;
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return -1;
+	}
+
+	entry = (u64 *)((char *)(vmx->nested.l2_state->shadow_vmcs) +
+				 vmcs_field_to_offset(field));
+	return *entry;
+}
+
+static inline void nested_vmcs_writel(struct kvm_vcpu *vcpu,
+				      unsigned long field, unsigned long value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry =
+		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(unsigned long *)entry = value;
+}
+
+static inline void nested_vmcs_write16(struct kvm_vcpu *vcpu,
+				       unsigned long field, u16 value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry =
+		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(u16 *)entry = value;
+}
+
+static inline void nested_vmcs_write32(struct kvm_vcpu *vcpu,
+				       unsigned long field, u32 value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry =
+		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(u32 *)entry = value;
+}
+
+static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
+				       unsigned long field, u64 value)
+{
+#ifdef CONFIG_X86_64
+	nested_vmcs_writel(vcpu, field, value);
+#else /* nested: 32 bit not actually tested */
+	nested_vmcs_writel(vcpu, field, value);
+	nested_vmcs_writel(vcpu, field+1, value >> 32);
+#endif
+}
+
 static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 				    u64 vmcs_addr)
 {
@@ -307,6 +720,50 @@ static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 
 }
 
+static int nested_map_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page = nested_get_page(vcpu, vmx->nested.vmptr);
+
+	if (vmcs_page == NULL) {
+		printk(KERN_INFO "%s: failure in nested_get_page\n",__func__);
+		return 0;
+	}
+
+	if (vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_INFO "%s: shadow vmcs already mapped\n",__func__);
+		return 0;
+	}
+
+	vmx->nested.l2_state->shadow_vmcs = kmap_atomic(vmcs_page, KM_USER0);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_INFO "%s: error in kmap_atomic\n",__func__);
+		return 0;
+	}
+
+	return 1;
+}
+
+static void nested_unmap_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct page *page;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk("Shadow vmcs already unmapped\n");
+		return;
+	}
+
+	page = kmap_atomic_to_page(vmx->nested.l2_state->shadow_vmcs);
+
+	kunmap_atomic(vmx->nested.l2_state->shadow_vmcs, KM_USER0);
+
+	kvm_release_page_dirty(page);
+
+	vmx->nested.l2_state->shadow_vmcs = NULL;
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 
@@ -3550,6 +4007,26 @@ static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
 	vmx_set_rflags(vcpu, rflags);
 }
 
+static void set_rflags_to_vmx_fail_invalid(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_CF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+}
+
+static void set_rflags_to_vmx_fail_valid(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_ZF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+}
+
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
 	if (!nested_vmx_check_permission(vcpu))
@@ -3563,6 +4040,116 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+#ifndef CONFIG_X86_64
+	u64 value;
+#endif
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
+		set_rflags_to_vmx_fail_invalid(vcpu);
+		return 1;
+	}
+
+	switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
+	case VMCS_FIELD_TYPE_U16:
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+			nested_vmcs_read16(vcpu,
+					   vcpu->arch.regs[VCPU_REGS_RDX]);
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+			nested_vmcs_read32(vcpu,
+					   vcpu->arch.regs[VCPU_REGS_RDX]);
+		break;
+	case VMCS_FIELD_TYPE_U64:
+#ifdef CONFIG_X86_64
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+		nested_vmcs_read64(vcpu,
+					   vcpu->arch.regs[VCPU_REGS_RDX]);
+#else /* nested: 32 bit not actually tested */
+		value =  nested_vmcs_read64(vcpu,
+					    vcpu->arch.regs[VCPU_REGS_RDX]);
+		vcpu->arch.regs[VCPU_REGS_RAX] = value;
+		vcpu->arch.regs[VCPU_REGS_RBX] = value >> 32;
+#endif
+	break;
+	case VMCS_FIELD_TYPE_ULONG:
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+			nested_vmcs_readl(vcpu,
+					  vcpu->arch.regs[VCPU_REGS_RDX]);
+		break;
+	default:
+		printk(KERN_INFO "%s invalid field\n", __func__);
+		set_rflags_to_vmx_fail_valid(vcpu);
+		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+		nested_unmap_shadow_vmcs(vcpu);
+		return 1;
+	}
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	nested_unmap_shadow_vmcs(vcpu);
+	return 1;
+}
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+#ifndef CONFIG_X86_64
+	u64 value ;
+#endif
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
+		set_rflags_to_vmx_fail_invalid(vcpu);
+		return 1;
+	}
+
+	switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
+	case VMCS_FIELD_TYPE_U16:
+		nested_vmcs_write16(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				    vcpu->arch.regs[VCPU_REGS_RAX]);
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		nested_vmcs_write32(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				    vcpu->arch.regs[VCPU_REGS_RAX]);
+		break;
+	case VMCS_FIELD_TYPE_U64:
+#ifdef CONFIG_X86_64
+		nested_vmcs_write64(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				    vcpu->arch.regs[VCPU_REGS_RAX]);
+#else /* nested: 32 bit not actually tested */
+		value =  vcpu->arch.regs[VCPU_REGS_RAX] |
+			(vcpu->arch.regs[VCPU_REGS_RBX] << 32);
+		nested_vmcs_write64(vcpu,
+				    vcpu->arch.regs[VCPU_REGS_RDX], value);
+#endif
+		break;
+	case VMCS_FIELD_TYPE_ULONG:
+		nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				   vcpu->arch.regs[VCPU_REGS_RAX]);
+		break;
+	default:
+		printk(KERN_INFO "%s invalid field\n", __func__);
+		set_rflags_to_vmx_fail_valid(vcpu);
+		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+		nested_unmap_shadow_vmcs(vcpu);
+		return 1;
+	}
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	nested_unmap_shadow_vmcs(vcpu);
+	return 1;
+}
+
 static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3950,9 +4537,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
-	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
+	[EXIT_REASON_VMREAD]                  = handle_vmread,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
-	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
+	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
@ 2009-10-15 14:41         ` oritw
  2009-10-19 17:29           ` Gleb Natapov
  2009-10-20  4:56           ` Avi Kivity
  2009-10-19 13:17         ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite Gleb Natapov
  2009-10-20  4:44         ` Avi Kivity
  2 siblings, 2 replies; 35+ messages in thread
From: oritw @ 2009-10-15 14:41 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c | 1173 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 1148 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a4c252..e814029 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -209,6 +209,7 @@ struct __attribute__ ((__packed__)) level_state {
 	struct vmcs *vmcs;
 	int cpu;
 	int launched;
+	bool first_launch;
 };
 
 struct nested_vmx {
@@ -216,6 +217,12 @@ struct nested_vmx {
 	bool vmxon;
 	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
 	u64 vmptr;
+	/* Are we running nested guest */
+	bool nested_mode;
+	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+	bool nested_run_pending;
+	/* flag indicating if there was a valid IDT after exiting from l2 */
+	bool nested_valid_idt;
 	/*
 	 * Level 2 state : includes vmcs,registers and
 	 * a copy of vmcs12 for vmread/vmwrite
@@ -240,6 +247,10 @@ static inline int vmcs_field_length(unsigned long field)
 	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
 }
 
+#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
+					VM_EXIT_SAVE_IA32_PAT))
+#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
+					 VM_ENTRY_IA32E_MODE))
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -303,6 +314,12 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static inline struct shadow_vmcs *get_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+	WARN_ON(!to_vmx(vcpu)->nested.l2_state->shadow_vmcs);
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+}
+
 #define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
 
 static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
@@ -822,8 +839,16 @@ static struct kvm_vmx_segment_field {
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code);
+static int nested_vmx_intr(struct kvm_vcpu *vcpu);
 static int create_l1_state(struct kvm_vcpu *vcpu);
 static int create_l2_state(struct kvm_vcpu *vcpu);
+static int launch_guest(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -940,6 +965,18 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
 	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
 }
 
+static inline int is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_nmi(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
 static inline int cpu_has_vmx_invept_individual_addr(void)
 {
 	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -990,6 +1027,51 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
+static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
+{
+	return cpu_has_vmx_tpr_shadow() &&
+		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_TPR_SHADOW;
+}
+
+static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
+{
+	return cpu_has_secondary_exec_ctrls() &&
+		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
+							   *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->
+		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+		SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->vm_entry_controls &
+		VM_ENTRY_LOAD_IA32_PAT;
+}
+
+static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_USE_MSR_BITMAPS;
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -1501,6 +1583,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1943,6 +2028,200 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *l2_shadow_vmcs =
+		get_shadow_vmcs(vcpu);
+
+	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
+	l2_shadow_vmcs->guest_physical_address =
+		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	l2_shadow_vmcs->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	l2_shadow_vmcs->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	l2_shadow_vmcs->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vm_instruction_error =
+		vmcs_read32(VM_INSTRUCTION_ERROR);
+	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	l2_shadow_vmcs->vm_exit_intr_error_code =
+		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	l2_shadow_vmcs->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	l2_shadow_vmcs->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	l2_shadow_vmcs->vm_exit_instruction_len =
+		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vmx_instruction_info =
+		vmcs_read32(VMX_INSTRUCTION_INFO);
+	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	l2_shadow_vmcs->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	l2_shadow_vmcs->guest_activity_state =
+		vmcs_read32(GUEST_ACTIVITY_STATE);
+	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
+	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
+	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
+	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
+	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	l2_shadow_vmcs->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+}
+
+int load_vmcs_common(struct shadow_vmcs *src)
+{
+	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
+
+	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
+	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
+
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		     src->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
+
+	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		     src->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
+
+	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
+	vmcs_writel(GUEST_DR7, src->guest_dr7);
+	vmcs_writel(GUEST_RSP, src->guest_rsp);
+	vmcs_writel(GUEST_RIP, src->guest_rip);
+	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		    src->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
+
+	return 0;
+}
+
+int load_vmcs_host_state(struct shadow_vmcs *src)
+{
+	vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+	vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+	vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+	vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+	vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+	vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+	vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
+
+	vmcs_write64(TSC_OFFSET, src->tsc_offset);
+
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+	vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+	vmcs_writel(HOST_CR0, src->host_cr0);
+	vmcs_writel(HOST_CR3, src->host_cr3);
+	vmcs_writel(HOST_CR4, src->host_cr4);
+	vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+	vmcs_writel(HOST_GS_BASE, src->host_gs_base);
+	vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+	vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
+	vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
+	vmcs_writel(HOST_RSP, src->host_rsp);
+	vmcs_writel(HOST_RIP, src->host_rip);
+	vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
+
+	return 0;
+}
+
 struct level_state *create_state(void)
 {
 	struct level_state *state = NULL;
@@ -2003,6 +2282,8 @@ int create_l2_state(struct kvm_vcpu *vcpu)
 	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
 	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
 
+	vmx->nested.l2_state->first_launch = true;
+
 	return 0;
 }
 
@@ -3393,6 +3674,14 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (nested_vmx_intr(vcpu))
+				return;
+		}
+		return;
+	}
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -3448,6 +3737,10 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vmx->nested.nested_mode) {
+		return;
+	}
+
 	if (!cpu_has_virtual_nmis()) {
 		/*
 		 * Tracking the NMI-blocked state in software is built upon
@@ -3489,6 +3782,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (!nested_vmx_intr(vcpu))
+				return 0;
+		}
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3993,12 +4293,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
 static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;
@@ -4040,6 +4334,27 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+	if (!to_vmx(vcpu)->nested.l2_state->vmclear)
+		return 1;
+
+	return launch_guest(vcpu);
+}
+
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (to_vmx(vcpu)->nested.l2_state->vmclear)
+		return 1;
+
+	return launch_guest(vcpu);
+}
+
 static int handle_vmread(struct kvm_vcpu *vcpu)
 {
 #ifndef CONFIG_X86_64
@@ -4050,7 +4365,6 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 		return 1;
 
 	if (!nested_map_shadow_vmcs(vcpu)) {
-		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
 		set_rflags_to_vmx_fail_invalid(vcpu);
 		return 1;
 	}
@@ -4107,7 +4421,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		return 1;
 
 	if (!nested_map_shadow_vmcs(vcpu)) {
-		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
 		set_rflags_to_vmx_fail_invalid(vcpu);
 		return 1;
 	}
@@ -4137,16 +4450,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 				   vcpu->arch.regs[VCPU_REGS_RAX]);
 		break;
 	default:
+		nested_unmap_shadow_vmcs(vcpu);
 		printk(KERN_INFO "%s invalid field\n", __func__);
 		set_rflags_to_vmx_fail_valid(vcpu);
 		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
-		nested_unmap_shadow_vmcs(vcpu);
 		return 1;
 	}
 
+	nested_unmap_shadow_vmcs(vcpu);
 	clear_rflags_cf_zf(vcpu);
 	skip_emulated_instruction(vcpu);
-	nested_unmap_shadow_vmcs(vcpu);
 	return 1;
 }
 
@@ -4208,7 +4521,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct page *vmcs_page;
 	u64 guest_vmcs_addr;
 
 	if (!nested_vmx_check_permission(vcpu))
@@ -4228,14 +4540,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	}
 
 	if (vmx->nested.vmptr != guest_vmcs_addr) {
-		/* checking vmptr address */
-		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
-		if (vmcs_page == NULL)
-			return 1;
-
 		vmx->nested.vmptr = guest_vmcs_addr;
-
-		kvm_release_page_clean(vmcs_page);
 	}
 
 	clear_rflags_cf_zf(vcpu);
@@ -4534,11 +4839,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmread,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
@@ -4566,6 +4871,17 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
 	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
 
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
+		nested_vmx_vmexit(vcpu, false);
+		return 1;
+	}
+
 	/* If we need to emulate an MMIO from handle_invalid_guest_state
 	 * we just return 0 */
 	if (vmx->emulation_required && emulate_invalid_guest_state) {
@@ -4585,7 +4901,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 			= vmcs_read32(VM_INSTRUCTION_ERROR);
 		return 0;
 	}
-
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
 			exit_reason != EXIT_REASON_EPT_VIOLATION &&
@@ -4593,8 +4908,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
-
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (!vmx->nested.nested_mode && unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	int type;
 	bool idtv_info_valid;
 
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
+	if (vmx->nested.nested_mode)
+		return;
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
 	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
@@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int irq;
+	int type;
+	int errCodeValid;
+	u32 idt_vectoring_info;
+	u32 guest_intr;
+	bool nmi_window_open;
+	bool interrupt_window_open;
+
+	if (vmx->nested.nested_mode && vmx->nested.nested_valid_idt) {
+		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+		irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+		type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+		errCodeValid = idt_vectoring_info &
+			VECTORING_INFO_DELIVER_CODE_MASK;
+
+		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+		nmi_window_open =
+			!(guest_intr & (GUEST_INTR_STATE_STI |
+					GUEST_INTR_STATE_MOV_SS |
+					GUEST_INTR_STATE_NMI));
+
+		interrupt_window_open =
+			((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+			 !(guest_intr & (GUEST_INTR_STATE_STI |
+					 GUEST_INTR_STATE_MOV_SS)));
+
+		if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
+			return 0;
+		}
+
+		if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
+			return 0;
+		}
+
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			irq | type | INTR_INFO_VALID_MASK | errCodeValid);
+
+
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+		if (errCodeValid)
+			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+				     vmcs_read32(IDT_VECTORING_ERROR_CODE));
+	}
+
+	return 1;
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
@@ -4758,6 +5129,26 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int r;
+
+	if (vmx->nested.nested_mode) {
+		r = nested_handle_valid_idt(vcpu);
+		if (!r) {
+			vmx->fail = 1;
+			return;
+		}
+
+		if (!nested_map_shadow_vmcs(vcpu)) {
+			vmx->fail = 1;
+			return;
+		}
+
+		vmcs_write32(EXCEPTION_BITMAP, get_shadow_vmcs(vcpu)->
+			     exception_bitmap |
+			     vmx->nested.l1_state->shadow_vmcs->exception_bitmap);
+
+		nested_unmap_shadow_vmcs(vcpu);
+	}
 
 	if (enable_ept && is_paging(vcpu)) {
 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -4896,6 +5287,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	get_debugreg(vcpu->arch.dr6, 6);
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+	vmx->nested.nested_valid_idt = vmx->nested.nested_mode &&
+		(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
@@ -4984,6 +5379,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->nested.vmptr = 0;
+
+	vmx->nested.l1_state = NULL;
+	vmx->nested.l2_state = NULL;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -5215,6 +5615,729 @@ void save_vmcs(struct shadow_vmcs *dst)
 	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
 		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
 }
+int prepare_vmcs_02(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct shadow_vmcs *src = get_shadow_vmcs(vcpu);
+	u32 exec_control;
+
+	if (!src) {
+		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
+		return 1;
+	}
+
+	load_vmcs_common(src);
+
+	if (vmx->nested.l2_state->first_launch) {
+		if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
+
+		if (vmx->nested.l2_state->io_bitmap_a)
+			vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
+
+		if (vmx->nested.l2_state->io_bitmap_b)
+			vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
+
+		if (vmx->nested.l2_state->msr_bitmap)
+			vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
+
+		if (src->vm_entry_msr_load_count > 0) {
+			struct page *page;
+
+			page = nested_get_page(vcpu,
+					       src->vm_entry_msr_load_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
+
+			kvm_release_page_clean(page);
+		}
+
+		if (nested_cpu_has_vmx_tpr_shadow(vcpu)) {
+			struct page *page;
+
+			page = nested_get_page(vcpu,
+					       src->virtual_apic_page_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
+
+			kvm_release_page_clean(page);
+		}
+
+		if (nested_vm_need_virtualize_apic_accesses(vcpu)) {
+			struct page *page =
+				nested_get_page(vcpu, src->apic_access_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+			kvm_release_page_clean(page);
+		}
+
+		vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+			     (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
+			      src->pin_based_vm_exec_control));
+
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+			     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_mask &
+			      src->page_fault_error_code_mask));
+
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+			     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_match &
+			      src->page_fault_error_code_match));
+
+		if (cpu_has_secondary_exec_ctrls()) {
+
+			exec_control =
+				vmx->nested.l1_state->shadow_vmcs->secondary_vm_exec_control;
+
+			if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
+
+				exec_control |= src->secondary_vm_exec_control;
+
+				if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
+				    !nested_vm_need_virtualize_apic_accesses(vcpu))
+					exec_control &=
+						~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+			}
+
+			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+		}
+
+		load_vmcs_host_state(vmx->nested.l1_state->shadow_vmcs);
+
+		vmx->nested.l2_state->first_launch = false;
+	}
+
+	if (vm_need_tpr_shadow(vcpu->kvm) &&
+	    nested_cpu_has_vmx_tpr_shadow(vcpu))
+		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
+
+	if (enable_ept) {
+		if (!nested_cpu_has_vmx_ept(vcpu)) {
+			vmcs_write64(EPT_POINTER,
+				     vmx->nested.l1_state->shadow_vmcs->ept_pointer);
+			vmcs_write64(GUEST_PDPTR0,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr0);
+			vmcs_write64(GUEST_PDPTR1,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr1);
+			vmcs_write64(GUEST_PDPTR2,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr2);
+			vmcs_write64(GUEST_PDPTR3,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr3);
+		}
+	}
+
+	exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+	exec_control |= src->cpu_based_vm_exec_control;
+
+	if (!vm_need_tpr_shadow(vcpu->kvm) ||
+	    src->virtual_apic_page_addr == 0) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+			CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	} else if (exec_control & CPU_BASED_TPR_SHADOW) {
+
+#ifdef CONFIG_X86_64
+		exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
+		exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      src->exception_bitmap));
+
+	vmcs_write32(VM_EXIT_CONTROLS,
+		     ((vmx->nested.l1_state->shadow_vmcs->vm_exit_controls &
+		       NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
+
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     (vmx->nested.l1_state->shadow_vmcs->vm_entry_controls &
+		      NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	vmcs_writel(CR0_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr0_guest_host_mask  &
+		     src->cr0_guest_host_mask));
+	vmcs_writel(CR4_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr4_guest_host_mask  &
+		     src->cr4_guest_host_mask));
+
+	return 0;
+}
+
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_state->shadow_vmcs);
+
+	return 0;
+}
+
+void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
+{
+	unsigned long mask;
+
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+	mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
+
+	if (vcpu->arch.regs_dirty & mask) {
+		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
+		       vcpu->arch.regs_dirty, mask);
+		WARN_ON(1);
+	}
+
+	vcpu->arch.regs_dirty = 0;
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu)
+{
+	/* verify that l1 has done vmptrld for l2 earlier */
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int initial_pfu_active = vcpu->fpu_active;
+	int r = 0;
+
+	if (vmx->nested.nested_mode) {
+		printk(KERN_INFO "Nested guest already running\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+
+	vmx->nested.nested_mode = 1;
+
+	vcpu->arch.exception.pending = false;
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	save_vmcs(vmx->nested.l1_state->shadow_vmcs);
+
+	vmx->nested.l1_state->shadow_efer = vcpu->arch.shadow_efer;
+	if (!enable_ept)
+		vmx->nested.l1_state->cr3 = vcpu->arch.cr3;
+	vmx->nested.l1_state->cr4 = vcpu->arch.cr4;
+
+	if (enable_vpid) {
+		if (vmx->nested.l2_state->vpid == 0) {
+			allocate_vpid(vmx);
+			vmx->nested.l2_state->vpid = vmx->vpid;
+		}
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l1_state->msr_bitmap = 0;
+
+	vmx->nested.l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	vmx->nested.l1_state->vmcs = vmx->vmcs;
+	vmx->nested.l1_state->cpu = vcpu->cpu;
+	vmx->nested.l1_state->launched = vmx->launched;
+
+	vmx->vmcs = vmx->nested.l2_state->vmcs;
+	vcpu->cpu = vmx->nested.l2_state->cpu;
+	vmx->launched = vmx->nested.l2_state->launched;
+
+	if (vmx->nested.l2_state->vmclear || !vmx->launched) {
+		vmcs_clear(vmx->vmcs);
+		vmx->launched = 0;
+		vmx->nested.l2_state->vmclear = 0;
+	}
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	prepare_vmcs_02(vcpu);
+
+	if (get_shadow_vmcs(vcpu)->vm_entry_controls &
+	    VM_ENTRY_IA32E_MODE) {
+		if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
+		      (vcpu->arch.shadow_efer & EFER_LME)))
+			vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
+	} else {
+		if ((vcpu->arch.shadow_efer & EFER_LMA) ||
+		    (vcpu->arch.shadow_efer & EFER_LME))
+			vcpu->arch.shadow_efer = 0;
+	}
+
+	vmx_set_cr0(vcpu, get_shadow_vmcs(vcpu)->guest_cr0);
+	vmcs_writel(CR0_READ_SHADOW,
+		    get_shadow_vmcs(vcpu)->cr0_read_shadow);
+	vmx_set_cr4(vcpu, get_shadow_vmcs(vcpu)->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW,
+		    get_shadow_vmcs(vcpu)->cr4_read_shadow);
+
+	vcpu->arch.cr0 |= X86_CR0_PG;
+
+	if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
+		vmcs_write32(GUEST_CR3, get_shadow_vmcs(vcpu)->guest_cr3);
+		vmx->vcpu.arch.cr3 = get_shadow_vmcs(vcpu)->guest_cr3;
+	} else {
+		kvm_set_cr3(vcpu, get_shadow_vmcs(vcpu)->guest_cr3);
+		kvm_mmu_reset_context(vcpu);
+
+		nested_unmap_shadow_vmcs(vcpu);
+
+		r = kvm_mmu_load(vcpu);
+		if (unlikely(r)) {
+			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+			nested_vmx_vmexit(vcpu, false);
+			set_rflags_to_vmx_fail_valid(vcpu);
+			return 1;
+		}
+
+		nested_map_shadow_vmcs(vcpu);
+	}
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   get_shadow_vmcs(vcpu)->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   get_shadow_vmcs(vcpu)->guest_rip);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      get_shadow_vmcs(vcpu)->exception_bitmap));
+
+	nested_unmap_shadow_vmcs(vcpu);
+
+	if (initial_pfu_active)
+		vmx_fpu_activate(vcpu);
+
+	return 1;
+}
+
+static int launch_guest(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	skip_emulated_instruction(vcpu);
+
+	nested_vmx_run(vcpu);
+
+	return 1;
+}
+
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int initial_pfu_active = vcpu->fpu_active;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		get_shadow_vmcs(vcpu)->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	vmx->nested.l2_state->launched = vmx->launched;
+	vmx->nested.l2_state->cpu = vcpu->cpu;
+
+	nested_unmap_shadow_vmcs(vcpu);
+
+	vmx->vmcs = vmx->nested.l1_state->vmcs;
+	vcpu->cpu = vmx->nested.l1_state->cpu;
+	vmx->launched = vmx->nested.l1_state->launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.exception.pending = false;
+
+	vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
+	vmx_set_cr0(vcpu, vmx->nested.l1_state->shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_state->shadow_vmcs->guest_cr3;
+		vmcs_write32(GUEST_CR3, vmx->nested.l1_state->shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
+	}
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	switch_back_vmcs(vcpu);
+
+	nested_unmap_shadow_vmcs(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	if (initial_pfu_active)
+		vmx_fpu_activate(vcpu);
+
+	return 0;
+}
+
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		struct page *msr_page = NULL;
+		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
+
+		if (!cpu_has_vmx_msr_bitmap()
+		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+			return 1;
+
+		msr_page = nested_get_page(vcpu,
+					   l2svmcs->msr_bitmap);
+
+		if (!msr_page) {
+			printk(KERN_INFO "%s error in nested_get_page\n",
+			       __func__);
+			return 0;
+		}
+
+		switch (exit_code) {
+		case EXIT_REASON_MSR_READ:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x000)))
+					return 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x400)))
+					return 1;
+			}
+			break;
+		case EXIT_REASON_MSR_WRITE:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x800)))
+						return 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0xc00)))
+					return 1;
+			}
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct shadow_vmcs *l2svmcs;
+
+	int r = 0;
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	if (kvm_override) {
+		switch (exit_code) {
+		case EXIT_REASON_EXTERNAL_INTERRUPT:
+			return 0;
+		case EXIT_REASON_EXCEPTION_NMI:
+			if (!is_exception(intr_info))
+				return 0;
+
+			if (is_page_fault(intr_info) && (!enable_ept))
+				return 0;
+
+			break;
+		case EXIT_REASON_EPT_VIOLATION:
+			if (enable_ept)
+				return 0;
+
+			break;
+		}
+	}
+
+
+	if (!nested_map_shadow_vmcs(vcpu))
+		return 0;
+	l2svmcs = get_shadow_vmcs(vcpu);
+
+	switch (exit_code) {
+	case EXIT_REASON_INVLPG:
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_INVLPG_EXITING)
+			r = 1;
+		break;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		r = nested_vmx_exit_handled_msr(vcpu);
+		break;
+	case EXIT_REASON_CR_ACCESS: {
+		unsigned long exit_qualification =
+			vmcs_readl(EXIT_QUALIFICATION);
+		int cr = exit_qualification & 15;
+		int reg = (exit_qualification >> 8) & 15;
+		unsigned long val = kvm_register_read(vcpu, reg);
+
+		switch ((exit_qualification >> 4) & 3) {
+		case 0: /* mov to cr */
+			switch (cr) {
+			case 0:
+				if (l2svmcs->cr0_guest_host_mask &
+				    (val ^ l2svmcs->cr0_read_shadow))
+					r = 1;
+				break;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_LOAD_EXITING)
+					r = 1;
+				break;
+			case 4:
+				if (l2svmcs->cr4_guest_host_mask &
+				    (l2svmcs->cr4_read_shadow ^ val))
+					r = 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_LOAD_EXITING)
+					r = 1;
+				break;
+			}
+			break;
+		case 2: /* clts */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				r = 1;
+			break;
+		case 1: /*mov from cr*/
+			switch (cr) {
+			case 0:
+				r = 1;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_STORE_EXITING)
+					r = 1;
+				break;
+			case 4:
+				r = 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_STORE_EXITING)
+					r = 1;
+				break;
+			}
+			break;
+		case 3: /* lmsw */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				r = 1;
+			break;
+		}
+		break;
+	}
+	case EXIT_REASON_DR_ACCESS: {
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_MOV_DR_EXITING)
+			r = 1;
+		break;
+	}
+
+	case EXIT_REASON_EXCEPTION_NMI: {
+
+		if (is_external_interrupt(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_EXT_INTR_MASK))
+			r = 1;
+		else if (is_nmi(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_NMI_EXITING))
+			r = 1;
+		else if (is_exception(intr_info) &&
+		    (l2svmcs->exception_bitmap &
+		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+			r = 1;
+		else if (is_page_fault(intr_info))
+			r = 1;
+		break;
+	}
+
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK)
+			r = 1;
+		break;
+	default:
+		r = 1;
+	}
+	nested_unmap_shadow_vmcs(vcpu);
+
+	return r;
+}
+
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code)
+{
+	if (vmx->nested.nested_mode) {
+		if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
+			nested_vmx_vmexit(&vmx->vcpu, false);
+			if (!nested_map_shadow_vmcs(&vmx->vcpu))
+				return 1;
+			get_shadow_vmcs(&vmx->vcpu)->vm_exit_reason =
+				EXIT_REASON_EXCEPTION_NMI;
+			get_shadow_vmcs(&vmx->vcpu)->vm_exit_intr_info =
+				(nr | INTR_TYPE_HARD_EXCEPTION
+				 | (has_error_code ?
+				    INTR_INFO_DELIVER_CODE_MASK : 0)
+				 | INTR_INFO_VALID_MASK);
+
+			if (has_error_code)
+				get_shadow_vmcs(&vmx->vcpu)->
+					vm_exit_intr_error_code = error_code;
+			nested_unmap_shadow_vmcs(&vmx->vcpu);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int nested_vmx_intr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->nested.nested_mode) {
+
+		if (!nested_map_shadow_vmcs(vcpu))
+			return 0;
+
+		if (get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK) {
+
+			if (vmx->nested.nested_run_pending) {
+				nested_unmap_shadow_vmcs(vcpu);
+				return 0;
+			}
+
+			nested_unmap_shadow_vmcs(vcpu);
+			nested_vmx_vmexit(vcpu, true);
+			return 1;		
+		}
+
+		nested_unmap_shadow_vmcs(vcpu);
+
+	}
+
+	return 0;
+}
 
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: Nested VMX support v3
  2009-10-15 14:41 Nested VMX support v3 oritw
  2009-10-15 14:41 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
@ 2009-10-19 10:47 ` Gleb Natapov
  2009-10-20  3:30 ` Avi Kivity
  2 siblings, 0 replies; 35+ messages in thread
From: Gleb Natapov @ 2009-10-19 10:47 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On Thu, Oct 15, 2009 at 04:41:41PM +0200, oritw@il.ibm.com wrote:
> Avi,
> We have addressed all of the comments, please apply.
> 
Doesn't apply to current kvm master.

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
  2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
@ 2009-10-19 11:17       ` Gleb Natapov
  2009-10-21 13:27         ` Orit Wasserman
  2009-10-19 12:59       ` Gleb Natapov
  2009-10-20  4:24       ` Avi Kivity
  3 siblings, 1 reply; 35+ messages in thread
From: Gleb Natapov @ 2009-10-19 11:17 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On Thu, Oct 15, 2009 at 04:41:44PM +0200, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
> 
> ---
>  arch/x86/kvm/vmx.c |  468 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/x86.c |    3 +-
>  2 files changed, 459 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 411cbdb..8c186e0 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -61,20 +61,168 @@ module_param_named(unrestricted_guest,
>  static int __read_mostly emulate_invalid_guest_state = 0;
>  module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>  
> +
> +struct __attribute__ ((__packed__)) shadow_vmcs {
> +	u32 revision_id;
> +	u32 abort;
> +	u16 virtual_processor_id;
> +	u16 guest_es_selector;
> +	u16 guest_cs_selector;
> +	u16 guest_ss_selector;
> +	u16 guest_ds_selector;
> +	u16 guest_fs_selector;
> +	u16 guest_gs_selector;
> +	u16 guest_ldtr_selector;
> +	u16 guest_tr_selector;
> +	u16 host_es_selector;
> +	u16 host_cs_selector;
> +	u16 host_ss_selector;
> +	u16 host_ds_selector;
> +	u16 host_fs_selector;
> +	u16 host_gs_selector;
> +	u16 host_tr_selector;
> +	u64 io_bitmap_a;
> +	u64 io_bitmap_b;
> +	u64 msr_bitmap;
> +	u64 vm_exit_msr_store_addr;
> +	u64 vm_exit_msr_load_addr;
> +	u64 vm_entry_msr_load_addr;
> +	u64 tsc_offset;
> +	u64 virtual_apic_page_addr;
> +	u64 apic_access_addr;
> +	u64 ept_pointer;
> +	u64 guest_physical_address;
> +	u64 vmcs_link_pointer;
> +	u64 guest_ia32_debugctl;
> +	u64 guest_ia32_pat;
> +	u64 guest_pdptr0;
> +	u64 guest_pdptr1;
> +	u64 guest_pdptr2;
> +	u64 guest_pdptr3;
> +	u64 host_ia32_pat;
> +	u32 pin_based_vm_exec_control;
> +	u32 cpu_based_vm_exec_control;
> +	u32 exception_bitmap;
> +	u32 page_fault_error_code_mask;
> +	u32 page_fault_error_code_match;
> +	u32 cr3_target_count;
> +	u32 vm_exit_controls;
> +	u32 vm_exit_msr_store_count;
> +	u32 vm_exit_msr_load_count;
> +	u32 vm_entry_controls;
> +	u32 vm_entry_msr_load_count;
> +	u32 vm_entry_intr_info_field;
> +	u32 vm_entry_exception_error_code;
> +	u32 vm_entry_instruction_len;
> +	u32 tpr_threshold;
> +	u32 secondary_vm_exec_control;
> +	u32 vm_instruction_error;
> +	u32 vm_exit_reason;
> +	u32 vm_exit_intr_info;
> +	u32 vm_exit_intr_error_code;
> +	u32 idt_vectoring_info_field;
> +	u32 idt_vectoring_error_code;
> +	u32 vm_exit_instruction_len;
> +	u32 vmx_instruction_info;
> +	u32 guest_es_limit;
> +	u32 guest_cs_limit;
> +	u32 guest_ss_limit;
> +	u32 guest_ds_limit;
> +	u32 guest_fs_limit;
> +	u32 guest_gs_limit;
> +	u32 guest_ldtr_limit;
> +	u32 guest_tr_limit;
> +	u32 guest_gdtr_limit;
> +	u32 guest_idtr_limit;
> +	u32 guest_es_ar_bytes;
> +	u32 guest_cs_ar_bytes;
> +	u32 guest_ss_ar_bytes;
> +	u32 guest_ds_ar_bytes;
> +	u32 guest_fs_ar_bytes;
> +	u32 guest_gs_ar_bytes;
> +	u32 guest_ldtr_ar_bytes;
> +	u32 guest_tr_ar_bytes;
> +	u32 guest_interruptibility_info;
> +	u32 guest_activity_state;
> +	u32 guest_sysenter_cs;
> +	u32 host_ia32_sysenter_cs;
> +	unsigned long cr0_guest_host_mask;
> +	unsigned long cr4_guest_host_mask;
> +	unsigned long cr0_read_shadow;
> +	unsigned long cr4_read_shadow;
> +	unsigned long cr3_target_value0;
> +	unsigned long cr3_target_value1;
> +	unsigned long cr3_target_value2;
> +	unsigned long cr3_target_value3;
> +	unsigned long exit_qualification;
> +	unsigned long guest_linear_address;
> +	unsigned long guest_cr0;
> +	unsigned long guest_cr3;
> +	unsigned long guest_cr4;
> +	unsigned long guest_es_base;
> +	unsigned long guest_cs_base;
> +	unsigned long guest_ss_base;
> +	unsigned long guest_ds_base;
> +	unsigned long guest_fs_base;
> +	unsigned long guest_gs_base;
> +	unsigned long guest_ldtr_base;
> +	unsigned long guest_tr_base;
> +	unsigned long guest_gdtr_base;
> +	unsigned long guest_idtr_base;
> +	unsigned long guest_dr7;
> +	unsigned long guest_rsp;
> +	unsigned long guest_rip;
> +	unsigned long guest_rflags;
> +	unsigned long guest_pending_dbg_exceptions;
> +	unsigned long guest_sysenter_esp;
> +	unsigned long guest_sysenter_eip;
> +	unsigned long host_cr0;
> +	unsigned long host_cr3;
> +	unsigned long host_cr4;
> +	unsigned long host_fs_base;
> +	unsigned long host_gs_base;
> +	unsigned long host_tr_base;
> +	unsigned long host_gdtr_base;
> +	unsigned long host_idtr_base;
> +	unsigned long host_ia32_sysenter_esp;
> +	unsigned long host_ia32_sysenter_eip;
> +	unsigned long host_rsp;
> +	unsigned long host_rip;
> +};
> +
>  struct __attribute__ ((__packed__)) level_state {
>  	/* Has the level1 guest done vmclear? */
>  	bool vmclear;
> +	u16 vpid;
> +	u64 shadow_efer;
> +	unsigned long cr2;
> +	unsigned long cr3;
> +	unsigned long cr4;
> +	unsigned long cr8;
> +
> +	u64 io_bitmap_a;
> +	u64 io_bitmap_b;
> +	u64 msr_bitmap;
> +
> +	struct shadow_vmcs *shadow_vmcs;
> +
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
>  };
>  
>  struct nested_vmx {
>  	/* Has the level1 guest done vmxon? */
>  	bool vmxon;
> -
> +	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> +	u64 vmptr;
>  	/*
>  	 * Level 2 state : includes vmcs,registers and
>  	 * a copy of vmcs12 for vmread/vmwrite
>  	 */
>  	struct level_state *l2_state;
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
>  };
>  
>  struct vmcs {
> @@ -140,6 +288,25 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
>  	return container_of(vcpu, struct vcpu_vmx, vcpu);
>  }
>  
> +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> +				    u64 vmcs_addr)
> +{
> +	struct page *vmcs_page = NULL;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> +	up_read(&current->mm->mmap_sem);
> +
> +	if (is_error_page(vmcs_page)) {
> +		printk(KERN_ERR "%s error allocating page \n", __func__);
> +		kvm_release_page_clean(vmcs_page);
> +		return NULL;
> +	}
> +
> +	return vmcs_page;
> +
> +}
> +
>  static int init_rmode(struct kvm *kvm);
>  static u64 construct_eptp(unsigned long root_hpa);
>  
> @@ -197,6 +364,8 @@ static struct kvm_vmx_segment_field {
>  
>  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
>  
> +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
> +static int create_l1_state(struct kvm_vcpu *vcpu);
>  static int create_l2_state(struct kvm_vcpu *vcpu);
>  
>  /*
> @@ -715,6 +884,24 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
>  	preempt_enable();
>  }
>  
> +
> +static int vmptrld(struct kvm_vcpu *vcpu,
> +		   u64 phys_addr)
> +{
> +	u8 error;
> +
> +	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> +		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> +		      : "cc");
> +	if (error) {
> +		printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
> +		       __func__, phys_addr);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
>   * vcpu mutex is already taken.
> @@ -736,15 +923,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	}
>  
>  	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
> -		u8 error;
> -
>  		per_cpu(current_vmcs, cpu) = vmx->vmcs;
> -		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> -			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> -			      : "cc");
> -		if (error)
> -			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
> -			       vmx->vmcs, phys_addr);
> +		vmptrld(vcpu, phys_addr);
>  	}
>  
>  	if (vcpu->cpu != cpu) {
> @@ -1318,6 +1498,28 @@ struct level_state *create_state(void)
>  	return state;
>  }
>  
> +int create_l1_state(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.l1_state) {
> +		vmx->nested.l1_state = create_state();
> +		if (!vmx->nested.l1_state)
> +			return -ENOMEM;
> +	} else
> +		return 0;
> +
> +	vmx->nested.l1_state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!vmx->nested.l1_state->shadow_vmcs) {
> +		printk(KERN_INFO "%s error creating shadow vmcs\n",
> +		       __func__);
> +		kfree(vmx->nested.l1_state);
> +		return -ENOMEM;
> +	}
> +	return 0;
> +}
> +
> +static struct vmcs *alloc_vmcs(void);
>  int create_l2_state(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -1326,11 +1528,53 @@ int create_l2_state(struct kvm_vcpu *vcpu)
>  		vmx->nested.l2_state = create_state();
>  		if (!vmx->nested.l2_state)
>  			return -ENOMEM;
> +	} else
> +		return 0;
> +
> +	vmx->nested.l2_state->vmcs = alloc_vmcs();
> +	if (!vmx->nested.l2_state->vmcs) {
> +		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> +		kfree(vmx->nested.l2_state);
> +		return -ENOMEM;
>  	}
>  
> +	if (cpu_has_vmx_msr_bitmap())
> +		vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +	else
> +		vmx->nested.l2_state->msr_bitmap = 0;
> +
> +	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +
>  	return 0;
>  }
>  
> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> +			struct kvm_vcpu *vcpu);
> +
Move to header.

> +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> +{
Please make all local functions static. Here and in all other patches.

> +
> +	int r = 0;
> +
> +	r = kvm_read_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX], gentry,
> +				sizeof(u64), vcpu);
vmptrld operand can be not only in RAX but in other registers too or in memory.

> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> +		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
> +		return r;
> +	}
> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +
>  /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
>   * tricks.
>   */
> @@ -3374,6 +3618,66 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page;
> +	u64 guest_vmcs_addr;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> +		return 1;
> +
> +	if (create_l1_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	if (create_l2_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> +		return 1;
> +	}
create_l2_state() is called on vmxon. As far as I can see this is nop
here.

> +
> +	if (vmx->nested.vmptr != guest_vmcs_addr) {
> +		/* checking vmptr address */
> +		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> +		if (vmcs_page == NULL)
> +			return 1;
> +
> +		vmx->nested.vmptr = guest_vmcs_addr;
> +
> +		kvm_release_page_clean(vmcs_page);
> +	}
> +
> +	clear_rflags_cf_zf(vcpu);
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +
> +int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
> +			 struct kvm_vcpu *vcpu);
Move to header.

> +
> +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> +{
> +	int r = 0;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	r = kvm_write_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX],
> +				 (void *)&to_vmx(vcpu)->nested.vmptr,
> +				 sizeof(u64), vcpu);
Same as vmptrld. Why are you assuming RAX?

> +	if (r) {
> +		printk(KERN_INFO "%s failed to write vmptr\n", __func__);
> +		return 1;
> +	}
> +	clear_rflags_cf_zf(vcpu);
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +
>  static int handle_invlpg(struct kvm_vcpu *vcpu)
>  {
>  	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> @@ -3644,8 +3948,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_VMCALL]                  = handle_vmcall,
>  	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
>  	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
> -	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
> -	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
> +	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
> +	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
>  	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
>  	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
>  	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
> @@ -4183,6 +4487,148 @@ static bool vmx_gb_page_enable(void)
>  	return false;
>  }
>  
> +void save_vmcs(struct shadow_vmcs *dst)
> +{
Not used by this patch. May be introduce in the patch that uses it.

> +	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	if (cpu_has_vmx_msr_bitmap())
> +		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +
> +	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> +	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> +	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> +	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> +	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> +	if (enable_ept)
> +		dst->ept_pointer = vmcs_read64(EPT_POINTER);
> +
> +	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	if (enable_ept) {
> +		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> +		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> +		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> +		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> +	}
> +	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> +	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> +	dst->page_fault_error_code_mask =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> +	dst->page_fault_error_code_match =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> +	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> +	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> +	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> +	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> +	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> +	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	dst->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> +	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	if (enable_vpid && dst->secondary_vm_exec_control &
> +	    SECONDARY_EXEC_ENABLE_VPID)
> +		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> +	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> +	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	dst->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> +	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> +	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> +	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> +	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> +	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> +	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> +	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> +	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> +	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> +	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	dst->guest_rsp = vmcs_readl(GUEST_RSP);
> +	dst->guest_rip = vmcs_readl(GUEST_RIP);
> +	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	dst->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +	dst->host_cr0 = vmcs_readl(HOST_CR0);
> +	dst->host_cr3 = vmcs_readl(HOST_CR3);
> +	dst->host_cr4 = vmcs_readl(HOST_CR4);
> +	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> +	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> +	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> +	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> +	dst->host_rsp = vmcs_readl(HOST_RSP);
> +	dst->host_rip = vmcs_readl(HOST_RIP);
> +	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> +		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9c39092..74eb888 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2473,6 +2473,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
>  out:
>  	return r;
>  }
> +EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
>  
>  static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
>  				struct kvm_vcpu *vcpu)
> @@ -2503,7 +2504,7 @@ static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
>  out:
>  	return r;
>  }
> -
> +EXPORT_SYMBOL_GPL(kvm_write_guest_virt);
>  
>  static int emulator_read_emulated(unsigned long addr,
>  				  void *val,
> -- 
> 1.6.0.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
  2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
  2009-10-19 11:17       ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst Gleb Natapov
@ 2009-10-19 12:59       ` Gleb Natapov
  2009-10-21 13:28         ` Orit Wasserman
  2009-10-20  4:24       ` Avi Kivity
  3 siblings, 1 reply; 35+ messages in thread
From: Gleb Natapov @ 2009-10-19 12:59 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On Thu, Oct 15, 2009 at 04:41:44PM +0200, oritw@il.ibm.com wrote:
> +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> +				    u64 vmcs_addr)
> +{
> +	struct page *vmcs_page = NULL;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> +	up_read(&current->mm->mmap_sem);
Why are you taking mmap_sem here? gup_fast() takes it if required.

> +
> +	if (is_error_page(vmcs_page)) {
> +		printk(KERN_ERR "%s error allocating page \n", __func__);
> +		kvm_release_page_clean(vmcs_page);
> +		return NULL;
> +	}
> +
> +	return vmcs_page;
> +
> +}
> +

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite
  2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
  2009-10-15 14:41         ` [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume oritw
@ 2009-10-19 13:17         ` Gleb Natapov
  2009-10-21 13:32           ` Orit Wasserman
  2009-10-20  4:44         ` Avi Kivity
  2 siblings, 1 reply; 35+ messages in thread
From: Gleb Natapov @ 2009-10-19 13:17 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On Thu, Oct 15, 2009 at 04:41:45PM +0200, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
> 
> ---
>  arch/x86/kvm/vmx.c |  591 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 589 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 8c186e0..6a4c252 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -225,6 +225,21 @@ struct nested_vmx {
>  	struct level_state *l1_state;
>  };
>  
> +enum vmcs_field_type {
> +	VMCS_FIELD_TYPE_U16 = 0,
> +	VMCS_FIELD_TYPE_U64 = 1,
> +	VMCS_FIELD_TYPE_U32 = 2,
> +	VMCS_FIELD_TYPE_ULONG = 3
> +};
> +
> +#define VMCS_FIELD_LENGTH_OFFSET 13
> +#define VMCS_FIELD_LENGTH_MASK 0x6000
> +
> +static inline int vmcs_field_length(unsigned long field)
> +{
> +	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
> +}
> +
>  struct vmcs {
>  	u32 revision_id;
>  	u32 abort;
> @@ -288,6 +303,404 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
>  	return container_of(vcpu, struct vcpu_vmx, vcpu);
>  }
>  
> +#define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
> +
> +static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
> +
> +	[VIRTUAL_PROCESSOR_ID] =
> +		SHADOW_VMCS_OFFSET(virtual_processor_id),
> +	[GUEST_ES_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_es_selector),
> +	[GUEST_CS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_cs_selector),
> +	[GUEST_SS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_ss_selector),
> +	[GUEST_DS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_ds_selector),
> +	[GUEST_FS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_fs_selector),
> +	[GUEST_GS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_gs_selector),
> +	[GUEST_LDTR_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_ldtr_selector),
> +	[GUEST_TR_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(guest_tr_selector),
> +	[HOST_ES_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(host_es_selector),
> +	[HOST_CS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(host_cs_selector),
> +	[HOST_SS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(host_ss_selector),
> +	[HOST_DS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(host_ds_selector),
> +	[HOST_FS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(host_fs_selector),
> +	[HOST_GS_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(host_gs_selector),
> +	[HOST_TR_SELECTOR] =
> +		SHADOW_VMCS_OFFSET(host_tr_selector),
> +	[IO_BITMAP_A] =
> +		SHADOW_VMCS_OFFSET(io_bitmap_a),
> +	[IO_BITMAP_A_HIGH] =
> +		SHADOW_VMCS_OFFSET(io_bitmap_a)+4,
> +	[IO_BITMAP_B] =
> +		SHADOW_VMCS_OFFSET(io_bitmap_b),
> +	[IO_BITMAP_B_HIGH] =
> +		SHADOW_VMCS_OFFSET(io_bitmap_b)+4,
> +	[MSR_BITMAP] =
> +		SHADOW_VMCS_OFFSET(msr_bitmap),
> +	[MSR_BITMAP_HIGH] =
> +		SHADOW_VMCS_OFFSET(msr_bitmap)+4,
> +	[VM_EXIT_MSR_STORE_ADDR] =
> +		SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr),
> +	[VM_EXIT_MSR_STORE_ADDR_HIGH] =
> +		SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr)+4,
> +	[VM_EXIT_MSR_LOAD_ADDR] =
> +		SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr),
> +	[VM_EXIT_MSR_LOAD_ADDR_HIGH] =
> +		SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr)+4,
> +	[VM_ENTRY_MSR_LOAD_ADDR] =
> +		SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr),
> +	[VM_ENTRY_MSR_LOAD_ADDR_HIGH] =
> +		SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr)+4,
> +	[TSC_OFFSET] =
> +		SHADOW_VMCS_OFFSET(tsc_offset),
> +	[TSC_OFFSET_HIGH] =
> +		SHADOW_VMCS_OFFSET(tsc_offset)+4,
> +	[VIRTUAL_APIC_PAGE_ADDR] =
> +		SHADOW_VMCS_OFFSET(virtual_apic_page_addr),
> +	[VIRTUAL_APIC_PAGE_ADDR_HIGH] =
> +		SHADOW_VMCS_OFFSET(virtual_apic_page_addr)+4,
> +	[APIC_ACCESS_ADDR] =
> +		SHADOW_VMCS_OFFSET(apic_access_addr),
> +	[APIC_ACCESS_ADDR_HIGH] =
> +		SHADOW_VMCS_OFFSET(apic_access_addr)+4,
> +	[EPT_POINTER] =
> +		SHADOW_VMCS_OFFSET(ept_pointer),
> +	[EPT_POINTER_HIGH] =
> +		SHADOW_VMCS_OFFSET(ept_pointer)+4,
> +	[GUEST_PHYSICAL_ADDRESS] =
> +		SHADOW_VMCS_OFFSET(guest_physical_address),
> +	[GUEST_PHYSICAL_ADDRESS_HIGH] =
> +		SHADOW_VMCS_OFFSET(guest_physical_address)+4,
> +	[VMCS_LINK_POINTER] =
> +		SHADOW_VMCS_OFFSET(vmcs_link_pointer),
> +	[VMCS_LINK_POINTER_HIGH] =
> +		SHADOW_VMCS_OFFSET(vmcs_link_pointer)+4,
> +	[GUEST_IA32_DEBUGCTL] =
> +		SHADOW_VMCS_OFFSET(guest_ia32_debugctl),
> +	[GUEST_IA32_DEBUGCTL_HIGH] =
> +		SHADOW_VMCS_OFFSET(guest_ia32_debugctl)+4,
> +	[GUEST_IA32_PAT] =
> +		SHADOW_VMCS_OFFSET(guest_ia32_pat),
> +	[GUEST_IA32_PAT_HIGH] =
> +		SHADOW_VMCS_OFFSET(guest_ia32_pat)+4,
> +	[GUEST_PDPTR0] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr0),
> +	[GUEST_PDPTR0_HIGH] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr0)+4,
> +	[GUEST_PDPTR1] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr1),
> +	[GUEST_PDPTR1_HIGH] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr1)+4,
> +	[GUEST_PDPTR2] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr2),
> +	[GUEST_PDPTR2_HIGH] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr2)+4,
> +	[GUEST_PDPTR3] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr3),
> +	[GUEST_PDPTR3_HIGH] =
> +		SHADOW_VMCS_OFFSET(guest_pdptr3)+4,
> +	[HOST_IA32_PAT] =
> +		SHADOW_VMCS_OFFSET(host_ia32_pat),
> +	[HOST_IA32_PAT_HIGH] =
> +		SHADOW_VMCS_OFFSET(host_ia32_pat)+4,
> +	[PIN_BASED_VM_EXEC_CONTROL] =
> +		SHADOW_VMCS_OFFSET(pin_based_vm_exec_control),
> +	[CPU_BASED_VM_EXEC_CONTROL] =
> +		SHADOW_VMCS_OFFSET(cpu_based_vm_exec_control),
> +	[EXCEPTION_BITMAP] =
> +		SHADOW_VMCS_OFFSET(exception_bitmap),
> +	[PAGE_FAULT_ERROR_CODE_MASK] =
> +		SHADOW_VMCS_OFFSET(page_fault_error_code_mask),
> +	[PAGE_FAULT_ERROR_CODE_MATCH] =
> +		SHADOW_VMCS_OFFSET(page_fault_error_code_match),
> +	[CR3_TARGET_COUNT] =
> +		SHADOW_VMCS_OFFSET(cr3_target_count),
> +	[VM_EXIT_CONTROLS] =
> +		SHADOW_VMCS_OFFSET(vm_exit_controls),
> +	[VM_EXIT_MSR_STORE_COUNT] =
> +		SHADOW_VMCS_OFFSET(vm_exit_msr_store_count),
> +	[VM_EXIT_MSR_LOAD_COUNT] =
> +		SHADOW_VMCS_OFFSET(vm_exit_msr_load_count),
> +	[VM_ENTRY_CONTROLS] =
> +		SHADOW_VMCS_OFFSET(vm_entry_controls),
> +	[VM_ENTRY_MSR_LOAD_COUNT] =
> +		SHADOW_VMCS_OFFSET(vm_entry_msr_load_count),
> +	[VM_ENTRY_INTR_INFO_FIELD] =
> +		SHADOW_VMCS_OFFSET(vm_entry_intr_info_field),
> +	[VM_ENTRY_EXCEPTION_ERROR_CODE] =
> +		SHADOW_VMCS_OFFSET(vm_entry_exception_error_code),
> +	[VM_ENTRY_INSTRUCTION_LEN] =
> +		SHADOW_VMCS_OFFSET(vm_entry_instruction_len),
> +	[TPR_THRESHOLD] =
> +		SHADOW_VMCS_OFFSET(tpr_threshold),
> +	[SECONDARY_VM_EXEC_CONTROL] =
> +		SHADOW_VMCS_OFFSET(secondary_vm_exec_control),
> +	[VM_INSTRUCTION_ERROR] =
> +		SHADOW_VMCS_OFFSET(vm_instruction_error),
> +	[VM_EXIT_REASON] =
> +		SHADOW_VMCS_OFFSET(vm_exit_reason),
> +	[VM_EXIT_INTR_INFO] =
> +		SHADOW_VMCS_OFFSET(vm_exit_intr_info),
> +	[VM_EXIT_INTR_ERROR_CODE] =
> +		SHADOW_VMCS_OFFSET(vm_exit_intr_error_code),
> +	[IDT_VECTORING_INFO_FIELD] =
> +		SHADOW_VMCS_OFFSET(idt_vectoring_info_field),
> +	[IDT_VECTORING_ERROR_CODE] =
> +		SHADOW_VMCS_OFFSET(idt_vectoring_error_code),
> +	[VM_EXIT_INSTRUCTION_LEN] =
> +		SHADOW_VMCS_OFFSET(vm_exit_instruction_len),
> +	[VMX_INSTRUCTION_INFO] =
> +		SHADOW_VMCS_OFFSET(vmx_instruction_info),
> +	[GUEST_ES_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_es_limit),
> +	[GUEST_CS_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_cs_limit),
> +	[GUEST_SS_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_ss_limit),
> +	[GUEST_DS_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_ds_limit),
> +	[GUEST_FS_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_fs_limit),
> +	[GUEST_GS_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_gs_limit),
> +	[GUEST_LDTR_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_ldtr_limit),
> +	[GUEST_TR_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_tr_limit),
> +	[GUEST_GDTR_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_gdtr_limit),
> +	[GUEST_IDTR_LIMIT] =
> +		SHADOW_VMCS_OFFSET(guest_idtr_limit),
> +	[GUEST_ES_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_es_ar_bytes),
> +	[GUEST_CS_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_cs_ar_bytes),
> +	[GUEST_SS_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_ss_ar_bytes),
> +	[GUEST_DS_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_ds_ar_bytes),
> +	[GUEST_FS_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_fs_ar_bytes),
> +	[GUEST_GS_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_gs_ar_bytes),
> +	[GUEST_LDTR_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_ldtr_ar_bytes),
> +	[GUEST_TR_AR_BYTES] =
> +		SHADOW_VMCS_OFFSET(guest_tr_ar_bytes),
> +	[GUEST_INTERRUPTIBILITY_INFO] =
> +		SHADOW_VMCS_OFFSET(guest_interruptibility_info),
> +	[GUEST_ACTIVITY_STATE] =
> +		SHADOW_VMCS_OFFSET(guest_activity_state),
> +	[GUEST_SYSENTER_CS] =
> +		SHADOW_VMCS_OFFSET(guest_sysenter_cs),
> +	[HOST_IA32_SYSENTER_CS] =
> +		SHADOW_VMCS_OFFSET(host_ia32_sysenter_cs),
> +	[CR0_GUEST_HOST_MASK] =
> +		SHADOW_VMCS_OFFSET(cr0_guest_host_mask),
> +	[CR4_GUEST_HOST_MASK] =
> +		SHADOW_VMCS_OFFSET(cr4_guest_host_mask),
> +	[CR0_READ_SHADOW] =
> +		SHADOW_VMCS_OFFSET(cr0_read_shadow),
> +	[CR4_READ_SHADOW] =
> +		SHADOW_VMCS_OFFSET(cr4_read_shadow),
> +	[CR3_TARGET_VALUE0] =
> +		SHADOW_VMCS_OFFSET(cr3_target_value0),
> +	[CR3_TARGET_VALUE1] =
> +		SHADOW_VMCS_OFFSET(cr3_target_value1),
> +	[CR3_TARGET_VALUE2] =
> +		SHADOW_VMCS_OFFSET(cr3_target_value2),
> +	[CR3_TARGET_VALUE3] =
> +		SHADOW_VMCS_OFFSET(cr3_target_value3),
> +	[EXIT_QUALIFICATION] =
> +		SHADOW_VMCS_OFFSET(exit_qualification),
> +	[GUEST_LINEAR_ADDRESS] =
> +		SHADOW_VMCS_OFFSET(guest_linear_address),
> +	[GUEST_CR0] =
> +		SHADOW_VMCS_OFFSET(guest_cr0),
> +	[GUEST_CR3] =
> +		SHADOW_VMCS_OFFSET(guest_cr3),
> +	[GUEST_CR4] =
> +		SHADOW_VMCS_OFFSET(guest_cr4),
> +	[GUEST_ES_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_es_base),
> +	[GUEST_CS_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_cs_base),
> +	[GUEST_SS_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_ss_base),
> +	[GUEST_DS_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_ds_base),
> +	[GUEST_FS_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_fs_base),
> +	[GUEST_GS_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_gs_base),
> +	[GUEST_LDTR_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_ldtr_base),
> +	[GUEST_TR_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_tr_base),
> +	[GUEST_GDTR_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_gdtr_base),
> +	[GUEST_IDTR_BASE] =
> +		SHADOW_VMCS_OFFSET(guest_idtr_base),
> +	[GUEST_DR7] =
> +		SHADOW_VMCS_OFFSET(guest_dr7),
> +	[GUEST_RSP] =
> +		SHADOW_VMCS_OFFSET(guest_rsp),
> +	[GUEST_RIP] =
> +		SHADOW_VMCS_OFFSET(guest_rip),
> +	[GUEST_RFLAGS] =
> +		SHADOW_VMCS_OFFSET(guest_rflags),
> +	[GUEST_PENDING_DBG_EXCEPTIONS] =
> +		SHADOW_VMCS_OFFSET(guest_pending_dbg_exceptions),
> +	[GUEST_SYSENTER_ESP] =
> +		SHADOW_VMCS_OFFSET(guest_sysenter_esp),
> +	[GUEST_SYSENTER_EIP] =
> +		SHADOW_VMCS_OFFSET(guest_sysenter_eip),
> +	[HOST_CR0] =
> +		SHADOW_VMCS_OFFSET(host_cr0),
> +	[HOST_CR3] =
> +		SHADOW_VMCS_OFFSET(host_cr3),
> +	[HOST_CR4] =
> +		SHADOW_VMCS_OFFSET(host_cr4),
> +	[HOST_FS_BASE] =
> +		SHADOW_VMCS_OFFSET(host_fs_base),
> +	[HOST_GS_BASE] =
> +		SHADOW_VMCS_OFFSET(host_gs_base),
> +	[HOST_TR_BASE] =
> +		SHADOW_VMCS_OFFSET(host_tr_base),
> +	[HOST_GDTR_BASE] =
> +		SHADOW_VMCS_OFFSET(host_gdtr_base),
> +	[HOST_IDTR_BASE] =
> +		SHADOW_VMCS_OFFSET(host_idtr_base),
> +	[HOST_IA32_SYSENTER_ESP] =
> +		SHADOW_VMCS_OFFSET(host_ia32_sysenter_esp),
> +	[HOST_IA32_SYSENTER_EIP] =
> +		SHADOW_VMCS_OFFSET(host_ia32_sysenter_eip),
> +	[HOST_RSP] =
> +		SHADOW_VMCS_OFFSET(host_rsp),
> +	[HOST_RIP] =
> +		SHADOW_VMCS_OFFSET(host_rip),
> +};
> +
> +static inline unsigned short vmcs_field_to_offset(unsigned long field)
> +{
> +
> +	if (field > HOST_RIP || vmcs_field_to_offset_table[field] == 0) {
> +		printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
> +		return -1;
> +	}
> +
> +	return vmcs_field_to_offset_table[field];
> +}
> +
> +static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
> +					      unsigned long field)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long *entry;
> +
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> +		return -1;
> +	}
> +
> +	entry = (unsigned long *)((char *)(vmx->nested.l2_state->shadow_vmcs) +
> +				 vmcs_field_to_offset(field));
> +	return *entry;
> +}
> +
> +static inline u16 nested_vmcs_read16(struct kvm_vcpu *vcpu,
> +				     unsigned long field)
> +{
> +	return nested_vmcs_readl(vcpu, field);
> +}
> +
> +static inline u32 nested_vmcs_read32(struct kvm_vcpu *vcpu, unsigned long field)
> +{
> +	return nested_vmcs_readl(vcpu, field);
> +}
> +
> +static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu, unsigned long field)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u64 *entry;
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> +		return -1;
> +	}
> +
> +	entry = (u64 *)((char *)(vmx->nested.l2_state->shadow_vmcs) +
> +				 vmcs_field_to_offset(field));
> +	return *entry;
> +}
> +
> +static inline void nested_vmcs_writel(struct kvm_vcpu *vcpu,
> +				      unsigned long field, unsigned long value)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long entry =
> +		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
> +
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> +		return;
> +	}
> +	entry += vmcs_field_to_offset(field);
> +	*(unsigned long *)entry = value;
> +}
> +
> +static inline void nested_vmcs_write16(struct kvm_vcpu *vcpu,
> +				       unsigned long field, u16 value)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long entry =
> +		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
> +
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> +		return;
> +	}
> +	entry += vmcs_field_to_offset(field);
> +	*(u16 *)entry = value;
> +}
> +
> +static inline void nested_vmcs_write32(struct kvm_vcpu *vcpu,
> +				       unsigned long field, u32 value)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long entry =
> +		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
> +
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> +		return;
> +	}
> +	entry += vmcs_field_to_offset(field);
> +	*(u32 *)entry = value;
> +}
> +
> +static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
> +				       unsigned long field, u64 value)
> +{
> +#ifdef CONFIG_X86_64
> +	nested_vmcs_writel(vcpu, field, value);
> +#else /* nested: 32 bit not actually tested */
> +	nested_vmcs_writel(vcpu, field, value);
> +	nested_vmcs_writel(vcpu, field+1, value >> 32);
> +#endif
> +}
> +
>  static struct page *nested_get_page(struct kvm_vcpu *vcpu,
>  				    u64 vmcs_addr)
>  {
> @@ -307,6 +720,50 @@ static struct page *nested_get_page(struct kvm_vcpu *vcpu,
>  
>  }
>  
> +static int nested_map_shadow_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page = nested_get_page(vcpu, vmx->nested.vmptr);
> +
> +	if (vmcs_page == NULL) {
> +		printk(KERN_INFO "%s: failure in nested_get_page\n",__func__);
> +		return 0;
> +	}
> +
> +	if (vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_INFO "%s: shadow vmcs already mapped\n",__func__);
> +		return 0;
> +	}
> +
> +	vmx->nested.l2_state->shadow_vmcs = kmap_atomic(vmcs_page, KM_USER0);
> +
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_INFO "%s: error in kmap_atomic\n",__func__);
> +		return 0;
> +	}
> +
> +	return 1;
> +}
Cleanup after error is non-existent in this function.

> +
> +static void nested_unmap_shadow_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	struct page *page;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk("Shadow vmcs already unmapped\n");
> +		return;
> +	}
> +
> +	page = kmap_atomic_to_page(vmx->nested.l2_state->shadow_vmcs);
> +
> +	kunmap_atomic(vmx->nested.l2_state->shadow_vmcs, KM_USER0);
> +
> +	kvm_release_page_dirty(page);
> +
> +	vmx->nested.l2_state->shadow_vmcs = NULL;
> +}
> +
>  static int init_rmode(struct kvm *kvm);
>  static u64 construct_eptp(unsigned long root_hpa);
>  
> @@ -3550,6 +4007,26 @@ static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
>  	vmx_set_rflags(vcpu, rflags);
>  }
>  
> +static void set_rflags_to_vmx_fail_invalid(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long rflags;
> +	rflags = vmx_get_rflags(vcpu);
> +	rflags |= X86_EFLAGS_CF;
> +	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
> +		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
> +	vmx_set_rflags(vcpu, rflags);
> +}
> +
> +static void set_rflags_to_vmx_fail_valid(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long rflags;
> +	rflags = vmx_get_rflags(vcpu);
> +	rflags |= X86_EFLAGS_ZF;
> +	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
> +		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
> +	vmx_set_rflags(vcpu, rflags);
> +}
> +
>  static int handle_vmclear(struct kvm_vcpu *vcpu)
>  {
>  	if (!nested_vmx_check_permission(vcpu))
> @@ -3563,6 +4040,116 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static int handle_vmread(struct kvm_vcpu *vcpu)
> +{
> +#ifndef CONFIG_X86_64
> +	u64 value;
> +#endif
Can you move this to where it's used to save #ifdef here?

> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (!nested_map_shadow_vmcs(vcpu)) {
> +		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> +		set_rflags_to_vmx_fail_invalid(vcpu);
> +		return 1;
> +	}
> +
> +	switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
> +	case VMCS_FIELD_TYPE_U16:
> +		vcpu->arch.regs[VCPU_REGS_RAX] =
Once again only vmread %rdx,%rax is implemented. No operand decoding.

> +			nested_vmcs_read16(vcpu,
> +					   vcpu->arch.regs[VCPU_REGS_RDX]);
> +		break;
> +	case VMCS_FIELD_TYPE_U32:
> +		vcpu->arch.regs[VCPU_REGS_RAX] =
> +			nested_vmcs_read32(vcpu,
> +					   vcpu->arch.regs[VCPU_REGS_RDX]);
> +		break;
> +	case VMCS_FIELD_TYPE_U64:
> +#ifdef CONFIG_X86_64
> +		vcpu->arch.regs[VCPU_REGS_RAX] =
> +		nested_vmcs_read64(vcpu,
> +					   vcpu->arch.regs[VCPU_REGS_RDX]);
> +#else /* nested: 32 bit not actually tested */
> +		value =  nested_vmcs_read64(vcpu,
> +					    vcpu->arch.regs[VCPU_REGS_RDX]);
> +		vcpu->arch.regs[VCPU_REGS_RAX] = value;
> +		vcpu->arch.regs[VCPU_REGS_RBX] = value >> 32;
> +#endif
> +	break;
> +	case VMCS_FIELD_TYPE_ULONG:
> +		vcpu->arch.regs[VCPU_REGS_RAX] =
> +			nested_vmcs_readl(vcpu,
> +					  vcpu->arch.regs[VCPU_REGS_RDX]);
> +		break;
> +	default:
> +		printk(KERN_INFO "%s invalid field\n", __func__);
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
> +		nested_unmap_shadow_vmcs(vcpu);
> +		return 1;
> +	}
> +
> +	clear_rflags_cf_zf(vcpu);
> +	skip_emulated_instruction(vcpu);
> +	nested_unmap_shadow_vmcs(vcpu);
> +	return 1;
> +}
> +
> +static int handle_vmwrite(struct kvm_vcpu *vcpu)
> +{
> +#ifndef CONFIG_X86_64
> +	u64 value ;
> +#endif
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (!nested_map_shadow_vmcs(vcpu)) {
> +		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> +		set_rflags_to_vmx_fail_invalid(vcpu);
> +		return 1;
> +	}
> +
> +	switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
> +	case VMCS_FIELD_TYPE_U16:
> +		nested_vmcs_write16(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> +				    vcpu->arch.regs[VCPU_REGS_RAX]);
> +		break;
> +	case VMCS_FIELD_TYPE_U32:
> +		nested_vmcs_write32(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> +				    vcpu->arch.regs[VCPU_REGS_RAX]);
> +		break;
> +	case VMCS_FIELD_TYPE_U64:
> +#ifdef CONFIG_X86_64
> +		nested_vmcs_write64(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> +				    vcpu->arch.regs[VCPU_REGS_RAX]);
> +#else /* nested: 32 bit not actually tested */
> +		value =  vcpu->arch.regs[VCPU_REGS_RAX] |
> +			(vcpu->arch.regs[VCPU_REGS_RBX] << 32);
> +		nested_vmcs_write64(vcpu,
> +				    vcpu->arch.regs[VCPU_REGS_RDX], value);
Why not open code part of nested_vmcs_write64 and get rid of #ifdef
there?:
	nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX], vcpu->arch.regs[VCPU_REGS_RAX]);
	nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX]+1, vcpu->arch.regs[VCPU_REGS_RBX]);

> +#endif
> +		break;
> +	case VMCS_FIELD_TYPE_ULONG:
> +		nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> +				   vcpu->arch.regs[VCPU_REGS_RAX]);
> +		break;
> +	default:
> +		printk(KERN_INFO "%s invalid field\n", __func__);
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
> +		nested_unmap_shadow_vmcs(vcpu);
> +		return 1;
> +	}
> +
> +	clear_rflags_cf_zf(vcpu);
> +	skip_emulated_instruction(vcpu);
> +	nested_unmap_shadow_vmcs(vcpu);
> +	return 1;
> +}
> +
>  static int handle_vmoff(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -3950,9 +4537,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
>  	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
>  	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
> -	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
> +	[EXIT_REASON_VMREAD]                  = handle_vmread,
>  	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
> -	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
> +	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
>  	[EXIT_REASON_VMOFF]                   = handle_vmoff,
>  	[EXIT_REASON_VMON]                    = handle_vmon,
>  	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
> -- 
> 1.6.0.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-15 14:41         ` [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume oritw
@ 2009-10-19 17:29           ` Gleb Natapov
  2009-10-21 14:43             ` Orit Wasserman
  2009-10-20  4:56           ` Avi Kivity
  1 sibling, 1 reply; 35+ messages in thread
From: Gleb Natapov @ 2009-10-19 17:29 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On Thu, Oct 15, 2009 at 04:41:46PM +0200, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
> 
> ---
>  arch/x86/kvm/vmx.c | 1173 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 1148 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6a4c252..e814029 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -209,6 +209,7 @@ struct __attribute__ ((__packed__)) level_state {
>  	struct vmcs *vmcs;
>  	int cpu;
>  	int launched;
> +	bool first_launch;
>  };
>  
>  struct nested_vmx {
> @@ -216,6 +217,12 @@ struct nested_vmx {
>  	bool vmxon;
>  	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
>  	u64 vmptr;
> +	/* Are we running nested guest */
> +	bool nested_mode;
> +	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> +	bool nested_run_pending;
> +	/* flag indicating if there was a valid IDT after exiting from l2 */
> +	bool nested_valid_idt;
>  	/*
>  	 * Level 2 state : includes vmcs,registers and
>  	 * a copy of vmcs12 for vmread/vmwrite
> @@ -240,6 +247,10 @@ static inline int vmcs_field_length(unsigned long field)
>  	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
>  }
>  
> +#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
> +					VM_EXIT_SAVE_IA32_PAT))
> +#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
> +					 VM_ENTRY_IA32E_MODE))
>  struct vmcs {
>  	u32 revision_id;
>  	u32 abort;
> @@ -303,6 +314,12 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
>  	return container_of(vcpu, struct vcpu_vmx, vcpu);
>  }
>  
> +static inline struct shadow_vmcs *get_shadow_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	WARN_ON(!to_vmx(vcpu)->nested.l2_state->shadow_vmcs);
> +	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> +}
> +
>  #define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
>  
>  static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
> @@ -822,8 +839,16 @@ static struct kvm_vmx_segment_field {
>  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
>  
>  static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
> +static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
> +				      bool has_error_code, u32 error_code);
> +static int nested_vmx_intr(struct kvm_vcpu *vcpu);
>  static int create_l1_state(struct kvm_vcpu *vcpu);
>  static int create_l2_state(struct kvm_vcpu *vcpu);
> +static int launch_guest(struct kvm_vcpu *vcpu);
> +static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
> +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
> +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> +			     bool is_interrupt);
>  
>  /*
>   * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
> @@ -940,6 +965,18 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
>  	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
>  }
>  
> +static inline int is_exception(u32 intr_info)
> +{
> +	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> +		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
> +}
> +
> +static inline int is_nmi(u32 intr_info)
> +{
> +	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> +		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
> +}
> +
>  static inline int cpu_has_vmx_invept_individual_addr(void)
>  {
>  	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
> @@ -990,6 +1027,51 @@ static inline bool report_flexpriority(void)
>  	return flexpriority_enabled;
>  }
>  
> +static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
> +{
> +	return cpu_has_vmx_tpr_shadow() &&
> +		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> +		CPU_BASED_TPR_SHADOW;
> +}
> +
> +static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
> +{
> +	return cpu_has_secondary_exec_ctrls() &&
> +		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> +		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +}
> +
> +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
> +							   *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
> +		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +}
> +
> +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->
> +		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
> +}
> +
> +static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
> +		SECONDARY_EXEC_ENABLE_VPID;
> +}
> +
> +static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->vm_entry_controls &
> +		VM_ENTRY_LOAD_IA32_PAT;
> +}
> +
> +static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> +		CPU_BASED_USE_MSR_BITMAPS;
> +}
> +
>  static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
>  {
>  	int i;
> @@ -1501,6 +1583,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 intr_info = nr | INTR_INFO_VALID_MASK;
>  
> +	if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
> +		return;
> +
>  	if (has_error_code) {
>  		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
>  		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
> @@ -1943,6 +2028,200 @@ static void vmclear_local_vcpus(void)
>  		__vcpu_clear(vmx);
>  }
>  
> +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> +{
> +	struct shadow_vmcs *l2_shadow_vmcs =
> +		get_shadow_vmcs(vcpu);
> +
> +	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +
> +	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	l2_shadow_vmcs->guest_physical_address =
> +		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	l2_shadow_vmcs->vm_entry_intr_info_field =
> +		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	l2_shadow_vmcs->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	l2_shadow_vmcs->vm_entry_instruction_len =
> +		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vm_instruction_error =
> +		vmcs_read32(VM_INSTRUCTION_ERROR);
> +	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	l2_shadow_vmcs->vm_exit_intr_error_code =
> +		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	l2_shadow_vmcs->idt_vectoring_info_field =
> +		vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	l2_shadow_vmcs->idt_vectoring_error_code =
> +		vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	l2_shadow_vmcs->vm_exit_instruction_len =
> +		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vmx_instruction_info =
> +		vmcs_read32(VMX_INSTRUCTION_INFO);
> +	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	l2_shadow_vmcs->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	l2_shadow_vmcs->guest_activity_state =
> +		vmcs_read32(GUEST_ACTIVITY_STATE);
> +	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +
> +	l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> +
> +	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> +	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> +	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	l2_shadow_vmcs->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +}
> +
> +int load_vmcs_common(struct shadow_vmcs *src)
> +{
> +	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> +	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> +	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> +	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> +	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> +	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> +	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> +	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> +
> +	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> +	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> +
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
> +
> +	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
> +	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
> +	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> +		     src->vm_entry_exception_error_code);
> +	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
> +
> +	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
> +	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
> +	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
> +	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
> +	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
> +	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
> +	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
> +	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
> +	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
> +	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
> +	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
> +	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
> +	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
> +	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
> +	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
> +	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
> +	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
> +	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
> +	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> +		     src->guest_interruptibility_info);
> +	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
> +	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
> +
> +	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
> +	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
> +	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
> +	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
> +	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
> +	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
> +	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
> +	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
> +	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
> +	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
> +	vmcs_writel(GUEST_DR7, src->guest_dr7);
> +	vmcs_writel(GUEST_RSP, src->guest_rsp);
> +	vmcs_writel(GUEST_RIP, src->guest_rip);
> +	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
> +	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> +		    src->guest_pending_dbg_exceptions);
> +	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
> +	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
> +
> +	return 0;
> +}
> +
> +int load_vmcs_host_state(struct shadow_vmcs *src)
> +{
> +	vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
> +	vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
> +	vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
> +	vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
> +	vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
> +	vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
> +	vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
> +
> +	vmcs_write64(TSC_OFFSET, src->tsc_offset);
> +
> +	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> +		vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
> +
> +	vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
> +
> +	vmcs_writel(HOST_CR0, src->host_cr0);
> +	vmcs_writel(HOST_CR3, src->host_cr3);
> +	vmcs_writel(HOST_CR4, src->host_cr4);
> +	vmcs_writel(HOST_FS_BASE, src->host_fs_base);
> +	vmcs_writel(HOST_GS_BASE, src->host_gs_base);
> +	vmcs_writel(HOST_TR_BASE, src->host_tr_base);
> +	vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
> +	vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
> +	vmcs_writel(HOST_RSP, src->host_rsp);
> +	vmcs_writel(HOST_RIP, src->host_rip);
> +	vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
> +	vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
> +
> +	return 0;
> +}
> +
>  struct level_state *create_state(void)
>  {
>  	struct level_state *state = NULL;
> @@ -2003,6 +2282,8 @@ int create_l2_state(struct kvm_vcpu *vcpu)
>  	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
>  	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
>  
> +	vmx->nested.l2_state->first_launch = true;
> +
>  	return 0;
>  }
>  
> @@ -3393,6 +3674,14 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
>  {
>  	u32 cpu_based_vm_exec_control;
>  
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		if (kvm_cpu_has_interrupt(vcpu)) {
Why interrupt will not be present during the call to
enable_irq_window()?

> +			if (nested_vmx_intr(vcpu))
> +				return;
> +		}
> +		return;
> +	}
> +
>  	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>  	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
>  	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
> @@ -3448,6 +3737,10 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  
> +	if (vmx->nested.nested_mode) {
> +		return;
> +	}
> +
>  	if (!cpu_has_virtual_nmis()) {
>  		/*
>  		 * Tracking the NMI-blocked state in software is built upon
> @@ -3489,6 +3782,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
>  
>  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
>  {
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		if (kvm_cpu_has_interrupt(vcpu)) {
> +			if (!nested_vmx_intr(vcpu))
> +				return 0;
> +		}
> +	}
> +
Same as above. kvm_cpu_has_interrupt() should alway return true here.
More interesting question: why not return 0 here if in nested mode and
PIN_BASED_EXT_INTR_MASK is not set and let enable_irq_window() to do
nested exit. This is what svm does as far as I see.
 
>  	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
>  		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
>  			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
> @@ -3993,12 +4293,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> -static int handle_vmx_insn(struct kvm_vcpu *vcpu)
> -{
> -	kvm_queue_exception(vcpu, UD_VECTOR);
> -	return 1;
> -}
> -
>  static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
>  {
>  	unsigned long rflags;
> @@ -4040,6 +4334,27 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static int handle_vmlaunch(struct kvm_vcpu *vcpu)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +	if (!to_vmx(vcpu)->nested.l2_state->vmclear)
> +		return 1;
> +
> +	return launch_guest(vcpu);
> +}
> +
> +static int handle_vmresume(struct kvm_vcpu *vcpu)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (to_vmx(vcpu)->nested.l2_state->vmclear)
> +		return 1;
> +
> +	return launch_guest(vcpu);
> +}
> +
handle_vmlaunch() and handle_vmresume() are looking suspiciously
similar may be move vmclear checking logic into launch_guest()?
It will get additional parameter: expected value of vmclear.

>  static int handle_vmread(struct kvm_vcpu *vcpu)
>  {
>  #ifndef CONFIG_X86_64
> @@ -4050,7 +4365,6 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
>  		return 1;
>  
>  	if (!nested_map_shadow_vmcs(vcpu)) {
> -		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
>  		set_rflags_to_vmx_fail_invalid(vcpu);
>  		return 1;
>  	}
Remove from a patch that add it if you don't need this. Also all
prinks that can be triggered by a guest should be removed/changed to
debug output.

> @@ -4107,7 +4421,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
>  		return 1;
>  
>  	if (!nested_map_shadow_vmcs(vcpu)) {
> -		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
>  		set_rflags_to_vmx_fail_invalid(vcpu);
>  		return 1;
>  	}
> @@ -4137,16 +4450,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
>  				   vcpu->arch.regs[VCPU_REGS_RAX]);
>  		break;
>  	default:
> +		nested_unmap_shadow_vmcs(vcpu);
>  		printk(KERN_INFO "%s invalid field\n", __func__);
>  		set_rflags_to_vmx_fail_valid(vcpu);
>  		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
> -		nested_unmap_shadow_vmcs(vcpu);
>  		return 1;
>  	}
Why this is here and not in the patch that introduce the function?

>  
> +	nested_unmap_shadow_vmcs(vcpu);
>  	clear_rflags_cf_zf(vcpu);
>  	skip_emulated_instruction(vcpu);
> -	nested_unmap_shadow_vmcs(vcpu);
>  	return 1;
>  }
Same.

>  
> @@ -4208,7 +4521,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>  static int handle_vmptrld(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	struct page *vmcs_page;
>  	u64 guest_vmcs_addr;
>  
>  	if (!nested_vmx_check_permission(vcpu))
> @@ -4228,14 +4540,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
>  	}
>  
>  	if (vmx->nested.vmptr != guest_vmcs_addr) {
> -		/* checking vmptr address */
> -		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> -		if (vmcs_page == NULL)
> -			return 1;
> -
>  		vmx->nested.vmptr = guest_vmcs_addr;
> -
> -		kvm_release_page_clean(vmcs_page);
>  	}
Same.

>  
>  	clear_rflags_cf_zf(vcpu);
> @@ -4534,11 +4839,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_INVLPG]		      = handle_invlpg,
>  	[EXIT_REASON_VMCALL]                  = handle_vmcall,
>  	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
> -	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
> +	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
>  	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
>  	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
>  	[EXIT_REASON_VMREAD]                  = handle_vmread,
> -	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
> +	[EXIT_REASON_VMRESUME]                = handle_vmresume,
>  	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
>  	[EXIT_REASON_VMOFF]                   = handle_vmoff,
>  	[EXIT_REASON_VMON]                    = handle_vmon,
> @@ -4566,6 +4871,17 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  
>  	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
>  
> +	if (exit_reason == EXIT_REASON_VMLAUNCH ||
> +	    exit_reason == EXIT_REASON_VMRESUME)
> +		vmx->nested.nested_run_pending = 1;
> +	else
> +		vmx->nested.nested_run_pending = 0;
> +
> +	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
> +		nested_vmx_vmexit(vcpu, false);
> +		return 1;
> +	}
> +
>  	/* If we need to emulate an MMIO from handle_invalid_guest_state
>  	 * we just return 0 */
>  	if (vmx->emulation_required && emulate_invalid_guest_state) {
> @@ -4585,7 +4901,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  			= vmcs_read32(VM_INSTRUCTION_ERROR);
>  		return 0;
>  	}
> -
No spurious line deletions please.

>  	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
>  			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
>  			exit_reason != EXIT_REASON_EPT_VIOLATION &&
> @@ -4593,8 +4908,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
>  		       "(0x%x) and exit reason is 0x%x\n",
>  		       __func__, vectoring_info, exit_reason);
> -
> -	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> +	if (!vmx->nested.nested_mode && unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
>  		if (vmx_interrupt_allowed(vcpu)) {
>  			vmx->soft_vnmi_blocked = 0;
>  		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
> @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>  	int type;
>  	bool idtv_info_valid;
>  
> -	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> -
>  	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
>  
> +	if (vmx->nested.nested_mode)
> +		return;
> +
Why return here? What the function does that should not be done in
nested mode?

> +	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +
>  	/* Handle machine checks before interrupts are enabled */
>  	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
>  	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>  		| vmx->rmode.irq.vector;
>  }
>  
> +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> +{
It seems by this function you are trying to bypass general event
reinjection logic. Why?

> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int irq;
> +	int type;
> +	int errCodeValid;
> +	u32 idt_vectoring_info;
> +	u32 guest_intr;
> +	bool nmi_window_open;
> +	bool interrupt_window_open;
> +
> +	if (vmx->nested.nested_mode && vmx->nested.nested_valid_idt) {
Caller already checked nested_mode why recheck?

> +		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +		irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
> +		type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
> +		errCodeValid = idt_vectoring_info &
> +			VECTORING_INFO_DELIVER_CODE_MASK;
> +
> +		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +		nmi_window_open =
> +			!(guest_intr & (GUEST_INTR_STATE_STI |
> +					GUEST_INTR_STATE_MOV_SS |
> +					GUEST_INTR_STATE_NMI));
> +
> +		interrupt_window_open =
> +			((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
> +			 !(guest_intr & (GUEST_INTR_STATE_STI |
> +					 GUEST_INTR_STATE_MOV_SS)));
> +
> +		if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
> +			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
> +			return 0;
> +		}
> +
> +		if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
> +			printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
> +			return 0;
> +		}
> +
> +		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
> +			irq | type | INTR_INFO_VALID_MASK | errCodeValid);
> +
> +
> +		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
> +			     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
> +
> +		if (errCodeValid)
> +			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> +				     vmcs_read32(IDT_VECTORING_ERROR_CODE));
> +	}
> +
> +	return 1;
> +}
> +
>  #ifdef CONFIG_X86_64
>  #define R "r"
>  #define Q "q"
> @@ -4758,6 +5129,26 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>  static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int r;
> +
> +	if (vmx->nested.nested_mode) {
> +		r = nested_handle_valid_idt(vcpu);
> +		if (!r) {
> +			vmx->fail = 1;
> +			return;
> +		}
> +
> +		if (!nested_map_shadow_vmcs(vcpu)) {
> +			vmx->fail = 1;
> +			return;
> +		}
> +
> +		vmcs_write32(EXCEPTION_BITMAP, get_shadow_vmcs(vcpu)->
> +			     exception_bitmap |
> +			     vmx->nested.l1_state->shadow_vmcs->exception_bitmap);
> +
> +		nested_unmap_shadow_vmcs(vcpu);
> +	}
>  
>  	if (enable_ept && is_paging(vcpu)) {
>  		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
> @@ -4896,6 +5287,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	get_debugreg(vcpu->arch.dr6, 6);
>  
>  	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +
> +	vmx->nested.nested_valid_idt = vmx->nested.nested_mode &&
> +		(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
> +
>  	if (vmx->rmode.irq.pending)
>  		fixup_rmode_irq(vmx);
>  
> @@ -4984,6 +5379,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>  			goto free_vmcs;
>  	}
>  
> +	vmx->nested.vmptr = 0;
> +
> +	vmx->nested.l1_state = NULL;
> +	vmx->nested.l2_state = NULL;
> +
>  	return &vmx->vcpu;
>  
>  free_vmcs:
> @@ -5215,6 +5615,729 @@ void save_vmcs(struct shadow_vmcs *dst)
>  	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
>  		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
>  }
> +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct shadow_vmcs *src = get_shadow_vmcs(vcpu);
> +	u32 exec_control;
> +
> +	if (!src) {
> +		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> +		return 1;
> +	}
> +
> +	load_vmcs_common(src);
> +
> +	if (vmx->nested.l2_state->first_launch) {
> +		if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
> +			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
> +
> +		if (vmx->nested.l2_state->io_bitmap_a)
> +			vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
> +
> +		if (vmx->nested.l2_state->io_bitmap_b)
> +			vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
> +
> +		if (vmx->nested.l2_state->msr_bitmap)
> +			vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
> +
> +		if (src->vm_entry_msr_load_count > 0) {
> +			struct page *page;
> +
> +			page = nested_get_page(vcpu,
> +					       src->vm_entry_msr_load_addr);
> +			if (!page)
> +				return 1;
> +
> +			vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
> +
> +			kvm_release_page_clean(page);
> +		}
> +
> +		if (nested_cpu_has_vmx_tpr_shadow(vcpu)) {
> +			struct page *page;
> +
> +			page = nested_get_page(vcpu,
> +					       src->virtual_apic_page_addr);
> +			if (!page)
> +				return 1;
> +
> +			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
> +
> +			kvm_release_page_clean(page);
> +		}
> +
> +		if (nested_vm_need_virtualize_apic_accesses(vcpu)) {
> +			struct page *page =
> +				nested_get_page(vcpu, src->apic_access_addr);
> +			if (!page)
> +				return 1;
> +
> +			vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
> +			kvm_release_page_clean(page);
> +		}
> +
> +		vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> +			     (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
> +			      src->pin_based_vm_exec_control));
> +
> +		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
> +			     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_mask &
> +			      src->page_fault_error_code_mask));
> +
> +		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
> +			     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_match &
> +			      src->page_fault_error_code_match));
> +
> +		if (cpu_has_secondary_exec_ctrls()) {
> +
> +			exec_control =
> +				vmx->nested.l1_state->shadow_vmcs->secondary_vm_exec_control;
> +
> +			if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
> +
> +				exec_control |= src->secondary_vm_exec_control;
> +
> +				if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
> +				    !nested_vm_need_virtualize_apic_accesses(vcpu))
> +					exec_control &=
> +						~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +			}
> +
> +			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> +		}
> +
> +		load_vmcs_host_state(vmx->nested.l1_state->shadow_vmcs);
> +
> +		vmx->nested.l2_state->first_launch = false;
> +	}
> +
> +	if (vm_need_tpr_shadow(vcpu->kvm) &&
> +	    nested_cpu_has_vmx_tpr_shadow(vcpu))
> +		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
> +
> +	if (enable_ept) {
> +		if (!nested_cpu_has_vmx_ept(vcpu)) {
> +			vmcs_write64(EPT_POINTER,
> +				     vmx->nested.l1_state->shadow_vmcs->ept_pointer);
> +			vmcs_write64(GUEST_PDPTR0,
> +				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr0);
> +			vmcs_write64(GUEST_PDPTR1,
> +				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr1);
> +			vmcs_write64(GUEST_PDPTR2,
> +				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr2);
> +			vmcs_write64(GUEST_PDPTR3,
> +				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr3);
> +		}
> +	}
> +
> +	exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
> +
> +	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +
> +	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +
> +	exec_control &= ~CPU_BASED_TPR_SHADOW;
> +
> +	exec_control |= src->cpu_based_vm_exec_control;
> +
> +	if (!vm_need_tpr_shadow(vcpu->kvm) ||
> +	    src->virtual_apic_page_addr == 0) {
> +		exec_control &= ~CPU_BASED_TPR_SHADOW;
> +#ifdef CONFIG_X86_64
> +		exec_control |= CPU_BASED_CR8_STORE_EXITING |
> +			CPU_BASED_CR8_LOAD_EXITING;
> +#endif
> +	} else if (exec_control & CPU_BASED_TPR_SHADOW) {
> +
> +#ifdef CONFIG_X86_64
> +		exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
> +		exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
> +#endif
> +	}
> +
> +	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
> +
> +	vmcs_write32(EXCEPTION_BITMAP,
> +		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
> +		      src->exception_bitmap));
> +
> +	vmcs_write32(VM_EXIT_CONTROLS,
> +		     ((vmx->nested.l1_state->shadow_vmcs->vm_exit_controls &
> +		       NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
> +
> +	vmcs_write32(VM_ENTRY_CONTROLS,
> +		     (vmx->nested.l1_state->shadow_vmcs->vm_entry_controls &
> +		      NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
> +
> +	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
> +
> +	vmcs_writel(CR0_GUEST_HOST_MASK,
> +		    (vmx->nested.l1_state->shadow_vmcs->cr0_guest_host_mask  &
> +		     src->cr0_guest_host_mask));
> +	vmcs_writel(CR4_GUEST_HOST_MASK,
> +		    (vmx->nested.l1_state->shadow_vmcs->cr4_guest_host_mask  &
> +		     src->cr4_guest_host_mask));
> +
> +	return 0;
> +}
> +
> +int switch_back_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
> +
> +	if (enable_vpid && src->virtual_processor_id != 0)
> +		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
> +
> +	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
> +	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
> +
> +	if (cpu_has_vmx_msr_bitmap())
> +		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
> +
> +	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
> +
> +	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
> +		vmcs_write64(APIC_ACCESS_ADDR,
> +			     src->apic_access_addr);
> +
> +	if (enable_ept) {
> +		vmcs_write64(EPT_POINTER, src->ept_pointer);
> +		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
> +		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
> +		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
> +		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
> +	}
> +
> +	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
> +	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
> +	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
> +	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
> +		     src->page_fault_error_code_mask);
> +	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
> +		     src->page_fault_error_code_match);
> +	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
> +	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
> +	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
> +
> +	if (cpu_has_secondary_exec_ctrls())
> +		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
> +			     src->secondary_vm_exec_control);
> +
> +	load_vmcs_common(src);
> +
> +	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_state->shadow_vmcs);
> +
> +	return 0;
> +}
> +
> +void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long mask;
> +
> +	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
> +		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
> +	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
> +		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
> +
> +	mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
> +
> +	if (vcpu->arch.regs_dirty & mask) {
> +		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
> +		       vcpu->arch.regs_dirty, mask);
> +		WARN_ON(1);
> +	}
> +
> +	vcpu->arch.regs_dirty = 0;
> +}
> +
> +static int nested_vmx_run(struct kvm_vcpu *vcpu)
> +{
> +	/* verify that l1 has done vmptrld for l2 earlier */
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int initial_pfu_active = vcpu->fpu_active;
> +	int r = 0;
> +
> +	if (vmx->nested.nested_mode) {
> +		printk(KERN_INFO "Nested guest already running\n");
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		return 1;
> +	}
> +
> +
> +	vmx->nested.nested_mode = 1;
> +
> +	vcpu->arch.exception.pending = false;
Why need this?

> +
> +	sync_cached_regs_to_vmcs(vcpu);
> +
> +	save_vmcs(vmx->nested.l1_state->shadow_vmcs);
> +
> +	vmx->nested.l1_state->shadow_efer = vcpu->arch.shadow_efer;
> +	if (!enable_ept)
> +		vmx->nested.l1_state->cr3 = vcpu->arch.cr3;
> +	vmx->nested.l1_state->cr4 = vcpu->arch.cr4;
> +
> +	if (enable_vpid) {
> +		if (vmx->nested.l2_state->vpid == 0) {
> +			allocate_vpid(vmx);
> +			vmx->nested.l2_state->vpid = vmx->vpid;
> +		}
> +	}
> +
> +	if (cpu_has_vmx_msr_bitmap())
> +		vmx->nested.l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +	else
> +		vmx->nested.l1_state->msr_bitmap = 0;
> +
> +	vmx->nested.l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	vmx->nested.l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	vmx->nested.l1_state->vmcs = vmx->vmcs;
> +	vmx->nested.l1_state->cpu = vcpu->cpu;
> +	vmx->nested.l1_state->launched = vmx->launched;
> +
> +	vmx->vmcs = vmx->nested.l2_state->vmcs;
> +	vcpu->cpu = vmx->nested.l2_state->cpu;
Who initialize vmx->nested.l2_state->cpu before first launch?
Why have different cpu for l1 and l2 guest? It seems like this is global
vcpu thread property.

> +	vmx->launched = vmx->nested.l2_state->launched;
> +
Can you explain why ->launched logic is needed?

> +	if (vmx->nested.l2_state->vmclear || !vmx->launched) {
> +		vmcs_clear(vmx->vmcs);
> +		vmx->launched = 0;
> +		vmx->nested.l2_state->vmclear = 0;
> +	}
> +
> +	vmx_vcpu_load(vcpu, get_cpu());
> +	put_cpu();
> +
> +
> +	if (!nested_map_shadow_vmcs(vcpu)) {
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		return 1;
> +	}
No cleanup on error. Is looks like we are on an l2 vmcs at this point.

> +
> +	prepare_vmcs_02(vcpu);
> +
> +	if (get_shadow_vmcs(vcpu)->vm_entry_controls &
> +	    VM_ENTRY_IA32E_MODE) {
> +		if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
> +		      (vcpu->arch.shadow_efer & EFER_LME)))
> +			vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
> +	} else {
> +		if ((vcpu->arch.shadow_efer & EFER_LMA) ||
> +		    (vcpu->arch.shadow_efer & EFER_LME))
> +			vcpu->arch.shadow_efer = 0;
> +	}
> +
> +	vmx_set_cr0(vcpu, get_shadow_vmcs(vcpu)->guest_cr0);
> +	vmcs_writel(CR0_READ_SHADOW,
> +		    get_shadow_vmcs(vcpu)->cr0_read_shadow);
> +	vmx_set_cr4(vcpu, get_shadow_vmcs(vcpu)->guest_cr4);
> +	vmcs_writel(CR4_READ_SHADOW,
> +		    get_shadow_vmcs(vcpu)->cr4_read_shadow);
> +
> +	vcpu->arch.cr0 |= X86_CR0_PG;
> +
> +	if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
> +		vmcs_write32(GUEST_CR3, get_shadow_vmcs(vcpu)->guest_cr3);
> +		vmx->vcpu.arch.cr3 = get_shadow_vmcs(vcpu)->guest_cr3;
> +	} else {
> +		kvm_set_cr3(vcpu, get_shadow_vmcs(vcpu)->guest_cr3);
> +		kvm_mmu_reset_context(vcpu);
> +
> +		nested_unmap_shadow_vmcs(vcpu);
> +
> +		r = kvm_mmu_load(vcpu);
> +		if (unlikely(r)) {
> +			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
> +			nested_vmx_vmexit(vcpu, false);
> +			set_rflags_to_vmx_fail_valid(vcpu);
> +			return 1;
> +		}
> +
> +		nested_map_shadow_vmcs(vcpu);
> +	}
> +
> +	kvm_register_write(vcpu, VCPU_REGS_RSP,
> +			   get_shadow_vmcs(vcpu)->guest_rsp);
> +	kvm_register_write(vcpu, VCPU_REGS_RIP,
> +			   get_shadow_vmcs(vcpu)->guest_rip);
> +
> +	vmcs_write32(EXCEPTION_BITMAP,
> +		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
> +		      get_shadow_vmcs(vcpu)->exception_bitmap));
> +
> +	nested_unmap_shadow_vmcs(vcpu);
> +
> +	if (initial_pfu_active)
> +		vmx_fpu_activate(vcpu);
> +
> +	return 1;
> +}
> +
> +static int launch_guest(struct kvm_vcpu *vcpu)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	skip_emulated_instruction(vcpu);
> +
> +	nested_vmx_run(vcpu);
> +
> +	return 1;
> +}
> +
> +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> +			     bool is_interrupt)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int initial_pfu_active = vcpu->fpu_active;
> +
> +	if (!vmx->nested.nested_mode) {
> +		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> +		       __func__);
> +		return 0;
> +	}
> +
> +	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> +
> +	sync_cached_regs_to_vmcs(vcpu);
> +
> +	if (!nested_map_shadow_vmcs(vcpu)) {
> +		printk(KERN_INFO "Error mapping shadow vmcs\n");
> +		set_rflags_to_vmx_fail_valid(vcpu);
Error during vmexit should set abort flag, not change flags.

> +		return 1;
> +	}
> +
> +	prepare_vmcs_12(vcpu);
> +	if (is_interrupt)
> +		get_shadow_vmcs(vcpu)->vm_exit_reason =
> +			EXIT_REASON_EXTERNAL_INTERRUPT;
> +
> +	vmx->nested.l2_state->launched = vmx->launched;
> +	vmx->nested.l2_state->cpu = vcpu->cpu;
> +
> +	nested_unmap_shadow_vmcs(vcpu);
> +
> +	vmx->vmcs = vmx->nested.l1_state->vmcs;
> +	vcpu->cpu = vmx->nested.l1_state->cpu;
> +	vmx->launched = vmx->nested.l1_state->launched;
> +
> +	vmx_vcpu_load(vcpu, get_cpu());
> +	put_cpu();
> +
> +	vcpu->arch.exception.pending = false;
Why need this?

> +
> +	vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
> +	vmx_set_cr0(vcpu, vmx->nested.l1_state->shadow_vmcs->cr0_read_shadow);
> +	vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
> +
> +	if (enable_ept) {
> +		vcpu->arch.cr3 = vmx->nested.l1_state->shadow_vmcs->guest_cr3;
> +		vmcs_write32(GUEST_CR3, vmx->nested.l1_state->shadow_vmcs->guest_cr3);
> +	} else {
> +		kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
> +	}
> +
> +	if (!nested_map_shadow_vmcs(vcpu)) {
> +		printk(KERN_INFO "Error mapping shadow vmcs\n");
> +		set_rflags_to_vmx_fail_valid(vcpu);
Abort not flags.

> +		return 1;
> +	}
> +
> +	switch_back_vmcs(vcpu);
> +
> +	nested_unmap_shadow_vmcs(vcpu);
> +
> +	kvm_register_write(vcpu, VCPU_REGS_RSP,
> +			   vmx->nested.l1_state->shadow_vmcs->guest_rsp);
> +	kvm_register_write(vcpu, VCPU_REGS_RIP,
> +			   vmx->nested.l1_state->shadow_vmcs->guest_rip);
> +
> +	vmx->nested.nested_mode = 0;
> +
> +	kvm_mmu_reset_context(vcpu);
> +	kvm_mmu_load(vcpu);
> +
> +	if (unlikely(vmx->fail)) {
> +		vmx->fail = 0;
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +	} else
> +		clear_rflags_cf_zf(vcpu);
> +
> +	if (initial_pfu_active)
> +		vmx_fpu_activate(vcpu);
> +
> +	return 0;
> +}
> +
> +static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
> +{
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		struct page *msr_page = NULL;
> +		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> +		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> +		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
> +
> +		if (!cpu_has_vmx_msr_bitmap()
> +		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
> +			return 1;
> +
> +		msr_page = nested_get_page(vcpu,
> +					   l2svmcs->msr_bitmap);
> +
> +		if (!msr_page) {
> +			printk(KERN_INFO "%s error in nested_get_page\n",
> +			       __func__);
> +			return 0;
> +		}
> +
> +		switch (exit_code) {
> +		case EXIT_REASON_MSR_READ:
> +			if (msr_index <= 0x1fff) {
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x000)))
> +					return 1;
> +			} else if ((msr_index >= 0xc0000000) &&
> +				   (msr_index <= 0xc0001fff)) {
> +				msr_index &= 0x1fff;
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x400)))
> +					return 1;
> +			}
> +			break;
> +		case EXIT_REASON_MSR_WRITE:
> +			if (msr_index <= 0x1fff) {
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x800)))
> +						return 1;
> +			} else if ((msr_index >= 0xc0000000) &&
> +				   (msr_index <= 0xc0001fff)) {
> +				msr_index &= 0x1fff;
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0xc00)))
> +					return 1;
> +			}
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
> +{
> +	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	struct shadow_vmcs *l2svmcs;
> +
> +	int r = 0;
> +
> +	if (vmx->nested.nested_run_pending)
> +		return 0;
> +
> +	if (unlikely(vmx->fail)) {
> +		printk(KERN_INFO "%s failed vm entry %x\n",
> +		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +		return 1;
> +	}
> +
> +	if (kvm_override) {
> +		switch (exit_code) {
> +		case EXIT_REASON_EXTERNAL_INTERRUPT:
> +			return 0;
> +		case EXIT_REASON_EXCEPTION_NMI:
> +			if (!is_exception(intr_info))
> +				return 0;
> +
> +			if (is_page_fault(intr_info) && (!enable_ept))
> +				return 0;
> +
> +			break;
> +		case EXIT_REASON_EPT_VIOLATION:
> +			if (enable_ept)
> +				return 0;
> +
> +			break;
> +		}
> +	}
> +
> +
> +	if (!nested_map_shadow_vmcs(vcpu))
> +		return 0;
> +	l2svmcs = get_shadow_vmcs(vcpu);
> +
> +	switch (exit_code) {
> +	case EXIT_REASON_INVLPG:
> +		if (l2svmcs->cpu_based_vm_exec_control &
> +		    CPU_BASED_INVLPG_EXITING)
> +			r = 1;
> +		break;
> +	case EXIT_REASON_MSR_READ:
> +	case EXIT_REASON_MSR_WRITE:
> +		r = nested_vmx_exit_handled_msr(vcpu);
> +		break;
> +	case EXIT_REASON_CR_ACCESS: {
> +		unsigned long exit_qualification =
> +			vmcs_readl(EXIT_QUALIFICATION);
> +		int cr = exit_qualification & 15;
> +		int reg = (exit_qualification >> 8) & 15;
> +		unsigned long val = kvm_register_read(vcpu, reg);
> +
> +		switch ((exit_qualification >> 4) & 3) {
> +		case 0: /* mov to cr */
> +			switch (cr) {
> +			case 0:
> +				if (l2svmcs->cr0_guest_host_mask &
> +				    (val ^ l2svmcs->cr0_read_shadow))
> +					r = 1;
> +				break;
> +			case 3:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR3_LOAD_EXITING)
> +					r = 1;
> +				break;
> +			case 4:
> +				if (l2svmcs->cr4_guest_host_mask &
> +				    (l2svmcs->cr4_read_shadow ^ val))
> +					r = 1;
> +				break;
> +			case 8:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR8_LOAD_EXITING)
> +					r = 1;
> +				break;
> +			}
> +			break;
> +		case 2: /* clts */
> +			if (l2svmcs->cr0_guest_host_mask &
> +			    (val ^ l2svmcs->cr0_read_shadow))
> +				r = 1;
> +			break;
> +		case 1: /*mov from cr*/
> +			switch (cr) {
> +			case 0:
> +				r = 1;
> +			case 3:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR3_STORE_EXITING)
> +					r = 1;
> +				break;
> +			case 4:
> +				r = 1;
> +				break;
> +			case 8:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR8_STORE_EXITING)
> +					r = 1;
> +				break;
> +			}
> +			break;
> +		case 3: /* lmsw */
> +			if (l2svmcs->cr0_guest_host_mask &
> +			    (val ^ l2svmcs->cr0_read_shadow))
> +				r = 1;
> +			break;
> +		}
> +		break;
> +	}
> +	case EXIT_REASON_DR_ACCESS: {
> +		if (l2svmcs->cpu_based_vm_exec_control &
> +		    CPU_BASED_MOV_DR_EXITING)
> +			r = 1;
> +		break;
> +	}
> +
> +	case EXIT_REASON_EXCEPTION_NMI: {
> +
> +		if (is_external_interrupt(intr_info) &&
> +		    (l2svmcs->pin_based_vm_exec_control &
> +		     PIN_BASED_EXT_INTR_MASK))
> +			r = 1;
> +		else if (is_nmi(intr_info) &&
> +		    (l2svmcs->pin_based_vm_exec_control &
> +		     PIN_BASED_NMI_EXITING))
> +			r = 1;
> +		else if (is_exception(intr_info) &&
> +		    (l2svmcs->exception_bitmap &
> +		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
> +			r = 1;
> +		else if (is_page_fault(intr_info))
> +			r = 1;
> +		break;
> +	}
> +
> +	case EXIT_REASON_EXTERNAL_INTERRUPT:
> +		if (l2svmcs->pin_based_vm_exec_control &
> +		    PIN_BASED_EXT_INTR_MASK)
> +			r = 1;
> +		break;
> +	default:
> +		r = 1;
> +	}
> +	nested_unmap_shadow_vmcs(vcpu);
> +
> +	return r;
> +}
> +
> +static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
> +				      bool has_error_code, u32 error_code)
> +{
> +	if (vmx->nested.nested_mode) {
> +		if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
> +			nested_vmx_vmexit(&vmx->vcpu, false);
> +			if (!nested_map_shadow_vmcs(&vmx->vcpu))
> +				return 1;
> +			get_shadow_vmcs(&vmx->vcpu)->vm_exit_reason =
> +				EXIT_REASON_EXCEPTION_NMI;
> +			get_shadow_vmcs(&vmx->vcpu)->vm_exit_intr_info =
> +				(nr | INTR_TYPE_HARD_EXCEPTION
> +				 | (has_error_code ?
> +				    INTR_INFO_DELIVER_CODE_MASK : 0)
> +				 | INTR_INFO_VALID_MASK);
> +
> +			if (has_error_code)
> +				get_shadow_vmcs(&vmx->vcpu)->
> +					vm_exit_intr_error_code = error_code;
> +			nested_unmap_shadow_vmcs(&vmx->vcpu);
> +			return 1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static int nested_vmx_intr(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (vmx->nested.nested_mode) {
This function is called only after checking nested_mode. Why recheck?

> +
> +		if (!nested_map_shadow_vmcs(vcpu))
> +			return 0;
> +
> +		if (get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
> +		    PIN_BASED_EXT_INTR_MASK) {
> +
> +			if (vmx->nested.nested_run_pending) {
> +				nested_unmap_shadow_vmcs(vcpu);
> +				return 0;
> +			}
> +
> +			nested_unmap_shadow_vmcs(vcpu);
> +			nested_vmx_vmexit(vcpu, true);
> +			return 1;		
> +		}
> +
> +		nested_unmap_shadow_vmcs(vcpu);
> +
> +	}
> +
> +	return 0;
> +}
>  
>  static struct kvm_x86_ops vmx_x86_ops = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
> -- 
> 1.6.0.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: Nested VMX support v3
  2009-10-15 14:41 Nested VMX support v3 oritw
  2009-10-15 14:41 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
  2009-10-19 10:47 ` Nested VMX support v3 Gleb Natapov
@ 2009-10-20  3:30 ` Avi Kivity
  2009-10-21 14:50   ` Orit Wasserman
  2 siblings, 1 reply; 35+ messages in thread
From: Avi Kivity @ 2009-10-20  3:30 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> Avi,
> We have addressed all of the comments, please apply.
>
> The following patches implement nested VMX support. The patches enable a guest
> to use the VMX APIs in order to run its own nested guest (i.e., enable running
> other hypervisors which use VMX under KVM). The current patches support running
> Linux under a nested KVM using shadow page table (with bypass_guest_pf
> disabled). SMP support was fixed.  Reworking EPT support to mesh cleanly with
> the current shadow paging design per Avi's comments is a work-in-progress.
>    

Why is bypass_guest_pf disabled?

> The current patches only support a single nested hypervisor, which can only run
> a single guest (multiple guests are work in progress). Only 64-bit nested
> hypervisors are supported.
>    

Multiple guests and 32-bit support are merge requirements.  As far as I 
can tell there shouldn't be anything special required to support them?


> vpid allocation will be updated with the multiguest support (work in progress).
> We are working on fixing the cr0.TS handling, it works for nested kvm by not
> for vmware server.
>    

Please either drop or fix vpid before merging.  What's wrong with 
cr0.ts?  I'd like to see that fixed as well.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff
  2009-10-15 14:41 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
  2009-10-15 14:41   ` [PATCH 2/5] Nested VMX patch 2 implements vmclear oritw
@ 2009-10-20  4:00   ` Avi Kivity
  2009-10-22 12:41     ` Orit Wasserman
  1 sibling, 1 reply; 35+ messages in thread
From: Avi Kivity @ 2009-10-20  4:00 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
>
>   /*
> + * Handles msr read for nested virtualization
> + */
> +static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
> +			      u64 *pdata)
> +{
> +	u64 vmx_msr = 0;
> +
> +	switch (msr_index) {
> +	case MSR_IA32_FEATURE_CONTROL:
> +		*pdata = 0;
> +		break;
> +	case MSR_IA32_VMX_BASIC:
> +		*pdata = 0;
> +		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
> +		*pdata = (vmx_msr&  0x00ffffcfffffffff);
> +		break;
> +
>    

This (and the rest of the msrs) must be controllable from userspace.  
Otherwise a live migration from a newer host to an older host would break.

>
>   /*
> + * Writes msr value for nested virtualization
> + * Returns 0 on success, non-0 otherwise.
> + */
> +static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
> +{
> +	switch (msr_index) {
> +	case MSR_IA32_FEATURE_CONTROL:
> +		if ((data&  (FEATURE_CONTROL_LOCKED |
> +			     FEATURE_CONTROL_VMXON_ENABLED))
> +		    != (FEATURE_CONTROL_LOCKED |
> +			FEATURE_CONTROL_VMXON_ENABLED))
> +			return 1;
> +		break;
> +	default:
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>    

Need to export this msr to userspace for live migration.  See 
msrs_to_save[].

>
> +/*
> + * Check to see if vcpu can execute vmx command
> + * Inject the corrseponding exception
> + */
> +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_segment cs;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct kvm_msr_entry *msr;
> +
> +	vmx_get_segment(vcpu,&cs, VCPU_SREG_CS);
> +
> +	if (!vmx->nested.vmxon) {
> +		printk(KERN_DEBUG "%s: vmx not on\n", __func__);
>    

pr_debug

> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 0;
> +	}
> +
> +	msr = find_msr_entry(vmx, MSR_EFER);
> +
> +	if ((vmx_get_rflags(vcpu)&  X86_EFLAGS_VM) ||
> +		 ((msr->data&  EFER_LMA)&&  !cs.l)) {
>    

is_long_mode()

>   static int handle_vmx_insn(struct kvm_vcpu *vcpu)
>   {
>   	kvm_queue_exception(vcpu, UD_VECTOR);
>   	return 1;
>   }
>
> +static int handle_vmoff(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vmx->nested.vmxon = 0;
> +
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +
> +static int handle_vmon(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_segment cs;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!nested) {
> +		printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 1;
> +	}
> +
> +	vmx_get_segment(vcpu,&cs, VCPU_SREG_CS);
> +
> +	if (!(vcpu->arch.cr4&  X86_CR4_VMXE) ||
> +	    !(vcpu->arch.cr0&  X86_CR0_PE) ||
> +	    (vmx_get_rflags(vcpu)&  X86_EFLAGS_VM)) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		printk(KERN_INFO "%s invalid register state\n", __func__);
> +		return 1;
> +	}
> +#ifdef CONFIG_X86_64
> +	if (((find_msr_entry(to_vmx(vcpu),
> +			     MSR_EFER)->data&  EFER_LMA)&&  !cs.l)) {
>    

is_long_mode(), and you can avoid the #ifdef.


VMXON is supposed to block INIT, please add that (in a separate patch).

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/5] Nested VMX patch 2 implements vmclear
  2009-10-15 14:41   ` [PATCH 2/5] Nested VMX patch 2 implements vmclear oritw
  2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
@ 2009-10-20  4:06     ` Avi Kivity
  2009-10-21 14:56       ` Orit Wasserman
  1 sibling, 1 reply; 35+ messages in thread
From: Avi Kivity @ 2009-10-20  4:06 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman<oritw@il.ibm.com>
>
> ---
>   arch/x86/kvm/vmx.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++---
>   1 files changed, 65 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 71bd91a..411cbdb 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -61,15 +61,26 @@ module_param_named(unrestricted_guest,
>   static int __read_mostly emulate_invalid_guest_state = 0;
>   module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>
> -struct vmcs {
> -	u32 revision_id;
> -	u32 abort;
> -	char data[0];
> +struct __attribute__ ((__packed__)) level_state {
> +	/* Has the level1 guest done vmclear? */
> +	bool vmclear;
>   };
>    

Why __packed__?

>
>   struct nested_vmx {
>   	/* Has the level1 guest done vmxon? */
>   	bool vmxon;
> +
> +	/*
> +	 * Level 2 state : includes vmcs,registers and
> +	 * a copy of vmcs12 for vmread/vmwrite
> +	 */
> +	struct level_state *l2_state;
> +};
> +
> +struct vmcs {
> +	u32 revision_id;
> +	u32 abort;
> +	char data[0];
>   };
>    

Why move struct vmcs around?

> +
>   static int handle_vmoff(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -3310,6 +3368,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>
>   	vmx->nested.vmxon = 1;
>
> +	create_l2_state(vcpu);
> +
>    

Need to check return code.


-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
                         ` (2 preceding siblings ...)
  2009-10-19 12:59       ` Gleb Natapov
@ 2009-10-20  4:24       ` Avi Kivity
  2009-10-22 12:48         ` Orit Wasserman
  3 siblings, 1 reply; 35+ messages in thread
From: Avi Kivity @ 2009-10-20  4:24 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
>
> +
> +struct __attribute__ ((__packed__)) shadow_vmcs {
>    

Since this is in guest memory, we need it packed so the binary format is 
preserved across migration.  Please add a comment so it isn't changed 
(at least without changing the revision_id).

vmclear state should be here, that will help multiguest support.

>
>   struct nested_vmx {
>   	/* Has the level1 guest done vmxon? */
>   	bool vmxon;
> -
> +	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> +	u64 vmptr;
>    

Need to expose it for live migration.

>   	/*
>   	 * Level 2 state : includes vmcs,registers and
>   	 * a copy of vmcs12 for vmread/vmwrite
>   	 */
>   	struct level_state *l2_state;
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
>    

This creates a ton of duplication.

Some of the data is completely unnecessary, for example we can 
recalculate cr0 from HOST_CR0 and GUEST_CR0.

> +
> +static int vmptrld(struct kvm_vcpu *vcpu,
> +		   u64 phys_addr)
> +{
> +	u8 error;
> +
> +	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> +		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> +		      : "cc");
> +	if (error) {
> +		printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
> +		       __func__, phys_addr);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>   /*
>    * Switches to specified vcpu, until a matching vcpu_put(), but assumes
>    * vcpu mutex is already taken.
> @@ -736,15 +923,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>   	}
>
>   	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
> -		u8 error;
> -
>   		per_cpu(current_vmcs, cpu) = vmx->vmcs;
> -		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> -			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> -			      : "cc");
> -		if (error)
> -			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
> -			       vmx->vmcs, phys_addr);
> +		vmptrld(vcpu, phys_addr);
>   	}
>    

This part of the patch is no longer needed.
> +	if (cpu_has_vmx_msr_bitmap())
> +		vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +	else
> +		vmx->nested.l2_state->msr_bitmap = 0;
> +
> +	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +
>    

This no longer works, since we don't load the guest vmcs.

> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> +			struct kvm_vcpu *vcpu);
>    

Isn't this in a header somewhere?

> +
> +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> +{
> +
> +	int r = 0;
> +
> +	r = kvm_read_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX], gentry,
> +				sizeof(u64), vcpu);
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> +		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
> +		return r;
> +	}
> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>    

Should go through the emulator to evaluate arguments.

> +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page;
> +	u64 guest_vmcs_addr;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (read_guest_vmcs_gpa(vcpu,&guest_vmcs_addr))
> +		return 1;
> +
> +	if (create_l1_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	if (create_l2_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> +		return 1;
> +	}
>    

return errors here, so we see the problem.

> +
> +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> +{
> +	int r = 0;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	r = kvm_write_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX],
> +				 (void *)&to_vmx(vcpu)->nested.vmptr,
> +				 sizeof(u64), vcpu);
>    

Emulator again.

> +void save_vmcs(struct shadow_vmcs *dst)
> +{
>
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
>    

These (and many others) can never change due to a nested guest running, 
so no need to save them.

> +	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
>    

In general, you need to translate host physical addresses to guest 
physical addresses.

> +	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> +	if (enable_ept)
> +		dst->ept_pointer = vmcs_read64(EPT_POINTER);
> +
>    

Not all hosts support these features.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite
  2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
  2009-10-15 14:41         ` [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume oritw
  2009-10-19 13:17         ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite Gleb Natapov
@ 2009-10-20  4:44         ` Avi Kivity
  2009-10-22 12:50           ` Orit Wasserman
  2 siblings, 1 reply; 35+ messages in thread
From: Avi Kivity @ 2009-10-20  4:44 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
>
> +static int nested_map_shadow_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page = nested_get_page(vcpu, vmx->nested.vmptr);
> +
> +	if (vmcs_page == NULL) {
> +		printk(KERN_INFO "%s: failure in nested_get_page\n",__func__);
> +		return 0;
> +	}
> +
> +	if (vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_INFO "%s: shadow vmcs already mapped\n",__func__);
> +		return 0;
> +	}
> +
>    

Consider dropping shadow_vmcs from l2_state and just passing it 
everywhere.  Less convenient but safer.

> +	vmx->nested.l2_state->shadow_vmcs = kmap_atomic(vmcs_page, KM_USER0);
> +
> +	if (!vmx->nested.l2_state->shadow_vmcs) {
> +		printk(KERN_INFO "%s: error in kmap_atomic\n",__func__);
> +		return 0;
> +	}
>    

kmap_atomic() can't fail.
>
> +static int handle_vmread(struct kvm_vcpu *vcpu)
> +{
> +#ifndef CONFIG_X86_64
> +	u64 value;
> +#endif
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (!nested_map_shadow_vmcs(vcpu)) {
> +		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> +		set_rflags_to_vmx_fail_invalid(vcpu);
> +		return 1;
> +	}
>    

return an error.

> +
> +	switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
> +	case VMCS_FIELD_TYPE_U16:
> +		vcpu->arch.regs[VCPU_REGS_RAX] =
> +			nested_vmcs_read16(vcpu,
> +					   vcpu->arch.regs[VCPU_REGS_RDX]);
> +		break;
>    

Use the emulator to decode operands.


-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-15 14:41         ` [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume oritw
  2009-10-19 17:29           ` Gleb Natapov
@ 2009-10-20  4:56           ` Avi Kivity
  2009-10-22 12:56             ` Orit Wasserman
  1 sibling, 1 reply; 35+ messages in thread
From: Avi Kivity @ 2009-10-20  4:56 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman<oritw@il.ibm.com>
>
> ---
>   arch/x86/kvm/vmx.c | 1173 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>   1 files changed, 1148 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6a4c252..e814029 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -209,6 +209,7 @@ struct __attribute__ ((__packed__)) level_state {
>   	struct vmcs *vmcs;
>   	int cpu;
>   	int launched;
> +	bool first_launch;
>   };
>
>   struct nested_vmx {
> @@ -216,6 +217,12 @@ struct nested_vmx {
>   	bool vmxon;
>   	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
>   	u64 vmptr;
> +	/* Are we running nested guest */
> +	bool nested_mode;
> +	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> +	bool nested_run_pending;
> +	/* flag indicating if there was a valid IDT after exiting from l2 */
> +	bool nested_valid_idt;
>    

Did you mean valid_idt_vectoring_info?

No need to prefix everything with nested_ inside nested_vmx.

> +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> +{
> +	struct shadow_vmcs *l2_shadow_vmcs =
> +		get_shadow_vmcs(vcpu);
> +
> +	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +
> +	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	l2_shadow_vmcs->guest_physical_address =
> +		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
>    

Physical addresses need translation,  no?

> +	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> +
> +	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
>    

We don't allow the guest to modify these, so no need to read them.  If 
you do, you need to remove the bits that we modify.

> +
> +int load_vmcs_common(struct shadow_vmcs *src)
> +{
> +
> +	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
>    

Why load this?

> +	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
>    

I think some features there are dangerous.

> +	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
>    

Need to verify?  Also need to validate the loaded MSRs and run them 
through kvm_set_msr() instead of letting the cpu do it.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-10-19 11:17       ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst Gleb Natapov
@ 2009-10-21 13:27         ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-21 13:27 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Gleb Natapov <gleb@redhat.com> wrote on 19/10/2009 13:17:41:

> From:
>
> Gleb Natapov <gleb@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 19/10/2009 13:17
>
> Subject:
>
> Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
>
> On Thu, Oct 15, 2009 at 04:41:44PM +0200, oritw@il.ibm.com wrote:
> > From: Orit Wasserman <oritw@il.ibm.com>
> >
> > ---
> >  arch/x86/kvm/vmx.c |  468 +++++++++++++++++++++++++++++++++++++++
> +++++++++++--
> >  arch/x86/kvm/x86.c |    3 +-
> >  2 files changed, 459 insertions(+), 12 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 411cbdb..8c186e0 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -61,20 +61,168 @@ module_param_named(unrestricted_guest,
> >  static int __read_mostly emulate_invalid_guest_state = 0;
> >  module_param(emulate_invalid_guest_state, bool, S_IRUGO);
> >
> > +
> > +struct __attribute__ ((__packed__)) shadow_vmcs {
> > +   u32 revision_id;
> > +   u32 abort;
> > +   u16 virtual_processor_id;
> > +   u16 guest_es_selector;
> > +   u16 guest_cs_selector;
> > +   u16 guest_ss_selector;
> > +   u16 guest_ds_selector;
> > +   u16 guest_fs_selector;
> > +   u16 guest_gs_selector;
> > +   u16 guest_ldtr_selector;
> > +   u16 guest_tr_selector;
> > +   u16 host_es_selector;
> > +   u16 host_cs_selector;
> > +   u16 host_ss_selector;
> > +   u16 host_ds_selector;
> > +   u16 host_fs_selector;
> > +   u16 host_gs_selector;
> > +   u16 host_tr_selector;
> > +   u64 io_bitmap_a;
> > +   u64 io_bitmap_b;
> > +   u64 msr_bitmap;
> > +   u64 vm_exit_msr_store_addr;
> > +   u64 vm_exit_msr_load_addr;
> > +   u64 vm_entry_msr_load_addr;
> > +   u64 tsc_offset;
> > +   u64 virtual_apic_page_addr;
> > +   u64 apic_access_addr;
> > +   u64 ept_pointer;
> > +   u64 guest_physical_address;
> > +   u64 vmcs_link_pointer;
> > +   u64 guest_ia32_debugctl;
> > +   u64 guest_ia32_pat;
> > +   u64 guest_pdptr0;
> > +   u64 guest_pdptr1;
> > +   u64 guest_pdptr2;
> > +   u64 guest_pdptr3;
> > +   u64 host_ia32_pat;
> > +   u32 pin_based_vm_exec_control;
> > +   u32 cpu_based_vm_exec_control;
> > +   u32 exception_bitmap;
> > +   u32 page_fault_error_code_mask;
> > +   u32 page_fault_error_code_match;
> > +   u32 cr3_target_count;
> > +   u32 vm_exit_controls;
> > +   u32 vm_exit_msr_store_count;
> > +   u32 vm_exit_msr_load_count;
> > +   u32 vm_entry_controls;
> > +   u32 vm_entry_msr_load_count;
> > +   u32 vm_entry_intr_info_field;
> > +   u32 vm_entry_exception_error_code;
> > +   u32 vm_entry_instruction_len;
> > +   u32 tpr_threshold;
> > +   u32 secondary_vm_exec_control;
> > +   u32 vm_instruction_error;
> > +   u32 vm_exit_reason;
> > +   u32 vm_exit_intr_info;
> > +   u32 vm_exit_intr_error_code;
> > +   u32 idt_vectoring_info_field;
> > +   u32 idt_vectoring_error_code;
> > +   u32 vm_exit_instruction_len;
> > +   u32 vmx_instruction_info;
> > +   u32 guest_es_limit;
> > +   u32 guest_cs_limit;
> > +   u32 guest_ss_limit;
> > +   u32 guest_ds_limit;
> > +   u32 guest_fs_limit;
> > +   u32 guest_gs_limit;
> > +   u32 guest_ldtr_limit;
> > +   u32 guest_tr_limit;
> > +   u32 guest_gdtr_limit;
> > +   u32 guest_idtr_limit;
> > +   u32 guest_es_ar_bytes;
> > +   u32 guest_cs_ar_bytes;
> > +   u32 guest_ss_ar_bytes;
> > +   u32 guest_ds_ar_bytes;
> > +   u32 guest_fs_ar_bytes;
> > +   u32 guest_gs_ar_bytes;
> > +   u32 guest_ldtr_ar_bytes;
> > +   u32 guest_tr_ar_bytes;
> > +   u32 guest_interruptibility_info;
> > +   u32 guest_activity_state;
> > +   u32 guest_sysenter_cs;
> > +   u32 host_ia32_sysenter_cs;
> > +   unsigned long cr0_guest_host_mask;
> > +   unsigned long cr4_guest_host_mask;
> > +   unsigned long cr0_read_shadow;
> > +   unsigned long cr4_read_shadow;
> > +   unsigned long cr3_target_value0;
> > +   unsigned long cr3_target_value1;
> > +   unsigned long cr3_target_value2;
> > +   unsigned long cr3_target_value3;
> > +   unsigned long exit_qualification;
> > +   unsigned long guest_linear_address;
> > +   unsigned long guest_cr0;
> > +   unsigned long guest_cr3;
> > +   unsigned long guest_cr4;
> > +   unsigned long guest_es_base;
> > +   unsigned long guest_cs_base;
> > +   unsigned long guest_ss_base;
> > +   unsigned long guest_ds_base;
> > +   unsigned long guest_fs_base;
> > +   unsigned long guest_gs_base;
> > +   unsigned long guest_ldtr_base;
> > +   unsigned long guest_tr_base;
> > +   unsigned long guest_gdtr_base;
> > +   unsigned long guest_idtr_base;
> > +   unsigned long guest_dr7;
> > +   unsigned long guest_rsp;
> > +   unsigned long guest_rip;
> > +   unsigned long guest_rflags;
> > +   unsigned long guest_pending_dbg_exceptions;
> > +   unsigned long guest_sysenter_esp;
> > +   unsigned long guest_sysenter_eip;
> > +   unsigned long host_cr0;
> > +   unsigned long host_cr3;
> > +   unsigned long host_cr4;
> > +   unsigned long host_fs_base;
> > +   unsigned long host_gs_base;
> > +   unsigned long host_tr_base;
> > +   unsigned long host_gdtr_base;
> > +   unsigned long host_idtr_base;
> > +   unsigned long host_ia32_sysenter_esp;
> > +   unsigned long host_ia32_sysenter_eip;
> > +   unsigned long host_rsp;
> > +   unsigned long host_rip;
> > +};
> > +
> >  struct __attribute__ ((__packed__)) level_state {
> >     /* Has the level1 guest done vmclear? */
> >     bool vmclear;
> > +   u16 vpid;
> > +   u64 shadow_efer;
> > +   unsigned long cr2;
> > +   unsigned long cr3;
> > +   unsigned long cr4;
> > +   unsigned long cr8;
> > +
> > +   u64 io_bitmap_a;
> > +   u64 io_bitmap_b;
> > +   u64 msr_bitmap;
> > +
> > +   struct shadow_vmcs *shadow_vmcs;
> > +
> > +   struct vmcs *vmcs;
> > +   int cpu;
> > +   int launched;
> >  };
> >
> >  struct nested_vmx {
> >     /* Has the level1 guest done vmxon? */
> >     bool vmxon;
> > -
> > +   /* What is the location of the  vmcs l1 keeps for l2? (in level1
gpa) */
> > +   u64 vmptr;
> >     /*
> >      * Level 2 state : includes vmcs,registers and
> >      * a copy of vmcs12 for vmread/vmwrite
> >      */
> >     struct level_state *l2_state;
> > +   /* Level 1 state for switching to level 2 and back */
> > +   struct level_state *l1_state;
> >  };
> >
> >  struct vmcs {
> > @@ -140,6 +288,25 @@ static inline struct vcpu_vmx *to_vmx(struct
> kvm_vcpu *vcpu)
> >     return container_of(vcpu, struct vcpu_vmx, vcpu);
> >  }
> >
> > +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> > +                u64 vmcs_addr)
> > +{
> > +   struct page *vmcs_page = NULL;
> > +
> > +   down_read(&current->mm->mmap_sem);
> > +   vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> > +   up_read(&current->mm->mmap_sem);
> > +
> > +   if (is_error_page(vmcs_page)) {
> > +      printk(KERN_ERR "%s error allocating page \n", __func__);
> > +      kvm_release_page_clean(vmcs_page);
> > +      return NULL;
> > +   }
> > +
> > +   return vmcs_page;
> > +
> > +}
> > +
> >  static int init_rmode(struct kvm *kvm);
> >  static u64 construct_eptp(unsigned long root_hpa);
> >
> > @@ -197,6 +364,8 @@ static struct kvm_vmx_segment_field {
> >
> >  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
> >
> > +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
> > +static int create_l1_state(struct kvm_vcpu *vcpu);
> >  static int create_l2_state(struct kvm_vcpu *vcpu);
> >
> >  /*
> > @@ -715,6 +884,24 @@ static void vmx_load_host_state(struct vcpu_vmx
*vmx)
> >     preempt_enable();
> >  }
> >
> > +
> > +static int vmptrld(struct kvm_vcpu *vcpu,
> > +         u64 phys_addr)
> > +{
> > +   u8 error;
> > +
> > +   asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> > +            : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> > +            : "cc");
> > +   if (error) {
> > +      printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
> > +             __func__, phys_addr);
> > +      return 1;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> >  /*
> >   * Switches to specified vcpu, until a matching vcpu_put(), but
assumes
> >   * vcpu mutex is already taken.
> > @@ -736,15 +923,8 @@ static void vmx_vcpu_load(struct kvm_vcpu
> *vcpu, int cpu)
> >     }
> >
> >     if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
> > -      u8 error;
> > -
> >        per_cpu(current_vmcs, cpu) = vmx->vmcs;
> > -      asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> > -               : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> > -               : "cc");
> > -      if (error)
> > -         printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
> > -                vmx->vmcs, phys_addr);
> > +      vmptrld(vcpu, phys_addr);
> >     }
> >
> >     if (vcpu->cpu != cpu) {
> > @@ -1318,6 +1498,28 @@ struct level_state *create_state(void)
> >     return state;
> >  }
> >
> > +int create_l1_state(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +
> > +   if (!vmx->nested.l1_state) {
> > +      vmx->nested.l1_state = create_state();
> > +      if (!vmx->nested.l1_state)
> > +         return -ENOMEM;
> > +   } else
> > +      return 0;
> > +
> > +   vmx->nested.l1_state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > +   if (!vmx->nested.l1_state->shadow_vmcs) {
> > +      printk(KERN_INFO "%s error creating shadow vmcs\n",
> > +             __func__);
> > +      kfree(vmx->nested.l1_state);
> > +      return -ENOMEM;
> > +   }
> > +   return 0;
> > +}
> > +
> > +static struct vmcs *alloc_vmcs(void);
> >  int create_l2_state(struct kvm_vcpu *vcpu)
> >  {
> >     struct vcpu_vmx *vmx = to_vmx(vcpu);
> > @@ -1326,11 +1528,53 @@ int create_l2_state(struct kvm_vcpu *vcpu)
> >        vmx->nested.l2_state = create_state();
> >        if (!vmx->nested.l2_state)
> >           return -ENOMEM;
> > +   } else
> > +      return 0;
> > +
> > +   vmx->nested.l2_state->vmcs = alloc_vmcs();
> > +   if (!vmx->nested.l2_state->vmcs) {
> > +      printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> > +      kfree(vmx->nested.l2_state);
> > +      return -ENOMEM;
> >     }
> >
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
> > +   else
> > +      vmx->nested.l2_state->msr_bitmap = 0;
> > +
> > +   vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> > +
> >     return 0;
> >  }
> >
> > +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> > +         struct kvm_vcpu *vcpu);
> > +
> Move to header.
>
> > +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> > +{
> Please make all local functions static. Here and in all other patches.
>
> > +
> > +   int r = 0;
> > +
> > +   r = kvm_read_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX], gentry,
> > +            sizeof(u64), vcpu);
> vmptrld operand can be not only in RAX but in other registers too orin
memory.
For nested KVM proposes RAX was enough.
For running nested VMWare we added code that handles all the possible
options, I will merge it into the nested KVM patches.
>
> > +   if (r) {
> > +      printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> > +             __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
> > +      return r;
> > +   }
> > +
> > +   if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> > +      printk(KERN_DEBUG "%s addr %llx not aligned\n",
> > +             __func__, *gentry);
> > +      return 1;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> > +
> >  /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
> >   * tricks.
> >   */
> > @@ -3374,6 +3618,66 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
> >     return 1;
> >  }
> >
> > +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct page *vmcs_page;
> > +   u64 guest_vmcs_addr;
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> > +      return 1;
> > +
> > +   if (create_l1_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (create_l2_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> > +      return 1;
> > +   }
> create_l2_state() is called on vmxon. As far as I can see this is nop
> here.
>
> > +
> > +   if (vmx->nested.vmptr != guest_vmcs_addr) {
> > +      /* checking vmptr address */
> > +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> > +      if (vmcs_page == NULL)
> > +         return 1;
> > +
> > +      vmx->nested.vmptr = guest_vmcs_addr;
> > +
> > +      kvm_release_page_clean(vmcs_page);
> > +   }
> > +
> > +   clear_rflags_cf_zf(vcpu);
> > +   skip_emulated_instruction(vcpu);
> > +   return 1;
> > +}
> > +
> > +int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
> > +          struct kvm_vcpu *vcpu);
> Move to header.
ok.
>
> > +
> > +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> > +{
> > +   int r = 0;
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   r = kvm_write_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX],
> > +             (void *)&to_vmx(vcpu)->nested.vmptr,
> > +             sizeof(u64), vcpu);
> Same as vmptrld. Why are you assuming RAX?
See above
>
> > +   if (r) {
> > +      printk(KERN_INFO "%s failed to write vmptr\n", __func__);
> > +      return 1;
> > +   }
> > +   clear_rflags_cf_zf(vcpu);
> > +   skip_emulated_instruction(vcpu);
> > +   return 1;
> > +}
> > +
> >  static int handle_invlpg(struct kvm_vcpu *vcpu)
> >  {
> >     unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > @@ -3644,8 +3948,8 @@ static int (*kvm_vmx_exit_handlers[])(struct
> kvm_vcpu *vcpu) = {
> >     [EXIT_REASON_VMCALL]                  = handle_vmcall,
> >     [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
> >     [EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
> > -   [EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
> > -   [EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
> > +   [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
> > +   [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
> >     [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
> >     [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
> >     [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
> > @@ -4183,6 +4487,148 @@ static bool vmx_gb_page_enable(void)
> >     return false;
> >  }
> >
> > +void save_vmcs(struct shadow_vmcs *dst)
> > +{
> Not used by this patch. May be introduce in the patch that uses it.
>
> > +   dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> > +   dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +   dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> > +   dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> > +   dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> > +   dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> > +   dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> > +   dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> > +   dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> > +   dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> > +
> > +   dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> > +   dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> > +   dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> > +   dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> > +   dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> > +   if (enable_ept)
> > +      dst->ept_pointer = vmcs_read64(EPT_POINTER);
> > +
> > +   dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   if (enable_ept) {
> > +      dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> > +      dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> > +      dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> > +      dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> > +   }
> > +   dst->pin_based_vm_exec_control = vmcs_read32
(PIN_BASED_VM_EXEC_CONTROL);
> > +   dst->cpu_based_vm_exec_control = vmcs_read32
(CPU_BASED_VM_EXEC_CONTROL);
> > +   dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> > +   dst->page_fault_error_code_mask =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> > +   dst->page_fault_error_code_match =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> > +   dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> > +   dst->vm_exit_msr_store_count = vmcs_read32
(VM_EXIT_MSR_STORE_COUNT);
> > +   dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> > +   dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> > +   dst->vm_entry_msr_load_count = vmcs_read32
(VM_ENTRY_MSR_LOAD_COUNT);
> > +   dst->vm_entry_intr_info_field = vmcs_read32
(VM_ENTRY_INTR_INFO_FIELD);
> > +   dst->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   dst->vm_entry_instruction_len = vmcs_read32
(VM_ENTRY_INSTRUCTION_LEN);
> > +   dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> > +   dst->secondary_vm_exec_control = vmcs_read32
(SECONDARY_VM_EXEC_CONTROL);
> > +   if (enable_vpid && dst->secondary_vm_exec_control &
> > +       SECONDARY_EXEC_ENABLE_VPID)
> > +      dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> > +   dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   dst->vm_exit_intr_error_code = vmcs_read32
(VM_EXIT_INTR_ERROR_CODE);
> > +   dst->idt_vectoring_info_field = vmcs_read32
(IDT_VECTORING_INFO_FIELD);
> > +   dst->idt_vectoring_error_code = vmcs_read32
(IDT_VECTORING_ERROR_CODE);
> > +   dst->vm_exit_instruction_len = vmcs_read32
(VM_EXIT_INSTRUCTION_LEN);
> > +   dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> > +   dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   dst->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +   dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> > +   dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> > +   dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> > +   dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> > +   dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> > +   dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> > +   dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> > +   dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > +   dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> > +   dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +   dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> > +   dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   dst->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   dst->guest_rip = vmcs_readl(GUEST_RIP);
> > +   dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   dst->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> > +   dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> > +   dst->host_cr0 = vmcs_readl(HOST_CR0);
> > +   dst->host_cr3 = vmcs_readl(HOST_CR3);
> > +   dst->host_cr4 = vmcs_readl(HOST_CR4);
> > +   dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> > +   dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> > +   dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> > +   dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> > +   dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> > +   dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> > +   dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> > +   dst->host_rsp = vmcs_readl(HOST_RSP);
> > +   dst->host_rip = vmcs_readl(HOST_RIP);
> > +   if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> > +      dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> > +}
> > +
> >  static struct kvm_x86_ops vmx_x86_ops = {
> >     .cpu_has_kvm_support = cpu_has_kvm_support,
> >     .disabled_by_bios = vmx_disabled_by_bios,
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 9c39092..74eb888 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2473,6 +2473,7 @@ static int kvm_read_guest_virt(gva_t addr,
> void *val, unsigned int bytes,
> >  out:
> >     return r;
> >  }
> > +EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
> >
> >  static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int
bytes,
> >              struct kvm_vcpu *vcpu)
> > @@ -2503,7 +2504,7 @@ static int kvm_write_guest_virt(gva_t addr,
> void *val, unsigned int bytes,
> >  out:
> >     return r;
> >  }
> > -
> > +EXPORT_SYMBOL_GPL(kvm_write_guest_virt);
> >
> >  static int emulator_read_emulated(unsigned long addr,
> >                void *val,
> > --
> > 1.6.0.4
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> --
>          Gleb.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-10-19 12:59       ` Gleb Natapov
@ 2009-10-21 13:28         ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-21 13:28 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Gleb Natapov <gleb@redhat.com> wrote on 19/10/2009 14:59:53:

> From:
>
> Gleb Natapov <gleb@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 19/10/2009 15:00
>
> Subject:
>
> Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
>
> On Thu, Oct 15, 2009 at 04:41:44PM +0200, oritw@il.ibm.com wrote:
> > +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> > +                u64 vmcs_addr)
> > +{
> > +   struct page *vmcs_page = NULL;
> > +
> > +   down_read(&current->mm->mmap_sem);
> > +   vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> > +   up_read(&current->mm->mmap_sem);
> Why are you taking mmap_sem here? gup_fast() takes it if required.
I will remove it.
>
> > +
> > +   if (is_error_page(vmcs_page)) {
> > +      printk(KERN_ERR "%s error allocating page \n", __func__);
> > +      kvm_release_page_clean(vmcs_page);
> > +      return NULL;
> > +   }
> > +
> > +   return vmcs_page;
> > +
> > +}
> > +
>
> --
>          Gleb.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite
  2009-10-19 13:17         ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite Gleb Natapov
@ 2009-10-21 13:32           ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-21 13:32 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Gleb Natapov <gleb@redhat.com> wrote on 19/10/2009 15:17:20:

> From:
>
> Gleb Natapov <gleb@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 19/10/2009 15:17
>
> Subject:
>
> Re: [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite
>
> On Thu, Oct 15, 2009 at 04:41:45PM +0200, oritw@il.ibm.com wrote:
> > From: Orit Wasserman <oritw@il.ibm.com>
> >
> > ---
> >  arch/x86/kvm/vmx.c |  591 +++++++++++++++++++++++++++++++++++++++
> ++++++++++++-
> >  1 files changed, 589 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 8c186e0..6a4c252 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -225,6 +225,21 @@ struct nested_vmx {
> >     struct level_state *l1_state;
> >  };
> >
> > +enum vmcs_field_type {
> > +   VMCS_FIELD_TYPE_U16 = 0,
> > +   VMCS_FIELD_TYPE_U64 = 1,
> > +   VMCS_FIELD_TYPE_U32 = 2,
> > +   VMCS_FIELD_TYPE_ULONG = 3
> > +};
> > +
> > +#define VMCS_FIELD_LENGTH_OFFSET 13
> > +#define VMCS_FIELD_LENGTH_MASK 0x6000
> > +
> > +static inline int vmcs_field_length(unsigned long field)
> > +{
> > +   return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
> > +}
> > +
> >  struct vmcs {
> >     u32 revision_id;
> >     u32 abort;
> > @@ -288,6 +303,404 @@ static inline struct vcpu_vmx *to_vmx(struct
> kvm_vcpu *vcpu)
> >     return container_of(vcpu, struct vcpu_vmx, vcpu);
> >  }
> >
> > +#define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
> > +
> > +static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
> > +
> > +   [VIRTUAL_PROCESSOR_ID] =
> > +      SHADOW_VMCS_OFFSET(virtual_processor_id),
> > +   [GUEST_ES_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_es_selector),
> > +   [GUEST_CS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_cs_selector),
> > +   [GUEST_SS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_ss_selector),
> > +   [GUEST_DS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_ds_selector),
> > +   [GUEST_FS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_fs_selector),
> > +   [GUEST_GS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_gs_selector),
> > +   [GUEST_LDTR_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_ldtr_selector),
> > +   [GUEST_TR_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(guest_tr_selector),
> > +   [HOST_ES_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(host_es_selector),
> > +   [HOST_CS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(host_cs_selector),
> > +   [HOST_SS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(host_ss_selector),
> > +   [HOST_DS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(host_ds_selector),
> > +   [HOST_FS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(host_fs_selector),
> > +   [HOST_GS_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(host_gs_selector),
> > +   [HOST_TR_SELECTOR] =
> > +      SHADOW_VMCS_OFFSET(host_tr_selector),
> > +   [IO_BITMAP_A] =
> > +      SHADOW_VMCS_OFFSET(io_bitmap_a),
> > +   [IO_BITMAP_A_HIGH] =
> > +      SHADOW_VMCS_OFFSET(io_bitmap_a)+4,
> > +   [IO_BITMAP_B] =
> > +      SHADOW_VMCS_OFFSET(io_bitmap_b),
> > +   [IO_BITMAP_B_HIGH] =
> > +      SHADOW_VMCS_OFFSET(io_bitmap_b)+4,
> > +   [MSR_BITMAP] =
> > +      SHADOW_VMCS_OFFSET(msr_bitmap),
> > +   [MSR_BITMAP_HIGH] =
> > +      SHADOW_VMCS_OFFSET(msr_bitmap)+4,
> > +   [VM_EXIT_MSR_STORE_ADDR] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr),
> > +   [VM_EXIT_MSR_STORE_ADDR_HIGH] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr)+4,
> > +   [VM_EXIT_MSR_LOAD_ADDR] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr),
> > +   [VM_EXIT_MSR_LOAD_ADDR_HIGH] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr)+4,
> > +   [VM_ENTRY_MSR_LOAD_ADDR] =
> > +      SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr),
> > +   [VM_ENTRY_MSR_LOAD_ADDR_HIGH] =
> > +      SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr)+4,
> > +   [TSC_OFFSET] =
> > +      SHADOW_VMCS_OFFSET(tsc_offset),
> > +   [TSC_OFFSET_HIGH] =
> > +      SHADOW_VMCS_OFFSET(tsc_offset)+4,
> > +   [VIRTUAL_APIC_PAGE_ADDR] =
> > +      SHADOW_VMCS_OFFSET(virtual_apic_page_addr),
> > +   [VIRTUAL_APIC_PAGE_ADDR_HIGH] =
> > +      SHADOW_VMCS_OFFSET(virtual_apic_page_addr)+4,
> > +   [APIC_ACCESS_ADDR] =
> > +      SHADOW_VMCS_OFFSET(apic_access_addr),
> > +   [APIC_ACCESS_ADDR_HIGH] =
> > +      SHADOW_VMCS_OFFSET(apic_access_addr)+4,
> > +   [EPT_POINTER] =
> > +      SHADOW_VMCS_OFFSET(ept_pointer),
> > +   [EPT_POINTER_HIGH] =
> > +      SHADOW_VMCS_OFFSET(ept_pointer)+4,
> > +   [GUEST_PHYSICAL_ADDRESS] =
> > +      SHADOW_VMCS_OFFSET(guest_physical_address),
> > +   [GUEST_PHYSICAL_ADDRESS_HIGH] =
> > +      SHADOW_VMCS_OFFSET(guest_physical_address)+4,
> > +   [VMCS_LINK_POINTER] =
> > +      SHADOW_VMCS_OFFSET(vmcs_link_pointer),
> > +   [VMCS_LINK_POINTER_HIGH] =
> > +      SHADOW_VMCS_OFFSET(vmcs_link_pointer)+4,
> > +   [GUEST_IA32_DEBUGCTL] =
> > +      SHADOW_VMCS_OFFSET(guest_ia32_debugctl),
> > +   [GUEST_IA32_DEBUGCTL_HIGH] =
> > +      SHADOW_VMCS_OFFSET(guest_ia32_debugctl)+4,
> > +   [GUEST_IA32_PAT] =
> > +      SHADOW_VMCS_OFFSET(guest_ia32_pat),
> > +   [GUEST_IA32_PAT_HIGH] =
> > +      SHADOW_VMCS_OFFSET(guest_ia32_pat)+4,
> > +   [GUEST_PDPTR0] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr0),
> > +   [GUEST_PDPTR0_HIGH] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr0)+4,
> > +   [GUEST_PDPTR1] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr1),
> > +   [GUEST_PDPTR1_HIGH] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr1)+4,
> > +   [GUEST_PDPTR2] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr2),
> > +   [GUEST_PDPTR2_HIGH] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr2)+4,
> > +   [GUEST_PDPTR3] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr3),
> > +   [GUEST_PDPTR3_HIGH] =
> > +      SHADOW_VMCS_OFFSET(guest_pdptr3)+4,
> > +   [HOST_IA32_PAT] =
> > +      SHADOW_VMCS_OFFSET(host_ia32_pat),
> > +   [HOST_IA32_PAT_HIGH] =
> > +      SHADOW_VMCS_OFFSET(host_ia32_pat)+4,
> > +   [PIN_BASED_VM_EXEC_CONTROL] =
> > +      SHADOW_VMCS_OFFSET(pin_based_vm_exec_control),
> > +   [CPU_BASED_VM_EXEC_CONTROL] =
> > +      SHADOW_VMCS_OFFSET(cpu_based_vm_exec_control),
> > +   [EXCEPTION_BITMAP] =
> > +      SHADOW_VMCS_OFFSET(exception_bitmap),
> > +   [PAGE_FAULT_ERROR_CODE_MASK] =
> > +      SHADOW_VMCS_OFFSET(page_fault_error_code_mask),
> > +   [PAGE_FAULT_ERROR_CODE_MATCH] =
> > +      SHADOW_VMCS_OFFSET(page_fault_error_code_match),
> > +   [CR3_TARGET_COUNT] =
> > +      SHADOW_VMCS_OFFSET(cr3_target_count),
> > +   [VM_EXIT_CONTROLS] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_controls),
> > +   [VM_EXIT_MSR_STORE_COUNT] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_msr_store_count),
> > +   [VM_EXIT_MSR_LOAD_COUNT] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_msr_load_count),
> > +   [VM_ENTRY_CONTROLS] =
> > +      SHADOW_VMCS_OFFSET(vm_entry_controls),
> > +   [VM_ENTRY_MSR_LOAD_COUNT] =
> > +      SHADOW_VMCS_OFFSET(vm_entry_msr_load_count),
> > +   [VM_ENTRY_INTR_INFO_FIELD] =
> > +      SHADOW_VMCS_OFFSET(vm_entry_intr_info_field),
> > +   [VM_ENTRY_EXCEPTION_ERROR_CODE] =
> > +      SHADOW_VMCS_OFFSET(vm_entry_exception_error_code),
> > +   [VM_ENTRY_INSTRUCTION_LEN] =
> > +      SHADOW_VMCS_OFFSET(vm_entry_instruction_len),
> > +   [TPR_THRESHOLD] =
> > +      SHADOW_VMCS_OFFSET(tpr_threshold),
> > +   [SECONDARY_VM_EXEC_CONTROL] =
> > +      SHADOW_VMCS_OFFSET(secondary_vm_exec_control),
> > +   [VM_INSTRUCTION_ERROR] =
> > +      SHADOW_VMCS_OFFSET(vm_instruction_error),
> > +   [VM_EXIT_REASON] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_reason),
> > +   [VM_EXIT_INTR_INFO] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_intr_info),
> > +   [VM_EXIT_INTR_ERROR_CODE] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_intr_error_code),
> > +   [IDT_VECTORING_INFO_FIELD] =
> > +      SHADOW_VMCS_OFFSET(idt_vectoring_info_field),
> > +   [IDT_VECTORING_ERROR_CODE] =
> > +      SHADOW_VMCS_OFFSET(idt_vectoring_error_code),
> > +   [VM_EXIT_INSTRUCTION_LEN] =
> > +      SHADOW_VMCS_OFFSET(vm_exit_instruction_len),
> > +   [VMX_INSTRUCTION_INFO] =
> > +      SHADOW_VMCS_OFFSET(vmx_instruction_info),
> > +   [GUEST_ES_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_es_limit),
> > +   [GUEST_CS_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_cs_limit),
> > +   [GUEST_SS_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_ss_limit),
> > +   [GUEST_DS_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_ds_limit),
> > +   [GUEST_FS_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_fs_limit),
> > +   [GUEST_GS_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_gs_limit),
> > +   [GUEST_LDTR_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_ldtr_limit),
> > +   [GUEST_TR_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_tr_limit),
> > +   [GUEST_GDTR_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_gdtr_limit),
> > +   [GUEST_IDTR_LIMIT] =
> > +      SHADOW_VMCS_OFFSET(guest_idtr_limit),
> > +   [GUEST_ES_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_es_ar_bytes),
> > +   [GUEST_CS_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_cs_ar_bytes),
> > +   [GUEST_SS_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_ss_ar_bytes),
> > +   [GUEST_DS_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_ds_ar_bytes),
> > +   [GUEST_FS_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_fs_ar_bytes),
> > +   [GUEST_GS_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_gs_ar_bytes),
> > +   [GUEST_LDTR_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_ldtr_ar_bytes),
> > +   [GUEST_TR_AR_BYTES] =
> > +      SHADOW_VMCS_OFFSET(guest_tr_ar_bytes),
> > +   [GUEST_INTERRUPTIBILITY_INFO] =
> > +      SHADOW_VMCS_OFFSET(guest_interruptibility_info),
> > +   [GUEST_ACTIVITY_STATE] =
> > +      SHADOW_VMCS_OFFSET(guest_activity_state),
> > +   [GUEST_SYSENTER_CS] =
> > +      SHADOW_VMCS_OFFSET(guest_sysenter_cs),
> > +   [HOST_IA32_SYSENTER_CS] =
> > +      SHADOW_VMCS_OFFSET(host_ia32_sysenter_cs),
> > +   [CR0_GUEST_HOST_MASK] =
> > +      SHADOW_VMCS_OFFSET(cr0_guest_host_mask),
> > +   [CR4_GUEST_HOST_MASK] =
> > +      SHADOW_VMCS_OFFSET(cr4_guest_host_mask),
> > +   [CR0_READ_SHADOW] =
> > +      SHADOW_VMCS_OFFSET(cr0_read_shadow),
> > +   [CR4_READ_SHADOW] =
> > +      SHADOW_VMCS_OFFSET(cr4_read_shadow),
> > +   [CR3_TARGET_VALUE0] =
> > +      SHADOW_VMCS_OFFSET(cr3_target_value0),
> > +   [CR3_TARGET_VALUE1] =
> > +      SHADOW_VMCS_OFFSET(cr3_target_value1),
> > +   [CR3_TARGET_VALUE2] =
> > +      SHADOW_VMCS_OFFSET(cr3_target_value2),
> > +   [CR3_TARGET_VALUE3] =
> > +      SHADOW_VMCS_OFFSET(cr3_target_value3),
> > +   [EXIT_QUALIFICATION] =
> > +      SHADOW_VMCS_OFFSET(exit_qualification),
> > +   [GUEST_LINEAR_ADDRESS] =
> > +      SHADOW_VMCS_OFFSET(guest_linear_address),
> > +   [GUEST_CR0] =
> > +      SHADOW_VMCS_OFFSET(guest_cr0),
> > +   [GUEST_CR3] =
> > +      SHADOW_VMCS_OFFSET(guest_cr3),
> > +   [GUEST_CR4] =
> > +      SHADOW_VMCS_OFFSET(guest_cr4),
> > +   [GUEST_ES_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_es_base),
> > +   [GUEST_CS_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_cs_base),
> > +   [GUEST_SS_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_ss_base),
> > +   [GUEST_DS_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_ds_base),
> > +   [GUEST_FS_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_fs_base),
> > +   [GUEST_GS_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_gs_base),
> > +   [GUEST_LDTR_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_ldtr_base),
> > +   [GUEST_TR_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_tr_base),
> > +   [GUEST_GDTR_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_gdtr_base),
> > +   [GUEST_IDTR_BASE] =
> > +      SHADOW_VMCS_OFFSET(guest_idtr_base),
> > +   [GUEST_DR7] =
> > +      SHADOW_VMCS_OFFSET(guest_dr7),
> > +   [GUEST_RSP] =
> > +      SHADOW_VMCS_OFFSET(guest_rsp),
> > +   [GUEST_RIP] =
> > +      SHADOW_VMCS_OFFSET(guest_rip),
> > +   [GUEST_RFLAGS] =
> > +      SHADOW_VMCS_OFFSET(guest_rflags),
> > +   [GUEST_PENDING_DBG_EXCEPTIONS] =
> > +      SHADOW_VMCS_OFFSET(guest_pending_dbg_exceptions),
> > +   [GUEST_SYSENTER_ESP] =
> > +      SHADOW_VMCS_OFFSET(guest_sysenter_esp),
> > +   [GUEST_SYSENTER_EIP] =
> > +      SHADOW_VMCS_OFFSET(guest_sysenter_eip),
> > +   [HOST_CR0] =
> > +      SHADOW_VMCS_OFFSET(host_cr0),
> > +   [HOST_CR3] =
> > +      SHADOW_VMCS_OFFSET(host_cr3),
> > +   [HOST_CR4] =
> > +      SHADOW_VMCS_OFFSET(host_cr4),
> > +   [HOST_FS_BASE] =
> > +      SHADOW_VMCS_OFFSET(host_fs_base),
> > +   [HOST_GS_BASE] =
> > +      SHADOW_VMCS_OFFSET(host_gs_base),
> > +   [HOST_TR_BASE] =
> > +      SHADOW_VMCS_OFFSET(host_tr_base),
> > +   [HOST_GDTR_BASE] =
> > +      SHADOW_VMCS_OFFSET(host_gdtr_base),
> > +   [HOST_IDTR_BASE] =
> > +      SHADOW_VMCS_OFFSET(host_idtr_base),
> > +   [HOST_IA32_SYSENTER_ESP] =
> > +      SHADOW_VMCS_OFFSET(host_ia32_sysenter_esp),
> > +   [HOST_IA32_SYSENTER_EIP] =
> > +      SHADOW_VMCS_OFFSET(host_ia32_sysenter_eip),
> > +   [HOST_RSP] =
> > +      SHADOW_VMCS_OFFSET(host_rsp),
> > +   [HOST_RIP] =
> > +      SHADOW_VMCS_OFFSET(host_rip),
> > +};
> > +
> > +static inline unsigned short vmcs_field_to_offset(unsigned long field)
> > +{
> > +
> > +   if (field > HOST_RIP || vmcs_field_to_offset_table[field] == 0) {
> > +      printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
> > +      return -1;
> > +   }
> > +
> > +   return vmcs_field_to_offset_table[field];
> > +}
> > +
> > +static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
> > +                     unsigned long field)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   unsigned long *entry;
> > +
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> > +      return -1;
> > +   }
> > +
> > +   entry = (unsigned long *)((char *)(vmx->nested.l2_state->
shadow_vmcs) +
> > +             vmcs_field_to_offset(field));
> > +   return *entry;
> > +}
> > +
> > +static inline u16 nested_vmcs_read16(struct kvm_vcpu *vcpu,
> > +                 unsigned long field)
> > +{
> > +   return nested_vmcs_readl(vcpu, field);
> > +}
> > +
> > +static inline u32 nested_vmcs_read32(struct kvm_vcpu *vcpu,
> unsigned long field)
> > +{
> > +   return nested_vmcs_readl(vcpu, field);
> > +}
> > +
> > +static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu,
> unsigned long field)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   u64 *entry;
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> > +      return -1;
> > +   }
> > +
> > +   entry = (u64 *)((char *)(vmx->nested.l2_state->shadow_vmcs) +
> > +             vmcs_field_to_offset(field));
> > +   return *entry;
> > +}
> > +
> > +static inline void nested_vmcs_writel(struct kvm_vcpu *vcpu,
> > +                  unsigned long field, unsigned long value)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   unsigned long entry =
> > +      (unsigned long)(vmx->nested.l2_state->shadow_vmcs);
> > +
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> > +      return;
> > +   }
> > +   entry += vmcs_field_to_offset(field);
> > +   *(unsigned long *)entry = value;
> > +}
> > +
> > +static inline void nested_vmcs_write16(struct kvm_vcpu *vcpu,
> > +                   unsigned long field, u16 value)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   unsigned long entry =
> > +      (unsigned long)(vmx->nested.l2_state->shadow_vmcs);
> > +
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> > +      return;
> > +   }
> > +   entry += vmcs_field_to_offset(field);
> > +   *(u16 *)entry = value;
> > +}
> > +
> > +static inline void nested_vmcs_write32(struct kvm_vcpu *vcpu,
> > +                   unsigned long field, u32 value)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   unsigned long entry =
> > +      (unsigned long)(vmx->nested.l2_state->shadow_vmcs);
> > +
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> > +      return;
> > +   }
> > +   entry += vmcs_field_to_offset(field);
> > +   *(u32 *)entry = value;
> > +}
> > +
> > +static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
> > +                   unsigned long field, u64 value)
> > +{
> > +#ifdef CONFIG_X86_64
> > +   nested_vmcs_writel(vcpu, field, value);
> > +#else /* nested: 32 bit not actually tested */
> > +   nested_vmcs_writel(vcpu, field, value);
> > +   nested_vmcs_writel(vcpu, field+1, value >> 32);
> > +#endif
> > +}
> > +
> >  static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> >                  u64 vmcs_addr)
> >  {
> > @@ -307,6 +720,50 @@ static struct page *nested_get_page(struct
> kvm_vcpu *vcpu,
> >
> >  }
> >
> > +static int nested_map_shadow_vmcs(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct page *vmcs_page = nested_get_page(vcpu, vmx->nested.vmptr);
> > +
> > +   if (vmcs_page == NULL) {
> > +      printk(KERN_INFO "%s: failure in nested_get_page\n",__func__);
> > +      return 0;
> > +   }
> > +
> > +   if (vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_INFO "%s: shadow vmcs already mapped\n",__func__);
> > +      return 0;
> > +   }
> > +
> > +   vmx->nested.l2_state->shadow_vmcs = kmap_atomic(vmcs_page,
KM_USER0);
> > +
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_INFO "%s: error in kmap_atomic\n",__func__);
> > +      return 0;
> > +   }
> > +
> > +   return 1;
> > +}
> Cleanup after error is non-existent in this function.
I will fix it.
>
> > +
> > +static void nested_unmap_shadow_vmcs(struct kvm_vcpu *vcpu)
> > +{
> > +   struct page *page;
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk("Shadow vmcs already unmapped\n");
> > +      return;
> > +   }
> > +
> > +   page = kmap_atomic_to_page(vmx->nested.l2_state->shadow_vmcs);
> > +
> > +   kunmap_atomic(vmx->nested.l2_state->shadow_vmcs, KM_USER0);
> > +
> > +   kvm_release_page_dirty(page);
> > +
> > +   vmx->nested.l2_state->shadow_vmcs = NULL;
> > +}
> > +
> >  static int init_rmode(struct kvm *kvm);
> >  static u64 construct_eptp(unsigned long root_hpa);
> >
> > @@ -3550,6 +4007,26 @@ static void clear_rflags_cf_zf(struct kvm_vcpu
*vcpu)
> >     vmx_set_rflags(vcpu, rflags);
> >  }
> >
> > +static void set_rflags_to_vmx_fail_invalid(struct kvm_vcpu *vcpu)
> > +{
> > +   unsigned long rflags;
> > +   rflags = vmx_get_rflags(vcpu);
> > +   rflags |= X86_EFLAGS_CF;
> > +   rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
> > +      ~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
> > +   vmx_set_rflags(vcpu, rflags);
> > +}
> > +
> > +static void set_rflags_to_vmx_fail_valid(struct kvm_vcpu *vcpu)
> > +{
> > +   unsigned long rflags;
> > +   rflags = vmx_get_rflags(vcpu);
> > +   rflags |= X86_EFLAGS_ZF;
> > +   rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
> > +      ~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
> > +   vmx_set_rflags(vcpu, rflags);
> > +}
> > +
> >  static int handle_vmclear(struct kvm_vcpu *vcpu)
> >  {
> >     if (!nested_vmx_check_permission(vcpu))
> > @@ -3563,6 +4040,116 @@ static int handle_vmclear(struct kvm_vcpu
*vcpu)
> >     return 1;
> >  }
> >
> > +static int handle_vmread(struct kvm_vcpu *vcpu)
> > +{
> > +#ifndef CONFIG_X86_64
> > +   u64 value;
> > +#endif
> Can you move this to where it's used to save #ifdef here?
I will fix it.
>
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > +      printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> > +      set_rflags_to_vmx_fail_invalid(vcpu);
> > +      return 1;
> > +   }
> > +
> > +   switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
> > +   case VMCS_FIELD_TYPE_U16:
> > +      vcpu->arch.regs[VCPU_REGS_RAX] =
> Once again only vmread %rdx,%rax is implemented. No operand decoding.
Will be added.
>
> > +         nested_vmcs_read16(vcpu,
> > +                  vcpu->arch.regs[VCPU_REGS_RDX]);
> > +      break;
> > +   case VMCS_FIELD_TYPE_U32:
> > +      vcpu->arch.regs[VCPU_REGS_RAX] =
> > +         nested_vmcs_read32(vcpu,
> > +                  vcpu->arch.regs[VCPU_REGS_RDX]);
> > +      break;
> > +   case VMCS_FIELD_TYPE_U64:
> > +#ifdef CONFIG_X86_64
> > +      vcpu->arch.regs[VCPU_REGS_RAX] =
> > +      nested_vmcs_read64(vcpu,
> > +                  vcpu->arch.regs[VCPU_REGS_RDX]);
> > +#else /* nested: 32 bit not actually tested */
> > +      value =  nested_vmcs_read64(vcpu,
> > +                   vcpu->arch.regs[VCPU_REGS_RDX]);
> > +      vcpu->arch.regs[VCPU_REGS_RAX] = value;
> > +      vcpu->arch.regs[VCPU_REGS_RBX] = value >> 32;
> > +#endif
> > +   break;
> > +   case VMCS_FIELD_TYPE_ULONG:
> > +      vcpu->arch.regs[VCPU_REGS_RAX] =
> > +         nested_vmcs_readl(vcpu,
> > +                 vcpu->arch.regs[VCPU_REGS_RDX]);
> > +      break;
> > +   default:
> > +      printk(KERN_INFO "%s invalid field\n", __func__);
> > +      set_rflags_to_vmx_fail_valid(vcpu);
> > +      vmcs_write32(VM_INSTRUCTION_ERROR, 12);
> > +      nested_unmap_shadow_vmcs(vcpu);
> > +      return 1;
> > +   }
> > +
> > +   clear_rflags_cf_zf(vcpu);
> > +   skip_emulated_instruction(vcpu);
> > +   nested_unmap_shadow_vmcs(vcpu);
> > +   return 1;
> > +}
> > +
> > +static int handle_vmwrite(struct kvm_vcpu *vcpu)
> > +{
> > +#ifndef CONFIG_X86_64
> > +   u64 value ;
> > +#endif
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > +      printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> > +      set_rflags_to_vmx_fail_invalid(vcpu);
> > +      return 1;
> > +   }
> > +
> > +   switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
> > +   case VMCS_FIELD_TYPE_U16:
> > +      nested_vmcs_write16(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> > +                vcpu->arch.regs[VCPU_REGS_RAX]);
> > +      break;
> > +   case VMCS_FIELD_TYPE_U32:
> > +      nested_vmcs_write32(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> > +                vcpu->arch.regs[VCPU_REGS_RAX]);
> > +      break;
> > +   case VMCS_FIELD_TYPE_U64:
> > +#ifdef CONFIG_X86_64
> > +      nested_vmcs_write64(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> > +                vcpu->arch.regs[VCPU_REGS_RAX]);
> > +#else /* nested: 32 bit not actually tested */
> > +      value =  vcpu->arch.regs[VCPU_REGS_RAX] |
> > +         (vcpu->arch.regs[VCPU_REGS_RBX] << 32);
> > +      nested_vmcs_write64(vcpu,
> > +                vcpu->arch.regs[VCPU_REGS_RDX], value);
> Why not open code part of nested_vmcs_write64 and get rid of #ifdef
> there?:
>    nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> vcpu->arch.regs[VCPU_REGS_RAX]);
>    nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX]+1,
> vcpu->arch.regs[VCPU_REGS_RBX]);
I will change it.
>
> > +#endif
> > +      break;
> > +   case VMCS_FIELD_TYPE_ULONG:
> > +      nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
> > +               vcpu->arch.regs[VCPU_REGS_RAX]);
> > +      break;
> > +   default:
> > +      printk(KERN_INFO "%s invalid field\n", __func__);
> > +      set_rflags_to_vmx_fail_valid(vcpu);
> > +      vmcs_write32(VM_INSTRUCTION_ERROR, 12);
> > +      nested_unmap_shadow_vmcs(vcpu);
> > +      return 1;
> > +   }
> > +
> > +   clear_rflags_cf_zf(vcpu);
> > +   skip_emulated_instruction(vcpu);
> > +   nested_unmap_shadow_vmcs(vcpu);
> > +   return 1;
> > +}
> > +
> >  static int handle_vmoff(struct kvm_vcpu *vcpu)
> >  {
> >     struct vcpu_vmx *vmx = to_vmx(vcpu);
> > @@ -3950,9 +4537,9 @@ static int (*kvm_vmx_exit_handlers[])(struct
> kvm_vcpu *vcpu) = {
> >     [EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
> >     [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
> >     [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
> > -   [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
> > +   [EXIT_REASON_VMREAD]                  = handle_vmread,
> >     [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
> > -   [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
> > +   [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
> >     [EXIT_REASON_VMOFF]                   = handle_vmoff,
> >     [EXIT_REASON_VMON]                    = handle_vmon,
> >     [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
> > --
> > 1.6.0.4
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> --
>          Gleb.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-19 17:29           ` Gleb Natapov
@ 2009-10-21 14:43             ` Orit Wasserman
  2009-10-22  9:04               ` Gleb Natapov
  2009-10-22 10:55               ` Avi Kivity
  0 siblings, 2 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-21 14:43 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Gleb Natapov <gleb@redhat.com> wrote on 19/10/2009 19:29:39:

> From:
>
> Gleb Natapov <gleb@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 19/10/2009 19:29
>
> Subject:
>
> Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
>
> On Thu, Oct 15, 2009 at 04:41:46PM +0200, oritw@il.ibm.com wrote:
> > From: Orit Wasserman <oritw@il.ibm.com>
> >
> > ---
> >  arch/x86/kvm/vmx.c | 1173 +++++++++++++++++++++++++++++++++++++++
> +++++++++++--
> >  1 files changed, 1148 insertions(+), 25 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 6a4c252..e814029 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -209,6 +209,7 @@ struct __attribute__ ((__packed__)) level_state {
> >     struct vmcs *vmcs;
> >     int cpu;
> >     int launched;
> > +   bool first_launch;
> >  };
> >
> >  struct nested_vmx {
> > @@ -216,6 +217,12 @@ struct nested_vmx {
> >     bool vmxon;
> >     /* What is the location of the  vmcs l1 keeps for l2? (in level1
gpa) */
> >     u64 vmptr;
> > +   /* Are we running nested guest */
> > +   bool nested_mode;
> > +   /* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> > +   bool nested_run_pending;
> > +   /* flag indicating if there was a valid IDT after exiting from l2
*/
> > +   bool nested_valid_idt;
> >     /*
> >      * Level 2 state : includes vmcs,registers and
> >      * a copy of vmcs12 for vmread/vmwrite
> > @@ -240,6 +247,10 @@ static inline int vmcs_field_length(unsigned
> long field)
> >     return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
> >  }
> >
> > +#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
> > +               VM_EXIT_SAVE_IA32_PAT))
> > +#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
> > +                VM_ENTRY_IA32E_MODE))
> >  struct vmcs {
> >     u32 revision_id;
> >     u32 abort;
> > @@ -303,6 +314,12 @@ static inline struct vcpu_vmx *to_vmx(struct
> kvm_vcpu *vcpu)
> >     return container_of(vcpu, struct vcpu_vmx, vcpu);
> >  }
> >
> > +static inline struct shadow_vmcs *get_shadow_vmcs(struct kvm_vcpu
*vcpu)
> > +{
> > +   WARN_ON(!to_vmx(vcpu)->nested.l2_state->shadow_vmcs);
> > +   return to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> > +}
> > +
> >  #define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
> >
> >  static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
> > @@ -822,8 +839,16 @@ static struct kvm_vmx_segment_field {
> >  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
> >
> >  static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
> > +static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned
nr,
> > +                  bool has_error_code, u32 error_code);
> > +static int nested_vmx_intr(struct kvm_vcpu *vcpu);
> >  static int create_l1_state(struct kvm_vcpu *vcpu);
> >  static int create_l2_state(struct kvm_vcpu *vcpu);
> > +static int launch_guest(struct kvm_vcpu *vcpu);
> > +static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
> > +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool
> kvm_override);
> > +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> > +              bool is_interrupt);
> >
> >  /*
> >   * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize
it
> > @@ -940,6 +965,18 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
> >     return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
> >  }
> >
> > +static inline int is_exception(u32 intr_info)
> > +{
> > +   return (intr_info & (INTR_INFO_INTR_TYPE_MASK |
INTR_INFO_VALID_MASK))
> > +      == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
> > +}
> > +
> > +static inline int is_nmi(u32 intr_info)
> > +{
> > +   return (intr_info & (INTR_INFO_INTR_TYPE_MASK |
INTR_INFO_VALID_MASK))
> > +      == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
> > +}
> > +
> >  static inline int cpu_has_vmx_invept_individual_addr(void)
> >  {
> >     return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
> > @@ -990,6 +1027,51 @@ static inline bool report_flexpriority(void)
> >     return flexpriority_enabled;
> >  }
> >
> > +static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu
*vcpu)
> > +{
> > +   return cpu_has_vmx_tpr_shadow() &&
> > +      get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> > +      CPU_BASED_TPR_SHADOW;
> > +}
> > +
> > +static inline int nested_cpu_has_secondary_exec_ctrls(struct
> kvm_vcpu *vcpu)
> > +{
> > +   return cpu_has_secondary_exec_ctrls() &&
> > +      get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> > +      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> > +}
> > +
> > +static inline bool nested_vm_need_virtualize_apic_accesses(struct
kvm_vcpu
> > +                        *vcpu)
> > +{
> > +   return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
> > +      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> > +}
> > +
> > +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> > +{
> > +   return get_shadow_vmcs(vcpu)->
> > +      secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
> > +}
> > +
> > +static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
> > +{
> > +   return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
> > +      SECONDARY_EXEC_ENABLE_VPID;
> > +}
> > +
> > +static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
> > +{
> > +   return get_shadow_vmcs(vcpu)->vm_entry_controls &
> > +      VM_ENTRY_LOAD_IA32_PAT;
> > +}
> > +
> > +static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
> > +{
> > +   return get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> > +      CPU_BASED_USE_MSR_BITMAPS;
> > +}
> > +
> >  static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
> >  {
> >     int i;
> > @@ -1501,6 +1583,9 @@ static void vmx_queue_exception(struct
> kvm_vcpu *vcpu, unsigned nr,
> >     struct vcpu_vmx *vmx = to_vmx(vcpu);
> >     u32 intr_info = nr | INTR_INFO_VALID_MASK;
> >
> > +   if (nested_vmx_check_exception(vmx, nr, has_error_code,
error_code))
> > +      return;
> > +
> >     if (has_error_code) {
> >        vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
> >        intr_info |= INTR_INFO_DELIVER_CODE_MASK;
> > @@ -1943,6 +2028,200 @@ static void vmclear_local_vcpus(void)
> >        __vcpu_clear(vmx);
> >  }
> >
> > +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> > +{
> > +   struct shadow_vmcs *l2_shadow_vmcs =
> > +      get_shadow_vmcs(vcpu);
> > +
> > +   l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16
(GUEST_LDTR_SELECTOR);
> > +   l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +
> > +   l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   l2_shadow_vmcs->guest_physical_address =
> > +      vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64
(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   l2_shadow_vmcs->vm_entry_intr_info_field =
> > +      vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> > +   l2_shadow_vmcs->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_entry_instruction_len =
> > +      vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vm_instruction_error =
> > +      vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   l2_shadow_vmcs->vm_exit_intr_error_code =
> > +      vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> > +   l2_shadow_vmcs->idt_vectoring_info_field =
> > +      vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +   l2_shadow_vmcs->idt_vectoring_error_code =
> > +      vmcs_read32(IDT_VECTORING_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_exit_instruction_len =
> > +      vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vmx_instruction_info =
> > +      vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32
(GUEST_LDTR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   l2_shadow_vmcs->guest_activity_state =
> > +      vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +
> > +   l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   l2_shadow_vmcs->exit_qualification = vmcs_readl
(EXIT_QUALIFICATION);
> > +   l2_shadow_vmcs->guest_linear_address = vmcs_readl
(GUEST_LINEAR_ADDRESS);
> > +   l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +
> > +   l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> > +   l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   l2_shadow_vmcs->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl
(GUEST_SYSENTER_ESP);
> > +   l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl
(GUEST_SYSENTER_EIP);
> > +}
> > +
> > +int load_vmcs_common(struct shadow_vmcs *src)
> > +{
> > +   vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> > +   vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> > +   vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> > +   vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> > +   vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> > +   vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> > +   vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> > +   vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> > +
> > +   vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> > +   vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> > +
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
> > +
> > +   vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->
vm_entry_msr_load_count);
> > +   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->
vm_entry_intr_info_field);
> > +   vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> > +           src->vm_entry_exception_error_code);
> > +   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->
vm_entry_instruction_len);
> > +
> > +   vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
> > +   vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
> > +   vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
> > +   vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
> > +   vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
> > +   vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
> > +   vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
> > +   vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
> > +   vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
> > +   vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
> > +   vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
> > +   vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
> > +   vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
> > +   vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
> > +   vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
> > +   vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
> > +   vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
> > +   vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
> > +   vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> > +           src->guest_interruptibility_info);
> > +   vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
> > +   vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
> > +
> > +   vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
> > +   vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
> > +   vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
> > +   vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
> > +   vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
> > +   vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
> > +   vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
> > +   vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
> > +   vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
> > +   vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
> > +   vmcs_writel(GUEST_DR7, src->guest_dr7);
> > +   vmcs_writel(GUEST_RSP, src->guest_rsp);
> > +   vmcs_writel(GUEST_RIP, src->guest_rip);
> > +   vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
> > +   vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> > +          src->guest_pending_dbg_exceptions);
> > +   vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
> > +   vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
> > +
> > +   return 0;
> > +}
> > +
> > +int load_vmcs_host_state(struct shadow_vmcs *src)
> > +{
> > +   vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
> > +   vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
> > +   vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
> > +   vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
> > +   vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
> > +   vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
> > +   vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
> > +
> > +   vmcs_write64(TSC_OFFSET, src->tsc_offset);
> > +
> > +   if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> > +      vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
> > +
> > +   vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
> > +
> > +   vmcs_writel(HOST_CR0, src->host_cr0);
> > +   vmcs_writel(HOST_CR3, src->host_cr3);
> > +   vmcs_writel(HOST_CR4, src->host_cr4);
> > +   vmcs_writel(HOST_FS_BASE, src->host_fs_base);
> > +   vmcs_writel(HOST_GS_BASE, src->host_gs_base);
> > +   vmcs_writel(HOST_TR_BASE, src->host_tr_base);
> > +   vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
> > +   vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
> > +   vmcs_writel(HOST_RSP, src->host_rsp);
> > +   vmcs_writel(HOST_RIP, src->host_rip);
> > +   vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
> > +   vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
> > +
> > +   return 0;
> > +}
> > +
> >  struct level_state *create_state(void)
> >  {
> >     struct level_state *state = NULL;
> > @@ -2003,6 +2282,8 @@ int create_l2_state(struct kvm_vcpu *vcpu)
> >     vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> >     vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> >
> > +   vmx->nested.l2_state->first_launch = true;
> > +
> >     return 0;
> >  }
> >
> > @@ -3393,6 +3674,14 @@ static void enable_irq_window(struct kvm_vcpu
*vcpu)
> >  {
> >     u32 cpu_based_vm_exec_control;
> >
> > +   if (to_vmx(vcpu)->nested.nested_mode) {
> > +      if (kvm_cpu_has_interrupt(vcpu)) {
> Why interrupt will not be present during the call to
> enable_irq_window()?
I will look into it.
>
> > +         if (nested_vmx_intr(vcpu))
> > +            return;
> > +      }
> > +      return;
> > +   }
> > +
> >     cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> >     cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
> >     vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
> > @@ -3448,6 +3737,10 @@ static void vmx_inject_nmi(struct kvm_vcpu
*vcpu)
> >  {
> >     struct vcpu_vmx *vmx = to_vmx(vcpu);
> >
> > +   if (vmx->nested.nested_mode) {
> > +      return;
> > +   }
> > +
> >     if (!cpu_has_virtual_nmis()) {
> >        /*
> >         * Tracking the NMI-blocked state in software is built upon
> > @@ -3489,6 +3782,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu
*vcpu)
> >
> >  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
> >  {
> > +   if (to_vmx(vcpu)->nested.nested_mode) {
> > +      if (kvm_cpu_has_interrupt(vcpu)) {
> > +         if (!nested_vmx_intr(vcpu))
> > +            return 0;
> > +      }
> > +   }
> > +
> Same as above. kvm_cpu_has_interrupt() should alway return true here.
> More interesting question: why not return 0 here if in nested mode and
> PIN_BASED_EXT_INTR_MASK is not set and let enable_irq_window() to do
> nested exit. This is what svm does as far as I see.
I will look into it.
>
> >     return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
> >        !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
> >           (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
> > @@ -3993,12 +4293,6 @@ static int nested_vmx_check_permission
> (struct kvm_vcpu *vcpu)
> >     return 1;
> >  }
> >
> > -static int handle_vmx_insn(struct kvm_vcpu *vcpu)
> > -{
> > -   kvm_queue_exception(vcpu, UD_VECTOR);
> > -   return 1;
> > -}
> > -
> >  static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
> >  {
> >     unsigned long rflags;
> > @@ -4040,6 +4334,27 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
> >     return 1;
> >  }
> >
> > +static int handle_vmlaunch(struct kvm_vcpu *vcpu)
> > +{
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +   if (!to_vmx(vcpu)->nested.l2_state->vmclear)
> > +      return 1;
> > +
> > +   return launch_guest(vcpu);
> > +}
> > +
> > +static int handle_vmresume(struct kvm_vcpu *vcpu)
> > +{
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (to_vmx(vcpu)->nested.l2_state->vmclear)
> > +      return 1;
> > +
> > +   return launch_guest(vcpu);
> > +}
> > +
> handle_vmlaunch() and handle_vmresume() are looking suspiciously
> similar may be move vmclear checking logic into launch_guest()?
> It will get additional parameter: expected value of vmclear.
Ok.
>
> >  static int handle_vmread(struct kvm_vcpu *vcpu)
> >  {
> >  #ifndef CONFIG_X86_64
> > @@ -4050,7 +4365,6 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
> >        return 1;
> >
> >     if (!nested_map_shadow_vmcs(vcpu)) {
> > -      printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> >        set_rflags_to_vmx_fail_invalid(vcpu);
> >        return 1;
> >     }
> Remove from a patch that add it if you don't need this. Also all
> prinks that can be triggered by a guest should be removed/changed to
> debug output.
Ok.
>
> > @@ -4107,7 +4421,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
> >        return 1;
> >
> >     if (!nested_map_shadow_vmcs(vcpu)) {
> > -      printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> >        set_rflags_to_vmx_fail_invalid(vcpu);
> >        return 1;
> >     }
> > @@ -4137,16 +4450,16 @@ static int handle_vmwrite(struct kvm_vcpu
*vcpu)
> >                 vcpu->arch.regs[VCPU_REGS_RAX]);
> >        break;
> >     default:
> > +      nested_unmap_shadow_vmcs(vcpu);
> >        printk(KERN_INFO "%s invalid field\n", __func__);
> >        set_rflags_to_vmx_fail_valid(vcpu);
> >        vmcs_write32(VM_INSTRUCTION_ERROR, 12);
> > -      nested_unmap_shadow_vmcs(vcpu);
> >        return 1;
> >     }
> Why this is here and not in the patch that introduce the function?
I will move it there.
>
> >
> > +   nested_unmap_shadow_vmcs(vcpu);
> >     clear_rflags_cf_zf(vcpu);
> >     skip_emulated_instruction(vcpu);
> > -   nested_unmap_shadow_vmcs(vcpu);
> >     return 1;
> >  }
> Same.
I will move it there.
>
> >
> > @@ -4208,7 +4521,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
> >  static int handle_vmptrld(struct kvm_vcpu *vcpu)
> >  {
> >     struct vcpu_vmx *vmx = to_vmx(vcpu);
> > -   struct page *vmcs_page;
> >     u64 guest_vmcs_addr;
> >
> >     if (!nested_vmx_check_permission(vcpu))
> > @@ -4228,14 +4540,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
> >     }
> >
> >     if (vmx->nested.vmptr != guest_vmcs_addr) {
> > -      /* checking vmptr address */
> > -      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> > -      if (vmcs_page == NULL)
> > -         return 1;
> > -
> >        vmx->nested.vmptr = guest_vmcs_addr;
> > -
> > -      kvm_release_page_clean(vmcs_page);
> >     }
> Same.
I will move it there.
>
> >
> >     clear_rflags_cf_zf(vcpu);
> > @@ -4534,11 +4839,11 @@ static int (*kvm_vmx_exit_handlers[])
> (struct kvm_vcpu *vcpu) = {
> >     [EXIT_REASON_INVLPG]            = handle_invlpg,
> >     [EXIT_REASON_VMCALL]                  = handle_vmcall,
> >     [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
> > -   [EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
> > +   [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
> >     [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
> >     [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
> >     [EXIT_REASON_VMREAD]                  = handle_vmread,
> > -   [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
> > +   [EXIT_REASON_VMRESUME]                = handle_vmresume,
> >     [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
> >     [EXIT_REASON_VMOFF]                   = handle_vmoff,
> >     [EXIT_REASON_VMON]                    = handle_vmon,
> > @@ -4566,6 +4871,17 @@ static int vmx_handle_exit(struct kvm_vcpu
*vcpu)
> >
> >     trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
> >
> > +   if (exit_reason == EXIT_REASON_VMLAUNCH ||
> > +       exit_reason == EXIT_REASON_VMRESUME)
> > +      vmx->nested.nested_run_pending = 1;
> > +   else
> > +      vmx->nested.nested_run_pending = 0;
> > +
> > +   if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true))
{
> > +      nested_vmx_vmexit(vcpu, false);
> > +      return 1;
> > +   }
> > +
> >     /* If we need to emulate an MMIO from handle_invalid_guest_state
> >      * we just return 0 */
> >     if (vmx->emulation_required && emulate_invalid_guest_state) {
> > @@ -4585,7 +4901,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >           = vmcs_read32(VM_INSTRUCTION_ERROR);
> >        return 0;
> >     }
> > -
> No spurious line deletions please.
I will fix it.
>
> >     if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
> >           (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
> >           exit_reason != EXIT_REASON_EPT_VIOLATION &&
> > @@ -4593,8 +4908,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >        printk(KERN_WARNING "%s: unexpected, valid vectoring info "
> >               "(0x%x) and exit reason is 0x%x\n",
> >               __func__, vectoring_info, exit_reason);
> > -
> > -   if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> > +   if (!vmx->nested.nested_mode && unlikely(!cpu_has_virtual_nmis
> () && vmx->soft_vnmi_blocked)) {
> >        if (vmx_interrupt_allowed(vcpu)) {
> >           vmx->soft_vnmi_blocked = 0;
> >        } else if (vmx->vnmi_blocked_time > 1000000000LL &&
> > @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts(struct
> vcpu_vmx *vmx)
> >     int type;
> >     bool idtv_info_valid;
> >
> > -   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > -
> >     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> >
> > +   if (vmx->nested.nested_mode)
> > +      return;
> > +
> Why return here? What the function does that should not be done in
> nested mode?
In nested mode L0 injects an interrupt to L2 only in one scenario,
if there is an IDT_VALID event and L0 decides to run L2 again and not to
switch back to L1.
In all other cases the injection is handled by L1.
>
> > +   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +
> >     /* Handle machine checks before interrupts are enabled */
> >     if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
> >         || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> > @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct vcpu_vmx
*vmx)
> >        | vmx->rmode.irq.vector;
> >  }
> >
> > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> > +{
> It seems by this function you are trying to bypass general event
> reinjection logic. Why?
See above.
>
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int irq;
> > +   int type;
> > +   int errCodeValid;
> > +   u32 idt_vectoring_info;
> > +   u32 guest_intr;
> > +   bool nmi_window_open;
> > +   bool interrupt_window_open;
> > +
> > +   if (vmx->nested.nested_mode && vmx->nested.nested_valid_idt) {
> Caller already checked nested_mode why recheck?
I will remove it.
>
> > +      idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +      irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
> > +      type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
> > +      errCodeValid = idt_vectoring_info &
> > +         VECTORING_INFO_DELIVER_CODE_MASK;
> > +
> > +      guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +      nmi_window_open =
> > +         !(guest_intr & (GUEST_INTR_STATE_STI |
> > +               GUEST_INTR_STATE_MOV_SS |
> > +               GUEST_INTR_STATE_NMI));
> > +
> > +      interrupt_window_open =
> > +         ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
> > +          !(guest_intr & (GUEST_INTR_STATE_STI |
> > +                GUEST_INTR_STATE_MOV_SS)));
> > +
> > +      if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
> > +         printk(KERN_INFO "IDT ignored, l2 interrupt window
closed!\n");
> > +         return 0;
> > +      }
> > +
> > +      if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
> > +         printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
> > +         return 0;
> > +      }
> > +
> > +      vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
> > +         irq | type | INTR_INFO_VALID_MASK | errCodeValid);
> > +
> > +
> > +      vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
> > +              vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
> > +
> > +      if (errCodeValid)
> > +         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> > +                 vmcs_read32(IDT_VECTORING_ERROR_CODE));
> > +   }
> > +
> > +   return 1;
> > +}
> > +
> >  #ifdef CONFIG_X86_64
> >  #define R "r"
> >  #define Q "q"
> > @@ -4758,6 +5129,26 @@ static void fixup_rmode_irq(struct vcpu_vmx
*vmx)
> >  static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >  {
> >     struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int r;
> > +
> > +   if (vmx->nested.nested_mode) {
> > +      r = nested_handle_valid_idt(vcpu);
> > +      if (!r) {
> > +         vmx->fail = 1;
> > +         return;
> > +      }
> > +
> > +      if (!nested_map_shadow_vmcs(vcpu)) {
> > +         vmx->fail = 1;
> > +         return;
> > +      }
> > +
> > +      vmcs_write32(EXCEPTION_BITMAP, get_shadow_vmcs(vcpu)->
> > +              exception_bitmap |
> > +              vmx->nested.l1_state->shadow_vmcs->exception_bitmap);
> > +
> > +      nested_unmap_shadow_vmcs(vcpu);
> > +   }
> >
> >     if (enable_ept && is_paging(vcpu)) {
> >        vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
> > @@ -4896,6 +5287,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >     get_debugreg(vcpu->arch.dr6, 6);
> >
> >     vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +
> > +   vmx->nested.nested_valid_idt = vmx->nested.nested_mode &&
> > +      (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
> > +
> >     if (vmx->rmode.irq.pending)
> >        fixup_rmode_irq(vmx);
> >
> > @@ -4984,6 +5379,11 @@ static struct kvm_vcpu *vmx_create_vcpu
> (struct kvm *kvm, unsigned int id)
> >           goto free_vmcs;
> >     }
> >
> > +   vmx->nested.vmptr = 0;
> > +
> > +   vmx->nested.l1_state = NULL;
> > +   vmx->nested.l2_state = NULL;
> > +
> >     return &vmx->vcpu;
> >
> >  free_vmcs:
> > @@ -5215,6 +5615,729 @@ void save_vmcs(struct shadow_vmcs *dst)
> >     if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> >        dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> >  }
> > +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct shadow_vmcs *src = get_shadow_vmcs(vcpu);
> > +   u32 exec_control;
> > +
> > +   if (!src) {
> > +      printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   load_vmcs_common(src);
> > +
> > +   if (vmx->nested.l2_state->first_launch) {
> > +      if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
> > +         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->
vpid);
> > +
> > +      if (vmx->nested.l2_state->io_bitmap_a)
> > +         vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
> > +
> > +      if (vmx->nested.l2_state->io_bitmap_b)
> > +         vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
> > +
> > +      if (vmx->nested.l2_state->msr_bitmap)
> > +         vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
> > +
> > +      if (src->vm_entry_msr_load_count > 0) {
> > +         struct page *page;
> > +
> > +         page = nested_get_page(vcpu,
> > +                      src->vm_entry_msr_load_addr);
> > +         if (!page)
> > +            return 1;
> > +
> > +         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
> > +
> > +         kvm_release_page_clean(page);
> > +      }
> > +
> > +      if (nested_cpu_has_vmx_tpr_shadow(vcpu)) {
> > +         struct page *page;
> > +
> > +         page = nested_get_page(vcpu,
> > +                      src->virtual_apic_page_addr);
> > +         if (!page)
> > +            return 1;
> > +
> > +         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
> > +
> > +         kvm_release_page_clean(page);
> > +      }
> > +
> > +      if (nested_vm_need_virtualize_apic_accesses(vcpu)) {
> > +         struct page *page =
> > +            nested_get_page(vcpu, src->apic_access_addr);
> > +         if (!page)
> > +            return 1;
> > +
> > +         vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
> > +         kvm_release_page_clean(page);
> > +      }
> > +
> > +      vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> > +
> (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
> > +               src->pin_based_vm_exec_control));
> > +
> > +      vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
> > +
> (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_mask &
> > +               src->page_fault_error_code_mask));
> > +
> > +      vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
> > +
> (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_match &
> > +               src->page_fault_error_code_match));
> > +
> > +      if (cpu_has_secondary_exec_ctrls()) {
> > +
> > +         exec_control =
> > +            vmx->nested.l1_state->shadow_vmcs->
secondary_vm_exec_control;
> > +
> > +         if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
> > +
> > +            exec_control |= src->secondary_vm_exec_control;
> > +
> > +            if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
> > +                !nested_vm_need_virtualize_apic_accesses(vcpu))
> > +               exec_control &=
> > +                  ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> > +         }
> > +
> > +         vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> > +      }
> > +
> > +      load_vmcs_host_state(vmx->nested.l1_state->shadow_vmcs);
> > +
> > +      vmx->nested.l2_state->first_launch = false;
> > +   }
> > +
> > +   if (vm_need_tpr_shadow(vcpu->kvm) &&
> > +       nested_cpu_has_vmx_tpr_shadow(vcpu))
> > +      vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
> > +
> > +   if (enable_ept) {
> > +      if (!nested_cpu_has_vmx_ept(vcpu)) {
> > +         vmcs_write64(EPT_POINTER,
> > +                 vmx->nested.l1_state->shadow_vmcs->ept_pointer);
> > +         vmcs_write64(GUEST_PDPTR0,
> > +                 vmx->nested.l1_state->shadow_vmcs->guest_pdptr0);
> > +         vmcs_write64(GUEST_PDPTR1,
> > +                 vmx->nested.l1_state->shadow_vmcs->guest_pdptr1);
> > +         vmcs_write64(GUEST_PDPTR2,
> > +                 vmx->nested.l1_state->shadow_vmcs->guest_pdptr2);
> > +         vmcs_write64(GUEST_PDPTR3,
> > +                 vmx->nested.l1_state->shadow_vmcs->guest_pdptr3);
> > +      }
> > +   }
> > +
> > +   exec_control =
> vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
> > +
> > +   exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> > +
> > +   exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> > +
> > +   exec_control &= ~CPU_BASED_TPR_SHADOW;
> > +
> > +   exec_control |= src->cpu_based_vm_exec_control;
> > +
> > +   if (!vm_need_tpr_shadow(vcpu->kvm) ||
> > +       src->virtual_apic_page_addr == 0) {
> > +      exec_control &= ~CPU_BASED_TPR_SHADOW;
> > +#ifdef CONFIG_X86_64
> > +      exec_control |= CPU_BASED_CR8_STORE_EXITING |
> > +         CPU_BASED_CR8_LOAD_EXITING;
> > +#endif
> > +   } else if (exec_control & CPU_BASED_TPR_SHADOW) {
> > +
> > +#ifdef CONFIG_X86_64
> > +      exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
> > +      exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
> > +#endif
> > +   }
> > +
> > +   vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
> > +
> > +   vmcs_write32(EXCEPTION_BITMAP,
> > +           (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
> > +            src->exception_bitmap));
> > +
> > +   vmcs_write32(VM_EXIT_CONTROLS,
> > +           ((vmx->nested.l1_state->shadow_vmcs->vm_exit_controls &
> > +             NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
> > +
> > +   vmcs_write32(VM_ENTRY_CONTROLS,
> > +           (vmx->nested.l1_state->shadow_vmcs->vm_entry_controls &
> > +            NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
> > +
> > +   vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->
vm_entry_msr_load_count);
> > +
> > +   vmcs_writel(CR0_GUEST_HOST_MASK,
> > +          (vmx->nested.l1_state->shadow_vmcs->cr0_guest_host_mask  &
> > +           src->cr0_guest_host_mask));
> > +   vmcs_writel(CR4_GUEST_HOST_MASK,
> > +          (vmx->nested.l1_state->shadow_vmcs->cr4_guest_host_mask  &
> > +           src->cr4_guest_host_mask));
> > +
> > +   return 0;
> > +}
> > +
> > +int switch_back_vmcs(struct kvm_vcpu *vcpu)
> > +{
> > +   struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_state->
shadow_vmcs;
> > +
> > +   if (enable_vpid && src->virtual_processor_id != 0)
> > +      vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
> > +
> > +   vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
> > +   vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
> > +
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      vmcs_write64(MSR_BITMAP, src->msr_bitmap);
> > +
> > +   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
> > +
> > +   if (vm_need_virtualize_apic_accesses(vcpu->kvm))
> > +      vmcs_write64(APIC_ACCESS_ADDR,
> > +              src->apic_access_addr);
> > +
> > +   if (enable_ept) {
> > +      vmcs_write64(EPT_POINTER, src->ept_pointer);
> > +      vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
> > +      vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
> > +      vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
> > +      vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
> > +   }
> > +
> > +   vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->
pin_based_vm_exec_control);
> > +   vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->
cpu_based_vm_exec_control);
> > +   vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
> > +   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
> > +           src->page_fault_error_code_mask);
> > +   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
> > +           src->page_fault_error_code_match);
> > +   vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
> > +   vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
> > +   vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->
vm_entry_msr_load_count);
> > +
> > +   if (cpu_has_secondary_exec_ctrls())
> > +      vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
> > +              src->secondary_vm_exec_control);
> > +
> > +   load_vmcs_common(src);
> > +
> > +   load_vmcs_host_state(to_vmx(vcpu)->nested.l1_state->shadow_vmcs);
> > +
> > +   return 0;
> > +}
> > +
> > +void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
> > +{
> > +   unsigned long mask;
> > +
> > +   if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->
arch.regs_dirty))
> > +      vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
> > +   if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->
arch.regs_dirty))
> > +      vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
> > +
> > +   mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
> > +
> > +   if (vcpu->arch.regs_dirty & mask) {
> > +      printk(KERN_INFO "WARNING: dirty cached registers
> regs_dirty 0x%x mask 0x%lx\n",
> > +             vcpu->arch.regs_dirty, mask);
> > +      WARN_ON(1);
> > +   }
> > +
> > +   vcpu->arch.regs_dirty = 0;
> > +}
> > +
> > +static int nested_vmx_run(struct kvm_vcpu *vcpu)
> > +{
> > +   /* verify that l1 has done vmptrld for l2 earlier */
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int initial_pfu_active = vcpu->fpu_active;
> > +   int r = 0;
> > +
> > +   if (vmx->nested.nested_mode) {
> > +      printk(KERN_INFO "Nested guest already running\n");
> > +      set_rflags_to_vmx_fail_valid(vcpu);
> > +      return 1;
> > +   }
> > +
> > +
> > +   vmx->nested.nested_mode = 1;
> > +
> > +   vcpu->arch.exception.pending = false;
> Why need this?
I will look into it.
>
> > +
> > +   sync_cached_regs_to_vmcs(vcpu);
> > +
> > +   save_vmcs(vmx->nested.l1_state->shadow_vmcs);
> > +
> > +   vmx->nested.l1_state->shadow_efer = vcpu->arch.shadow_efer;
> > +   if (!enable_ept)
> > +      vmx->nested.l1_state->cr3 = vcpu->arch.cr3;
> > +   vmx->nested.l1_state->cr4 = vcpu->arch.cr4;
> > +
> > +   if (enable_vpid) {
> > +      if (vmx->nested.l2_state->vpid == 0) {
> > +         allocate_vpid(vmx);
> > +         vmx->nested.l2_state->vpid = vmx->vpid;
> > +      }
> > +   }
> > +
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      vmx->nested.l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
> > +   else
> > +      vmx->nested.l1_state->msr_bitmap = 0;
> > +
> > +   vmx->nested.l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   vmx->nested.l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> > +   vmx->nested.l1_state->vmcs = vmx->vmcs;
> > +   vmx->nested.l1_state->cpu = vcpu->cpu;
> > +   vmx->nested.l1_state->launched = vmx->launched;
> > +
> > +   vmx->vmcs = vmx->nested.l2_state->vmcs;
> > +   vcpu->cpu = vmx->nested.l2_state->cpu;
> Who initialize vmx->nested.l2_state->cpu before first launch?
I  will fix it.
> Why have different cpu for l1 and l2 guest? It seems like this is global
> vcpu thread property.
Lets look at this scenario L1 was running on cpu 0 than it launched L2 (at
cpu 0) ,
L2 run and did something that caused switching to userspace . When kvm
returns to running L2 we
switch to cpu 1, now vcpu->cpu=1.
When we switch back to running L1 we need to handle this switch and so we
update vcpu->cpu back to 0 and than call vmx_vcpu_load
that will handle it.
>
> > +   vmx->launched = vmx->nested.l2_state->launched;
> > +
> Can you explain why ->launched logic is needed?
It is possible L1 called vmlaunch but we didn't actually run L2 (for
example there was an interrupt and
enable_irq_window switched back to L1 before running L2). L1 thinks the
vmlaunch was successful and call vmresume in the next time
but KVM needs to call vmlaunch for L2.
>
> > +   if (vmx->nested.l2_state->vmclear || !vmx->launched) {
> > +      vmcs_clear(vmx->vmcs);
> > +      vmx->launched = 0;
> > +      vmx->nested.l2_state->vmclear = 0;
> > +   }
> > +
> > +   vmx_vcpu_load(vcpu, get_cpu());
> > +   put_cpu();
> > +
> > +
> > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > +      set_rflags_to_vmx_fail_valid(vcpu);
> > +      return 1;
> > +   }
> No cleanup on error. Is looks like we are on an l2 vmcs at this point.
I will fix it.
>
> > +
> > +   prepare_vmcs_02(vcpu);
> > +
> > +   if (get_shadow_vmcs(vcpu)->vm_entry_controls &
> > +       VM_ENTRY_IA32E_MODE) {
> > +      if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
> > +            (vcpu->arch.shadow_efer & EFER_LME)))
> > +         vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
> > +   } else {
> > +      if ((vcpu->arch.shadow_efer & EFER_LMA) ||
> > +          (vcpu->arch.shadow_efer & EFER_LME))
> > +         vcpu->arch.shadow_efer = 0;
> > +   }
> > +
> > +   vmx_set_cr0(vcpu, get_shadow_vmcs(vcpu)->guest_cr0);
> > +   vmcs_writel(CR0_READ_SHADOW,
> > +          get_shadow_vmcs(vcpu)->cr0_read_shadow);
> > +   vmx_set_cr4(vcpu, get_shadow_vmcs(vcpu)->guest_cr4);
> > +   vmcs_writel(CR4_READ_SHADOW,
> > +          get_shadow_vmcs(vcpu)->cr4_read_shadow);
> > +
> > +   vcpu->arch.cr0 |= X86_CR0_PG;
> > +
> > +   if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
> > +      vmcs_write32(GUEST_CR3, get_shadow_vmcs(vcpu)->guest_cr3);
> > +      vmx->vcpu.arch.cr3 = get_shadow_vmcs(vcpu)->guest_cr3;
> > +   } else {
> > +      kvm_set_cr3(vcpu, get_shadow_vmcs(vcpu)->guest_cr3);
> > +      kvm_mmu_reset_context(vcpu);
> > +
> > +      nested_unmap_shadow_vmcs(vcpu);
> > +
> > +      r = kvm_mmu_load(vcpu);
> > +      if (unlikely(r)) {
> > +         printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
> > +         nested_vmx_vmexit(vcpu, false);
> > +         set_rflags_to_vmx_fail_valid(vcpu);
> > +         return 1;
> > +      }
> > +
> > +      nested_map_shadow_vmcs(vcpu);
> > +   }
> > +
> > +   kvm_register_write(vcpu, VCPU_REGS_RSP,
> > +            get_shadow_vmcs(vcpu)->guest_rsp);
> > +   kvm_register_write(vcpu, VCPU_REGS_RIP,
> > +            get_shadow_vmcs(vcpu)->guest_rip);
> > +
> > +   vmcs_write32(EXCEPTION_BITMAP,
> > +           (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
> > +            get_shadow_vmcs(vcpu)->exception_bitmap));
> > +
> > +   nested_unmap_shadow_vmcs(vcpu);
> > +
> > +   if (initial_pfu_active)
> > +      vmx_fpu_activate(vcpu);
> > +
> > +   return 1;
> > +}
> > +
> > +static int launch_guest(struct kvm_vcpu *vcpu)
> > +{
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   skip_emulated_instruction(vcpu);
> > +
> > +   nested_vmx_run(vcpu);
> > +
> > +   return 1;
> > +}
> > +
> > +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> > +              bool is_interrupt)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int initial_pfu_active = vcpu->fpu_active;
> > +
> > +   if (!vmx->nested.nested_mode) {
> > +      printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> > +             __func__);
> > +      return 0;
> > +   }
> > +
> > +   save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> > +
> > +   sync_cached_regs_to_vmcs(vcpu);
> > +
> > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > +      printk(KERN_INFO "Error mapping shadow vmcs\n");
> > +      set_rflags_to_vmx_fail_valid(vcpu);
> Error during vmexit should set abort flag, not change flags.
I think this is more a vmlaunch/vmresume error (in the code that switch
back to L1).
>
> > +      return 1;
> > +   }
> > +
> > +   prepare_vmcs_12(vcpu);
> > +   if (is_interrupt)
> > +      get_shadow_vmcs(vcpu)->vm_exit_reason =
> > +         EXIT_REASON_EXTERNAL_INTERRUPT;
> > +
> > +   vmx->nested.l2_state->launched = vmx->launched;
> > +   vmx->nested.l2_state->cpu = vcpu->cpu;
> > +
> > +   nested_unmap_shadow_vmcs(vcpu);
> > +
> > +   vmx->vmcs = vmx->nested.l1_state->vmcs;
> > +   vcpu->cpu = vmx->nested.l1_state->cpu;
> > +   vmx->launched = vmx->nested.l1_state->launched;
> > +
> > +   vmx_vcpu_load(vcpu, get_cpu());
> > +   put_cpu();
> > +
> > +   vcpu->arch.exception.pending = false;
> Why need this?
I will remove it.
>
> > +
> > +   vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
> > +   vmx_set_cr0(vcpu, vmx->nested.l1_state->shadow_vmcs->
cr0_read_shadow);
> > +   vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
> > +
> > +   if (enable_ept) {
> > +      vcpu->arch.cr3 = vmx->nested.l1_state->shadow_vmcs->guest_cr3;
> > +      vmcs_write32(GUEST_CR3,
> vmx->nested.l1_state->shadow_vmcs->guest_cr3);
> > +   } else {
> > +      kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
> > +   }
> > +
> > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > +      printk(KERN_INFO "Error mapping shadow vmcs\n");
> > +      set_rflags_to_vmx_fail_valid(vcpu);
> Abort not flags.
See above.
>
> > +      return 1;
> > +   }
> > +
> > +   switch_back_vmcs(vcpu);
> > +
> > +   nested_unmap_shadow_vmcs(vcpu);
> > +
> > +   kvm_register_write(vcpu, VCPU_REGS_RSP,
> > +            vmx->nested.l1_state->shadow_vmcs->guest_rsp);
> > +   kvm_register_write(vcpu, VCPU_REGS_RIP,
> > +            vmx->nested.l1_state->shadow_vmcs->guest_rip);
> > +
> > +   vmx->nested.nested_mode = 0;
> > +
> > +   kvm_mmu_reset_context(vcpu);
> > +   kvm_mmu_load(vcpu);
> > +
> > +   if (unlikely(vmx->fail)) {
> > +      vmx->fail = 0;
> > +      set_rflags_to_vmx_fail_valid(vcpu);
> > +   } else
> > +      clear_rflags_cf_zf(vcpu);
> > +
> > +   if (initial_pfu_active)
> > +      vmx_fpu_activate(vcpu);
> > +
> > +   return 0;
> > +}
> > +
> > +static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
> > +{
> > +   if (to_vmx(vcpu)->nested.nested_mode) {
> > +      struct page *msr_page = NULL;
> > +      u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> > +      u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> > +      struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
> > +
> > +      if (!cpu_has_vmx_msr_bitmap()
> > +          || !nested_cpu_has_vmx_msr_bitmap(vcpu))
> > +         return 1;
> > +
> > +      msr_page = nested_get_page(vcpu,
> > +                  l2svmcs->msr_bitmap);
> > +
> > +      if (!msr_page) {
> > +         printk(KERN_INFO "%s error in nested_get_page\n",
> > +                __func__);
> > +         return 0;
> > +      }
> > +
> > +      switch (exit_code) {
> > +      case EXIT_REASON_MSR_READ:
> > +         if (msr_index <= 0x1fff) {
> > +            if (test_bit(msr_index,
> > +                    (unsigned long *)(msr_page +
> > +                            0x000)))
> > +               return 1;
> > +         } else if ((msr_index >= 0xc0000000) &&
> > +               (msr_index <= 0xc0001fff)) {
> > +            msr_index &= 0x1fff;
> > +            if (test_bit(msr_index,
> > +                    (unsigned long *)(msr_page +
> > +                            0x400)))
> > +               return 1;
> > +         }
> > +         break;
> > +      case EXIT_REASON_MSR_WRITE:
> > +         if (msr_index <= 0x1fff) {
> > +            if (test_bit(msr_index,
> > +                    (unsigned long *)(msr_page +
> > +                            0x800)))
> > +                  return 1;
> > +         } else if ((msr_index >= 0xc0000000) &&
> > +               (msr_index <= 0xc0001fff)) {
> > +            msr_index &= 0x1fff;
> > +            if (test_bit(msr_index,
> > +                    (unsigned long *)(msr_page +
> > +                            0xc00)))
> > +               return 1;
> > +         }
> > +         break;
> > +      }
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> > +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool
> kvm_override)
> > +{
> > +   u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   struct shadow_vmcs *l2svmcs;
> > +
> > +   int r = 0;
> > +
> > +   if (vmx->nested.nested_run_pending)
> > +      return 0;
> > +
> > +   if (unlikely(vmx->fail)) {
> > +      printk(KERN_INFO "%s failed vm entry %x\n",
> > +             __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> > +      return 1;
> > +   }
> > +
> > +   if (kvm_override) {
> > +      switch (exit_code) {
> > +      case EXIT_REASON_EXTERNAL_INTERRUPT:
> > +         return 0;
> > +      case EXIT_REASON_EXCEPTION_NMI:
> > +         if (!is_exception(intr_info))
> > +            return 0;
> > +
> > +         if (is_page_fault(intr_info) && (!enable_ept))
> > +            return 0;
> > +
> > +         break;
> > +      case EXIT_REASON_EPT_VIOLATION:
> > +         if (enable_ept)
> > +            return 0;
> > +
> > +         break;
> > +      }
> > +   }
> > +
> > +
> > +   if (!nested_map_shadow_vmcs(vcpu))
> > +      return 0;
> > +   l2svmcs = get_shadow_vmcs(vcpu);
> > +
> > +   switch (exit_code) {
> > +   case EXIT_REASON_INVLPG:
> > +      if (l2svmcs->cpu_based_vm_exec_control &
> > +          CPU_BASED_INVLPG_EXITING)
> > +         r = 1;
> > +      break;
> > +   case EXIT_REASON_MSR_READ:
> > +   case EXIT_REASON_MSR_WRITE:
> > +      r = nested_vmx_exit_handled_msr(vcpu);
> > +      break;
> > +   case EXIT_REASON_CR_ACCESS: {
> > +      unsigned long exit_qualification =
> > +         vmcs_readl(EXIT_QUALIFICATION);
> > +      int cr = exit_qualification & 15;
> > +      int reg = (exit_qualification >> 8) & 15;
> > +      unsigned long val = kvm_register_read(vcpu, reg);
> > +
> > +      switch ((exit_qualification >> 4) & 3) {
> > +      case 0: /* mov to cr */
> > +         switch (cr) {
> > +         case 0:
> > +            if (l2svmcs->cr0_guest_host_mask &
> > +                (val ^ l2svmcs->cr0_read_shadow))
> > +               r = 1;
> > +            break;
> > +         case 3:
> > +            if (l2svmcs->cpu_based_vm_exec_control &
> > +                CPU_BASED_CR3_LOAD_EXITING)
> > +               r = 1;
> > +            break;
> > +         case 4:
> > +            if (l2svmcs->cr4_guest_host_mask &
> > +                (l2svmcs->cr4_read_shadow ^ val))
> > +               r = 1;
> > +            break;
> > +         case 8:
> > +            if (l2svmcs->cpu_based_vm_exec_control &
> > +                CPU_BASED_CR8_LOAD_EXITING)
> > +               r = 1;
> > +            break;
> > +         }
> > +         break;
> > +      case 2: /* clts */
> > +         if (l2svmcs->cr0_guest_host_mask &
> > +             (val ^ l2svmcs->cr0_read_shadow))
> > +            r = 1;
> > +         break;
> > +      case 1: /*mov from cr*/
> > +         switch (cr) {
> > +         case 0:
> > +            r = 1;
> > +         case 3:
> > +            if (l2svmcs->cpu_based_vm_exec_control &
> > +                CPU_BASED_CR3_STORE_EXITING)
> > +               r = 1;
> > +            break;
> > +         case 4:
> > +            r = 1;
> > +            break;
> > +         case 8:
> > +            if (l2svmcs->cpu_based_vm_exec_control &
> > +                CPU_BASED_CR8_STORE_EXITING)
> > +               r = 1;
> > +            break;
> > +         }
> > +         break;
> > +      case 3: /* lmsw */
> > +         if (l2svmcs->cr0_guest_host_mask &
> > +             (val ^ l2svmcs->cr0_read_shadow))
> > +            r = 1;
> > +         break;
> > +      }
> > +      break;
> > +   }
> > +   case EXIT_REASON_DR_ACCESS: {
> > +      if (l2svmcs->cpu_based_vm_exec_control &
> > +          CPU_BASED_MOV_DR_EXITING)
> > +         r = 1;
> > +      break;
> > +   }
> > +
> > +   case EXIT_REASON_EXCEPTION_NMI: {
> > +
> > +      if (is_external_interrupt(intr_info) &&
> > +          (l2svmcs->pin_based_vm_exec_control &
> > +           PIN_BASED_EXT_INTR_MASK))
> > +         r = 1;
> > +      else if (is_nmi(intr_info) &&
> > +          (l2svmcs->pin_based_vm_exec_control &
> > +           PIN_BASED_NMI_EXITING))
> > +         r = 1;
> > +      else if (is_exception(intr_info) &&
> > +          (l2svmcs->exception_bitmap &
> > +           (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
> > +         r = 1;
> > +      else if (is_page_fault(intr_info))
> > +         r = 1;
> > +      break;
> > +   }
> > +
> > +   case EXIT_REASON_EXTERNAL_INTERRUPT:
> > +      if (l2svmcs->pin_based_vm_exec_control &
> > +          PIN_BASED_EXT_INTR_MASK)
> > +         r = 1;
> > +      break;
> > +   default:
> > +      r = 1;
> > +   }
> > +   nested_unmap_shadow_vmcs(vcpu);
> > +
> > +   return r;
> > +}
> > +
> > +static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned
nr,
> > +                  bool has_error_code, u32 error_code)
> > +{
> > +   if (vmx->nested.nested_mode) {
> > +      if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
> > +         nested_vmx_vmexit(&vmx->vcpu, false);
> > +         if (!nested_map_shadow_vmcs(&vmx->vcpu))
> > +            return 1;
> > +         get_shadow_vmcs(&vmx->vcpu)->vm_exit_reason =
> > +            EXIT_REASON_EXCEPTION_NMI;
> > +         get_shadow_vmcs(&vmx->vcpu)->vm_exit_intr_info =
> > +            (nr | INTR_TYPE_HARD_EXCEPTION
> > +             | (has_error_code ?
> > +                INTR_INFO_DELIVER_CODE_MASK : 0)
> > +             | INTR_INFO_VALID_MASK);
> > +
> > +         if (has_error_code)
> > +            get_shadow_vmcs(&vmx->vcpu)->
> > +               vm_exit_intr_error_code = error_code;
> > +         nested_unmap_shadow_vmcs(&vmx->vcpu);
> > +         return 1;
> > +      }
> > +   }
> > +   return 0;
> > +}
> > +
> > +static int nested_vmx_intr(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +
> > +   if (vmx->nested.nested_mode) {
> This function is called only after checking nested_mode. Why recheck?
I will remove it.
>
> > +
> > +      if (!nested_map_shadow_vmcs(vcpu))
> > +         return 0;
> > +
> > +      if (get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
> > +          PIN_BASED_EXT_INTR_MASK) {
> > +
> > +         if (vmx->nested.nested_run_pending) {
> > +            nested_unmap_shadow_vmcs(vcpu);
> > +            return 0;
> > +         }
> > +
> > +         nested_unmap_shadow_vmcs(vcpu);
> > +         nested_vmx_vmexit(vcpu, true);
> > +         return 1;
> > +      }
> > +
> > +      nested_unmap_shadow_vmcs(vcpu);
> > +
> > +   }
> > +
> > +   return 0;
> > +}
> >
> >  static struct kvm_x86_ops vmx_x86_ops = {
> >     .cpu_has_kvm_support = cpu_has_kvm_support,
> > --
> > 1.6.0.4
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> --
>          Gleb.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: Nested VMX support v3
  2009-10-20  3:30 ` Avi Kivity
@ 2009-10-21 14:50   ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-21 14:50 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 20/10/2009 05:30:34:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 20/10/2009 05:30
>
> Subject:
>
> Re: Nested VMX support v3
>
> On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> > Avi,
> > We have addressed all of the comments, please apply.
> >
> > The following patches implement nested VMX support. The patches
> enable a guest
> > to use the VMX APIs in order to run its own nested guest (i.e.,
> enable running
> > other hypervisors which use VMX under KVM). The current patches
> support running
> > Linux under a nested KVM using shadow page table (with bypass_guest_pf
> > disabled). SMP support was fixed.  Reworking EPT support to mesh
> cleanly with
> > the current shadow paging design per Avi's comments is a
work-in-progress.
> >
>
> Why is bypass_guest_pf disabled?
It was not implemented.
We need to modify the walk_addr code to handle the sptes that have invalid
content (used only for the bypass_guest_pf
optimization) and identify them as not present. Maybe remove some other
validity checks too.
>
> > The current patches only support a single nested hypervisor, which
> can only run
> > a single guest (multiple guests are work in progress). Only 64-bit
nested
> > hypervisors are supported.
> >
>
> Multiple guests and 32-bit support are merge requirements.  As far as I
> can tell there shouldn't be anything special required to support them?
Ok.
>
>
> > vpid allocation will be updated with the multiguest support (work
> in progress).
> > We are working on fixing the cr0.TS handling, it works for nested kvm
by not
> > for vmware server.
> >
>
> Please either drop or fix vpid before merging.  What's wrong with
> cr0.ts?  I'd like to see that fixed as well.
Ok.
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/5] Nested VMX patch 2 implements vmclear
  2009-10-20  4:06     ` [PATCH 2/5] Nested VMX patch 2 implements vmclear Avi Kivity
@ 2009-10-21 14:56       ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-21 14:56 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 20/10/2009 06:06:40:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 20/10/2009 06:06
>
> Subject:
>
> Re: [PATCH 2/5] Nested VMX patch 2 implements vmclear
>
> On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> > From: Orit Wasserman<oritw@il.ibm.com>
> >
> > ---
> >   arch/x86/kvm/vmx.c |   70 ++++++++++++++++++++++++++++++++++++++
> ++++++++++---
> >   1 files changed, 65 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 71bd91a..411cbdb 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -61,15 +61,26 @@ module_param_named(unrestricted_guest,
> >   static int __read_mostly emulate_invalid_guest_state = 0;
> >   module_param(emulate_invalid_guest_state, bool, S_IRUGO);
> >
> > -struct vmcs {
> > -   u32 revision_id;
> > -   u32 abort;
> > -   char data[0];
> > +struct __attribute__ ((__packed__)) level_state {
> > +   /* Has the level1 guest done vmclear? */
> > +   bool vmclear;
> >   };
> >
>
> Why __packed__?
I will remove it.
>
> >
> >   struct nested_vmx {
> >      /* Has the level1 guest done vmxon? */
> >      bool vmxon;
> > +
> > +   /*
> > +    * Level 2 state : includes vmcs,registers and
> > +    * a copy of vmcs12 for vmread/vmwrite
> > +    */
> > +   struct level_state *l2_state;
> > +};
> > +
> > +struct vmcs {
> > +   u32 revision_id;
> > +   u32 abort;
> > +   char data[0];
> >   };
> >
>
> Why move struct vmcs around?
I will fix it.
>
> > +
> >   static int handle_vmoff(struct kvm_vcpu *vcpu)
> >   {
> >      struct vcpu_vmx *vmx = to_vmx(vcpu);
> > @@ -3310,6 +3368,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
> >
> >      vmx->nested.vmxon = 1;
> >
> > +   create_l2_state(vcpu);
> > +
> >
>
> Need to check return code.
I will add the check.
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-21 14:43             ` Orit Wasserman
@ 2009-10-22  9:04               ` Gleb Natapov
  2009-10-22 15:46                 ` Orit Wasserman
  2009-10-22 10:55               ` Avi Kivity
  1 sibling, 1 reply; 35+ messages in thread
From: Gleb Natapov @ 2009-10-22  9:04 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda

On Wed, Oct 21, 2009 at 04:43:44PM +0200, Orit Wasserman wrote:
> > > @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts(struct
> > vcpu_vmx *vmx)
> > >     int type;
> > >     bool idtv_info_valid;
> > >
> > > -   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > -
> > >     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> > >
> > > +   if (vmx->nested.nested_mode)
> > > +      return;
> > > +
> > Why return here? What the function does that should not be done in
> > nested mode?
> In nested mode L0 injects an interrupt to L2 only in one scenario,
> if there is an IDT_VALID event and L0 decides to run L2 again and not to
> switch back to L1.
> In all other cases the injection is handled by L1.
This is exactly the kind of scenario that is handled by
vmx_complete_interrupts(). (vmx|svm)_complete_interrups() store
pending event in arch agnostic way and re-injection is handled by
x86.c You bypass this logic by inserting return here and introducing
nested_handle_valid_idt() function below.

> >
> > > +   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > +
> > >     /* Handle machine checks before interrupts are enabled */
> > >     if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
> > >         || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> > > @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct vcpu_vmx
> *vmx)
> > >        | vmx->rmode.irq.vector;
> > >  }
> > >
> > > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> > > +{
> > It seems by this function you are trying to bypass general event
> > reinjection logic. Why?
> See above.
The logic implemented by this function is handled in x86.c in arch
agnostic way. Is there something wrong with this?

> > > +   vmx->launched = vmx->nested.l2_state->launched;
> > > +
> > Can you explain why ->launched logic is needed?
> It is possible L1 called vmlaunch but we didn't actually run L2 (for
> example there was an interrupt and
> enable_irq_window switched back to L1 before running L2). L1 thinks the
> vmlaunch was successful and call vmresume in the next time
> but KVM needs to call vmlaunch for L2.
handle_vmlauch() and handle_vmresume() are exactly the same. Why KVM needs
to run one and not the other?
 
> > > +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> > > +              bool is_interrupt)
> > > +{
> > > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > > +   int initial_pfu_active = vcpu->fpu_active;
> > > +
> > > +   if (!vmx->nested.nested_mode) {
> > > +      printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> > > +             __func__);
> > > +      return 0;
> > > +   }
> > > +
> > > +   save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> > > +
> > > +   sync_cached_regs_to_vmcs(vcpu);
> > > +
> > > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > > +      printk(KERN_INFO "Error mapping shadow vmcs\n");
> > > +      set_rflags_to_vmx_fail_valid(vcpu);
> > Error during vmexit should set abort flag, not change flags.
> I think this is more a vmlaunch/vmresume error (in the code that switch
> back to L1).
How is this vmlaunch/vmresume error? This function is called to exit
from L2 guest while on L2 vcms. It is called asynchronously in respect
to L2 guest and you modify L2 guest rflags register at unpredictable
place here.

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-21 14:43             ` Orit Wasserman
  2009-10-22  9:04               ` Gleb Natapov
@ 2009-10-22 10:55               ` Avi Kivity
  1 sibling, 0 replies; 35+ messages in thread
From: Avi Kivity @ 2009-10-22 10:55 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Gleb Natapov, Abel Gordon, aliguori, Ben-Ami Yassour1, kvm,
	mdday, Muli Ben-Yehuda

On 10/21/2009 04:43 PM, Orit Wasserman wrote:
>
> It is possible L1 called vmlaunch but we didn't actually run L2 (for
> example there was an interrupt and
> enable_irq_window switched back to L1 before running L2). L1 thinks the
> vmlaunch was successful and call vmresume in the next time
> but KVM needs to call vmlaunch for L2.
>    

Is it really possible?  If vmlaunch is started, it should complete 
unconditionally for L1.  The irq window should be recalculated based on 
the guest vmcs "exit on external interrupt" and guest eflags/cr8 etc.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff
  2009-10-20  4:00   ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
@ 2009-10-22 12:41     ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-22 12:41 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 20/10/2009 06:00:50:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 20/10/2009 06:02
>
> Subject:
>
> Re: [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff
>
> On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> >
> >   /*
> > + * Handles msr read for nested virtualization
> > + */
> > +static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
> > +               u64 *pdata)
> > +{
> > +   u64 vmx_msr = 0;
> > +
> > +   switch (msr_index) {
> > +   case MSR_IA32_FEATURE_CONTROL:
> > +      *pdata = 0;
> > +      break;
> > +   case MSR_IA32_VMX_BASIC:
> > +      *pdata = 0;
> > +      rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
> > +      *pdata = (vmx_msr&  0x00ffffcfffffffff);
> > +      break;
> > +
> >
>
> This (and the rest of the msrs) must be controllable from userspace.
> Otherwise a live migration from a newer host to an older host would
break.
OK.
>
> >
> >   /*
> > + * Writes msr value for nested virtualization
> > + * Returns 0 on success, non-0 otherwise.
> > + */
> > +static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32
> msr_index, u64 data)
> > +{
> > +   switch (msr_index) {
> > +   case MSR_IA32_FEATURE_CONTROL:
> > +      if ((data&  (FEATURE_CONTROL_LOCKED |
> > +              FEATURE_CONTROL_VMXON_ENABLED))
> > +          != (FEATURE_CONTROL_LOCKED |
> > +         FEATURE_CONTROL_VMXON_ENABLED))
> > +         return 1;
> > +      break;
> > +   default:
> > +      return 1;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> >
>
> Need to export this msr to userspace for live migration.  See
> msrs_to_save[].
OK.
>
> >
> > +/*
> > + * Check to see if vcpu can execute vmx command
> > + * Inject the corrseponding exception
> > + */
> > +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
> > +{
> > +   struct kvm_segment cs;
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct kvm_msr_entry *msr;
> > +
> > +   vmx_get_segment(vcpu,&cs, VCPU_SREG_CS);
> > +
> > +   if (!vmx->nested.vmxon) {
> > +      printk(KERN_DEBUG "%s: vmx not on\n", __func__);
> >
>
> pr_debug
I will change it.
>
> > +      kvm_queue_exception(vcpu, UD_VECTOR);
> > +      return 0;
> > +   }
> > +
> > +   msr = find_msr_entry(vmx, MSR_EFER);
> > +
> > +   if ((vmx_get_rflags(vcpu)&  X86_EFLAGS_VM) ||
> > +       ((msr->data&  EFER_LMA)&&  !cs.l)) {
> >
>
> is_long_mode()
I will change it.
>
> >   static int handle_vmx_insn(struct kvm_vcpu *vcpu)
> >   {
> >      kvm_queue_exception(vcpu, UD_VECTOR);
> >      return 1;
> >   }
> >
> > +static int handle_vmoff(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   vmx->nested.vmxon = 0;
> > +
> > +   skip_emulated_instruction(vcpu);
> > +   return 1;
> > +}
> > +
> > +static int handle_vmon(struct kvm_vcpu *vcpu)
> > +{
> > +   struct kvm_segment cs;
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +
> > +   if (!nested) {
> > +      printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
> > +      kvm_queue_exception(vcpu, UD_VECTOR);
> > +      return 1;
> > +   }
> > +
> > +   vmx_get_segment(vcpu,&cs, VCPU_SREG_CS);
> > +
> > +   if (!(vcpu->arch.cr4&  X86_CR4_VMXE) ||
> > +       !(vcpu->arch.cr0&  X86_CR0_PE) ||
> > +       (vmx_get_rflags(vcpu)&  X86_EFLAGS_VM)) {
> > +      kvm_queue_exception(vcpu, UD_VECTOR);
> > +      printk(KERN_INFO "%s invalid register state\n", __func__);
> > +      return 1;
> > +   }
> > +#ifdef CONFIG_X86_64
> > +   if (((find_msr_entry(to_vmx(vcpu),
> > +              MSR_EFER)->data&  EFER_LMA)&&  !cs.l)) {
> >
>
> is_long_mode(), and you can avoid the #ifdef.
I will change it.
>
>
> VMXON is supposed to block INIT, please add that (in a separate patch).
I will add it.
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-10-20  4:24       ` Avi Kivity
@ 2009-10-22 12:48         ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-22 12:48 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 20/10/2009 06:24:33:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 20/10/2009 06:24
>
> Subject:
>
> Re: [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst
>
> On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> >
> > +
> > +struct __attribute__ ((__packed__)) shadow_vmcs {
> >
>
> Since this is in guest memory, we need it packed so the binary format is
> preserved across migration.  Please add a comment so it isn't changed
> (at least without changing the revision_id).
I will add a comment.
>
> vmclear state should be here, that will help multiguest support.
Working on it ...
>
> >
> >   struct nested_vmx {
> >      /* Has the level1 guest done vmxon? */
> >      bool vmxon;
> > -
> > +   /* What is the location of the  vmcs l1 keeps for l2? (in level1
gpa) */
> > +   u64 vmptr;
> >
>
> Need to expose it for live migration.
I will look into it.
>
> >      /*
> >       * Level 2 state : includes vmcs,registers and
> >       * a copy of vmcs12 for vmread/vmwrite
> >       */
> >      struct level_state *l2_state;
> > +   /* Level 1 state for switching to level 2 and back */
> > +   struct level_state *l1_state;
> >
>
> This creates a ton of duplication.
>
> Some of the data is completely unnecessary, for example we can
> recalculate cr0 from HOST_CR0 and GUEST_CR0.
I will look into it .
>
> > +
> > +static int vmptrld(struct kvm_vcpu *vcpu,
> > +         u64 phys_addr)
> > +{
> > +   u8 error;
> > +
> > +   asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> > +            : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> > +            : "cc");
> > +   if (error) {
> > +      printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
> > +             __func__, phys_addr);
> > +      return 1;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> >   /*
> >    * Switches to specified vcpu, until a matching vcpu_put(), but
assumes
> >    * vcpu mutex is already taken.
> > @@ -736,15 +923,8 @@ static void vmx_vcpu_load(struct kvm_vcpu
> *vcpu, int cpu)
> >      }
> >
> >      if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
> > -      u8 error;
> > -
> >         per_cpu(current_vmcs, cpu) = vmx->vmcs;
> > -      asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
> > -               : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
> > -               : "cc");
> > -      if (error)
> > -         printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
> > -                vmx->vmcs, phys_addr);
> > +      vmptrld(vcpu, phys_addr);
> >      }
> >
>
> This part of the patch is no longer needed.
I will remove it.
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
> > +   else
> > +      vmx->nested.l2_state->msr_bitmap = 0;
> > +
> > +   vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> > +
> >
>
> This no longer works, since we don't load the guest vmcs.
I will look into it.
>
> > +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> > +         struct kvm_vcpu *vcpu);
> >
>
> Isn't this in a header somewhere?
I will move it into the header.
>
> > +
> > +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> > +{
> > +
> > +   int r = 0;
> > +
> > +   r = kvm_read_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX], gentry,
> > +            sizeof(u64), vcpu);
> > +   if (r) {
> > +      printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> > +             __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
> > +      return r;
> > +   }
> > +
> > +   if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> > +      printk(KERN_DEBUG "%s addr %llx not aligned\n",
> > +             __func__, *gentry);
> > +      return 1;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> >
>
> Should go through the emulator to evaluate arguments.
OK.
>
> > +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct page *vmcs_page;
> > +   u64 guest_vmcs_addr;
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (read_guest_vmcs_gpa(vcpu,&guest_vmcs_addr))
> > +      return 1;
> > +
> > +   if (create_l1_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (create_l2_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> > +      return 1;
> > +   }
> >
>
> return errors here, so we see the problem.
OK.
>
> > +
> > +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> > +{
> > +   int r = 0;
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   r = kvm_write_guest_virt(vcpu->arch.regs[VCPU_REGS_RAX],
> > +             (void *)&to_vmx(vcpu)->nested.vmptr,
> > +             sizeof(u64), vcpu);
> >
>
> Emulator again.
OK.
>
> > +void save_vmcs(struct shadow_vmcs *dst)
> > +{
> >
> > +   dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> >
>
> These (and many others) can never change due to a nested guest running,
> so no need to save them.
>
> > +   dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> >
>
> In general, you need to translate host physical addresses to guest
> physical addresses.
If this is VMCS12 than it is already guest physical address.
>
> > +   dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> > +   if (enable_ept)
> > +      dst->ept_pointer = vmcs_read64(EPT_POINTER);
> > +
> >
>
> Not all hosts support these features.
I will add a check.
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite
  2009-10-20  4:44         ` Avi Kivity
@ 2009-10-22 12:50           ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-22 12:50 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 20/10/2009 06:44:41:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 20/10/2009 06:44
>
> Subject:
>
> Re: [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite
>
> On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> >
> > +static int nested_map_shadow_vmcs(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct page *vmcs_page = nested_get_page(vcpu, vmx->nested.vmptr);
> > +
> > +   if (vmcs_page == NULL) {
> > +      printk(KERN_INFO "%s: failure in nested_get_page\n",__func__);
> > +      return 0;
> > +   }
> > +
> > +   if (vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_INFO "%s: shadow vmcs already mapped\n",__func__);
> > +      return 0;
> > +   }
> > +
> >
>
> Consider dropping shadow_vmcs from l2_state and just passing it
> everywhere.  Less convenient but safer.
I will think about it, it is called from many places ...
>
> > +   vmx->nested.l2_state->shadow_vmcs = kmap_atomic(vmcs_page,
KM_USER0);
> > +
> > +   if (!vmx->nested.l2_state->shadow_vmcs) {
> > +      printk(KERN_INFO "%s: error in kmap_atomic\n",__func__);
> > +      return 0;
> > +   }
> >
>
> kmap_atomic() can't fail.
I will remove the check.
> >
> > +static int handle_vmread(struct kvm_vcpu *vcpu)
> > +{
> > +#ifndef CONFIG_X86_64
> > +   u64 value;
> > +#endif
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > +      printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> > +      set_rflags_to_vmx_fail_invalid(vcpu);
> > +      return 1;
> > +   }
> >
>
> return an error.
OK.
>
> > +
> > +   switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
> > +   case VMCS_FIELD_TYPE_U16:
> > +      vcpu->arch.regs[VCPU_REGS_RAX] =
> > +         nested_vmcs_read16(vcpu,
> > +                  vcpu->arch.regs[VCPU_REGS_RDX]);
> > +      break;
> >
>
> Use the emulator to decode operands.
OK.
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-20  4:56           ` Avi Kivity
@ 2009-10-22 12:56             ` Orit Wasserman
  0 siblings, 0 replies; 35+ messages in thread
From: Orit Wasserman @ 2009-10-22 12:56 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 20/10/2009 06:56:39:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Abel Gordon/
> Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mdday@us.ibm.com
>
> Date:
>
> 20/10/2009 06:56
>
> Subject:
>
> Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
>
> On 10/15/2009 11:41 PM, oritw@il.ibm.com wrote:
> > From: Orit Wasserman<oritw@il.ibm.com>
> >
> > ---
> >   arch/x86/kvm/vmx.c | 1173 ++++++++++++++++++++++++++++++++++++++
> ++++++++++++--
> >   1 files changed, 1148 insertions(+), 25 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 6a4c252..e814029 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -209,6 +209,7 @@ struct __attribute__ ((__packed__)) level_state {
> >      struct vmcs *vmcs;
> >      int cpu;
> >      int launched;
> > +   bool first_launch;
> >   };
> >
> >   struct nested_vmx {
> > @@ -216,6 +217,12 @@ struct nested_vmx {
> >      bool vmxon;
> >      /* What is the location of the  vmcs l1 keeps for l2? (in
> level1 gpa) */
> >      u64 vmptr;
> > +   /* Are we running nested guest */
> > +   bool nested_mode;
> > +   /* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> > +   bool nested_run_pending;
> > +   /* flag indicating if there was a valid IDT after exiting from l2
*/
> > +   bool nested_valid_idt;
> >
>
> Did you mean valid_idt_vectoring_info?
yes.
>
> No need to prefix everything with nested_ inside nested_vmx.
OK.
>
> > +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> > +{
> > +   struct shadow_vmcs *l2_shadow_vmcs =
> > +      get_shadow_vmcs(vcpu);
> > +
> > +   l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16
(GUEST_LDTR_SELECTOR);
> > +   l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +
> > +   l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   l2_shadow_vmcs->guest_physical_address =
> > +      vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> >
>
> Physical addresses need translation,  no?
If you are referring to GUEST_PHYSICAL_ADDRESS than there is no need for
translation for L1.
It need to stay in L2 physical address.
>
> > +   l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +
> > +   l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> >
>
> We don't allow the guest to modify these, so no need to read them.  If
> you do, you need to remove the bits that we modify.
You are correct for example CR0.TS , it will be fixed in the next set of
patches.
>
> > +
> > +int load_vmcs_common(struct shadow_vmcs *src)
> > +{
> > +
> > +   vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> >
>
> Why load this?
At the moment it is not used , maybe in the future.
I can add a check to see if it was changed.
>
> > +   vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> >
>
> I think some features there are dangerous.
I will look into it.
>
> > +   vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->
vm_entry_msr_load_count);
> >
>
> Need to verify?  Also need to validate the loaded MSRs and run them
> through kvm_set_msr() instead of letting the cpu do it.
I will add the checks.
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-22  9:04               ` Gleb Natapov
@ 2009-10-22 15:46                 ` Orit Wasserman
  2009-10-25  9:44                   ` Gleb Natapov
  0 siblings, 1 reply; 35+ messages in thread
From: Orit Wasserman @ 2009-10-22 15:46 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Gleb Natapov <gleb@redhat.com> wrote on 22/10/2009 11:04:58:

> From:
>
> Gleb Natapov <gleb@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> Yehuda/Haifa/IBM@IBMIL
>
> Date:
>
> 22/10/2009 11:05
>
> Subject:
>
> Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
>
> On Wed, Oct 21, 2009 at 04:43:44PM +0200, Orit Wasserman wrote:
> > > > @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts(struct
> > > vcpu_vmx *vmx)
> > > >     int type;
> > > >     bool idtv_info_valid;
> > > >
> > > > -   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > -
> > > >     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> > > >
> > > > +   if (vmx->nested.nested_mode)
> > > > +      return;
> > > > +
> > > Why return here? What the function does that should not be done in
> > > nested mode?
> > In nested mode L0 injects an interrupt to L2 only in one scenario,
> > if there is an IDT_VALID event and L0 decides to run L2 again and not
to
> > switch back to L1.
> > In all other cases the injection is handled by L1.
> This is exactly the kind of scenario that is handled by
> vmx_complete_interrupts(). (vmx|svm)_complete_interrups() store
> pending event in arch agnostic way and re-injection is handled by
> x86.c You bypass this logic by inserting return here and introducing
> nested_handle_valid_idt() function below.
The only location we can truly know if we are switching to L1 is in
vmx_vcpu_run
because enable_irq_window (that is called after handling the exit) can
decide to
switch to L1 because of an interrupt.
In order to simplify our code it was simpler to bypass
vmx_complete_interrupts when it is called (after
running L2) and to add nested_handle_valid_idt just before running L2.
> > >
> > > > +   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > +
> > > >     /* Handle machine checks before interrupts are enabled */
> > > >     if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
> > > >         || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> > > > @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct vcpu_vmx
> > *vmx)
> > > >        | vmx->rmode.irq.vector;
> > > >  }
> > > >
> > > > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> > > > +{
> > > It seems by this function you are trying to bypass general event
> > > reinjection logic. Why?
> > See above.
> The logic implemented by this function is handled in x86.c in arch
> agnostic way. Is there something wrong with this?
See my comment before
>
> > > > +   vmx->launched = vmx->nested.l2_state->launched;
> > > > +
> > > Can you explain why ->launched logic is needed?
> > It is possible L1 called vmlaunch but we didn't actually run L2 (for
> > example there was an interrupt and
> > enable_irq_window switched back to L1 before running L2). L1 thinks the
> > vmlaunch was successful and call vmresume in the next time
> > but KVM needs to call vmlaunch for L2.
> handle_vmlauch() and handle_vmresume() are exactly the same. Why KVM
needs
> to run one and not the other?
Yes they are very similar (almost the same code) the only difference is the
check of vmclear,
we need to emulate the vmx hardware behavior for those two commands and
check VMC12 state.
>
> > > > +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> > > > +              bool is_interrupt)
> > > > +{
> > > > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > > > +   int initial_pfu_active = vcpu->fpu_active;
> > > > +
> > > > +   if (!vmx->nested.nested_mode) {
> > > > +      printk(KERN_INFO "WARNING: %s called but not in nested mode
\n",
> > > > +             __func__);
> > > > +      return 0;
> > > > +   }
> > > > +
> > > > +   save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> > > > +
> > > > +   sync_cached_regs_to_vmcs(vcpu);
> > > > +
> > > > +   if (!nested_map_shadow_vmcs(vcpu)) {
> > > > +      printk(KERN_INFO "Error mapping shadow vmcs\n");
> > > > +      set_rflags_to_vmx_fail_valid(vcpu);
> > > Error during vmexit should set abort flag, not change flags.
> > I think this is more a vmlaunch/vmresume error (in the code that switch
> > back to L1).
> How is this vmlaunch/vmresume error? This function is called to exit
> from L2 guest while on L2 vcms. It is called asynchronously in respect
> to L2 guest and you modify L2 guest rflags register at unpredictable
> place here.
OK.
>
> --
>          Gleb.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-22 15:46                 ` Orit Wasserman
@ 2009-10-25  9:44                   ` Gleb Natapov
  2009-10-28 16:23                     ` Orit Wasserman
  0 siblings, 1 reply; 35+ messages in thread
From: Gleb Natapov @ 2009-10-25  9:44 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda

On Thu, Oct 22, 2009 at 05:46:16PM +0200, Orit Wasserman wrote:
> 
> 
> Gleb Natapov <gleb@redhat.com> wrote on 22/10/2009 11:04:58:
> 
> > From:
> >
> > Gleb Natapov <gleb@redhat.com>
> >
> > To:
> >
> > Orit Wasserman/Haifa/IBM@IBMIL
> >
> > Cc:
> >
> > Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> > Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> > Yehuda/Haifa/IBM@IBMIL
> >
> > Date:
> >
> > 22/10/2009 11:05
> >
> > Subject:
> >
> > Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
> >
> > On Wed, Oct 21, 2009 at 04:43:44PM +0200, Orit Wasserman wrote:
> > > > > @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts(struct
> > > > vcpu_vmx *vmx)
> > > > >     int type;
> > > > >     bool idtv_info_valid;
> > > > >
> > > > > -   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > -
> > > > >     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> > > > >
> > > > > +   if (vmx->nested.nested_mode)
> > > > > +      return;
> > > > > +
> > > > Why return here? What the function does that should not be done in
> > > > nested mode?
> > > In nested mode L0 injects an interrupt to L2 only in one scenario,
> > > if there is an IDT_VALID event and L0 decides to run L2 again and not
> to
> > > switch back to L1.
> > > In all other cases the injection is handled by L1.
> > This is exactly the kind of scenario that is handled by
> > vmx_complete_interrupts(). (vmx|svm)_complete_interrups() store
> > pending event in arch agnostic way and re-injection is handled by
> > x86.c You bypass this logic by inserting return here and introducing
> > nested_handle_valid_idt() function below.
> The only location we can truly know if we are switching to L1 is in
> vmx_vcpu_run
> because enable_irq_window (that is called after handling the exit) can
> decide to
> switch to L1 because of an interrupt.
enable_irq_window() will be called after L2 VMCS will be setup for event
re-injection by previous call to inject_pending_event(). As far as I
can see this should work for interrupt injection. For exception we
should probably require l2 guest to re execute faulted instruction for
now like svm does.

> In order to simplify our code it was simpler to bypass
> vmx_complete_interrupts when it is called (after
> running L2) and to add nested_handle_valid_idt just before running L2.
> > > >
> > > > > +   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > +
> > > > >     /* Handle machine checks before interrupts are enabled */
> > > > >     if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
> > > > >         || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> > > > > @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct vcpu_vmx
> > > *vmx)
> > > > >        | vmx->rmode.irq.vector;
> > > > >  }
> > > > >
> > > > > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> > > > > +{
> > > > It seems by this function you are trying to bypass general event
> > > > reinjection logic. Why?
> > > See above.
> > The logic implemented by this function is handled in x86.c in arch
> > agnostic way. Is there something wrong with this?
> See my comment before
Sometimes it is wrong to reinject events from L0 to L2 directly. If L2
was not able to handle event because its IDT is not mapped by L1 shadow
page table we should generate PF vmexit with valid idt vectoring info to
L1 and let L1 handle event reinjection.

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-25  9:44                   ` Gleb Natapov
@ 2009-10-28 16:23                     ` Orit Wasserman
  2009-10-29 17:31                       ` Gleb Natapov
  0 siblings, 1 reply; 35+ messages in thread
From: Orit Wasserman @ 2009-10-28 16:23 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda



Gleb Natapov <gleb@redhat.com> wrote on 25/10/2009 11:44:31:

> From:
>
> Gleb Natapov <gleb@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> Yehuda/Haifa/IBM@IBMIL
>
> Date:
>
> 25/10/2009 11:44
>
> Subject:
>
> Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
>
> On Thu, Oct 22, 2009 at 05:46:16PM +0200, Orit Wasserman wrote:
> >
> >
> > Gleb Natapov <gleb@redhat.com> wrote on 22/10/2009 11:04:58:
> >
> > > From:
> > >
> > > Gleb Natapov <gleb@redhat.com>
> > >
> > > To:
> > >
> > > Orit Wasserman/Haifa/IBM@IBMIL
> > >
> > > Cc:
> > >
> > > Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> > > Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> > > Yehuda/Haifa/IBM@IBMIL
> > >
> > > Date:
> > >
> > > 22/10/2009 11:05
> > >
> > > Subject:
> > >
> > > Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
> > >
> > > On Wed, Oct 21, 2009 at 04:43:44PM +0200, Orit Wasserman wrote:
> > > > > > @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts
(struct
> > > > > vcpu_vmx *vmx)
> > > > > >     int type;
> > > > > >     bool idtv_info_valid;
> > > > > >
> > > > > > -   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > > -
> > > > > >     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> > > > > >
> > > > > > +   if (vmx->nested.nested_mode)
> > > > > > +      return;
> > > > > > +
> > > > > Why return here? What the function does that should not be done
in
> > > > > nested mode?
> > > > In nested mode L0 injects an interrupt to L2 only in one scenario,
> > > > if there is an IDT_VALID event and L0 decides to run L2 again and
not
> > to
> > > > switch back to L1.
> > > > In all other cases the injection is handled by L1.
> > > This is exactly the kind of scenario that is handled by
> > > vmx_complete_interrupts(). (vmx|svm)_complete_interrups() store
> > > pending event in arch agnostic way and re-injection is handled by
> > > x86.c You bypass this logic by inserting return here and introducing
> > > nested_handle_valid_idt() function below.
> > The only location we can truly know if we are switching to L1 is in
> > vmx_vcpu_run
> > because enable_irq_window (that is called after handling the exit) can
> > decide to
> > switch to L1 because of an interrupt.
> enable_irq_window() will be called after L2 VMCS will be setup for event
> re-injection by previous call to inject_pending_event(). As far as I
> can see this should work for interrupt injection. For exception we
> should probably require l2 guest to re execute faulted instruction for
> now like svm does.
The main issue is that L0 doesn't inject events to L2 but L1 hypervisor (we
want to keep the nested hypervisor semantics as
much as possible). Only if the event was caused by the fact that L2 is a
nested guest
and L1 can't handle it L0 will re-inject and event to L2, for example IDT
event
with page fault that is caused by a missing entry in SPT02 (the shadow page
table L0 create for L2).
In this case when vmx_complete_intterupts is called L0 doesn't know if the
page fault should be handled by it or
by L1 (it is decided later when handling the exit).
In most other cases , L0 will switch to L1 and L1 will decide if there will
be re-injection
(depends on the L1 hypervisor logic) and update L2 VMCS accordingly.
>
> > In order to simplify our code it was simpler to bypass
> > vmx_complete_interrupts when it is called (after
> > running L2) and to add nested_handle_valid_idt just before running L2.
> > > > >
> > > > > > +   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > > +
> > > > > >     /* Handle machine checks before interrupts are enabled */
> > > > > >     if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
> > > > > >         || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> > > > > > @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct
vcpu_vmx
> > > > *vmx)
> > > > > >        | vmx->rmode.irq.vector;
> > > > > >  }
> > > > > >
> > > > > > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> > > > > > +{
> > > > > It seems by this function you are trying to bypass general event
> > > > > reinjection logic. Why?
> > > > See above.
> > > The logic implemented by this function is handled in x86.c in arch
> > > agnostic way. Is there something wrong with this?
> > See my comment before
> Sometimes it is wrong to reinject events from L0 to L2 directly. If L2
> was not able to handle event because its IDT is not mapped by L1 shadow
> page table we should generate PF vmexit with valid idt vectoring info to
> L1 and let L1 handle event reinjection.
>
> --
>          Gleb.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-28 16:23                     ` Orit Wasserman
@ 2009-10-29 17:31                       ` Gleb Natapov
  2009-11-09  9:33                         ` Abel Gordon
  0 siblings, 1 reply; 35+ messages in thread
From: Gleb Natapov @ 2009-10-29 17:31 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda

On Wed, Oct 28, 2009 at 06:23:42PM +0200, Orit Wasserman wrote:
> 
> 
> Gleb Natapov <gleb@redhat.com> wrote on 25/10/2009 11:44:31:
> 
> > From:
> >
> > Gleb Natapov <gleb@redhat.com>
> >
> > To:
> >
> > Orit Wasserman/Haifa/IBM@IBMIL
> >
> > Cc:
> >
> > Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> > Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> > Yehuda/Haifa/IBM@IBMIL
> >
> > Date:
> >
> > 25/10/2009 11:44
> >
> > Subject:
> >
> > Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
> >
> > On Thu, Oct 22, 2009 at 05:46:16PM +0200, Orit Wasserman wrote:
> > >
> > >
> > > Gleb Natapov <gleb@redhat.com> wrote on 22/10/2009 11:04:58:
> > >
> > > > From:
> > > >
> > > > Gleb Natapov <gleb@redhat.com>
> > > >
> > > > To:
> > > >
> > > > Orit Wasserman/Haifa/IBM@IBMIL
> > > >
> > > > Cc:
> > > >
> > > > Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> > > > Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> > > > Yehuda/Haifa/IBM@IBMIL
> > > >
> > > > Date:
> > > >
> > > > 22/10/2009 11:05
> > > >
> > > > Subject:
> > > >
> > > > Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
> > > >
> > > > On Wed, Oct 21, 2009 at 04:43:44PM +0200, Orit Wasserman wrote:
> > > > > > > @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts
> (struct
> > > > > > vcpu_vmx *vmx)
> > > > > > >     int type;
> > > > > > >     bool idtv_info_valid;
> > > > > > >
> > > > > > > -   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > > > -
> > > > > > >     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> > > > > > >
> > > > > > > +   if (vmx->nested.nested_mode)
> > > > > > > +      return;
> > > > > > > +
> > > > > > Why return here? What the function does that should not be done
> in
> > > > > > nested mode?
> > > > > In nested mode L0 injects an interrupt to L2 only in one scenario,
> > > > > if there is an IDT_VALID event and L0 decides to run L2 again and
> not
> > > to
> > > > > switch back to L1.
> > > > > In all other cases the injection is handled by L1.
> > > > This is exactly the kind of scenario that is handled by
> > > > vmx_complete_interrupts(). (vmx|svm)_complete_interrups() store
> > > > pending event in arch agnostic way and re-injection is handled by
> > > > x86.c You bypass this logic by inserting return here and introducing
> > > > nested_handle_valid_idt() function below.
> > > The only location we can truly know if we are switching to L1 is in
> > > vmx_vcpu_run
> > > because enable_irq_window (that is called after handling the exit) can
> > > decide to
> > > switch to L1 because of an interrupt.
> > enable_irq_window() will be called after L2 VMCS will be setup for event
> > re-injection by previous call to inject_pending_event(). As far as I
> > can see this should work for interrupt injection. For exception we
> > should probably require l2 guest to re execute faulted instruction for
> > now like svm does.
> The main issue is that L0 doesn't inject events to L2 but L1 hypervisor (we
> want to keep the nested hypervisor semantics as
> much as possible). Only if the event was caused by the fact that L2 is a
> nested guest
> and L1 can't handle it L0 will re-inject and event to L2, for example IDT
> event
> with page fault that is caused by a missing entry in SPT02 (the shadow page
> table L0 create for L2).
> In this case when vmx_complete_intterupts is called L0 doesn't know if the
> page fault should be handled by it or
> by L1 (it is decided later when handling the exit).
So what? When it will be decided that L2 exit is needed pending event
will be transfered into L2's idt_vectoring_info. Otherwise event will be
reinfected by usual mechanism. BTW I don't see where you current code
setup L2's idt_vectoring_info if it is decided that L1 should handle
event reinjection.

> In most other cases , L0 will switch to L1 and L1 will decide if there will
> be re-injection
> (depends on the L1 hypervisor logic) and update L2 VMCS accordingly.
> >
> > > In order to simplify our code it was simpler to bypass
> > > vmx_complete_interrupts when it is called (after
> > > running L2) and to add nested_handle_valid_idt just before running L2.
> > > > > >
> > > > > > > +   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > > > +
> > > > > > >     /* Handle machine checks before interrupts are enabled */
> > > > > > >     if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
> > > > > > >         || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> > > > > > > @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct
> vcpu_vmx
> > > > > *vmx)
> > > > > > >        | vmx->rmode.irq.vector;
> > > > > > >  }
> > > > > > >
> > > > > > > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> > > > > > > +{
> > > > > > It seems by this function you are trying to bypass general event
> > > > > > reinjection logic. Why?
> > > > > See above.
> > > > The logic implemented by this function is handled in x86.c in arch
> > > > agnostic way. Is there something wrong with this?
> > > See my comment before
> > Sometimes it is wrong to reinject events from L0 to L2 directly. If L2
> > was not able to handle event because its IDT is not mapped by L1 shadow
> > page table we should generate PF vmexit with valid idt vectoring info to
> > L1 and let L1 handle event reinjection.
> >
> > --
> >          Gleb.

--
			Gleb.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-10-29 17:31                       ` Gleb Natapov
@ 2009-11-09  9:33                         ` Abel Gordon
  0 siblings, 0 replies; 35+ messages in thread
From: Abel Gordon @ 2009-11-09  9:33 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda, Orit Wasserman



Gleb Natapov <gleb@redhat.com> wrote on 29/10/2009 19:31:05:

> [image removed]
>
> Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
>
> Gleb Natapov
>
> to:
>
> Orit Wasserman
>
> 29/10/2009 19:31
>
> Cc:
>
> Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mdday, Muli Ben-Yehuda
>
> On Wed, Oct 28, 2009 at 06:23:42PM +0200, Orit Wasserman wrote:
> >
> >
> > Gleb Natapov <gleb@redhat.com> wrote on 25/10/2009 11:44:31:
> >
> > > From:
> > >
> > > Gleb Natapov <gleb@redhat.com>
> > >
> > > To:
> > >
> > > Orit Wasserman/Haifa/IBM@IBMIL
> > >
> > > Cc:
> > >
> > > Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami Yassour1/
> > > Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> > > Yehuda/Haifa/IBM@IBMIL
> > >
> > > Date:
> > >
> > > 25/10/2009 11:44
> > >
> > > Subject:
> > >
> > > Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
> > >
> > > On Thu, Oct 22, 2009 at 05:46:16PM +0200, Orit Wasserman wrote:
> > > >
> > > >
> > > > Gleb Natapov <gleb@redhat.com> wrote on 22/10/2009 11:04:58:
> > > >
> > > > > From:
> > > > >
> > > > > Gleb Natapov <gleb@redhat.com>
> > > > >
> > > > > To:
> > > > >
> > > > > Orit Wasserman/Haifa/IBM@IBMIL
> > > > >
> > > > > Cc:
> > > > >
> > > > > Abel Gordon/Haifa/IBM@IBMIL, aliguori@us.ibm.com, Ben-Ami
Yassour1/
> > > > > Haifa/IBM@IBMIL, kvm@vger.kernel.org, mdday@us.ibm.com, Muli Ben-
> > > > > Yehuda/Haifa/IBM@IBMIL
> > > > >
> > > > > Date:
> > > > >
> > > > > 22/10/2009 11:05
> > > > >
> > > > > Subject:
> > > > >
> > > > > Re: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and
vmresume
> > > > >
> > > > > On Wed, Oct 21, 2009 at 04:43:44PM +0200, Orit Wasserman wrote:
> > > > > > > > @@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts
> > (struct
> > > > > > > vcpu_vmx *vmx)
> > > > > > > >     int type;
> > > > > > > >     bool idtv_info_valid;
> > > > > > > >
> > > > > > > > -   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > > > > -
> > > > > > > >     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> > > > > > > >
> > > > > > > > +   if (vmx->nested.nested_mode)
> > > > > > > > +      return;
> > > > > > > > +
> > > > > > > Why return here? What the function does that should not be
done
> > in
> > > > > > > nested mode?
> > > > > > In nested mode L0 injects an interrupt to L2 only in one
scenario,
> > > > > > if there is an IDT_VALID event and L0 decides to run L2 again
and
> > not
> > > > to
> > > > > > switch back to L1.
> > > > > > In all other cases the injection is handled by L1.
> > > > > This is exactly the kind of scenario that is handled by
> > > > > vmx_complete_interrupts(). (vmx|svm)_complete_interrups() store
> > > > > pending event in arch agnostic way and re-injection is handled by
> > > > > x86.c You bypass this logic by inserting return here and
introducing
> > > > > nested_handle_valid_idt() function below.
> > > > The only location we can truly know if we are switching to L1 is in
> > > > vmx_vcpu_run
> > > > because enable_irq_window (that is called after handling the exit)
can
> > > > decide to
> > > > switch to L1 because of an interrupt.
> > > enable_irq_window() will be called after L2 VMCS will be setup for
event
> > > re-injection by previous call to inject_pending_event(). As far as I
> > > can see this should work for interrupt injection. For exception we
> > > should probably require l2 guest to re execute faulted instruction
for
> > > now like svm does.
> > The main issue is that L0 doesn't inject events to L2 but L1 hypervisor
(we
> > want to keep the nested hypervisor semantics as
> > much as possible). Only if the event was caused by the fact that L2 is
a
> > nested guest
> > and L1 can't handle it L0 will re-inject and event to L2, for example
IDT
> > event
> > with page fault that is caused by a missing entry in SPT02 (the shadow
page
> > table L0 create for L2).
> > In this case when vmx_complete_intterupts is called L0 doesn't know if
the
> > page fault should be handled by it or
> > by L1 (it is decided later when handling the exit).
> So what? When it will be decided that L2 exit is needed pending event
> will be transfered into L2's idt_vectoring_info. Otherwise event will be
> reinfected by usual mechanism. BTW I don't see where you current code
> setup L2's idt_vectoring_info if it is decided that L1 should handle
> event re-injection.
Suppose we are executing an L2 guest and we got an exit. There are 2
possible scenarios here:
A) The L2 exit will be handled by the L1 guest hypervisor. In this case
when we switch to L1 the IDT vectoring info field is copied from vmcs(02)
to vmcs(12) in prepare_vmcs_12 (part of the nested_vmx_vmexit path). Now is
under responsibility of L1 to deal with the IDT and do the corresponding
logic.
B) The L2 exit will be handled only by L0. In this case we never switch to
L1. L0 handles the exit and resume L2. Any pending event in vmcs(02) idt
vectoring info field is injected to l2 when L0 resumes it.

KVM handles IDT in at the end of vmx_vcpu_run, calling
vmx_complete_interrupts. The decision to switch or not switch to L1 is made
in the following points:
1) nested_vmx_check_exception (called from vmx_queue_exception)
2) nested_vmx_intr (called from vmx_interrupt_allowed and
enable_irq_window)
3) vmx_handle_exit

>From x86 perspective the flow looks as follow:
vcpu_enter_guest {
 1
 2
 run (includes vmx_complete_interrupts)
 3
}


All these functions are called after vmx_vcpu_run finished and
vmx_complete_interrupts was already executed. This impede us to re-use
regular non-nested IDT hadling because we still don't know there if the IDT
pending event must be injected or not. That's the reason we added the
function nested_handle_valid_idt which is called at the beginning of
vmx_vcpu_run. So now, the flow from x86 perspective will look like:
vcpu_enter_guest {
 1
 2
 check_nested_idt (injects pending IDT to L2 if necessary. only case B)
 run
 3
}

> > In most other cases , L0 will switch to L1 and L1 will decide if there
will
> > be re-injection
> > (depends on the L1 hypervisor logic) and update L2 VMCS accordingly.
> > >
> > > > In order to simplify our code it was simpler to bypass
> > > > vmx_complete_interrupts when it is called (after
> > > > running L2) and to add nested_handle_valid_idt just before running
L2.
> > > > > > >
> > > > > > > > +   exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > > > > > > > +
> > > > > > > >     /* Handle machine checks before interrupts are enabled
*/
> > > > > > > >     if ((vmx->exit_reason ==
EXIT_REASON_MCE_DURING_VMENTRY)
> > > > > > > >         || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> > > > > > > > @@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct
> > vcpu_vmx
> > > > > > *vmx)
> > > > > > > >        | vmx->rmode.irq.vector;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
> > > > > > > > +{
> > > > > > > It seems by this function you are trying to bypass general
event
> > > > > > > reinjection logic. Why?
> > > > > > See above.
> > > > > The logic implemented by this function is handled in x86.c in
arch
> > > > > agnostic way. Is there something wrong with this?
> > > > See my comment before
> > > Sometimes it is wrong to reinject events from L0 to L2 directly. If
L2
> > > was not able to handle event because its IDT is not mapped by L1
shadow
> > > page table we should generate PF vmexit with valid idt vectoring info
to
> > > L1 and let L1 handle event reinjection.
According to above explanation I think this is what we are doing in the
required case (A). Are we missing something ?

Abel.



^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff
  2009-09-30 13:32 Nested VMX support v2 oritw
@ 2009-09-30 13:32 ` oritw
  0 siblings, 0 replies; 35+ messages in thread
From: oritw @ 2009-09-30 13:32 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, abelg, muli, aliguori, -mday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/svm.c |    3 -
 arch/x86/kvm/vmx.c |  217 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |    6 +-
 arch/x86/kvm/x86.h |    2 +
 4 files changed, 222 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2df9b45..3c1f22a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -124,9 +124,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 78101dd..71bd91a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -67,6 +67,11 @@ struct vmcs {
 	char data[0];
 };
 
+struct nested_vmx {
+	/* Has the level1 guest done vmxon? */
+	bool vmxon;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
@@ -114,6 +119,9 @@ struct vcpu_vmx {
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
+
+	/* Nested vmx */
+	struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -967,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 }
 
 /*
+ * Handles msr read for nested virtualization
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+			      u64 *pdata)
+{
+	u64 vmx_msr = 0;
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		*pdata = 0;
+		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
+		*pdata = (vmx_msr & 0x00ffffcfffffffff);
+		break;
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
+		*pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	{
+		u32 vmx_msr_high, vmx_msr_low;
+		u32 control = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+			CPU_BASED_CR8_LOAD_EXITING |
+			CPU_BASED_CR8_STORE_EXITING |
+#endif
+			CPU_BASED_CR3_LOAD_EXITING |
+			CPU_BASED_CR3_STORE_EXITING |
+			CPU_BASED_USE_IO_BITMAPS |
+			CPU_BASED_MOV_DR_EXITING |
+			CPU_BASED_USE_TSC_OFFSETING |
+			CPU_BASED_INVLPG_EXITING |
+			CPU_BASED_TPR_SHADOW |
+			CPU_BASED_USE_MSR_BITMAPS |
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+
+		control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+		control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+		*pdata = (CPU_BASED_HLT_EXITING & control) |
+#ifdef CONFIG_X86_64
+			(CPU_BASED_CR8_LOAD_EXITING & control) |
+			(CPU_BASED_CR8_STORE_EXITING & control) |
+#endif
+			(CPU_BASED_CR3_LOAD_EXITING & control) |
+			(CPU_BASED_CR3_STORE_EXITING & control) |
+			(CPU_BASED_USE_IO_BITMAPS & control) |
+			(CPU_BASED_MOV_DR_EXITING & control) |
+			(CPU_BASED_USE_TSC_OFFSETING & control) |
+			(CPU_BASED_INVLPG_EXITING & control) ;
+
+		if (cpu_has_secondary_exec_ctrls())
+			*pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		if (vm_need_tpr_shadow(vcpu->kvm))
+			*pdata |= CPU_BASED_TPR_SHADOW;
+		break;
+	}
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = 0;
+#ifdef CONFIG_X86_64
+		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = 0;
+		if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+			*pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		*pdata = 0;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1005,6 +1102,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	default:
+		if (nested &&
+		    !nested_vmx_get_msr(vcpu, msr_index, &data))
+			break;
 		vmx_load_host_state(to_vmx(vcpu));
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
@@ -1019,6 +1119,27 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 }
 
 /*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		if ((data & (FEATURE_CONTROL_LOCKED |
+			     FEATURE_CONTROL_VMXON_ENABLED))
+		    != (FEATURE_CONTROL_LOCKED |
+			FEATURE_CONTROL_VMXON_ENABLED))
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1064,6 +1185,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		}
 		/* Otherwise falls through to kvm_set_msr_common */
 	default:
+		if (nested &&
+		    !nested_vmx_set_msr(vcpu, msr_index, data))
+			break;
 		vmx_load_host_state(vmx);
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -3095,12 +3219,101 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Check to see if vcpu can execute vmx command
+ * Inject the corrseponding exception
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_msr_entry *msr;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!vmx->nested.vmxon) {
+		printk(KERN_DEBUG "%s: vmx not on\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	msr = find_msr_entry(vmx, MSR_EFER);
+
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+		 ((msr->data & EFER_LMA) && !cs.l)) {
+		printk(KERN_DEBUG "%s: invalid mode cs.l %d lma %llu\n",
+		       __func__, cs.l, msr->data & EFER_LMA);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
 
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmx->nested.vmxon = 0;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested) {
+		printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+#ifdef CONFIG_X86_64
+	if (((find_msr_entry(to_vmx(vcpu),
+			     MSR_EFER)->data & EFER_LMA) && !cs.l)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+#endif
+	if (vmx_get_cpl(vcpu)) {
+		printk(KERN_INFO "%s no permission\n", __func__);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	vmx->nested.vmxon = 1;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3376,8 +3589,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b3a169..9c39092 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,6 +87,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
 	{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -373,7 +377,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return;
 	}
 
-	if (cr4 & X86_CR4_VMXE) {
+	if (cr4 & X86_CR4_VMXE && !nested) {
 		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 		kvm_inject_gp(vcpu, 0);
 		return;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea5..57204cb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
 
+extern int nested;
+
 #endif
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2009-11-09  9:33 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-10-15 14:41 Nested VMX support v3 oritw
2009-10-15 14:41 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
2009-10-15 14:41   ` [PATCH 2/5] Nested VMX patch 2 implements vmclear oritw
2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
2009-10-15 14:41         ` [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume oritw
2009-10-19 17:29           ` Gleb Natapov
2009-10-21 14:43             ` Orit Wasserman
2009-10-22  9:04               ` Gleb Natapov
2009-10-22 15:46                 ` Orit Wasserman
2009-10-25  9:44                   ` Gleb Natapov
2009-10-28 16:23                     ` Orit Wasserman
2009-10-29 17:31                       ` Gleb Natapov
2009-11-09  9:33                         ` Abel Gordon
2009-10-22 10:55               ` Avi Kivity
2009-10-20  4:56           ` Avi Kivity
2009-10-22 12:56             ` Orit Wasserman
2009-10-19 13:17         ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite Gleb Natapov
2009-10-21 13:32           ` Orit Wasserman
2009-10-20  4:44         ` Avi Kivity
2009-10-22 12:50           ` Orit Wasserman
2009-10-19 11:17       ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst Gleb Natapov
2009-10-21 13:27         ` Orit Wasserman
2009-10-19 12:59       ` Gleb Natapov
2009-10-21 13:28         ` Orit Wasserman
2009-10-20  4:24       ` Avi Kivity
2009-10-22 12:48         ` Orit Wasserman
2009-10-20  4:06     ` [PATCH 2/5] Nested VMX patch 2 implements vmclear Avi Kivity
2009-10-21 14:56       ` Orit Wasserman
2009-10-20  4:00   ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
2009-10-22 12:41     ` Orit Wasserman
2009-10-19 10:47 ` Nested VMX support v3 Gleb Natapov
2009-10-20  3:30 ` Avi Kivity
2009-10-21 14:50   ` Orit Wasserman
  -- strict thread matches above, loose matches on Subject: below --
2009-09-30 13:32 Nested VMX support v2 oritw
2009-09-30 13:32 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.