* Nested VMX support - kernel v1
@ 2009-09-02 15:38 oritw
  2009-09-02 15:38 ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff oritw
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: oritw @ 2009-09-02 15:38 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, muli, abelg, aliguori, mmday

The following patches implement nested VMX support. They enable a guest to use
the VMX APIs in order to run its own nested guest, i.e., to run other
hypervisors which use VMX under KVM. The current patches support running Linux
under a nested KVM using shadow page tables (with bypass_guest_pf disabled).
SMP support has been fixed. Reworking EPT support to mesh cleanly with the
current shadow paging design, per Avi's comments, is a work in progress.

The current patches only support a single nested hypervisor, which can only run
a single guest (multiple guests are work in progress). Only 64-bit nested
hypervisors are supported.

Additional patches for running Windows under nested KVM, and Linux under nested
VMware server(!), are currently running in the lab. We are in the process of
forward-porting those patches to -tip.

These patches were written by:
     Orit Wasserman, oritw@il.ibm.com
     Ben-Ami Yassor, benami@il.ibm.com
     Abel Gordon, abelg@il.ibm.com
     Muli Ben-Yehuda, muli@il.ibm.com
     
With contributions by:
     Anthony Liguori, aliguori@us.ibm.com
     Mike Day, mmday@us.ibm.com

This work was inspired by the nested SVM support by Alexander Graf and Joerg
Roedel.  
Changes since v1: 
     Added SMP support.
     Split the series into 6 smaller patches.
     Use a nested_vmx structure for nested parameters.
     Use an array for shadow VMCS offsets.
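
For orientation, the sketch below condenses what the series wires up; it is not
part of the applied patches, and the switch merely stands in for the
kvm_vmx_exit_handlers function-pointer table that the individual patches
update. Each VMX instruction executed by the L1 guest traps to KVM (L0), which
emulates it against per-vcpu software state and then resumes L1:

static int l0_emulate_l1_vmx_insn(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	switch (exit_reason) {
	case EXIT_REASON_VMON:     return handle_vmon(vcpu);     /* patch 1 */
	case EXIT_REASON_VMOFF:    return handle_vmoff(vcpu);    /* patch 1 */
	case EXIT_REASON_VMCLEAR:  return handle_vmclear(vcpu);  /* patch 2 */
	case EXIT_REASON_VMPTRLD:  return handle_vmptrld(vcpu);  /* patch 3 */
	case EXIT_REASON_VMPTRST:  return handle_vmptrst(vcpu);  /* patch 3 */
	case EXIT_REASON_VMREAD:   return handle_vmread(vcpu);   /* patch 4 */
	case EXIT_REASON_VMWRITE:  return handle_vmwrite(vcpu);  /* patch 4 */
	default:                   return handle_vmx_insn(vcpu); /* inject #UD */
	}
}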



* [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff
  2009-09-02 15:38 Nested VMX support - kernel v1 oritw
@ 2009-09-02 15:38 ` oritw
  2009-09-02 15:38   ` [PATCH 2/6] Nested VMX patch 2 implements vmclear oritw
  2009-09-02 19:34   ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
  2009-09-02 15:57 ` Nested VMX support - kernel v1 Alexander Graf
  2009-09-02 21:39 ` Avi Kivity
  2 siblings, 2 replies; 31+ messages in thread
From: oritw @ 2009-09-02 15:38 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, muli, abelg, aliguori, mmday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/svm.c |    3 -
 arch/x86/kvm/vmx.c |  187 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |    6 ++-
 arch/x86/kvm/x86.h |    2 +
 4 files changed, 192 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2df9b45..3c1f22a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -124,9 +124,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 78101dd..abba325 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -67,6 +67,11 @@ struct vmcs {
 	char data[0];
 };
 
+struct nested_vmx {
+	/* Has the level1 guest done vmon? */
+	bool vmon;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
@@ -114,6 +119,9 @@ struct vcpu_vmx {
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
+
+	/* Nested vmx */
+	struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -967,6 +975,69 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 }
 
 /*
+ * Handles msr read for nested virtualization
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+			      u64 *pdata)
+{
+	u32 vmx_msr_low = 0, vmx_msr_high = 0;
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+		*pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
+		break;
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		*pdata = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+			PIN_BASED_VIRTUAL_NMIS;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+		*pdata =  CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+			CPU_BASED_CR8_LOAD_EXITING |
+			CPU_BASED_CR8_STORE_EXITING |
+#endif
+			CPU_BASED_CR3_LOAD_EXITING |
+			CPU_BASED_CR3_STORE_EXITING |
+			CPU_BASED_USE_IO_BITMAPS |
+			CPU_BASED_MOV_DR_EXITING |
+			CPU_BASED_USE_TSC_OFFSETING |
+			CPU_BASED_INVLPG_EXITING;
+
+		if (cpu_has_secondary_exec_ctrls())
+			*pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		if (vm_need_tpr_shadow(vcpu->kvm))
+			*pdata |= CPU_BASED_TPR_SHADOW;
+		break;
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = 0;
+#ifdef CONFIG_X86_64
+		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = 0;
+		if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+			*pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		*pdata = 0;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1005,6 +1076,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	default:
+		if (nested &&
+		    !nested_vmx_get_msr(vcpu, msr_index, &data))
+			break;
 		vmx_load_host_state(to_vmx(vcpu));
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
@@ -1019,6 +1093,27 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 }
 
 /*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		if ((data & (FEATURE_CONTROL_LOCKED |
+			     FEATURE_CONTROL_VMXON_ENABLED))
+		    != (FEATURE_CONTROL_LOCKED |
+			FEATURE_CONTROL_VMXON_ENABLED))
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
 * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1064,6 +1159,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		}
 		/* Otherwise falls through to kvm_set_msr_common */
 	default:
+		if (nested &&
+		    !nested_vmx_set_msr(vcpu, msr_index, data))
+			break;
 		vmx_load_host_state(vmx);
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -3095,12 +3193,97 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Check to see if vcpu can execute vmx command
+ * Inject the corresponding exception
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_msr_entry *msr;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!vmx->nested.vmon) {
+		printk(KERN_DEBUG "%s: vmx not on\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	msr = find_msr_entry(vmx, MSR_EFER);
+
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+		 ((msr->data & EFER_LMA) && !cs.l)) {
+		printk(KERN_DEBUG "%s: invalid mode cs.l %d lma %llu\n",
+		       __func__, cs.l, msr->data & EFER_LMA);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
 
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmx->nested.vmon = 0;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested) {
+		printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+	    ((find_msr_entry(to_vmx(vcpu),
+			     MSR_EFER)->data & EFER_LMA) && !cs.l)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		printk(KERN_INFO "%s no permission\n", __func__);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	vmx->nested.vmon = 1;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3376,8 +3559,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b3a169..9c39092 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,6 +87,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
 	{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -373,7 +377,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return;
 	}
 
-	if (cr4 & X86_CR4_VMXE) {
+	if (cr4 & X86_CR4_VMXE && !nested) {
 		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 		kvm_inject_gp(vcpu, 0);
 		return;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea5..57204cb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
 
+extern int nested;
+
 #endif
-- 
1.6.0.4
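
As an illustration of what this patch enables, the hypothetical (untested)
L1-guest-side sequence below is roughly what an L1 kernel performs before L0's
handle_vmon() is ever reached; with nested=1, kvm_set_cr4() now tolerates
CR4.VMXE, nested_vmx_get_msr()/nested_vmx_set_msr() emulate the feature MSRs,
and the resulting VMXON exit lands in handle_vmon(). vmxon_region is assumed
to be a zeroed, 4 KiB-aligned page whose physical address is vmxon_region_pa.

static int l1_enter_vmx_operation(void *vmxon_region, u64 vmxon_region_pa)
{
	u64 basic, feat;
	u8 fail;

	/* L0's nested_vmx_set_msr() insists on the lock + VMXON-enabled bits. */
	rdmsrl(MSR_IA32_FEATURE_CONTROL, feat);
	if (!(feat & FEATURE_CONTROL_LOCKED))
		wrmsrl(MSR_IA32_FEATURE_CONTROL, feat |
		       FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED);

	/* Bits 30:0 of IA32_VMX_BASIC hold the VMCS revision identifier,
	 * which must be written into the first word of the VMXON region. */
	rdmsrl(MSR_IA32_VMX_BASIC, basic);
	*(u32 *)vmxon_region = (u32)(basic & 0x7fffffff);

	/* Accepted by the kvm_set_cr4() change above when nested=1. */
	write_cr4(read_cr4() | X86_CR4_VMXE);

	asm volatile ("vmxon %1; setna %0"
		      : "=rm"(fail) : "m"(vmxon_region_pa) : "cc", "memory");
	return fail ? -EIO : 0;
}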



* [PATCH 2/6] Nested VMX patch 2 implements vmclear
  2009-09-02 15:38 ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff oritw
@ 2009-09-02 15:38   ` oritw
  2009-09-02 15:38     ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst oritw
  2009-09-02 19:38     ` [PATCH 2/6] Nested VMX patch 2 implements vmclear Avi Kivity
  2009-09-02 19:34   ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
  1 sibling, 2 replies; 31+ messages in thread
From: oritw @ 2009-09-02 15:38 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, muli, abelg, aliguori, mmday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |   24 +++++++++++++++++++++++-
 1 files changed, 23 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index abba325..2b1fc3b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,6 +70,8 @@ struct vmcs {
 struct nested_vmx {
 	/* Has the level1 guest done vmon? */
 	bool vmon;
+	/* Has the level1 guest done vmclear? */
+	bool vmclear;
 };
 
 struct vcpu_vmx {
@@ -3229,6 +3231,26 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+	vmx_set_rflags(vcpu, rflags);
+}
+
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	to_vmx(vcpu)->nested.vmclear = 1;
+
+	skip_emulated_instruction(vcpu);
+	clear_rflags_cf_zf(vcpu);
+
+	return 1;
+}
 
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
@@ -3552,7 +3574,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
+	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
 	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
-- 
1.6.0.4
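
For reference, clear_rflags_cf_zf() implements the architectural "VMsucceed"
convention: after a VMX instruction, CF=1 signals VMfailInvalid, ZF=1 signals
VMfailValid (with an error number in VM_INSTRUCTION_ERROR), and CF=ZF=0 signals
success. A hypothetical L1-side wrapper that consumes the flags this handler
produces could look like:

static int l1_vmclear(u64 vmcs_pa)
{
	u8 fail_invalid, fail_valid;

	asm volatile ("vmclear %2; setc %0; setz %1"
		      : "=rm"(fail_invalid), "=rm"(fail_valid)
		      : "m"(vmcs_pa) : "cc", "memory");
	if (fail_invalid)
		return -EINVAL;	/* CF set: invalid/missing VMCS pointer */
	if (fail_valid)
		return -EIO;	/* ZF set: consult VM_INSTRUCTION_ERROR */
	return 0;		/* CF and ZF clear: VMsucceed */
}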



* [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-02 15:38   ` [PATCH 2/6] Nested VMX patch 2 implements vmclear oritw
@ 2009-09-02 15:38     ` oritw
  2009-09-02 15:38       ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite oritw
  2009-09-02 20:05       ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
  2009-09-02 19:38     ` [PATCH 2/6] Nested VMX patch 2 implements vmclear Avi Kivity
  1 sibling, 2 replies; 31+ messages in thread
From: oritw @ 2009-09-02 15:38 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, muli, abelg, aliguori, mmday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  533 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 523 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b1fc3b..5ab07a0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,151 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+struct __attribute__ ((__packed__)) shadow_vmcs {
+	uint16_t virtual_processor_id;
+	uint16_t guest_es_selector;
+	uint16_t guest_cs_selector;
+	uint16_t guest_ss_selector;
+	uint16_t guest_ds_selector;
+	uint16_t guest_fs_selector;
+	uint16_t guest_gs_selector;
+	uint16_t guest_ldtr_selector;
+	uint16_t guest_tr_selector;
+	uint16_t host_es_selector;
+	uint16_t host_cs_selector;
+	uint16_t host_ss_selector;
+	uint16_t host_ds_selector;
+	uint16_t host_fs_selector;
+	uint16_t host_gs_selector;
+	uint16_t host_tr_selector;
+	uint64_t io_bitmap_a;
+	uint64_t io_bitmap_b;
+	uint64_t msr_bitmap;
+	uint64_t vm_exit_msr_store_addr;
+	uint64_t vm_exit_msr_load_addr;
+	uint64_t vm_entry_msr_load_addr;
+	uint64_t tsc_offset;
+	uint64_t virtual_apic_page_addr;
+	uint64_t apic_access_addr;
+	uint64_t ept_pointer;
+	uint64_t guest_physical_address;
+	uint64_t vmcs_link_pointer;
+	uint64_t guest_ia32_debugctl;
+	uint64_t guest_ia32_pat;
+	uint64_t guest_pdptr0;
+	uint64_t guest_pdptr1;
+	uint64_t guest_pdptr2;
+	uint64_t guest_pdptr3;
+	uint64_t host_ia32_pat;
+	uint32_t pin_based_vm_exec_control;
+	uint32_t cpu_based_vm_exec_control;
+	uint32_t exception_bitmap;
+	uint32_t page_fault_error_code_mask;
+	uint32_t page_fault_error_code_match;
+	uint32_t cr3_target_count;
+	uint32_t vm_exit_controls;
+	uint32_t vm_exit_msr_store_count;
+	uint32_t vm_exit_msr_load_count;
+	uint32_t vm_entry_controls;
+	uint32_t vm_entry_msr_load_count;
+	uint32_t vm_entry_intr_info_field;
+	uint32_t vm_entry_exception_error_code;
+	uint32_t vm_entry_instruction_len;
+	uint32_t tpr_threshold;
+	uint32_t secondary_vm_exec_control;
+	uint32_t vm_instruction_error;
+	uint32_t vm_exit_reason;
+	uint32_t vm_exit_intr_info;
+	uint32_t vm_exit_intr_error_code;
+	uint32_t idt_vectoring_info_field;
+	uint32_t idt_vectoring_error_code;
+	uint32_t vm_exit_instruction_len;
+	uint32_t vmx_instruction_info;
+	uint32_t guest_es_limit;
+	uint32_t guest_cs_limit;
+	uint32_t guest_ss_limit;
+	uint32_t guest_ds_limit;
+	uint32_t guest_fs_limit;
+	uint32_t guest_gs_limit;
+	uint32_t guest_ldtr_limit;
+	uint32_t guest_tr_limit;
+	uint32_t guest_gdtr_limit;
+	uint32_t guest_idtr_limit;
+	uint32_t guest_es_ar_bytes;
+	uint32_t guest_cs_ar_bytes;
+	uint32_t guest_ss_ar_bytes;
+	uint32_t guest_ds_ar_bytes;
+	uint32_t guest_fs_ar_bytes;
+	uint32_t guest_gs_ar_bytes;
+	uint32_t guest_ldtr_ar_bytes;
+	uint32_t guest_tr_ar_bytes;
+	uint32_t guest_interruptibility_info;
+	uint32_t guest_activity_state;
+	uint32_t guest_sysenter_cs;
+	uint32_t host_ia32_sysenter_cs;
+	unsigned long cr0_guest_host_mask;
+	unsigned long cr4_guest_host_mask;
+	unsigned long cr0_read_shadow;
+	unsigned long cr4_read_shadow;
+	unsigned long cr3_target_value0;
+	unsigned long cr3_target_value1;
+	unsigned long cr3_target_value2;
+	unsigned long cr3_target_value3;
+	unsigned long exit_qualification;
+	unsigned long guest_linear_address;
+	unsigned long guest_cr0;
+	unsigned long guest_cr3;
+	unsigned long guest_cr4;
+	unsigned long guest_es_base;
+	unsigned long guest_cs_base;
+	unsigned long guest_ss_base;
+	unsigned long guest_ds_base;
+	unsigned long guest_fs_base;
+	unsigned long guest_gs_base;
+	unsigned long guest_ldtr_base;
+	unsigned long guest_tr_base;
+	unsigned long guest_gdtr_base;
+	unsigned long guest_idtr_base;
+	unsigned long guest_dr7;
+	unsigned long guest_rsp;
+	unsigned long guest_rip;
+	unsigned long guest_rflags;
+	unsigned long guest_pending_dbg_exceptions;
+	unsigned long guest_sysenter_esp;
+	unsigned long guest_sysenter_eip;
+	unsigned long host_cr0;
+	unsigned long host_cr3;
+	unsigned long host_cr4;
+	unsigned long host_fs_base;
+	unsigned long host_gs_base;
+	unsigned long host_tr_base;
+	unsigned long host_gdtr_base;
+	unsigned long host_idtr_base;
+	unsigned long host_ia32_sysenter_esp;
+	unsigned long host_ia32_sysenter_eip;
+	unsigned long host_rsp;
+	unsigned long host_rip;
+};
+
+struct __attribute__ ((__packed__)) level_state {
+	struct shadow_vmcs *shadow_vmcs;
+
+	u16 vpid;
+	u64 shadow_efer;
+	unsigned long cr2;
+	unsigned long cr3;
+	unsigned long cr4;
+	unsigned long cr8;
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+};
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -72,6 +217,17 @@ struct nested_vmx {
 	bool vmon;
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+	/* What is the location of the vmcs l1 keeps for l2? (in level1 gpa) */
+	u64 l1_cur_vmcs;
+	/*
+	 * Level 2 state : includes vmcs,registers and
+	 * a copy of vmcs12 for vmread/vmwrite
+	 */
+	struct level_state *l2_state;
+
+	/* Level 1 state for switching to level 2 and back */
+	struct level_state *l1_state;
+
 };
 
 struct vcpu_vmx {
@@ -131,6 +287,25 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+				    u64 vmcs_addr)
+{
+	struct page *vmcs_page = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(vmcs_page)) {
+		printk(KERN_ERR "%s error allocating page \n", __func__);
+		kvm_release_page_clean(vmcs_page);
+		return NULL;
+	}
+
+	return vmcs_page;
+
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 
@@ -188,6 +363,10 @@ static struct kvm_vmx_segment_field {
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
+static int create_l1_state(struct kvm_vcpu *vcpu);
+static int create_l2_state(struct kvm_vcpu *vcpu);
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu);
+
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
  * away by decrementing the array size.
@@ -704,6 +883,24 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 	preempt_enable();
 }
 
+
+static int vmptrld(struct kvm_vcpu *vcpu,
+		   u64 phys_addr)
+{
+	u8 error;
+
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "cc");
+	if (error) {
+		printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
+		       __func__, phys_addr);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -725,15 +922,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-			      : "cc");
-		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-			       vmx->vmcs, phys_addr);
+		vmptrld(vcpu, phys_addr);
 	}
 
 	if (vcpu->cpu != cpu) {
@@ -3252,6 +3442,115 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
+{
+	gpa_t gpa;
+	struct page *page;
+	int r = 0;
+
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+
+	/* checking guest gpa */
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
+		       __func__, gpa);
+		r = 1;
+		goto out;
+	}
+
+	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
+	if (r) {
+		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
+		       __func__, gpa, r);
+		goto out;
+	}
+
+	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+		printk(KERN_DEBUG "%s addr %llx not aligned\n",
+		       __func__, *gentry);
+		return 1;
+	}
+
+out:
+	kvm_release_page_clean(page);
+	return r;
+}
+
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page;
+	u64 guest_vmcs_addr;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
+		return 1;
+
+	if (create_l1_state(vcpu)) {
+		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+		return 1;
+	}
+
+	if (create_l2_state(vcpu)) {
+		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
+		return 1;
+	}
+
+	vmx->nested.l2_state->vmcs = alloc_vmcs();
+	if (!vmx->nested.l2_state->vmcs) {
+		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
+		return 1;
+	}
+
+	if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
+		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
+		if (vmcs_page == NULL)
+			return 1;
+
+		/* load nested vmcs to processor */
+		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
+			printk(KERN_INFO "%s error in vmptrld \n",
+			       __func__);
+			kvm_release_page_clean(vmcs_page);
+			return 1;
+		}
+
+		/* save nested vmcs in the shadow vmcs */
+		if (shadow_vmcs_load(vcpu)) {
+			kvm_release_page_clean(vmcs_page);
+			return 1;
+		}
+
+		vmx->nested.l1_cur_vmcs = guest_vmcs_addr;
+
+		/* load to previous vmcs */
+		if (vmptrld(vcpu, __pa(to_vmx(vcpu)->vmcs))) {
+			kvm_release_page_clean(vmcs_page);
+				return 1;
+		}
+
+		kvm_release_page_clean(vmcs_page);
+	}
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
@@ -3576,8 +3875,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
@@ -4115,6 +4414,220 @@ static bool vmx_gb_page_enable(void)
 	return false;
 }
 
+void save_vmcs(struct shadow_vmcs *dst)
+{
+	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	if (cpu_has_vmx_msr_bitmap())
+		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
+
+	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
+	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
+	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
+	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
+	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
+	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
+	if (enable_ept)
+		dst->ept_pointer = vmcs_read64(EPT_POINTER);
+
+	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	if (enable_ept) {
+		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+	dst->page_fault_error_code_mask =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
+	dst->page_fault_error_code_match =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
+	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
+	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
+	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
+	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
+	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
+	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	dst->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
+	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	if (enable_vpid && dst->secondary_vm_exec_control &
+	    SECONDARY_EXEC_ENABLE_VPID)
+		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
+	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	dst->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
+	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
+	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
+	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
+	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
+	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
+	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
+	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
+	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
+	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
+	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
+	dst->guest_rsp = vmcs_readl(GUEST_RSP);
+	dst->guest_rip = vmcs_readl(GUEST_RIP);
+	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	dst->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+	dst->host_cr0 = vmcs_readl(HOST_CR0);
+	dst->host_cr3 = vmcs_readl(HOST_CR3);
+	dst->host_cr4 = vmcs_readl(HOST_CR4);
+	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	dst->host_rsp = vmcs_readl(HOST_RSP);
+	dst->host_rip = vmcs_readl(HOST_RIP);
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
+}
+
+static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		vmx->nested.l2_state->shadow_vmcs =
+			kzalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!vmx->nested.l2_state->shadow_vmcs) {
+			printk(KERN_INFO "%s error creating nested vmcs\n",
+			       __func__);
+			return -ENOMEM;
+		}
+	}
+
+	save_vmcs(vmx->nested.l2_state->shadow_vmcs);
+
+	return 0;
+}
+
+struct level_state *create_state(void)
+{
+	struct level_state *state = NULL;
+
+	state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+	if (!state) {
+		printk(KERN_INFO "Error create level state\n");
+		return NULL;
+	}
+	state->shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!state->shadow_vmcs) {
+		printk(KERN_INFO "%s error creating shadow vmcs\n",
+		       __func__);
+		kfree(state);
+		return NULL;
+	}
+	return state;
+}
+
+int create_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state) {
+		vmx->nested.l1_state = create_state();
+		if (!vmx->nested.l1_state)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int create_l2_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l2_state) {
+		vmx->nested.l2_state = create_state();
+		if (!vmx->nested.l2_state)
+			return -ENOMEM;
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l2_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l2_state->msr_bitmap = 0;
+
+	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
-- 
1.6.0.4
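
The core of handle_vmptrld() above is the VMCS juggling forced by the fact that
the processor has only one current VMCS: L1's VMCS is made current just long
enough for save_vmcs() to copy it into the software shadow, and then L0's own
VMCS is reloaded. Condensed (and slightly simplified) from the handler:

static int cache_l1_vmcs(struct kvm_vcpu *vcpu, u64 l1_vmcs_gpa)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct page *page = nested_get_page(vcpu, l1_vmcs_gpa);
	int r = -EIO;

	if (!page)
		return -EINVAL;
	if (vmptrld(vcpu, page_to_phys(page)))		/* L1's VMCS current */
		goto out;
	save_vmcs(vmx->nested.l2_state->shadow_vmcs);	/* copy to software */
	if (vmptrld(vcpu, __pa(vmx->vmcs)))		/* back to L0's VMCS */
		goto out;
	r = 0;
out:
	kvm_release_page_clean(page);
	return r;
}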



* [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite
  2009-09-02 15:38     ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst oritw
@ 2009-09-02 15:38       ` oritw
  2009-09-02 15:38         ` [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume oritw
  2009-09-02 20:15         ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
  2009-09-02 20:05       ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
  1 sibling, 2 replies; 31+ messages in thread
From: oritw @ 2009-09-02 15:38 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, muli, abelg, aliguori, mmday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  556 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 554 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5ab07a0..2453c67 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -206,6 +206,21 @@ struct __attribute__ ((__packed__)) level_state {
 	int launched;
 };
 
+enum vmcs_field_type {
+	VMCS_FIELD_TYPE_U16 = 0,
+	VMCS_FIELD_TYPE_U64 = 1,
+	VMCS_FIELD_TYPE_U32 = 2,
+	VMCS_FIELD_TYPE_ULONG = 3
+};
+
+#define VMCS_FIELD_LENGTH_OFFSET 13
+#define VMCS_FIELD_LENGTH_MASK 0x6000
+
+static inline int vmcs_field_length(unsigned long field)
+{
+	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
+}
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -287,6 +302,411 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static unsigned short vmcs_field_to_offset_table[HOST_RIP+1];
+
+static void init_vmcs_field_to_offset_table(void)
+{
+	memset(vmcs_field_to_offset_table,0xff,
+	       sizeof(vmcs_field_to_offset_table));
+
+	vmcs_field_to_offset_table[VIRTUAL_PROCESSOR_ID] =
+		offsetof(struct shadow_vmcs, virtual_processor_id);
+	vmcs_field_to_offset_table[GUEST_ES_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_es_selector);
+	vmcs_field_to_offset_table[GUEST_CS_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_cs_selector);
+	vmcs_field_to_offset_table[GUEST_SS_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_ss_selector);
+	vmcs_field_to_offset_table[GUEST_DS_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_ds_selector);
+	vmcs_field_to_offset_table[GUEST_FS_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_fs_selector);
+	vmcs_field_to_offset_table[GUEST_GS_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_gs_selector);
+	vmcs_field_to_offset_table[GUEST_LDTR_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_ldtr_selector);
+	vmcs_field_to_offset_table[GUEST_TR_SELECTOR] =
+		offsetof(struct shadow_vmcs, guest_tr_selector);
+	vmcs_field_to_offset_table[HOST_ES_SELECTOR] =
+		offsetof(struct shadow_vmcs, host_es_selector);
+	vmcs_field_to_offset_table[HOST_CS_SELECTOR] =
+		offsetof(struct shadow_vmcs, host_cs_selector);
+	vmcs_field_to_offset_table[HOST_SS_SELECTOR] =
+		offsetof(struct shadow_vmcs, host_ss_selector);
+	vmcs_field_to_offset_table[HOST_DS_SELECTOR] =
+		offsetof(struct shadow_vmcs, host_ds_selector);
+	vmcs_field_to_offset_table[HOST_FS_SELECTOR] =
+		offsetof(struct shadow_vmcs, host_fs_selector);
+	vmcs_field_to_offset_table[HOST_GS_SELECTOR] =
+		offsetof(struct shadow_vmcs, host_gs_selector);
+	vmcs_field_to_offset_table[HOST_TR_SELECTOR] =
+		offsetof(struct shadow_vmcs, host_tr_selector);
+	vmcs_field_to_offset_table[IO_BITMAP_A] =
+		offsetof(struct shadow_vmcs, io_bitmap_a);
+	vmcs_field_to_offset_table[IO_BITMAP_A_HIGH] =
+		offsetof(struct shadow_vmcs, io_bitmap_a)+4;
+	vmcs_field_to_offset_table[IO_BITMAP_B] =
+		offsetof(struct shadow_vmcs, io_bitmap_b);
+	vmcs_field_to_offset_table[IO_BITMAP_B_HIGH] =
+		offsetof(struct shadow_vmcs, io_bitmap_b)+4;
+	vmcs_field_to_offset_table[MSR_BITMAP] =
+		offsetof(struct shadow_vmcs, msr_bitmap);
+	vmcs_field_to_offset_table[MSR_BITMAP_HIGH] =
+		offsetof(struct shadow_vmcs, msr_bitmap)+4;
+	vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_ADDR] =
+		offsetof(struct shadow_vmcs, vm_exit_msr_store_addr);
+	vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_ADDR_HIGH] =
+		offsetof(struct shadow_vmcs, vm_exit_msr_store_addr)+4;
+	vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_ADDR] =
+		offsetof(struct shadow_vmcs, vm_exit_msr_load_addr);
+	vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_ADDR_HIGH] =
+		offsetof(struct shadow_vmcs, vm_exit_msr_load_addr)+4;
+	vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_ADDR] =
+		offsetof(struct shadow_vmcs, vm_entry_msr_load_addr);
+	vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_ADDR_HIGH] =
+		offsetof(struct shadow_vmcs, vm_entry_msr_load_addr)+4;
+	vmcs_field_to_offset_table[TSC_OFFSET] =
+		offsetof(struct shadow_vmcs, tsc_offset);
+	vmcs_field_to_offset_table[TSC_OFFSET_HIGH] =
+		offsetof(struct shadow_vmcs, tsc_offset)+4;
+	vmcs_field_to_offset_table[VIRTUAL_APIC_PAGE_ADDR] =
+		offsetof(struct shadow_vmcs, virtual_apic_page_addr);
+	vmcs_field_to_offset_table[VIRTUAL_APIC_PAGE_ADDR_HIGH] =
+		offsetof(struct shadow_vmcs, virtual_apic_page_addr)+4;
+	vmcs_field_to_offset_table[APIC_ACCESS_ADDR] =
+		offsetof(struct shadow_vmcs, apic_access_addr);
+	vmcs_field_to_offset_table[APIC_ACCESS_ADDR_HIGH] =
+		offsetof(struct shadow_vmcs, apic_access_addr)+4;
+	vmcs_field_to_offset_table[EPT_POINTER] =
+		offsetof(struct shadow_vmcs, ept_pointer);
+	vmcs_field_to_offset_table[EPT_POINTER_HIGH] =
+		offsetof(struct shadow_vmcs, ept_pointer)+4;
+	vmcs_field_to_offset_table[GUEST_PHYSICAL_ADDRESS] =
+		offsetof(struct shadow_vmcs, guest_physical_address);
+	vmcs_field_to_offset_table[GUEST_PHYSICAL_ADDRESS_HIGH] =
+		offsetof(struct shadow_vmcs, guest_physical_address)+4;
+	vmcs_field_to_offset_table[VMCS_LINK_POINTER] =
+		offsetof(struct shadow_vmcs, vmcs_link_pointer);
+	vmcs_field_to_offset_table[VMCS_LINK_POINTER_HIGH] =
+		offsetof(struct shadow_vmcs, vmcs_link_pointer)+4;
+	vmcs_field_to_offset_table[GUEST_IA32_DEBUGCTL] =
+		offsetof(struct shadow_vmcs, guest_ia32_debugctl);
+	vmcs_field_to_offset_table[GUEST_IA32_DEBUGCTL_HIGH] =
+		offsetof(struct shadow_vmcs, guest_ia32_debugctl)+4;
+	vmcs_field_to_offset_table[GUEST_IA32_PAT] =
+		offsetof(struct shadow_vmcs, guest_ia32_pat);
+	vmcs_field_to_offset_table[GUEST_IA32_PAT_HIGH] =
+		offsetof(struct shadow_vmcs, guest_ia32_pat)+4;
+	vmcs_field_to_offset_table[GUEST_PDPTR0] =
+		offsetof(struct shadow_vmcs, guest_pdptr0);
+	vmcs_field_to_offset_table[GUEST_PDPTR0_HIGH] =
+		offsetof(struct shadow_vmcs, guest_pdptr0)+4;
+	vmcs_field_to_offset_table[GUEST_PDPTR1] =
+		offsetof(struct shadow_vmcs, guest_pdptr1);
+	vmcs_field_to_offset_table[GUEST_PDPTR1_HIGH] =
+		offsetof(struct shadow_vmcs, guest_pdptr1)+4;
+	vmcs_field_to_offset_table[GUEST_PDPTR2] =
+		offsetof(struct shadow_vmcs, guest_pdptr2);
+	vmcs_field_to_offset_table[GUEST_PDPTR2_HIGH] =
+		offsetof(struct shadow_vmcs, guest_pdptr2)+4;
+	vmcs_field_to_offset_table[GUEST_PDPTR3] =
+		offsetof(struct shadow_vmcs, guest_pdptr3);
+	vmcs_field_to_offset_table[GUEST_PDPTR3_HIGH] =
+		offsetof(struct shadow_vmcs, guest_pdptr3)+4;
+	vmcs_field_to_offset_table[HOST_IA32_PAT] =
+		offsetof(struct shadow_vmcs, host_ia32_pat);
+	vmcs_field_to_offset_table[HOST_IA32_PAT_HIGH] =
+		offsetof(struct shadow_vmcs, host_ia32_pat)+4;
+	vmcs_field_to_offset_table[PIN_BASED_VM_EXEC_CONTROL] =
+		offsetof(struct shadow_vmcs, pin_based_vm_exec_control);
+	vmcs_field_to_offset_table[CPU_BASED_VM_EXEC_CONTROL] =
+		offsetof(struct shadow_vmcs, cpu_based_vm_exec_control);
+	vmcs_field_to_offset_table[EXCEPTION_BITMAP] =
+		offsetof(struct shadow_vmcs, exception_bitmap);
+	vmcs_field_to_offset_table[PAGE_FAULT_ERROR_CODE_MASK] =
+		offsetof(struct shadow_vmcs, page_fault_error_code_mask);
+	vmcs_field_to_offset_table[PAGE_FAULT_ERROR_CODE_MATCH] =
+		offsetof(struct shadow_vmcs,
+				page_fault_error_code_match);
+	vmcs_field_to_offset_table[CR3_TARGET_COUNT] =
+		offsetof(struct shadow_vmcs, cr3_target_count);
+	vmcs_field_to_offset_table[VM_EXIT_CONTROLS] =
+		offsetof(struct shadow_vmcs, vm_exit_controls);
+	vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_COUNT] =
+		offsetof(struct shadow_vmcs, vm_exit_msr_store_count);
+	vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_COUNT] =
+		offsetof(struct shadow_vmcs, vm_exit_msr_load_count);
+	vmcs_field_to_offset_table[VM_ENTRY_CONTROLS] =
+		offsetof(struct shadow_vmcs, vm_entry_controls);
+	vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_COUNT] =
+		offsetof(struct shadow_vmcs, vm_entry_msr_load_count);
+	vmcs_field_to_offset_table[VM_ENTRY_INTR_INFO_FIELD] =
+		offsetof(struct shadow_vmcs, vm_entry_intr_info_field);
+	vmcs_field_to_offset_table[VM_ENTRY_EXCEPTION_ERROR_CODE] =
+		offsetof(struct shadow_vmcs,
+				vm_entry_exception_error_code);
+	vmcs_field_to_offset_table[VM_ENTRY_INSTRUCTION_LEN] =
+		offsetof(struct shadow_vmcs, vm_entry_instruction_len);
+	vmcs_field_to_offset_table[TPR_THRESHOLD] =
+		offsetof(struct shadow_vmcs, tpr_threshold);
+	vmcs_field_to_offset_table[SECONDARY_VM_EXEC_CONTROL] =
+		offsetof(struct shadow_vmcs, secondary_vm_exec_control);
+	vmcs_field_to_offset_table[VM_INSTRUCTION_ERROR] =
+		offsetof(struct shadow_vmcs, vm_instruction_error);
+	vmcs_field_to_offset_table[VM_EXIT_REASON] =
+		offsetof(struct shadow_vmcs, vm_exit_reason);
+	vmcs_field_to_offset_table[VM_EXIT_INTR_INFO] =
+		offsetof(struct shadow_vmcs, vm_exit_intr_info);
+	vmcs_field_to_offset_table[VM_EXIT_INTR_ERROR_CODE] =
+		offsetof(struct shadow_vmcs, vm_exit_intr_error_code);
+	vmcs_field_to_offset_table[IDT_VECTORING_INFO_FIELD] =
+		offsetof(struct shadow_vmcs, idt_vectoring_info_field);
+	vmcs_field_to_offset_table[IDT_VECTORING_ERROR_CODE] =
+		offsetof(struct shadow_vmcs, idt_vectoring_error_code);
+	vmcs_field_to_offset_table[VM_EXIT_INSTRUCTION_LEN] =
+		offsetof(struct shadow_vmcs, vm_exit_instruction_len);
+	vmcs_field_to_offset_table[VMX_INSTRUCTION_INFO] =
+		offsetof(struct shadow_vmcs, vmx_instruction_info);
+	vmcs_field_to_offset_table[GUEST_ES_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_es_limit);
+	vmcs_field_to_offset_table[GUEST_CS_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_cs_limit);
+	vmcs_field_to_offset_table[GUEST_SS_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_ss_limit);
+	vmcs_field_to_offset_table[GUEST_DS_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_ds_limit);
+	vmcs_field_to_offset_table[GUEST_FS_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_fs_limit);
+	vmcs_field_to_offset_table[GUEST_GS_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_gs_limit);
+	vmcs_field_to_offset_table[GUEST_LDTR_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_ldtr_limit);
+	vmcs_field_to_offset_table[GUEST_TR_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_tr_limit);
+	vmcs_field_to_offset_table[GUEST_GDTR_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_gdtr_limit);
+	vmcs_field_to_offset_table[GUEST_IDTR_LIMIT] =
+		offsetof(struct shadow_vmcs, guest_idtr_limit);
+	vmcs_field_to_offset_table[GUEST_ES_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_es_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_CS_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_cs_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_SS_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_ss_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_DS_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_ds_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_FS_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_fs_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_GS_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_gs_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_LDTR_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_ldtr_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_TR_AR_BYTES] =
+		offsetof(struct shadow_vmcs, guest_tr_ar_bytes);
+	vmcs_field_to_offset_table[GUEST_INTERRUPTIBILITY_INFO] =
+		offsetof(struct shadow_vmcs,
+				guest_interruptibility_info);
+	vmcs_field_to_offset_table[GUEST_ACTIVITY_STATE] =
+		offsetof(struct shadow_vmcs, guest_activity_state);
+	vmcs_field_to_offset_table[GUEST_SYSENTER_CS] =
+		offsetof(struct shadow_vmcs, guest_sysenter_cs);
+	vmcs_field_to_offset_table[HOST_IA32_SYSENTER_CS] =
+		offsetof(struct shadow_vmcs, host_ia32_sysenter_cs);
+	vmcs_field_to_offset_table[CR0_GUEST_HOST_MASK] =
+		offsetof(struct shadow_vmcs, cr0_guest_host_mask);
+	vmcs_field_to_offset_table[CR4_GUEST_HOST_MASK] =
+		offsetof(struct shadow_vmcs, cr4_guest_host_mask);
+	vmcs_field_to_offset_table[CR0_READ_SHADOW] =
+		offsetof(struct shadow_vmcs, cr0_read_shadow);
+	vmcs_field_to_offset_table[CR4_READ_SHADOW] =
+		offsetof(struct shadow_vmcs, cr4_read_shadow);
+	vmcs_field_to_offset_table[CR3_TARGET_VALUE0] =
+		offsetof(struct shadow_vmcs, cr3_target_value0);
+	vmcs_field_to_offset_table[CR3_TARGET_VALUE1] =
+		offsetof(struct shadow_vmcs, cr3_target_value1);
+	vmcs_field_to_offset_table[CR3_TARGET_VALUE2] =
+		offsetof(struct shadow_vmcs, cr3_target_value2);
+	vmcs_field_to_offset_table[CR3_TARGET_VALUE3] =
+		offsetof(struct shadow_vmcs, cr3_target_value3);
+	vmcs_field_to_offset_table[EXIT_QUALIFICATION] =
+		offsetof(struct shadow_vmcs, exit_qualification);
+	vmcs_field_to_offset_table[GUEST_LINEAR_ADDRESS] =
+		offsetof(struct shadow_vmcs, guest_linear_address);
+	vmcs_field_to_offset_table[GUEST_CR0] =
+		offsetof(struct shadow_vmcs, guest_cr0);
+	vmcs_field_to_offset_table[GUEST_CR3] =
+		offsetof(struct shadow_vmcs, guest_cr3);
+	vmcs_field_to_offset_table[GUEST_CR4] =
+		offsetof(struct shadow_vmcs, guest_cr4);
+	vmcs_field_to_offset_table[GUEST_ES_BASE] =
+		offsetof(struct shadow_vmcs, guest_es_base);
+	vmcs_field_to_offset_table[GUEST_CS_BASE] =
+		offsetof(struct shadow_vmcs, guest_cs_base);
+	vmcs_field_to_offset_table[GUEST_SS_BASE] =
+		offsetof(struct shadow_vmcs, guest_ss_base);
+	vmcs_field_to_offset_table[GUEST_DS_BASE] =
+		offsetof(struct shadow_vmcs, guest_ds_base);
+	vmcs_field_to_offset_table[GUEST_FS_BASE] =
+		offsetof(struct shadow_vmcs, guest_fs_base);
+	vmcs_field_to_offset_table[GUEST_GS_BASE] =
+		offsetof(struct shadow_vmcs, guest_gs_base);
+	vmcs_field_to_offset_table[GUEST_LDTR_BASE] =
+		offsetof(struct shadow_vmcs, guest_ldtr_base);
+	vmcs_field_to_offset_table[GUEST_TR_BASE] =
+		offsetof(struct shadow_vmcs, guest_tr_base);
+	vmcs_field_to_offset_table[GUEST_GDTR_BASE] =
+		offsetof(struct shadow_vmcs, guest_gdtr_base);
+	vmcs_field_to_offset_table[GUEST_IDTR_BASE] =
+		offsetof(struct shadow_vmcs, guest_idtr_base);
+	vmcs_field_to_offset_table[GUEST_DR7] =
+		offsetof(struct shadow_vmcs, guest_dr7);
+	vmcs_field_to_offset_table[GUEST_RSP] =
+		offsetof(struct shadow_vmcs, guest_rsp);
+	vmcs_field_to_offset_table[GUEST_RIP] =
+		offsetof(struct shadow_vmcs, guest_rip);
+	vmcs_field_to_offset_table[GUEST_RFLAGS] =
+		offsetof(struct shadow_vmcs, guest_rflags);
+	vmcs_field_to_offset_table[GUEST_PENDING_DBG_EXCEPTIONS] =
+		offsetof(struct shadow_vmcs,
+				guest_pending_dbg_exceptions);
+	vmcs_field_to_offset_table[GUEST_SYSENTER_ESP] =
+		offsetof(struct shadow_vmcs, guest_sysenter_esp);
+	vmcs_field_to_offset_table[GUEST_SYSENTER_EIP] =
+		offsetof(struct shadow_vmcs, guest_sysenter_eip);
+	vmcs_field_to_offset_table[HOST_CR0] =
+		offsetof(struct shadow_vmcs, host_cr0);
+	vmcs_field_to_offset_table[HOST_CR3] =
+		offsetof(struct shadow_vmcs, host_cr3);
+	vmcs_field_to_offset_table[HOST_CR4] =
+		offsetof(struct shadow_vmcs, host_cr4);
+	vmcs_field_to_offset_table[HOST_FS_BASE] =
+		offsetof(struct shadow_vmcs, host_fs_base);
+	vmcs_field_to_offset_table[HOST_GS_BASE] =
+		offsetof(struct shadow_vmcs, host_gs_base);
+	vmcs_field_to_offset_table[HOST_TR_BASE] =
+		offsetof(struct shadow_vmcs, host_tr_base);
+	vmcs_field_to_offset_table[HOST_GDTR_BASE] =
+		offsetof(struct shadow_vmcs, host_gdtr_base);
+	vmcs_field_to_offset_table[HOST_IDTR_BASE] =
+		offsetof(struct shadow_vmcs, host_idtr_base);
+	vmcs_field_to_offset_table[HOST_IA32_SYSENTER_ESP] =
+		offsetof(struct shadow_vmcs, host_ia32_sysenter_esp);
+	vmcs_field_to_offset_table[HOST_IA32_SYSENTER_EIP] =
+		offsetof(struct shadow_vmcs, host_ia32_sysenter_eip);
+	vmcs_field_to_offset_table[HOST_RSP] =
+		offsetof(struct shadow_vmcs, host_rsp);
+	vmcs_field_to_offset_table[HOST_RIP] =
+		offsetof(struct shadow_vmcs, host_rip);
+}
+
+static inline unsigned short vmcs_field_to_offset(unsigned long field)
+{
+
+	if (field > HOST_RIP || vmcs_field_to_offset_table[field] == -1) {
+		printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
+		return -1;
+	}
+
+	return vmcs_field_to_offset_table[field];
+}
+
+static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
+					      unsigned long field)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long *entry;
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return -1;
+	}
+
+	entry = (unsigned long*)((char*)(vmx->nested.l2_state->shadow_vmcs) +
+				 vmcs_field_to_offset(field));
+	return *entry;
+}
+
+static inline u16 nested_vmcs_read16(struct kvm_vcpu *vcpu,
+				     unsigned long field)
+{
+	return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u32 nested_vmcs_read32(struct kvm_vcpu *vcpu, unsigned long field)
+{
+	return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu, unsigned long field)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 *entry;
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return -1;
+	}
+
+	entry = (u64*)((char*)(vmx->nested.l2_state->shadow_vmcs) +
+				 vmcs_field_to_offset(field));
+	return *entry;
+}
+
+static inline void nested_vmcs_writel(struct kvm_vcpu *vcpu,
+				      unsigned long field, unsigned long value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry = 
+		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(unsigned long *)entry = value;
+}
+
+static inline void nested_vmcs_write16(struct kvm_vcpu *vcpu,
+				       unsigned long field, u16 value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry = 
+		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(u16 *)entry = value;
+}
+
+static inline void nested_vmcs_write32(struct kvm_vcpu *vcpu,
+				       unsigned long field, u32 value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry = 
+		(unsigned long)(vmx->nested.l2_state->shadow_vmcs);
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(u32 *)entry = value;
+}
+
+static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
+				       unsigned long field, u64 value)
+{
+#ifdef CONFIG_X86_64
+	nested_vmcs_writel(vcpu, field, value);
+#else /* nested: 32 bit not actually tested */
+	nested_vmcs_writel(vcpu, field, value);
+	nested_vmcs_writel(vcpu, field+1, value >> 32);
+#endif
+}
+
 static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 				    u64 vmcs_addr)
 {
@@ -3429,6 +3849,26 @@ static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
 	vmx_set_rflags(vcpu, rflags);
 }
 
+static void set_rflags_to_vmx_fail_invalid(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_CF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+}
+
+static void set_rflags_to_vmx_fail_valid(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_ZF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+}
+
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
 	if (!nested_vmx_check_permission(vcpu))
@@ -3557,6 +3997,116 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+#ifndef CONFIG_X86_64
+	u64 value;
+#endif
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_INFO "%s no shadow vmcs\n", __func__);
+		set_rflags_to_vmx_fail_invalid(vcpu);
+		return 1;
+	}
+
+	switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
+	case VMCS_FIELD_TYPE_U16:
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+			nested_vmcs_read16(vcpu,
+					   vcpu->arch.regs[VCPU_REGS_RDX]);
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+			nested_vmcs_read32(vcpu,
+					   vcpu->arch.regs[VCPU_REGS_RDX]);
+		break;
+	case VMCS_FIELD_TYPE_U64:
+#ifdef CONFIG_X86_64
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+		nested_vmcs_read64(vcpu,
+					   vcpu->arch.regs[VCPU_REGS_RDX]);
+#else /* nested: 32 bit not actually tested */
+		value =  nested_vmcs_read64(vcpu,
+					    vcpu->arch.regs[VCPU_REGS_RDX]);
+		vcpu->arch.regs[VCPU_REGS_RAX] = value;
+		vcpu->arch.regs[VCPU_REGS_RBX] = value >> 32;
+#endif
+	break;
+	case VMCS_FIELD_TYPE_ULONG:
+		vcpu->arch.regs[VCPU_REGS_RAX] =
+			nested_vmcs_readl(vcpu,
+					  vcpu->arch.regs[VCPU_REGS_RDX]);
+		break;
+	default:
+		printk(KERN_INFO "%s invalid field\n", __func__);
+		set_rflags_to_vmx_fail_valid(vcpu);
+		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+		return 1;
+	}
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifndef CONFIG_X86_64
+	u64 value ;
+#endif
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!vmx->nested.l2_state->shadow_vmcs) {
+		printk(KERN_INFO "%s no shadow vmcs\n", __func__);
+		set_rflags_to_vmx_fail_invalid(vcpu);
+		return 1;
+	}
+
+	switch (vmcs_field_length(vcpu->arch.regs[VCPU_REGS_RDX])) {
+	case VMCS_FIELD_TYPE_U16:
+		nested_vmcs_write16(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				    vcpu->arch.regs[VCPU_REGS_RAX]);
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		nested_vmcs_write32(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				    vcpu->arch.regs[VCPU_REGS_RAX]);
+		break;
+	case VMCS_FIELD_TYPE_U64:
+#ifdef CONFIG_X86_64
+		nested_vmcs_write64(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				    vcpu->arch.regs[VCPU_REGS_RAX]);
+#else /* nested: 32 bit not actually tested */
+		value =  vcpu->arch.regs[VCPU_REGS_RAX] |
+			(vcpu->arch.regs[VCPU_REGS_RBX] << 32);
+		nested_vmcs_write64(vcpu,
+				    vcpu->arch.regs[VCPU_REGS_RDX], value);
+#endif
+		break;
+	case VMCS_FIELD_TYPE_ULONG:
+		nested_vmcs_writel(vcpu, vcpu->arch.regs[VCPU_REGS_RDX],
+				   vcpu->arch.regs[VCPU_REGS_RAX]);
+		break;
+	default:
+		printk(KERN_INFO "%s invalid field\n", __func__);
+		set_rflags_to_vmx_fail_valid(vcpu);
+		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+		return 1;
+	}
+
+	clear_rflags_cf_zf(vcpu);
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3877,9 +4427,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
-	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
+	[EXIT_REASON_VMREAD]                  = handle_vmread,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
-	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
+	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
@@ -4625,6 +5175,8 @@ int create_l2_state(struct kvm_vcpu *vcpu)
 	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
 	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
 
+	init_vmcs_field_to_offset_table();
+
 	return 0;
 }
 
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-09-02 15:38       ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite oritw
@ 2009-09-02 15:38         ` oritw
  2009-09-02 21:38           ` Avi Kivity
  2009-09-02 20:15         ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
  1 sibling, 1 reply; 31+ messages in thread
From: oritw @ 2009-09-02 15:38 UTC (permalink / raw)
  To: kvm; +Cc: oritw, benami, muli, abelg, aliguori, mmday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c | 1142 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 1130 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 62139b5..a7a62df 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -221,17 +221,30 @@ static inline int vmcs_field_length(unsigned long field)
 	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
 }
 
+#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
+					VM_EXIT_SAVE_IA32_PAT))
+#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
+					 VM_ENTRY_IA32E_MODE))
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
 	char data[0];
 };
 
-struct nested_vmx {
-	/* Has the level1 guest done vmon? */
+struct nested_vmx {	/* Has the level1 guest done vmon? */
 	bool vmon;
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+
+	/* Are we running nested guest */
+	bool nested_mode;
+
+	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+	bool nested_run_pending;
+
+	/* flag indicating if there was a valid IDT after exiting from l2 */
+	bool nested_pending_valid_idt;
+
 	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
 	u64 l1_cur_vmcs;
 	/*
@@ -704,6 +717,53 @@ static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
 #endif
 }
 
+
+static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
+		cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW;
+}
+
+static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->cpu_based_vm_exec_control &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
+							   *vcpu)
+{
+	struct shadow_vmcs *shadow = to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+
+	return (shadow->secondary_vm_exec_control &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+		to_vmx(vcpu)->nested.l2_state->shadow_vmcs->apic_access_addr != 0;
+}
+
+static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
+		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->secondary_vm_exec_control &
+		SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->vm_entry_controls &
+		VM_ENTRY_LOAD_IA32_PAT;
+}
+
+static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->cpu_based_vm_exec_control &
+		CPU_BASED_USE_MSR_BITMAPS;
+}
+
 static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 				    u64 vmcs_addr)
 {
@@ -779,9 +839,16 @@ static struct kvm_vmx_segment_field {
 };
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
-
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code);
+static int nested_vmx_intr(struct kvm_vcpu *vcpu);
 static int create_l1_state(struct kvm_vcpu *vcpu);
 static int create_l2_state(struct kvm_vcpu *vcpu);
+static int launch_guest(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt);
 static int shadow_vmcs_load(struct kvm_vcpu *vcpu);
 
 /*
@@ -899,6 +966,18 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
 	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
 }
 
+static inline int is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_nmi(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
 static inline int cpu_has_vmx_invept_individual_addr(void)
 {
 	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -1460,6 +1539,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -3238,6 +3320,14 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (nested_vmx_intr(vcpu))
+				return;
+		}
+		return;
+	}
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -3289,10 +3379,25 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
 }
 
+static void vmx_set_irq(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode)
+		return;
+
+	if (nested_vmx_intr(vcpu))
+		return;
+
+	vmx_inject_irq(vcpu);
+}
+
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vmx->nested.nested_mode) {
+		return;
+	}
+
 	if (!cpu_has_virtual_nmis()) {
 		/*
 		 * Tracking the NMI-blocked state in software is built upon
@@ -3334,6 +3439,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (!nested_vmx_intr(vcpu))
+				return 0;
+		}
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3988,10 +4100,25 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
 {
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+	if (!to_vmx(vcpu)->nested.vmclear)
+		return 1;
+
+	return launch_guest(vcpu);
+}
+
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (to_vmx(vcpu)->nested.vmclear)
+		return 1;
+
+	return launch_guest(vcpu);
 }
 
 static int handle_vmread(struct kvm_vcpu *vcpu)
@@ -4421,11 +4548,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmread,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
@@ -4452,6 +4579,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
 	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
+		nested_vmx_vmexit(vcpu, false);
+		return 1;
+	}
 
 	/* If we need to emulate an MMIO from handle_invalid_guest_state
 	 * we just return 0 */
@@ -4481,7 +4618,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
 
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (!vmx->nested.nested_mode && unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4528,10 +4665,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	int type;
 	bool idtv_info_valid;
 
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
+	if (vmx->nested.nested_mode)
+		return;
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
 	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
@@ -4634,6 +4774,62 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int irq;
+	int type;
+	int errCodeValid;
+	u32 idt_vectoring_info;
+	u32 guest_intr;
+	bool nmi_window_open;
+	bool interrupt_window_open;
+
+	if (vmx->nested.nested_mode && vmx->nested.nested_pending_valid_idt) {
+		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+		irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+		type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+		errCodeValid = idt_vectoring_info &
+			VECTORING_INFO_DELIVER_CODE_MASK;
+
+		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+		nmi_window_open =
+			!(guest_intr & (GUEST_INTR_STATE_STI |
+					GUEST_INTR_STATE_MOV_SS |
+					GUEST_INTR_STATE_NMI));
+
+		interrupt_window_open =
+			((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+			 !(guest_intr & (GUEST_INTR_STATE_STI |
+					 GUEST_INTR_STATE_MOV_SS)));
+
+		if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
+			return 0;
+		}
+
+		if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
+			return 0;
+		}
+
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			irq | type | INTR_INFO_VALID_MASK | errCodeValid);
+
+
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+		if (errCodeValid)
+			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+				     vmcs_read32(IDT_VECTORING_ERROR_CODE));
+
+		return 1;
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
@@ -4646,6 +4842,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	nested_handle_pending_idt(vcpu);
+
+	if (vmx->nested.nested_mode) {
+		vmcs_writel(GUEST_CR0, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
+		vmcs_write32(EXCEPTION_BITMAP, vmx->nested.l2_state->shadow_vmcs->
+			     exception_bitmap |
+			     vmx->nested.l1_state->shadow_vmcs->exception_bitmap);
+	}
+
 	if (enable_ept && is_paging(vcpu)) {
 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
 		ept_load_pdptrs(vcpu);
@@ -4783,12 +4988,19 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	get_debugreg(vcpu->arch.dr6, 6);
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+	vmx->nested.nested_pending_valid_idt = vmx->nested.nested_mode &&
+		(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
+	if (vmx->nested.nested_mode)
+		vmx->nested.vmclear = 0;
+
 	vmx_complete_interrupts(vmx);
 }
 
@@ -4871,6 +5083,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->nested.l1_cur_vmcs = 0;
+
+	vmx->nested.l1_state = NULL;
+	vmx->nested.l2_state = NULL;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -5122,6 +5339,228 @@ static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *l2_shadow_vmcs =
+		to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+	struct shadow_vmcs *l1_shadow_vmcs =
+		to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
+
+	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+
+	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
+	l2_shadow_vmcs->guest_physical_address =
+		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	l2_shadow_vmcs->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	l2_shadow_vmcs->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	l2_shadow_vmcs->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vm_instruction_error =
+		vmcs_read32(VM_INSTRUCTION_ERROR);
+	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	l2_shadow_vmcs->vm_exit_intr_error_code =
+		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	l2_shadow_vmcs->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	l2_shadow_vmcs->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	l2_shadow_vmcs->vm_exit_instruction_len =
+		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vmx_instruction_info =
+		vmcs_read32(VMX_INSTRUCTION_INFO);
+	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	l2_shadow_vmcs->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	l2_shadow_vmcs->guest_activity_state =
+		vmcs_read32(GUEST_ACTIVITY_STATE);
+	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	l1_shadow_vmcs->host_ia32_sysenter_cs =
+		vmcs_read32(HOST_IA32_SYSENTER_CS);
+
+	l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
+	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
+	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
+	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
+	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	l2_shadow_vmcs->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+	l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
+	l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
+	l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
+	l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	l1_shadow_vmcs->host_ia32_sysenter_esp =
+		vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	l1_shadow_vmcs->host_ia32_sysenter_eip =
+		vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
+	l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
+}
+
+int load_vmcs_common(struct shadow_vmcs *src)
+{
+	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
+
+	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
+	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
+
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		     src->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
+
+	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		     src->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
+
+	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
+	vmcs_writel(GUEST_DR7, src->guest_dr7);
+	vmcs_writel(GUEST_RSP, src->guest_rsp);
+	vmcs_writel(GUEST_RIP, src->guest_rip);
+	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		    src->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
+
+	return 0;
+}
+
+int load_vmcs_host_state(struct shadow_vmcs *src)
+{
+	vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+	vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+	vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+	vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+	vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+	vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+	vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
+
+	vmcs_write64(TSC_OFFSET, src->tsc_offset);
+
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+	vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+	vmcs_writel(HOST_CR0, src->host_cr0);
+	vmcs_writel(HOST_CR3, src->host_cr3);
+	vmcs_writel(HOST_CR4, src->host_cr4);
+	vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+	vmcs_writel(HOST_GS_BASE, src->host_gs_base);
+	vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+	vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
+	vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
+	vmcs_writel(HOST_RSP, src->host_rsp);
+	vmcs_writel(HOST_RIP, src->host_rip);
+	vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
+
+	return 0;
+}
+
 struct level_state *create_state(void)
 {
 	struct level_state *state = NULL;
@@ -5176,6 +5615,685 @@ int create_l2_state(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+int prepare_vmcs_02(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct shadow_vmcs *src = vmx->nested.l2_state->shadow_vmcs;
+	u32 exec_control;
+
+	if (!src) {
+		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
+		return 1;
+	}
+
+	load_vmcs_common(src);
+
+	if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
+
+	if (vmx->nested.l2_state->io_bitmap_a)
+		vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
+
+	if (vmx->nested.l2_state->io_bitmap_b)
+		vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
+
+	if (vmx->nested.l2_state->msr_bitmap)
+		vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
+
+	if (src->vm_entry_msr_load_count > 0) {
+		struct page *page;
+
+		page = nested_get_page(vcpu,
+				       src->vm_entry_msr_load_addr);
+		if (!page)
+			return 1;
+
+		vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
+
+		kvm_release_page_clean(page);
+	}
+
+	if (src->virtual_apic_page_addr != 0) {
+		struct page *page;
+
+		page = nested_get_page(vcpu,
+				       src->virtual_apic_page_addr);
+		if (!page)
+			return 1;
+
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
+
+		kvm_release_page_clean(page);
+	}  else {
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+			     src->virtual_apic_page_addr);
+	}
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm)) {
+		if (src->apic_access_addr != 0) {
+			struct page *page =
+				nested_get_page(vcpu, src->apic_access_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+			kvm_release_page_clean(page);
+		} else {
+			vmcs_write64(APIC_ACCESS_ADDR, 0);
+		}
+	}
+
+	if (vm_need_tpr_shadow(vcpu->kvm) &&
+	    nested_cpu_has_vmx_tpr_shadow(vcpu))
+		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
+
+	if (enable_ept) {
+		if (!nested_cpu_has_vmx_ept(vcpu)) {
+			vmcs_write64(EPT_POINTER,
+				     vmx->nested.l1_state->shadow_vmcs->ept_pointer);
+			vmcs_write64(GUEST_PDPTR0,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr0);
+			vmcs_write64(GUEST_PDPTR1,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr1);
+			vmcs_write64(GUEST_PDPTR2,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr2);
+			vmcs_write64(GUEST_PDPTR3,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr3);
+		}
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+		     (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
+		      src->pin_based_vm_exec_control));
+
+	exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+	exec_control |= src->cpu_based_vm_exec_control;
+
+	if (!vm_need_tpr_shadow(vcpu->kvm) ||
+	    src->virtual_apic_page_addr == 0) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+			CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	} else if (exec_control & CPU_BASED_TPR_SHADOW) {
+
+#ifdef CONFIG_X86_64
+		exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
+		exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      src->exception_bitmap));
+
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_mask &
+		      src->page_fault_error_code_mask));
+
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_match &
+		      src->page_fault_error_code_match));
+
+	vmcs_write32(VM_EXIT_CONTROLS,
+		     ((vmx->nested.l1_state->shadow_vmcs->vm_exit_controls &
+		       NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
+
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     (vmx->nested.l1_state->shadow_vmcs->vm_entry_controls &
+		      NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	if (cpu_has_secondary_exec_ctrls()) {
+
+		exec_control =
+			vmx->nested.l1_state->shadow_vmcs->secondary_vm_exec_control;
+
+		if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
+
+			exec_control |= src->secondary_vm_exec_control;
+
+			if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
+			    !nested_vm_need_virtualize_apic_accesses(vcpu))
+				exec_control &=
+				       ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		}
+
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	}
+
+	vmcs_writel(CR0_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr0_guest_host_mask  &
+		     src->cr0_guest_host_mask));
+	vmcs_writel(CR4_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr4_guest_host_mask  &
+		     src->cr4_guest_host_mask));
+
+	load_vmcs_host_state(vmx->nested.l1_state->shadow_vmcs);
+
+	return 0;
+}
+
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_state->shadow_vmcs);
+
+	return 0;
+}
+
+void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
+{
+	unsigned long mask;
+
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+	mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
+
+	if (vcpu->arch.regs_dirty & mask) {
+		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
+		       vcpu->arch.regs_dirty, mask);
+		WARN_ON(1);
+	}
+
+	vcpu->arch.regs_dirty = 0;
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu)
+{
+	/* verify that l1 has done vmptrld for l2 earlier */
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int initial_pfu_active = vcpu->fpu_active;
+	int r = 0;
+
+	if (vmx->nested.nested_mode) {
+		printk(KERN_INFO "Nested guest already running\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	vmx->nested.nested_mode = 1;
+
+	vcpu->arch.exception.pending = false;
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	save_vmcs(vmx->nested.l1_state->shadow_vmcs);
+
+	vmx->nested.l1_state->shadow_efer = vcpu->arch.shadow_efer;
+	if (!enable_ept)
+		vmx->nested.l1_state->cr3 = vcpu->arch.cr3;
+	vmx->nested.l1_state->cr4 = vcpu->arch.cr4;
+
+	if (enable_vpid) {
+		if (vmx->nested.l2_state->vpid == 0) {
+			allocate_vpid(vmx);
+			vmx->nested.l2_state->vpid = vmx->vpid;
+		}
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l1_state->msr_bitmap = 0;
+
+	vmx->nested.l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	vmx->nested.l1_state->vmcs = vmx->vmcs;
+	vmx->nested.l1_state->cpu = vcpu->cpu;
+	vmx->nested.l1_state->launched = vmx->launched;
+
+	vmx->vmcs = vmx->nested.l2_state->vmcs;
+	vcpu->cpu = vmx->nested.l2_state->cpu;
+	vmx->launched = vmx->nested.l2_state->launched;
+
+	if (vmx->nested.vmclear || !vmx->launched) {
+		vmcs_clear(vmx->vmcs);
+		vmx->launched = 0;
+	}
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	prepare_vmcs_02(vcpu);
+
+	if (vmx->nested.l2_state->shadow_vmcs->vm_entry_controls &
+	    VM_ENTRY_IA32E_MODE) {
+		if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
+		      (vcpu->arch.shadow_efer & EFER_LME)))
+			vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
+	} else {
+		if ((vcpu->arch.shadow_efer & EFER_LMA) ||
+		    (vcpu->arch.shadow_efer & EFER_LME))
+			vcpu->arch.shadow_efer = 0;
+	}
+
+	vmx_set_cr0(vcpu, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
+	vmcs_writel(CR0_READ_SHADOW,
+		    vmx->nested.l2_state->shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l2_state->shadow_vmcs->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW,
+		    vmx->nested.l2_state->shadow_vmcs->cr4_read_shadow);
+
+	vcpu->arch.cr0 |= X86_CR0_PG;
+
+	if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
+		vmcs_write32(GUEST_CR3, vmx->nested.l2_state->shadow_vmcs->guest_cr3);
+		vmx->vcpu.arch.cr3 = vmx->nested.l2_state->shadow_vmcs->guest_cr3;
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l2_state->shadow_vmcs->guest_cr3);
+		kvm_mmu_reset_context(vcpu);
+
+		r = kvm_mmu_load(vcpu);
+		if (unlikely(r)) {
+			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+			nested_vmx_vmexit(vcpu, false);
+			set_rflags_to_vmx_fail_valid(vcpu);
+			return 1;
+		}
+
+	}
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l2_state->shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l2_state->shadow_vmcs->guest_rip);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      vmx->nested.l2_state->shadow_vmcs->exception_bitmap));
+
+	if (initial_pfu_active)
+		vmx_fpu_activate(vcpu);
+
+	return 1;
+}
+
+static int launch_guest(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	skip_emulated_instruction(vcpu);
+
+	nested_vmx_run(vcpu);
+
+	return 1;
+}
+
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int initial_pfu_active = vcpu->fpu_active;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	vmx->nested.l2_state->launched = vmx->launched;
+	vmx->nested.l2_state->cpu = vcpu->cpu;
+
+	vmx->vmcs = vmx->nested.l1_state->vmcs;
+	vcpu->cpu = vmx->nested.l1_state->cpu;
+	vmx->launched = vmx->nested.l1_state->launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.exception.pending = false;
+
+	vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
+	vmx_set_cr0(vcpu, vmx->nested.l1_state->shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_state->shadow_vmcs->guest_cr3;
+		vmcs_write32(GUEST_CR3, vmx->nested.l1_state->shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
+	}
+
+	switch_back_vmcs(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	if (initial_pfu_active)
+		vmx_fpu_activate(vcpu);
+
+	return 0;
+}
+
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		struct page *msr_page = NULL;
+		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+		struct shadow_vmcs *l2svmcs =
+			to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+
+		if (!cpu_has_vmx_msr_bitmap()
+		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+			return 1;
+
+		msr_page = nested_get_page(vcpu,
+					   l2svmcs->msr_bitmap);
+
+		if (!msr_page) {
+			printk(KERN_INFO "%s error in nested_get_page\n",
+			       __func__);
+			return 0;
+		}
+
+		switch (exit_code) {
+		case EXIT_REASON_MSR_READ:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x000)))
+					return 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x400)))
+					return 1;
+			}
+			break;
+		case EXIT_REASON_MSR_WRITE:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x800)))
+						return 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0xc00)))
+					return 1;
+			}
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct shadow_vmcs *l2svmcs = vmx->nested.l2_state->shadow_vmcs;
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	if (kvm_override) {
+		switch (exit_code) {
+		case EXIT_REASON_EXTERNAL_INTERRUPT:
+			return 0;
+		case EXIT_REASON_EXCEPTION_NMI:
+			if (!is_exception(intr_info))
+				return 0;
+
+			if (is_page_fault(intr_info) && (!enable_ept))
+				return 0;
+
+			break;
+		case EXIT_REASON_EPT_VIOLATION:
+			if (enable_ept)
+				return 0;
+
+			break;
+		}
+	}
+
+	switch (exit_code) {
+	case EXIT_REASON_INVLPG:
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_INVLPG_EXITING)
+			return 1;
+
+		break;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		return nested_vmx_exit_handled_msr(vcpu);
+	case EXIT_REASON_CR_ACCESS: {
+		unsigned long exit_qualification =
+			vmcs_readl(EXIT_QUALIFICATION);
+		int cr = exit_qualification & 15;
+		int reg = (exit_qualification >> 8) & 15;
+		unsigned long val = kvm_register_read(vcpu, reg);
+
+		switch ((exit_qualification >> 4) & 3) {
+		case 0: /* mov to cr */
+			switch (cr) {
+			case 0:
+				if (l2svmcs->cr0_guest_host_mask &
+				    (val ^ l2svmcs->cr0_read_shadow))
+					return 1;
+				break;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_LOAD_EXITING)
+					return 1;
+				return 1;
+			case 4:
+				if (l2svmcs->cr4_guest_host_mask &
+				    (l2svmcs->cr4_read_shadow ^ val))
+					return 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_LOAD_EXITING)
+					return 1;
+				break;
+			}
+			break;
+		case 2: /* clts */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				return 1;
+			break;
+		case 1: /*mov from cr*/
+			switch (cr) {
+			case 0:
+				return 1;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_STORE_EXITING)
+					return 1;
+				break;
+			case 4:
+				return 1;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_STORE_EXITING)
+					return 1;
+				break;
+			}
+			break;
+		case 3: /* lmsw */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				return 1;
+			break;
+		}
+		break;
+	}
+	case EXIT_REASON_DR_ACCESS: {
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_MOV_DR_EXITING)
+			return 1;
+		break;
+	}
+
+	case EXIT_REASON_EXCEPTION_NMI: {
+
+		if (is_external_interrupt(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_EXT_INTR_MASK))
+			return 1;
+
+		if (is_nmi(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_NMI_EXITING))
+			return 1;
+
+		if (is_exception(intr_info) &&
+		    (l2svmcs->exception_bitmap &
+		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+			return 1;
+
+		if (is_page_fault(intr_info))
+			return 1;
+
+		break;
+	}
+
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK)
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code)
+{
+	if (vmx->nested.nested_mode) {
+		if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
+			nested_vmx_vmexit(&vmx->vcpu, false);
+			vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
+				EXIT_REASON_EXCEPTION_NMI;
+			vmx->nested.l2_state->shadow_vmcs->vm_exit_intr_info =
+				(nr | INTR_TYPE_HARD_EXCEPTION
+				 | (has_error_code ?
+				    INTR_INFO_DELIVER_CODE_MASK : 0)
+				 | INTR_INFO_VALID_MASK);
+
+			if (has_error_code)
+				vmx->nested.l2_state->shadow_vmcs->
+					vm_exit_intr_error_code = error_code;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int nested_vmx_intr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->nested.nested_mode) {
+		if (vmx->nested.l2_state->shadow_vmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK) {
+			if (vmx->nested.nested_run_pending)
+				return 0;
+
+			nested_vmx_vmexit(vcpu, true);
+			return 1;
+		}
+	}
+
+	return 0;
+}
 
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
@@ -5224,7 +6342,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_interrupt_shadow = vmx_set_interrupt_shadow,
 	.get_interrupt_shadow = vmx_get_interrupt_shadow,
 	.patch_hypercall = vmx_patch_hypercall,
-	.set_irq = vmx_inject_irq,
+	.set_irq = vmx_set_irq,
 	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
 	.interrupt_allowed = vmx_interrupt_allowed,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: Nested VMX support - kernel v1
  2009-09-02 15:38 Nested VMX support - kernel v1 oritw
  2009-09-02 15:38 ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff oritw
@ 2009-09-02 15:57 ` Alexander Graf
  2009-09-03  6:01   ` Muli Ben-Yehuda
  2009-09-02 21:39 ` Avi Kivity
  2 siblings, 1 reply; 31+ messages in thread
From: Alexander Graf @ 2009-09-02 15:57 UTC (permalink / raw)
  To: oritw; +Cc: kvm, oritw, benami, muli, abelg, aliguori, mmday


On 02.09.2009 at 17:38, oritw@il.ibm.com wrote:

> The following patches implement nested VMX support. The patches  
> enable a guest
> to use the VMX APIs in order to run its own nested guest (i.e.,  
> enable running
> other hypervisors which use VMX under KVM).

Cool! Great job here. I was expecting vmcs load/stores to kill  
performance, but apparently I was wrong. How did you get those fast?

> The current patches support running
> Linux under a nested KVM using shadow page table (with bypass_guest_pf
> disabled).

What is keeping you from running other hypervisors or guests?

> SMP support was fixed.  Reworking EPT support to mesh cleanly with
> the current shadow paging design per Avi's comments is a work-in- 
> progress.
>
> The current patches only support a single nested hypervisor

Why?

> , which can only run
> a single guest (multiple guests are work in progress). Only 64-bit  
> nested
> hypervisors are supported.
>
> Additional patches for running Windows under nested KVM, and Linux  
> under nested
> VMware server(!),

GSX or ESX? With ESX I was running into TSC timing issues because ESX  
didn't like the fact it got preempted :-).

How about Hyper-V and Xen?

Again, great job and congratulations on making this work!

Alex

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff
  2009-09-02 15:38 ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff oritw
  2009-09-02 15:38   ` [PATCH 2/6] Nested VMX patch 2 implements vmclear oritw
@ 2009-09-02 19:34   ` Avi Kivity
  2009-09-03 12:34     ` Orit Wasserman
  1 sibling, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-02 19:34 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, muli, abelg, aliguori, mmday

On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
>
> ---
>   arch/x86/kvm/svm.c |    3 -
>   arch/x86/kvm/vmx.c |  187 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   arch/x86/kvm/x86.c |    6 ++-
>   arch/x86/kvm/x86.h |    2 +
>   4 files changed, 192 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 2df9b45..3c1f22a 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -124,9 +124,6 @@ static int npt = 1;
>
>   module_param(npt, int, S_IRUGO);
>
> -static int nested = 1;
> -module_param(nested, int, S_IRUGO);
> -
>   static void svm_flush_tlb(struct kvm_vcpu *vcpu);
>   static void svm_complete_interrupts(struct vcpu_svm *svm);
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 78101dd..abba325 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -67,6 +67,11 @@ struct vmcs {
>   	char data[0];
>   };
>
> +struct nested_vmx {
> +	/* Has the level1 guest done vmon? */
> +	bool vmon;
> +};
>    

vmxon

> @@ -967,6 +975,69 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
>   }
>
>   /*
> + * Handles msr read for nested virtualization
> + */
> +static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
> +			      u64 *pdata)
> +{
> +	u32 vmx_msr_low = 0, vmx_msr_high = 0;
> +
> +	switch (msr_index) {
> +	case MSR_IA32_FEATURE_CONTROL:
> +		*pdata = 0;
> +		break;
> +	case MSR_IA32_VMX_BASIC:
> +		rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
>    

Use rdmsrl, it's easier.

I think we need to mask it with the capabilities we support.  Otherwise 
the guest can try to use some new feature which we don't support yet, 
and crash.
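
As an illustration of that suggestion, a minimal sketch (NESTED_VMX_BASIC_SUPPORTED
is a made-up placeholder for whatever VMX_BASIC bits the nested code actually
emulates; it is not defined anywhere in this series):

static u64 nested_vmx_basic(void)
{
	u64 basic;

	rdmsrl(MSR_IA32_VMX_BASIC, basic);
	/* Hide host features the nested implementation cannot back yet. */
	return basic & NESTED_VMX_BASIC_SUPPORTED;
}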

> +		*pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
> +		break;
> +	case MSR_IA32_VMX_PINBASED_CTLS:
> +		*pdata = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
> +			PIN_BASED_VIRTUAL_NMIS;
>    

Need to mask with actual cpu capabilities in case we run on an older cpu.
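
For the control MSRs, a sketch of the masking being asked for: the allowed-1
settings are reported in the high 32 bits of the MSR, so the constant set can
be intersected with what the host CPU actually supports (illustration only;
it ignores the allowed-0 half):

static u64 nested_vmx_pinbased_ctls(void)
{
	u64 host_ctls;
	u32 wanted = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
		     PIN_BASED_VIRTUAL_NMIS;

	rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, host_ctls);
	/* Advertise only controls the host CPU allows to be set. */
	return wanted & (host_ctls >> 32);
}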

> +		break;
> +	case MSR_IA32_VMX_PROCBASED_CTLS:
> +		*pdata =  CPU_BASED_HLT_EXITING |
> +#ifdef CONFIG_X86_64
> +			CPU_BASED_CR8_LOAD_EXITING |
> +			CPU_BASED_CR8_STORE_EXITING |
> +#endif
> +			CPU_BASED_CR3_LOAD_EXITING |
> +			CPU_BASED_CR3_STORE_EXITING |
> +			CPU_BASED_USE_IO_BITMAPS |
> +			CPU_BASED_MOV_DR_EXITING |
> +			CPU_BASED_USE_TSC_OFFSETING |
> +			CPU_BASED_INVLPG_EXITING;
>    

Same here... or are all these guaranteed to be present?

> +
> +static int handle_vmon(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_segment cs;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!nested) {
> +		printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 1;
> +	}
> +
> +	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
> +
> +	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
> +	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
> +	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
> +	    ((find_msr_entry(to_vmx(vcpu),
> +			     MSR_EFER)->data & EFER_LMA) && !cs.l)) {
>    

Not everyone has EFER.  Better to wrap this in an #ifdef CONFIG_X86_64.
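
For example (sketch only, reusing the #UD path the function already takes
for its other checks), the long-mode part of the test could be confined to
64-bit builds:

#ifdef CONFIG_X86_64
	if ((find_msr_entry(to_vmx(vcpu), MSR_EFER)->data & EFER_LMA) &&
	    !cs.l) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}
#endif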

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/6] Nested VMX patch 2 implements vmclear
  2009-09-02 15:38   ` [PATCH 2/6] Nested VMX patch 2 implements vmclear oritw
  2009-09-02 15:38     ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst oritw
@ 2009-09-02 19:38     ` Avi Kivity
  2009-09-03 13:54       ` Orit Wasserman
  1 sibling, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-02 19:38 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, muli, abelg, aliguori, mmday

On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
>
> ---
>   arch/x86/kvm/vmx.c |   24 +++++++++++++++++++++++-
>   1 files changed, 23 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index abba325..2b1fc3b 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -70,6 +70,8 @@ struct vmcs {
>   struct nested_vmx {
>   	/* Has the level1 guest done vmon? */
>   	bool vmon;
> +	/* Has the level1 guest done vmclear? */
> +	bool vmclear;
>   };
>    

Doesn't seem these two belong in the same structure - vmclear is 
per-vmcs... but you're probably aware of that with the multi-guest 
support coming.
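
A rough sketch of the split being hinted at (all names here are invented, not
from the series): vmxon stays per-vcpu, while the clear/launch state follows
each VMCS:

/* Hypothetical: one instance per guest VMCS. */
struct nested_vmcs_state {
	gpa_t vmptr;
	bool cleared;			/* set by VMCLEAR, dropped on launch */
};

struct nested_vmx {
	bool vmxon;				/* per-vcpu */
	struct nested_vmcs_state *current;	/* per-VMCS */
};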


-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-02 15:38     ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst oritw
  2009-09-02 15:38       ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite oritw
@ 2009-09-02 20:05       ` Avi Kivity
  2009-09-03 14:25         ` Orit Wasserman
  1 sibling, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-02 20:05 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, muli, abelg, aliguori, mmday

On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> +struct __attribute__ ((__packed__)) level_state {
> +	struct shadow_vmcs *shadow_vmcs;
> +
> +	u16 vpid;
> +	u64 shadow_efer;
> +	unsigned long cr2;
> +	unsigned long cr3;
> +	unsigned long cr4;
> +	unsigned long cr8;
> +
> +	u64 io_bitmap_a;
> +	u64 io_bitmap_b;
> +	u64 msr_bitmap;
> +
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
> +};
>    



> +
>   struct vmcs {
>   	u32 revision_id;
>   	u32 abort;
> @@ -72,6 +217,17 @@ struct nested_vmx {
>   	bool vmon;
>   	/* Has the level1 guest done vmclear? */
>   	bool vmclear;
> +	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> +	u64 l1_cur_vmcs;
>    

This is the vmptr (exactly as loaded by vmptrld), right?  If so, please 
call it vmptr.

> +	/*
> +	 * Level 2 state : includes vmcs,registers and
> +	 * a copy of vmcs12 for vmread/vmwrite
> +	 */
> +	struct level_state *l2_state;
> +
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
>    

Can you explain why we need two of them?  In the guest vmcs we have host 
and guest values, and in l1_state and l2_state we have more copies, and 
in struct vcpu we have yet another set of copies.  We also have a couple 
of copies in the host vmcs.  I'm getting dizzy...


>   static int init_rmode(struct kvm *kvm);
>   static u64 construct_eptp(unsigned long root_hpa);
>
>
>
> +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> +{
> +	gpa_t gpa;
> +	struct page *page;
> +	int r = 0;
> +
> +	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
> +
> +	/* checking guest gpa */
> +	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> +	if (is_error_page(page)) {
> +		printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
> +		       __func__, gpa);
> +		r = 1;
> +		goto out;
> +	}
> +
> +	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
> +		       __func__, gpa, r);
> +		goto out;
> +	}
>    

You can use kvm_read_guest_virt() to simplify this.
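
Roughly (a sketch, assuming kvm_read_guest_virt(gva, buf, bytes, vcpu) from
x86.c were made visible to vmx.c -- that export is an assumption here):

	gva_t gva = vcpu->arch.regs[VCPU_REGS_RAX];

	if (kvm_read_guest_virt(gva, gentry, sizeof(*gentry), vcpu)) {
		printk(KERN_ERR "%s cannot read guest vmcs addr %lx\n",
		       __func__, gva);
		return 1;
	}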

> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +out:
> +	kvm_release_page_clean(page);
> +	return r;
> +}
> +
> +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page;
> +	u64 guest_vmcs_addr;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> +		return 1;
> +
> +	if (create_l1_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	if (create_l2_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	vmx->nested.l2_state->vmcs = alloc_vmcs();
> +	if (!vmx->nested.l2_state->vmcs) {
> +		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> +		return 1;
> +	}
> +
> +	if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> +		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> +		if (vmcs_page == NULL)
> +			return 1;
> +
> +		/* load nested vmcs to processor */
> +		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
>    

So, you're loading a guest page as the vmcs.  This is dangerous as the 
guest can play with it.  Much better to use inaccessible memory (and you 
do alloc_vmcs() earlier?)
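
One possible direction, as a sketch only (it assumes the software-defined
shadow_vmcs layout is what L1 reads and writes): never hand the guest page to
vmptrld, keep the working VMCS in the memory from alloc_vmcs() and just copy
the guest's fields in:

	/* Cache L1's VMCS contents instead of making its page the active VMCS. */
	if (kvm_read_guest(vcpu->kvm, guest_vmcs_addr,
			   vmx->nested.l2_state->shadow_vmcs,
			   sizeof(struct shadow_vmcs)))
		return 1;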

> +
> +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
>    

Should store to mem64 according to the docs?

Better done through the emulator.

> +void save_vmcs(struct shadow_vmcs *dst)
> +{
> +	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	if (cpu_has_vmx_msr_bitmap())
> +		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +
> +	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> +	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> +	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> +	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> +	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> +	if (enable_ept)
> +		dst->ept_pointer = vmcs_read64(EPT_POINTER);
> +
> +	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	if (enable_ept) {
> +		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> +		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> +		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> +		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> +	}
> +	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> +	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> +	dst->page_fault_error_code_mask =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> +	dst->page_fault_error_code_match =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> +	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> +	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> +	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> +	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> +	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> +	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	dst->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> +	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	if (enable_vpid && dst->secondary_vm_exec_control &
> +	    SECONDARY_EXEC_ENABLE_VPID)
> +		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> +	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> +	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	dst->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> +	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> +	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> +	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> +	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> +	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> +	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> +	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> +	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> +	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> +	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	dst->guest_rsp = vmcs_readl(GUEST_RSP);
> +	dst->guest_rip = vmcs_readl(GUEST_RIP);
> +	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	dst->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +	dst->host_cr0 = vmcs_readl(HOST_CR0);
> +	dst->host_cr3 = vmcs_readl(HOST_CR3);
> +	dst->host_cr4 = vmcs_readl(HOST_CR4);
> +	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> +	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> +	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> +	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> +	dst->host_rsp = vmcs_readl(HOST_RSP);
> +	dst->host_rip = vmcs_readl(HOST_RIP);
> +	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> +		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> +}
>    

I see.  You're using the processor's format when reading the guest 
vmcs.  But we don't have to do that; we can use the shadow_vmcs 
structure (and a memcpy).
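Something like this sketch, i.e. treating the guest's VMCS region as our own 
struct shadow_vmcs and copying it wholesale instead of vmptrld'ing it and 
reading each field back through the CPU (helper name made up):

	static int nested_copy_guest_vmcs(struct kvm_vcpu *vcpu, gpa_t vmcs_gpa,
					  struct shadow_vmcs *dst)
	{
		/* one bulk guest-memory read instead of ~150 vmreads */
		return kvm_read_guest(vcpu->kvm, vmcs_gpa, dst, sizeof(*dst));
	}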


-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite
  2009-09-02 15:38       ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite oritw
  2009-09-02 15:38         ` [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume oritw
@ 2009-09-02 20:15         ` Avi Kivity
  2009-09-03 14:26           ` Orit Wasserman
  1 sibling, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-02 20:15 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, muli, abelg, aliguori, mmday

On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> +
> +static void init_vmcs_field_to_offset_table(void)
> +{
> +	memset(vmcs_field_to_offset_table,0xff,
> +	       sizeof(vmcs_field_to_offset_table));
> +
> +	vmcs_field_to_offset_table[VIRTUAL_PROCESSOR_ID] =
> +		offsetof(struct shadow_vmcs, virtual_processor_id);
> +	vmcs_field_to_offset_table[GUEST_ES_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_es_selector);
> +	vmcs_field_to_offset_table[GUEST_CS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_cs_selector);
> +	vmcs_field_to_offset_table[GUEST_SS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_ss_selector);
> +	vmcs_field_to_offset_table[GUEST_DS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_ds_selector);
> +	vmcs_field_to_offset_table[GUEST_FS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_fs_selector);
> +	vmcs_field_to_offset_table[GUEST_GS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_gs_selector);
> +	vmcs_field_to_offset_table[GUEST_LDTR_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_ldtr_selector);
> +	vmcs_field_to_offset_table[GUEST_TR_SELECTOR] =
> +		offsetof(struct shadow_vmcs, guest_tr_selector);
> +	vmcs_field_to_offset_table[HOST_ES_SELECTOR] =
> +		offsetof(struct shadow_vmcs, host_es_selector);
> +	vmcs_field_to_offset_table[HOST_CS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, host_cs_selector);
> +	vmcs_field_to_offset_table[HOST_SS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, host_ss_selector);
> +	vmcs_field_to_offset_table[HOST_DS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, host_ds_selector);
> +	vmcs_field_to_offset_table[HOST_FS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, host_fs_selector);
> +	vmcs_field_to_offset_table[HOST_GS_SELECTOR] =
> +		offsetof(struct shadow_vmcs, host_gs_selector);
> +	vmcs_field_to_offset_table[HOST_TR_SELECTOR] =
> +		offsetof(struct shadow_vmcs, host_tr_selector);
> +	vmcs_field_to_offset_table[IO_BITMAP_A] =
> +		offsetof(struct shadow_vmcs, io_bitmap_a);
> +	vmcs_field_to_offset_table[IO_BITMAP_A_HIGH] =
> +		offsetof(struct shadow_vmcs, io_bitmap_a)+4;
> +	vmcs_field_to_offset_table[IO_BITMAP_B] =
> +		offsetof(struct shadow_vmcs, io_bitmap_b);
> +	vmcs_field_to_offset_table[IO_BITMAP_B_HIGH] =
> +		offsetof(struct shadow_vmcs, io_bitmap_b)+4;
> +	vmcs_field_to_offset_table[MSR_BITMAP] =
> +		offsetof(struct shadow_vmcs, msr_bitmap);
> +	vmcs_field_to_offset_table[MSR_BITMAP_HIGH] =
> +		offsetof(struct shadow_vmcs, msr_bitmap)+4;
> +	vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_ADDR] =
> +		offsetof(struct shadow_vmcs, vm_exit_msr_store_addr);
> +	vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_ADDR_HIGH] =
> +		offsetof(struct shadow_vmcs, vm_exit_msr_store_addr)+4;
> +	vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_ADDR] =
> +		offsetof(struct shadow_vmcs, vm_exit_msr_load_addr);
> +	vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_ADDR_HIGH] =
> +		offsetof(struct shadow_vmcs, vm_exit_msr_load_addr)+4;
> +	vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_ADDR] =
> +		offsetof(struct shadow_vmcs, vm_entry_msr_load_addr);
> +	vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_ADDR_HIGH] =
> +		offsetof(struct shadow_vmcs, vm_entry_msr_load_addr)+4;
> +	vmcs_field_to_offset_table[TSC_OFFSET] =
> +		offsetof(struct shadow_vmcs, tsc_offset);
> +	vmcs_field_to_offset_table[TSC_OFFSET_HIGH] =
> +		offsetof(struct shadow_vmcs, tsc_offset)+4;
> +	vmcs_field_to_offset_table[VIRTUAL_APIC_PAGE_ADDR] =
> +		offsetof(struct shadow_vmcs, virtual_apic_page_addr);
> +	vmcs_field_to_offset_table[VIRTUAL_APIC_PAGE_ADDR_HIGH] =
> +		offsetof(struct shadow_vmcs, virtual_apic_page_addr)+4;
> +	vmcs_field_to_offset_table[APIC_ACCESS_ADDR] =
> +		offsetof(struct shadow_vmcs, apic_access_addr);
> +	vmcs_field_to_offset_table[APIC_ACCESS_ADDR_HIGH] =
> +		offsetof(struct shadow_vmcs, apic_access_addr)+4;
> +	vmcs_field_to_offset_table[EPT_POINTER] =
> +		offsetof(struct shadow_vmcs, ept_pointer);
> +	vmcs_field_to_offset_table[EPT_POINTER_HIGH] =
> +		offsetof(struct shadow_vmcs, ept_pointer)+4;
> +	vmcs_field_to_offset_table[GUEST_PHYSICAL_ADDRESS] =
> +		offsetof(struct shadow_vmcs, guest_physical_address);
> +	vmcs_field_to_offset_table[GUEST_PHYSICAL_ADDRESS_HIGH] =
> +		offsetof(struct shadow_vmcs, guest_physical_address)+4;
> +	vmcs_field_to_offset_table[VMCS_LINK_POINTER] =
> +		offsetof(struct shadow_vmcs, vmcs_link_pointer);
> +	vmcs_field_to_offset_table[VMCS_LINK_POINTER_HIGH] =
> +		offsetof(struct shadow_vmcs, vmcs_link_pointer)+4;
> +	vmcs_field_to_offset_table[GUEST_IA32_DEBUGCTL] =
> +		offsetof(struct shadow_vmcs, guest_ia32_debugctl);
> +	vmcs_field_to_offset_table[GUEST_IA32_DEBUGCTL_HIGH] =
> +		offsetof(struct shadow_vmcs, guest_ia32_debugctl)+4;
> +	vmcs_field_to_offset_table[GUEST_IA32_PAT] =
> +		offsetof(struct shadow_vmcs, guest_ia32_pat);
> +	vmcs_field_to_offset_table[GUEST_IA32_PAT_HIGH] =
> +		offsetof(struct shadow_vmcs, guest_ia32_pat)+4;
> +	vmcs_field_to_offset_table[GUEST_PDPTR0] =
> +		offsetof(struct shadow_vmcs, guest_pdptr0);
> +	vmcs_field_to_offset_table[GUEST_PDPTR0_HIGH] =
> +		offsetof(struct shadow_vmcs, guest_pdptr0)+4;
> +	vmcs_field_to_offset_table[GUEST_PDPTR1] =
> +		offsetof(struct shadow_vmcs, guest_pdptr1);
> +	vmcs_field_to_offset_table[GUEST_PDPTR1_HIGH] =
> +		offsetof(struct shadow_vmcs, guest_pdptr1)+4;
> +	vmcs_field_to_offset_table[GUEST_PDPTR2] =
> +		offsetof(struct shadow_vmcs, guest_pdptr2);
> +	vmcs_field_to_offset_table[GUEST_PDPTR2_HIGH] =
> +		offsetof(struct shadow_vmcs, guest_pdptr2)+4;
> +	vmcs_field_to_offset_table[GUEST_PDPTR3] =
> +		offsetof(struct shadow_vmcs, guest_pdptr3);
> +	vmcs_field_to_offset_table[GUEST_PDPTR3_HIGH] =
> +		offsetof(struct shadow_vmcs, guest_pdptr3)+4;
> +	vmcs_field_to_offset_table[HOST_IA32_PAT] =
> +		offsetof(struct shadow_vmcs, host_ia32_pat);
> +	vmcs_field_to_offset_table[HOST_IA32_PAT_HIGH] =
> +		offsetof(struct shadow_vmcs, host_ia32_pat)+4;
> +	vmcs_field_to_offset_table[PIN_BASED_VM_EXEC_CONTROL] =
> +		offsetof(struct shadow_vmcs, pin_based_vm_exec_control);
> +	vmcs_field_to_offset_table[CPU_BASED_VM_EXEC_CONTROL] =
> +		offsetof(struct shadow_vmcs, cpu_based_vm_exec_control);
> +	vmcs_field_to_offset_table[EXCEPTION_BITMAP] =
> +		offsetof(struct shadow_vmcs, exception_bitmap);
> +	vmcs_field_to_offset_table[PAGE_FAULT_ERROR_CODE_MASK] =
> +		offsetof(struct shadow_vmcs, page_fault_error_code_mask);
> +	vmcs_field_to_offset_table[PAGE_FAULT_ERROR_CODE_MATCH] =
> +		offsetof(struct shadow_vmcs,
> +				page_fault_error_code_match);
> +	vmcs_field_to_offset_table[CR3_TARGET_COUNT] =
> +		offsetof(struct shadow_vmcs, cr3_target_count);
> +	vmcs_field_to_offset_table[VM_EXIT_CONTROLS] =
> +		offsetof(struct shadow_vmcs, vm_exit_controls);
> +	vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_COUNT] =
> +		offsetof(struct shadow_vmcs, vm_exit_msr_store_count);
> +	vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_COUNT] =
> +		offsetof(struct shadow_vmcs, vm_exit_msr_load_count);
> +	vmcs_field_to_offset_table[VM_ENTRY_CONTROLS] =
> +		offsetof(struct shadow_vmcs, vm_entry_controls);
> +	vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_COUNT] =
> +		offsetof(struct shadow_vmcs, vm_entry_msr_load_count);
> +	vmcs_field_to_offset_table[VM_ENTRY_INTR_INFO_FIELD] =
> +		offsetof(struct shadow_vmcs, vm_entry_intr_info_field);
> +	vmcs_field_to_offset_table[VM_ENTRY_EXCEPTION_ERROR_CODE] =
> +		offsetof(struct shadow_vmcs,
> +				vm_entry_exception_error_code);
> +	vmcs_field_to_offset_table[VM_ENTRY_INSTRUCTION_LEN] =
> +		offsetof(struct shadow_vmcs, vm_entry_instruction_len);
> +	vmcs_field_to_offset_table[TPR_THRESHOLD] =
> +		offsetof(struct shadow_vmcs, tpr_threshold);
> +	vmcs_field_to_offset_table[SECONDARY_VM_EXEC_CONTROL] =
> +		offsetof(struct shadow_vmcs, secondary_vm_exec_control);
> +	vmcs_field_to_offset_table[VM_INSTRUCTION_ERROR] =
> +		offsetof(struct shadow_vmcs, vm_instruction_error);
> +	vmcs_field_to_offset_table[VM_EXIT_REASON] =
> +		offsetof(struct shadow_vmcs, vm_exit_reason);
> +	vmcs_field_to_offset_table[VM_EXIT_INTR_INFO] =
> +		offsetof(struct shadow_vmcs, vm_exit_intr_info);
> +	vmcs_field_to_offset_table[VM_EXIT_INTR_ERROR_CODE] =
> +		offsetof(struct shadow_vmcs, vm_exit_intr_error_code);
> +	vmcs_field_to_offset_table[IDT_VECTORING_INFO_FIELD] =
> +		offsetof(struct shadow_vmcs, idt_vectoring_info_field);
> +	vmcs_field_to_offset_table[IDT_VECTORING_ERROR_CODE] =
> +		offsetof(struct shadow_vmcs, idt_vectoring_error_code);
> +	vmcs_field_to_offset_table[VM_EXIT_INSTRUCTION_LEN] =
> +		offsetof(struct shadow_vmcs, vm_exit_instruction_len);
> +	vmcs_field_to_offset_table[VMX_INSTRUCTION_INFO] =
> +		offsetof(struct shadow_vmcs, vmx_instruction_info);
> +	vmcs_field_to_offset_table[GUEST_ES_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_es_limit);
> +	vmcs_field_to_offset_table[GUEST_CS_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_cs_limit);
> +	vmcs_field_to_offset_table[GUEST_SS_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_ss_limit);
> +	vmcs_field_to_offset_table[GUEST_DS_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_ds_limit);
> +	vmcs_field_to_offset_table[GUEST_FS_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_fs_limit);
> +	vmcs_field_to_offset_table[GUEST_GS_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_gs_limit);
> +	vmcs_field_to_offset_table[GUEST_LDTR_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_ldtr_limit);
> +	vmcs_field_to_offset_table[GUEST_TR_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_tr_limit);
> +	vmcs_field_to_offset_table[GUEST_GDTR_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_gdtr_limit);
> +	vmcs_field_to_offset_table[GUEST_IDTR_LIMIT] =
> +		offsetof(struct shadow_vmcs, guest_idtr_limit);
> +	vmcs_field_to_offset_table[GUEST_ES_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_es_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_CS_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_cs_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_SS_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_ss_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_DS_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_ds_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_FS_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_fs_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_GS_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_gs_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_LDTR_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_ldtr_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_TR_AR_BYTES] =
> +		offsetof(struct shadow_vmcs, guest_tr_ar_bytes);
> +	vmcs_field_to_offset_table[GUEST_INTERRUPTIBILITY_INFO] =
> +		offsetof(struct shadow_vmcs,
> +				guest_interruptibility_info);
> +	vmcs_field_to_offset_table[GUEST_ACTIVITY_STATE] =
> +		offsetof(struct shadow_vmcs, guest_activity_state);
> +	vmcs_field_to_offset_table[GUEST_SYSENTER_CS] =
> +		offsetof(struct shadow_vmcs, guest_sysenter_cs);
> +	vmcs_field_to_offset_table[HOST_IA32_SYSENTER_CS] =
> +		offsetof(struct shadow_vmcs, host_ia32_sysenter_cs);
> +	vmcs_field_to_offset_table[CR0_GUEST_HOST_MASK] =
> +		offsetof(struct shadow_vmcs, cr0_guest_host_mask);
> +	vmcs_field_to_offset_table[CR4_GUEST_HOST_MASK] =
> +		offsetof(struct shadow_vmcs, cr4_guest_host_mask);
> +	vmcs_field_to_offset_table[CR0_READ_SHADOW] =
> +		offsetof(struct shadow_vmcs, cr0_read_shadow);
> +	vmcs_field_to_offset_table[CR4_READ_SHADOW] =
> +		offsetof(struct shadow_vmcs, cr4_read_shadow);
> +	vmcs_field_to_offset_table[CR3_TARGET_VALUE0] =
> +		offsetof(struct shadow_vmcs, cr3_target_value0);
> +	vmcs_field_to_offset_table[CR3_TARGET_VALUE1] =
> +		offsetof(struct shadow_vmcs, cr3_target_value1);
> +	vmcs_field_to_offset_table[CR3_TARGET_VALUE2] =
> +		offsetof(struct shadow_vmcs, cr3_target_value2);
> +	vmcs_field_to_offset_table[CR3_TARGET_VALUE3] =
> +		offsetof(struct shadow_vmcs, cr3_target_value3);
> +	vmcs_field_to_offset_table[EXIT_QUALIFICATION] =
> +		offsetof(struct shadow_vmcs, exit_qualification);
> +	vmcs_field_to_offset_table[GUEST_LINEAR_ADDRESS] =
> +		offsetof(struct shadow_vmcs, guest_linear_address);
> +	vmcs_field_to_offset_table[GUEST_CR0] =
> +		offsetof(struct shadow_vmcs, guest_cr0);
> +	vmcs_field_to_offset_table[GUEST_CR3] =
> +		offsetof(struct shadow_vmcs, guest_cr3);
> +	vmcs_field_to_offset_table[GUEST_CR4] =
> +		offsetof(struct shadow_vmcs, guest_cr4);
> +	vmcs_field_to_offset_table[GUEST_ES_BASE] =
> +		offsetof(struct shadow_vmcs, guest_es_base);
> +	vmcs_field_to_offset_table[GUEST_CS_BASE] =
> +		offsetof(struct shadow_vmcs, guest_cs_base);
> +	vmcs_field_to_offset_table[GUEST_SS_BASE] =
> +		offsetof(struct shadow_vmcs, guest_ss_base);
> +	vmcs_field_to_offset_table[GUEST_DS_BASE] =
> +		offsetof(struct shadow_vmcs, guest_ds_base);
> +	vmcs_field_to_offset_table[GUEST_FS_BASE] =
> +		offsetof(struct shadow_vmcs, guest_fs_base);
> +	vmcs_field_to_offset_table[GUEST_GS_BASE] =
> +		offsetof(struct shadow_vmcs, guest_gs_base);
> +	vmcs_field_to_offset_table[GUEST_LDTR_BASE] =
> +		offsetof(struct shadow_vmcs, guest_ldtr_base);
> +	vmcs_field_to_offset_table[GUEST_TR_BASE] =
> +		offsetof(struct shadow_vmcs, guest_tr_base);
> +	vmcs_field_to_offset_table[GUEST_GDTR_BASE] =
> +		offsetof(struct shadow_vmcs, guest_gdtr_base);
> +	vmcs_field_to_offset_table[GUEST_IDTR_BASE] =
> +		offsetof(struct shadow_vmcs, guest_idtr_base);
> +	vmcs_field_to_offset_table[GUEST_DR7] =
> +		offsetof(struct shadow_vmcs, guest_dr7);
> +	vmcs_field_to_offset_table[GUEST_RSP] =
> +		offsetof(struct shadow_vmcs, guest_rsp);
> +	vmcs_field_to_offset_table[GUEST_RIP] =
> +		offsetof(struct shadow_vmcs, guest_rip);
> +	vmcs_field_to_offset_table[GUEST_RFLAGS] =
> +		offsetof(struct shadow_vmcs, guest_rflags);
> +	vmcs_field_to_offset_table[GUEST_PENDING_DBG_EXCEPTIONS] =
> +		offsetof(struct shadow_vmcs,
> +				guest_pending_dbg_exceptions);
> +	vmcs_field_to_offset_table[GUEST_SYSENTER_ESP] =
> +		offsetof(struct shadow_vmcs, guest_sysenter_esp);
> +	vmcs_field_to_offset_table[GUEST_SYSENTER_EIP] =
> +		offsetof(struct shadow_vmcs, guest_sysenter_eip);
> +	vmcs_field_to_offset_table[HOST_CR0] =
> +		offsetof(struct shadow_vmcs, host_cr0);
> +	vmcs_field_to_offset_table[HOST_CR3] =
> +		offsetof(struct shadow_vmcs, host_cr3);
> +	vmcs_field_to_offset_table[HOST_CR4] =
> +		offsetof(struct shadow_vmcs, host_cr4);
> +	vmcs_field_to_offset_table[HOST_FS_BASE] =
> +		offsetof(struct shadow_vmcs, host_fs_base);
> +	vmcs_field_to_offset_table[HOST_GS_BASE] =
> +		offsetof(struct shadow_vmcs, host_gs_base);
> +	vmcs_field_to_offset_table[HOST_TR_BASE] =
> +		offsetof(struct shadow_vmcs, host_tr_base);
> +	vmcs_field_to_offset_table[HOST_GDTR_BASE] =
> +		offsetof(struct shadow_vmcs, host_gdtr_base);
> +	vmcs_field_to_offset_table[HOST_IDTR_BASE] =
> +		offsetof(struct shadow_vmcs, host_idtr_base);
> +	vmcs_field_to_offset_table[HOST_IA32_SYSENTER_ESP] =
> +		offsetof(struct shadow_vmcs, host_ia32_sysenter_esp);
> +	vmcs_field_to_offset_table[HOST_IA32_SYSENTER_EIP] =
> +		offsetof(struct shadow_vmcs, host_ia32_sysenter_eip);
> +	vmcs_field_to_offset_table[HOST_RSP] =
> +		offsetof(struct shadow_vmcs, host_rsp);
> +	vmcs_field_to_offset_table[HOST_RIP] =
> +		offsetof(struct shadow_vmcs, host_rip);
> +}
> +
>    

Best done with a static initializer.  Use a macro to avoid repeating the 
offsetof(struct shadow_vmcs).
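For example (illustrative only; the macro name is made up, and unlisted 
entries default to zero, so zero rather than 0xff becomes the "no such 
field" marker):

	#define SHADOW_VMCS_OFFSET(field, member) \
		[field] = offsetof(struct shadow_vmcs, member)

	static const unsigned short vmcs_field_to_offset_table[HOST_RIP + 1] = {
		SHADOW_VMCS_OFFSET(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
		SHADOW_VMCS_OFFSET(GUEST_ES_SELECTOR, guest_es_selector),
		SHADOW_VMCS_OFFSET(GUEST_CS_SELECTOR, guest_cs_selector),
		/* ... one line per field, as in the function above ... */
		SHADOW_VMCS_OFFSET(HOST_RIP, host_rip),
	};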


-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-09-02 15:38         ` [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume oritw
@ 2009-09-02 21:38           ` Avi Kivity
  2009-09-03 14:53             ` Orit Wasserman
  0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-02 21:38 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, muli, abelg, aliguori, mmday

On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> -struct nested_vmx {
> -	/* Has the level1 guest done vmon? */
> +struct nested_vmx {	/* Has the level1 guest done vmon? */
>    

A \n died here.

>   	bool vmon;
>   	/* Has the level1 guest done vmclear? */
>   	bool vmclear;
> +
> +	/* Are we running nested guest */
> +	bool nested_mode;
> +
> +	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> +	bool nested_run_pending;
> +
> +	/* flag indicating if there was a valid IDT after exiting from l2 */
> +	bool nested_pending_valid_idt;
>    

What does this mean?  pending event?

>
> +
> +static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
> +{
> +	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> +		cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW;
> +}
>    

Don't we need to check if the host supports it too?
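i.e. something like (sketch of the adjusted helper):

	static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
	{
		/* only honour L1's TPR shadow if the host actually has one */
		return cpu_has_vmx_tpr_shadow() &&
			(to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
			 cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW);
	}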

> +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
> +							   *vcpu)
> +{
> +	struct shadow_vmcs *shadow = to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> +
> +	return (shadow->secondary_vm_exec_control &
> +		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
> +		to_vmx(vcpu)->nested.l2_state->shadow_vmcs->apic_access_addr != 0;
> +}
>    

Why check apic_access_addr?

> +
> +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> +{
> +	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> +		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
> +}
>    

Need to check if secondary controls enabled?

> +static void vmx_set_irq(struct kvm_vcpu *vcpu)
> +{
> +	if (to_vmx(vcpu)->nested.nested_mode)
> +		return;
>    

Why?

Note if the guest didn't enable external interrupt exiting, we need to 
inject as usual.
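A helper along these lines would make that decision explicit (sketch, helper 
name made up, using the shadow_vmcs fields from this patch):

	static inline bool nested_exit_on_intr(struct vcpu_vmx *vmx)
	{
		/* did L1 ask for "external-interrupt exiting"? */
		return vmx->nested.l2_state->shadow_vmcs->
			pin_based_vm_exec_control & PIN_BASED_EXT_INTR_MASK;
	}

vmx_set_irq() should then only skip the normal injection path (and instead 
request a nested vmexit) when this returns true; otherwise the interrupt 
belongs to L2 and must be injected as usual.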

>
> +static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
> +{
>    

Again the name is confusing.  pending_event_injection?

> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int irq;
> +	int type;
> +	int errCodeValid;
> +	u32 idt_vectoring_info;
> +	u32 guest_intr;
> +	bool nmi_window_open;
> +	bool interrupt_window_open;
> +
> +	if (vmx->nested.nested_mode && vmx->nested.nested_pending_valid_idt) {
> +		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +		irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
> +		type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
> +		errCodeValid = idt_vectoring_info &
> +			VECTORING_INFO_DELIVER_CODE_MASK;
> +
> +		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +		nmi_window_open =
> +			!(guest_intr & (GUEST_INTR_STATE_STI |
> +					GUEST_INTR_STATE_MOV_SS |
> +					GUEST_INTR_STATE_NMI));
> +
> +		interrupt_window_open =
> +			((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
> +			 !(guest_intr & (GUEST_INTR_STATE_STI |
> +					 GUEST_INTR_STATE_MOV_SS)));
> +
> +		if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
> +			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
> +			return 0;
> +		}
>    

How can this happen?  Unless it's on nested entry, in which case we need 
to abort the entry.

> +
>   #ifdef CONFIG_X86_64
>   #define R "r"
>   #define Q "q"
> @@ -4646,6 +4842,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> +	nested_handle_pending_idt(vcpu);
>    

You're not checking the return code (need to do that on entry).

> +
> +	if (vmx->nested.nested_mode) {
> +		vmcs_writel(GUEST_CR0, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
>    

Might not be legal.  We may also want to force-enable caching.  Lastly, 
don't we need to handle cr0.ts and cr0.mp specially to manage the fpu state?

>
> +	if (vmx->nested.nested_mode)
> +		vmx->nested.vmclear = 0;
> +
>    

Why?

>   free_vmcs:
> @@ -5122,6 +5339,228 @@ static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>
> +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> +{
> +	struct shadow_vmcs *l2_shadow_vmcs =
> +		to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> +	struct shadow_vmcs *l1_shadow_vmcs =
> +		to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
> +
> +	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +
> +	l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +
> +	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	l2_shadow_vmcs->guest_physical_address =
> +		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	l2_shadow_vmcs->vm_entry_intr_info_field =
> +		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	l2_shadow_vmcs->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	l2_shadow_vmcs->vm_entry_instruction_len =
> +		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vm_instruction_error =
> +		vmcs_read32(VM_INSTRUCTION_ERROR);
> +	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	l2_shadow_vmcs->vm_exit_intr_error_code =
> +		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	l2_shadow_vmcs->idt_vectoring_info_field =
> +		vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	l2_shadow_vmcs->idt_vectoring_error_code =
> +		vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	l2_shadow_vmcs->vm_exit_instruction_len =
> +		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vmx_instruction_info =
> +		vmcs_read32(VMX_INSTRUCTION_INFO);
> +	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	l2_shadow_vmcs->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	l2_shadow_vmcs->guest_activity_state =
> +		vmcs_read32(GUEST_ACTIVITY_STATE);
> +	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +
> +	l1_shadow_vmcs->host_ia32_sysenter_cs =
> +		vmcs_read32(HOST_IA32_SYSENTER_CS);
> +
> +	l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> +
> +	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> +	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> +	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	l2_shadow_vmcs->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +
> +	l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
> +	l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
> +	l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
> +	l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
> +	l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
> +	l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +	l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +	l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> +	l1_shadow_vmcs->host_ia32_sysenter_esp =
> +		vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +	l1_shadow_vmcs->host_ia32_sysenter_eip =
> +		vmcs_readl(HOST_IA32_SYSENTER_EIP);
> +	l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
> +	l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
> +}
>    

Can't we do it lazily?  Only read these on demand?

> +
> +int load_vmcs_common(struct shadow_vmcs *src)
> +{
> +	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> +	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> +	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> +	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> +	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> +	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> +	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> +	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> +
> +	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> +	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> +
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
> +
> +	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
> +	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
> +	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> +		     src->vm_entry_exception_error_code);
> +	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
> +
> +	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
> +	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
> +	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
> +	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
> +	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
> +	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
> +	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
> +	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
> +	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
> +	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
> +	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
> +	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
> +	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
> +	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
> +	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
> +	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
> +	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
> +	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
> +	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> +		     src->guest_interruptibility_info);
> +	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
> +	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
> +
> +	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
> +	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
> +	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
> +	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
> +	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
> +	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
> +	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
> +	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
> +	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
> +	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
> +	vmcs_writel(GUEST_DR7, src->guest_dr7);
> +	vmcs_writel(GUEST_RSP, src->guest_rsp);
> +	vmcs_writel(GUEST_RIP, src->guest_rip);
> +	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
> +	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> +		    src->guest_pending_dbg_exceptions);
> +	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
> +	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
> +
> +	return 0;
> +}
>    

If we do it lazily, we'll only need to reload bits that have changed.

>   struct level_state *create_state(void)
>   {
>   	struct level_state *state = NULL;
> @@ -5176,6 +5615,685 @@ int create_l2_state(struct kvm_vcpu *vcpu)
>
>   	return 0;
>   }
> +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct shadow_vmcs *src = vmx->nested.l2_state->shadow_vmcs;
> +	u32 exec_control;
> +
> +	if (!src) {
> +		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> +		return 1;
> +	}
> +
> +	load_vmcs_common(src);
> +
> +	if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
> +		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
> +
> +	if (vmx->nested.l2_state->io_bitmap_a)
> +		vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
> +
> +	if (vmx->nested.l2_state->io_bitmap_b)
> +		vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
> +
> +	if (vmx->nested.l2_state->msr_bitmap)
> +		vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
>    

Don't we need to combine the host and guest msr bitmaps and I/O 
bitmaps?  If the host doesn't allow an msr or I/O access to the guest, 
it shouldn't allow it for nested guests either.
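In other words the bitmap handed to the hardware should be the union of what 
L0 and L1 want to intercept, roughly like this (sketch; mapping and 
unmapping of the guest's bitmap page is omitted, and the same OR applies to 
the two I/O bitmap pages):

	static void nested_merge_msr_bitmap(unsigned long *merged,
					    const unsigned long *l0_bitmap,
					    const unsigned long *l1_bitmap)
	{
		int i;

		/* a set bit means "exit on this MSR", so OR-ing keeps every
		 * intercept that either L0 or L1 asked for */
		for (i = 0; i < PAGE_SIZE / sizeof(long); i++)
			merged[i] = l0_bitmap[i] | l1_bitmap[i];
	}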

> +
> +	if (src->vm_entry_msr_load_count > 0) {
> +		struct page *page;
> +
> +		page = nested_get_page(vcpu,
> +				       src->vm_entry_msr_load_addr);
> +		if (!page)
> +			return 1;
> +
> +		vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
>    

Luckily we don't use the msr autoload stuff.  If we did we'd have to 
merge it too.  But we have to emulate those loads (via vmx_set_msr); the 
guest can easily load bad MSRs, which would kill the host.

> +	if (src->virtual_apic_page_addr != 0) {
> +		struct page *page;
> +
> +		page = nested_get_page(vcpu,
> +				       src->virtual_apic_page_addr);
> +		if (!page)
> +			return 1;
> +
> +		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
> +
> +		kvm_release_page_clean(page);
> +	}  else {
> +		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
> +			     src->virtual_apic_page_addr);
> +	}
>    

Don't understand the special zero value.

> +
> +	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> +		     (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
> +		      src->pin_based_vm_exec_control));
> +
> +	exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
> +
> +	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +
> +	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +
> +	exec_control &= ~CPU_BASED_TPR_SHADOW;
>    

Why?

> +	if (enable_vpid) {
> +		if (vmx->nested.l2_state->vpid == 0) {
> +			allocate_vpid(vmx);
> +			vmx->nested.l2_state->vpid = vmx->vpid;
>    

What if the guest has a nonzero vpid?

> +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> +			     bool is_interrupt)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int initial_pfu_active = vcpu->fpu_active;
> +
> +	if (!vmx->nested.nested_mode) {
> +		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> +		       __func__);
> +		return 0;
> +	}
> +
> +	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> +
> +	sync_cached_regs_to_vmcs(vcpu);
> +
> +	prepare_vmcs_12(vcpu);
> +	if (is_interrupt)
> +		vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
> +			EXIT_REASON_EXTERNAL_INTERRUPT;
>    

Need to auto-ack the interrupt if requested by the guest.
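Roughly (sketch, reusing the shadow_vmcs fields from the patch; whether 
kvm_cpu_get_interrupt() is the right place to pull and acknowledge the 
vector here is an assumption):

	if (is_interrupt &&
	    (vmx->nested.l2_state->shadow_vmcs->vm_exit_controls &
	     VM_EXIT_ACK_INTR_ON_EXIT)) {
		int vector = kvm_cpu_get_interrupt(vcpu);

		/* hand the acknowledged vector to L1 via the exit info field */
		vmx->nested.l2_state->shadow_vmcs->vm_exit_intr_info =
			vector | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK;
	}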



-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: Nested VMX support - kernel v1
  2009-09-02 15:38 Nested VMX support - kernel v1 oritw
  2009-09-02 15:38 ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff oritw
  2009-09-02 15:57 ` Nested VMX support - kernel v1 Alexander Graf
@ 2009-09-02 21:39 ` Avi Kivity
  2 siblings, 0 replies; 31+ messages in thread
From: Avi Kivity @ 2009-09-02 21:39 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, muli, abelg, aliguori, mmday

On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> The following patches implement nested VMX support. The patches enable a guest
> to use the VMX APIs in order to run its own nested guest (i.e., enable running
> other hypervisors which use VMX under KVM). The current patches support running
> Linux under a nested KVM using shadow page table (with bypass_guest_pf
> disabled). SMP support was fixed.  Reworking EPT support to mesh cleanly with
> the current shadow paging design per Avi's comments is a work-in-progress.
>
> The current patches only support a single nested hypervisor, which can only run
> a single guest (multiple guests are work in progress). Only 64-bit nested
> hypervisors are supported.
>
> Additional patches for running Windows under nested KVM, and Linux under nested
> VMware server(!), are currently running in the lab. We are in the process of
> forward-porting those patches to -tip.
>
> This patches were written by:
>       Orit Wasserman, oritw@il.ibm.com
>       Ben-Ami Yassor, benami@il.ibm.com
>       Abel Gordon, abelg@il.ibm.com
>       Muli Ben-Yehuda, muli@il.ibm.com
>
> With contributions by:
>       Anthony Liguori, aliguori@us.ibm.com
>       Mike Day, mmday@us.ibm.com
>
> This work was inspired by the nested SVM support by Alexander Graf and Joerg
> Roedel.
> Changes since v1:
>       SMP support.
>       Split into 6 smaller patches.
>       Use nested_vmx structure for nested parameters.
>       Use Array for shadow VMCS offsets.
>    

Thanks for splitting, much easier to review.  As I come to understand 
the code, you'll get better reviews.

Patch 6 didn't make it to the list.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: Nested VMX support - kernel v1
  2009-09-02 15:57 ` Nested VMX support - kernel v1 Alexander Graf
@ 2009-09-03  6:01   ` Muli Ben-Yehuda
  2009-09-03  7:29     ` Alexander Graf
  0 siblings, 1 reply; 31+ messages in thread
From: Muli Ben-Yehuda @ 2009-09-03  6:01 UTC (permalink / raw)
  To: Alexander Graf
  Cc: Orit Wasserman, kvm, Ben-Ami Yassour1, Abel Gordon, aliguori, mmday

On Wed, Sep 02, 2009 at 05:57:39PM +0200, Alexander Graf wrote:
>
> On 02.09.2009 at 17:38, oritw@il.ibm.com wrote:
>
>> The following patches implement nested VMX support. The patches
>> enable a guest to use the VMX APIs in order to run its own nested
>> guest (i.e., enable running other hypervisors which use VMX under
>> KVM).
>
> Cool! Great job here. I was expecting vmcs load/stores to kill
> performance, but apparently I was wrong. How did you get those fast?

Are you asking about vmptrld (switching the VMCS's) or the costs of
trapping vmread/vmwrites?

>> The current patches support running Linux under a nested KVM using
>> shadow page table (with bypass_guest_pf disabled).
>
> What is keeping you from running the other hypervisors, guests?

A simple matter of programming (or more accurately, debugging). There
are no fundamental limitations that preclude running other hypervisors
or multiple guests, but we've been concentrating on getting a single
hypervisor (both KVM and VMware) performing well first.

>> SMP support was fixed.  Reworking EPT support to mesh cleanly with
>> the current shadow paging design per Avi's comments is a
>> work-in-progress.
>>
>> The current patches only support a single nested hypervisor
>
> Why?

See above---no fundamental limitation---but needs more work. Bug
reports happily accepted, patches even more so :-)

>> , which can only run a single guest (multiple guests are work in
>> progress). Only 64-bit nested hypervisors are supported.
>>
>> Additional patches for running Windows under nested KVM, and Linux
>> under nested VMware server(!),
>
> GSX or ESX? With ESX I was running into TSC timing issues because
> ESX didn't like the fact it got preempted :-).

GSX. I expect the issues you've run into are not SVM specific, so
likely they'll need to be addressed for VMX as well.

> How about Hyper-V and Xen?

We haven't tried them.

> Again, great job and congratulations on making this work!

Thank you, your patches were very useful!

Cheers,
Muli
-- 
Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
Manager, Virtualization and Systems Architecture
Master Inventor, IBM Haifa Research Laboratory

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: Nested VMX support - kernel v1
  2009-09-03  6:01   ` Muli Ben-Yehuda
@ 2009-09-03  7:29     ` Alexander Graf
  2009-09-03  9:53       ` Muli Ben-Yehuda
  0 siblings, 1 reply; 31+ messages in thread
From: Alexander Graf @ 2009-09-03  7:29 UTC (permalink / raw)
  To: Muli Ben-Yehuda
  Cc: Orit Wasserman, kvm, Ben-Ami Yassour1, Abel Gordon, aliguori, mmday


On 03.09.2009, at 08:01, Muli Ben-Yehuda wrote:

> On Wed, Sep 02, 2009 at 05:57:39PM +0200, Alexander Graf wrote:
>>
>> On 02.09.2009 at 17:38, oritw@il.ibm.com wrote:
>>
>>> The following patches implement nested VMX support. The patches
>>> enable a guest to use the VMX APIs in order to run its own nested
>>> guest (i.e., enable running other hypervisors which use VMX under
>>> KVM).
>>
>> Cool! Great job here. I was expecting vmcs load/stores to kill
>> performance, but apparently I was wrong. How did you get those fast?
>
> Are you asking about vmptrld (switching the VMCS's) or the costs of
> trapping vmread/vmwrites?

vmptrld shouldn't really be too much of a problem. Just handle it as a  
reset of the shadow vmcs and you're good.

No, what I was wondering about was vmread & vmwrite. Those probably  
trap quite a lot and from what I've seen with nested SVM, trapping on  
the VMEXIT path is horribly slow, especially on shadow shadow paging,  
because you just get so many of them.

>>> SMP support was fixed.  Reworking EPT support to mesh cleanly with
>>> the current shadow paging design per Avi's comments is a
>>> work-in-progress.
>>>
>>> The current patches only support a single nested hypervisor
>>
>> Why?
>
> See above---no fundamental limitation---but needs more work. Bug
> reports happily accepted, patches even more so :-)

Well, maybe I understand the wording. Does "a single nested  
hypervisor" mean "one user of VMX per VCPU"?

If so, it's only vmptrld that's not really well implemented.

It does sound as if you only support one nested hypervisor throughout  
all VMs which wouldn't make sense, since all nested data should be  
vcpu local.

>> How about Hyper-V and Xen?
>
> We haven't tried them.

It might be worth giving Xen a try. I found it to be the second  
easiest target (after KVM).

>> Again, great job and congratulations on making this work!
>
> Thank you, your patches were very useful!

It's good to see that they inspired you. In fact I even saw quite a few  
structural resemblances in the source code :-).

Will you be at the LPC where I'll be giving a talk about nested SVM?

I'd love to get you on the stage so you get a chance to tell people  
that this even works for VMX. Last time I gave a talk on that topic I  
could merely say that no such thing existed.

Alex

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: Nested VMX support - kernel v1
  2009-09-03  7:29     ` Alexander Graf
@ 2009-09-03  9:53       ` Muli Ben-Yehuda
  2009-09-06 19:28         ` Anthony Liguori
  0 siblings, 1 reply; 31+ messages in thread
From: Muli Ben-Yehuda @ 2009-09-03  9:53 UTC (permalink / raw)
  To: Alexander Graf
  Cc: Orit Wasserman, kvm, Ben-Ami Yassour1, Abel Gordon, aliguori,
	Mike D. Day

On Thu, Sep 03, 2009 at 09:29:08AM +0200, Alexander Graf wrote:
>
> On 03.09.2009, at 08:01, Muli Ben-Yehuda wrote:
>
>> On Wed, Sep 02, 2009 at 05:57:39PM +0200, Alexander Graf wrote:
>>>
>>> On 02.09.2009 at 17:38, oritw@il.ibm.com wrote:
>>>
>>>> The following patches implement nested VMX support. The patches
>>>> enable a guest to use the VMX APIs in order to run its own nested
>>>> guest (i.e., enable running other hypervisors which use VMX under
>>>> KVM).
>>>
>>> Cool! Great job here. I was expecting vmcs load/stores to kill
>>> performance, but apparently I was wrong. How did you get those
>>> fast?
>>
>> Are you asking about vmptrld (switching the VMCS's) or the costs of
>> trapping vmread/vmwrites?
>
> vmptrld shouldn't really be too much of a problem. Just handle it as a
> reset of the shadow vmcs and you're good.
>
> No, what I was wondering about was vmread & vmwrite. Those probably
> trap quite a lot and from what I've seen with nested SVM, trapping
> on the VMEXIT path is horribly slow, especially on shadow shadow
> paging, because you just get so many of them.

Nested EPT helps compared to shadow by removing many page fault exits
and their associated vmreads and vmwrites. Other than that I don't
recall we've done anything specific to reduce the overhead of vmreads
and vmwrites. Somewhat to our surprise, it turns out that with nested
EPT, given the cost of a single vmread and vmwrite on Nehalem class
machines, and more importantly the frequency and distribution of
vmreads and vmwrites, performance results are acceptable even with a
straightforward implementation. Having said that, for pathological
cases such as L2 workloads which are dominated by the L2 vmexit costs,
trapping on every L1 vmread and vmwrite will be horrendously
expensive.

>>>> SMP support was fixed.  Reworking EPT support to mesh cleanly
>>>> with the current shadow paging design per Avi's comments is a
>>>> work-in-progress.
>>>>
>>>> The current patches only support a single nested hypervisor
>>>
>>> Why?
>>
>> See above---no fundamental limitation---but needs more work. Bug
>> reports happily accepted, patches even more so :-)
>
> Well, maybe I understand the wording. Does "a single nested
> hypervisor" mean "one user of VMX per VCPU"?
>
> If so, it's only vmptrld that's not really well implemented.
>
> It does sound as if you only support one nested hypervisor
> throughout all VMs which wouldn't make sense, since all nested data
> should be vcpu local.

We only support one nested hypervisor throughout all VMs, but this is
a statement about what we've currently implemented and tested, not a
fundamental design limitation. Supporting multiple nested hypervisors
shouldn't be particularly difficult, except we might have taken some
shortcuts such as using global data rather than vcpu local data that
will need to be fixed. It's on the roadmap :-)

>>> How about Hyper-V and Xen?
>>
>> We haven't tried them.
>
> It might be worth giving Xen a try. I found it being the second
> easiest target (after KVM).

Thanks, Xen is also on the roadmap but way down the list.

>>> Again, great job and congratulations on making this work!
>>
>> Thank you, your patches were very useful!
>
> It's good to see that they inspired you. In fact I even saw quite
> some structure resemblances in the source code :-).
>
> Will you be at the LPC where I'll be giving a talk about nested SVM?
>
> I'd love to get you on the stage so you get a chance of telling
> people that this even works for VMX. Last time I gave a talk on that
> topic I could merely say that no such thing existed.

Unfortunately I don't think anyone from Haifa will be there. Perhaps
Anthony or Mike (CC'd) will be there?

Cheers,
Muli
-- 
Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
Manager, Virtualization and Systems Architecture
Master Inventor, IBM Haifa Research Laboratory

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff
  2009-09-02 19:34   ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
@ 2009-09-03 12:34     ` Orit Wasserman
  2009-09-03 13:39       ` Avi Kivity
  0 siblings, 1 reply; 31+ messages in thread
From: Orit Wasserman @ 2009-09-03 12:34 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 02/09/2009 22:34:58:

> From:
>
> Avi Kivity <avi@redhat.com>
>
> To:
>
> Orit Wasserman/Haifa/IBM@IBMIL
>
> Cc:
>
> kvm@vger.kernel.org, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Muli Ben-
> Yehuda/Haifa/IBM@IBMIL, Abel Gordon/Haifa/IBM@IBMIL,
> aliguori@us.ibm.com, mmday@us.ibm.com
>
> Date:
>
> 02/09/2009 22:34
>
> Subject:
>
> Re: [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff
>
> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > From: Orit Wasserman<oritw@il.ibm.com>
> >
> > ---
> >   arch/x86/kvm/svm.c |    3 -
> >   arch/x86/kvm/vmx.c |  187 ++++++++++++++++++++++++++++++++++++++
> +++++++++++++-
> >   arch/x86/kvm/x86.c |    6 ++-
> >   arch/x86/kvm/x86.h |    2 +
> >   4 files changed, 192 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 2df9b45..3c1f22a 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -124,9 +124,6 @@ static int npt = 1;
> >
> >   module_param(npt, int, S_IRUGO);
> >
> > -static int nested = 1;
> > -module_param(nested, int, S_IRUGO);
> > -
> >   static void svm_flush_tlb(struct kvm_vcpu *vcpu);
> >   static void svm_complete_interrupts(struct vcpu_svm *svm);
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 78101dd..abba325 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -67,6 +67,11 @@ struct vmcs {
> >      char data[0];
> >   };
> >
> > +struct nested_vmx {
> > +   /* Has the level1 guest done vmon? */
> > +   bool vmon;
> > +};
> >
>
> vmxon
fixed
>
> > @@ -967,6 +975,69 @@ static void guest_write_tsc(u64 guest_tsc,
> u64 host_tsc)
> >   }
> >
> >   /*
> > + * Handles msr read for nested virtualization
> > + */
> > +static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
> > +               u64 *pdata)
> > +{
> > +   u32 vmx_msr_low = 0, vmx_msr_high = 0;
> > +
> > +   switch (msr_index) {
> > +   case MSR_IA32_FEATURE_CONTROL:
> > +      *pdata = 0;
> > +      break;
> > +   case MSR_IA32_VMX_BASIC:
> > +      rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
> >
>
> Use rdmsrl, it's easier.
fixed
>
> I think we need to mask it with the capabilities we support.  Otherwise
> the guest can try to use some new feature which we don't support yet,
> and crash.
I agree, but I went over the Intel spec and didn't find any problematic
feature.
We may need to consider it in the future.
>
> > +      *pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
> > +      break;
> > +   case MSR_IA32_VMX_PINBASED_CTLS:
> > +      *pdata = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
> > +         PIN_BASED_VIRTUAL_NMIS;
> >
>
> Need to mask with actual cpu capabilities in case we run on an older cpu.
fixed
>
> > +      break;
> > +   case MSR_IA32_VMX_PROCBASED_CTLS:
> > +      *pdata =  CPU_BASED_HLT_EXITING |
> > +#ifdef CONFIG_X86_64
> > +         CPU_BASED_CR8_LOAD_EXITING |
> > +         CPU_BASED_CR8_STORE_EXITING |
> > +#endif
> > +         CPU_BASED_CR3_LOAD_EXITING |
> > +         CPU_BASED_CR3_STORE_EXITING |
> > +         CPU_BASED_USE_IO_BITMAPS |
> > +         CPU_BASED_MOV_DR_EXITING |
> > +         CPU_BASED_USE_TSC_OFFSETING |
> > +         CPU_BASED_INVLPG_EXITING;
> >
>
> Same here... or are all these guaranteed to be present?
fixed
>
> > +
> > +static int handle_vmon(struct kvm_vcpu *vcpu)
> > +{
> > +   struct kvm_segment cs;
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +
> > +   if (!nested) {
> > +      printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
> > +      kvm_queue_exception(vcpu, UD_VECTOR);
> > +      return 1;
> > +   }
> > +
> > +   vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
> > +
> > +   if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
> > +       !(vcpu->arch.cr0 & X86_CR0_PE) ||
> > +       (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
> > +       ((find_msr_entry(to_vmx(vcpu),
> > +              MSR_EFER)->data & EFER_LMA) && !cs.l)) {
> >
>
> Not everyone has EFER.  Better to wrap this in an #ifdef CONFIG_X86_64.
fixed
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff
  2009-09-03 12:34     ` Orit Wasserman
@ 2009-09-03 13:39       ` Avi Kivity
  2009-09-03 14:54         ` Orit Wasserman
  0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-03 13:39 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda

On 09/03/2009 03:34 PM, Orit Wasserman wrote:
>
>> I think we need to mask it with the capabilities we support.  Otherwise
>> the guest can try to use some new feature which we don't support yet,
>> and crash.
>>      
> I agree , but I went over the Intel spec and didn't find any problematic
> feature.
> We may need to consider it in the future.
>    

We need to do it, since we don't know anything about future processors.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/6] Nested VMX patch 2 implements vmclear
  2009-09-02 19:38     ` [PATCH 2/6] Nested VMX patch 2 implements vmclear Avi Kivity
@ 2009-09-03 13:54       ` Orit Wasserman
  0 siblings, 0 replies; 31+ messages in thread
From: Orit Wasserman @ 2009-09-03 13:54 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 02/09/2009 22:38:22:

> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > From: Orit Wasserman<oritw@il.ibm.com>
> >
> > ---
> >   arch/x86/kvm/vmx.c |   24 +++++++++++++++++++++++-
> >   1 files changed, 23 insertions(+), 1 deletions(-)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index abba325..2b1fc3b 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -70,6 +70,8 @@ struct vmcs {
> >   struct nested_vmx {
> >      /* Has the level1 guest done vmon? */
> >      bool vmon;
> > +   /* Has the level1 guest done vmclear? */
> > +   bool vmclear;
> >   };
> >
>
> Doesn't seem these two belong in the same structure - vmclear is
> per-vmcs... but you're probably aware of that with the multi-guest
> support coming.
You are right, the vmclear flag is part of the L2 guest state.
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-02 20:05       ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
@ 2009-09-03 14:25         ` Orit Wasserman
  2009-09-06  9:25           ` Avi Kivity
  0 siblings, 1 reply; 31+ messages in thread
From: Orit Wasserman @ 2009-09-03 14:25 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 02/09/2009 23:05:09:

> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > +struct __attribute__ ((__packed__)) level_state {
> > +   struct shadow_vmcs *shadow_vmcs;
> > +
> > +   u16 vpid;
> > +   u64 shadow_efer;
> > +   unsigned long cr2;
> > +   unsigned long cr3;
> > +   unsigned long cr4;
> > +   unsigned long cr8;
> > +
> > +   u64 io_bitmap_a;
> > +   u64 io_bitmap_b;
> > +   u64 msr_bitmap;
> > +
> > +   struct vmcs *vmcs;
> > +   int cpu;
> > +   int launched;
> > +};
> >
>
>
>
> > +
> >   struct vmcs {
> >      u32 revision_id;
> >      u32 abort;
> > @@ -72,6 +217,17 @@ struct nested_vmx {
> >      bool vmon;
> >      /* Has the level1 guest done vmclear? */
> >      bool vmclear;
> > +   /* What is the location of the vmcs l1 keeps for l2? (in level1 gpa) */
> > +   u64 l1_cur_vmcs;
> >
>
> This is the vmptr (exactly as loaded by vmptrld), right?  If so, please
> call it vmptr.
OK I will change it.
>
> > +   /*
> > +    * Level 2 state : includes vmcs,registers and
> > +    * a copy of vmcs12 for vmread/vmwrite
> > +    */
> > +   struct level_state *l2_state;
> > +
> > +   /* Level 1 state for switching to level 2 and back */
> > +   struct level_state *l1_state;
> >
>
> Can you explain why we need two of them?  in the guest vmcs we have host
> and guest values, and in l1_state and l2_state we have more copies, and
> in struct vcpu we have yet another set of copies.  We also have a couple
> of copies in the host vmcs.  I'm getting dizzy...
l2_state stores all the L2 guest state:
      vmcs - a pointer to VMCS02, the VMCS L0 uses to run L2.
      shadow_vmcs - a structure storing the values of VMCS12 (the VMCS L1
creates to run L2).
      cpu - the cpu id
      launched - the launched flag
      vpid - the vpid allocated by L0 for L2 (we need to store it somewhere)
      msr_bitmap - at the moment we use the L0 msr_bitmap (as we are running
kvm on kvm); in the future we will use a merge of both bitmaps.
      io_bitmaps - at the moment we use the L0 io_bitmaps (as we are running
kvm on kvm); in the future we will use a merge of both io_bitmaps.

l1_state stores the L1 state:
      vmcs - a pointer to VMCS01
      shadow_vmcs - a structure storing the values of VMCS01. We use it
when updating VMCS02 in order to avoid the need to switch between VMCS02
and VMCS01.
      cpu - the cpu id
      launched - the launched flag
      vpid - the vpid allocated by L0 for L1 (we need to store it somewhere)
      shadow_efer - until recently EFER wasn't part of the VMCS and it may
be needed for older processors.
      cr0 - not used, I will remove it
      cr2 - not used, I will remove it
      cr3
      cr4

We didn't use the state stored in the vcpu for L1 because sometimes it
changes while L2 runs.
The vmcs pointer in the vcpu points to the active VMCS: it points to VMCS01
when L1 is running and to VMCS02 when the L2 guest is running.
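To make the pointer juggling concrete, something like this rough sketch
(the helper names are made up, not from the patches):

/* Sketch: the vcpu's active vmcs is swapped between the copies kept in
 * l1_state and l2_state on nested entry and exit. */
static void nested_activate_l2_vmcs(struct vcpu_vmx *vmx)
{
	vmx->nested.l1_state->vmcs = vmx->vmcs;	/* remember VMCS01 */
	vmx->vmcs = vmx->nested.l2_state->vmcs;	/* run on VMCS02 */
}

static void nested_activate_l1_vmcs(struct vcpu_vmx *vmx)
{
	vmx->vmcs = vmx->nested.l1_state->vmcs;	/* back to VMCS01 */
}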

>
>
> >   static int init_rmode(struct kvm *kvm);
> >   static u64 construct_eptp(unsigned long root_hpa);
> >
> >
> >
> > +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> > +{
> > +   gpa_t gpa;
> > +   struct page *page;
> > +   int r = 0;
> > +
> > +   gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
> > +
> > +   /* checking guest gpa */
> > +   page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> > +   if (is_error_page(page)) {
> > +      printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
> > +             __func__, gpa);
> > +      r = 1;
> > +      goto out;
> > +   }
> > +
> > +   r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
> > +   if (r) {
> > +      printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
> > +             __func__, gpa, r);
> > +      goto out;
> > +   }
> >
>
> You can use kvm_read_guest_virt() to simplify this.
I will fix it.
>
> > +
> > +   if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> > +      printk(KERN_DEBUG "%s addr %llx not aligned\n",
> > +             __func__, *gentry);
> > +      return 1;
> > +   }
> > +
> > +out:
> > +   kvm_release_page_clean(page);
> > +   return r;
> > +}
> > +
> > +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct page *vmcs_page;
> > +   u64 guest_vmcs_addr;
> > +
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
> > +      return 1;
> > +
> > +   if (create_l1_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (create_l2_state(vcpu)) {
> > +      printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   vmx->nested.l2_state->vmcs = alloc_vmcs();
> > +   if (!vmx->nested.l2_state->vmcs) {
> > +      printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> > +      return 1;
> > +   }
> > +
> > +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> > +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> > +      if (vmcs_page == NULL)
> > +         return 1;
> > +
> > +      /* load nested vmcs to processor */
> > +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
> >
>
> So, you're loading a guest page as the vmcs.  This is dangerous as the
> guest can play with it.  Much better to use inaccessible memory (and you
> do alloc_vmcs() earlier?)
We can copy the vmcs and then vmptrld it. As for the allocated vmcs, this is
a memory leak and I will fix it (it should be allocated only once).
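Roughly, the copy-then-vmptrld idea could look like this (sketch only; it
reuses nested_get_page() and the vmptrld() helper from the patch):

	/* Sketch: copy the guest-supplied VMCS page into the host-private
	 * vmcs allocated above, so the guest cannot modify it under us. */
	char *va;

	vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
	if (vmcs_page == NULL)
		return 1;
	va = kmap(vmcs_page);
	memcpy(vmx->nested.l2_state->vmcs, va, PAGE_SIZE);
	kunmap(vmcs_page);
	kvm_release_page_clean(vmcs_page);
	if (vmptrld(vcpu, __pa(vmx->nested.l2_state->vmcs)))
		return 1;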

> > +
> > +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> > +{
> > +   if (!nested_vmx_check_permission(vcpu))
> > +      return 1;
> > +
> > +   vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
> >
>
> Should store to mem64 according to the docs?
>
> Better done through the emulator.
Sure I will fix it.
>
> > +void save_vmcs(struct shadow_vmcs *dst)
> > +{
> > +   dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> > +   dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +   dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> > +   dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> > +   dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> > +   dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> > +   dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> > +   dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> > +   dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> > +   dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> > +   dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> > +   if (cpu_has_vmx_msr_bitmap())
> > +      dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> > +
> > +   dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> > +   dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> > +   dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> > +   dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> > +   dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> > +   if (enable_ept)
> > +      dst->ept_pointer = vmcs_read64(EPT_POINTER);
> > +
> > +   dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   if (enable_ept) {
> > +      dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> > +      dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> > +      dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> > +      dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> > +   }
> > +   dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> > +   dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> > +   dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> > +   dst->page_fault_error_code_mask =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> > +   dst->page_fault_error_code_match =
> > +      vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> > +   dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> > +   dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> > +   dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> > +   dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> > +   dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> > +   dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> > +   dst->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> > +   dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> > +   dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> > +   if (enable_vpid && dst->secondary_vm_exec_control &
> > +       SECONDARY_EXEC_ENABLE_VPID)
> > +      dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> > +   dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> > +   dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +   dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> > +   dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> > +   dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> > +   dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   dst->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +   dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> > +   dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> > +   dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> > +   dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> > +   dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> > +   dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> > +   dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> > +   dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > +   dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> > +   dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +   dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> > +   dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   dst->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   dst->guest_rip = vmcs_readl(GUEST_RIP);
> > +   dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   dst->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> > +   dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> > +   dst->host_cr0 = vmcs_readl(HOST_CR0);
> > +   dst->host_cr3 = vmcs_readl(HOST_CR3);
> > +   dst->host_cr4 = vmcs_readl(HOST_CR4);
> > +   dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> > +   dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> > +   dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> > +   dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> > +   dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> > +   dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> > +   dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> > +   dst->host_rsp = vmcs_readl(HOST_RSP);
> > +   dst->host_rip = vmcs_readl(HOST_RIP);
> > +   if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
> > +      dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> > +}
> >
>
> I see.  You're using the processor's format when reading the guest
> vmcs.  But we don't have to do that, we can use the shadow_vmcs
> structure (and a memcpy).
I'm sorry, I don't understand your comment; can you elaborate?
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite
  2009-09-02 20:15         ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
@ 2009-09-03 14:26           ` Orit Wasserman
  0 siblings, 0 replies; 31+ messages in thread
From: Orit Wasserman @ 2009-09-03 14:26 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 02/09/2009 23:15:40:

> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > +
> > +static void init_vmcs_field_to_offset_table(void)
> > +{
> > +   memset(vmcs_field_to_offset_table,0xff,
> > +          sizeof(vmcs_field_to_offset_table));
> > +
> > +   vmcs_field_to_offset_table[VIRTUAL_PROCESSOR_ID] =
> > +      offsetof(struct shadow_vmcs, virtual_processor_id);
> > +   vmcs_field_to_offset_table[GUEST_ES_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_es_selector);
> > +   vmcs_field_to_offset_table[GUEST_CS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_cs_selector);
> > +   vmcs_field_to_offset_table[GUEST_SS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_ss_selector);
> > +   vmcs_field_to_offset_table[GUEST_DS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_ds_selector);
> > +   vmcs_field_to_offset_table[GUEST_FS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_fs_selector);
> > +   vmcs_field_to_offset_table[GUEST_GS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_gs_selector);
> > +   vmcs_field_to_offset_table[GUEST_LDTR_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_ldtr_selector);
> > +   vmcs_field_to_offset_table[GUEST_TR_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, guest_tr_selector);
> > +   vmcs_field_to_offset_table[HOST_ES_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, host_es_selector);
> > +   vmcs_field_to_offset_table[HOST_CS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, host_cs_selector);
> > +   vmcs_field_to_offset_table[HOST_SS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, host_ss_selector);
> > +   vmcs_field_to_offset_table[HOST_DS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, host_ds_selector);
> > +   vmcs_field_to_offset_table[HOST_FS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, host_fs_selector);
> > +   vmcs_field_to_offset_table[HOST_GS_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, host_gs_selector);
> > +   vmcs_field_to_offset_table[HOST_TR_SELECTOR] =
> > +      offsetof(struct shadow_vmcs, host_tr_selector);
> > +   vmcs_field_to_offset_table[IO_BITMAP_A] =
> > +      offsetof(struct shadow_vmcs, io_bitmap_a);
> > +   vmcs_field_to_offset_table[IO_BITMAP_A_HIGH] =
> > +      offsetof(struct shadow_vmcs, io_bitmap_a)+4;
> > +   vmcs_field_to_offset_table[IO_BITMAP_B] =
> > +      offsetof(struct shadow_vmcs, io_bitmap_b);
> > +   vmcs_field_to_offset_table[IO_BITMAP_B_HIGH] =
> > +      offsetof(struct shadow_vmcs, io_bitmap_b)+4;
> > +   vmcs_field_to_offset_table[MSR_BITMAP] =
> > +      offsetof(struct shadow_vmcs, msr_bitmap);
> > +   vmcs_field_to_offset_table[MSR_BITMAP_HIGH] =
> > +      offsetof(struct shadow_vmcs, msr_bitmap)+4;
> > +   vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_ADDR] =
> > +      offsetof(struct shadow_vmcs, vm_exit_msr_store_addr);
> > +   vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_ADDR_HIGH] =
> > +      offsetof(struct shadow_vmcs, vm_exit_msr_store_addr)+4;
> > +   vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_ADDR] =
> > +      offsetof(struct shadow_vmcs, vm_exit_msr_load_addr);
> > +   vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_ADDR_HIGH] =
> > +      offsetof(struct shadow_vmcs, vm_exit_msr_load_addr)+4;
> > +   vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_ADDR] =
> > +      offsetof(struct shadow_vmcs, vm_entry_msr_load_addr);
> > +   vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_ADDR_HIGH] =
> > +      offsetof(struct shadow_vmcs, vm_entry_msr_load_addr)+4;
> > +   vmcs_field_to_offset_table[TSC_OFFSET] =
> > +      offsetof(struct shadow_vmcs, tsc_offset);
> > +   vmcs_field_to_offset_table[TSC_OFFSET_HIGH] =
> > +      offsetof(struct shadow_vmcs, tsc_offset)+4;
> > +   vmcs_field_to_offset_table[VIRTUAL_APIC_PAGE_ADDR] =
> > +      offsetof(struct shadow_vmcs, virtual_apic_page_addr);
> > +   vmcs_field_to_offset_table[VIRTUAL_APIC_PAGE_ADDR_HIGH] =
> > +      offsetof(struct shadow_vmcs, virtual_apic_page_addr)+4;
> > +   vmcs_field_to_offset_table[APIC_ACCESS_ADDR] =
> > +      offsetof(struct shadow_vmcs, apic_access_addr);
> > +   vmcs_field_to_offset_table[APIC_ACCESS_ADDR_HIGH] =
> > +      offsetof(struct shadow_vmcs, apic_access_addr)+4;
> > +   vmcs_field_to_offset_table[EPT_POINTER] =
> > +      offsetof(struct shadow_vmcs, ept_pointer);
> > +   vmcs_field_to_offset_table[EPT_POINTER_HIGH] =
> > +      offsetof(struct shadow_vmcs, ept_pointer)+4;
> > +   vmcs_field_to_offset_table[GUEST_PHYSICAL_ADDRESS] =
> > +      offsetof(struct shadow_vmcs, guest_physical_address);
> > +   vmcs_field_to_offset_table[GUEST_PHYSICAL_ADDRESS_HIGH] =
> > +      offsetof(struct shadow_vmcs, guest_physical_address)+4;
> > +   vmcs_field_to_offset_table[VMCS_LINK_POINTER] =
> > +      offsetof(struct shadow_vmcs, vmcs_link_pointer);
> > +   vmcs_field_to_offset_table[VMCS_LINK_POINTER_HIGH] =
> > +      offsetof(struct shadow_vmcs, vmcs_link_pointer)+4;
> > +   vmcs_field_to_offset_table[GUEST_IA32_DEBUGCTL] =
> > +      offsetof(struct shadow_vmcs, guest_ia32_debugctl);
> > +   vmcs_field_to_offset_table[GUEST_IA32_DEBUGCTL_HIGH] =
> > +      offsetof(struct shadow_vmcs, guest_ia32_debugctl)+4;
> > +   vmcs_field_to_offset_table[GUEST_IA32_PAT] =
> > +      offsetof(struct shadow_vmcs, guest_ia32_pat);
> > +   vmcs_field_to_offset_table[GUEST_IA32_PAT_HIGH] =
> > +      offsetof(struct shadow_vmcs, guest_ia32_pat)+4;
> > +   vmcs_field_to_offset_table[GUEST_PDPTR0] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr0);
> > +   vmcs_field_to_offset_table[GUEST_PDPTR0_HIGH] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr0)+4;
> > +   vmcs_field_to_offset_table[GUEST_PDPTR1] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr1);
> > +   vmcs_field_to_offset_table[GUEST_PDPTR1_HIGH] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr1)+4;
> > +   vmcs_field_to_offset_table[GUEST_PDPTR2] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr2);
> > +   vmcs_field_to_offset_table[GUEST_PDPTR2_HIGH] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr2)+4;
> > +   vmcs_field_to_offset_table[GUEST_PDPTR3] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr3);
> > +   vmcs_field_to_offset_table[GUEST_PDPTR3_HIGH] =
> > +      offsetof(struct shadow_vmcs, guest_pdptr3)+4;
> > +   vmcs_field_to_offset_table[HOST_IA32_PAT] =
> > +      offsetof(struct shadow_vmcs, host_ia32_pat);
> > +   vmcs_field_to_offset_table[HOST_IA32_PAT_HIGH] =
> > +      offsetof(struct shadow_vmcs, host_ia32_pat)+4;
> > +   vmcs_field_to_offset_table[PIN_BASED_VM_EXEC_CONTROL] =
> > +      offsetof(struct shadow_vmcs, pin_based_vm_exec_control);
> > +   vmcs_field_to_offset_table[CPU_BASED_VM_EXEC_CONTROL] =
> > +      offsetof(struct shadow_vmcs, cpu_based_vm_exec_control);
> > +   vmcs_field_to_offset_table[EXCEPTION_BITMAP] =
> > +      offsetof(struct shadow_vmcs, exception_bitmap);
> > +   vmcs_field_to_offset_table[PAGE_FAULT_ERROR_CODE_MASK] =
> > +      offsetof(struct shadow_vmcs, page_fault_error_code_mask);
> > +   vmcs_field_to_offset_table[PAGE_FAULT_ERROR_CODE_MATCH] =
> > +      offsetof(struct shadow_vmcs,
> > +            page_fault_error_code_match);
> > +   vmcs_field_to_offset_table[CR3_TARGET_COUNT] =
> > +      offsetof(struct shadow_vmcs, cr3_target_count);
> > +   vmcs_field_to_offset_table[VM_EXIT_CONTROLS] =
> > +      offsetof(struct shadow_vmcs, vm_exit_controls);
> > +   vmcs_field_to_offset_table[VM_EXIT_MSR_STORE_COUNT] =
> > +      offsetof(struct shadow_vmcs, vm_exit_msr_store_count);
> > +   vmcs_field_to_offset_table[VM_EXIT_MSR_LOAD_COUNT] =
> > +      offsetof(struct shadow_vmcs, vm_exit_msr_load_count);
> > +   vmcs_field_to_offset_table[VM_ENTRY_CONTROLS] =
> > +      offsetof(struct shadow_vmcs, vm_entry_controls);
> > +   vmcs_field_to_offset_table[VM_ENTRY_MSR_LOAD_COUNT] =
> > +      offsetof(struct shadow_vmcs, vm_entry_msr_load_count);
> > +   vmcs_field_to_offset_table[VM_ENTRY_INTR_INFO_FIELD] =
> > +      offsetof(struct shadow_vmcs, vm_entry_intr_info_field);
> > +   vmcs_field_to_offset_table[VM_ENTRY_EXCEPTION_ERROR_CODE] =
> > +      offsetof(struct shadow_vmcs,
> > +            vm_entry_exception_error_code);
> > +   vmcs_field_to_offset_table[VM_ENTRY_INSTRUCTION_LEN] =
> > +      offsetof(struct shadow_vmcs, vm_entry_instruction_len);
> > +   vmcs_field_to_offset_table[TPR_THRESHOLD] =
> > +      offsetof(struct shadow_vmcs, tpr_threshold);
> > +   vmcs_field_to_offset_table[SECONDARY_VM_EXEC_CONTROL] =
> > +      offsetof(struct shadow_vmcs, secondary_vm_exec_control);
> > +   vmcs_field_to_offset_table[VM_INSTRUCTION_ERROR] =
> > +      offsetof(struct shadow_vmcs, vm_instruction_error);
> > +   vmcs_field_to_offset_table[VM_EXIT_REASON] =
> > +      offsetof(struct shadow_vmcs, vm_exit_reason);
> > +   vmcs_field_to_offset_table[VM_EXIT_INTR_INFO] =
> > +      offsetof(struct shadow_vmcs, vm_exit_intr_info);
> > +   vmcs_field_to_offset_table[VM_EXIT_INTR_ERROR_CODE] =
> > +      offsetof(struct shadow_vmcs, vm_exit_intr_error_code);
> > +   vmcs_field_to_offset_table[IDT_VECTORING_INFO_FIELD] =
> > +      offsetof(struct shadow_vmcs, idt_vectoring_info_field);
> > +   vmcs_field_to_offset_table[IDT_VECTORING_ERROR_CODE] =
> > +      offsetof(struct shadow_vmcs, idt_vectoring_error_code);
> > +   vmcs_field_to_offset_table[VM_EXIT_INSTRUCTION_LEN] =
> > +      offsetof(struct shadow_vmcs, vm_exit_instruction_len);
> > +   vmcs_field_to_offset_table[VMX_INSTRUCTION_INFO] =
> > +      offsetof(struct shadow_vmcs, vmx_instruction_info);
> > +   vmcs_field_to_offset_table[GUEST_ES_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_es_limit);
> > +   vmcs_field_to_offset_table[GUEST_CS_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_cs_limit);
> > +   vmcs_field_to_offset_table[GUEST_SS_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_ss_limit);
> > +   vmcs_field_to_offset_table[GUEST_DS_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_ds_limit);
> > +   vmcs_field_to_offset_table[GUEST_FS_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_fs_limit);
> > +   vmcs_field_to_offset_table[GUEST_GS_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_gs_limit);
> > +   vmcs_field_to_offset_table[GUEST_LDTR_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_ldtr_limit);
> > +   vmcs_field_to_offset_table[GUEST_TR_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_tr_limit);
> > +   vmcs_field_to_offset_table[GUEST_GDTR_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_gdtr_limit);
> > +   vmcs_field_to_offset_table[GUEST_IDTR_LIMIT] =
> > +      offsetof(struct shadow_vmcs, guest_idtr_limit);
> > +   vmcs_field_to_offset_table[GUEST_ES_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_es_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_CS_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_cs_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_SS_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_ss_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_DS_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_ds_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_FS_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_fs_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_GS_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_gs_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_LDTR_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_ldtr_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_TR_AR_BYTES] =
> > +      offsetof(struct shadow_vmcs, guest_tr_ar_bytes);
> > +   vmcs_field_to_offset_table[GUEST_INTERRUPTIBILITY_INFO] =
> > +      offsetof(struct shadow_vmcs,
> > +            guest_interruptibility_info);
> > +   vmcs_field_to_offset_table[GUEST_ACTIVITY_STATE] =
> > +      offsetof(struct shadow_vmcs, guest_activity_state);
> > +   vmcs_field_to_offset_table[GUEST_SYSENTER_CS] =
> > +      offsetof(struct shadow_vmcs, guest_sysenter_cs);
> > +   vmcs_field_to_offset_table[HOST_IA32_SYSENTER_CS] =
> > +      offsetof(struct shadow_vmcs, host_ia32_sysenter_cs);
> > +   vmcs_field_to_offset_table[CR0_GUEST_HOST_MASK] =
> > +      offsetof(struct shadow_vmcs, cr0_guest_host_mask);
> > +   vmcs_field_to_offset_table[CR4_GUEST_HOST_MASK] =
> > +      offsetof(struct shadow_vmcs, cr4_guest_host_mask);
> > +   vmcs_field_to_offset_table[CR0_READ_SHADOW] =
> > +      offsetof(struct shadow_vmcs, cr0_read_shadow);
> > +   vmcs_field_to_offset_table[CR4_READ_SHADOW] =
> > +      offsetof(struct shadow_vmcs, cr4_read_shadow);
> > +   vmcs_field_to_offset_table[CR3_TARGET_VALUE0] =
> > +      offsetof(struct shadow_vmcs, cr3_target_value0);
> > +   vmcs_field_to_offset_table[CR3_TARGET_VALUE1] =
> > +      offsetof(struct shadow_vmcs, cr3_target_value1);
> > +   vmcs_field_to_offset_table[CR3_TARGET_VALUE2] =
> > +      offsetof(struct shadow_vmcs, cr3_target_value2);
> > +   vmcs_field_to_offset_table[CR3_TARGET_VALUE3] =
> > +      offsetof(struct shadow_vmcs, cr3_target_value3);
> > +   vmcs_field_to_offset_table[EXIT_QUALIFICATION] =
> > +      offsetof(struct shadow_vmcs, exit_qualification);
> > +   vmcs_field_to_offset_table[GUEST_LINEAR_ADDRESS] =
> > +      offsetof(struct shadow_vmcs, guest_linear_address);
> > +   vmcs_field_to_offset_table[GUEST_CR0] =
> > +      offsetof(struct shadow_vmcs, guest_cr0);
> > +   vmcs_field_to_offset_table[GUEST_CR3] =
> > +      offsetof(struct shadow_vmcs, guest_cr3);
> > +   vmcs_field_to_offset_table[GUEST_CR4] =
> > +      offsetof(struct shadow_vmcs, guest_cr4);
> > +   vmcs_field_to_offset_table[GUEST_ES_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_es_base);
> > +   vmcs_field_to_offset_table[GUEST_CS_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_cs_base);
> > +   vmcs_field_to_offset_table[GUEST_SS_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_ss_base);
> > +   vmcs_field_to_offset_table[GUEST_DS_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_ds_base);
> > +   vmcs_field_to_offset_table[GUEST_FS_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_fs_base);
> > +   vmcs_field_to_offset_table[GUEST_GS_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_gs_base);
> > +   vmcs_field_to_offset_table[GUEST_LDTR_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_ldtr_base);
> > +   vmcs_field_to_offset_table[GUEST_TR_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_tr_base);
> > +   vmcs_field_to_offset_table[GUEST_GDTR_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_gdtr_base);
> > +   vmcs_field_to_offset_table[GUEST_IDTR_BASE] =
> > +      offsetof(struct shadow_vmcs, guest_idtr_base);
> > +   vmcs_field_to_offset_table[GUEST_DR7] =
> > +      offsetof(struct shadow_vmcs, guest_dr7);
> > +   vmcs_field_to_offset_table[GUEST_RSP] =
> > +      offsetof(struct shadow_vmcs, guest_rsp);
> > +   vmcs_field_to_offset_table[GUEST_RIP] =
> > +      offsetof(struct shadow_vmcs, guest_rip);
> > +   vmcs_field_to_offset_table[GUEST_RFLAGS] =
> > +      offsetof(struct shadow_vmcs, guest_rflags);
> > +   vmcs_field_to_offset_table[GUEST_PENDING_DBG_EXCEPTIONS] =
> > +      offsetof(struct shadow_vmcs,
> > +            guest_pending_dbg_exceptions);
> > +   vmcs_field_to_offset_table[GUEST_SYSENTER_ESP] =
> > +      offsetof(struct shadow_vmcs, guest_sysenter_esp);
> > +   vmcs_field_to_offset_table[GUEST_SYSENTER_EIP] =
> > +      offsetof(struct shadow_vmcs, guest_sysenter_eip);
> > +   vmcs_field_to_offset_table[HOST_CR0] =
> > +      offsetof(struct shadow_vmcs, host_cr0);
> > +   vmcs_field_to_offset_table[HOST_CR3] =
> > +      offsetof(struct shadow_vmcs, host_cr3);
> > +   vmcs_field_to_offset_table[HOST_CR4] =
> > +      offsetof(struct shadow_vmcs, host_cr4);
> > +   vmcs_field_to_offset_table[HOST_FS_BASE] =
> > +      offsetof(struct shadow_vmcs, host_fs_base);
> > +   vmcs_field_to_offset_table[HOST_GS_BASE] =
> > +      offsetof(struct shadow_vmcs, host_gs_base);
> > +   vmcs_field_to_offset_table[HOST_TR_BASE] =
> > +      offsetof(struct shadow_vmcs, host_tr_base);
> > +   vmcs_field_to_offset_table[HOST_GDTR_BASE] =
> > +      offsetof(struct shadow_vmcs, host_gdtr_base);
> > +   vmcs_field_to_offset_table[HOST_IDTR_BASE] =
> > +      offsetof(struct shadow_vmcs, host_idtr_base);
> > +   vmcs_field_to_offset_table[HOST_IA32_SYSENTER_ESP] =
> > +      offsetof(struct shadow_vmcs, host_ia32_sysenter_esp);
> > +   vmcs_field_to_offset_table[HOST_IA32_SYSENTER_EIP] =
> > +      offsetof(struct shadow_vmcs, host_ia32_sysenter_eip);
> > +   vmcs_field_to_offset_table[HOST_RSP] =
> > +      offsetof(struct shadow_vmcs, host_rsp);
> > +   vmcs_field_to_offset_table[HOST_RIP] =
> > +      offsetof(struct shadow_vmcs, host_rip);
> > +}
> > +
> >
>
> Best done with a static initializer.  Use a macro to avoid repeating the
> offsetof(struct shadow_vmcs).
I will fix it
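Presumably something along these lines (a sketch of the static-initializer
form; the FIELD_OFFSET macro name is made up):

#define FIELD_OFFSET(number, name) \
	[number] = offsetof(struct shadow_vmcs, name)

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD_OFFSET(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD_OFFSET(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD_OFFSET(GUEST_CS_SELECTOR, guest_cs_selector),
	/* ... remaining fields ... */
	FIELD_OFFSET(HOST_RIP, host_rip),
};
/* note: unset entries are now 0 instead of 0xff, so a 0 offset would have
 * to be treated as "no such field" by the lookup code */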
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-09-02 21:38           ` Avi Kivity
@ 2009-09-03 14:53             ` Orit Wasserman
  2009-09-06  9:29               ` Avi Kivity
  0 siblings, 1 reply; 31+ messages in thread
From: Orit Wasserman @ 2009-09-03 14:53 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 03/09/2009 00:38:16:

> On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> > -struct nested_vmx {
> > -   /* Has the level1 guest done vmon? */
> > +struct nested_vmx {   /* Has the level1 guest done vmon? */
> >
>
> A \n died here.
I will fix it.
>
> >      bool vmon;
> >      /* Has the level1 guest done vmclear? */
> >      bool vmclear;
> > +
> > +   /* Are we running nested guest */
> > +   bool nested_mode;
> > +
> > +   /* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> > +   bool nested_run_pending;
> > +
> > +   /* flag indicating if there was a valid IDT after exiting from l2 */
> > +   bool nested_pending_valid_idt;
> >
>
> What does this mean?  pending event?
I will rename it.
> >
> > +
> > +static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
> > +{
> > +   return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> > +      cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW;
> > +}
> >
>
> Don't we need to check if the host supports it too?
We check it separately, but I can add it here.
>
> > +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
> > +                        *vcpu)
> > +{
> > +   struct shadow_vmcs *shadow = to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> > +
> > +   return (shadow->secondary_vm_exec_control &
> > +      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
> > +      to_vmx(vcpu)->nested.l2_state->shadow_vmcs->apic_access_addr != 0;
> > +}
> >
>
> Why check apic_access_addr?
I will remove it.
>
> > +
> > +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> > +{
> > +   return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> > +      secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
> > +}
> >
>
> Need to check if secondary controls enabled?
If the secondary controls are not enabled, this field is zero.
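For completeness, a sketch of what checking the primary control could look
like (assumed helper, not in the patch):

/* Sketch: only trust secondary_vm_exec_control if L1 enabled secondary
 * controls in the primary execution controls. */
static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
		cpu_based_vm_exec_control &
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}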
>
> > +static void vmx_set_irq(struct kvm_vcpu *vcpu)
> > +{
> > +   if (to_vmx(vcpu)->nested.nested_mode)
> > +      return;
> >
>
> Why?
>
> Note if the guest didn't enable external interrupt exiting, we need to
> inject as usual.
I will look into it.
>
> >
> > +static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
> > +{
> >
>
> Again the name is confusing.  pending_event_injection?
I will rename it.
>
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int irq;
> > +   int type;
> > +   int errCodeValid;
> > +   u32 idt_vectoring_info;
> > +   u32 guest_intr;
> > +   bool nmi_window_open;
> > +   bool interrupt_window_open;
> > +
> > +   if (vmx->nested.nested_mode && vmx->nested.nested_pending_valid_idt) {
> > +      idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +      irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
> > +      type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
> > +      errCodeValid = idt_vectoring_info &
> > +         VECTORING_INFO_DELIVER_CODE_MASK;
> > +
> > +      guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +      nmi_window_open =
> > +         !(guest_intr & (GUEST_INTR_STATE_STI |
> > +               GUEST_INTR_STATE_MOV_SS |
> > +               GUEST_INTR_STATE_NMI));
> > +
> > +      interrupt_window_open =
> > +         ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
> > +          !(guest_intr & (GUEST_INTR_STATE_STI |
> > +                GUEST_INTR_STATE_MOV_SS)));
> > +
> > +      if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
> > +         printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
> > +         return 0;
> > +      }
> >
>
> How can this happen?  Unless it's on nested entry, in which case we need
> to abort the entry.
OK, I will fix it. To tell the truth, I never saw it happen.
>
> > +
> >   #ifdef CONFIG_X86_64
> >   #define R "r"
> >   #define Q "q"
> > @@ -4646,6 +4842,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >   {
> >      struct vcpu_vmx *vmx = to_vmx(vcpu);
> >
> > +   nested_handle_pending_idt(vcpu);
> >
>
> You're not checking the return code (need to do that on entry).
I will fix it.
>
> > +
> > +   if (vmx->nested.nested_mode) {
> > +      vmcs_writel(GUEST_CR0, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
> >
>
> Might not be legal.  We may also want to force-enable caching.  Lastly,
> don't we need to handle cr0.ts and cr0.mp specially to manage the fpu
> state?
We are working on implementing this correctly. kvm seems to handle it fine,
but vmware doesn't like it.
>
> >
> > +   if (vmx->nested.nested_mode)
> > +      vmx->nested.vmclear = 0;
> > +
> >
>
> Why?
I will check it.
>
> >   free_vmcs:
> > @@ -5122,6 +5339,228 @@ static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
> >      return 0;
> >   }
> >
> > +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> > +{
> > +   struct shadow_vmcs *l2_shadow_vmcs =
> > +      to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> > +   struct shadow_vmcs *l1_shadow_vmcs =
> > +      to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
> > +
> > +   l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> > +   l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +
> > +   l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> > +   l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> > +   l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> > +   l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> > +   l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> > +   l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> > +   l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> > +
> > +   l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   l2_shadow_vmcs->guest_physical_address =
> > +      vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   l2_shadow_vmcs->vm_entry_intr_info_field =
> > +      vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> > +   l2_shadow_vmcs->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_entry_instruction_len =
> > +      vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vm_instruction_error =
> > +      vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   l2_shadow_vmcs->vm_exit_intr_error_code =
> > +      vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> > +   l2_shadow_vmcs->idt_vectoring_info_field =
> > +      vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +   l2_shadow_vmcs->idt_vectoring_error_code =
> > +      vmcs_read32(IDT_VECTORING_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_exit_instruction_len =
> > +      vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vmx_instruction_info =
> > +      vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   l2_shadow_vmcs->guest_activity_state =
> > +      vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +
> > +   l1_shadow_vmcs->host_ia32_sysenter_cs =
> > +      vmcs_read32(HOST_IA32_SYSENTER_CS);
> > +
> > +   l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > +   l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> > +   l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +
> > +   l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> > +   l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   l2_shadow_vmcs->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> > +   l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> > +
> > +   l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
> > +   l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
> > +   l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
> > +   l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
> > +   l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
> > +   l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
> > +   l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> > +   l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> > +   l1_shadow_vmcs->host_ia32_sysenter_esp =
> > +      vmcs_readl(HOST_IA32_SYSENTER_ESP);
> > +   l1_shadow_vmcs->host_ia32_sysenter_eip =
> > +      vmcs_readl(HOST_IA32_SYSENTER_EIP);
> > +   l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
> > +   l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
> > +}
> >
>
> Can't we do it lazily?  Only read these on demand?
We can optimize and read some fields only when they have changed (after we
switch to qemu, for example); we do that in our performance version. Also,
there are some fields that kvm writes only once, so we can read them once.
This can be dangerous for other hypervisors that may change them more
frequently.

>
> > +
> > +int load_vmcs_common(struct shadow_vmcs *src)
> > +{
> > +   vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> > +   vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> > +   vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> > +   vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> > +   vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> > +   vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> > +   vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> > +   vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> > +
> > +   vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> > +   vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> > +
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
> > +
> > +   vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
> > +   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
> > +   vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> > +           src->vm_entry_exception_error_code);
> > +   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
> > +
> > +   vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
> > +   vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
> > +   vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
> > +   vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
> > +   vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
> > +   vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
> > +   vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
> > +   vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
> > +   vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
> > +   vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
> > +   vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
> > +   vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
> > +   vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
> > +   vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
> > +   vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
> > +   vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
> > +   vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
> > +   vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
> > +   vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> > +           src->guest_interruptibility_info);
> > +   vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
> > +   vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
> > +
> > +   vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
> > +   vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
> > +   vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
> > +   vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
> > +   vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
> > +   vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
> > +   vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
> > +   vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
> > +   vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
> > +   vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
> > +   vmcs_writel(GUEST_DR7, src->guest_dr7);
> > +   vmcs_writel(GUEST_RSP, src->guest_rsp);
> > +   vmcs_writel(GUEST_RIP, src->guest_rip);
> > +   vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
> > +   vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> > +          src->guest_pending_dbg_exceptions);
> > +   vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
> > +   vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
> > +
> > +   return 0;
> > +}
> >
>
> If we do it lazily, we'll only need to reload bits that have changed.
True, we can add a bitmap and update only the fields that were written to.
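As a rough illustration of that idea (all names here are assumptions):

/* Sketch: mark fields dirty from the vmwrite handler and only reload the
 * dirty ones into VMCS02 on the next nested entry. */
#define NESTED_VMCS_NR_FIELDS	256	/* assumed bound on tracked fields */

struct nested_vmcs_dirty {
	DECLARE_BITMAP(fields, NESTED_VMCS_NR_FIELDS);
};

static void nested_mark_field_dirty(struct nested_vmcs_dirty *d, int idx)
{
	__set_bit(idx, d->fields);
}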
>
> >   struct level_state *create_state(void)
> >   {
> >      struct level_state *state = NULL;
> > @@ -5176,6 +5615,685 @@ int create_l2_state(struct kvm_vcpu *vcpu)
> >
> >      return 0;
> >   }
> > +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct shadow_vmcs *src = vmx->nested.l2_state->shadow_vmcs;
> > +   u32 exec_control;
> > +
> > +   if (!src) {
> > +      printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   load_vmcs_common(src);
> > +
> > +   if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
> > +      vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
> > +
> > +   if (vmx->nested.l2_state->io_bitmap_a)
> > +      vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
> > +
> > +   if (vmx->nested.l2_state->io_bitmap_b)
> > +      vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
> > +
> > +   if (vmx->nested.l2_state->msr_bitmap)
> > +      vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
> >
>
> Don't we need to combine the host and guest msr bitmaps and I/O
> bitmaps?  If the host doesn't allow an msr or I/O access to the guest,
> it shouldn't allow it to nested guests.
Yes, we haven't implemented that yet.
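Not implemented in these patches, but the merge itself would be little more
than an OR of the two intercept bitmaps, roughly:

/* Sketch: an I/O port (or MSR) traps for L2 if either L0 or L1 wants to
 * intercept it, so the effective bitmap is the OR of both bitmaps. */
static void nested_merge_bitmaps(unsigned long *merged,
				 const unsigned long *l0_bitmap,
				 const unsigned long *l1_bitmap,
				 unsigned int bytes)
{
	bitmap_or(merged, l0_bitmap, l1_bitmap, bytes * BITS_PER_BYTE);
}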
>
> > +
> > +   if (src->vm_entry_msr_load_count > 0) {
> > +      struct page *page;
> > +
> > +      page = nested_get_page(vcpu,
> > +                   src->vm_entry_msr_load_addr);
> > +      if (!page)
> > +         return 1;
> > +
> > +      vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
> >
>
> Luckily we don't use the msr autoload stuff.  If we did we'd have to
> merge it too.  But we have to emulate those loads (via vmx_set_msr); the
> guest can easily load bad msrs which would kill the host.
Ok.
>
> > +   if (src->virtual_apic_page_addr != 0) {
> > +      struct page *page;
> > +
> > +      page = nested_get_page(vcpu,
> > +                   src->virtual_apic_page_addr);
> > +      if (!page)
> > +         return 1;
> > +
> > +      vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
> > +
> > +      kvm_release_page_clean(page);
> > +   }  else {
> > +      vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
> > +              src->virtual_apic_page_addr);
> > +   }
> >
>
> Don't understand the special zero value.
I will look into it.
>
> > +
> > +   vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> > +           (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
> > +            src->pin_based_vm_exec_control));
> > +
> > +   exec_control =
> > +      vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
> > +
> > +   exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> > +
> > +   exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> > +
> > +   exec_control &= ~CPU_BASED_TPR_SHADOW;
> >
>
> Why?
We always use the values from VMCS12 for those controls.
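For what it's worth, once both sides are honoured the merge might read
roughly like this (a sketch, not the posted code; it reuses the vmcs_config
L0 already keeps):

	/* Sketch: start from L1's requested controls, force in the bits L0
	 * itself relies on, strip the ones L0 emulates on L1's behalf. */
	exec_control = src->cpu_based_vm_exec_control;
	exec_control |= vmcs_config.cpu_based_exec_ctrl;
	exec_control &= ~(CPU_BASED_VIRTUAL_INTR_PENDING |
			  CPU_BASED_VIRTUAL_NMI_PENDING);
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);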
>
> > +   if (enable_vpid) {
> > +      if (vmx->nested.l2_state->vpid == 0) {
> > +         allocate_vpid(vmx);
> > +         vmx->nested.l2_state->vpid = vmx->vpid;
> >
>
> What if the guest has a nonzero vpid?
>
> > +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> > +              bool is_interrupt)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int initial_pfu_active = vcpu->fpu_active;
> > +
> > +   if (!vmx->nested.nested_mode) {
> > +      printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> > +             __func__);
> > +      return 0;
> > +   }
> > +
> > +   save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> > +
> > +   sync_cached_regs_to_vmcs(vcpu);
> > +
> > +   prepare_vmcs_12(vcpu);
> > +   if (is_interrupt)
> > +      vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
> > +         EXIT_REASON_EXTERNAL_INTERRUPT;
> >
>
> Need to auto-ack the interrupt if requested by the guest.
is_interrupt means L1 has a pending interrupt; the regular kvm code will
handle it.
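The auto-ack Avi mentions would look something like the following sketch
(assuming the shadow_vmcs fields from the patch; this logic is not in the
posted code):

	/* Sketch: if L1 asked for "acknowledge interrupt on exit", pull the
	 * vector from the APIC/PIC and report it in the exit info field. */
	if (is_interrupt &&
	    (vmx->nested.l2_state->shadow_vmcs->vm_exit_controls &
	     VM_EXIT_ACK_INTR_ON_EXIT)) {
		int vector = kvm_cpu_get_interrupt(vcpu);

		vmx->nested.l2_state->shadow_vmcs->vm_exit_intr_info =
			vector | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK;
	}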
>
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff
  2009-09-03 13:39       ` Avi Kivity
@ 2009-09-03 14:54         ` Orit Wasserman
  0 siblings, 0 replies; 31+ messages in thread
From: Orit Wasserman @ 2009-09-03 14:54 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 03/09/2009 16:39:09:

> On 09/03/2009 03:34 PM, Orit Wasserman wrote:
> >
> >> I think we need to mask it with the capabilities we support.  Otherwise
> >> the guest can try to use some new feature which we don't support yet,
> >> and crash.
> >>
> > I agree, but I went over the Intel spec and didn't find any problematic
> > feature.
> > We may need to consider it in the future.
> >
>
> We need to do it, since we don't know anything about future processors.
OK
>
> --
> error compiling committee.c: too many arguments to function
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-03 14:25         ` Orit Wasserman
@ 2009-09-06  9:25           ` Avi Kivity
  2009-09-06 13:36             ` Orit Wasserman
  0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-06  9:25 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda

On 09/03/2009 05:25 PM, Orit Wasserman wrote:
>>
>>> +   /*
>>> +    * Level 2 state : includes vmcs,registers and
>>> +    * a copy of vmcs12 for vmread/vmwrite
>>> +    */
>>> +   struct level_state *l2_state;
>>> +
>>> +   /* Level 1 state for switching to level 2 and back */
>>> +   struct level_state *l1_state;
>>>
>>>        
>> Can you explain why we need two of them?  in the guest vmcs we have host
>> and guest values, and in l1_state and l2_state we have more copies, and
>> in struct vcpu we have yet another set of copies.  We also have a couple
>> of copies in the host vmcs.  I'm getting dizzy...
>>      
> L2_state stores all the L2 guest state:
>        vmcs - A pointer to VMCS02, the VMCS used to run it by L0.
>        shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
> creates to run L2).
>    

When we support multiple nested guests, we'll run into a problem of 
where to store shadow_vmcs.  I see these options:

- maintain a cache of limited size of shadow_vmcs; when evicting, copy 
the shadow_vmcs into the guest's vmptr
- always put shadow_vmcs in the guest's vmptr, and write protect it so 
the guest can't play with it
- always put shadow_vmcs in the guest's vmptr, and verify everything you 
read (that's what nsvm does)

>        cpu - the cpu id
>    

Why is it needed?

>        launched- launched flag
>    

Can be part of shadow_vmcs

>        vpid - the vpid allocated by L0 for L2 (we need to store it somewhere)
>    

Note the guest can DoS the host by allocating a lot of vpids.  So we need to
allocate host vpids on demand and be able to flush them out.

>        msr_bitmap - At the moment we use the L0 msr_bitmap (as we are running kvm
> on kvm); in the future we will use a merge of both bitmaps.
>    

Note kvm uses two bitmaps (for long mode and legacy mode).

> L1 state stores the L1 state -
>        vmcs - pointer to VMCS01
>    

So it's the same as vmx->vmcs in normal operation?

>        shadow vmcs - a structure storing the values of VMCS01. We use it
> when updating VMCS02 in order to avoid the need to switch between VMCS02
> and VMCS01.
>    

Sorry, don't understand.

>        cpu - the cpu id
>        launched- launched flag
>    

This is a copy of vmx->launched?

>>> +
>>> +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
>>> +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
>>> +      if (vmcs_page == NULL)
>>> +         return 1;
>>> +
>>> +      /* load nested vmcs to processor */
>>> +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
>>>
>>>        
>> So, you're loading a guest page as the vmcs.  This is dangerous as the
>> guest can play with it.  Much better to use inaccessible memory (and you
>> do alloc_vmcs() earlier?)
>>      
> We can copy the vmcs and then vmptrld it. As for the allocated vmcs, this is
> a memory leak and I will fix it (it should be allocated only once).
>    

But why do it?  Your approach is to store the guest vmcs in the same 
format as the processor (which we don't really know), so you have to use 
vmread/vmwrite to maintain it.  Instead, you can choose that the guest 
vmcs is a shadow_vmcs structure and then you can access it using normal 
memory operations.

>> I see.  You're using the processor's format when reading the guest
>> vmcs.  But we don't have to do that, we can use the shadow_vmcs
>> structure (and a memcpy).
>>      
> I'm sorry, I don't understand your comment; can you elaborate?
>    
>

See previous comment.  Basically you can do

   struct shadow_vmcs *svmcs = kmap_atomic(gpa_to_page(vmx->vmptr));
   printk("guest_cs = %x\n", svmcs->guest_cs_selector);

instead of

   vmptrld(gpa_to_hpa(vmx->vmptr))
   printk("guest_cs = %x\n", vmcs_read16(GUEST_CS_SELECTOR));


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-09-03 14:53             ` Orit Wasserman
@ 2009-09-06  9:29               ` Avi Kivity
  2009-09-06 13:38                 ` Orit Wasserman
  0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-06  9:29 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mmday, Muli Ben-Yehuda

On 09/03/2009 05:53 PM, Orit Wasserman wrote:
>>
>> Need to auto-ack the interrupt if requested by the guest.
>>      
> The is_interrupt flag means L1 has a pending interrupt; the regular kvm code will handle it.
>    

If the VM-Exit Controls bit 15 (Acknowledge interrupts on exit) is set, 
when the nested guest exits you need to run kvm_cpu_get_interrupt() and 
put the vector number in the VM-Exit interruption-information field.  
kvm doesn't set this bit but I think Xen does.
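
Roughly (untested; the vmcs12 field names follow the shadow_vmcs structure of
the current patches loosely and are partly guessed):

	if (vmcs12->vm_exit_controls & VM_EXIT_ACK_INTR_ON_EXIT) {
		int vector = kvm_cpu_get_interrupt(vcpu);	/* acks it */

		if (vector != -1)
			vmcs12->vm_exit_intr_info = vector
				| INTR_TYPE_EXT_INTR
				| INTR_INFO_VALID_MASK;
	}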

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-06  9:25           ` Avi Kivity
@ 2009-09-06 13:36             ` Orit Wasserman
  2009-09-06 13:52               ` Avi Kivity
  0 siblings, 1 reply; 31+ messages in thread
From: Orit Wasserman @ 2009-09-06 13:36 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 06/09/2009 12:25:17:

> On 09/03/2009 05:25 PM, Orit Wasserman wrote:
> >>
> >>> +   /*
> >>> +    * Level 2 state : includes vmcs,registers and
> >>> +    * a copy of vmcs12 for vmread/vmwrite
> >>> +    */
> >>> +   struct level_state *l2_state;
> >>> +
> >>> +   /* Level 1 state for switching to level 2 and back */
> >>> +   struct level_state *l1_state;
> >>>
> >>>
> >> Can you explain why we need two of them?  in the guest vmcs we have host
> >> and guest values, and in l1_state and l2_state we have more copies, and
> >> in struct vcpu we have yet another set of copies.  We also have a couple
> >> of copies in the host vmcs.  I'm getting dizzy...
> >>
> > L2_state stores all the L2 guest state:
> >        vmcs - A pointer to VMCS02, the VMCS used to run it by L0.
> >        shadow vmcs - a structure storing the values of VMCS12 (the vmcs L1
> > creates to run L2).
> >
>
> When we support multiple nested guests, we'll run into a problem of
> where to store shadow_vmcs.  I see these options:
>
> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
> the shadow_vmcs into the guest's vmptr
> - always put shadow_vmcs in the guest's vmptr, and write protect it so
> the guest can't play with it
> - always put shadow_vmcs in the guest's vmptr, and verify everything you
> read (that's what nsvm does)
>
The second option looks a bit complicated; I prefer one of the other two.
> >        cpu - the cpu id
> >
>
> Why is it needed?
This is a copy of the cpu id from the vcpu to store the last cpu id the L2
guest ran on.
>
> >        launched- launched flag
> >
>
> Can be part of shadow_vmcs
I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
fields.
>
> >        vpid - the vpid allocated by L0 for L2 (we need to store it somewhere)
> >
>
> Note the guest can DoS the host by allocating a lot of vpids.  So we need to
> allocate host vpids on demand and be able to flush them out.
The guest does not allocate the vpids; the host (L0) does, using
allocate_vpid.
I agree that with nesting the danger of running out of them is bigger.
>
> >        msr_bitmap - At the moment we use the L0 msr_bitmap (as we are running kvm
> > on kvm); in the future we will use a merge of both bitmaps.
> >
>
> Note kvm uses two bitmaps (for long mode and legacy mode).
OK.
>
> > L1 state stores the L1 state -
> >        vmcs - pointer to VMCS01
> >
>
> So it's the same as vmx->vmcs in normal operation?
Yes, but with nesting vmx->vmcs is changed when running an L2 (nested)
guest.
>
> >        shadow vmcs - a structure storing the values of VMCS01. We use it
> > when updating VMCS02 in order to avoid the need to switch between VMCS02
> > and VMCS01.
> >
>
> Sorry, don't understand.
VMCS02 is the VMCS L0 uses to run L2.
When we create/update VMCS02 we need to read fields from VMCS01 (the host state
is taken in full, plus the control fields).
For L1 the shadow_vmcs is a copy of VMCS01 in a structure format; we used
the same structure.
>
> >        cpu - the cpu id
> >        launched- launched flag
> >
>
> This is a copy of vmx->launched?
Exactly. The vmx->launched flag is updated when switching between L1 and L2, so
we need to store it here.
>
> >>> +
> >>> +   if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> >>> +      vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> >>> +      if (vmcs_page == NULL)
> >>> +         return 1;
> >>> +
> >>> +      /* load nested vmcs to processor */
> >>> +      if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
> >>>
> >>>
> >> So, you're loading a guest page as the vmcs.  This is dangerous as the
> >> guest can play with it.  Much better to use inaccessible memory (and you
> >> do alloc_vmcs() earlier?)
> >>
> > We can copy the vmcs and then vmptrld it. As for the allocated vmcs, this is
> > a memory leak and I will fix it (it should be allocated only once).
> >
>
> But why do it?  Your approach is to store the guest vmcs in the same
> format as the processor (which we don't really know), so you have to use
> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
> vmcs is a shadow_vmcs structure and then you can access it using normal
> memory operations.
I got it now.
We will need a way to distinguish between a processor-format VMCS and a
structure-based VMCS;
we can use the revision id field (create a unique revision id for nested,
like 0xffff or 0x0).
>
> >> I see.  You're using the processor's format when reading the guest
> >> vmcs.  But we don't have to do that, we can use the shadow_vmcs
> >> structure (and a memcpy).
> >>
> > I'm sorry I don't understand your comment can u elaborate ?
> >
> >
>
> See previous comment.  Basically you can do
>
>    struct shadow_vmcs *svmcs = kmap_atomic(gpa_to_page(vmx->vmptr));
>    printk("guest_cs = %x\n", svmcs->guest_cs_selector);
See above.
>
> instead of
>
>    vmptrld(gpa_to_hpa(vmx->vmptr))
>    printk("guest_cs = %x\n", vmcs_read16(GUEST_CS_SELECTOR));
>
>
> --
> error compiling committee.c: too many arguments to function
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
  2009-09-06  9:29               ` Avi Kivity
@ 2009-09-06 13:38                 ` Orit Wasserman
  0 siblings, 0 replies; 31+ messages in thread
From: Orit Wasserman @ 2009-09-06 13:38 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 06/09/2009 12:29:58:

> On 09/03/2009 05:53 PM, Orit Wasserman wrote:
> >>
> >> Need to auto-ack the interrupt if requested by the guest.
> >>
> > The is_interrupt flag means L1 has a pending interrupt; the regular kvm
> > code will handle it.
> >
>
> If the VM-Exit Controls bit 15 (Acknowledge interrupts on exit) is set,
> when the nested guest exits you need to run kvm_cpu_get_interrupt() and
> put the vector number in the VM-Exit interruption-information field.
> kvm doesn't set this bit but I think Xen does.
VMware doesn't set it either.
We have to run L2 with the bit off even if the L1 hypervisor sets it, and
emulate the L2 exit to the L1 hypervisor correctly.
I will look into it.
>
> --
> error compiling committee.c: too many arguments to function
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-06 13:36             ` Orit Wasserman
@ 2009-09-06 13:52               ` Avi Kivity
  2009-09-06 16:55                 ` Orit Wasserman
  0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-09-06 13:52 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mday, Muli Ben-Yehuda

On 09/06/2009 04:36 PM, Orit Wasserman wrote:
>
>> When we support multiple nested guests, we'll run into a problem of
>> where to store shadow_vmcs.  I see these options:
>>
>> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
>> the shadow_vmcs into the guest's vmptr]
>> - always put shadow_vmcs in the guest's vmptr, and write protect it so
>> the guest can't play with it
>> - always put shadow_vmcs in the guest's vmptr, and verify everything you
>> read (that's what nsvm does)
>>
>>      
> The second option looks a bit complicated; I prefer one of the other two.
>    

I agree, the third option looks easiest, but I'm not sure how much
verification is needed.

Note other things like the msr bitmaps may need write protection, 
otherwise you have to re-merge the bitmap on every guest entry, which 
can be very slow.  So we may be forced to add write protection anyway.

>>>         launched- launched flag
>>>
>>>        
>> Can be part of shadow_vmcs
>>      
> I prefer to keep the shadow_vmcs as a separate structure to store only VMCS
> fields.
>    

It is a vmcs field - it is manipulated by vmx instructions which operate 
on the vmcs.  You'll need to store it in guest memory when you support 
multiple nested guests.

You can put the vmcs fields in a sub-structure if you want to separate 
between explicit fields and implicit fields (I can only see one implicit 
field (launched), but maybe there are more).
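
Something like this, say (names made up; launched is the one implicit field
mentioned above):

	struct nested_vmcs {
		struct shadow_vmcs vmcs;	/* explicit, architectural fields */
		bool launched;			/* implicit state */
	};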

>>      
>>>         vpid - the vpid allocated by L0 for L2 (we need to store it somewhere)
>>>
>> Note the guest can DoS the host by allocating a lot of vpids.  So we need to
>> allocate host vpids on demand and be able to flush them out.
>>
> The guest does not allocate the vpids; the host (L0) does, using
> allocate_vpid.
>    

I meant, the guest can force the host to allocate vpids if we don't 
protect against it.

> I agree that with nesting the danger of running out of them is bigger.
>    


>> Sorry, don't understand.
>>      
> VMCS02 - the VMCS L0 uses to run L2.
> When we create/update VMCS02 we need to read fields from VMCS01 (host state
> is taken fully, control fields ).
> For L1 the shadow_vmcs is a copy of VMCS01 in a structure format, we used
> the same structure.
>    

I don't understand why you need it.  Host state shouldn't change.  Only 
the control fields are interesting, and things like exception_bitmap.

>> But why do it?  Your approach is to store the guest vmcs in the same
>> format as the processor (which we don't really know), so you have to use
>> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
>> vmcs is a shadow_vmcs structure and then you can access it using normal
>> memory operations.
>>      
> I got it now.
> We will need a way to distinguish between processor format VMCS and
> structure based VMCS,
> we can use the revision id field (create a unique revision id for nested
> like 0xffff or 0x0).
>    

No, you can always store guest vmcs in software format, since we'll 
never load it with vmptrld.  We'll only load a real vmcs with vmptrld.

Note it also solves live migration, since now all guest vmcss are copied 
as part of normal guest memory (including their launched state).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-06 13:52               ` Avi Kivity
@ 2009-09-06 16:55                 ` Orit Wasserman
  2009-09-06 19:10                   ` Avi Kivity
  0 siblings, 1 reply; 31+ messages in thread
From: Orit Wasserman @ 2009-09-06 16:55 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mday, Muli Ben-Yehuda



Avi Kivity <avi@redhat.com> wrote on 06/09/2009 16:52:56:

> On 09/06/2009 04:36 PM, Orit Wasserman wrote:
> >
> >> When we support multiple nested guests, we'll run into a problem of
> >> where to store shadow_vmcs.  I see these options:
> >>
> >> - maintain a cache of limited size of shadow_vmcs; when evicting, copy
> >> the shadow_vmcs into the guest's vmptr
> >> - always put shadow_vmcs in the guest's vmptr, and write protect it so
> >> the guest can't play with it
> >> - always put shadow_vmcs in the guest's vmptr, and verify everything you
> >> read (that's what nsvm does)
> >>
> >>
> > The second option looks a bit complicated; I prefer one of the other
> > two.
> >
>
> I agree, the third option looks easiest but not sure how much
> verification is needed.
>
> Note other things like the msr bitmaps may need write protection,
> otherwise you have to re-merge the bitmap on every guest entry, which
> can be very slow.  So we may be forced to add write protection anyway.
We will also need to write protect L1's EPT tables, to allow L1 to swap
out its guests.
>
> >>>         launched- launched flag
> >>>
> >>>
> >> Can be part of shadow_vmcs
> >>
> > I prefer to keep the shadow_vmcs as a separate structure to store only
> > VMCS fields.
> >
>
> It is a vmcs field - it is manipulated by vmx instructions which operate
> on the vmcs.  You'll need to store it in guest memory when you support
> multiple nested guests.
>
> You can put the vmcs fields in a sub-structure if you want to separate
> between explicit fields and implicit fields (I can only see one implicit
> field (launched), but maybe there are more).
OK.
>
> >>
> >>>         vpid - the vpid allocated by L0 for L2 (we need to store it somewhere)
> >>>
> >> Note the guest can DoS the host by allocating a lot of vpids.  So we need to
> >> allocate host vpids on demand and be able to flush them out.
> >>
> > The guest does not allocate the vpids; the host (L0) does, using
> > allocate_vpid.
> >
>
> I meant, the guest can force the host to allocate vpids if we don't
> protect against it.
You mean by launching a lot of guests?
We can limit the number of guests as a very quick solution.
More complicated is limiting the number of vpids per L1 hypervisor and
reusing them.
This means we will sometimes need to invalidate the vpid when switching
between L2 guests.
>
> > I agree that with nesting the danger of running out of them is bigger.
> >
>
>
> >> Sorry, don't understand.
> >>
> > VMCS02 is the VMCS L0 uses to run L2.
> > When we create/update VMCS02 we need to read fields from VMCS01 (the host
> > state is taken in full, plus the control fields).
> > For L1 the shadow_vmcs is a copy of VMCS01 in a structure format; we used
> > the same structure.
> >
>
> I don't understand why you need it.  Host state shouldn't change.  Only
> the control fields are interesting, and things like exception_bitmap.
I think that when KVM switches to Qemu the host state can change (the L0 host
state). If this happens between different runs of L2
we will need to update the VMCS02 host state. Of course we can optimize and
update it only then.
>
> >> But why do it?  Your approach is to store the guest vmcs in the same
> >> format as the processor (which we don't really know), so you have to use
> >> vmread/vmwrite to maintain it.  Instead, you can choose that the guest
> >> vmcs is a shadow_vmcs structure and then you can access it using normal
> >> memory operations.
> >>
> > I got it now.
> > We will need a way to distinguish between a processor-format VMCS and a
> > structure-based VMCS;
> > we can use the revision id field (create a unique revision id for nested,
> > like 0xffff or 0x0).
> >
>
> No, you can always store guest vmcs in software format, since we'll
> never load it with vmptrld.  We'll only load a real vmcs with vmptrld.
You are right, a new VMCS will be zeroed.
>
> Note it also solves live migration, since now all guest vmcss are copied
> as part of normal guest memory (including their launched state).
Great.
>
> --
> error compiling committee.c: too many arguments to function
>


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-09-06 16:55                 ` Orit Wasserman
@ 2009-09-06 19:10                   ` Avi Kivity
  0 siblings, 0 replies; 31+ messages in thread
From: Avi Kivity @ 2009-09-06 19:10 UTC (permalink / raw)
  To: Orit Wasserman
  Cc: Abel Gordon, aliguori, Ben-Ami Yassour1, kvm, mday, Muli Ben-Yehuda

On 09/06/2009 07:55 PM, Orit Wasserman wrote:
>> Note other things like the msr bitmaps may need write protection,
>> otherwise you have to re-merge the bitmap on every guest entry, which
>> can be very slow.  So we may be forced to add write protection anyway.
>>      
> We will also need to write protect L1's EPT tables, to allow L1 to swap
> out its guests.
>    

That comes naturally with the shadow mmu.  In the same way normal shadow 
mmu protects guest page tables, nested EPT shadow should protect the 
guest's EPT pages.

(unfortunately there is no INVEPT instruction that accepts a gpa 
operand; this would make write protection unnecessary).

>> I meant, the guest can force the host to allocate vpids if we don't
>> protect against it.
>>      
> You mean by launching a lot of guests?
>    

Yes.

> We can limit the number of guests as a very quick solution.
>    

How?  There is no way to tell the guest not to launch more guests.

> More complicated is limiting the number of vpids per L1 hypervisor and
> reusing them.
>    

When the bitmap is full, clear it.  Use a generation count to tell vcpus 
to reload.  svm does that (svm only has 63 asids).
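
A loose sketch of that scheme, modeled on svm.c's ASID handling (all names and
the generation field are made up, locking is omitted):

static DECLARE_BITMAP(nested_vpid_bitmap, VMX_NR_VPIDS);
static u64 nested_vpid_generation = 1;

static void nested_new_vpid(struct vcpu_vmx *vmx)
{
	int vpid = find_first_zero_bit(nested_vpid_bitmap, VMX_NR_VPIDS);

	if (vpid >= VMX_NR_VPIDS) {
		/* out of vpids: recycle them all and bump the generation */
		bitmap_zero(nested_vpid_bitmap, VMX_NR_VPIDS);
		__set_bit(0, nested_vpid_bitmap);	/* vpid 0 belongs to L0 */
		nested_vpid_generation++;
		vpid = find_first_zero_bit(nested_vpid_bitmap, VMX_NR_VPIDS);
	}
	__set_bit(vpid, nested_vpid_bitmap);
	vmx->vpid = vpid;
	vmx->nested.vpid_generation = nested_vpid_generation;
	/* a recycled vpid may still have stale translations cached */
	vpid_sync_vcpu_all(vmx);
}

/* On the entry path: a stale generation means the vpid must be re-acquired. */
static void nested_check_vpid(struct vcpu_vmx *vmx)
{
	if (vmx->nested.vpid_generation != nested_vpid_generation)
		nested_new_vpid(vmx);
}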

> This means we will sometimes need to invalidate the vpid when switching
> between L2 guests.
>    

Yes.

>> I don't understand why you need it.  Host state shouldn't change.  Only
>> the control fields are interesting, and things like exception_bitmap.
>>      
> I think that when KVM switches to Qemu the host state can change (the L0 host
> state). If this happens between different runs of L2
> we will need to update the VMCS02 host state. Of course we can optimize and
> update it only then.
>    

No, I don't think any host state changes, except for cr0.ts.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: Nested VMX support - kernel v1
  2009-09-03  9:53       ` Muli Ben-Yehuda
@ 2009-09-06 19:28         ` Anthony Liguori
  0 siblings, 0 replies; 31+ messages in thread
From: Anthony Liguori @ 2009-09-06 19:28 UTC (permalink / raw)
  To: Muli Ben-Yehuda
  Cc: Alexander Graf, Orit Wasserman, kvm, Ben-Ami Yassour1,
	Abel Gordon, aliguori, Mike D. Day

Muli Ben-Yehuda wrote:
> Unfortunately I don't think anyone from Haifa will be there. Perhaps
> Anthony or Mike (CC'd) will be there?
>   

Yup, I'll be there.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2009-09-06 19:28 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-02 15:38 Nested VMX support - kernel v1 oritw
2009-09-02 15:38 ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff oritw
2009-09-02 15:38   ` [PATCH 2/6] Nested VMX patch 2 implements vmclear oritw
2009-09-02 15:38     ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst oritw
2009-09-02 15:38       ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite oritw
2009-09-02 15:38         ` [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume oritw
2009-09-02 21:38           ` Avi Kivity
2009-09-03 14:53             ` Orit Wasserman
2009-09-06  9:29               ` Avi Kivity
2009-09-06 13:38                 ` Orit Wasserman
2009-09-02 20:15         ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
2009-09-03 14:26           ` Orit Wasserman
2009-09-02 20:05       ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
2009-09-03 14:25         ` Orit Wasserman
2009-09-06  9:25           ` Avi Kivity
2009-09-06 13:36             ` Orit Wasserman
2009-09-06 13:52               ` Avi Kivity
2009-09-06 16:55                 ` Orit Wasserman
2009-09-06 19:10                   ` Avi Kivity
2009-09-02 19:38     ` [PATCH 2/6] Nested VMX patch 2 implements vmclear Avi Kivity
2009-09-03 13:54       ` Orit Wasserman
2009-09-02 19:34   ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
2009-09-03 12:34     ` Orit Wasserman
2009-09-03 13:39       ` Avi Kivity
2009-09-03 14:54         ` Orit Wasserman
2009-09-02 15:57 ` Nested VMX support - kernel v1 Alexander Graf
2009-09-03  6:01   ` Muli Ben-Yehuda
2009-09-03  7:29     ` Alexander Graf
2009-09-03  9:53       ` Muli Ben-Yehuda
2009-09-06 19:28         ` Anthony Liguori
2009-09-02 21:39 ` Avi Kivity

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.