All of lore.kernel.org
 help / color / mirror / Atom feed
From: oritw@il.ibm.com
To: kvm@vger.kernel.org
Cc: oritw@il.ibm.com, benami@il.ibm.com, abelg@il.ibm.com,
	muli@il.ibm.com, aliguori@us.ibm.com, -mday@us.ibm.com
Subject: [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume
Date: Wed, 30 Sep 2009 15:32:12 +0200	[thread overview]
Message-ID: <1254317532-26123-6-git-send-email-oritw@il.ibm.com> (raw)
In-Reply-To: <1254317532-26123-5-git-send-email-oritw@il.ibm.com>

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c | 1173 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 1148 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a4c252..e814029 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -209,6 +209,7 @@ struct __attribute__ ((__packed__)) level_state {
 	struct vmcs *vmcs;
 	int cpu;
 	int launched;
+	bool first_launch;
 };
 
 struct nested_vmx {
@@ -216,6 +217,12 @@ struct nested_vmx {
 	bool vmxon;
 	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
 	u64 vmptr;
+	/* Are we currently running the nested (L2) guest? */
+	bool nested_mode;
+	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+	bool nested_run_pending;
+	/* Was the IDT-vectoring info valid on the last exit from L2? */
+	bool nested_valid_idt;
 	/*
 	 * Level 2 state : includes vmcs,registers and
 	 * a copy of vmcs12 for vmread/vmwrite
@@ -240,6 +247,10 @@ static inline int vmcs_field_length(unsigned long field)
 	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
 }
 
+#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
+					VM_EXIT_SAVE_IA32_PAT))
+#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
+					 VM_ENTRY_IA32E_MODE))
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -303,6 +314,12 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static inline struct shadow_vmcs *get_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+	WARN_ON(!to_vmx(vcpu)->nested.l2_state->shadow_vmcs);
+	return to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
+}
+
 #define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
 
 static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
@@ -822,8 +839,16 @@ static struct kvm_vmx_segment_field {
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code);
+static int nested_vmx_intr(struct kvm_vcpu *vcpu);
 static int create_l1_state(struct kvm_vcpu *vcpu);
 static int create_l2_state(struct kvm_vcpu *vcpu);
+static int launch_guest(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -940,6 +965,18 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
 	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
 }
 
+static inline int is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_nmi(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
 static inline int cpu_has_vmx_invept_individual_addr(void)
 {
 	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -990,6 +1027,51 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
+static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
+{
+	return cpu_has_vmx_tpr_shadow() &&
+		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_TPR_SHADOW;
+}
+
+static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
+{
+	return cpu_has_secondary_exec_ctrls() &&
+		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
+							   *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->
+		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+		SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->vm_entry_controls &
+		VM_ENTRY_LOAD_IA32_PAT;
+}
+
+static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_USE_MSR_BITMAPS;
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -1501,6 +1583,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1943,6 +2028,200 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *l2_shadow_vmcs =
+		get_shadow_vmcs(vcpu);
+
+	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
+	l2_shadow_vmcs->guest_physical_address =
+		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	l2_shadow_vmcs->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	l2_shadow_vmcs->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	l2_shadow_vmcs->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vm_instruction_error =
+		vmcs_read32(VM_INSTRUCTION_ERROR);
+	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	l2_shadow_vmcs->vm_exit_intr_error_code =
+		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	l2_shadow_vmcs->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	l2_shadow_vmcs->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	l2_shadow_vmcs->vm_exit_instruction_len =
+		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vmx_instruction_info =
+		vmcs_read32(VMX_INSTRUCTION_INFO);
+	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	l2_shadow_vmcs->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	l2_shadow_vmcs->guest_activity_state =
+		vmcs_read32(GUEST_ACTIVITY_STATE);
+	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
+	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
+	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
+	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
+	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	l2_shadow_vmcs->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+}
+
+int load_vmcs_common(struct shadow_vmcs *src)
+{
+	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
+
+	vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
+	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
+
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		     src->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
+
+	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		     src->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
+
+	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
+	vmcs_writel(GUEST_DR7, src->guest_dr7);
+	vmcs_writel(GUEST_RSP, src->guest_rsp);
+	vmcs_writel(GUEST_RIP, src->guest_rip);
+	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		    src->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
+
+	return 0;
+}
+
+int load_vmcs_host_state(struct shadow_vmcs *src)
+{
+	vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+	vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+	vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+	vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+	vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+	vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+	vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
+
+	vmcs_write64(TSC_OFFSET, src->tsc_offset);
+
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+	vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+	vmcs_writel(HOST_CR0, src->host_cr0);
+	vmcs_writel(HOST_CR3, src->host_cr3);
+	vmcs_writel(HOST_CR4, src->host_cr4);
+	vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+	vmcs_writel(HOST_GS_BASE, src->host_gs_base);
+	vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+	vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
+	vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
+	vmcs_writel(HOST_RSP, src->host_rsp);
+	vmcs_writel(HOST_RIP, src->host_rip);
+	vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
+
+	return 0;
+}
+
 struct level_state *create_state(void)
 {
 	struct level_state *state = NULL;
@@ -2003,6 +2282,8 @@ int create_l2_state(struct kvm_vcpu *vcpu)
 	vmx->nested.l2_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
 	vmx->nested.l2_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
 
+	vmx->nested.l2_state->first_launch = true;
+
 	return 0;
 }
 
@@ -3393,6 +3674,14 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (nested_vmx_intr(vcpu))
+				return;
+		}
+		return;
+	}
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -3448,6 +3737,10 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vmx->nested.nested_mode) {
+		return;
+	}
+
 	if (!cpu_has_virtual_nmis()) {
 		/*
 		 * Tracking the NMI-blocked state in software is built upon
@@ -3489,6 +3782,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (kvm_cpu_has_interrupt(vcpu)) {
+			if (!nested_vmx_intr(vcpu))
+				return 0;
+		}
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3993,12 +4293,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
 static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;
@@ -4040,6 +4334,52 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Emulate VMLAUNCH executed by L1.  The instruction is only accepted while
+ * the VMCS is still marked clear; otherwise it fails with VMfailValid and
+ * L1 continues at the next instruction.
+ *
+ * Returns 1 on the failure paths, else whatever launch_guest() returns.
+ */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!to_vmx(vcpu)->nested.l2_state->vmclear) {
+		/*
+		 * Signal the failure and step past the instruction;
+		 * otherwise RIP stays put and L1 sees no error at all.
+		 */
+		set_rflags_to_vmx_fail_valid(vcpu);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	return launch_guest(vcpu);
+}
+
+/*
+ * Emulate VMRESUME executed by L1: the mirror image of VMLAUNCH, it is
+ * only accepted once the VMCS is no longer marked clear.
+ *
+ * Returns 1 on the failure paths, else whatever launch_guest() returns.
+ */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (to_vmx(vcpu)->nested.l2_state->vmclear) {
+		/* Same reasoning as VMLAUNCH: fail visibly and move on. */
+		set_rflags_to_vmx_fail_valid(vcpu);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	return launch_guest(vcpu);
+}
+
 static int handle_vmread(struct kvm_vcpu *vcpu)
 {
 #ifndef CONFIG_X86_64
@@ -4050,7 +4365,6 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 		return 1;
 
 	if (!nested_map_shadow_vmcs(vcpu)) {
-		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
 		set_rflags_to_vmx_fail_invalid(vcpu);
 		return 1;
 	}
@@ -4107,7 +4421,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		return 1;
 
 	if (!nested_map_shadow_vmcs(vcpu)) {
-		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
 		set_rflags_to_vmx_fail_invalid(vcpu);
 		return 1;
 	}
@@ -4137,16 +4450,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 				   vcpu->arch.regs[VCPU_REGS_RAX]);
 		break;
 	default:
+		nested_unmap_shadow_vmcs(vcpu);
 		printk(KERN_INFO "%s invalid field\n", __func__);
 		set_rflags_to_vmx_fail_valid(vcpu);
 		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
-		nested_unmap_shadow_vmcs(vcpu);
 		return 1;
 	}
 
+	nested_unmap_shadow_vmcs(vcpu);
 	clear_rflags_cf_zf(vcpu);
 	skip_emulated_instruction(vcpu);
-	nested_unmap_shadow_vmcs(vcpu);
 	return 1;
 }
 
@@ -4208,7 +4521,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct page *vmcs_page;
 	u64 guest_vmcs_addr;
 
 	if (!nested_vmx_check_permission(vcpu))
@@ -4228,14 +4540,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	}
 
 	if (vmx->nested.vmptr != guest_vmcs_addr) {
-		/* checking vmptr address */
-		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
-		if (vmcs_page == NULL)
-			return 1;
-
 		vmx->nested.vmptr = guest_vmcs_addr;
-
-		kvm_release_page_clean(vmcs_page);
 	}
 
 	clear_rflags_cf_zf(vcpu);
@@ -4534,11 +4839,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmread,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
@@ -4566,6 +4871,17 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
 	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
 
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
+		nested_vmx_vmexit(vcpu, false);
+		return 1;
+	}
+
 	/* If we need to emulate an MMIO from handle_invalid_guest_state
 	 * we just return 0 */
 	if (vmx->emulation_required && emulate_invalid_guest_state) {
@@ -4585,7 +4901,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 			= vmcs_read32(VM_INSTRUCTION_ERROR);
 		return 0;
 	}
-
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
 			exit_reason != EXIT_REASON_EPT_VIOLATION &&
@@ -4593,8 +4908,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
-
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (!vmx->nested.nested_mode && unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4641,10 +4955,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	int type;
 	bool idtv_info_valid;
 
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
+	if (vmx->nested.nested_mode)
+		return;
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
 	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
@@ -4747,6 +5064,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+/*
+ * If the previous exit from L2 left valid IDT-vectoring info, re-inject that
+ * event on the next entry.  Returns 0 when the event cannot be delivered
+ * because the matching L2 window is closed, 1 in every other case.
+ */
+static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 vect_info, intr_state;
+	int vector, ev_type, has_err_code;
+	bool irq_window, nmi_window;
+
+	/* Nothing to replay unless L2 is active and left vectoring info. */
+	if (!vmx->nested.nested_mode || !vmx->nested.nested_valid_idt)
+		return 1;
+
+	vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vector = vect_info & VECTORING_INFO_VECTOR_MASK;
+	ev_type = vect_info & VECTORING_INFO_TYPE_MASK;
+	has_err_code = vect_info & VECTORING_INFO_DELIVER_CODE_MASK;
+
+	intr_state = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+	/* NMI window: none of the STI, MOV-SS or NMI state bits is set. */
+	nmi_window = !(intr_state & (GUEST_INTR_STATE_STI |
+				     GUEST_INTR_STATE_MOV_SS |
+				     GUEST_INTR_STATE_NMI));
+
+	/* Interrupt window: RFLAGS.IF set and no STI/MOV-SS state bit. */
+	irq_window = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+		!(intr_state & (GUEST_INTR_STATE_STI |
+				GUEST_INTR_STATE_MOV_SS));
+
+	if (ev_type == INTR_TYPE_EXT_INTR && !irq_window) {
+		printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
+		return 0;
+	}
+
+	if (ev_type == INTR_TYPE_NMI_INTR && !nmi_window) {
+		printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
+		return 0;
+	}
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+		     vector | ev_type | has_err_code | INTR_INFO_VALID_MASK);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+		     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+	if (has_err_code)
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+			     vmcs_read32(IDT_VECTORING_ERROR_CODE));
+
+	return 1;
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
@@ -4758,6 +5129,26 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int r;
+
+	if (vmx->nested.nested_mode) {
+		r = nested_handle_valid_idt(vcpu);
+		if (!r) {
+			vmx->fail = 1;
+			return;
+		}
+
+		if (!nested_map_shadow_vmcs(vcpu)) {
+			vmx->fail = 1;
+			return;
+		}
+
+		vmcs_write32(EXCEPTION_BITMAP, get_shadow_vmcs(vcpu)->
+			     exception_bitmap |
+			     vmx->nested.l1_state->shadow_vmcs->exception_bitmap);
+
+		nested_unmap_shadow_vmcs(vcpu);
+	}
 
 	if (enable_ept && is_paging(vcpu)) {
 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -4896,6 +5287,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	get_debugreg(vcpu->arch.dr6, 6);
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+	vmx->nested.nested_valid_idt = vmx->nested.nested_mode &&
+		(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
@@ -4984,6 +5379,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->nested.vmptr = 0;
+
+	vmx->nested.l1_state = NULL;
+	vmx->nested.l2_state = NULL;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -5215,6 +5615,729 @@ void save_vmcs(struct shadow_vmcs *dst)
 	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
 		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
 }
+int prepare_vmcs_02(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct shadow_vmcs *src = get_shadow_vmcs(vcpu);
+	u32 exec_control;
+
+	if (!src) {
+		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
+		return 1;
+	}
+
+	load_vmcs_common(src);
+
+	if (vmx->nested.l2_state->first_launch) {
+		if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
+
+		if (vmx->nested.l2_state->io_bitmap_a)
+			vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
+
+		if (vmx->nested.l2_state->io_bitmap_b)
+			vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
+
+		if (vmx->nested.l2_state->msr_bitmap)
+			vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
+
+		if (src->vm_entry_msr_load_count > 0) {
+			struct page *page;
+
+			page = nested_get_page(vcpu,
+					       src->vm_entry_msr_load_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
+
+			kvm_release_page_clean(page);
+		}
+
+		if (nested_cpu_has_vmx_tpr_shadow(vcpu)) {
+			struct page *page;
+
+			page = nested_get_page(vcpu,
+					       src->virtual_apic_page_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
+
+			kvm_release_page_clean(page);
+		}
+
+		if (nested_vm_need_virtualize_apic_accesses(vcpu)) {
+			struct page *page =
+				nested_get_page(vcpu, src->apic_access_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+			kvm_release_page_clean(page);
+		}
+
+		vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+			     (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
+			      src->pin_based_vm_exec_control));
+
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+			     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_mask &
+			      src->page_fault_error_code_mask));
+
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+			     (vmx->nested.l1_state->shadow_vmcs->page_fault_error_code_match &
+			      src->page_fault_error_code_match));
+
+		if (cpu_has_secondary_exec_ctrls()) {
+
+			exec_control =
+				vmx->nested.l1_state->shadow_vmcs->secondary_vm_exec_control;
+
+			if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
+
+				exec_control |= src->secondary_vm_exec_control;
+
+				if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
+				    !nested_vm_need_virtualize_apic_accesses(vcpu))
+					exec_control &=
+						~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+			}
+
+			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+		}
+
+		load_vmcs_host_state(vmx->nested.l1_state->shadow_vmcs);
+
+		vmx->nested.l2_state->first_launch = false;
+	}
+
+	if (vm_need_tpr_shadow(vcpu->kvm) &&
+	    nested_cpu_has_vmx_tpr_shadow(vcpu))
+		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
+
+	if (enable_ept) {
+		if (!nested_cpu_has_vmx_ept(vcpu)) {
+			vmcs_write64(EPT_POINTER,
+				     vmx->nested.l1_state->shadow_vmcs->ept_pointer);
+			vmcs_write64(GUEST_PDPTR0,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr0);
+			vmcs_write64(GUEST_PDPTR1,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr1);
+			vmcs_write64(GUEST_PDPTR2,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr2);
+			vmcs_write64(GUEST_PDPTR3,
+				     vmx->nested.l1_state->shadow_vmcs->guest_pdptr3);
+		}
+	}
+
+	exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+	exec_control |= src->cpu_based_vm_exec_control;
+
+	if (!vm_need_tpr_shadow(vcpu->kvm) ||
+	    src->virtual_apic_page_addr == 0) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+			CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	} else if (exec_control & CPU_BASED_TPR_SHADOW) {
+
+#ifdef CONFIG_X86_64
+		exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
+		exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      src->exception_bitmap));
+
+	vmcs_write32(VM_EXIT_CONTROLS,
+		     ((vmx->nested.l1_state->shadow_vmcs->vm_exit_controls &
+		       NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
+
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     (vmx->nested.l1_state->shadow_vmcs->vm_entry_controls &
+		      NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	vmcs_writel(CR0_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr0_guest_host_mask  &
+		     src->cr0_guest_host_mask));
+	vmcs_writel(CR4_GUEST_HOST_MASK,
+		    (vmx->nested.l1_state->shadow_vmcs->cr4_guest_host_mask  &
+		     src->cr4_guest_host_mask));
+
+	return 0;
+}
+
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_state->shadow_vmcs);
+
+	return 0;
+}
+
+/* Flush cached RSP/RIP to the VMCS; warn if any other register is dirty. */
+void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
+{
+	unsigned long *dirty = (unsigned long *)&vcpu->arch.regs_dirty;
+	unsigned long mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
+
+	if (test_bit(VCPU_REGS_RSP, dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+	if (vcpu->arch.regs_dirty & mask) {
+		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
+		       vcpu->arch.regs_dirty, mask);
+		WARN_ON(1);
+	}
+
+	vcpu->arch.regs_dirty = 0;
+}
+
+/*
+ * Switch the vcpu from running L1 to running L2.
+ *
+ * The L1 state is stashed in nested.l1_state (save_vmcs() into the L1
+ * shadow vmcs, EFER, CR3 when EPT is off, CR4, MSR/IO bitmaps, vmcs
+ * pointer, cpu and launched flag), the L2 vmcs is made current and is then
+ * populated from the shadow vmcs that L1 prepared.
+ *
+ * Always returns 1.  Errors are reported to the guest through RFLAGS via
+ * set_rflags_to_vmx_fail_valid().
+ *
+ * NOTE(review): the early failure of nested_map_shadow_vmcs() returns with
+ * nested_mode already set and the L2 vmcs already loaded - confirm that
+ * callers can cope with that state.
+ * NOTE(review): when the IA-32e entry control is clear, shadow_efer is
+ * reset to 0 rather than only clearing LMA/LME - confirm this is intended.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int fpu_was_active = vcpu->fpu_active;
+	int r = 0;
+
+	if (vmx->nested.nested_mode) {
+		printk(KERN_INFO "Nested guest already running\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	vmx->nested.nested_mode = 1;
+
+	vcpu->arch.exception.pending = false;
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	/* Save everything needed to resume L1 after the nested exit. */
+	save_vmcs(vmx->nested.l1_state->shadow_vmcs);
+
+	vmx->nested.l1_state->shadow_efer = vcpu->arch.shadow_efer;
+	if (!enable_ept)
+		vmx->nested.l1_state->cr3 = vcpu->arch.cr3;
+	vmx->nested.l1_state->cr4 = vcpu->arch.cr4;
+
+	if (enable_vpid) {
+		if (vmx->nested.l2_state->vpid == 0) {
+			allocate_vpid(vmx);
+			vmx->nested.l2_state->vpid = vmx->vpid;
+		}
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l1_state->msr_bitmap = 0;
+
+	vmx->nested.l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	vmx->nested.l1_state->vmcs = vmx->vmcs;
+	vmx->nested.l1_state->cpu = vcpu->cpu;
+	vmx->nested.l1_state->launched = vmx->launched;
+
+	/* From here on the vcpu operates on the L2 vmcs. */
+	vmx->vmcs = vmx->nested.l2_state->vmcs;
+	vcpu->cpu = vmx->nested.l2_state->cpu;
+	vmx->launched = vmx->nested.l2_state->launched;
+
+	if (vmx->nested.l2_state->vmclear || !vmx->launched) {
+		vmcs_clear(vmx->vmcs);
+		vmx->launched = 0;
+		vmx->nested.l2_state->vmclear = 0;
+	}
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	prepare_vmcs_02(vcpu);
+
+	/* Make EFER.LMA/LME follow the IA-32e mode entry control of L2. */
+	if (get_shadow_vmcs(vcpu)->vm_entry_controls &
+	    VM_ENTRY_IA32E_MODE) {
+		if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
+		      (vcpu->arch.shadow_efer & EFER_LME)))
+			vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
+	} else {
+		if ((vcpu->arch.shadow_efer & EFER_LMA) ||
+		    (vcpu->arch.shadow_efer & EFER_LME))
+			vcpu->arch.shadow_efer = 0;
+	}
+
+	vmx_set_cr0(vcpu, get_shadow_vmcs(vcpu)->guest_cr0);
+	vmcs_writel(CR0_READ_SHADOW,
+		    get_shadow_vmcs(vcpu)->cr0_read_shadow);
+	vmx_set_cr4(vcpu, get_shadow_vmcs(vcpu)->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW,
+		    get_shadow_vmcs(vcpu)->cr4_read_shadow);
+
+	vcpu->arch.cr0 |= X86_CR0_PG;
+
+	if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
+		/*
+		 * GUEST_CR3 is a natural-width field: use vmcs_writel() so
+		 * the upper half of CR3 is not dropped on 64-bit hosts.
+		 */
+		vmcs_writel(GUEST_CR3, get_shadow_vmcs(vcpu)->guest_cr3);
+		vmx->vcpu.arch.cr3 = get_shadow_vmcs(vcpu)->guest_cr3;
+	} else {
+		kvm_set_cr3(vcpu, get_shadow_vmcs(vcpu)->guest_cr3);
+		kvm_mmu_reset_context(vcpu);
+
+		/* The shadow vmcs is not kept mapped across kvm_mmu_load(). */
+		nested_unmap_shadow_vmcs(vcpu);
+
+		r = kvm_mmu_load(vcpu);
+		if (unlikely(r)) {
+			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+			nested_vmx_vmexit(vcpu, false);
+			set_rflags_to_vmx_fail_valid(vcpu);
+			return 1;
+		}
+
+		/*
+		 * The shadow vmcs is dereferenced below, so a failed remap
+		 * must not be ignored.  Handle it like the kvm_mmu_load()
+		 * failure above.
+		 */
+		if (!nested_map_shadow_vmcs(vcpu)) {
+			printk(KERN_ERR "Error mapping shadow vmcs\n");
+			nested_vmx_vmexit(vcpu, false);
+			set_rflags_to_vmx_fail_valid(vcpu);
+			return 1;
+		}
+	}
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   get_shadow_vmcs(vcpu)->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   get_shadow_vmcs(vcpu)->guest_rip);
+
+	/* Intercept the union of what L0 and L1 each want to intercept. */
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_state->shadow_vmcs->exception_bitmap |
+		      get_shadow_vmcs(vcpu)->exception_bitmap));
+
+	nested_unmap_shadow_vmcs(vcpu);
+
+	if (fpu_was_active)
+		vmx_fpu_activate(vcpu);
+
+	return 1;
+}
+
+/*
+ * If nested_vmx_check_permission() accepts the request, skip the emulated
+ * instruction and switch to the nested guest through nested_vmx_run().
+ * Presumably the shared body of the VMLAUNCH/VMRESUME exit handlers.
+ *
+ * Always returns 1.
+ */
+static int launch_guest(struct kvm_vcpu *vcpu)
+{
+	if (nested_vmx_check_permission(vcpu)) {
+		skip_emulated_instruction(vcpu);
+
+		/* The result of nested_vmx_run() is not used here. */
+		nested_vmx_run(vcpu);
+	}
+
+	return 1;
+}
+
+/*
+ * Leave the nested guest (L2) and resume L1.
+ *
+ * The L2 state is written to the shadow vmcs with prepare_vmcs_12(), the
+ * vmcs pointer, cpu and launched flag saved in nested.l1_state become
+ * current again, and the L1 control registers, RSP and RIP are restored.
+ *
+ * @is_interrupt: when true, the exit reason recorded in the shadow vmcs is
+ *                overridden with EXIT_REASON_EXTERNAL_INTERRUPT.
+ *
+ * Returns 0 on success or when not in nested mode, 1 if the shadow vmcs
+ * could not be mapped (RFLAGS is then set to VMfailValid).
+ */
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int fpu_was_active = vcpu->fpu_active;
+	int r;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		get_shadow_vmcs(vcpu)->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	vmx->nested.l2_state->launched = vmx->launched;
+	vmx->nested.l2_state->cpu = vcpu->cpu;
+
+	nested_unmap_shadow_vmcs(vcpu);
+
+	/* Make the L1 vmcs current again. */
+	vmx->vmcs = vmx->nested.l1_state->vmcs;
+	vcpu->cpu = vmx->nested.l1_state->cpu;
+	vmx->launched = vmx->nested.l1_state->launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.exception.pending = false;
+
+	vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
+	vmx_set_cr0(vcpu, vmx->nested.l1_state->shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_state->shadow_vmcs->guest_cr3;
+		/*
+		 * GUEST_CR3 is a natural-width field: use vmcs_writel() so
+		 * the upper half of CR3 is not dropped on 64-bit hosts.
+		 */
+		vmcs_writel(GUEST_CR3,
+			    vmx->nested.l1_state->shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
+	}
+
+	if (!nested_map_shadow_vmcs(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	switch_back_vmcs(vcpu);
+
+	nested_unmap_shadow_vmcs(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_state->shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	kvm_mmu_reset_context(vcpu);
+	/* Do not drop an MMU reload failure silently; at least report it. */
+	r = kvm_mmu_load(vcpu);
+	if (unlikely(r))
+		printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+
+	/* Report the outcome of the nested run to L1 through RFLAGS. */
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	if (fpu_was_active)
+		vmx_fpu_activate(vcpu);
+
+	return 0;
+}
+
+/*
+ * Decide whether an MSR read/write exit taken while running L2 is
+ * intercepted by L1, by consulting the MSR bitmap L1 set up for L2.
+ *
+ * Bitmap layout (byte offsets within the 4K page):
+ *   0x000 read,  MSRs 0x00000000 - 0x00001fff
+ *   0x400 read,  MSRs 0xc0000000 - 0xc0001fff
+ *   0x800 write, MSRs 0x00000000 - 0x00001fff
+ *   0xc00 write, MSRs 0xc0000000 - 0xc0001fff
+ *
+ * Returns 1 if L1 intercepts the access, 0 otherwise (including when not in
+ * nested mode or when the bitmap page cannot be obtained).
+ *
+ * The shadow vmcs is read through get_shadow_vmcs(), so the caller is
+ * expected to have it mapped.
+ */
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *l2svmcs;
+	struct page *msr_page;
+	u32 msr_index, exit_code;
+	unsigned int offset;
+	u8 *bitmap;
+	int handled = 0;
+
+	if (!to_vmx(vcpu)->nested.nested_mode)
+		return 0;
+
+	msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+	exit_code = vmcs_read32(VM_EXIT_REASON);
+	l2svmcs = get_shadow_vmcs(vcpu);
+
+	/* Without MSR bitmaps every MSR access exits to L1. */
+	if (!cpu_has_vmx_msr_bitmap()
+	    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+		return 1;
+
+	switch (exit_code) {
+	case EXIT_REASON_MSR_READ:
+		offset = 0x000;
+		break;
+	case EXIT_REASON_MSR_WRITE:
+		offset = 0x800;
+		break;
+	default:
+		return 0;
+	}
+
+	if (msr_index <= 0x1fff) {
+		/* low range, bitmap starts at 'offset' */
+	} else if ((msr_index >= 0xc0000000) &&
+		   (msr_index <= 0xc0001fff)) {
+		msr_index &= 0x1fff;
+		offset += 0x400;
+	} else {
+		/*
+		 * MSRs outside both ranges are not covered by the bitmap
+		 * and always cause an exit, so L1 must see this one.
+		 */
+		return 1;
+	}
+
+	msr_page = nested_get_page(vcpu, l2svmcs->msr_bitmap);
+	if (!msr_page) {
+		printk(KERN_INFO "%s error in nested_get_page\n",
+		       __func__);
+		return 0;
+	}
+
+	/*
+	 * nested_get_page() hands back a struct page, not an address: the
+	 * page has to be mapped before its contents can be tested.
+	 * NOTE(review): KM_USER1 assumes the shadow vmcs mapping held by the
+	 * caller occupies a different kmap slot - confirm.
+	 */
+	bitmap = kmap_atomic(msr_page, KM_USER1);
+	if (test_bit(msr_index, (unsigned long *)(bitmap + offset)))
+		handled = 1;
+	kunmap_atomic(bitmap, KM_USER1);
+
+	/*
+	 * NOTE(review): assumes nested_get_page() took a page reference that
+	 * the caller has to drop - confirm against its definition.
+	 */
+	kvm_release_page_clean(msr_page);
+
+	return handled;
+}
+
+/*
+ * Decide whether the current exit, taken while running L2, is to be
+ * forwarded to L1 according to the controls L1 programmed into the shadow
+ * vmcs.
+ *
+ * @kvm_override: when true, exits that KVM itself wants are never
+ *                forwarded: external interrupts, EXCEPTION_NMI exits that
+ *                are not exceptions, page faults when EPT is disabled and
+ *                EPT violations when EPT is enabled.
+ *
+ * Returns 1 if L1 handles the exit, 0 otherwise.  Also returns 0 while a
+ * nested run is still pending or when the shadow vmcs cannot be mapped,
+ * and 1 when the VM entry itself failed.
+ */
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct shadow_vmcs *l2svmcs;
+	int r = 0;
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	if (kvm_override) {
+		switch (exit_code) {
+		case EXIT_REASON_EXTERNAL_INTERRUPT:
+			return 0;
+		case EXIT_REASON_EXCEPTION_NMI:
+			if (!is_exception(intr_info))
+				return 0;
+
+			if (is_page_fault(intr_info) && (!enable_ept))
+				return 0;
+
+			break;
+		case EXIT_REASON_EPT_VIOLATION:
+			if (enable_ept)
+				return 0;
+
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!nested_map_shadow_vmcs(vcpu))
+		return 0;
+	l2svmcs = get_shadow_vmcs(vcpu);
+
+	switch (exit_code) {
+	case EXIT_REASON_INVLPG:
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_INVLPG_EXITING)
+			r = 1;
+		break;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		r = nested_vmx_exit_handled_msr(vcpu);
+		break;
+	case EXIT_REASON_CR_ACCESS: {
+		unsigned long exit_qualification =
+			vmcs_readl(EXIT_QUALIFICATION);
+		int cr = exit_qualification & 15;
+		int reg = (exit_qualification >> 8) & 15;
+		unsigned long val = kvm_register_read(vcpu, reg);
+
+		/* Bits 5:4 of the exit qualification give the access type. */
+		switch ((exit_qualification >> 4) & 3) {
+		case 0: /* mov to cr */
+			switch (cr) {
+			case 0:
+				if (l2svmcs->cr0_guest_host_mask &
+				    (val ^ l2svmcs->cr0_read_shadow))
+					r = 1;
+				break;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_LOAD_EXITING)
+					r = 1;
+				break;
+			case 4:
+				if (l2svmcs->cr4_guest_host_mask &
+				    (l2svmcs->cr4_read_shadow ^ val))
+					r = 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_LOAD_EXITING)
+					r = 1;
+				break;
+			default:
+				break;
+			}
+			break;
+		case 2: /* clts */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				r = 1;
+			break;
+		case 1: /* mov from cr */
+			switch (cr) {
+			case 0:
+				r = 1;
+				/* was an unmarked fallthrough into case 3 */
+				break;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_STORE_EXITING)
+					r = 1;
+				break;
+			case 4:
+				r = 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_STORE_EXITING)
+					r = 1;
+				break;
+			default:
+				break;
+			}
+			break;
+		case 3: /* lmsw */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				r = 1;
+			break;
+		}
+		break;
+	}
+	case EXIT_REASON_DR_ACCESS: {
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_MOV_DR_EXITING)
+			r = 1;
+		break;
+	}
+
+	case EXIT_REASON_EXCEPTION_NMI: {
+
+		if (is_external_interrupt(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_EXT_INTR_MASK))
+			r = 1;
+		else if (is_nmi(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_NMI_EXITING))
+			r = 1;
+		else if (is_exception(intr_info) &&
+		    (l2svmcs->exception_bitmap &
+		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+			r = 1;
+		else if (is_page_fault(intr_info))
+			r = 1;
+		break;
+	}
+
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK)
+			r = 1;
+		break;
+	default:
+		/* Every exit reason not listed above goes to L1. */
+		r = 1;
+	}
+	nested_unmap_shadow_vmcs(vcpu);
+
+	return r;
+}
+
+/*
+ * While in nested mode, check whether L1 handles the current exit and, if
+ * so, exit to L1 and record exception @nr in the shadow vmcs as an
+ * EXIT_REASON_EXCEPTION_NMI exit.
+ *
+ * @has_error_code/@error_code: when set, the error code is recorded too.
+ *
+ * Returns 1 if the nested exit was performed (even if the shadow vmcs
+ * could not be mapped afterwards to record the exception), 0 otherwise.
+ */
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code)
+{
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+	struct shadow_vmcs *svmcs;
+
+	if (!vmx->nested.nested_mode)
+		return 0;
+
+	if (!nested_vmx_exit_handled(vcpu, false))
+		return 0;
+
+	nested_vmx_vmexit(vcpu, false);
+
+	if (!nested_map_shadow_vmcs(vcpu))
+		return 1;
+
+	svmcs = get_shadow_vmcs(vcpu);
+	svmcs->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
+	svmcs->vm_exit_intr_info =
+		(nr | INTR_TYPE_HARD_EXCEPTION
+		 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
+		 | INTR_INFO_VALID_MASK);
+
+	if (has_error_code)
+		svmcs->vm_exit_intr_error_code = error_code;
+
+	nested_unmap_shadow_vmcs(vcpu);
+
+	return 1;
+}
+
+/*
+ * While in nested mode, exit to L1 for an external interrupt if L1 set
+ * PIN_BASED_EXT_INTR_MASK in the shadow vmcs and no nested run is pending.
+ *
+ * Returns 1 if the nested exit was performed, 0 otherwise (not in nested
+ * mode, shadow vmcs not mappable, control bit clear or run pending).
+ */
+static int nested_vmx_intr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	bool exit_to_l1;
+
+	if (!vmx->nested.nested_mode)
+		return 0;
+
+	if (!nested_map_shadow_vmcs(vcpu))
+		return 0;
+
+	exit_to_l1 = (get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
+		      PIN_BASED_EXT_INTR_MASK) &&
+		     !vmx->nested.nested_run_pending;
+
+	/* The mapping is dropped on every path before any nested exit. */
+	nested_unmap_shadow_vmcs(vcpu);
+
+	if (!exit_to_l1)
+		return 0;
+
+	nested_vmx_vmexit(vcpu, true);
+	return 1;
+}
 
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
-- 
1.6.0.4


  reply	other threads:[~2009-09-30 13:32 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-09-30 13:32 Nested VMX support v2 oritw
2009-09-30 13:32 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
2009-09-30 13:32   ` [PATCH 2/5] Nested VMX patch 2 implements vmclear oritw
2009-09-30 13:32     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
2009-09-30 13:32       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
2009-09-30 13:32         ` oritw [this message]
2009-10-15 14:41 Nested VMX support v3 oritw
2009-10-15 14:41 ` [PATCH 1/5] Nested VMX patch 1 implements vmon and vmoff oritw
2009-10-15 14:41   ` [PATCH 2/5] Nested VMX patch 2 implements vmclear oritw
2009-10-15 14:41     ` [PATCH 3/5] Nested VMX patch 3 implements vmptrld and vmptrst oritw
2009-10-15 14:41       ` [PATCH 4/5] Nested VMX patch 4 implements vmread and vmwrite oritw
2009-10-15 14:41         ` [PATCH 5/5] Nested VMX patch 5 implements vmlaunch and vmresume oritw
2009-10-19 17:29           ` Gleb Natapov
2009-10-21 14:43             ` Orit Wasserman
2009-10-22  9:04               ` Gleb Natapov
2009-10-22 15:46                 ` Orit Wasserman
2009-10-25  9:44                   ` Gleb Natapov
2009-10-28 16:23                     ` Orit Wasserman
2009-10-29 17:31                       ` Gleb Natapov
2009-11-09  9:33                         ` Abel Gordon
2009-10-22 10:55               ` Avi Kivity
2009-10-20  4:56           ` Avi Kivity
2009-10-22 12:56             ` Orit Wasserman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1254317532-26123-6-git-send-email-oritw@il.ibm.com \
    --to=oritw@il.ibm.com \
    --cc=-mday@us.ibm.com \
    --cc=abelg@il.ibm.com \
    --cc=aliguori@us.ibm.com \
    --cc=benami@il.ibm.com \
    --cc=kvm@vger.kernel.org \
    --cc=muli@il.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.