From mboxrd@z Thu Jan  1 00:00:00 1970
From: "Nadav Har'El" <nyh@il.ibm.com>
Subject: [PATCH 18/24] Exiting from L2 to L1
Date: Sun, 13 Jun 2010 15:31:47 +0300
Message-ID: <201006131231.o5DCVlKB013102@rice.haifa.ibm.com>
References: <1276431753-nyh@il.ibm.com>
Cc: kvm@vger.kernel.org
To: avi@redhat.com

This patch implements nested_vmx_vmexit(), called when the nested L2 guest
exits and we want to run its L1 parent and let it handle this exit.

Note that this will not necessarily be called on every L2 exit. L0 may
decide to handle a particular exit on its own, without L1's involvement;
in that case, L0 will handle the exit and resume running L2, without
running L1 and without calling nested_vmx_vmexit(). The logic for deciding
whether to handle a particular exit in L1 or in L0, i.e., whether to call
nested_vmx_vmexit(), will appear in the next patch.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
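A note on the routing described above, as a standalone sketch (plain C,
not kernel code, and not part of this patch; l1_wants_exit() is a made-up
stand-in for the per-exit-reason decision that the next patch adds):

	#include <stdbool.h>
	#include <stdio.h>

	#define EXIT_REASON_HLT 12	/* VMX basic exit reason 12 = HLT */

	/* made-up stand-in for the next patch's decision logic */
	static bool l1_wants_exit(int exit_reason)
	{
		return exit_reason == EXIT_REASON_HLT;
	}

	/* Every L2 exit takes one of two paths: reflected to L1 via
	 * nested_vmx_vmexit(), or handled by L0 itself, which then
	 * resumes L2 without running L1 at all. */
	static const char *route_l2_exit(int exit_reason)
	{
		return l1_wants_exit(exit_reason) ?
			"reflect to L1 (nested_vmx_vmexit)" :
			"handle in L0, resume L2";
	}

	int main(void)
	{
		printf("HLT exit -> %s\n", route_l2_exit(EXIT_REASON_HLT));
		printf("other exit -> %s\n", route_l2_exit(0));
		return 0;
	}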
--- .before/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
@@ -5080,9 +5080,13 @@ static void vmx_complete_interrupts(stru
 	int type;
 	bool idtv_info_valid;
 
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+	if (vmx->nested.nested_mode)
+		return;
+
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -5978,6 +5982,278 @@ static int nested_vmx_run(struct kvm_vcp
 	return 1;
 }
 
+/* prepare_vmcs_12 is called when the nested L2 guest exits and we want to
+ * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this
+ * function updates it to reflect the state of the registers during the exit,
+ * and to reflect some changes that happened while L2 was running (and perhaps
+ * made some exits which were handled directly by L0 without going back to
+ * L1).
+ */
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *vmcs12 = get_shadow_vmcs(vcpu);
+
+	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	vmcs12->tsc_offset = vmcs_read64(TSC_OFFSET);
+	vmcs12->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	vmcs12->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	vmcs12->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	vmcs12->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	vmcs12->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	vmcs12->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	vmcs12->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vmcs12->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs12->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	vmcs12->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+	/* If any of the CR0_GUEST_HOST_MASK bits are off, the L2 guest may
+	 * have changed some cr0 bits without us ever saving them in the
+	 * shadow vmcs. So we need to save these changes now.
+	 * In the current code, the only GHM bit which can be off is TS (it
+	 * will be off when fpu_active and L1 also set it to off).
+	 */
+	vmcs12->guest_cr0 = vmcs_readl(GUEST_CR0);
+
+	/* But this may not be the guest_cr0 that the L1 guest hypervisor
+	 * actually thought it was giving its L2 guest. It is possible that
+	 * L1 wished to allow its guest to set a cr0 bit directly, but we (L0)
+	 * captured this attempt and instead set just the read shadow. If this
+	 * is the case, we need to copy these read-shadow bits back to
+	 * guest_cr0, where L1 believes they already are. Note that we must
+	 * read the actual CR0_READ_SHADOW (which is what L0 may have
+	 * changed), not vmcs12->cr0_read_shadow (which L1 defined, and we
+	 * don't change without being told by L1). Currently, the only bit
+	 * where this can happen is TS.
+	 */
+	if (!(vcpu->arch.cr0_guest_owned_bits & X86_CR0_TS)
+			&& !(vmcs12->cr0_guest_host_mask & X86_CR0_TS))
+		vmcs12->guest_cr0 =
+			(vmcs12->guest_cr0 & ~X86_CR0_TS) |
+			(vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS);
+
+	vmcs12->guest_cr4 = vmcs_readl(GUEST_CR4);
+	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	vmcs12->guest_dr7 = vmcs_readl(GUEST_DR7);
+	vmcs12->guest_rsp = vmcs_readl(GUEST_RSP);
+	vmcs12->guest_rip = vmcs_readl(GUEST_RIP);
+	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	vmcs12->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+}
+
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR, src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_shadow_vmcs);
+
+	return 0;
+}
+
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int efer_offset;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		get_shadow_vmcs(vcpu)->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	vmx->nested.current_l2_page->launched = vmx->launched;
+	vmx->nested.current_l2_page->cpu = vcpu->cpu;
+
+	nested_unmap_current(vcpu);
+
+	vmx->vmcs = vmx->nested.l1_vmcs;
+	vcpu->cpu = vmx->nested.l1_state.cpu;
+	vmx->launched = vmx->nested.l1_state.launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.efer = vmx->nested.l1_state.efer;
+	if ((vcpu->arch.efer & EFER_LMA) &&
+	    !(vcpu->arch.efer & EFER_SCE))
+		vcpu->arch.efer |= EFER_SCE;
+
+	efer_offset = __find_msr_index(vmx, MSR_EFER);
+	if (update_transition_efer(vmx, efer_offset))
+		wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data);
+
+	/* We're running a regular L1 guest again, so we do the regular KVM
+	 * thing: run vmx_set_cr0 with the cr0 bits the guest thinks it has
+	 * (this can be figured out by combining its old guest_cr0 and
+	 * cr0_read_shadow, using the cr0_guest_host_mask). vmx_set_cr0 might
+	 * use slightly different bits on the new guest_cr0 it sets, e.g.,
+	 * add TS when !fpu_active.
+	 */
+	vmx_set_cr0(vcpu,
+		(vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+		 vmx->nested.l1_shadow_vmcs->cr0_read_shadow) |
+		(~vmx->nested.l1_shadow_vmcs->cr0_guest_host_mask &
+		 vmx->nested.l1_shadow_vmcs->guest_cr0));
+
+	vmx_set_cr4(vcpu, vmx->nested.l1_state.cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_shadow_vmcs->guest_cr3;
+		vmcs_writel(GUEST_CR3, vmx->nested.l1_shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state.cr3);
+	}
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	switch_back_vmcs(vcpu);
+
+	nested_unmap_current(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	/* If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+	 * to apply the same changes also when running L1. We don't need to
+	 * change cr0 here - we already did this above - just the
+	 * cr0_guest_host_mask, and the exception bitmap.
+	 */
+	vmcs_write32(EXCEPTION_BITMAP,
+		(vmx->nested.l1_shadow_vmcs->exception_bitmap &
+			~(1u << NM_VECTOR)) |
+		(vcpu->fpu_active ? 0 : (1u << NM_VECTOR)));
+	vcpu->arch.cr0_guest_owned_bits =
+		(vcpu->fpu_active ? X86_CR0_TS : 0);
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
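
A closing note on the cr0 arithmetic used in prepare_vmcs_12() and
nested_vmx_vmexit() above. For bits set in cr0_guest_host_mask the guest
reads the cr0 read shadow, and for clear (guest-owned) bits it reads the
real guest_cr0; the value handed to vmx_set_cr0() when we switch back to
L1 is exactly this combination. A standalone toy (plain C, not kernel
code; the names are illustrative only):

	#include <stdio.h>

	#define X86_CR0_TS (1UL << 3)	/* cr0.TS is bit 3 */

	/* The cr0 value a guest believes it has: host-owned bits (mask
	 * bit set) come from the read shadow, guest-owned bits (mask
	 * bit clear) come from the hardware guest_cr0. */
	static unsigned long visible_cr0(unsigned long guest_cr0,
					 unsigned long read_shadow,
					 unsigned long mask)
	{
		return (mask & read_shadow) | (~mask & guest_cr0);
	}

	int main(void)
	{
		/* TS guest-owned (mask bit clear): the guest sees the TS
		 * value really in guest_cr0, not the stale shadow - the
		 * same reasoning behind the TS copy-back into
		 * vmcs12->guest_cr0 in prepare_vmcs_12(). */
		printf("%#lx\n", visible_cr0(0x80050033UL | X86_CR0_TS,
					     0x80050033UL, ~X86_CR0_TS));
		return 0;
	}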