From: "Nadav Har'El" <nyh@il.ibm.com>
To: kvm@vger.kernel.org
Cc: gleb@redhat.com, avi@redhat.com
Subject: [PATCH 17/29] nVMX: Implement VMLAUNCH and VMRESUME
Date: Thu, 27 Jan 2011 10:38:33 +0200
Message-ID: <201101270838.p0R8cXFJ002634@rice.haifa.ibm.com>
In-Reply-To: 1296116987-nyh@il.ibm.com

Implement the VMLAUNCH and VMRESUME instructions, allowing a guest
hypervisor to run its own guests.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |  205 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 202 insertions(+), 3 deletions(-)

--- .before/arch/x86/kvm/vmx.c	2011-01-26 18:06:05.000000000 +0200
+++ .after/arch/x86/kvm/vmx.c	2011-01-26 18:06:05.000000000 +0200
@@ -341,6 +341,10 @@ struct nested_vmx {
 	/* list of real (hardware) VMCS, one for each L2 guest of L1 */
 	struct list_head vmcs02_list; /* a vmcs_list */
 	int vmcs02_num;
+
+	/* The VMCS that was used for running L1, saved while we run L2 */
+	struct saved_vmcs saved_vmcs01;
+	struct vmcs_fields *vmcs01_fields;
 };
 
 struct vcpu_vmx {
@@ -4453,6 +4457,10 @@ static int handle_vmon(struct kvm_vcpu *
 	INIT_LIST_HEAD(&(vmx->nested.vmcs02_list));
 	vmx->nested.vmcs02_num = 0;
 
+	vmx->nested.vmcs01_fields = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!vmx->nested.vmcs01_fields)
+		return -ENOMEM;
+
 	vmx->nested.vmxon = true;
 
 	skip_emulated_instruction(vcpu);
@@ -4505,6 +4513,9 @@ static void free_nested(struct vcpu_vmx 
 	}
 
 	nested_free_all_vmcs(vmx);
+
+	kfree(vmx->nested.vmcs01_fields);
+	vmx->nested.vmcs01_fields = NULL;
 }
 
 /* Emulate the VMXOFF instruction */
@@ -4665,6 +4676,60 @@ static int handle_vmclear(struct kvm_vcp
 	return 1;
 }
 
+static int nested_vmx_run(struct kvm_vcpu *vcpu);
+
+static int handle_launch_or_resume(struct kvm_vcpu *vcpu, bool launch)
+{
+	struct vmcs12 *vmcs12;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	skip_emulated_instruction(vcpu);
+
+	vmcs12 = get_vmcs12(vcpu);
+	/* yet another strange prerequisite listed in the VMX spec */
+	if (vmcs12->fields.guest_interruptibility_info &
+			GUEST_INTR_STATE_MOV_SS) {
+		nested_vmx_failValid(vcpu,
+			VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+		return 1;
+	}
+	/*
+	 * Enforce that after VMCLEAR, L1 must use VMLAUNCH, and thereafter
+	 * must use VMRESUME, as the spec requires (even though it would have
+	 * been easier for us to simply allow both to work at any time).
+	 */
+	if (vmcs12->launch_state == launch) {
+		nested_vmx_failValid(vcpu,
+			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS :
+				 VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+		return 1;
+	}
+
+	nested_vmx_run(vcpu);
+
+	/*
+	 * Note that there is no nested_vmx_succeed or nested_vmx_failValid
+	 * here: at this point we are no longer running L1, and as far as L1
+	 * is concerned VMLAUNCH/VMRESUME has not yet returned. It will only
+	 * return (and set the success flag) when L2 exits (see
+	 * nested_vmx_vmexit()).
+	 */
+	return 1;
+}
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+	return handle_launch_or_resume(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+	return handle_launch_or_resume(vcpu, false);
+}
+
 enum vmcs_field_type {
 	VMCS_FIELD_TYPE_U16 = 0,
 	VMCS_FIELD_TYPE_U64 = 1,
@@ -4941,11 +5006,11 @@ static int (*kvm_vmx_exit_handlers[])(st
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmread,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
@@ -5009,7 +5074,8 @@ static int vmx_handle_exit(struct kvm_vc
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
 
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (!is_guest_mode(vcpu) &&
+	    unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -5944,6 +6010,139 @@ int prepare_vmcs02(struct kvm_vcpu *vcpu
 	return 0;
 }
 
+/*
+ * Return the cr0 value that a guest would read. This is a combination of
+ * the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
+ * the hypervisor (cr0_read_shadow).
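+ * Bits set in cr0_guest_host_mask are owned by the host: for those bits
+ * the guest reads the value from cr0_read_shadow, while for all other
+ * bits it reads guest_cr0 directly.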
+ */
+static inline unsigned long guest_readable_cr0(struct vmcs_fields *fields)
+{
+	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long guest_readable_cr4(struct vmcs_fields *fields)
+{
+	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
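+
+/*
+ * Load a new guest cr3, and also write the PAE PDPTEs into the vmcs:
+ * with EPT enabled, on entry the processor takes the guest's PDPTEs
+ * from the vmcs rather than reading them from memory.
+ */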
+static inline void set_cr3_and_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	vcpu->arch.cr3 = cr3;
+	vmcs_writel(GUEST_CR3, cr3);
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+	load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3);
+	vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
+	vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
+	vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
+	vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
+}
+
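+/*
+ * nested_vmx_run() does the bulk of VMLAUNCH/VMRESUME emulation: it
+ * switches the active VMCS from the one used for L1 (vmcs01) to the one
+ * used for this L2 (vmcs02), merges L1's requested state from vmcs12 into
+ * it, and leaves the vcpu ready to enter L2.
+ */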
+static int nested_vmx_run(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+	struct saved_vmcs *saved_vmcs02;
+
+	enter_guest_mode(vcpu);
+	sync_cached_regs_to_vmcs(vcpu);
+	save_vmcs(vmx->nested.vmcs01_fields);
+
+	/*
+	 * Switch from L1's VMCS to L2's VMCS. Remember the L1 VMCS, the CPU
+	 * on which it was last loaded, and whether it was launched (we need
+	 * all of these values the next time we use L1). Then recall the same
+	 * values as saved for L2's VMCS (unless L2 has never been launched).
+	 */
+	vmx->nested.saved_vmcs01.vmcs = vmx->vmcs;
+	vmx->nested.saved_vmcs01.cpu = vcpu->cpu;
+	vmx->nested.saved_vmcs01.launched = vmx->launched;
+
+	saved_vmcs02 = nested_get_current_vmcs(vmx);
+	if (!saved_vmcs02) {
+		/* In the current code, this cannot happen */
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		return 1;
+	}
+	vmx->vmcs = saved_vmcs02->vmcs;
+	vcpu->cpu = saved_vmcs02->cpu;
+	vmx->launched = saved_vmcs02->launched;
+
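+	/*
+	 * Make the new vmcs02 current on this cpu: vmx_vcpu_put() releases
+	 * the old VMCS, and vmx_vcpu_load() loads (VMPTRLD) the new
+	 * vmx->vmcs and refreshes its host-state fields.
+	 */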
+	vmx_vcpu_put(vcpu);
+	cpu = get_cpu();
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+
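+	/*
+	 * Mark vmcs12 as launched: from now on the spec obliges L1 to use
+	 * VMRESUME rather than VMLAUNCH (see handle_launch_or_resume()).
+	 */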
+	vmx->nested.current_vmcs12->launch_state = 1;
+
+	prepare_vmcs02(vcpu,
+		get_vmcs12_fields(vcpu), vmx->nested.vmcs01_fields);
+
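+	/*
+	 * Load EFER.LMA/LME as dictated by the "IA-32e mode guest" VM-entry
+	 * control that L1 requested.
+	 */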
+	if (get_vmcs12_fields(vcpu)->vm_entry_controls &
+	    VM_ENTRY_IA32E_MODE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+
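+	/* If L1 runs L2 without CR0.PE, KVM has to emulate it via vm86 mode */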
+	vmx->rmode.vm86_active =
+		!(get_vmcs12_fields(vcpu)->cr0_read_shadow & X86_CR0_PE);
+
+	/* vmx_set_cr0() sets the cr0 that L2 will read to the one that L1
+	 * dictated, and takes the appropriate actions for special cr0 bits
+	 * (like real mode, etc.).
+	 */
+	vmx_set_cr0(vcpu, guest_readable_cr0(get_vmcs12_fields(vcpu)));
+
+	/* However, vmx_set_cr0 incorrectly enforces KVM's relationship
+	 * between GUEST_CR0 and CR0_READ_SHADOW, e.g., that the former is
+	 * the same as the latter with TS added if !fpu_active. We need to
+	 * use the actual GUEST_CR0 that L1 wanted, just with TS added if
+	 * !fpu_active, as KVM wants (for the "lazy fpu" feature, to avoid
+	 * the costly restoration of fpu registers until the FPU is really
+	 * used).
+	 */
+	vmcs_writel(GUEST_CR0, get_vmcs12_fields(vcpu)->guest_cr0 |
+		(vcpu->fpu_active ? 0 : X86_CR0_TS));
+
+	/* We have to set the X86_CR0_PG bit of the cached cr0, because
+	 * kvm_mmu_reset_context enables paging only if X86_CR0_PG is set in
+	 * CR0. We need paging so that KVM treats this guest as a paging
+	 * guest, which lets us easily forward page faults to L1.
+	 */
+	vcpu->arch.cr0 |= X86_CR0_PG;
+
+	if (enable_ept) {
+		/* shadow page tables on EPT */
+		vcpu->arch.cr4 = guest_readable_cr4(get_vmcs12_fields(vcpu));
+		vmcs_writel(CR4_READ_SHADOW, vcpu->arch.cr4);
+		vmcs_writel(GUEST_CR4, get_vmcs12_fields(vcpu)->guest_cr4);
+		set_cr3_and_pdptrs(vcpu, get_vmcs12_fields(vcpu)->guest_cr3);
+	} else {
+		/* shadow page tables on shadow page tables */
+		vmx_set_cr4(vcpu, get_vmcs12_fields(vcpu)->guest_cr4);
+		vmcs_writel(CR4_READ_SHADOW,
+			    get_vmcs12_fields(vcpu)->cr4_read_shadow);
+		kvm_set_cr3(vcpu, get_vmcs12_fields(vcpu)->guest_cr3);
+		kvm_mmu_reset_context(vcpu);
+
+		if (unlikely(kvm_mmu_load(vcpu))) {
+			/*
+			 * TODO: there is no reasonable error number to use
+			 * here. Perhaps it would be more reasonable to
+			 * emulate a guest shutdown, not a launch error?
+			 */
+			nested_vmx_failValid(vcpu, 87);
+			return 1;
+		}
+	}
+
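+	/* Finally, load L2's rsp and rip from vmcs12, as specified by L1 */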
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   get_vmcs12_fields(vcpu)->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   get_vmcs12_fields(vcpu)->guest_rip);
+
+	return 1;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
