From mboxrd@z Thu Jan  1 00:00:00 1970
From: "Tian, Kevin" <kevin.tian@intel.com>
Subject: RE: [PATCH 02/31] nVMX: Implement VMXON and VMXOFF
Date: Fri, 20 May 2011 15:58:46 +0800
Message-ID: <625BA99ED14B2D499DC4E29D8138F1505C9BEEFE20@shsmsx502.ccr.corp.intel.com>
References: <1305575004-nyh@il.ibm.com>
 <201105161944.p4GJiu9K001652@rice.haifa.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 8BIT
Cc: "gleb@redhat.com" <gleb@redhat.com>,
	"avi@redhat.com" <avi@redhat.com>
To: Nadav Har'El <nyh@il.ibm.com>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>
Return-path: <kvm-owner@vger.kernel.org>
Received: from mga11.intel.com ([192.55.52.93]:31326 "EHLO mga11.intel.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S934212Ab1ETIAe convert rfc822-to-8bit (ORCPT
	<rfc822;kvm@vger.kernel.org>); Fri, 20 May 2011 04:00:34 -0400
In-Reply-To: <201105161944.p4GJiu9K001652@rice.haifa.ibm.com>
Content-Language: en-US
Sender: kvm-owner@vger.kernel.org
List-ID: <kvm.vger.kernel.org>

> From: Nadav Har'El
> Sent: Tuesday, May 17, 2011 3:45 AM
> 
> This patch allows a guest to use the VMXON and VMXOFF instructions, and
> emulates them accordingly. Basically this amounts to checking some
> prerequisites, and then remembering whether the guest has enabled or
> disabled VMX operation.
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> ---
>  arch/x86/kvm/vmx.c |  110
> ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 108 insertions(+), 2 deletions(-)
> 
> --- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:46.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:46.000000000 +0300
> @@ -130,6 +130,15 @@ struct shared_msr_entry {
>  	u64 mask;
>  };
> 
> +/*
> + * The nested_vmx structure is part of vcpu_vmx, and holds information
> +we need
> + * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
> + */
> +struct nested_vmx {
> +	/* Has the level1 guest done vmxon? */
> +	bool vmxon;
> +};
> +
>  struct vcpu_vmx {
>  	struct kvm_vcpu       vcpu;
>  	struct list_head      local_vcpus_link;
> @@ -184,6 +193,9 @@ struct vcpu_vmx {
>  	u32 exit_reason;
> 
>  	bool rdtscp_enabled;
> +
> +	/* Support for a guest hypervisor (nested VMX) */
> +	struct nested_vmx nested;
>  };
> 
>  enum segment_cache_field {
> @@ -3890,6 +3902,99 @@ static int handle_invalid_op(struct kvm_  }
> 
>  /*
> + * Emulate the VMXON instruction.
> + * Currently, we just remember that VMX is active, and do not save or
> +even
> + * inspect the argument to VMXON (the so-called "VMXON pointer")
> +because we
> + * do not currently need to store anything in that guest-allocated
> +memory

Though we don't need to store anything, VMXON still needs to check the
revision ID in the VMXON region to make sure it matches what the processor
expects. Consider a user who uses nVMX to practice VMM development and
forgets to fill the revision ID into the region: we should fail the
instruction in the first place.

> + * region. Consequently, VMCLEAR and VMPTRLD also do not verify that
> +the their
> + * argument is different from the VMXON pointer (which the spec says they
> do).
> + */
> +static int handle_vmon(struct kvm_vcpu *vcpu) {
> +	struct kvm_segment cs;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	/* The Intel VMX Instruction Reference lists a bunch of bits that
> +	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
> +	 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
> +	 * Otherwise, we should fail with #UD. We test these now:
> +	 */
> +	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
> +	    !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
> +	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 1;
> +	}
> +
> +	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
> +	if (is_long_mode(vcpu) && !cs.l) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 1;
> +	}
> +
> +	if (vmx_get_cpl(vcpu)) {
> +		kvm_inject_gp(vcpu, 0);
> +		return 1;
> +	}

You also need to check bits 0/1/2 of IA32_FEATURE_CONTROL_MSR, as
described in the SDM.

The 4K alignment and the physical-address width of the VMXON region
also need to be checked.

> +
> +	vmx->nested.vmxon = true;
> +
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +
> +/*
> + * Intel's VMX Instruction Reference specifies a common set of
> +prerequisites
> + * for running VMX instructions (except VMXON, whose prerequisites are
> + * slightly different). It also specifies what exception to inject otherwise.
> + */
> +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) {
> +	struct kvm_segment cs;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.vmxon) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 0;
> +	}
> +
> +	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
> +	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
> +	    (is_long_mode(vcpu) && !cs.l)) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 0;
> +	}
> +
> +	if (vmx_get_cpl(vcpu)) {
> +		kvm_inject_gp(vcpu, 0);
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
> +/*
> + * Free whatever needs to be freed from vmx->nested when L1 goes down,
> +or
> + * just stops using VMX.
> + */
> +static void free_nested(struct vcpu_vmx *vmx) {
> +	if (!vmx->nested.vmxon)
> +		return;
> +	vmx->nested.vmxon = false;
> +}
> +
> +/* Emulate the VMXOFF instruction */
> +static int handle_vmoff(struct kvm_vcpu *vcpu) {
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;

This is missing a check on CR0.PE.

> +	free_nested(to_vmx(vcpu));
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +
> +/*
>   * The exit handlers return 1 if the exit was handled fully and guest execution
>   * may resume.  Otherwise they set the kvm_run parameter to indicate
> what needs
>   * to be done to userspace and return 0.
> @@ -3917,8 +4022,8 @@ static int (*kvm_vmx_exit_handlers[])(st
>  	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
>  	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
>  	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
> -	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
> -	[EXIT_REASON_VMON]                    = handle_vmx_insn,
> +	[EXIT_REASON_VMOFF]                   = handle_vmoff,
> +	[EXIT_REASON_VMON]                    = handle_vmon,
>  	[EXIT_REASON_TPR_BELOW_THRESHOLD]     =
> handle_tpr_below_threshold,
>  	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
>  	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
> @@ -4329,6 +4434,7 @@ static void vmx_free_vcpu(struct kvm_vcp
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> 
>  	free_vpid(vmx);
> +	free_nested(vmx);
>  	vmx_free_vmcs(vcpu);
>  	kfree(vmx->guest_msrs);
>  	kvm_vcpu_uninit(vcpu);
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a
> message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html