From: Avi Kivity <avi@redhat.com>
To: "Nadav Har'El" <nyh@il.ibm.com>
Cc: kvm@vger.kernel.org
Subject: Re: [PATCH 16/24] Implement VMLAUNCH and VMRESUME
Date: Mon, 14 Jun 2010 14:41:29 +0300
Message-ID: <4C161569.3000602@redhat.com>
In-Reply-To: <201006131230.o5DCUk2i013070@rice.haifa.ibm.com>

On 06/13/2010 03:30 PM, Nadav Har'El wrote:
> Implement the VMLAUNCH and VMRESUME instructions, allowing a guest
> hypervisor to run its own guests.
>
> Signed-off-by: Nadav Har'El<nyh@il.ibm.com>
> ---
> --- .before/arch/x86/kvm/vmx.c	2010-06-13 15:01:29.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2010-06-13 15:01:29.000000000 +0300
> @@ -272,6 +272,9 @@ struct __attribute__ ((__packed__)) vmcs
>   	struct shadow_vmcs shadow_vmcs;
>
>   	bool launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
> +
> +	int cpu;
>    

Not sure cpu should be here.  It certainly won't survive live 
migration.  Perhaps it belongs in struct vmcs_list (which should be 
renamed, perhaps to struct cached_vmcs).
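
Something like this, perhaps (rough sketch; I'm guessing at the fields 
the per-L2 structure already has):

	struct cached_vmcs {		/* was: struct vmcs_list */
		struct list_head list;
		gpa_t vmcs_addr;	/* guest-physical address of the vmcs12 */
		struct vmcs *l2_vmcs;	/* hardware vmcs used to run this L2 */
		int cpu;		/* cpu this hardware vmcs was last loaded on */
		int launched;
	};

That keeps the cpu/launched tracking next to the hardware vmcs it 
describes, instead of in the guest-visible structure.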

> +	int launched;
>   };
>    

What's the difference between this and launch_state?

>
>   struct vmcs_list {
> @@ -297,6 +300,24 @@ struct nested_vmx {
>   	/* list of real (hardware) VMCS, one for each L2 guest of L1 */
>   	struct list_head l2_vmcs_list; /* a vmcs_list */
>   	int l2_vmcs_num;
> +
> +	/* Are we running a nested guest now */
> +	bool nested_mode;
> +	/* Level 1 state for switching to level 2 and back */
> +	struct  {
> +		u64 efer;
> +		unsigned long cr3;
> +		unsigned long cr4;
> +		u64 io_bitmap_a;
> +		u64 io_bitmap_b;
> +		u64 msr_bitmap;
> +		int cpu;
> +		int launched;
> +	} l1_state;
>    

This state needs save/restore support (as well as the current vmptr and 
vmxon state).

> +	/* Level 1 shadow vmcs for switching to level 2 and back */
> +	struct shadow_vmcs *l1_shadow_vmcs;
>    

Again, not really happy about shadowing the non-nested vmcs.

> +	/* Level 1 vmcs loaded into the processor */
> +	struct vmcs *l1_vmcs;
>   };
>
>   enum vmcs_field_type {
> @@ -1407,6 +1428,19 @@ static void vmx_vcpu_load(struct kvm_vcp
>   			new_offset = vmcs_read64(TSC_OFFSET) + delta;
>   			vmcs_write64(TSC_OFFSET, new_offset);
>   		}
> +
> +		if (vmx->nested.l1_shadow_vmcs != NULL) {
> +			struct shadow_vmcs *l1svmcs =
> +				vmx->nested.l1_shadow_vmcs;
> +			l1svmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +			l1svmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +			l1svmcs->host_ia32_sysenter_esp =
> +				vmcs_readl(HOST_IA32_SYSENTER_ESP);
>    

These are all static (at least on a single cpu).  No need to read them 
from a vmcs.
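
They can be filled in the same way vmx_vcpu_load() computes them for 
the hardware vmcs a few lines up, roughly (sketch only; use whatever 
helper and type names the function already has nearby):

	unsigned long sysenter_esp;
	struct descriptor_table dt;	/* or desc_ptr, whichever vmx.c uses */

	kvm_get_gdt(&dt);
	l1svmcs->host_tr_base = kvm_read_tr_base();
	l1svmcs->host_gdtr_base = dt.base;
	rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
	l1svmcs->host_ia32_sysenter_esp = sysenter_esp;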

> +			if (tsc_this < vcpu->arch.host_tsc)
> +				l1svmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> +			if (vmx->nested.nested_mode)
> +				load_vmcs_host_state(l1svmcs);
> +		}
>   	}
>   }
>
>
> @@ -4348,6 +4392,42 @@ static int handle_vmclear(struct kvm_vcp
>   	return 1;
>   }
>
> +static int nested_vmx_run(struct kvm_vcpu *vcpu);
> +
> +static int handle_launch_or_resume(struct kvm_vcpu *vcpu, bool launch)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (!nested_map_current(vcpu))
> +		return 1;
>    

Better error handling needed, perhaps triple fault.
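
Something along these lines, say (sketch; whether it's 
kvm_make_request() or a direct set_bit() on vcpu->requests depends on 
what the tree has):

	if (!nested_map_current(vcpu)) {
		/* couldn't map the current vmcs12; don't silently resume,
		 * report it to the guest as a triple fault */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return 1;
	}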

> +	if (to_vmx(vcpu)->nested.current_l2_page->launch_state == launch) {
> +		/* Must use VMLAUNCH for the first time, VMRESUME later */
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		nested_unmap_current(vcpu);
>    

skip_emulated_instruction();

> +		return 1;
> +	}
> +	nested_unmap_current(vcpu);
> +
> +	skip_emulated_instruction(vcpu);
> +
> +	nested_vmx_run(vcpu);
> +	return 1;
> +}
>
> @@ -4958,7 +5038,8 @@ static int vmx_handle_exit(struct kvm_vc
>   		       "(0x%x) and exit reason is 0x%x\n",
>   		       __func__, vectoring_info, exit_reason);
>
> -	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> +	if (!vmx->nested.nested_mode &&
> +		unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
>    

Too much indent.  The unlikely() looks like the first statement of the 
block.

I think it isn't enough to check for nested mode.  If the guest hasn't 
enabled virtual NMIs, then the nested guest should behave exactly like 
the guest.
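
So something closer to this (sketch; assumes the vmcs12 is reachable 
here and that the pin-based controls field is called 
pin_based_vm_exec_control):

	/* skip the soft-NMI workaround only if L1 asked for virtual NMIs
	 * for this L2; otherwise L2 gets the same treatment as L1 */
	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
		     !(vmx->nested.nested_mode &&
		       (get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
			PIN_BASED_VIRTUAL_NMIS)))) {
		/* ... existing soft-vnmi handling ... */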

>
> +static int nested_vmx_run(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	vmx->nested.nested_mode = 1;
>    

true

> +	sync_cached_regs_to_vmcs(vcpu);
> +	save_vmcs(vmx->nested.l1_shadow_vmcs);
> +
> +	vmx->nested.l1_state.efer = vcpu->arch.efer;
>    

Not sure why you need to save efer.  Ordinarily, vmx reconstructs it 
from the guest efer and the host address-space size exit control; you 
can do the same.
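
Roughly, on the way back to L1 (sketch; assumes the vmcs12 field is 
called vm_exit_controls, untested):

	/* rebuild L1's long-mode bits from the "host address-space size"
	 * exit control instead of stashing efer in l1_state */
	if (get_shadow_vmcs(vcpu)->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);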

> +	if (!enable_ept)
> +		vmx->nested.l1_state.cr3 = vcpu->arch.cr3;
>    

Ditto, isn't that HOST_CR3?

> +	vmx->nested.l1_state.cr4 = vcpu->arch.cr4;
>    

Ditto.

> +
> +	if (!nested_map_current(vcpu)) {
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		return 1;
> +	}
> +
> +	if (cpu_has_vmx_msr_bitmap())
> +		vmx->nested.l1_state.msr_bitmap = vmcs_read64(MSR_BITMAP);
> +	else
> +		vmx->nested.l1_state.msr_bitmap = 0;
> +
> +	vmx->nested.l1_state.io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	vmx->nested.l1_state.io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	vmx->nested.l1_vmcs = vmx->vmcs;
> +	vmx->nested.l1_state.cpu = vcpu->cpu;
> +	vmx->nested.l1_state.launched = vmx->launched;
> +
> +	vmx->vmcs = nested_get_current_vmcs(vcpu);
> +	if (!vmx->vmcs) {
> +		printk(KERN_ERR "Missing VMCS\n");
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		return 1;
> +	}
> +
> +	vcpu->cpu = vmx->nested.current_l2_page->cpu;
>    

How can this change?  It must remain constant between 
kvm_arch_vcpu_load() and kvm_arch_vcpu_put().

> +	vmx->launched = vmx->nested.current_l2_page->launched;
> +
> +	if (!vmx->nested.current_l2_page->launch_state || !vmx->launched) {
> +		vmcs_clear(vmx->vmcs);
> +		vmx->launched = 0;
> +		vmx->nested.current_l2_page->launch_state = 1;
> +	}
> +
> +	vmx_vcpu_load(vcpu, get_cpu());
> +	put_cpu();
> +
> +	prepare_vmcs_02(vcpu,
> +		get_shadow_vmcs(vcpu), vmx->nested.l1_shadow_vmcs);
> +
> +	if (get_shadow_vmcs(vcpu)->vm_entry_controls &
> +	    VM_ENTRY_IA32E_MODE) {
> +		if (!((vcpu->arch.efer & EFER_LMA) &&
> +		      (vcpu->arch.efer & EFER_LME)))
> +			vcpu->arch.efer |= (EFER_LMA | EFER_LME);
> +	} else {
> +		if ((vcpu->arch.efer & EFER_LMA) ||
> +		    (vcpu->arch.efer & EFER_LME))
> +			vcpu->arch.efer = 0;
> +	}
> +
> +	/* vmx_set_cr0() sets the cr0 that L2 will read, to be the one that L1
> +	 * dictated, and takes appropriate actions for special cr0 bits (like
> +	 * real mode, etc.).
> +	 */
> +	vmx_set_cr0(vcpu,
> +		(get_shadow_vmcs(vcpu)->guest_cr0 &
> +			~get_shadow_vmcs(vcpu)->cr0_guest_host_mask) |
> +		(get_shadow_vmcs(vcpu)->cr0_read_shadow &
> +			get_shadow_vmcs(vcpu)->cr0_guest_host_mask));
> +
> +	/* However, vmx_set_cr0 incorrectly enforces KVM's relationship between
> +	 * GUEST_CR0 and CR0_READ_SHADOW, e.g., that the former is the same as
> +	 * the latter with TS added if !fpu_active. We need to take the
> +	 * actual GUEST_CR0 that L1 wanted, just with added TS if !fpu_active
> +	 * like KVM wants (for the "lazy fpu" feature, to avoid the costly
> +	 * restoration of fpu registers until the FPU is really used).
> +	 */
> +	vmcs_writel(GUEST_CR0, get_shadow_vmcs(vcpu)->guest_cr0 |
> +		(vcpu->fpu_active ? 0 : X86_CR0_TS));
>    

Please update vmx_set_cr0() instead.

> +
> +	vmx_set_cr4(vcpu, get_shadow_vmcs(vcpu)->guest_cr4);
>    

Note: kvm_set_cr4() does some stuff that vmx_set_cr4() doesn't.  Esp. 
the kvm_mmu_reset_context().

> +	vmcs_writel(CR4_READ_SHADOW,
> +		    get_shadow_vmcs(vcpu)->cr4_read_shadow);
> +
> +	/* we have to set the X86_CR0_PG bit of the cached cr0, because
> +	 * kvm_mmu_reset_context enables paging only if X86_CR0_PG is set in
> +	 * CR0 (we need paging so that KVM treats this guest as a paging
> +	 * guest, so we can easily forward page faults to L1).
> +	 */
> +	vcpu->arch.cr0 |= X86_CR0_PG;
>    

Since this version doesn't support unrestricted nested guests, cr0.pg 
will already be set, or we will have failed the vmentry.
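
If you want to make that explicit, a check at entry time along these 
lines would do (sketch; strictly this should be a VM-entry failure 
rather than a fail-valid, the error path here is just a placeholder):

	/* no unrestricted guest support for L2: refuse an L2 whose CR0
	 * has PE or PG clear instead of patching cr0 up here */
	if ((get_shadow_vmcs(vcpu)->guest_cr0 & (X86_CR0_PE | X86_CR0_PG)) !=
	    (X86_CR0_PE | X86_CR0_PG)) {
		set_rflags_to_vmx_fail_valid(vcpu);
		return 1;
	}
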

> +
> +	if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
>    

We don't support nested ept yet, yes?

> +		vmcs_write32(GUEST_CR3, get_shadow_vmcs(vcpu)->guest_cr3);
> +		vmx->vcpu.arch.cr3 = get_shadow_vmcs(vcpu)->guest_cr3;
>    

Should be via kvm_set_cr3().

> +	} else {
> +		int r;
> +		kvm_set_cr3(vcpu, get_shadow_vmcs(vcpu)->guest_cr3);
> +		kvm_mmu_reset_context(vcpu);
> +
> +		nested_unmap_current(vcpu);
> +
> +		r = kvm_mmu_load(vcpu);
>    

Ordinary guest entry will load the mmu.  Failures here can only come 
from memory allocation and should not be visible to the guest anyway 
(we return -ENOMEM to userspace and that's it).
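
vcpu_enter_guest() already does, more or less:

	r = kvm_mmu_reload(vcpu);
	if (unlikely(r))
		goto out;	/* surfaces as an error to userspace, not the guest */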

> +		if (unlikely(r)) {
> +			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
> +			set_rflags_to_vmx_fail_valid(vcpu);
> +			/* switch back to L1 */
> +			vmx->nested.nested_mode = 0;
> +			vmx->vmcs = vmx->nested.l1_vmcs;
> +			vcpu->cpu = vmx->nested.l1_state.cpu;
> +			vmx->launched = vmx->nested.l1_state.launched;
> +
> +			vmx_vcpu_load(vcpu, get_cpu());
> +			put_cpu();
> +
> +			return 1;
> +		}
> +
> +		nested_map_current(vcpu);
> +	}
> +
> +	kvm_register_write(vcpu, VCPU_REGS_RSP,
> +			   get_shadow_vmcs(vcpu)->guest_rsp);
> +	kvm_register_write(vcpu, VCPU_REGS_RIP,
> +			   get_shadow_vmcs(vcpu)->guest_rip);
> +
> +	nested_unmap_current(vcpu);
> +
> +	return 1;
> +}
> +
>    

-- 
error compiling committee.c: too many arguments to function


