Re: [PATCH] Activate Virtualization On Demand

From: Alexander Graf <agraf@suse.de>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: kvm@vger.kernel.org
Subject: Re: [PATCH] Activate Virtualization On Demand
Date: Mon, 14 Sep 2009 18:25:20 +0200	[thread overview]
Message-ID: <78577088-4F27-48EB-81E2-27F34F3776F3@suse.de> (raw)
In-Reply-To: <20090914161453.GA9816@amt.cnet>

On 14.09.2009, at 18:14, Marcelo Tosatti wrote:

> On Mon, Sep 14, 2009 at 05:52:48PM +0200, Alexander Graf wrote:
>>
>> On 14.09.2009, at 15:23, Marcelo Tosatti wrote:
>>
>>> On Wed, Sep 09, 2009 at 04:18:58PM +0200, Alexander Graf wrote:
>>>> X86 CPUs need to have some magic happening to enable the
>>>> virtualization
>>>> extensions on them. This magic can result in unpleasant results for
>>>> users, like blocking other VMMs from working (vmx) or using invalid
>>>> TLB
>>>> entries (svm).
>>>>
>>>> Currently KVM activates virtualization when the respective kernel
>>>> module
>>>> is loaded. This blocks us from autoloading KVM modules without
>>>> breaking
>>>> other VMMs.
>>>>
>>>> To circumvent this problem at least a bit, this patch introduces on
>>>> demand activation of virtualization. This means, that instead
>>>> virtualization is enabled on creation of the first virtual machine
>>>> and disabled on destruction of the last one.
>>>>
>>>> So using this, KVM can be easily autoloaded, while keeping other
>>>> hypervisors usable.
>>>>
>>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>>>
>>>> --
>>>>
>>>> I've tested the following:
>>>>
>>>> - shutdown
>>>> - suspend / resume to RAM
>>>> - running VirtualBox while kvm module is loaded
>>>> ---
>>>> arch/ia64/kvm/kvm-ia64.c        |    8 ++-
>>>> arch/powerpc/kvm/powerpc.c      |    3 +-
>>>> arch/s390/kvm/kvm-s390.c        |    3 +-
>>>> arch/x86/include/asm/kvm_host.h |    2 +-
>>>> arch/x86/kvm/svm.c              |   13 ++++--
>>>> arch/x86/kvm/vmx.c              |    7 +++-
>>>> arch/x86/kvm/x86.c              |    4 +-
>>>> include/linux/kvm_host.h        |    2 +-
>>>> virt/kvm/kvm_main.c             |   82 +++++++++++++++++++++++++++ 
>>>> ++
>>>> ++++------
>>>> 9 files changed, 98 insertions(+), 26 deletions(-)
>>>>
>>>> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
>>>> index f6471c8..5fdeec5 100644
>>>> --- a/arch/ia64/kvm/kvm-ia64.c
>>>> +++ b/arch/ia64/kvm/kvm-ia64.c
>>>> @@ -124,7 +124,7 @@ long ia64_pal_vp_create(u64 *vpd, u64  
>>>> *host_iva,
>>>> u64 *opt_handler)
>>>>
>>>> static  DEFINE_SPINLOCK(vp_lock);
>>>>
>>>> -void kvm_arch_hardware_enable(void *garbage)
>>>> +int kvm_arch_hardware_enable(void *garbage)
>>>> {
>>>> 	long  status;
>>>> 	long  tmp_base;
>>>> @@ -137,7 +137,7 @@ void kvm_arch_hardware_enable(void *garbage)
>>>> 	slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
>>>> 	local_irq_restore(saved_psr);
>>>> 	if (slot < 0)
>>>> -		return;
>>>> +		return -EINVAL;
>>>>
>>>> 	spin_lock(&vp_lock);
>>>> 	status = ia64_pal_vp_init_env(kvm_vsa_base ?
>>>> @@ -145,7 +145,7 @@ void kvm_arch_hardware_enable(void *garbage)
>>>> 			__pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base);
>>>> 	if (status != 0) {
>>>> 		printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n");
>>>> -		return ;
>>>> +		return -EINVAL;
>>>> 	}
>>>>
>>>> 	if (!kvm_vsa_base) {
>>>> @@ -154,6 +154,8 @@ void kvm_arch_hardware_enable(void *garbage)
>>>> 	}
>>>> 	spin_unlock(&vp_lock);
>>>> 	ia64_ptr_entry(0x3, slot);
>>>> +
>>>> +	return 0;
>>>> }
>>>>
>>>> void kvm_arch_hardware_disable(void *garbage)
>>>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/ 
>>>> powerpc.c
>>>> index 95af622..5902bbc 100644
>>>> --- a/arch/powerpc/kvm/powerpc.c
>>>> +++ b/arch/powerpc/kvm/powerpc.c
>>>> @@ -78,8 +78,9 @@ int kvmppc_emulate_mmio(struct kvm_run *run,
>>>> struct kvm_vcpu *vcpu)
>>>> 	return r;
>>>> }
>>>>
>>>> -void kvm_arch_hardware_enable(void *garbage)
>>>> +int kvm_arch_hardware_enable(void *garbage)
>>>> {
>>>> +	return 0;
>>>> }
>>>>
>>>> void kvm_arch_hardware_disable(void *garbage)
>>>> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
>>>> index 00e2ce8..5445058 100644
>>>> --- a/arch/s390/kvm/kvm-s390.c
>>>> +++ b/arch/s390/kvm/kvm-s390.c
>>>> @@ -74,9 +74,10 @@ struct kvm_stats_debugfs_item  
>>>> debugfs_entries[] =
>>>> {
>>>> static unsigned long long *facilities;
>>>>
>>>> /* Section: not file related */
>>>> -void kvm_arch_hardware_enable(void *garbage)
>>>> +int kvm_arch_hardware_enable(void *garbage)
>>>> {
>>>> 	/* every s390 is virtualization enabled ;-) */
>>>> +	return 0;
>>>> }
>>>>
>>>> void kvm_arch_hardware_disable(void *garbage)
>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/ 
>>>> asm/
>>>> kvm_host.h
>>>> index 6046e6f..b17886f 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -462,7 +462,7 @@ struct descriptor_table {
>>>> struct kvm_x86_ops {
>>>> 	int (*cpu_has_kvm_support)(void);          /* __init */
>>>> 	int (*disabled_by_bios)(void);             /* __init */
>>>> -	void (*hardware_enable)(void *dummy);      /* __init */
>>>> +	int (*hardware_enable)(void *dummy);
>>>> 	void (*hardware_disable)(void *dummy);
>>>> 	void (*check_processor_compatibility)(void *rtn);
>>>> 	int (*hardware_setup)(void);               /* __init */
>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>> index a5f90c7..2f3a388 100644
>>>> --- a/arch/x86/kvm/svm.c
>>>> +++ b/arch/x86/kvm/svm.c
>>>> @@ -316,7 +316,7 @@ static void svm_hardware_disable(void *garbage)
>>>> 	cpu_svm_disable();
>>>> }
>>>>
>>>> -static void svm_hardware_enable(void *garbage)
>>>> +static int svm_hardware_enable(void *garbage)
>>>> {
>>>>
>>>> 	struct svm_cpu_data *svm_data;
>>>> @@ -325,16 +325,20 @@ static void svm_hardware_enable(void  
>>>> *garbage)
>>>> 	struct desc_struct *gdt;
>>>> 	int me = raw_smp_processor_id();
>>>>
>>>> +	rdmsrl(MSR_EFER, efer);
>>>> +	if (efer & EFER_SVME)
>>>> +		return -EBUSY;
>>>> +
>>>> 	if (!has_svm()) {
>>>> 		printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
>>>> -		return;
>>>> +		return -EINVAL;
>>>> 	}
>>>> 	svm_data = per_cpu(svm_data, me);
>>>>
>>>> 	if (!svm_data) {
>>>> 		printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
>>>> 		       me);
>>>> -		return;
>>>> +		return -EINVAL;
>>>> 	}
>>>>
>>>> 	svm_data->asid_generation = 1;
>>>> @@ -345,11 +349,12 @@ static void svm_hardware_enable(void  
>>>> *garbage)
>>>> 	gdt = (struct desc_struct *)gdt_descr.base;
>>>> 	svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt +
>>>> GDT_ENTRY_TSS);
>>>>
>>>> -	rdmsrl(MSR_EFER, efer);
>>>> 	wrmsrl(MSR_EFER, efer | EFER_SVME);
>>>>
>>>> 	wrmsrl(MSR_VM_HSAVE_PA,
>>>> 	       page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
>>>> +
>>>> +	return 0;
>>>> }
>>>>
>>>> static void svm_cpu_uninit(int cpu)
>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>> index d3213ac..c20a902 100644
>>>> --- a/arch/x86/kvm/vmx.c
>>>> +++ b/arch/x86/kvm/vmx.c
>>>> @@ -1138,12 +1138,15 @@ static __init int  
>>>> vmx_disabled_by_bios(void)
>>>> 	/* locked but not enabled */
>>>> }
>>>>
>>>> -static void hardware_enable(void *garbage)
>>>> +static int hardware_enable(void *garbage)
>>>> {
>>>> 	int cpu = raw_smp_processor_id();
>>>> 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
>>>> 	u64 old;
>>>>
>>>> +	if (read_cr4() & X86_CR4_VMXE)
>>>> +		return -EBUSY;
>>>> +
>>>> 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
>>>> 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
>>>> 	if ((old & (FEATURE_CONTROL_LOCKED |
>>>> @@ -1158,6 +1161,8 @@ static void hardware_enable(void *garbage)
>>>> 	asm volatile (ASM_VMX_VMXON_RAX
>>>> 		      : : "a"(&phys_addr), "m"(phys_addr)
>>>> 		      : "memory", "cc");
>>>> +
>>>> +	return 0;
>>>> }
>>>>
>>>> static void vmclear_local_vcpus(void)
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index 891234b..ec16169 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -4703,9 +4703,9 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu  
>>>> *vcpu)
>>>> 	return kvm_x86_ops->vcpu_reset(vcpu);
>>>> }
>>>>
>>>> -void kvm_arch_hardware_enable(void *garbage)
>>>> +int kvm_arch_hardware_enable(void *garbage)
>>>> {
>>>> -	kvm_x86_ops->hardware_enable(garbage);
>>>> +	return kvm_x86_ops->hardware_enable(garbage);
>>>> }
>>>>
>>>> void kvm_arch_hardware_disable(void *garbage)
>>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>>> index 3e57be4..0bf9ee9 100644
>>>> --- a/include/linux/kvm_host.h
>>>> +++ b/include/linux/kvm_host.h
>>>> @@ -346,7 +346,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
>>>> void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
>>>>
>>>> int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
>>>> -void kvm_arch_hardware_enable(void *garbage);
>>>> +int kvm_arch_hardware_enable(void *garbage);
>>>> void kvm_arch_hardware_disable(void *garbage);
>>>> int kvm_arch_hardware_setup(void);
>>>> void kvm_arch_hardware_unsetup(void);
>>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>>> index 6ce5ef3..39f0f5e 100644
>>>> --- a/virt/kvm/kvm_main.c
>>>> +++ b/virt/kvm/kvm_main.c
>>>> @@ -69,6 +69,8 @@ DEFINE_SPINLOCK(kvm_lock);
>>>> LIST_HEAD(vm_list);
>>>>
>>>> static cpumask_var_t cpus_hardware_enabled;
>>>> +static int kvm_usage_count = 0;
>>>> +static atomic_t hardware_enable_failed;
>>>>
>>>> struct kmem_cache *kvm_vcpu_cache;
>>>> EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
>>>> @@ -79,6 +81,8 @@ struct dentry *kvm_debugfs_dir;
>>>>
>>>> static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
>>>> 			   unsigned long arg);
>>>> +static int hardware_enable_all(void);
>>>> +static void hardware_disable_all(void);
>>>>
>>>> static bool kvm_rebooting;
>>>>
>>>> @@ -326,6 +330,7 @@ static const struct mmu_notifier_ops
>>>> kvm_mmu_notifier_ops = {
>>>>
>>>> static struct kvm *kvm_create_vm(void)
>>>> {
>>>> +	int r = 0;
>>>> 	struct kvm *kvm = kvm_arch_create_vm();
>>>> #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
>>>> 	struct page *page;
>>>> @@ -333,6 +338,11 @@ static struct kvm *kvm_create_vm(void)
>>>>
>>>> 	if (IS_ERR(kvm))
>>>> 		goto out;
>>>> +
>>>> +	r = hardware_enable_all();
>>>> +	if (r)
>>>> +		goto out_err;
>>>> +
>>>> #ifdef CONFIG_HAVE_KVM_IRQCHIP
>>>> 	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
>>>> 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
>>>> @@ -341,8 +351,8 @@ static struct kvm *kvm_create_vm(void)
>>>> #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
>>>> 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>>>> 	if (!page) {
>>>> -		kfree(kvm);
>>>> -		return ERR_PTR(-ENOMEM);
>>>> +		r = -ENOMEM;
>>>> +		goto out_err;
>>>> 	}
>>>> 	kvm->coalesced_mmio_ring =
>>>> 			(struct kvm_coalesced_mmio_ring *)page_address(page);
>>>> @@ -350,15 +360,13 @@ static struct kvm *kvm_create_vm(void)
>>>>
>>>> #if defined(CONFIG_MMU_NOTIFIER) &&
>>>> defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>>>> 	{
>>>> -		int err;
>>>> 		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
>>>> -		err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
>>>> -		if (err) {
>>>> +		r = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
>>>> +		if (r) {
>>>> #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
>>>> 			put_page(page);
>>>> #endif
>>>> -			kfree(kvm);
>>>> -			return ERR_PTR(err);
>>>> +			goto out_err;
>>>> 		}
>>>> 	}
>>>> #endif
>>>> @@ -382,6 +390,11 @@ static struct kvm *kvm_create_vm(void)
>>>> #endif
>>>> out:
>>>> 	return kvm;
>>>> +
>>>> +out_err:
>>>> +	hardware_disable_all();
>>>> +	kfree(kvm);
>>>> +	return ERR_PTR(r);
>>>> }
>>>>
>>>> /*
>>>> @@ -440,6 +453,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>>>> 	kvm_arch_flush_shadow(kvm);
>>>> #endif
>>>> 	kvm_arch_destroy_vm(kvm);
>>>> +	hardware_disable_all();
>>>> 	mmdrop(mm);
>>>> }
>>>>
>>>> @@ -1631,11 +1645,41 @@ static struct miscdevice kvm_dev = {
>>>> static void hardware_enable(void *junk)
>>>> {
>>>> 	int cpu = raw_smp_processor_id();
>>>> +	int r;
>>>>
>>>> 	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
>>>> 		return;
>>>> +
>>>> 	cpumask_set_cpu(cpu, cpus_hardware_enabled);
>>>> -	kvm_arch_hardware_enable(NULL);
>>>> +
>>>> +	r = kvm_arch_hardware_enable(NULL);
>>>> +
>>>> +	if (r) {
>>>> +		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
>>>> +		atomic_inc(&hardware_enable_failed);
>>>> +		printk(KERN_INFO "kvm: enabling virtualization on "
>>>> +				 "CPU%d failed\n", cpu);
>>>> +	}
>>>> +}
>>>> +
>>>> +static int hardware_enable_all(void)
>>>> +{
>>>> +	int r = 0;
>>>> +
>>>> +	spin_lock(&kvm_lock);
>>>> +
>>>> +	kvm_usage_count++;
>>>> +	if (kvm_usage_count == 1) {
>>>> +		atomic_set(&hardware_enable_failed, 0);
>>>> +		on_each_cpu(hardware_enable, NULL, 1);
>>>> +
>>>> +		if (atomic_read(&hardware_enable_failed))
>>>> +			r = -EBUSY;
>>>> +	}
>>>> +
>>>> +	spin_unlock(&kvm_lock);
>>>> +
>>>> +	return r;
>>>> }
>>>
>>> I think the kvm_usage_count > 1 path should also test for
>>> hardware_enable_failed (you assume that if kvm_usage_count > 1
>>> then hardware enablement has succeeded, which is not always true).
>>
>> I don't see how kvm_usage_count could be > 1 without hardware  
>> enablement
>> having succeeded. The hardware_enable_all caller calls
>> hardware_disable_all (kvm_usage_count--) when enabling fails.
>
> But it does not hold any lock in between hardware_enable_all
> and hardware_disable_all.
>
> So its unsafe if another kvm_create_vm call happens in between, while
> kvm_usage_count is 1 ?

So what we really need is a lock, so hardware_enable_all doesn't get  
called twice? Isn't that what the kvm_lock here does?

>>> Also, better move vmx.c's ept_sync_global from vmx_init to
>>> hardware_enable.
>>
>> Why? What does that do?
>
> 25.3.3.4 Guidelines for Use of the INVEPT Instruction
>
> Software can use the INVEPT instruction with the “all-context”
> INVEPT type immediately after execution of the VMXON instruction or
> immediately prior to execution of the VMXOFF instruction. Either
> prevents potentially undesired retention of information cached from  
> EPT
> paging structures between separate uses of VMX operation.

Hmhm. I don't have EPT hardware to test things on, but I can of course  
make a blind move of the call.

Alex