From: Izik Eidus <ieidus@redhat.com>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: kvm@vger.kernel.org, avi@redhat.com
Subject: Re: [PATCH 2/2] kvm: change the dirty page tracking to work with dirty bity
Date: Thu, 11 Jun 2009 02:57:02 +0300	[thread overview]
Message-ID: <4A30484E.1040708@redhat.com> (raw)
In-Reply-To: <20090610204227.GA12682@amt.cnet>

Marcelo Tosatti wrote:
> On Wed, Jun 10, 2009 at 07:23:25PM +0300, Izik Eidus wrote:
>   
>> Change the dirty page tracking to work with the dirty bit instead of page faults.
>> Right now dirty page tracking works with the help of page faults: when we
>> want to track a page for being dirty, we write protect it and mark it dirty
>> when we get a write page fault. This code moves to looking at the dirty bit
>> of the spte.
>>
>> Signed-off-by: Izik Eidus <ieidus@redhat.com>
>> ---
>>  arch/ia64/kvm/kvm-ia64.c        |    4 +++
>>  arch/powerpc/kvm/powerpc.c      |    4 +++
>>  arch/s390/kvm/kvm-s390.c        |    4 +++
>>  arch/x86/include/asm/kvm_host.h |    3 ++
>>  arch/x86/kvm/mmu.c              |   42 ++++++++++++++++++++++++++++++++++++--
>>  arch/x86/kvm/svm.c              |    7 ++++++
>>  arch/x86/kvm/vmx.c              |    7 ++++++
>>  arch/x86/kvm/x86.c              |   26 ++++++++++++++++++++---
>>  include/linux/kvm_host.h        |    1 +
>>  virt/kvm/kvm_main.c             |    6 ++++-
>>  10 files changed, 96 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
>> index 3199221..5914128 100644
>> --- a/arch/ia64/kvm/kvm-ia64.c
>> +++ b/arch/ia64/kvm/kvm-ia64.c
>> @@ -1809,6 +1809,10 @@ void kvm_arch_exit(void)
>>  	kvm_vmm_info = NULL;
>>  }
>>  
>> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
>> +{
>> +}
>> +
>>  static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
>>  		struct kvm_dirty_log *log)
>>  {
>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>> index 2cf915e..6beb368 100644
>> --- a/arch/powerpc/kvm/powerpc.c
>> +++ b/arch/powerpc/kvm/powerpc.c
>> @@ -418,6 +418,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
>>  	return -ENOTSUPP;
>>  }
>>  
>
> #ifndef KVM_ARCH_HAVE_DIRTY_LOG
>> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
>> +{
>> +}
>> +
> #endif
>
> in virt/kvm/main.c
>
>> index c7b0cc2..8a24149 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -527,6 +527,7 @@ struct kvm_x86_ops {
>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>  	int (*get_tdp_level)(void);
>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>> +	int (*dirty_bit_support)(void);
>>  };
>>  
>>  extern struct kvm_x86_ops *kvm_x86_ops;
>> @@ -796,4 +797,6 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
>>  int kvm_age_hva(struct kvm *kvm, unsigned long hva);
>>  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
>>  
>> +int is_dirty_and_clean_rmapp(struct kvm *kvm, unsigned long *rmapp);
>> +
>>  #endif /* _ASM_X86_KVM_HOST_H */
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 809cce0..500e0e2 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -140,6 +140,8 @@ module_param(oos_shadow, bool, 0644);
>>  #define ACC_USER_MASK    PT_USER_MASK
>>  #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
>>  
>> +#define SPTE_DONT_DIRTY (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>> +
>>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>>  
>>  struct kvm_rmap_desc {
>> @@ -629,6 +631,29 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
>>  	return NULL;
>>  }
>>  
>> +int is_dirty_and_clean_rmapp(struct kvm *kvm, unsigned long *rmapp)
>> +{
>> +	u64 *spte;
>> +	int dirty = 0;
>> +
>> +	if (!shadow_dirty_mask)
>> +		return 0;
>> +
>> +	spte = rmap_next(kvm, rmapp, NULL);
>> +	while (spte) {
>> +		if (*spte & PT_DIRTY_MASK) {
>> +			set_shadow_pte(spte, (*spte &= ~PT_DIRTY_MASK) |
>> +				       SPTE_DONT_DIRTY);
>> +			dirty = 1;
>> +			break;
>> +		}
>> +		spte = rmap_next(kvm, rmapp, spte);
>> +	}
>> +
>> +	return dirty;
>> +}
>> +
>> +
>>  static int rmap_write_protect(struct kvm *kvm, u64 gfn)
>>  {
>>  	unsigned long *rmapp;
>> @@ -1381,11 +1406,17 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
>>  static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>>  {
>>  	int ret;
>> +	int i;
>> +
>>  	++kvm->stat.mmu_shadow_zapped;
>>  	ret = mmu_zap_unsync_children(kvm, sp);
>>  	kvm_mmu_page_unlink_children(kvm, sp);
>>  	kvm_mmu_unlink_parents(kvm, sp);
>>  	kvm_flush_remote_tlbs(kvm);
>> +	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>> +		if (sp->spt[i] & PT_DIRTY_MASK)
>> +			mark_page_dirty(kvm, sp->gfns[i]);
>> +	}
>>     
>
> Also need to transfer dirty bit in other places probably.
>   


Yes, I can think of some other cases, but maybe I can avoid them using 
some trick.


>   
>>  	if (!sp->role.invalid && !sp->role.direct)
>>  		unaccount_shadowed(kvm, sp->gfn);
>>  	if (sp->unsync)
>> @@ -1676,7 +1707,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>>  	 * whether the guest actually used the pte (in order to detect
>>  	 * demand paging).
>>  	 */
>> -	spte = shadow_base_present_pte | shadow_dirty_mask;
>> +	spte = shadow_base_present_pte;
>> +	if (!(spte & SPTE_DONT_DIRTY))
>> +		spte |= shadow_dirty_mask;
>> +
>>  	if (!speculative)
>>  		spte |= shadow_accessed_mask;
>>  	if (!dirty)
>> @@ -1725,8 +1759,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>>  		}
>>  	}
>>  
>> -	if (pte_access & ACC_WRITE_MASK)
>> -		mark_page_dirty(vcpu->kvm, gfn);
>> +	if (!shadow_dirty_mask) {
>> +		if (pte_access & ACC_WRITE_MASK)
>> +			mark_page_dirty(vcpu->kvm, gfn);
>> +	}
>>     
>
> Can you avoid the mark_page_dirty if SPTE_DONT_DIRTY is set? (which would be
> a good idea; gfn_to_memslot_unaliased and friends show up high in profiling).
>   

This code shouldn't run on anything but EPT; shadow_dirty_mask should be 
set to zero only on EPT, which doesn't have dirty bit tracking, so we still 
need to mark the page dirty from this context...
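
To make the contrast concrete, here is a minimal userspace sketch (not the
kernel code; the DIRTY_BIT value, the fixed-size spte array and the bitmap
below are made-up stand-ins) of what the dirty-bit path does at
GET_DIRTY_LOG time, as opposed to the page-fault path where
mark_page_dirty() is called from set_spte():

#include <stdint.h>
#include <stdio.h>

#define DIRTY_BIT	(1ULL << 6)	/* stand-in for the hardware dirty bit */
#define NPAGES		8

static uint64_t spte[NPAGES];		/* one fake spte per guest page   */
static unsigned long dirty_bitmap;	/* one bit per page, NPAGES <= 64 */

/* Harvest: if the hardware set the dirty bit, record the page as dirty
 * in the bitmap and clear the bit so the next write is noticed again. */
static void harvest_dirty_bits(void)
{
	int i;

	for (i = 0; i < NPAGES; i++) {
		if (spte[i] & DIRTY_BIT) {
			dirty_bitmap |= 1UL << i;
			spte[i] &= ~DIRTY_BIT;
		}
	}
}

int main(void)
{
	spte[2] |= DIRTY_BIT;	/* pretend the guest wrote to pages 2 and 5 */
	spte[5] |= DIRTY_BIT;

	harvest_dirty_bits();
	printf("dirty bitmap: 0x%lx\n", dirty_bitmap);	/* prints 0x24 */
	return 0;
}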
>   
>>  set_pte:
>>  	set_shadow_pte(shadow_pte, spte);
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index 37397f6..5b6351e 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -2724,6 +2724,11 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
>>  	return 0;
>>  }
>>  
>> +static int svm_dirty_bit_support(void)
>> +{
>> +	return 1;
>> +}
>> +
>>  static struct kvm_x86_ops svm_x86_ops = {
>>  	.cpu_has_kvm_support = has_svm,
>>  	.disabled_by_bios = is_disabled,
>> @@ -2785,6 +2790,8 @@ static struct kvm_x86_ops svm_x86_ops = {
>>  	.set_tss_addr = svm_set_tss_addr,
>>  	.get_tdp_level = get_npt_level,
>>  	.get_mt_mask = svm_get_mt_mask,
>> +
>> +	.dirty_bit_support = svm_dirty_bit_support,
>>  };
>>  
>>  static int __init svm_init(void)
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 673bcb3..8903314 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -3774,6 +3774,11 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
>>  	return ret;
>>  }
>>  
>> +static int vmx_dirty_bit_support(void)
>> +{
>> +	return false;
>> +}
>> +
>>  static struct kvm_x86_ops vmx_x86_ops = {
>>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>>  	.disabled_by_bios = vmx_disabled_by_bios,
>> @@ -3833,6 +3838,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
>>  	.set_tss_addr = vmx_set_tss_addr,
>>  	.get_tdp_level = get_ept_level,
>>  	.get_mt_mask = vmx_get_mt_mask,
>> +
>> +	.dirty_bit_support = vmx_dirty_bit_support,
>>  };
>>  
>>  static int __init vmx_init(void)
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 272e2e8..e06387c 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1963,6 +1963,20 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
>>  	return 0;
>>  }
>>  
>> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
>> +{
>> +	int i;
>> +
>> +	spin_lock(&kvm->mmu_lock);
>> +	for (i = 0; i < memslot->npages; ++i) {
>> +		if (!test_bit(i, memslot->dirty_bitmap)) {
>> +			if (is_dirty_and_clean_rmapp(kvm, &memslot->rmap[i]))
>> +				set_bit(i, memslot->dirty_bitmap);
>> +		}
>> +	}
>> +	spin_unlock(&kvm->mmu_lock);
>> +}
>> +
>>  /*
>>   * Get (and clear) the dirty memory log for a memory slot.
>>   */
>> @@ -1982,10 +1996,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>>  
>>  	/* If nothing is dirty, don't bother messing with page tables. */
>>  	if (is_dirty) {
>> -		spin_lock(&kvm->mmu_lock);
>> -		kvm_mmu_slot_remove_write_access(kvm, log->slot);
>> -		spin_unlock(&kvm->mmu_lock);
>> -		kvm_flush_remote_tlbs(kvm);
>> +		if (!kvm_x86_ops->dirty_bit_support()) {
>> +			spin_lock(&kvm->mmu_lock);
>> +			/*  remove_write_access() flush the tlb */
>> +			kvm_mmu_slot_remove_write_access(kvm, log->slot);
>> +			spin_unlock(&kvm->mmu_lock);
>> +		} else {
>> +			kvm_flush_remote_tlbs(kvm);
>> +		}
>>  		memslot = &kvm->memslots[log->slot];
>>  		n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
>>  		memset(memslot->dirty_bitmap, 0, n);
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index cdf1279..d1657a3 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -250,6 +250,7 @@ int kvm_dev_ioctl_check_extension(long ext);
>>  
>>  int kvm_get_dirty_log(struct kvm *kvm,
>>  			struct kvm_dirty_log *log, int *is_dirty);
>> +void kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
>>  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>>  				struct kvm_dirty_log *log);
>>  
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 3046e9c..a876231 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -1135,8 +1135,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
>>  	}
>>  
>>  	/* Free page dirty bitmap if unneeded */
>> -	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
>> +	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) {
>>  		new.dirty_bitmap = NULL;
>> +		if (old.flags & KVM_MEM_LOG_DIRTY_PAGES)
>> +			kvm_arch_flush_shadow(kvm);
>> +	}
>>     
>
> Whats this for?
>   

We have added all this SPTE_DONT_DIRTY handling...; when we stop dirty bit 
tracking, we want to go back to setting the dirty bit for the spte inside 
set_spte(), so writing to the page will be faster....
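
A hedged sketch of that decision (the flag name matches the real
KVM_MEM_LOG_DIRTY_PAGES; the helper is simplified and only illustrates the
condition the hunk above tests):

#include <stdbool.h>

#define LOG_DIRTY_PAGES 0x1u	/* stands in for KVM_MEM_LOG_DIRTY_PAGES */

/*
 * While logging is on, sptes are installed with the dirty bit withheld so
 * the hardware bit can be harvested later.  Once logging stops, dropping
 * the shadow pages lets set_spte() rebuild them with the dirty bit
 * pre-set, so guest writes are not slowed down anymore.
 */
static bool must_flush_shadow(unsigned int old_flags, unsigned int new_flags)
{
	return (old_flags & LOG_DIRTY_PAGES) && !(new_flags & LOG_DIRTY_PAGES);
}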

> The idea of making the dirty bit accessible (it can also be used to map host
> ptes read-only when the guest fault is read-only, for KSM (*)) is good. But
> it is better to first introduce infrastructure to handle the dirty bit
> (making sure the bit is transferred properly), then have logging make use of it.
>   

???, I don't quite understand; do you mean you want me to keep going in the 
direction of this patch, or do you want some other approach?

> * EPT violations do not transfer fault information to the page fault
> handler, but it's available (there's a vm-exit field).
>   

EPT, in this patch, runs as before (using page faults with 
mark_page_dirty() inside set_spte()).
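
To spell out the write-side split the patch is aiming for, here is a
simplified sketch (install_spte_sketch and its parameters are stand-ins,
not kernel functions; only shadow_dirty_mask and the commented
mark_page_dirty() correspond to names in the patch):

#include <stdint.h>

/* shadow_dirty_mask == 0 models EPT without dirty-bit support. */
static void install_spte_sketch(uint64_t *sptep, uint64_t base,
				int writable, uint64_t shadow_dirty_mask,
				int dont_dirty)
{
	uint64_t spte = base;

	if (shadow_dirty_mask) {
		/* Hardware dirty bit available (shadow paging / NPT):
		 * pre-set it unless dirty logging asked for it to be
		 * withheld, so the CPU records the first write and it
		 * can be harvested later. */
		if (!dont_dirty)
			spte |= shadow_dirty_mask;
	} else if (writable) {
		/* EPT without a dirty bit: keep the old behaviour and
		 * mark the page dirty here, at write-fault time, e.g.
		 * mark_page_dirty(kvm, gfn) in the real code. */
	}

	*sptep = spte;
}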

>   
>>  	r = -ENOMEM;
>>  
>> @@ -1279,6 +1282,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
>>  	if (!memslot->dirty_bitmap)
>>  		goto out;
>>  
>> +	kvm_arch_get_dirty_log(kvm, memslot);
>>  	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
>>  
>>  	for (i = 0; !any && i < n/sizeof(long); ++i)
>> -- 
>> 1.5.6.5
>>


Thanks :).

Thread overview: 21+ messages
2009-06-10 16:23 [PATCH 0/2] RFC use dirty bit for page dirty tracking (v2) Izik Eidus
2009-06-10 16:23 ` [PATCH 1/2] kvm: fix dirty bit tracking for slots with large pages Izik Eidus
2009-06-10 16:23   ` [PATCH 2/2] kvm: change the dirty page tracking to work with dirty bity Izik Eidus
2009-06-10 17:00     ` Izik Eidus
2009-06-10 20:42     ` Marcelo Tosatti
2009-06-10 23:57       ` Izik Eidus [this message]
2009-06-10 23:59         ` Izik Eidus
2009-06-11  1:04         ` Marcelo Tosatti
2009-06-11 11:27           ` Izik Eidus
2009-06-11 12:24             ` Marcelo Tosatti
2009-06-11 15:49               ` Izik Eidus
2009-06-11  8:24     ` Ulrich Drepper
2009-06-11  9:44       ` Izik Eidus
2009-06-11  9:33     ` Avi Kivity
2009-06-11  9:48       ` Izik Eidus
2009-06-11  9:53         ` Avi Kivity
2009-06-14 11:10   ` [PATCH 1/2] kvm: fix dirty bit tracking for slots with large pages Avi Kivity
2009-06-15 14:58 ` [PATCH 0/2] RFC use dirty bit for page dirty tracking (v2) Ryan Harper
2009-06-15 15:19   ` Avi Kivity
2009-07-07  3:32     ` Ryan Harper
2009-07-07  5:11       ` Avi Kivity
