* [PATCH 2/3] migration dirtybitmap support ARMv7
@ 2014-04-15  1:24 Mario Smarduch
  2014-04-15  8:58 ` Marc Zyngier
  0 siblings, 1 reply; 3+ messages in thread
From: Mario Smarduch @ 2014-04-15  1:24 UTC (permalink / raw)
  To: kvmarm, christoffer.dall, marc.zyngier, 이정석,
	정성진
  Cc: kvm


- Support write protection of entire VM address space
- Split PMD sections into PTEs in migration mode
- Write protect dirty pages on dirty log read

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm/kvm/mmu.c |  158 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 156 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 7789857..502e776 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -56,6 +56,13 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+void kvm_tlb_flush_vmid(struct kvm *kvm)
+{
+	phys_addr_t x;
+	/* based on the function description the 2nd argument is irrelevant */
+	kvm_tlb_flush_vmid_ipa(kvm, x);
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -639,6 +646,143 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
 	return false;
 }
 
+/*
+ * Called when QEMU retrieves the dirty log and write protects dirty pages
+ * for the next QEMU call to retrieve the dirty log.
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask)
+{
+	phys_addr_t ipa;
+	pgd_t *pgdp = kvm->arch.pgd, *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+
+	while (mask) {
+		ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
+		pgd = pgdp + pgd_index(ipa);
+		if (!pgd_present(*pgd))
+			goto update_mask;
+		pud = pud_offset(pgd, ipa);
+		if (!pud_present(*pud))
+			goto update_mask;
+		pmd = pmd_offset(pud, ipa);
+		if (!pmd_present(*pmd))
+			goto update_mask;
+		pte = pte_offset_kernel(pmd, ipa);
+		if (!pte_present(*pte))
+			goto update_mask;
+		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
+			goto update_mask;
+		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
+		*pte = new_pte;
+update_mask:
+		mask &= mask - 1;
+	}
+}
+
+/*
+ * In migration mode, splits PMDs into PTEs to keep track of dirty pages.
+ * Without splitting, even light execution prevents migration from converging.
+ */
+bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr)
+{
+	struct page *page;
+	pfn_t pfn = pmd_pfn(*pmd);
+	pte_t *pte, new_pte;
+	int i;
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		return false;
+
+	pte = page_address(page);
+	for (i = 0; i < PMD_SIZE/PAGE_SIZE; i++) {
+		new_pte = pfn_pte(pfn+i, PAGE_S2);
+		pte[i] = new_pte;
+	}
+	kvm_clean_pte(pte);
+	pmd_populate_kernel(NULL, pmd, pte);
+
+	/*
+	 * Flush the whole TLB for the VM, relying on hardware broadcast.
+	 */
+	kvm_tlb_flush_vmid(kvm);
+	get_page(virt_to_page(pte));
+	return true;
+}
+
+/*
+ * Called from QEMU when migration dirty logging is started. Write protects
+ * the currently mapped set of pages. Future write faults mark pages dirty,
+ * and those pages are write protected again when the dirty log is retrieved.
+ */
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+	pgd_t *pgdp = kvm->arch.pgd;
+	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
+	u64 start = memslot->base_gfn << PAGE_SHIFT;
+	u64 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+	u64 addr = start, addr1;
+
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.migration_in_progress = true;
+	while (addr < end) {
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			kvm_tlb_flush_vmid(kvm);
+			cond_resched_lock(&kvm->mmu_lock);
+		}
+
+		pgd = pgdp + pgd_index(addr);
+		if (!pgd_present(*pgd)) {
+			addr = pgd_addr_end(addr, end);
+			continue;
+		}
+
+		pud = pud_offset(pgd, addr);
+		if (pud_huge(*pud) || !pud_present(*pud)) {
+			addr = pud_addr_end(addr, end);
+			continue;
+		}
+
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd)) {
+			addr = pmd_addr_end(addr, end);
+			continue;
+		}
+
+		if (kvm_pmd_huge(*pmd)) {
+			if (!split_pmd(kvm, pmd, addr)) {
+				kvm->arch.migration_in_progress = false;
+				return;
+			}
+			addr = pmd_addr_end(addr, end);
+			continue;
+		}
+
+		pte = pte_offset_kernel(pmd, addr);
+		addr1 = addr;
+		addr += PAGE_SIZE;
+		if (!pte_present(*pte))
+			continue;
+
+		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
+			continue;
+
+		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
+		*pte = new_pte;
+	}
+	kvm_tlb_flush_vmid(kvm);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot,
 			  unsigned long fault_status)
@@ -652,6 +796,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	struct vm_area_struct *vma;
 	pfn_t pfn;
+	bool migration_active = vcpu->kvm->arch.migration_in_progress;
 
 	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -705,10 +850,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
-	if (!hugetlb && !force_pte)
+	/* During migration don't rebuild huge pages */
+	if (!hugetlb && !force_pte && !migration_active)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
-	if (hugetlb) {
+	if (!migration_active && hugetlb) {
 		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
 		new_pmd = pmd_mkhuge(new_pmd);
 		if (writable) {
@@ -720,6 +866,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	} else {
 		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
 		if (writable) {
+			if (migration_active && hugetlb) {
+				/* get back pfn from fault_ipa */
+				pfn += (fault_ipa >> PAGE_SHIFT) &
+					((1 << (PMD_SHIFT - PAGE_SHIFT))-1);
+				new_pte = pfn_pte(pfn, PAGE_S2);
+			}
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
@@ -727,6 +879,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
 	}
 
+	if (writable)
+		mark_page_dirty(kvm, gfn);
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
-- 
1.7.9.5



* Re: [PATCH 2/3] migration dirtybitmap support ARMv7
  2014-04-15  1:24 [PATCH 2/3] migration dirtybitmap support ARMv7 Mario Smarduch
@ 2014-04-15  8:58 ` Marc Zyngier
  2014-04-16  1:18   ` Mario Smarduch
  0 siblings, 1 reply; 3+ messages in thread
From: Marc Zyngier @ 2014-04-15  8:58 UTC (permalink / raw)
  To: Mario Smarduch
  Cc: kvmarm, christoffer.dall, 이정석,
	정성진,
	kvm

On 15/04/14 02:24, Mario Smarduch wrote:
> 
> - Support write protection of entire VM address space
> - Split PMD sections into PTEs in migration mode
> - Write protect dirty pages on dirty log read
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>  arch/arm/kvm/mmu.c |  158 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 156 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 7789857..502e776 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -56,6 +56,13 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>  }
>  
> +void kvm_tlb_flush_vmid(struct kvm *kvm)
> +{
> +	phys_addr_t x;
> +	/* based on the function description the 2nd argument is irrelevant */
> +	kvm_tlb_flush_vmid_ipa(kvm, x);
> +}

We don't declare a variable just for the sake of having one. The address
passed to kvm_tlb_flush_vmid_ipa() serves a purpose, and while ARMv7
doesn't have a way to invalidate by IPA, you cannot violate the internal
API by passing junk to it. Think of the children!! (ARMv8 in this case) ;-)

If you need the invalidation of a full VMID, then introduce the proper
API at HYP level.
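
For illustration, such a helper could look something like the sketch below
(assuming a dedicated __kvm_tlb_flush_vmid HYP entry point is added next to
__kvm_tlb_flush_vmid_ipa; neither exists in this patch):

	/* Sketch only: flush all stage-2 TLB entries for this VM's VMID. */
	void kvm_tlb_flush_vmid(struct kvm *kvm)
	{
		if (kvm)
			kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
	}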

>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>  				  int min, int max)
>  {
> @@ -639,6 +646,143 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
>  	return false;
>  }
>  
> +/*
> + * Called when QEMU retrieves the dirty log and write protects dirty pages
> + * for the next QEMU call to retrieve the dirty log.

You really need to explain a few things here. What is this trying to do?
What is mask? Where is it called from? What are the side effects?
Locking conditions?
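
For reference, my reading of the calling convention (not spelled out in the
patch): gfn_offset selects a BITS_PER_LONG-page window inside the memslot and
mask is a bitmap of the pages in that window that were reported dirty, so the
loop visits each set bit, lowest first:

	while (mask) {
		gfn_t gfn = slot->base_gfn + gfn_offset + __ffs(mask);
		/* ... write protect the stage-2 PTE mapping gfn ... */
		mask &= mask - 1;	/* clear the lowest set bit */
	}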

> + */
> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> +		struct kvm_memory_slot *slot,
> +		gfn_t gfn_offset, unsigned long mask)
> +{
> +	phys_addr_t ipa;
> +	pgd_t *pgdp = kvm->arch.pgd, *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte, new_pte;
> +
> +	while (mask) {
> +		ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
> +		pgd = pgdp + pgd_index(ipa);
> +		if (!pgd_present(*pgd))
> +			goto update_mask;
> +		pud = pud_offset(pgd, ipa);
> +		if (!pud_present(*pud))
> +			goto update_mask;
> +		pmd = pmd_offset(pud, ipa);
> +		if (!pmd_present(*pmd))
> +			goto update_mask;
> +		pte = pte_offset_kernel(pmd, ipa);
> +		if (!pte_present(*pte))
> +			goto update_mask;
> +		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
> +			goto update_mask;
> +		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
> +		*pte = new_pte;
> +update_mask:
> +		mask &= mask - 1;
> +	}
> +}
> +
> +/*
> + * In migration mode, splits PMDs into PTEs to keep track of dirty pages.
> + * Without splitting, even light execution prevents migration from converging.

Same as above. We really need to understand the context, the conditions,
and how you make sure this doesn't race against the rest of the VM.

> + */
> +bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr)
> +{
> +	struct page *page;
> +	pfn_t pfn = pmd_pfn(*pmd);
> +	pte_t *pte, new_pte;
> +	int i;
> +
> +	page = alloc_page(GFP_KERNEL);
> +	if (page == NULL)
> +		return false;
> +
> +	pte = page_address(page);
> +	for (i = 0; i < PMD_SIZE/PAGE_SIZE; i++) {

Use PTRS_PER_PMD

> +		new_pte = pfn_pte(pfn+i, PAGE_S2);

Missing spaces.

> +		pte[i] = new_pte;
> +	}
> +	kvm_clean_pte(pte);
> +	pmd_populate_kernel(NULL, pmd, pte);
> +
> +	/*
> +	 * Flush the whole TLB for the VM, relying on hardware broadcast.
> +	 */
> +	kvm_tlb_flush_vmid(kvm);

Why do you nuke the whole TLBs for this VM? I assume you're going to
repeatedly call this for all the huge pages, aren't you? Can you delay
this flush to do it only once?

> +	get_page(virt_to_page(pte));
> +	return true;
> +}
> +
> +/*
> + * Called from QEMU when migration dirty logging is started. Write protects
> + * the currently mapped set of pages. Future write faults mark pages dirty,
> + * and those pages are write protected again when the dirty log is retrieved.

Same as above.

> + */
> +
> +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte, new_pte;
> +	pgd_t *pgdp = kvm->arch.pgd;
> +	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
> +	u64 start = memslot->base_gfn << PAGE_SHIFT;
> +	u64 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
> +	u64 addr = start, addr1;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	kvm->arch.migration_in_progress = true;
> +	while (addr < end) {
> +		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
> +			kvm_tlb_flush_vmid(kvm);

Looks like you're extremely flush happy. If you're holding the lock, why
do you need all the extra flushes in the previous function?

> +			cond_resched_lock(&kvm->mmu_lock);
> +		}
> +
> +		pgd = pgdp + pgd_index(addr);
> +		if (!pgd_present(*pgd)) {
> +			addr = pgd_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		pud = pud_offset(pgd, addr);
> +		if (pud_huge(*pud) || !pud_present(*pud)) {
> +			addr = pud_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		pmd = pmd_offset(pud, addr);
> +		if (!pmd_present(*pmd)) {
> +			addr = pmd_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		if (kvm_pmd_huge(*pmd)) {
> +			if (!split_pmd(kvm, pmd, addr)) {
> +				kvm->arch.migration_in_progress = false;
> +				return;

Bang, you're dead.

> +			}
> +			addr = pmd_addr_end(addr, end);
> +			continue;
> +		}
> +
> +		pte = pte_offset_kernel(pmd, addr);
> +		addr1 = addr;

What is addr1 used for?

> +		addr += PAGE_SIZE;
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
> +			continue;
> +
> +		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
> +		*pte = new_pte;
> +	}
> +	kvm_tlb_flush_vmid(kvm);

OK. So it looks to me that only *this* flush and the one just before the
cond_resched are useful. What am I missing?

> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot,
>  			  unsigned long fault_status)
> @@ -652,6 +796,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>  	struct vm_area_struct *vma;
>  	pfn_t pfn;
> +	bool migration_active = vcpu->kvm->arch.migration_in_progress;

The flag may not be observable until you've actually taken the mmu_lock.
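
i.e. something along these lines (sketch):

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;
	/* only sample the flag once the lock is held */
	migration_active = vcpu->kvm->arch.migration_in_progress;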

>  	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -705,10 +850,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> -	if (!hugetlb && !force_pte)
> +	/* During migration don't rebuild huge pages */
> +	if (!hugetlb && !force_pte && !migration_active)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
> -	if (hugetlb) {
> +	if (!migration_active && hugetlb) {
>  		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
>  		new_pmd = pmd_mkhuge(new_pmd);
>  		if (writable) {
> @@ -720,6 +866,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	} else {
>  		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
>  		if (writable) {
> +			if (migration_active && hugetlb) {
> +				/* get back pfn from fault_ipa */
> +				pfn += (fault_ipa >> PAGE_SHIFT) &
> +					((1 << (PMD_SHIFT - PAGE_SHIFT))-1);
> +				new_pte = pfn_pte(pfn, PAGE_S2);

Please explain this.

> +			}
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
>  		}
> @@ -727,6 +879,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
>  	}
>  
> +	if (writable)
> +		mark_page_dirty(kvm, gfn);
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> 

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...


* Re: [PATCH 2/3] migration dirtybitmap support ARMv7
  2014-04-15  8:58 ` Marc Zyngier
@ 2014-04-16  1:18   ` Mario Smarduch
  0 siblings, 0 replies; 3+ messages in thread
From: Mario Smarduch @ 2014-04-16  1:18 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: kvmarm, christoffer.dall, 이정석,
	정성진,
	kvm

On 04/15/2014 01:58 AM, Marc Zyngier wrote:
> 
> Why do you nuke the whole TLBs for this VM? I assume you're going to
> repeatedly call this for all the huge pages, aren't you? Can you delay
> this flush to do it only once?
> 
>> +	get_page(virt_to_page(pte));
>> +	return true;
>> +}
>> +
>> +/*
>> + * Called from QEMU when migration dirty logging is started. Write protects
>> + * the currently mapped set of pages. Future write faults mark pages dirty,
>> + * and those pages are write protected again when the dirty log is retrieved.
> 
> Same as above.
> 
>> + */
>> +
>> +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
>> +{
>> +	pgd_t *pgd;
>> +	pud_t *pud;
>> +	pmd_t *pmd;
>> +	pte_t *pte, new_pte;
>> +	pgd_t *pgdp = kvm->arch.pgd;
>> +	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
>> +	u64 start = memslot->base_gfn << PAGE_SHIFT;
>> +	u64 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
>> +	u64 addr = start, addr1;
>> +
>> +	spin_lock(&kvm->mmu_lock);
>> +	kvm->arch.migration_in_progress = true;
>> +	while (addr < end) {
>> +		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
>> +			kvm_tlb_flush_vmid(kvm);
> 
> Looks like you're extremely flush happy. If you're holding the lock, why
> do you need all the extra flushes in the previous function?

Reduced it to one flush, upon termination of the write protect loop.
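
Roughly (sketch of the rework, not the final code): split_pmd() no longer
flushes at all,

	kvm_clean_pte(pte);
	pmd_populate_kernel(NULL, pmd, pte);
	get_page(virt_to_page(pte));
	return true;	/* caller flushes once, after the whole loop */

and kvm_mmu_slot_remove_write_access() keeps only the flush after the while
loop, plus the one before cond_resched_lock() since the lock is dropped there.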




>> +
>> +		if (kvm_pmd_huge(*pmd)) {
>> +			if (!split_pmd(kvm, pmd, addr)) {
>> +				kvm->arch.migration_in_progress = false;
>> +				return;
> 
> Bang, you're dead.
Yes, added the unlock; also added a return code in the get dirty log function
to abort migration.
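
Something like this for the error path (sketch, the final form may differ):

	if (kvm_pmd_huge(*pmd)) {
		if (!split_pmd(kvm, pmd, addr)) {
			kvm->arch.migration_in_progress = false;
			kvm_tlb_flush_vmid(kvm);	/* some PMDs may be split */
			spin_unlock(&kvm->mmu_lock);
			return -ENOMEM;
		}
		addr = pmd_addr_end(addr, end);
		continue;
	}

with kvm_mmu_slot_remove_write_access() returning int so the dirty log path
can fail the migration.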

>>  		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
>>  		if (writable) {
>> +			if (migration_active && hugetlb) {
>> +				/* get back pfn from fault_ipa */
>> +				pfn += (fault_ipa >> PAGE_SHIFT) &
>> +					((1 << (PMD_SHIFT - PAGE_SHIFT))-1);
>> +				new_pte = pfn_pte(pfn, PAGE_S2);
> 
> Please explain this.
The next patch series will update this; there was another problem with
handling PMD huge pages and directing them to PTE handling.
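
For the record, my understanding of that computation: when the fault lands in
a region backed by a 2MB huge page, the pfn returned by the fault path is the
pfn of the huge page head, so the page index within the PMD has to be added
back to get the pfn of the individual 4K page. With 4K pages and 2MB PMDs
(PMD_SHIFT - PAGE_SHIFT = 9), for example:

	/* fault_ipa = 0x40123456 */
	idx = (fault_ipa >> PAGE_SHIFT) & ((1 << (PMD_SHIFT - PAGE_SHIFT)) - 1);
	    /* = 0x40123 & 0x1ff = 0x123 */
	pfn += idx;	/* pfn of the faulting 4K page within the 2MB region */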




