All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-10  4:17 ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-10  4:17 UTC (permalink / raw)
  To: christoffer.dall, marc.zyngier
  Cc: pbonzini, kvmarm, kvm, linux-arm-kernel, Mario Smarduch

This patch adds support for 2nd stage page fault handling while dirty page
logging is active. On huge page faults, huge pages are dissolved to normal pages, and
rebuilding of 2nd stage huge pages is blocked. In case migration is 
canceled this restriction is removed and huge pages may be rebuilt again.

This patch applies cleanly on top of patch series posted Dec. 15'th:
https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

Patch #11 has been dropped, and should not be applied.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---

Change Log since last RESEND v1 --> v2:
- Disallow dirty page logging of IO region - fail for initial write protect
  and disable logging code in 2nd stage page fault handler.
- Fixed auto spell correction errors

Change Log RESEND v0 --> v1:
- fixed bug exposed by new generic __get_user_pages_fast(), when region is 
  writable, prevent write protection of pte on read fault
- Removed marking entire huge page dirty on initial access
- don't dissolve huge pages of non-writable regions
- Made updates based on Christoffer's comments
  - renamed logging status function to memslot_is_logging()
  - changed few values to bool from longs
  - streamlined user_mem_abort() to eliminate extra conditional checks
---
 arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 105 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 73d506f..b878236 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
 #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
 #define kvm_pud_huge(_x)	pud_huge(_x)
 
+#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
+#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+#ifdef CONFIG_ARM
+	return !!memslot->dirty_bitmap;
+#else
+	return false;
+#endif
+}
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pmd:	pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+	if (!kvm_pmd_huge(*pmd))
+		return;
+
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pmd));
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+			  phys_addr_t addr, const pte_t *new_pte,
+			  unsigned long flags)
 {
 	pmd_t *pmd;
 	pte_t *pte, old_pte;
+	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
 
 	/* Create stage-2 page table mapping - Levels 0 and 1 */
 	pmd = stage2_get_pmd(kvm, cache, addr);
@@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		return 0;
 	}
 
+	/*
+	 * While dirty page logging - dissolve huge PMD, then continue on to
+	 * allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pmd(kvm, addr, pmd);
+
 	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
@@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		if (ret)
 			goto out;
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte,
+						KVM_S2PTE_FLAG_IS_IOMAP);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			goto out;
@@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	bool can_set_pte_rw = true;
+	unsigned long set_pte_flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_device_pfn(pfn))
+	if (kvm_is_device_pfn(pfn)) {
 		mem_type = PAGE_S2_DEVICE;
+		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
+	}
 
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+
+	/*
+	 * When logging is enabled general page fault handling changes:
+	 * -  Writable huge pages are dissolved on a read or write fault.
+	 * -  pte's are not allowed write permission on a read fault to
+	 *    writable region so future writes can be marked dirty
+	 * Access to non-writable region is unchanged, and logging of IO
+	 * regions is not allowed.
+	 */
+	if (memslot_is_logging(memslot) && writable) {
+		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
+		if (hugetlb) {
+			gfn += pte_index(fault_ipa);
+			pfn += pte_index(fault_ipa);
+			hugetlb = false;
+		}
+		force_pte = true;
+		if (!write_fault)
+			can_set_pte_rw = false;
+	}
+
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
@@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
-		if (writable) {
+
+		/*
+		 * Don't set write permission, for non-writable region, and
+		 * for read fault to writable region while logging.
+		 */
+		if (writable && can_set_pte_rw) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+							set_pte_flags);
 	}
 
+	if (write_fault)
+		mark_page_dirty(kvm, gfn);
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
@@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
 	pte_t *pte = (pte_t *)data;
 
-	stage2_set_pte(kvm, NULL, gpa, pte, false);
+	/*
+	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+	 * flag clear because MMU notifiers will have unmapped a huge PMD before
+	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore stage2_set_pte() never needs to clear out a huge PMD
+	 * through this calling path.
+	 */
+	stage2_set_pte(kvm, NULL, gpa, pte, 0);
 }
 
 
@@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	bool writable = !(mem->flags & KVM_MEM_READONLY);
 	int ret = 0;
 
-	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
+	/*
+	 * Let - enable of dirty page logging through, later check if it's for
+	 * an IO region and fail.
+	 */
+	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+		change == KVM_MR_FLAGS_ONLY &&
+		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		return 0;
 
 	/*
@@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
 					 vm_start - vma->vm_start;
 
-			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
+			if (change != KVM_MR_FLAGS_ONLY)
+				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
 						    vm_end - vm_start,
 						    writable);
+			else
+				/* IO region dirty page logging not allowed */
+				return -EINVAL;
+
 			if (ret)
 				break;
 		}
 		hva = vm_end;
 	} while (hva < reg_end);
 
+	/* Anything after here doesn't apply to memslot flag changes */
+	if (change == KVM_MR_FLAGS_ONLY)
+		return ret;
+
 	spin_lock(&kvm->mmu_lock);
 	if (ret)
 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-10  4:17 ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-10  4:17 UTC (permalink / raw)
  To: linux-arm-kernel

This patch adds support for 2nd stage page fault handling while dirty page
logging is active. On huge page faults, huge pages are dissolved to normal pages, and
rebuilding of 2nd stage huge pages is blocked. In case migration is 
canceled this restriction is removed and huge pages may be rebuilt again.

This patch applies cleanly on top of patch series posted Dec. 15'th:
https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

Patch #11 has been dropped, and should not be applied.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---

Change Log since last RESEND v1 --> v2:
- Disallow dirty page logging of IO region - fail for initial write protect
  and disable logging code in 2nd stage page fault handler.
- Fixed auto spell correction errors

Change Log RESEND v0 --> v1:
- fixed bug exposed by new generic __get_user_pages_fast(), when region is 
  writable, prevent write protection of pte on read fault
- Removed marking entire huge page dirty on initial access
- don't dissolve huge pages of non-writable regions
- Made updates based on Christoffer's comments
  - renamed logging status function to memslot_is_logging()
  - changed few values to bool from longs
  - streamlined user_mem_abort() to eliminate extra conditional checks
---
 arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 105 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 73d506f..b878236 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
 #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
 #define kvm_pud_huge(_x)	pud_huge(_x)
 
+#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
+#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+#ifdef CONFIG_ARM
+	return !!memslot->dirty_bitmap;
+#else
+	return false;
+#endif
+}
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pmd:	pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+	if (!kvm_pmd_huge(*pmd))
+		return;
+
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pmd));
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+			  phys_addr_t addr, const pte_t *new_pte,
+			  unsigned long flags)
 {
 	pmd_t *pmd;
 	pte_t *pte, old_pte;
+	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
 
 	/* Create stage-2 page table mapping - Levels 0 and 1 */
 	pmd = stage2_get_pmd(kvm, cache, addr);
@@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		return 0;
 	}
 
+	/*
+	 * While dirty page logging - dissolve huge PMD, then continue on to
+	 * allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pmd(kvm, addr, pmd);
+
 	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
@@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		if (ret)
 			goto out;
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte,
+						KVM_S2PTE_FLAG_IS_IOMAP);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			goto out;
@@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	bool can_set_pte_rw = true;
+	unsigned long set_pte_flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_device_pfn(pfn))
+	if (kvm_is_device_pfn(pfn)) {
 		mem_type = PAGE_S2_DEVICE;
+		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
+	}
 
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+
+	/*
+	 * When logging is enabled general page fault handling changes:
+	 * -  Writable huge pages are dissolved on a read or write fault.
+	 * -  pte's are not allowed write permission on a read fault to
+	 *    writable region so future writes can be marked dirty
+	 * Access to non-writable region is unchanged, and logging of IO
+	 * regions is not allowed.
+	 */
+	if (memslot_is_logging(memslot) && writable) {
+		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
+		if (hugetlb) {
+			gfn += pte_index(fault_ipa);
+			pfn += pte_index(fault_ipa);
+			hugetlb = false;
+		}
+		force_pte = true;
+		if (!write_fault)
+			can_set_pte_rw = false;
+	}
+
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
@@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
-		if (writable) {
+
+		/*
+		 * Don't set write permission, for non-writable region, and
+		 * for read fault to writable region while logging.
+		 */
+		if (writable && can_set_pte_rw) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+							set_pte_flags);
 	}
 
+	if (write_fault)
+		mark_page_dirty(kvm, gfn);
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
@@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
 	pte_t *pte = (pte_t *)data;
 
-	stage2_set_pte(kvm, NULL, gpa, pte, false);
+	/*
+	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+	 * flag clear because MMU notifiers will have unmapped a huge PMD before
+	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore stage2_set_pte() never needs to clear out a huge PMD
+	 * through this calling path.
+	 */
+	stage2_set_pte(kvm, NULL, gpa, pte, 0);
 }
 
 
@@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	bool writable = !(mem->flags & KVM_MEM_READONLY);
 	int ret = 0;
 
-	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
+	/*
+	 * Let - enable of dirty page logging through, later check if it's for
+	 * an IO region and fail.
+	 */
+	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+		change == KVM_MR_FLAGS_ONLY &&
+		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		return 0;
 
 	/*
@@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
 					 vm_start - vma->vm_start;
 
-			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
+			if (change != KVM_MR_FLAGS_ONLY)
+				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
 						    vm_end - vm_start,
 						    writable);
+			else
+				/* IO region dirty page logging not allowed */
+				return -EINVAL;
+
 			if (ret)
 				break;
 		}
 		hva = vm_end;
 	} while (hva < reg_end);
 
+	/* Anything after here doesn't apply to memslot flag changes */
+	if (change == KVM_MR_FLAGS_ONLY)
+		return ret;
+
 	spin_lock(&kvm->mmu_lock);
 	if (ret)
 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-10  4:17 ` Mario Smarduch
@ 2015-01-11 14:00   ` Christoffer Dall
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-11 14:00 UTC (permalink / raw)
  To: Mario Smarduch; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
> This patch adds support for 2nd stage page fault handling while dirty page
> logging. On huge page faults, huge pages are dissolved to normal pages, and
> rebuilding of 2nd stage huge pages is blocked. In case migration is 
> canceled this restriction is removed and huge pages may be rebuilt again.
> 
> This patch applies cleanly on top of patch series posted Dec. 15'th:
> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

In the future such information should also go under the ---
separator.

> 
> Patch #11 has been dropped, and should not be applied.

this should go under the '---' separator too.

> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
> 
> Change Log since last RESEND v1 --> v2:
> - Disallow dirty page logging of IO region - fail for initial write protect
>   and disable logging code in 2nd stage page fault handler.
> - Fixed auto spell correction errors
> 
> Change Log RESEND v0 --> v1:
> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>   writable, prevent write protection of pte on read fault
> - Removed marking entire huge page dirty on initial access
> - don't dissolve huge pages of non-writable regions
> - Made updates based on Christoffers comments
>   - renamed logging status function to memslot_is_logging()
>   - changed few values to bool from longs
>   - streamlined user_mem_abort() to eliminate extra conditional checks
> ---
>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 105 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 73d506f..b878236 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>  #define kvm_pud_huge(_x)	pud_huge(_x)
>  
> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
> +
> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> +{
> +#ifdef CONFIG_ARM
> +	return !!memslot->dirty_bitmap;
> +#else
> +	return false;
> +#endif
> +}
> +
>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  {
>  	/*
> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>  }
>  
> +/**
> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
> + * @kvm:	pointer to kvm structure.
> + * @addr:	IPA
> + * @pmd:	pmd pointer for IPA
> + *
> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
> + * pages in the range dirty.
> + */
> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
> +{
> +	if (!kvm_pmd_huge(*pmd))
> +		return;
> +
> +	pmd_clear(pmd);
> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
> +	put_page(virt_to_page(pmd));
> +}
> +
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>  				  int min, int max)
>  {
> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  }
>  
>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> +			  phys_addr_t addr, const pte_t *new_pte,
> +			  unsigned long flags)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, old_pte;
> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>  
>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>  	pmd = stage2_get_pmd(kvm, cache, addr);
> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>  		return 0;
>  	}
>  
> +	/*
> +	 * While dirty page logging - dissolve huge PMD, then continue on to
> +	 * allocate page.
> +	 */
> +	if (logging_active)
> +		stage2_dissolve_pmd(kvm, addr, pmd);
> +
>  	/* Create stage-2 page mappings - Level 2 */
>  	if (pmd_none(*pmd)) {
>  		if (!cache)
> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  		if (ret)
>  			goto out;
>  		spin_lock(&kvm->mmu_lock);
> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
> +						KVM_S2PTE_FLAG_IS_IOMAP);
>  		spin_unlock(&kvm->mmu_lock);
>  		if (ret)
>  			goto out;
> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	pfn_t pfn;
>  	pgprot_t mem_type = PAGE_S2;
>  	bool fault_ipa_uncached;
> +	bool can_set_pte_rw = true;
> +	unsigned long set_pte_flags = 0;
>  
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> +

stray whitespace change?

>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>  	down_read(&current->mm->mmap_sem);
>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> -	if (kvm_is_device_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn)) {
>  		mem_type = PAGE_S2_DEVICE;
> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> +	}
>  
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> +
> +	/*
> +	 * When logging is enabled general page fault handling changes:
> +	 * -  Writable huge pages are dissolved on a read or write fault.

why dissolve huge pages on a read fault?

> +	 * -  pte's are not allowed write permission on a read fault to
> +	 *    writable region so future writes can be marked dirty

new line

> +	 * Access to non-writable region is unchanged, and logging of IO
> +	 * regions is not allowed.
> +	 */
> +	if (memslot_is_logging(memslot) && writable) {
> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
> +		if (hugetlb) {
> +			gfn += pte_index(fault_ipa);
> +			pfn += pte_index(fault_ipa);
> +			hugetlb = false;
> +		}
> +		force_pte = true;

uh, no, this is not what I meant, see my example (untested, partial)
patch in the end of this mail.

> +		if (!write_fault)
> +			can_set_pte_rw = false;
> +	}
> +
>  	if (!hugetlb && !force_pte)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> -		if (writable) {
> +
> +		/*
> +		 * Don't set write permission, for non-writable region, and
> +		 * for read fault to writable region while logging.
> +		 */
> +		if (writable && can_set_pte_rw) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> +							set_pte_flags);
>  	}
>  
> +	if (write_fault)
> +		mark_page_dirty(kvm, gfn);
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>  {
>  	pte_t *pte = (pte_t *)data;
>  
> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> +	/*
> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
> +	 * through this calling path.
> +	 */
> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>  }
>  
>  
> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>  	int ret = 0;
>  
> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
> +	/*
> +	 * Let - enable of dirty page logging through, later check if it's for
> +	 * an IO region and fail.
> +	 */

I don't understand this comment or find it helpful.

> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
> +		change == KVM_MR_FLAGS_ONLY &&
> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))

this looks wrong, because you can now remove all the other checks of
change != and you are not returning early for KVM_MR_DELETE.

I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
and then after the 'return 0' check the subconditions for change ==
KVM_MR_FLAGS_ONLY.

>  		return 0;
>  
>  	/*
> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>  					 vm_start - vma->vm_start;
>  
> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
> +			if (change != KVM_MR_FLAGS_ONLY)
> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>  						    vm_end - vm_start,
>  						    writable);
> +			else
> +				/* IO region dirty page logging not allowed */
> +				return -EINVAL;
> +

this whole thing also looks weird.  I think you just need to add a check
before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
return an error in that case (you've identified a user attempting to set
dirty page logging on something that points to device memory, it doesn't
matter at this point through which 'change' it is done).

>  			if (ret)
>  				break;
>  		}
>  		hva = vm_end;
>  	} while (hva < reg_end);
>  
> +	/* Anything after here doesn't apply to memslot flag changes */
> +	if (change == KVM_MR_FLAGS_ONLY)
> +		return ret;
> +
>  	spin_lock(&kvm->mmu_lock);
>  	if (ret)
>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
> -- 


What I meant last time around concerning user_mem_abort was more
something like this:

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1dc9778..38ea58e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	/*
+	 * Writes to pages in a memslot with logging enabled are always logged
+	 * on a single page-by-page basis.
+	 */
+	if (memslot_is_logging(memslot) && write_fault)
+		force_pte = true;
+
+	if (is_vm_hugetlb_page(vma) && !force_pte) {
 		hugetlb = true;
 		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
 	} else {
@@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
+	if (memslot_is_logging(memslot) && !write_fault)
+		writable = false;
+
 	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;
 
@@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 					  fault_ipa_uncached);
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
+		unsigned long flags = 0;
 		pte_t new_pte = pfn_pte(pfn, mem_type);
+
 		if (writable) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+
+		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
+			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+
+		if (memslot_is_logging(memslot))
+			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}
 
 

Thanks,
-Christoffer

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-11 14:00   ` Christoffer Dall
  0 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-11 14:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
> This patch adds support for 2nd stage page fault handling while dirty page
> logging. On huge page faults, huge pages are dissolved to normal pages, and
> rebuilding of 2nd stage huge pages is blocked. In case migration is 
> canceled this restriction is removed and huge pages may be rebuilt again.
> 
> This patch applies cleanly on top of patch series posted Dec. 15'th:
> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

In the future such information should also go under the ---
separator.

> 
> Patch #11 has been dropped, and should not be applied.

this should go under the '---' separator too.

> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
> 
> Change Log since last RESEND v1 --> v2:
> - Disallow dirty page logging of IO region - fail for initial write protect
>   and disable logging code in 2nd stage page fault handler.
> - Fixed auto spell correction errors
> 
> Change Log RESEND v0 --> v1:
> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>   writable, prevent write protection of pte on read fault
> - Removed marking entire huge page dirty on initial access
> - don't dissolve huge pages of non-writable regions
> - Made updates based on Christoffers comments
>   - renamed logging status function to memslot_is_logging()
>   - changed few values to bool from longs
>   - streamlined user_mem_abort() to eliminate extra conditional checks
> ---
>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 105 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 73d506f..b878236 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>  #define kvm_pud_huge(_x)	pud_huge(_x)
>  
> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
> +
> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> +{
> +#ifdef CONFIG_ARM
> +	return !!memslot->dirty_bitmap;
> +#else
> +	return false;
> +#endif
> +}
> +
>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  {
>  	/*
> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>  }
>  
> +/**
> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
> + * @kvm:	pointer to kvm structure.
> + * @addr:	IPA
> + * @pmd:	pmd pointer for IPA
> + *
> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
> + * pages in the range dirty.
> + */
> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
> +{
> +	if (!kvm_pmd_huge(*pmd))
> +		return;
> +
> +	pmd_clear(pmd);
> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
> +	put_page(virt_to_page(pmd));
> +}
> +
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>  				  int min, int max)
>  {
> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  }
>  
>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> +			  phys_addr_t addr, const pte_t *new_pte,
> +			  unsigned long flags)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, old_pte;
> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>  
>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>  	pmd = stage2_get_pmd(kvm, cache, addr);
> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>  		return 0;
>  	}
>  
> +	/*
> +	 * While dirty page logging - dissolve huge PMD, then continue on to
> +	 * allocate page.
> +	 */
> +	if (logging_active)
> +		stage2_dissolve_pmd(kvm, addr, pmd);
> +
>  	/* Create stage-2 page mappings - Level 2 */
>  	if (pmd_none(*pmd)) {
>  		if (!cache)
> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  		if (ret)
>  			goto out;
>  		spin_lock(&kvm->mmu_lock);
> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
> +						KVM_S2PTE_FLAG_IS_IOMAP);
>  		spin_unlock(&kvm->mmu_lock);
>  		if (ret)
>  			goto out;
> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	pfn_t pfn;
>  	pgprot_t mem_type = PAGE_S2;
>  	bool fault_ipa_uncached;
> +	bool can_set_pte_rw = true;
> +	unsigned long set_pte_flags = 0;
>  
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> +

stray whitespace change?

>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>  	down_read(&current->mm->mmap_sem);
>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> -	if (kvm_is_device_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn)) {
>  		mem_type = PAGE_S2_DEVICE;
> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> +	}
>  
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> +
> +	/*
> +	 * When logging is enabled general page fault handling changes:
> +	 * -  Writable huge pages are dissolved on a read or write fault.

why dissolve huge pages on a read fault?

> +	 * -  pte's are not allowed write permission on a read fault to
> +	 *    writable region so future writes can be marked dirty

new line

> +	 * Access to non-writable region is unchanged, and logging of IO
> +	 * regions is not allowed.
> +	 */
> +	if (memslot_is_logging(memslot) && writable) {
> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
> +		if (hugetlb) {
> +			gfn += pte_index(fault_ipa);
> +			pfn += pte_index(fault_ipa);
> +			hugetlb = false;
> +		}
> +		force_pte = true;

uh, not this is not what I meant, see my example (untested, partial)
patch in the end of this mail.

> +		if (!write_fault)
> +			can_set_pte_rw = false;
> +	}
> +
>  	if (!hugetlb && !force_pte)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> -		if (writable) {
> +
> +		/*
> +		 * Don't set write permission, for non-writable region, and
> +		 * for read fault to writable region while logging.
> +		 */
> +		if (writable && can_set_pte_rw) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> +							set_pte_flags);
>  	}
>  
> +	if (write_fault)
> +		mark_page_dirty(kvm, gfn);
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>  {
>  	pte_t *pte = (pte_t *)data;
>  
> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> +	/*
> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
> +	 * through this calling path.
> +	 */
> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>  }
>  
>  
> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>  	int ret = 0;
>  
> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
> +	/*
> +	 * Let - enable of dirty page logging through, later check if it's for
> +	 * an IO region and fail.
> +	 */

I don't understand this comment or find it helpful.

> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
> +		change == KVM_MR_FLAGS_ONLY &&
> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))

this looks wrong, because you can now remove all the other checks of
change != and you are not returning early for KVM_MR_DELETE.

I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
and then after the 'return 0' check the subconditions for change ==
KVM_MR_FLAGS_ONLY.

>  		return 0;
>  
>  	/*
> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>  					 vm_start - vma->vm_start;
>  
> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
> +			if (change != KVM_MR_FLAGS_ONLY)
> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>  						    vm_end - vm_start,
>  						    writable);
> +			else
> +				/* IO region dirty page logging not allowed */
> +				return -EINVAL;
> +

this whole thing also looks weird.  I think you just need to add a check
before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
return an error in that case (you've identified a user attempting to set
dirty page logging on something that points to device memory, it doesn't
matter at this point through which 'change' it is done).

>  			if (ret)
>  				break;
>  		}
>  		hva = vm_end;
>  	} while (hva < reg_end);
>  
> +	/* Anything after here doesn't apply to memslot flag changes */
> +	if (change == KVM_MR_FLAGS_ONLY)
> +		return ret;
> +
>  	spin_lock(&kvm->mmu_lock);
>  	if (ret)
>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
> -- 


What I meant last time around concerning user_mem_abort was more
something like this:

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1dc9778..38ea58e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	/*
+	 * Writes to pages in a memslot with logging enabled are always logged
> +	 * on a single page-by-page basis.
+	 */
+	if (memslot_is_logging(memslot) && write_fault)
+		force_pte = true;
+
+	if (is_vm_hugetlb_page(vma) && !force_pte) {
 		hugetlb = true;
 		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
 	} else {
@@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
+	if (memslot_is_logging(memslot) && !write_fault)
+		writable = false;
+
 	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;
 
@@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 					  fault_ipa_uncached);
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
+		unsigned long flags = 0;
 		pte_t new_pte = pfn_pte(pfn, mem_type);
+
 		if (writable) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+
+		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
+			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+
+		if (memslot_is_logging(memslot))
+			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}
 
 

Thanks,
-Christoffer

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-11 14:00   ` Christoffer Dall
@ 2015-01-12 16:27     ` Mario Smarduch
  -1 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-12 16:27 UTC (permalink / raw)
  To: Christoffer Dall; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On 01/11/2015 06:00 AM, Christoffer Dall wrote:
> On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
>> This patch adds support for 2nd stage page fault handling while dirty page
>> logging. On huge page faults, huge pages are dissolved to normal pages, and
>> rebuilding of 2nd stage huge pages is blocked. In case migration is 
>> canceled this restriction is removed and huge pages may be rebuilt again.
>>
>> This patch applies cleanly on top of patch series posted Dec. 15'th:
>> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
> 
> In the future such information should also go under the ---
> separator.
> 
>>
>> Patch #11 has been dropped, and should not be applied.
> 
> this should go under the '---' separator too.
Ok will keep that in mind.
> 
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>
>> Change Log since last RESEND v1 --> v2:
>> - Disallow dirty page logging of IO region - fail for initial write protect
>>   and disable logging code in 2nd stage page fault handler.
>> - Fixed auto spell correction errors
>>
>> Change Log RESEND v0 --> v1:
>> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>>   writable, prevent write protection of pte on read fault
>> - Removed marking entire huge page dirty on initial access
>> - don't dissolve huge pages of non-writable regions
>> - Made updates based on Christoffers comments
>>   - renamed logging status function to memslot_is_logging()
>>   - changed few values to bool from longs
>>   - streamlined user_mem_abort() to eliminate extra conditional checks
>> ---
>>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 105 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 73d506f..b878236 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>>  #define kvm_pud_huge(_x)	pud_huge(_x)
>>  
>> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
>> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
>> +
>> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
>> +{
>> +#ifdef CONFIG_ARM
>> +	return !!memslot->dirty_bitmap;
>> +#else
>> +	return false;
>> +#endif
>> +}
>> +
>>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>  {
>>  	/*
>> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>>  }
>>  
>> +/**
>> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
>> + * @kvm:	pointer to kvm structure.
>> + * @addr:	IPA
>> + * @pmd:	pmd pointer for IPA
>> + *
>> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
>> + * pages in the range dirty.
>> + */
>> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
>> +{
>> +	if (!kvm_pmd_huge(*pmd))
>> +		return;
>> +
>> +	pmd_clear(pmd);
>> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
>> +	put_page(virt_to_page(pmd));
>> +}
>> +
>>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>>  				  int min, int max)
>>  {
>> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>  }
>>  
>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>> +			  phys_addr_t addr, const pte_t *new_pte,
>> +			  unsigned long flags)
>>  {
>>  	pmd_t *pmd;
>>  	pte_t *pte, old_pte;
>> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
>> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>  
>>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>>  	pmd = stage2_get_pmd(kvm, cache, addr);
>> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>  		return 0;
>>  	}
>>  
>> +	/*
>> +	 * While dirty page logging - dissolve huge PMD, then continue on to
>> +	 * allocate page.
>> +	 */
>> +	if (logging_active)
>> +		stage2_dissolve_pmd(kvm, addr, pmd);
>> +
>>  	/* Create stage-2 page mappings - Level 2 */
>>  	if (pmd_none(*pmd)) {
>>  		if (!cache)
>> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>  		if (ret)
>>  			goto out;
>>  		spin_lock(&kvm->mmu_lock);
>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
>> +						KVM_S2PTE_FLAG_IS_IOMAP);
>>  		spin_unlock(&kvm->mmu_lock);
>>  		if (ret)
>>  			goto out;
>> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	pfn_t pfn;
>>  	pgprot_t mem_type = PAGE_S2;
>>  	bool fault_ipa_uncached;
>> +	bool can_set_pte_rw = true;
>> +	unsigned long set_pte_flags = 0;
>>  
>>  	write_fault = kvm_is_write_fault(vcpu);
>>  	if (fault_status == FSC_PERM && !write_fault) {
>> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  		return -EFAULT;
>>  	}
>>  
>> +
> 
> stray whitespace change?
Got it.
> 
>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>  	down_read(&current->mm->mmap_sem);
>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	if (is_error_pfn(pfn))
>>  		return -EFAULT;
>>  
>> -	if (kvm_is_device_pfn(pfn))
>> +	if (kvm_is_device_pfn(pfn)) {
>>  		mem_type = PAGE_S2_DEVICE;
>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>> +	}
>>  
>>  	spin_lock(&kvm->mmu_lock);
>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>  		goto out_unlock;
>> +
>> +	/*
>> +	 * When logging is enabled general page fault handling changes:
>> +	 * -  Writable huge pages are dissolved on a read or write fault.
> 
> why dissolve huge pages on a read fault?

What I noticed is that on a write you would dissolve the huge page, and
on a read you would rebuild the THP — flipping back and forth like that,
performance & convergence were really bad.
> 
>> +	 * -  pte's are not allowed write permission on a read fault to
>> +	 *    writable region so future writes can be marked dirty
> 
> new line
ok.
> 
>> +	 * Access to non-writable region is unchanged, and logging of IO
>> +	 * regions is not allowed.
>> +	 */
>> +	if (memslot_is_logging(memslot) && writable) {
>> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>> +		if (hugetlb) {
>> +			gfn += pte_index(fault_ipa);
>> +			pfn += pte_index(fault_ipa);
>> +			hugetlb = false;
>> +		}
>> +		force_pte = true;
> 
> uh, not this is not what I meant, see my example (untested, partial)
> patch in the end of this mail.
I put some comments on your patch.
> 
>> +		if (!write_fault)
>> +			can_set_pte_rw = false;
>> +	}
>> +
>>  	if (!hugetlb && !force_pte)
>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>  
>> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>  	} else {
>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>> -		if (writable) {
>> +
>> +		/*
>> +		 * Don't set write permission, for non-writable region, and
>> +		 * for read fault to writable region while logging.
>> +		 */
>> +		if (writable && can_set_pte_rw) {
>>  			kvm_set_s2pte_writable(&new_pte);
>>  			kvm_set_pfn_dirty(pfn);
>>  		}
>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>  					  fault_ipa_uncached);
>>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>> +							set_pte_flags);
>>  	}
>>  
>> +	if (write_fault)
>> +		mark_page_dirty(kvm, gfn);
>>  
>>  out_unlock:
>>  	spin_unlock(&kvm->mmu_lock);
>> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>  {
>>  	pte_t *pte = (pte_t *)data;
>>  
>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>> +	/*
>> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
>> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
>> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
>> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
>> +	 * through this calling path.
>> +	 */
>> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>>  }
>>  
>>  
>> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>>  	int ret = 0;
>>  
>> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
>> +	/*
>> +	 * Let - enable of dirty page logging through, later check if it's for
>> +	 * an IO region and fail.
>> +	 */
> 
> I don't understand this comment or find it helpful.
Will remove.
> 
>> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
>> +		change == KVM_MR_FLAGS_ONLY &&
>> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
> 
> this looks wrong, because you can now remove all the other checks of
> change != and you are not returning early for KVM_MR_DELETE.
> 
> I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
> and then after the 'return 0' check the subconditions for change ==
> KVM_MR_FLAGS_ONLY.
Yeah, oh boy time to get a new batch of brown bags.

I was trying to limit the conditional to create, move, and the
dirty-page-logging flag only, in case some other flags get toggled
often and we waste time walking through the VMAs.
> 
>>  		return 0;
>>  
>>  	/*
>> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>>  					 vm_start - vma->vm_start;
>>  
>> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>> +			if (change != KVM_MR_FLAGS_ONLY)
>> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>  						    vm_end - vm_start,
>>  						    writable);
>> +			else
>> +				/* IO region dirty page logging not allowed */
>> +				return -EINVAL;
>> +
> 
> this whole thing also looks weird.  I think you just need to add a check
> before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
> return an error in that case (you've identified a user attempting to set
> dirty page logging on something that points to device memory, it doesn't
> matter at this point through which 'change' it is done).

Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.

> 
>>  			if (ret)
>>  				break;
>>  		}
>>  		hva = vm_end;
>>  	} while (hva < reg_end);
>>  
>> +	/* Anything after here doesn't apply to memslot flag changes */
>> +	if (change == KVM_MR_FLAGS_ONLY)
>> +		return ret;
>> +
>>  	spin_lock(&kvm->mmu_lock);
>>  	if (ret)
>>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
>> -- 
> 
> 
> What I meant last time around concerning user_mem_abort was more
> something like this:
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 1dc9778..38ea58e 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> -	if (is_vm_hugetlb_page(vma)) {
> +	/*
> +	 * Writes to pages in a memslot with logging enabled are always logged
> +	 * on a single page-by-page basis.
> +	 */
> +	if (memslot_is_logging(memslot) && write_fault)
> +		force_pte = true;

If it's a write you take the pte route and dissolve the huge page;
if it's a read you reconstruct the THP — that seems to yield pretty
bad results.
> +
> +	if (is_vm_hugetlb_page(vma) && !force_pte) {
>  		hugetlb = true;
>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>  	} else {
> @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> +	if (memslot_is_logging(memslot) && !write_fault)
> +		writable = false;
Ok reusing writable is better.
> +
>  	if (kvm_is_device_pfn(pfn))
>  		mem_type = PAGE_S2_DEVICE;
>  
> @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  					  fault_ipa_uncached);
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
> +		unsigned long flags = 0;
>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> +
>  		if (writable) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> +
> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> +
> +		if (memslot_is_logging(memslot))
> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
Although we don't know if device mappings will be handled here.

Thanks.
> +
> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
>  	}
>  
>  
> 
> Thanks,
> -Christoffer
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-12 16:27     ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-12 16:27 UTC (permalink / raw)
  To: linux-arm-kernel

On 01/11/2015 06:00 AM, Christoffer Dall wrote:
> On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
>> This patch adds support for 2nd stage page fault handling while dirty page
>> logging. On huge page faults, huge pages are dissolved to normal pages, and
>> rebuilding of 2nd stage huge pages is blocked. In case migration is 
>> canceled this restriction is removed and huge pages may be rebuilt again.
>>
>> This patch applies cleanly on top of patch series posted Dec. 15'th:
>> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
> 
> In the future such information should also go under the ---
> separator.
> 
>>
>> Patch #11 has been dropped, and should not be applied.
> 
> this should go under the '---' separator too.
Ok will keep that in mind.
> 
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>
>> Change Log since last RESEND v1 --> v2:
>> - Disallow dirty page logging of IO region - fail for initial write protect
>>   and disable logging code in 2nd stage page fault handler.
>> - Fixed auto spell correction errors
>>
>> Change Log RESEND v0 --> v1:
>> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>>   writable, prevent write protection of pte on read fault
>> - Removed marking entire huge page dirty on initial access
>> - don't dissolve huge pages of non-writable regions
>> - Made updates based on Christoffers comments
>>   - renamed logging status function to memslot_is_logging()
>>   - changed few values to bool from longs
>>   - streamlined user_mem_abort() to eliminate extra conditional checks
>> ---
>>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 105 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 73d506f..b878236 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>>  #define kvm_pud_huge(_x)	pud_huge(_x)
>>  
>> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
>> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
>> +
>> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
>> +{
>> +#ifdef CONFIG_ARM
>> +	return !!memslot->dirty_bitmap;
>> +#else
>> +	return false;
>> +#endif
>> +}
>> +
>>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>  {
>>  	/*
>> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>>  }
>>  
>> +/**
>> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
>> + * @kvm:	pointer to kvm structure.
>> + * @addr:	IPA
>> + * @pmd:	pmd pointer for IPA
>> + *
>> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
>> + * pages in the range dirty.
>> + */
>> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
>> +{
>> +	if (!kvm_pmd_huge(*pmd))
>> +		return;
>> +
>> +	pmd_clear(pmd);
>> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
>> +	put_page(virt_to_page(pmd));
>> +}
>> +
>>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>>  				  int min, int max)
>>  {
>> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>  }
>>  
>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>> +			  phys_addr_t addr, const pte_t *new_pte,
>> +			  unsigned long flags)
>>  {
>>  	pmd_t *pmd;
>>  	pte_t *pte, old_pte;
>> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
>> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>  
>>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>>  	pmd = stage2_get_pmd(kvm, cache, addr);
>> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>  		return 0;
>>  	}
>>  
>> +	/*
>> +	 * While dirty page logging - dissolve huge PMD, then continue on to
>> +	 * allocate page.
>> +	 */
>> +	if (logging_active)
>> +		stage2_dissolve_pmd(kvm, addr, pmd);
>> +
>>  	/* Create stage-2 page mappings - Level 2 */
>>  	if (pmd_none(*pmd)) {
>>  		if (!cache)
>> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>  		if (ret)
>>  			goto out;
>>  		spin_lock(&kvm->mmu_lock);
>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
>> +						KVM_S2PTE_FLAG_IS_IOMAP);
>>  		spin_unlock(&kvm->mmu_lock);
>>  		if (ret)
>>  			goto out;
>> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	pfn_t pfn;
>>  	pgprot_t mem_type = PAGE_S2;
>>  	bool fault_ipa_uncached;
>> +	bool can_set_pte_rw = true;
>> +	unsigned long set_pte_flags = 0;
>>  
>>  	write_fault = kvm_is_write_fault(vcpu);
>>  	if (fault_status == FSC_PERM && !write_fault) {
>> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  		return -EFAULT;
>>  	}
>>  
>> +
> 
> stray whitespace change?
Got it.
> 
>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>  	down_read(&current->mm->mmap_sem);
>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	if (is_error_pfn(pfn))
>>  		return -EFAULT;
>>  
>> -	if (kvm_is_device_pfn(pfn))
>> +	if (kvm_is_device_pfn(pfn)) {
>>  		mem_type = PAGE_S2_DEVICE;
>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>> +	}
>>  
>>  	spin_lock(&kvm->mmu_lock);
>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>  		goto out_unlock;
>> +
>> +	/*
>> +	 * When logging is enabled general page fault handling changes:
>> +	 * -  Writable huge pages are dissolved on a read or write fault.
> 
> why dissolve huge pages on a read fault?

What I noticed was that on a write you would dissolve the huge page,
and on a read you would rebuild the THP — flipping back and forth
like that, performance & convergence were really bad.
> 
>> +	 * -  pte's are not allowed write permission on a read fault to
>> +	 *    writable region so future writes can be marked dirty
> 
> new line
ok.
> 
>> +	 * Access to non-writable region is unchanged, and logging of IO
>> +	 * regions is not allowed.
>> +	 */
>> +	if (memslot_is_logging(memslot) && writable) {
>> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>> +		if (hugetlb) {
>> +			gfn += pte_index(fault_ipa);
>> +			pfn += pte_index(fault_ipa);
>> +			hugetlb = false;
>> +		}
>> +		force_pte = true;
> 
> uh, not this is not what I meant, see my example (untested, partial)
> patch in the end of this mail.
I put some comments on your patch.
> 
>> +		if (!write_fault)
>> +			can_set_pte_rw = false;
>> +	}
>> +
>>  	if (!hugetlb && !force_pte)
>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>  
>> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>  	} else {
>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>> -		if (writable) {
>> +
>> +		/*
>> +		 * Don't set write permission, for non-writable region, and
>> +		 * for read fault to writable region while logging.
>> +		 */
>> +		if (writable && can_set_pte_rw) {
>>  			kvm_set_s2pte_writable(&new_pte);
>>  			kvm_set_pfn_dirty(pfn);
>>  		}
>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>  					  fault_ipa_uncached);
>>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>> +							set_pte_flags);
>>  	}
>>  
>> +	if (write_fault)
>> +		mark_page_dirty(kvm, gfn);
>>  
>>  out_unlock:
>>  	spin_unlock(&kvm->mmu_lock);
>> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>  {
>>  	pte_t *pte = (pte_t *)data;
>>  
>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>> +	/*
>> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
>> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
>> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
>> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
>> +	 * through this calling path.
>> +	 */
>> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>>  }
>>  
>>  
>> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>>  	int ret = 0;
>>  
>> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
>> +	/*
>> +	 * Let - enable of dirty page logging through, later check if it's for
>> +	 * an IO region and fail.
>> +	 */
> 
> I don't understand this comment or find it helpful.
Will remove.
> 
>> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
>> +		change == KVM_MR_FLAGS_ONLY &&
>> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
> 
> this looks wrong, because you can now remove all the other checks of
> change != and you are not returning early for KVM_MR_DELETE.
> 
> I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
> and then after the 'return 0' check the subconditions for change ==
> KVM_MR_FLAGS_ONLY.
Yeah, oh boy time to get a new batch of brown bags.

I was trying to limit conditional down to add, remap and
dirty page flag only in case some other flags get toggled
often and waste time walking through VMAs.
> 
>>  		return 0;
>>  
>>  	/*
>> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>>  					 vm_start - vma->vm_start;
>>  
>> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>> +			if (change != KVM_MR_FLAGS_ONLY)
>> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>  						    vm_end - vm_start,
>>  						    writable);
>> +			else
>> +				/* IO region dirty page logging not allowed */
>> +				return -EINVAL;
>> +
> 
> this whole thing also looks weird.  I think you just need to add a check
> before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
> return an error in that case (you've identified a user attempting to set
> dirty page logging on something that points to device memory, it doesn't
> matter at this point through which 'change' it is done).

Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.

> 
>>  			if (ret)
>>  				break;
>>  		}
>>  		hva = vm_end;
>>  	} while (hva < reg_end);
>>  
>> +	/* Anything after here doesn't apply to memslot flag changes */
>> +	if (change == KVM_MR_FLAGS_ONLY)
>> +		return ret;
>> +
>>  	spin_lock(&kvm->mmu_lock);
>>  	if (ret)
>>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
>> -- 
> 
> 
> What I meant last time around concerning user_mem_abort was more
> something like this:
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 1dc9778..38ea58e 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> -	if (is_vm_hugetlb_page(vma)) {
> +	/*
> +	 * Writes to pages in a memslot with logging enabled are always logged
> +	 * on a singe page-by-page basis.
> +	 */
> +	if (memslot_is_logging(memslot) && write_fault)
> +		force_pte = true;

If it's a write you take the pte route and dissolve the huge page;
if it's a read you reconstruct the THP, which seems to yield
pretty bad results.
> +
> +	if (is_vm_hugetlb_page(vma) && !force_pte) {
>  		hugetlb = true;
>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>  	} else {
> @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> +	if (memslot_is_logging(memslot) && !write_fault)
> +		writable = false;
Ok reusing writable is better.
> +
>  	if (kvm_is_device_pfn(pfn))
>  		mem_type = PAGE_S2_DEVICE;
>  
> @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  					  fault_ipa_uncached);
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
> +		unsigned long flags = 0;
>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> +
>  		if (writable) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> +
> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> +
> +		if (memslot_is_logging(memslot))
> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
Although we don't know if device mappings will be handled here.

Thanks.
> +
> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
>  	}
>  
>  
> 
> Thanks,
> -Christoffer
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-12 16:27     ` Mario Smarduch
@ 2015-01-12 17:49       ` Christoffer Dall
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-12 17:49 UTC (permalink / raw)
  To: Mario Smarduch; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On Mon, Jan 12, 2015 at 08:27:03AM -0800, Mario Smarduch wrote:
> On 01/11/2015 06:00 AM, Christoffer Dall wrote:
> > On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
> >> This patch adds support for 2nd stage page fault handling while dirty page
> >> logging. On huge page faults, huge pages are dissolved to normal pages, and
> >> rebuilding of 2nd stage huge pages is blocked. In case migration is 
> >> canceled this restriction is removed and huge pages may be rebuilt again.
> >>
> >> This patch applies cleanly on top of patch series posted Dec. 15'th:
> >> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
> > 
> > In the future such information should also go under the ---
> > separator.
> > 
> >>
> >> Patch #11 has been dropped, and should not be applied.
> > 
> > this should go under the '---' separator too.
> Ok will keep that in mind.

basically, think of everything above the '---' separator as the commit
message you will find in 'git log' when you are trying to understand a
piece of code or bisecting an issue or the like.  For those purposes you
don't care about the mechanics of how a patch was applied, how many
iterations of the patch there were, what changed between the iterations
and so on.

> > 
> >>
> >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> >> ---
> >>
> >> Change Log since last RESEND v1 --> v2:
> >> - Disallow dirty page logging of IO region - fail for initial write protect
> >>   and disable logging code in 2nd stage page fault handler.
> >> - Fixed auto spell correction errors
> >>
> >> Change Log RESEND v0 --> v1:
> >> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
> >>   writable, prevent write protection of pte on read fault
> >> - Removed marking entire huge page dirty on initial access
> >> - don't dissolve huge pages of non-writable regions
> >> - Made updates based on Christoffers comments
> >>   - renamed logging status function to memslot_is_logging()
> >>   - changed few values to bool from longs
> >>   - streamlined user_mem_abort() to eliminate extra conditional checks
> >> ---
> >>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
> >>  1 file changed, 105 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >> index 73d506f..b878236 100644
> >> --- a/arch/arm/kvm/mmu.c
> >> +++ b/arch/arm/kvm/mmu.c
> >> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
> >>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
> >>  #define kvm_pud_huge(_x)	pud_huge(_x)
> >>  
> >> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
> >> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
> >> +
> >> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> >> +{
> >> +#ifdef CONFIG_ARM
> >> +	return !!memslot->dirty_bitmap;
> >> +#else
> >> +	return false;
> >> +#endif
> >> +}
> >> +
> >>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
> >>  {
> >>  	/*
> >> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
> >>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
> >>  }
> >>  
> >> +/**
> >> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
> >> + * @kvm:	pointer to kvm structure.
> >> + * @addr:	IPA
> >> + * @pmd:	pmd pointer for IPA
> >> + *
> >> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
> >> + * pages in the range dirty.
> >> + */
> >> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
> >> +{
> >> +	if (!kvm_pmd_huge(*pmd))
> >> +		return;
> >> +
> >> +	pmd_clear(pmd);
> >> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
> >> +	put_page(virt_to_page(pmd));
> >> +}
> >> +
> >>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> >>  				  int min, int max)
> >>  {
> >> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
> >>  }
> >>  
> >>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> >> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> >> +			  phys_addr_t addr, const pte_t *new_pte,
> >> +			  unsigned long flags)
> >>  {
> >>  	pmd_t *pmd;
> >>  	pte_t *pte, old_pte;
> >> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
> >> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
> >>  
> >>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
> >>  	pmd = stage2_get_pmd(kvm, cache, addr);
> >> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> >>  		return 0;
> >>  	}
> >>  
> >> +	/*
> >> +	 * While dirty page logging - dissolve huge PMD, then continue on to
> >> +	 * allocate page.
> >> +	 */
> >> +	if (logging_active)
> >> +		stage2_dissolve_pmd(kvm, addr, pmd);
> >> +
> >>  	/* Create stage-2 page mappings - Level 2 */
> >>  	if (pmd_none(*pmd)) {
> >>  		if (!cache)
> >> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> >>  		if (ret)
> >>  			goto out;
> >>  		spin_lock(&kvm->mmu_lock);
> >> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> >> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
> >> +						KVM_S2PTE_FLAG_IS_IOMAP);
> >>  		spin_unlock(&kvm->mmu_lock);
> >>  		if (ret)
> >>  			goto out;
> >> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	pfn_t pfn;
> >>  	pgprot_t mem_type = PAGE_S2;
> >>  	bool fault_ipa_uncached;
> >> +	bool can_set_pte_rw = true;
> >> +	unsigned long set_pte_flags = 0;
> >>  
> >>  	write_fault = kvm_is_write_fault(vcpu);
> >>  	if (fault_status == FSC_PERM && !write_fault) {
> >> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  		return -EFAULT;
> >>  	}
> >>  
> >> +
> > 
> > stray whitespace change?
> Got it.
> > 
> >>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
> >>  	down_read(&current->mm->mmap_sem);
> >>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
> >> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	if (is_error_pfn(pfn))
> >>  		return -EFAULT;
> >>  
> >> -	if (kvm_is_device_pfn(pfn))
> >> +	if (kvm_is_device_pfn(pfn)) {
> >>  		mem_type = PAGE_S2_DEVICE;
> >> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> >> +	}
> >>  
> >>  	spin_lock(&kvm->mmu_lock);
> >>  	if (mmu_notifier_retry(kvm, mmu_seq))
> >>  		goto out_unlock;
> >> +
> >> +	/*
> >> +	 * When logging is enabled general page fault handling changes:
> >> +	 * -  Writable huge pages are dissolved on a read or write fault.
> > 
> > why dissolve huge pages on a read fault?
> 
> What I noticed on write you would dissolve, on read you
> rebuild THPs, flip back and forth like that, performance
> & convergence was really bad.

ah, that makes sense, we should probably indicate that reasoning
somehow.  In fact, what threw me off was the use of the word "dissolve
huge pages" which is not really what you're doing on a read fault, there
you are just never adjusting to huge pages.

I'm wondering why that would slow things down much though, the only cost
would be the extra tlb invalidation and replacing the PMD on a
subsequent write fault, but I trust your numbers nevertheless.

> > 
> >> +	 * -  pte's are not allowed write permission on a read fault to
> >> +	 *    writable region so future writes can be marked dirty
> > 
> > new line
> ok.
> > 
> >> +	 * Access to non-writable region is unchanged, and logging of IO
> >> +	 * regions is not allowed.
> >> +	 */
> >> +	if (memslot_is_logging(memslot) && writable) {
> >> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
> >> +		if (hugetlb) {
> >> +			gfn += pte_index(fault_ipa);
> >> +			pfn += pte_index(fault_ipa);
> >> +			hugetlb = false;
> >> +		}
> >> +		force_pte = true;
> > 
> > uh, not this is not what I meant, see my example (untested, partial)
> > patch in the end of this mail.
> I put some comments on your patch.
> > 
> >> +		if (!write_fault)
> >> +			can_set_pte_rw = false;
> >> +	}
> >> +
> >>  	if (!hugetlb && !force_pte)
> >>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
> >>  
> >> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
> >>  	} else {
> >>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> >> -		if (writable) {
> >> +
> >> +		/*
> >> +		 * Don't set write permission, for non-writable region, and
> >> +		 * for read fault to writable region while logging.
> >> +		 */
> >> +		if (writable && can_set_pte_rw) {
> >>  			kvm_set_s2pte_writable(&new_pte);
> >>  			kvm_set_pfn_dirty(pfn);
> >>  		}
> >>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >>  					  fault_ipa_uncached);
> >>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> >> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> >> +							set_pte_flags);
> >>  	}
> >>  
> >> +	if (write_fault)
> >> +		mark_page_dirty(kvm, gfn);
> >>  
> >>  out_unlock:
> >>  	spin_unlock(&kvm->mmu_lock);
> >> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
> >>  {
> >>  	pte_t *pte = (pte_t *)data;
> >>  
> >> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> >> +	/*
> >> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
> >> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
> >> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
> >> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
> >> +	 * through this calling path.
> >> +	 */
> >> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
> >>  }
> >>  
> >>  
> >> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> >>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
> >>  	int ret = 0;
> >>  
> >> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
> >> +	/*
> >> +	 * Let - enable of dirty page logging through, later check if it's for
> >> +	 * an IO region and fail.
> >> +	 */
> > 
> > I don't understand this comment or find it helpful.
> Will remove.
> > 
> >> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
> >> +		change == KVM_MR_FLAGS_ONLY &&
> >> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
> > 
> > this looks wrong, because you can now remove all the other checks of
> > change != and you are not returning early for KVM_MR_DELETE.
> > 
> > I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
> > and then after the 'return 0' check the subconditions for change ==
> > KVM_MR_FLAGS_ONLY.
> Yeah, oh boy time to get a new batch of brown bags.
> 
> I was trying to limit conditional down to add, remap and
> dirty page flag only in case some other flags get toggled
> often and waste time walking through VMAs.
> > 
> >>  		return 0;
> >>  
> >>  	/*
> >> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> >>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
> >>  					 vm_start - vma->vm_start;
> >>  
> >> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
> >> +			if (change != KVM_MR_FLAGS_ONLY)
> >> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
> >>  						    vm_end - vm_start,
> >>  						    writable);
> >> +			else
> >> +				/* IO region dirty page logging not allowed */
> >> +				return -EINVAL;
> >> +
> > 
> > this whole thing also looks weird.  I think you just need to add a check
> > before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
> > return an error in that case (you've identified a user attempting to set
> > dirty page logging on something that points to device memory, it doesn't
> > matter at this point through which 'change' it is done).
> 
> Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.
> 
> > 
> >>  			if (ret)
> >>  				break;
> >>  		}
> >>  		hva = vm_end;
> >>  	} while (hva < reg_end);
> >>  
> >> +	/* Anything after here doesn't apply to memslot flag changes */
> >> +	if (change == KVM_MR_FLAGS_ONLY)
> >> +		return ret;
> >> +
> >>  	spin_lock(&kvm->mmu_lock);
> >>  	if (ret)
> >>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
> >> -- 
> > 
> > 
> > What I meant last time around concerning user_mem_abort was more
> > something like this:
> > 
> > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> > index 1dc9778..38ea58e 100644
> > --- a/arch/arm/kvm/mmu.c
> > +++ b/arch/arm/kvm/mmu.c
> > @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  		return -EFAULT;
> >  	}
> >  
> > -	if (is_vm_hugetlb_page(vma)) {
> > +	/*
> > +	 * Writes to pages in a memslot with logging enabled are always logged
> > +	 * on a singe page-by-page basis.
> > +	 */
> > +	if (memslot_is_logging(memslot) && write_fault)
> > +		force_pte = true;
> 
> If it's a write you take the pte route and
> dissolves huge page, if it's a read you reconstruct the
> THP that seems to yield pretty bad results.

ok, then remove the ' && write_fault' part of the clause.

> > +
> > +	if (is_vm_hugetlb_page(vma) && !force_pte) {
> >  		hugetlb = true;
> >  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
> >  	} else {
> > @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  	if (is_error_pfn(pfn))
> >  		return -EFAULT;
> >  
> > +	if (memslot_is_logging(memslot) && !write_fault)
> > +		writable = false;
> Ok reusing writable is better.
> > +
> >  	if (kvm_is_device_pfn(pfn))
> >  		mem_type = PAGE_S2_DEVICE;
> >  
> > @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  					  fault_ipa_uncached);
> >  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
> >  	} else {
> > +		unsigned long flags = 0;
> >  		pte_t new_pte = pfn_pte(pfn, mem_type);
> > +
> >  		if (writable) {
> >  			kvm_set_s2pte_writable(&new_pte);
> >  			kvm_set_pfn_dirty(pfn);
> >  		}
> >  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >  					  fault_ipa_uncached);
> > -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> > -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> > +
> > +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> > +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> > +
> > +		if (memslot_is_logging(memslot))
> > +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> Now that it either IOMAP or LOGGING_ACTIVE do we need to acumulate flags?
> Although we don't know if device mappings will be handled here.
> 

so forget all I said about this in the past, I confused the code
checking for !cache with the iomap flag.

So, I think you can always safeful assume that stage2_get_pmd() gives you
something valid back when you have the LOGGING flag set, because you
always call the function with a valid cache when the LOGGING flag is
set.  It could be worth adding the following to stage2_set_pte():

VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)

As for this code, the IOMAP flag's only effect is that we return -EFAULT
if we are seeing an existing PTE for the faulting address.  This would
no longer be valid if we allow logging dirty device memory pages, so we
really need to think about if there's any conceivable use case for this?

It doesn't really make sense to me, so I would suggest that we never
enable logging for pages that return kvm_is_device_pfn().

Thanks,
-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-12 17:49       ` Christoffer Dall
  0 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-12 17:49 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Jan 12, 2015 at 08:27:03AM -0800, Mario Smarduch wrote:
> On 01/11/2015 06:00 AM, Christoffer Dall wrote:
> > On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
> >> This patch adds support for 2nd stage page fault handling while dirty page
> >> logging. On huge page faults, huge pages are dissolved to normal pages, and
> >> rebuilding of 2nd stage huge pages is blocked. In case migration is 
> >> canceled this restriction is removed and huge pages may be rebuilt again.
> >>
> >> This patch applies cleanly on top of patch series posted Dec. 15'th:
> >> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
> > 
> > In the future such information should also go under the ---
> > separator.
> > 
> >>
> >> Patch #11 has been dropped, and should not be applied.
> > 
> > this should go under the '---' separator too.
> Ok will keep that in mind.

basically, think of everything above the '---' separator as the commit
message you will find in 'git log' when you are trying to understand a
piece of code or bisecting an issue or the like.  For those purposes you
don't care about the mechanics of how a patch was applied, how many
iterations of the patch there were, what changed between the iterations
and so on.

> > 
> >>
> >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> >> ---
> >>
> >> Change Log since last RESEND v1 --> v2:
> >> - Disallow dirty page logging of IO region - fail for initial write protect
> >>   and disable logging code in 2nd stage page fault handler.
> >> - Fixed auto spell correction errors
> >>
> >> Change Log RESEND v0 --> v1:
> >> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
> >>   writable, prevent write protection of pte on read fault
> >> - Removed marking entire huge page dirty on initial access
> >> - don't dissolve huge pages of non-writable regions
> >> - Made updates based on Christoffer's comments
> >>   - renamed logging status function to memslot_is_logging()
> >>   - changed few values to bool from longs
> >>   - streamlined user_mem_abort() to eliminate extra conditional checks
> >> ---
> >>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
> >>  1 file changed, 105 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >> index 73d506f..b878236 100644
> >> --- a/arch/arm/kvm/mmu.c
> >> +++ b/arch/arm/kvm/mmu.c
> >> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
> >>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
> >>  #define kvm_pud_huge(_x)	pud_huge(_x)
> >>  
> >> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
> >> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
> >> +
> >> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> >> +{
> >> +#ifdef CONFIG_ARM
> >> +	return !!memslot->dirty_bitmap;
> >> +#else
> >> +	return false;
> >> +#endif
> >> +}
> >> +
> >>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
> >>  {
> >>  	/*
> >> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
> >>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
> >>  }
> >>  
> >> +/**
> >> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
> >> + * @kvm:	pointer to kvm structure.
> >> + * @addr:	IPA
> >> + * @pmd:	pmd pointer for IPA
> >> + *
> >> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
> >> + * pages in the range dirty.
> >> + */
> >> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
> >> +{
> >> +	if (!kvm_pmd_huge(*pmd))
> >> +		return;
> >> +
> >> +	pmd_clear(pmd);
> >> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
> >> +	put_page(virt_to_page(pmd));
> >> +}
> >> +
> >>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> >>  				  int min, int max)
> >>  {
> >> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
> >>  }
> >>  
> >>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> >> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> >> +			  phys_addr_t addr, const pte_t *new_pte,
> >> +			  unsigned long flags)
> >>  {
> >>  	pmd_t *pmd;
> >>  	pte_t *pte, old_pte;
> >> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
> >> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
> >>  
> >>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
> >>  	pmd = stage2_get_pmd(kvm, cache, addr);
> >> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> >>  		return 0;
> >>  	}
> >>  
> >> +	/*
> >> +	 * While dirty page logging - dissolve huge PMD, then continue on to
> >> +	 * allocate page.
> >> +	 */
> >> +	if (logging_active)
> >> +		stage2_dissolve_pmd(kvm, addr, pmd);
> >> +
> >>  	/* Create stage-2 page mappings - Level 2 */
> >>  	if (pmd_none(*pmd)) {
> >>  		if (!cache)
> >> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> >>  		if (ret)
> >>  			goto out;
> >>  		spin_lock(&kvm->mmu_lock);
> >> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> >> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
> >> +						KVM_S2PTE_FLAG_IS_IOMAP);
> >>  		spin_unlock(&kvm->mmu_lock);
> >>  		if (ret)
> >>  			goto out;
> >> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	pfn_t pfn;
> >>  	pgprot_t mem_type = PAGE_S2;
> >>  	bool fault_ipa_uncached;
> >> +	bool can_set_pte_rw = true;
> >> +	unsigned long set_pte_flags = 0;
> >>  
> >>  	write_fault = kvm_is_write_fault(vcpu);
> >>  	if (fault_status == FSC_PERM && !write_fault) {
> >> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  		return -EFAULT;
> >>  	}
> >>  
> >> +
> > 
> > stray whitespace change?
> Got it.
> > 
> >>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
> >>  	down_read(&current->mm->mmap_sem);
> >>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
> >> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  	if (is_error_pfn(pfn))
> >>  		return -EFAULT;
> >>  
> >> -	if (kvm_is_device_pfn(pfn))
> >> +	if (kvm_is_device_pfn(pfn)) {
> >>  		mem_type = PAGE_S2_DEVICE;
> >> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> >> +	}
> >>  
> >>  	spin_lock(&kvm->mmu_lock);
> >>  	if (mmu_notifier_retry(kvm, mmu_seq))
> >>  		goto out_unlock;
> >> +
> >> +	/*
> >> +	 * When logging is enabled general page fault handling changes:
> >> +	 * -  Writable huge pages are dissolved on a read or write fault.
> > 
> > why dissolve huge pages on a read fault?
> 
> What I noticed on write you would dissolve, on read you
> rebuild THPs, flip back and forth like that, performance
> & convergence was really bad.

ah, that makes sense, we should probably indicate that reasoning
somehow.  In fact, what threw me off was the use of the word "dissolve
huge pages" which is not really what you're doing on a read fault, there
you are just never adjusting to huge pages.

I'm wondering why that would slow things down much though, the only cost
would be the extra tlb invalidation and replacing the PMD on a
subsequent write fault, but I trust your numbers nevertheless.

> > 
> >> +	 * -  pte's are not allowed write permission on a read fault to
> >> +	 *    writable region so future writes can be marked dirty
> > 
> > new line
> ok.
> > 
> >> +	 * Access to non-writable region is unchanged, and logging of IO
> >> +	 * regions is not allowed.
> >> +	 */
> >> +	if (memslot_is_logging(memslot) && writable) {
> >> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
> >> +		if (hugetlb) {
> >> +			gfn += pte_index(fault_ipa);
> >> +			pfn += pte_index(fault_ipa);
> >> +			hugetlb = false;
> >> +		}
> >> +		force_pte = true;
> > 
> > uh, not this is not what I meant, see my example (untested, partial)
> > patch in the end of this mail.
> I put some comments on your patch.
> > 
> >> +		if (!write_fault)
> >> +			can_set_pte_rw = false;
> >> +	}
> >> +
> >>  	if (!hugetlb && !force_pte)
> >>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
> >>  
> >> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
> >>  	} else {
> >>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> >> -		if (writable) {
> >> +
> >> +		/*
> >> +		 * Don't set write permission, for non-writable region, and
> >> +		 * for read fault to writable region while logging.
> >> +		 */
> >> +		if (writable && can_set_pte_rw) {
> >>  			kvm_set_s2pte_writable(&new_pte);
> >>  			kvm_set_pfn_dirty(pfn);
> >>  		}
> >>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >>  					  fault_ipa_uncached);
> >>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> >> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> >> +							set_pte_flags);
> >>  	}
> >>  
> >> +	if (write_fault)
> >> +		mark_page_dirty(kvm, gfn);
> >>  
> >>  out_unlock:
> >>  	spin_unlock(&kvm->mmu_lock);
> >> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
> >>  {
> >>  	pte_t *pte = (pte_t *)data;
> >>  
> >> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> >> +	/*
> >> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
> >> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
> >> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
> >> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
> >> +	 * through this calling path.
> >> +	 */
> >> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
> >>  }
> >>  
> >>  
> >> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> >>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
> >>  	int ret = 0;
> >>  
> >> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
> >> +	/*
> >> +	 * Let - enable of dirty page logging through, later check if it's for
> >> +	 * an IO region and fail.
> >> +	 */
> > 
> > I don't understand this comment or find it helpful.
> Will remove.
> > 
> >> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
> >> +		change == KVM_MR_FLAGS_ONLY &&
> >> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
> > 
> > this looks wrong, because you can now remove all the other checks of
> > change != and you are not returning early for KVM_MR_DELETE.
> > 
> > I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
> > and then after the 'return 0' check the subconditions for change ==
> > KVM_MR_FLAGS_ONLY.
> Yeah, oh boy time to get a new batch of brown bags.
> 
> I was trying to limit conditional down to add, remap and
> dirty page flag only in case some other flags get toggled
> often and waste time walking through VMAs.
> > 
> >>  		return 0;
> >>  
> >>  	/*
> >> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> >>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
> >>  					 vm_start - vma->vm_start;
> >>  
> >> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
> >> +			if (change != KVM_MR_FLAGS_ONLY)
> >> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
> >>  						    vm_end - vm_start,
> >>  						    writable);
> >> +			else
> >> +				/* IO region dirty page logging not allowed */
> >> +				return -EINVAL;
> >> +
> > 
> > this whole thing also looks weird.  I think you just need to add a check
> > before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
> > return an error in that case (you've identified a user attempting to set
> > dirty page logging on something that points to device memory, it doesn't
> > matter at this point through which 'change' it is done).
> 
> Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.
> 
> > 
> >>  			if (ret)
> >>  				break;
> >>  		}
> >>  		hva = vm_end;
> >>  	} while (hva < reg_end);
> >>  
> >> +	/* Anything after here doesn't apply to memslot flag changes */
> >> +	if (change == KVM_MR_FLAGS_ONLY)
> >> +		return ret;
> >> +
> >>  	spin_lock(&kvm->mmu_lock);
> >>  	if (ret)
> >>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
> >> -- 
> > 
> > 
> > What I meant last time around concerning user_mem_abort was more
> > something like this:
> > 
> > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> > index 1dc9778..38ea58e 100644
> > --- a/arch/arm/kvm/mmu.c
> > +++ b/arch/arm/kvm/mmu.c
> > @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  		return -EFAULT;
> >  	}
> >  
> > -	if (is_vm_hugetlb_page(vma)) {
> > +	/*
> > +	 * Writes to pages in a memslot with logging enabled are always logged
> > +	 * on a single page-by-page basis.
> > +	 */
> > +	if (memslot_is_logging(memslot) && write_fault)
> > +		force_pte = true;
> 
> If it's a write you take the pte route and
> dissolves huge page, if it's a read you reconstruct the
> THP that seems to yield pretty bad results.

ok, then remove the ' && write_fault' part of the clause.

> > +
> > +	if (is_vm_hugetlb_page(vma) && !force_pte) {
> >  		hugetlb = true;
> >  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
> >  	} else {
> > @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  	if (is_error_pfn(pfn))
> >  		return -EFAULT;
> >  
> > +	if (memslot_is_logging(memslot) && !write_fault)
> > +		writable = false;
> Ok reusing writable is better.
> > +
> >  	if (kvm_is_device_pfn(pfn))
> >  		mem_type = PAGE_S2_DEVICE;
> >  
> > @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  					  fault_ipa_uncached);
> >  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
> >  	} else {
> > +		unsigned long flags = 0;
> >  		pte_t new_pte = pfn_pte(pfn, mem_type);
> > +
> >  		if (writable) {
> >  			kvm_set_s2pte_writable(&new_pte);
> >  			kvm_set_pfn_dirty(pfn);
> >  		}
> >  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >  					  fault_ipa_uncached);
> > -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> > -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> > +
> > +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> > +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> > +
> > +		if (memslot_is_logging(memslot))
> > +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
> Although we don't know if device mappings will be handled here.
> 

so forget all I said about this in the past, I confused the code
checking for !cache with the iomap flag.

> So, I think you can always safely assume that stage2_get_pmd() gives you
something valid back when you have the LOGGING flag set, because you
always call the function with a valid cache when the LOGGING flag is
set.  It could be worth adding the following to stage2_set_pte():

VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)

As for this code, the IOMAP flag's only effect is that we return -EFAULT
if we are seeing an existing PTE for the faulting address.  This would
no longer be valid if we allow logging dirty device memory pages, so we
really need to think about if there's any conceivable use case for this?

It doesn't really make sense to me, so I would suggest that we never
enable logging for pages that return kvm_is_device_pfn().

Thanks,
-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-12 17:49       ` Christoffer Dall
@ 2015-01-12 19:04         ` Mario Smarduch
  -1 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-12 19:04 UTC (permalink / raw)
  To: Christoffer Dall; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On 01/12/2015 09:49 AM, Christoffer Dall wrote:
> On Mon, Jan 12, 2015 at 08:27:03AM -0800, Mario Smarduch wrote:
>> On 01/11/2015 06:00 AM, Christoffer Dall wrote:
>>> On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
>>>> This patch adds support for 2nd stage page fault handling while dirty page
>>>> logging. On huge page faults, huge pages are dissolved to normal pages, and
>>>> rebuilding of 2nd stage huge pages is blocked. In case migration is 
>>>> canceled this restriction is removed and huge pages may be rebuilt again.
>>>>
>>>> This patch applies cleanly on top of patch series posted Dec. 15'th:
>>>> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
>>>
>>> In the future such information should also go under the ---
>>> separator.
>>>
>>>>
>>>> Patch #11 has been dropped, and should not be applied.
>>>
>>> this should go under the '---' separator too.
>> Ok will keep that in mind.
> 
> basically, think of everything above the '---' separator as the commit
> message you will find in 'git log' when you are trying to understand a
> piece of code or bisecting an issue or the like.  For those purposes you
> don't care about the mechanics of how a patch was applied, how many
> iterations of the patch there were, what changed between the iterations
> and so on.
> 
>>>
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>>>> ---
>>>>
>>>> Change Log since last RESEND v1 --> v2:
>>>> - Disallow dirty page logging of IO region - fail for initial write protect
>>>>   and disable logging code in 2nd stage page fault handler.
>>>> - Fixed auto spell correction errors
>>>>
>>>> Change Log RESEND v0 --> v1:
>>>> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>>>>   writable, prevent write protection of pte on read fault
>>>> - Removed marking entire huge page dirty on initial access
>>>> - don't dissolve huge pages of non-writable regions
>>>> - Made updates based on Christoffer's comments
>>>>   - renamed logging status function to memslot_is_logging()
>>>>   - changed few values to bool from longs
>>>>   - streamlined user_mem_abort() to eliminate extra conditional checks
>>>> ---
>>>>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>  1 file changed, 105 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 73d506f..b878236 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>>>>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>>>>  #define kvm_pud_huge(_x)	pud_huge(_x)
>>>>  
>>>> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
>>>> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
>>>> +
>>>> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
>>>> +{
>>>> +#ifdef CONFIG_ARM
>>>> +	return !!memslot->dirty_bitmap;
>>>> +#else
>>>> +	return false;
>>>> +#endif
>>>> +}
>>>> +
>>>>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  {
>>>>  	/*
>>>> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>>>>  }
>>>>  
>>>> +/**
>>>> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
>>>> + * @kvm:	pointer to kvm structure.
>>>> + * @addr:	IPA
>>>> + * @pmd:	pmd pointer for IPA
>>>> + *
>>>> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
>>>> + * pages in the range dirty.
>>>> + */
>>>> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
>>>> +{
>>>> +	if (!kvm_pmd_huge(*pmd))
>>>> +		return;
>>>> +
>>>> +	pmd_clear(pmd);
>>>> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
>>>> +	put_page(virt_to_page(pmd));
>>>> +}
>>>> +
>>>>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>>>>  				  int min, int max)
>>>>  {
>>>> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>>>  }
>>>>  
>>>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>>>> +			  phys_addr_t addr, const pte_t *new_pte,
>>>> +			  unsigned long flags)
>>>>  {
>>>>  	pmd_t *pmd;
>>>>  	pte_t *pte, old_pte;
>>>> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>>  
>>>>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>>>>  	pmd = stage2_get_pmd(kvm, cache, addr);
>>>> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>>  		return 0;
>>>>  	}
>>>>  
>>>> +	/*
>>>> +	 * While dirty page logging - dissolve huge PMD, then continue on to
>>>> +	 * allocate page.
>>>> +	 */
>>>> +	if (logging_active)
>>>> +		stage2_dissolve_pmd(kvm, addr, pmd);
>>>> +
>>>>  	/* Create stage-2 page mappings - Level 2 */
>>>>  	if (pmd_none(*pmd)) {
>>>>  		if (!cache)
>>>> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>>>  		if (ret)
>>>>  			goto out;
>>>>  		spin_lock(&kvm->mmu_lock);
>>>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>>>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
>>>> +						KVM_S2PTE_FLAG_IS_IOMAP);
>>>>  		spin_unlock(&kvm->mmu_lock);
>>>>  		if (ret)
>>>>  			goto out;
>>>> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	pfn_t pfn;
>>>>  	pgprot_t mem_type = PAGE_S2;
>>>>  	bool fault_ipa_uncached;
>>>> +	bool can_set_pte_rw = true;
>>>> +	unsigned long set_pte_flags = 0;
>>>>  
>>>>  	write_fault = kvm_is_write_fault(vcpu);
>>>>  	if (fault_status == FSC_PERM && !write_fault) {
>>>> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		return -EFAULT;
>>>>  	}
>>>>  
>>>> +
>>>
>>> stray whitespace change?
>> Got it.
>>>
>>>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>>>  	down_read(&current->mm->mmap_sem);
>>>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	if (is_error_pfn(pfn))
>>>>  		return -EFAULT;
>>>>  
>>>> -	if (kvm_is_device_pfn(pfn))
>>>> +	if (kvm_is_device_pfn(pfn)) {
>>>>  		mem_type = PAGE_S2_DEVICE;
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	}
>>>>  
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>>>  		goto out_unlock;
>>>> +
>>>> +	/*
>>>> +	 * When logging is enabled general page fault handling changes:
>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
>>>
>>> why dissolve huge pages on a read fault?
>>
>> What I noticed on write you would dissolve, on read you
>> rebuild THPs, flip back and forth like that, performance
>> & convergence was really bad.
> 
> ah, that makes sense, we should probably indicate that reasoning
> somehow.  In fact, what threw me off was the use of the word "dissolve
> huge pages" which is not really what you're doing on a read fault, there
> you are just never adjusting to huge pages.
> 
> I'm wondering why that would slow things down much though, the only cost
> would be the extra tlb invalidation and replacing the PMD on a
> subsequent write fault, but I trust your numbers nevertheless.

If I understand correctly -
you do a few writes, dissolving a huge page and inserting pte TLB entries,
then a read page fault installs a pmd and clears the TLB cache
for that range, and it repeats over. It appears you
need to constantly re-fault pte TLBs on writes to the huge
page range.

> 
>>>
>>>> +	 * -  pte's are not allowed write permission on a read fault to
>>>> +	 *    writable region so future writes can be marked dirty
>>>
>>> new line
>> ok.
>>>
>>>> +	 * Access to non-writable region is unchanged, and logging of IO
>>>> +	 * regions is not allowed.
>>>> +	 */
>>>> +	if (memslot_is_logging(memslot) && writable) {
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>> +		if (hugetlb) {
>>>> +			gfn += pte_index(fault_ipa);
>>>> +			pfn += pte_index(fault_ipa);
>>>> +			hugetlb = false;
>>>> +		}
>>>> +		force_pte = true;
>>>
>>> uh, not this is not what I meant, see my example (untested, partial)
>>> patch in the end of this mail.
>> I put some comments on your patch.
>>>
>>>> +		if (!write_fault)
>>>> +			can_set_pte_rw = false;
>>>> +	}
>>>> +
>>>>  	if (!hugetlb && !force_pte)
>>>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>>>  
>>>> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>>  	} else {
>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>>> -		if (writable) {
>>>> +
>>>> +		/*
>>>> +		 * Don't set write permission, for non-writable region, and
>>>> +		 * for read fault to writable region while logging.
>>>> +		 */
>>>> +		if (writable && can_set_pte_rw) {
>>>>  			kvm_set_s2pte_writable(&new_pte);
>>>>  			kvm_set_pfn_dirty(pfn);
>>>>  		}
>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>>  					  fault_ipa_uncached);
>>>>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>>> +							set_pte_flags);
>>>>  	}
>>>>  
>>>> +	if (write_fault)
>>>> +		mark_page_dirty(kvm, gfn);
>>>>  
>>>>  out_unlock:
>>>>  	spin_unlock(&kvm->mmu_lock);
>>>> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>>>  {
>>>>  	pte_t *pte = (pte_t *)data;
>>>>  
>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>>>> +	/*
>>>> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
>>>> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
>>>> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
>>>> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
>>>> +	 * through this calling path.
>>>> +	 */
>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>>>>  }
>>>>  
>>>>  
>>>> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>>>>  	int ret = 0;
>>>>  
>>>> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
>>>> +	/*
>>>> +	 * Let - enable of dirty page logging through, later check if it's for
>>>> +	 * an IO region and fail.
>>>> +	 */
>>>
>>> I don't understand this comment or find it helpful.
>> Will remove.
>>>
>>>> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
>>>> +		change == KVM_MR_FLAGS_ONLY &&
>>>> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
>>>
>>> this looks wrong, because you can now remove all the other checks of
>>> change != and you are not returning early for KVM_MR_DELETE.
>>>
>>> I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
>>> and then after the 'return 0' check the subconditions for change ==
>>> KVM_MR_FLAGS_ONLY.
>> Yeah, oh boy time to get a new batch of brown bags.
>>
>> I was trying to limit conditional down to add, remap and
>> dirty page flag only in case some other flags get toggled
>> often and waste time walking through VMAs.
>>>
>>>>  		return 0;
>>>>  
>>>>  	/*
>>>> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>>>>  					 vm_start - vma->vm_start;
>>>>  
>>>> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>> +			if (change != KVM_MR_FLAGS_ONLY)
>>>> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>>  						    vm_end - vm_start,
>>>>  						    writable);
>>>> +			else
>>>> +				/* IO region dirty page logging not allowed */
>>>> +				return -EINVAL;
>>>> +
>>>
>>> this whole thing also looks weird.  I think you just need to add a check
>>> before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
>>> return an error in that case (you've identified a user attempting to set
>>> dirty page logging on something that points to device memory, it doesn't
>>> matter at this point through which 'change' it is done).
>>
>> Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.
>>
>>>
>>>>  			if (ret)
>>>>  				break;
>>>>  		}
>>>>  		hva = vm_end;
>>>>  	} while (hva < reg_end);
>>>>  
>>>> +	/* Anything after here doesn't apply to memslot flag changes */
>>>> +	if (change == KVM_MR_FLAGS_ONLY)
>>>> +		return ret;
>>>> +
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (ret)
>>>>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
>>>> -- 
>>>
>>>
>>> What I meant last time around concerning user_mem_abort was more
>>> something like this:
>>>
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index 1dc9778..38ea58e 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  		return -EFAULT;
>>>  	}
>>>  
>>> -	if (is_vm_hugetlb_page(vma)) {
>>> +	/*
>>> +	 * Writes to pages in a memslot with logging enabled are always logged
>>> +	 * on a single page-by-page basis.
>>> +	 */
>>> +	if (memslot_is_logging(memslot) && write_fault)
>>> +		force_pte = true;
>>
>> If it's a write you take the pte route and
>> dissolves huge page, if it's a read you reconstruct the
>> THP that seems to yield pretty bad results.
> 
> ok, then remove the ' && write_fault' part of the clause.
> 
>>> +
>>> +	if (is_vm_hugetlb_page(vma) && !force_pte) {
>>>  		hugetlb = true;
>>>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>>>  	} else {
>>> @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  	if (is_error_pfn(pfn))
>>>  		return -EFAULT;
>>>  
>>> +	if (memslot_is_logging(memslot) && !write_fault)
>>> +		writable = false;
>> Ok reusing writable is better.
>>> +
>>>  	if (kvm_is_device_pfn(pfn))
>>>  		mem_type = PAGE_S2_DEVICE;
>>>  
>>> @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  					  fault_ipa_uncached);
>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>  	} else {
>>> +		unsigned long flags = 0;
>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>> +
>>>  		if (writable) {
>>>  			kvm_set_s2pte_writable(&new_pte);
>>>  			kvm_set_pfn_dirty(pfn);
>>>  		}
>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>  					  fault_ipa_uncached);
>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>> +
>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
>>> +
>>> +		if (memslot_is_logging(memslot))
>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
>> Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
>> Although we don't know if device mappings will be handled here.
>>
> 
> so forget all I said about this in the past, I confused the code
> checking for !cache with the iomap flag.
> 
> So, I think you can always safely assume that stage2_get_pmd() gives you
> something valid back when you have the LOGGING flag set, because you
> always call the function with a valid cache when the LOGGING flag is
> set.  It could be worth adding the following to stage2_set_pte():
> 
> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)

I see ok, thanks for clearing that up.

> 
> As for this code, the IOMAP flag's only effect is that we return -EFAULT
> if we are seeing an existing PTE for the faulting address.  This would
> no longer be valid if we allow logging dirty device memory pages, so we
Sorry, do you mean allow or disallow?

> really need to think about if there's any conceivable use case for this?
> 
> It doesn't really make sense to me, so I would suggest that we never
> enable logging for pages that return kvm_is_device_pfn().
> 
> Thanks,
> -Christoffer
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-12 19:04         ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-12 19:04 UTC (permalink / raw)
  To: linux-arm-kernel

On 01/12/2015 09:49 AM, Christoffer Dall wrote:
> On Mon, Jan 12, 2015 at 08:27:03AM -0800, Mario Smarduch wrote:
>> On 01/11/2015 06:00 AM, Christoffer Dall wrote:
>>> On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
>>>> This patch adds support for 2nd stage page fault handling while dirty page
>>>> logging. On huge page faults, huge pages are dissolved to normal pages, and
>>>> rebuilding of 2nd stage huge pages is blocked. In case migration is 
>>>> canceled this restriction is removed and huge pages may be rebuilt again.
>>>>
>>>> This patch applies cleanly on top of patch series posted Dec. 15'th:
>>>> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
>>>
>>> In the future such information should also go under the ---
>>> separator.
>>>
>>>>
>>>> Patch #11 has been dropped, and should not be applied.
>>>
>>> this should go under the '---' separator too.
>> Ok will keep that in mind.
> 
> basically, think of everything above the '---' separator as the commit
> message you will find in 'git log' when you are trying to understand a
> piece of code or bisecting an issue or the like.  For those purposes you
> don't care about the mechanics of how a patch was applied, how many
> iterations of the patch there were, what changed between the iterations
> and so on.
> 
>>>
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>>>> ---
>>>>
>>>> Change Log since last RESEND v1 --> v2:
>>>> - Disallow dirty page logging of IO region - fail for initial write protect
>>>>   and disable logging code in 2nd stage page fault handler.
>>>> - Fixed auto spell correction errors
>>>>
>>>> Change Log RESEND v0 --> v1:
>>>> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>>>>   writable, prevent write protection of pte on read fault
>>>> - Removed marking entire huge page dirty on initial access
>>>> - don't dissolve huge pages of non-writable regions
>>>> - Made updates based on Christoffers comments
>>>>   - renamed logging status function to memslot_is_logging()
>>>>   - changed few values to bool from longs
>>>>   - streamlined user_mem_abort() to eliminate extra conditional checks
>>>> ---
>>>>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>  1 file changed, 105 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 73d506f..b878236 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>>>>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>>>>  #define kvm_pud_huge(_x)	pud_huge(_x)
>>>>  
>>>> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
>>>> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
>>>> +
>>>> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
>>>> +{
>>>> +#ifdef CONFIG_ARM
>>>> +	return !!memslot->dirty_bitmap;
>>>> +#else
>>>> +	return false;
>>>> +#endif
>>>> +}
>>>> +
>>>>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  {
>>>>  	/*
>>>> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>>>>  }
>>>>  
>>>> +/**
>>>> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
>>>> + * @kvm:	pointer to kvm structure.
>>>> + * @addr:	IPA
>>>> + * @pmd:	pmd pointer for IPA
>>>> + *
>>>> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
>>>> + * pages in the range dirty.
>>>> + */
>>>> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
>>>> +{
>>>> +	if (!kvm_pmd_huge(*pmd))
>>>> +		return;
>>>> +
>>>> +	pmd_clear(pmd);
>>>> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
>>>> +	put_page(virt_to_page(pmd));
>>>> +}
>>>> +
>>>>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>>>>  				  int min, int max)
>>>>  {
>>>> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>>>  }
>>>>  
>>>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>>>> +			  phys_addr_t addr, const pte_t *new_pte,
>>>> +			  unsigned long flags)
>>>>  {
>>>>  	pmd_t *pmd;
>>>>  	pte_t *pte, old_pte;
>>>> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>>  
>>>>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>>>>  	pmd = stage2_get_pmd(kvm, cache, addr);
>>>> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>>  		return 0;
>>>>  	}
>>>>  
>>>> +	/*
>>>> +	 * While dirty page logging - dissolve huge PMD, then continue on to
>>>> +	 * allocate page.
>>>> +	 */
>>>> +	if (logging_active)
>>>> +		stage2_dissolve_pmd(kvm, addr, pmd);
>>>> +
>>>>  	/* Create stage-2 page mappings - Level 2 */
>>>>  	if (pmd_none(*pmd)) {
>>>>  		if (!cache)
>>>> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>>>  		if (ret)
>>>>  			goto out;
>>>>  		spin_lock(&kvm->mmu_lock);
>>>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>>>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
>>>> +						KVM_S2PTE_FLAG_IS_IOMAP);
>>>>  		spin_unlock(&kvm->mmu_lock);
>>>>  		if (ret)
>>>>  			goto out;
>>>> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	pfn_t pfn;
>>>>  	pgprot_t mem_type = PAGE_S2;
>>>>  	bool fault_ipa_uncached;
>>>> +	bool can_set_pte_rw = true;
>>>> +	unsigned long set_pte_flags = 0;
>>>>  
>>>>  	write_fault = kvm_is_write_fault(vcpu);
>>>>  	if (fault_status == FSC_PERM && !write_fault) {
>>>> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		return -EFAULT;
>>>>  	}
>>>>  
>>>> +
>>>
>>> stray whitespace change?
>> Got it.
>>>
>>>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>>>  	down_read(&current->mm->mmap_sem);
>>>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	if (is_error_pfn(pfn))
>>>>  		return -EFAULT;
>>>>  
>>>> -	if (kvm_is_device_pfn(pfn))
>>>> +	if (kvm_is_device_pfn(pfn)) {
>>>>  		mem_type = PAGE_S2_DEVICE;
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	}
>>>>  
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>>>  		goto out_unlock;
>>>> +
>>>> +	/*
>>>> +	 * When logging is enabled general page fault handling changes:
>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
>>>
>>> why dissolve huge pages on a read fault?
>>
>> What I noticed on write you would dissolve, on read you
>> rebuild THPs, flip back and forth like that, performance
>> & convergence was really bad.
> 
> ah, that makes sense, we should probably indicate that reasoning
> somehow.  In fact, what threw me off was the use of the word "dissolve
> huge pages" which is not really what you're doing on a read fault, there
> you are just never adjusting to huge pages.
> 
> I'm wondering why that would slow things down much though, the only cost
> would be the extra tlb invalidation and replacing the PMD on a
> subsequent write fault, but I trust your numbers nevertheless.

If I understand correctly -
you do a few writes, dissolving a huge page and inserting pte TLB entries;
then a read page fault installs a pmd and clears the TLB cache
for that range, and it repeats over. It appears you
need to constantly re-fault pte TLBs on writes to the huge

> 
>>>
>>>> +	 * -  pte's are not allowed write permission on a read fault to
>>>> +	 *    writable region so future writes can be marked dirty
>>>
>>> new line
>> ok.
>>>
>>>> +	 * Access to non-writable region is unchanged, and logging of IO
>>>> +	 * regions is not allowed.
>>>> +	 */
>>>> +	if (memslot_is_logging(memslot) && writable) {
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>> +		if (hugetlb) {
>>>> +			gfn += pte_index(fault_ipa);
>>>> +			pfn += pte_index(fault_ipa);
>>>> +			hugetlb = false;
>>>> +		}
>>>> +		force_pte = true;
>>>
>>> uh, not this is not what I meant, see my example (untested, partial)
>>> patch in the end of this mail.
>> I put some comments on your patch.
>>>
>>>> +		if (!write_fault)
>>>> +			can_set_pte_rw = false;
>>>> +	}
>>>> +
>>>>  	if (!hugetlb && !force_pte)
>>>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>>>  
>>>> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>>  	} else {
>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>>> -		if (writable) {
>>>> +
>>>> +		/*
>>>> +		 * Don't set write permission, for non-writable region, and
>>>> +		 * for read fault to writable region while logging.
>>>> +		 */
>>>> +		if (writable && can_set_pte_rw) {
>>>>  			kvm_set_s2pte_writable(&new_pte);
>>>>  			kvm_set_pfn_dirty(pfn);
>>>>  		}
>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>>  					  fault_ipa_uncached);
>>>>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>>> +							set_pte_flags);
>>>>  	}
>>>>  
>>>> +	if (write_fault)
>>>> +		mark_page_dirty(kvm, gfn);
>>>>  
>>>>  out_unlock:
>>>>  	spin_unlock(&kvm->mmu_lock);
>>>> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>>>  {
>>>>  	pte_t *pte = (pte_t *)data;
>>>>  
>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>>>> +	/*
>>>> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
>>>> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
>>>> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
>>>> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
>>>> +	 * through this calling path.
>>>> +	 */
>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>>>>  }
>>>>  
>>>>  
>>>> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>>>>  	int ret = 0;
>>>>  
>>>> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
>>>> +	/*
>>>> +	 * Let - enable of dirty page logging through, later check if it's for
>>>> +	 * an IO region and fail.
>>>> +	 */
>>>
>>> I don't understand this comment or find it helpful.
>> Will remove.
>>>
>>>> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
>>>> +		change == KVM_MR_FLAGS_ONLY &&
>>>> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
>>>
>>> this looks wrong, because you can now remove all the other checks of
>>> change != and you are not returning early for KVM_MR_DELETE.
>>>
>>> I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
>>> and then after the 'return 0' check the subconditions for change ==
>>> KVM_MR_FLAGS_ONLY.
>> Yeah, oh boy time to get a new batch of brown bags.
>>
>> I was trying to limit conditional down to add, remap and
>> dirty page flag only in case some other flags get toggled
>> often and waste time walking through VMAs.
>>>
>>>>  		return 0;
>>>>  
>>>>  	/*
>>>> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>>>>  					 vm_start - vma->vm_start;
>>>>  
>>>> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>> +			if (change != KVM_MR_FLAGS_ONLY)
>>>> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>>  						    vm_end - vm_start,
>>>>  						    writable);
>>>> +			else
>>>> +				/* IO region dirty page logging not allowed */
>>>> +				return -EINVAL;
>>>> +
>>>
>>> this whole thing also looks weird.  I think you just need to add a check
>>> before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
>>> return an error in that case (you've identified a user attempting to set
>>> dirty page logging on something that points to device memory, it doesn't
>>> matter at this point through which 'change' it is done).
>>
>> Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.
>>
>>>
>>>>  			if (ret)
>>>>  				break;
>>>>  		}
>>>>  		hva = vm_end;
>>>>  	} while (hva < reg_end);
>>>>  
>>>> +	/* Anything after here doesn't apply to memslot flag changes */
>>>> +	if (change == KVM_MR_FLAGS_ONLY)
>>>> +		return ret;
>>>> +
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (ret)
>>>>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
>>>> -- 
>>>
>>>
>>> What I meant last time around concerning user_mem_abort was more
>>> something like this:
>>>
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index 1dc9778..38ea58e 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  		return -EFAULT;
>>>  	}
>>>  
>>> -	if (is_vm_hugetlb_page(vma)) {
>>> +	/*
>>> +	 * Writes to pages in a memslot with logging enabled are always logged
>>> +	 * on a single page-by-page basis.
>>> +	 */
>>> +	if (memslot_is_logging(memslot) && write_fault)
>>> +		force_pte = true;
>>
>> If it's a write you take the pte route and
>> dissolves huge page, if it's a read you reconstruct the
>> THP that seems to yield pretty bad results.
> 
> ok, then remove the ' && write_fault' part of the clause.
> 
>>> +
>>> +	if (is_vm_hugetlb_page(vma) && !force_pte) {
>>>  		hugetlb = true;
>>>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>>>  	} else {
>>> @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  	if (is_error_pfn(pfn))
>>>  		return -EFAULT;
>>>  
>>> +	if (memslot_is_logging(memslot) && !write_fault)
>>> +		writable = false;
>> Ok reusing writable is better.
>>> +
>>>  	if (kvm_is_device_pfn(pfn))
>>>  		mem_type = PAGE_S2_DEVICE;
>>>  
>>> @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  					  fault_ipa_uncached);
>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>  	} else {
>>> +		unsigned long flags = 0;
>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>> +
>>>  		if (writable) {
>>>  			kvm_set_s2pte_writable(&new_pte);
>>>  			kvm_set_pfn_dirty(pfn);
>>>  		}
>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>  					  fault_ipa_uncached);
>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>> +
>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
>>> +
>>> +		if (memslot_is_logging(memslot))
>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
>> Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
>> Although we don't know if device mappings will be handled here.
>>
> 
> so forget all I said about this in the past, I confused the code
> checking for !cache with the iomap flag.
> 
> So, I think you can always safely assume that stage2_get_pmd() gives you
> something valid back when you have the LOGGING flag set, because you
> always call the function with a valid cache when the LOGGING flag is
> set.  It could be worth adding the following to stage2_set_pte():
> 
> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)

I see ok, thanks for clearing that up.

> 
> As for this code, the IOMAP flag's only effect is that we return -EFAULT
> if we are seeing an existing PTE for the faulting address.  This would
> no longer be valid if we allow logging dirty device memory pages, so we
Sorry, do you mean allow or disallow?

> really need to think about if there's any conceivable use case for this?
> 
> It doesn't really make sense to me, so I would suggest that we never
> enable logging for pages that return kvm_is_device_pfn().
> 
> Thanks,
> -Christoffer
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-12 19:04         ` Mario Smarduch
@ 2015-01-12 19:43           ` Christoffer Dall
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-12 19:43 UTC (permalink / raw)
  To: Mario Smarduch; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On Mon, Jan 12, 2015 at 11:04:45AM -0800, Mario Smarduch wrote:

[...]

> >>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>  	if (is_error_pfn(pfn))
> >>>>  		return -EFAULT;
> >>>>  
> >>>> -	if (kvm_is_device_pfn(pfn))
> >>>> +	if (kvm_is_device_pfn(pfn)) {
> >>>>  		mem_type = PAGE_S2_DEVICE;
> >>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> >>>> +	}
> >>>>  
> >>>>  	spin_lock(&kvm->mmu_lock);
> >>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
> >>>>  		goto out_unlock;
> >>>> +
> >>>> +	/*
> >>>> +	 * When logging is enabled general page fault handling changes:
> >>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
> >>>
> >>> why dissolve huge pages on a read fault?
> >>
> >> What I noticed on write you would dissolve, on read you
> >> rebuild THPs, flip back and forth like that, performance
> >> & convergence was really bad.
> > 
> > ah, that makes sense, we should probably indicate that reasoning
> > somehow.  In fact, what threw me off was the use of the word "dissolve
> > huge pages" which is not really what you're doing on a read fault, there
> > you are just never adjusting to huge pages.
> > 
> > I'm wondering why that would slow things down much though, the only cost
> > would be the extra tlb invalidation and replacing the PMD on a
> > subsequent write fault, but I trust your numbers nevertheless.
> 
> If I understand correctly -
> you do few writes, dissolve a huge page insert pte TLB entries,
> then a read page fault installs a pmd clears the TLB cache
> for that range, and it repeats over. Appears like you
> need to constantly re-fault pte TLBs on writes to huge
> page range.

that makes good sense, thanks for the explanation.

[...]

> >>>  	} else {
> >>> +		unsigned long flags = 0;
> >>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> >>> +
> >>>  		if (writable) {
> >>>  			kvm_set_s2pte_writable(&new_pte);
> >>>  			kvm_set_pfn_dirty(pfn);
> >>>  		}
> >>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >>>  					  fault_ipa_uncached);
> >>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> >>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> >>> +
> >>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> >>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> >>> +
> >>> +		if (memslot_is_logging(memslot))
> >>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> >> Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
> >> Although we don't know if device mappings will be handled here.
> >>
> > 
> > so forget all I said about this in the past, I confused the code
> > checking for !cache with the iomap flag.
> > 
> > So, I think you can always safely assume that stage2_get_pmd() gives you
> > something valid back when you have the LOGGING flag set, because you
> > always call the function with a valid cache when the LOGGING flag is
> > set.  It could be worth adding the following to stage2_set_pte():
> > 
> > VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
> 
> I see ok, thanks for clearing that up.
> 
> > 
> > As for this code, the IOMAP flag's only effect is that we return -EFAULT
> > if we are seeing an existing PTE for the faulting address.  This would
> > no longer be valid if we allow logging dirty device memory pages, so we
> Sorry, do you mean allow or disallow?

if we (by these patches) allow logging dirty pages for device memory,
then we...

> 
> > really need to think about if there's any conceivable use case for this?
> > 
> > It doesn't really make sense to me, so I would suggest that we never
> > enable logging for pages that return kvm_is_device_pfn().
> > 
> > Thanks,
> > -Christoffer
> > 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-12 19:43           ` Christoffer Dall
  0 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-12 19:43 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Jan 12, 2015 at 11:04:45AM -0800, Mario Smarduch wrote:

[...]

> >>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>  	if (is_error_pfn(pfn))
> >>>>  		return -EFAULT;
> >>>>  
> >>>> -	if (kvm_is_device_pfn(pfn))
> >>>> +	if (kvm_is_device_pfn(pfn)) {
> >>>>  		mem_type = PAGE_S2_DEVICE;
> >>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> >>>> +	}
> >>>>  
> >>>>  	spin_lock(&kvm->mmu_lock);
> >>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
> >>>>  		goto out_unlock;
> >>>> +
> >>>> +	/*
> >>>> +	 * When logging is enabled general page fault handling changes:
> >>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
> >>>
> >>> why dissolve huge pages on a read fault?
> >>
> >> What I noticed on write you would dissolve, on read you
> >> rebuild THPs, flip back and forth like that, performance
> >> & convergence was really bad.
> > 
> > ah, that makes sense, we should probably indicate that reasoning
> > somehow.  In fact, what threw me off was the use of the word "dissolve
> > huge pages" which is not really what you're doing on a read fault, there
> > you are just never adjusting to huge pages.
> > 
> > I'm wondering why that would slow things down much though, the only cost
> > would be the extra tlb invalidation and replacing the PMD on a
> > subsequent write fault, but I trust your numbers nevertheless.
> 
> If I understand correctly -
> you do few writes, dissolve a huge page insert pte TLB entries,
> then a read page fault installs a pmd clears the TLB cache
> for that range, and it repeats over. Appears like you
> need to constantly re-fault pte TLBs on writes to huge
> page range.

that makes good sense, thanks for the explanation.

[...]

> >>>  	} else {
> >>> +		unsigned long flags = 0;
> >>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> >>> +
> >>>  		if (writable) {
> >>>  			kvm_set_s2pte_writable(&new_pte);
> >>>  			kvm_set_pfn_dirty(pfn);
> >>>  		}
> >>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >>>  					  fault_ipa_uncached);
> >>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> >>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> >>> +
> >>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> >>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> >>> +
> >>> +		if (memslot_is_logging(memslot))
> >>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> >> Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
> >> Although we don't know if device mappings will be handled here.
> >>
> > 
> > so forget all I said about this in the past, I confused the code
> > checking for !cache with the iomap flag.
> > 
> > So, I think you can always safely assume that stage2_get_pmd() gives you
> > something valid back when you have the LOGGING flag set, because you
> > always call the function with a valid cache when the LOGGING flag is
> > set.  It could be worth adding the following to stage2_set_pte():
> > 
> > VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
> 
> I see ok, thanks for clearing that up.
> 
> > 
> > As for this code, the IOMAP flag's only effect is that we return -EFAULT
> > if we are seeing an existing PTE for the faulting address.  This would
> > no longer be valid if we allow logging dirty device memory pages, so we
> Sorry, do you mean allow or disallow?

if we (by these patches) allow logging dirty pages for device memory,
then we...

> 
> > really need to think about if there's any conceivable use case for this?
> > 
> > It doesn't really make sense to me, so I would suggest that we never
> > enable logging for pages that return kvm_is_device_pfn().
> > 
> > Thanks,
> > -Christoffer
> > 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-12 19:43           ` Christoffer Dall
@ 2015-01-13 17:42             ` Mario Smarduch
  -1 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-13 17:42 UTC (permalink / raw)
  To: Christoffer Dall; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On 01/12/2015 11:43 AM, Christoffer Dall wrote:
> On Mon, Jan 12, 2015 at 11:04:45AM -0800, Mario Smarduch wrote:
> 
> [...]
> 
>>>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>>>  	if (is_error_pfn(pfn))
>>>>>>  		return -EFAULT;
>>>>>>  
>>>>>> -	if (kvm_is_device_pfn(pfn))
>>>>>> +	if (kvm_is_device_pfn(pfn)) {
>>>>>>  		mem_type = PAGE_S2_DEVICE;
>>>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>>>>>> +	}
>>>>>>  
>>>>>>  	spin_lock(&kvm->mmu_lock);
>>>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>>>>>  		goto out_unlock;
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * When logging is enabled general page fault handling changes:
>>>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
>>>>>
>>>>> why dissolve huge pages on a read fault?
>>>>
>>>> What I noticed on write you would dissolve, on read you
>>>> rebuild THPs, flip back and forth like that, performance
>>>> & convergence was really bad.
>>>
>>> ah, that makes sense, we should probably indicate that reasoning
>>> somehow.  In fact, what threw me off was the use of the word "dissolve
>>> huge pages" which is not really what you're doing on a read fault, there
>>> you are just never adjusting to huge pages.
>>>
>>> I'm wondering why that would slow things down much though, the only cost
>>> would be the extra tlb invalidation and replacing the PMD on a
>>> subsequent write fault, but I trust your numbers nevertheless.
>>
>> If I understand correctly -
>> you do few writes, dissolve a huge page insert pte TLB entries,
>> then a read page fault installs a pmd clears the TLB cache
>> for that range, and it repeats over. Appears like you
>> need to constantly re-fault pte TLBs on writes to huge
>> page range.
> 
> that makes good sense, thanks for the explanation.
> 
> [...]
> 
>>>>>  	} else {
>>>>> +		unsigned long flags = 0;
>>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>>>> +
>>>>>  		if (writable) {
>>>>>  			kvm_set_s2pte_writable(&new_pte);
>>>>>  			kvm_set_pfn_dirty(pfn);
>>>>>  		}
>>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>>>  					  fault_ipa_uncached);
>>>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>>>> +
>>>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
>>>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
>>>>> +
>>>>> +		if (memslot_is_logging(memslot))
>>>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
>>>> Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
>>>> Although we don't know if device mappings will be handled here.
>>>>
>>>
>>> so forget all I said about this in the past, I confused the code
>>> checking for !cache with the iomap flag.
>>>
>>> So, I think you can always safely assume that stage2_get_pmd() gives you
>>> something valid back when you have the LOGGING flag set, because you
>>> always call the function with a valid cache when the LOGGING flag is
>>> set.  It could be worth adding the following to stage2_set_pte():
>>>
>>> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
>>
>> I see ok, thanks for clearing that up.
>>
>>>
>>> As for this code, the IOMAP flag's only effect is that we return -EFAULT
>>> if we are seeing an existing PTE for the faulting address.  This would
>>> no longer be valid if we allow logging dirty device memory pages, so we
>> Sorry, do you mean allow or disallow?
> 
> if we (by these patches) allow logging dirty pages for device memory,
> then we...
> 
>>
>>> really need to think about if there's any conceivable use case for this?

No I can't think of any use case to log Device address space.

So I could move forward - drop the IOMAP flag, and add the
VM_BUG_ON to stage2_set_pte().

Thanks.

>>>
>>> It doesn't really make sense to me, so I would suggest that we never
>>> enable logging for pages that return kvm_is_device_pfn().
>>>
>>> Thanks,
>>> -Christoffer
>>>
>>


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-13 17:42             ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-13 17:42 UTC (permalink / raw)
  To: linux-arm-kernel

On 01/12/2015 11:43 AM, Christoffer Dall wrote:
> On Mon, Jan 12, 2015 at 11:04:45AM -0800, Mario Smarduch wrote:
> 
> [...]
> 
>>>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>>>  	if (is_error_pfn(pfn))
>>>>>>  		return -EFAULT;
>>>>>>  
>>>>>> -	if (kvm_is_device_pfn(pfn))
>>>>>> +	if (kvm_is_device_pfn(pfn)) {
>>>>>>  		mem_type = PAGE_S2_DEVICE;
>>>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>>>>>> +	}
>>>>>>  
>>>>>>  	spin_lock(&kvm->mmu_lock);
>>>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>>>>>  		goto out_unlock;
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * When logging is enabled general page fault handling changes:
>>>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
>>>>>
>>>>> why dissolve huge pages on a read fault?
>>>>
>>>> What I noticed on write you would dissolve, on read you
>>>> rebuild THPs, flip back and forth like that, performance
>>>> & convergence was really bad.
>>>
>>> ah, that makes sense, we should probably indicate that reasoning
>>> somehow.  In fact, what threw me off was the use of the word "dissolve
>>> huge pages" which is not really what you're doing on a read fault, there
>>> you are just never adjusting to huge pages.
>>>
>>> I'm wondering why that would slow things down much though, the only cost
>>> would be the extra tlb invalidation and replacing the PMD on a
>>> subsequent write fault, but I trust your numbers nevertheless.
>>
>> If I understand correctly -
>> you do few writes, dissolve a huge page insert pte TLB entries,
>> then a read page fault installs a pmd clears the TLB cache
>> for that range, and it repeats over. Appears like you
>> need to constantly re-fault pte TLBs on writes to huge
>> page range.
> 
> that makes good sense, thanks for the explanation.
> 
> [...]
> 
>>>>>  	} else {
>>>>> +		unsigned long flags = 0;
>>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>>>> +
>>>>>  		if (writable) {
>>>>>  			kvm_set_s2pte_writable(&new_pte);
>>>>>  			kvm_set_pfn_dirty(pfn);
>>>>>  		}
>>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>>>  					  fault_ipa_uncached);
>>>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>>>> +
>>>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
>>>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
>>>>> +
>>>>> +		if (memslot_is_logging(memslot))
>>>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
>>>> Now that it's either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
>>>> Although we don't know if device mappings will be handled here.
>>>>
>>>
>>> so forget all I said about this in the past, I confused the code
>>> checking for !cache with the iomap flag.
>>>
>>> So, I think you can always safely assume that stage2_get_pmd() gives you
>>> something valid back when you have the LOGGING flag set, because you
>>> always call the function with a valid cache when the LOGGING flag is
>>> set.  It could be worth adding the following to stage2_set_pte():
>>>
>>> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
>>
>> I see ok, thanks for clearing that up.
>>
>>>
>>> As for this code, the IOMAP flag's only effect is that we return -EFAULT
>>> if we are seeing an existing PTE for the faulting address.  This would
>>> no longer be valid if we allow logging dirty device memory pages, so we
>> Sorry, do you mean allow or disallow?
> 
> if we (by these patches) allow logging dirty pages for device memory,
> then we...
> 
>>
>>> really need to think about if there's any conceivable use case for this?

No I can't think of any use case to log Device address space.

So I could move forward - drop the IOMAP flag, and add the
VM_BUG_ON to stage2_set_pte().

Thanks.

>>>
>>> It doesn't really make sense to me, so I would suggest that we never
>>> enable logging for pages that return kvm_is_device_pfn().
>>>
>>> Thanks,
>>> -Christoffer
>>>
>>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-12 17:49       ` Christoffer Dall
@ 2015-01-13 23:14         ` Mario Smarduch
  -1 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-13 23:14 UTC (permalink / raw)
  To: Christoffer Dall; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On 01/12/2015 09:49 AM, Christoffer Dall wrote:
> On Mon, Jan 12, 2015 at 08:27:03AM -0800, Mario Smarduch wrote:
>> On 01/11/2015 06:00 AM, Christoffer Dall wrote:
>>> On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
>>>> This patch adds support for 2nd stage page fault handling while dirty page
>>>> logging. On huge page faults, huge pages are dissolved to normal pages, and
>>>> rebuilding of 2nd stage huge pages is blocked. In case migration is 
>>>> canceled this restriction is removed and huge pages may be rebuilt again.
>>>>
>>>> This patch applies cleanly on top of patch series posted Dec. 15'th:
>>>> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
>>>
>>> In the future such information should also go under the ---
>>> separator.
>>>
>>>>
>>>> Patch #11 has been dropped, and should not be applied.
>>>
>>> this should go under the '---' separator too.
>> Ok will keep that in mind.
> 
> basically, think of everything above the '---' separator as the commit
> message you will find in 'git log' when you are trying to understand a
> piece of code or bisecting an issue or the like.  For those purposes you
> don't care about the mechanics of how a patch was applied, how many
> iterations of the patch there were, what changed between the iterations
> and so on.
> 
>>>
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>>>> ---
>>>>
>>>> Change Log since last RESEND v1 --> v2:
>>>> - Disallow dirty page logging of IO region - fail for initial write protect
>>>>   and disable logging code in 2nd stage page fault handler.
>>>> - Fixed auto spell correction errors
>>>>
>>>> Change Log RESEND v0 --> v1:
>>>> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>>>>   writable, prevent write protection of pte on read fault
>>>> - Removed marking entire huge page dirty on initial access
>>>> - don't dissolve huge pages of non-writable regions
>>>> - Made updates based on Christoffers comments
>>>>   - renamed logging status function to memslot_is_logging()
>>>>   - changed few values to bool from longs
>>>>   - streamlined user_mem_abort() to eliminate extra conditional checks
>>>> ---
>>>>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>  1 file changed, 105 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 73d506f..b878236 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>>>>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>>>>  #define kvm_pud_huge(_x)	pud_huge(_x)
>>>>  
>>>> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
>>>> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
>>>> +
>>>> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
>>>> +{
>>>> +#ifdef CONFIG_ARM
>>>> +	return !!memslot->dirty_bitmap;
>>>> +#else
>>>> +	return false;
>>>> +#endif
>>>> +}
>>>> +
>>>>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  {
>>>>  	/*
>>>> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>>>>  }
>>>>  
>>>> +/**
>>>> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
>>>> + * @kvm:	pointer to kvm structure.
>>>> + * @addr:	IPA
>>>> + * @pmd:	pmd pointer for IPA
>>>> + *
>>>> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
>>>> + * pages in the range dirty.
>>>> + */
>>>> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
>>>> +{
>>>> +	if (!kvm_pmd_huge(*pmd))
>>>> +		return;
>>>> +
>>>> +	pmd_clear(pmd);
>>>> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
>>>> +	put_page(virt_to_page(pmd));
>>>> +}
>>>> +
>>>>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>>>>  				  int min, int max)
>>>>  {
>>>> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>>>  }
>>>>  
>>>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>>>> +			  phys_addr_t addr, const pte_t *new_pte,
>>>> +			  unsigned long flags)
>>>>  {
>>>>  	pmd_t *pmd;
>>>>  	pte_t *pte, old_pte;
>>>> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>>  
>>>>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>>>>  	pmd = stage2_get_pmd(kvm, cache, addr);
>>>> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>>  		return 0;
>>>>  	}
>>>>  
>>>> +	/*
>>>> +	 * While dirty page logging - dissolve huge PMD, then continue on to
>>>> +	 * allocate page.
>>>> +	 */
>>>> +	if (logging_active)
>>>> +		stage2_dissolve_pmd(kvm, addr, pmd);
>>>> +
>>>>  	/* Create stage-2 page mappings - Level 2 */
>>>>  	if (pmd_none(*pmd)) {
>>>>  		if (!cache)
>>>> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>>>  		if (ret)
>>>>  			goto out;
>>>>  		spin_lock(&kvm->mmu_lock);
>>>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>>>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
>>>> +						KVM_S2PTE_FLAG_IS_IOMAP);
>>>>  		spin_unlock(&kvm->mmu_lock);
>>>>  		if (ret)
>>>>  			goto out;
>>>> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	pfn_t pfn;
>>>>  	pgprot_t mem_type = PAGE_S2;
>>>>  	bool fault_ipa_uncached;
>>>> +	bool can_set_pte_rw = true;
>>>> +	unsigned long set_pte_flags = 0;
>>>>  
>>>>  	write_fault = kvm_is_write_fault(vcpu);
>>>>  	if (fault_status == FSC_PERM && !write_fault) {
>>>> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		return -EFAULT;
>>>>  	}
>>>>  
>>>> +
>>>
>>> stray whitespace change?
>> Got it.
>>>
>>>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>>>  	down_read(&current->mm->mmap_sem);
>>>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	if (is_error_pfn(pfn))
>>>>  		return -EFAULT;
>>>>  
>>>> -	if (kvm_is_device_pfn(pfn))
>>>> +	if (kvm_is_device_pfn(pfn)) {
>>>>  		mem_type = PAGE_S2_DEVICE;
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	}
>>>>  
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>>>  		goto out_unlock;
>>>> +
>>>> +	/*
>>>> +	 * When logging is enabled general page fault handling changes:
>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
>>>
>>> why dissolve huge pages on a read fault?
>>
>> What I noticed on write you would dissolve, on read you
>> rebuild THPs, flip back and forth like that, performance
>> & convergence was really bad.
> 
> ah, that makes sense, we should probably indicate that reasoning
> somehow.  In fact, what threw me off was the use of the word "dissolve
> huge pages" which is not really what you're doing on a read fault, there
> you are just never adjusting to huge pages.
> 
> I'm wondering why that would slow things down much though, the only cost
> would be the extra tlb invalidation and replacing the PMD on a
> subsequent write fault, but I trust your numbers nevertheless.
> 
>>>
>>>> +	 * -  pte's are not allowed write permission on a read fault to
>>>> +	 *    writable region so future writes can be marked dirty
>>>
>>> new line
>> ok.
>>>
>>>> +	 * Access to non-writable region is unchanged, and logging of IO
>>>> +	 * regions is not allowed.
>>>> +	 */
>>>> +	if (memslot_is_logging(memslot) && writable) {
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>> +		if (hugetlb) {
>>>> +			gfn += pte_index(fault_ipa);
>>>> +			pfn += pte_index(fault_ipa);
>>>> +			hugetlb = false;
>>>> +		}
>>>> +		force_pte = true;
>>>
>>> uh, no, this is not what I meant, see my example (untested, partial)
>>> patch in the end of this mail.
>> I put some comments on your patch.
>>>
>>>> +		if (!write_fault)
>>>> +			can_set_pte_rw = false;
>>>> +	}
>>>> +
>>>>  	if (!hugetlb && !force_pte)
>>>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>>>  
>>>> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>>  	} else {
>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>>> -		if (writable) {
>>>> +
>>>> +		/*
>>>> +		 * Don't set write permission, for non-writable region, and
>>>> +		 * for read fault to writable region while logging.
>>>> +		 */
>>>> +		if (writable && can_set_pte_rw) {
>>>>  			kvm_set_s2pte_writable(&new_pte);
>>>>  			kvm_set_pfn_dirty(pfn);
>>>>  		}
>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>>  					  fault_ipa_uncached);
>>>>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>>> +							set_pte_flags);
>>>>  	}
>>>>  
>>>> +	if (write_fault)
>>>> +		mark_page_dirty(kvm, gfn);
>>>>  
>>>>  out_unlock:
>>>>  	spin_unlock(&kvm->mmu_lock);
>>>> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>>>  {
>>>>  	pte_t *pte = (pte_t *)data;
>>>>  
>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>>>> +	/*
>>>> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
>>>> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
>>>> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
>>>> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
>>>> +	 * through this calling path.
>>>> +	 */
>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>>>>  }
>>>>  
>>>>  
>>>> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>>>>  	int ret = 0;
>>>>  
>>>> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
>>>> +	/*
>>>> +	 * Let - enable of dirty page logging through, later check if it's for
>>>> +	 * an IO region and fail.
>>>> +	 */
>>>
>>> I don't understand this comment or find it helpful.
>> Will remove.
>>>
>>>> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
>>>> +		change == KVM_MR_FLAGS_ONLY &&
>>>> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
>>>
>>> this looks wrong, because you can now remove all the other checks of
>>> change != and you are not returning early for KVM_MR_DELETE.
>>>
>>> I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
>>> and then after the 'return 0' check the subconditions for change ==
>>> KVM_MR_FLAGS_ONLY.
>> Yeah, oh boy time to get a new batch of brown bags.
>>
>> I was trying to limit conditional down to add, remap and
>> dirty page flag only in case some other flags get toggled
>> often and waste time walking through VMAs.
>>>
>>>>  		return 0;
>>>>  
>>>>  	/*
>>>> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>>>>  					 vm_start - vma->vm_start;
>>>>  
>>>> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>> +			if (change != KVM_MR_FLAGS_ONLY)
>>>> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>>  						    vm_end - vm_start,
>>>>  						    writable);
>>>> +			else
>>>> +				/* IO region dirty page logging not allowed */
>>>> +				return -EINVAL;
>>>> +
>>>
>>> this whole thing also looks weird.  I think you just need to add a check
>>> before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
>>> return an error in that case (you've identified a user attempting to set
>>> dirty page logging on something that points to device memory, it doesn't
>>> matter at this point through which 'change' it is done).
>>
>> Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.
>>
>>>
>>>>  			if (ret)
>>>>  				break;
>>>>  		}
>>>>  		hva = vm_end;
>>>>  	} while (hva < reg_end);
>>>>  
>>>> +	/* Anything after here doesn't apply to memslot flag changes */
>>>> +	if (change == KVM_MR_FLAGS_ONLY)
>>>> +		return ret;
>>>> +
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (ret)
>>>>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
>>>> -- 
>>>
>>>
>>> What I meant last time around concerning user_mem_abort was more
>>> something like this:
>>>
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index 1dc9778..38ea58e 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  		return -EFAULT;
>>>  	}
>>>  
>>> -	if (is_vm_hugetlb_page(vma)) {
>>> +	/*
>>> +	 * Writes to pages in a memslot with logging enabled are always logged
>>> +	 * on a single page-by-page basis.
>>> +	 */
>>> +	if (memslot_is_logging(memslot) && write_fault)
>>> +		force_pte = true;
>>
>> If it's a write you take the pte route and
>> dissolves huge page, if it's a read you reconstruct the
>> THP that seems to yield pretty bad results.
> 
> ok, then remove the ' && write_fault' part of the clause.
Hi Christoffer,
 couple comments/questions.

 setting force_pte here disables huge pages for
non-writable regions.

> 
>>> +
>>> +	if (is_vm_hugetlb_page(vma) && !force_pte) {
>>>  		hugetlb = true;
>>>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>>>  	} else {
>>> @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  	if (is_error_pfn(pfn))
>>>  		return -EFAULT;
>>>  
Perhaps here we should check if writable is set
and handle logging; if not, do nothing.

>>> +	if (memslot_is_logging(memslot) && !write_fault)
>>> +		writable = false;
>> Ok reusing writable is better.
>>> +
>>>  	if (kvm_is_device_pfn(pfn))
>>>  		mem_type = PAGE_S2_DEVICE;

If we're not setting the IOMAP flag, do we still need
this, since we're forfeiting error checking later
in stage2_set_pte()?


Thanks,
  Mario

>>>  
>>> @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  					  fault_ipa_uncached);
>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>  	} else {
>>> +		unsigned long flags = 0;
>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>> +
>>>  		if (writable) {
>>>  			kvm_set_s2pte_writable(&new_pte);
>>>  			kvm_set_pfn_dirty(pfn);
>>>  		}
>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>  					  fault_ipa_uncached);
>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>> +
>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
>>> +
>>> +		if (memslot_is_logging(memslot))
>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
>> Now that it either IOMAP or LOGGING_ACTIVE do we need to accumulate flags?
>> Although we don't know if device mappings will be handled here.
>>
> 
> so forget all I said about this in the past, I confused the code
> checking for !cache with the iomap flag.
> 
> So, I think you can always safely assume that stage2_get_pmd() gives you
> something valid back when you have the LOGGING flag set, because you
> always call the function with a valid cache when the LOGGING flag is
> set.  It could be worth adding the following to stage2_set_pte():
> 
> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
> 
> As for this code, the IOMAP flag's only effect is that we return -EFAULT
> if we are seeing an existing PTE for the faulting address.  This would
> no longer be valid if we allow logging dirty device memory pages, so we
> really need to think about if there's any conceivable use case for this?
> 
> It doesn't really make sense to me, so I would suggest that we never
> enable logging for pages that return kvm_is_device_pfn().
> 
> Thanks,
> -Christoffer
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-13 23:14         ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-13 23:14 UTC (permalink / raw)
  To: linux-arm-kernel

On 01/12/2015 09:49 AM, Christoffer Dall wrote:
> On Mon, Jan 12, 2015 at 08:27:03AM -0800, Mario Smarduch wrote:
>> On 01/11/2015 06:00 AM, Christoffer Dall wrote:
>>> On Fri, Jan 09, 2015 at 08:17:20PM -0800, Mario Smarduch wrote:
>>>> This patch adds support for 2nd stage page fault handling while dirty page
>>>> logging. On huge page faults, huge pages are dissolved to normal pages, and
>>>> rebuilding of 2nd stage huge pages is blocked. In case migration is 
>>>> canceled this restriction is removed and huge pages may be rebuilt again.
>>>>
>>>> This patch applies cleanly on top of patch series posted Dec. 15'th:
>>>> https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html
>>>
>>> In the future such information should also go under the ---
>>> separator.
>>>
>>>>
>>>> Patch #11 has been dropped, and should not be applied.
>>>
>>> this should go under the '---' separator too.
>> Ok will keep that in mind.
> 
> basically, think of everything above the '---' separator as the commit
> message you will find in 'git log' when you are trying to understand a
> piece of code or bisecting an issue or the like.  For those purposes you
> don't care about the mechanics of how a patch was applied, how many
> iterations of the patch there were, what changed between the iterations
> and so on.
> 
>>>
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>>>> ---
>>>>
>>>> Change Log since last RESEND v1 --> v2:
>>>> - Disallow dirty page logging of IO region - fail for initial write protect
>>>>   and disable logging code in 2nd stage page fault handler.
>>>> - Fixed auto spell correction errors
>>>>
>>>> Change Log RESEND v0 --> v1:
>>>> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>>>>   writable, prevent write protection of pte on read fault
>>>> - Removed marking entire huge page dirty on initial access
>>>> - don't dissolve huge pages of non-writable regions
>>>> - Made updates based on Christoffers comments
>>>>   - renamed logging status function to memslot_is_logging()
>>>>   - changed few values to bool from longs
>>>>   - streamlined user_mem_abort() to eliminate extra conditional checks
>>>> ---
>>>>  arch/arm/kvm/mmu.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>  1 file changed, 105 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 73d506f..b878236 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>>>>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>>>>  #define kvm_pud_huge(_x)	pud_huge(_x)
>>>>  
>>>> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
>>>> +#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
>>>> +
>>>> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
>>>> +{
>>>> +#ifdef CONFIG_ARM
>>>> +	return !!memslot->dirty_bitmap;
>>>> +#else
>>>> +	return false;
>>>> +#endif
>>>> +}
>>>> +
>>>>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  {
>>>>  	/*
>>>> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>>>>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>>>>  }
>>>>  
>>>> +/**
>>>> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
>>>> + * @kvm:	pointer to kvm structure.
>>>> + * @addr:	IPA
>>>> + * @pmd:	pmd pointer for IPA
>>>> + *
>>>> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
>>>> + * pages in the range dirty.
>>>> + */
>>>> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
>>>> +{
>>>> +	if (!kvm_pmd_huge(*pmd))
>>>> +		return;
>>>> +
>>>> +	pmd_clear(pmd);
>>>> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
>>>> +	put_page(virt_to_page(pmd));
>>>> +}
>>>> +
>>>>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>>>>  				  int min, int max)
>>>>  {
>>>> @@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>>>>  }
>>>>  
>>>>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
>>>> +			  phys_addr_t addr, const pte_t *new_pte,
>>>> +			  unsigned long flags)
>>>>  {
>>>>  	pmd_t *pmd;
>>>>  	pte_t *pte, old_pte;
>>>> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>>  
>>>>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>>>>  	pmd = stage2_get_pmd(kvm, cache, addr);
>>>> @@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>>>>  		return 0;
>>>>  	}
>>>>  
>>>> +	/*
>>>> +	 * While dirty page logging - dissolve huge PMD, then continue on to
>>>> +	 * allocate page.
>>>> +	 */
>>>> +	if (logging_active)
>>>> +		stage2_dissolve_pmd(kvm, addr, pmd);
>>>> +
>>>>  	/* Create stage-2 page mappings - Level 2 */
>>>>  	if (pmd_none(*pmd)) {
>>>>  		if (!cache)
>>>> @@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>>>>  		if (ret)
>>>>  			goto out;
>>>>  		spin_lock(&kvm->mmu_lock);
>>>> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
>>>> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
>>>> +						KVM_S2PTE_FLAG_IS_IOMAP);
>>>>  		spin_unlock(&kvm->mmu_lock);
>>>>  		if (ret)
>>>>  			goto out;
>>>> @@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	pfn_t pfn;
>>>>  	pgprot_t mem_type = PAGE_S2;
>>>>  	bool fault_ipa_uncached;
>>>> +	bool can_set_pte_rw = true;
>>>> +	unsigned long set_pte_flags = 0;
>>>>  
>>>>  	write_fault = kvm_is_write_fault(vcpu);
>>>>  	if (fault_status == FSC_PERM && !write_fault) {
>>>> @@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		return -EFAULT;
>>>>  	}
>>>>  
>>>> +
>>>
>>> stray whitespace change?
>> Got it.
>>>
>>>>  	/* Let's check if we will get back a huge page backed by hugetlbfs */
>>>>  	down_read(&current->mm->mmap_sem);
>>>>  	vma = find_vma_intersection(current->mm, hva, hva + 1);
>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  	if (is_error_pfn(pfn))
>>>>  		return -EFAULT;
>>>>  
>>>> -	if (kvm_is_device_pfn(pfn))
>>>> +	if (kvm_is_device_pfn(pfn)) {
>>>>  		mem_type = PAGE_S2_DEVICE;
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
>>>> +	}
>>>>  
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
>>>>  		goto out_unlock;
>>>> +
>>>> +	/*
>>>> +	 * When logging is enabled general page fault handling changes:
>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
>>>
>>> why dissolve huge pages on a read fault?
>>
>> What I noticed on write you would dissolve, on read you
>> rebuild THPs, flip back and forth like that, performance
>> & convergence was really bad.
> 
> ah, that makes sense, we should probably indicate that reasoning
> somehow.  In fact, what threw me off was the use of the word "dissolve
> huge pages" which is not really what you're doing on a read fault, there
> you are just never adjusting to huge pages.
> 
> I'm wondering why that would slow things down much though, the only cost
> would be the extra tlb invalidation and replacing the PMD on a
> subsequent write fault, but I trust your numbers nevertheless.
> 
>>>
>>>> +	 * -  pte's are not allowed write permission on a read fault to
>>>> +	 *    writable region so future writes can be marked dirty
>>>
>>> new line
>> ok.
>>>
>>>> +	 * Access to non-writable region is unchanged, and logging of IO
>>>> +	 * regions is not allowed.
>>>> +	 */
>>>> +	if (memslot_is_logging(memslot) && writable) {
>>>> +		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
>>>> +		if (hugetlb) {
>>>> +			gfn += pte_index(fault_ipa);
>>>> +			pfn += pte_index(fault_ipa);
>>>> +			hugetlb = false;
>>>> +		}
>>>> +		force_pte = true;
>>>
>>> uh, no, this is not what I meant, see my example (untested, partial)
>>> patch in the end of this mail.
>> I put some comments on your patch.
>>>
>>>> +		if (!write_fault)
>>>> +			can_set_pte_rw = false;
>>>> +	}
>>>> +
>>>>  	if (!hugetlb && !force_pte)
>>>>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>>>>  
>>>> @@ -1082,16 +1150,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>>  	} else {
>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>>> -		if (writable) {
>>>> +
>>>> +		/*
>>>> +		 * Don't set write permission, for non-writable region, and
>>>> +		 * for read fault to writable region while logging.
>>>> +		 */
>>>> +		if (writable && can_set_pte_rw) {
>>>>  			kvm_set_s2pte_writable(&new_pte);
>>>>  			kvm_set_pfn_dirty(pfn);
>>>>  		}
>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>>  					  fault_ipa_uncached);
>>>>  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>>> +							set_pte_flags);
>>>>  	}
>>>>  
>>>> +	if (write_fault)
>>>> +		mark_page_dirty(kvm, gfn);
>>>>  
>>>>  out_unlock:
>>>>  	spin_unlock(&kvm->mmu_lock);
>>>> @@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>>>>  {
>>>>  	pte_t *pte = (pte_t *)data;
>>>>  
>>>> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
>>>> +	/*
>>>> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
>>>> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
>>>> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
>>>> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
>>>> +	 * through this calling path.
>>>> +	 */
>>>> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>>>>  }
>>>>  
>>>>  
>>>> @@ -1396,7 +1478,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>>>>  	int ret = 0;
>>>>  
>>>> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
>>>> +	/*
>>>> +	 * Let - enable of dirty page logging through, later check if it's for
>>>> +	 * an IO region and fail.
>>>> +	 */
>>>
>>> I don't understand this comment or find it helpful.
>> Will remove.
>>>
>>>> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
>>>> +		change == KVM_MR_FLAGS_ONLY &&
>>>> +		!(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
>>>
>>> this looks wrong, because you can now remove all the other checks of
>>> change != and you are not returning early for KVM_MR_DELETE.
>>>
>>> I think you want to add a check simply for 'change != KVM_MR_FLAGS_ONLY'
>>> and then after the 'return 0' check the subconditions for change ==
>>> KVM_MR_FLAGS_ONLY.
>> Yeah, oh boy time to get a new batch of brown bags.
>>
>> I was trying to limit conditional down to add, remap and
>> dirty page flag only in case some other flags get toggled
>> often and waste time walking through VMAs.
>>>
>>>>  		return 0;
>>>>  
>>>>  	/*
>>>> @@ -1447,15 +1535,24 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>>>>  					 vm_start - vma->vm_start;
>>>>  
>>>> -			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>> +			if (change != KVM_MR_FLAGS_ONLY)
>>>> +				ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>>>>  						    vm_end - vm_start,
>>>>  						    writable);
>>>> +			else
>>>> +				/* IO region dirty page logging not allowed */
>>>> +				return -EINVAL;
>>>> +
>>>
>>> this whole thing also looks weird.  I think you just need to add a check
>>> before kvm_phys_addr_ioremap() for flags & KVM_MEM_LOG_DIRTY_PAGES and
>>> return an error in that case (you've identified a user attempting to set
>>> dirty page logging on something that points to device memory, it doesn't
>>> matter at this point through which 'change' it is done).
>>
>> Yes explicitly using KVM_MEM_LOG_DIRTY_PAGES is more clear.
>>
>>>
>>>>  			if (ret)
>>>>  				break;
>>>>  		}
>>>>  		hva = vm_end;
>>>>  	} while (hva < reg_end);
>>>>  
>>>> +	/* Anything after here doesn't apply to memslot flag changes */
>>>> +	if (change == KVM_MR_FLAGS_ONLY)
>>>> +		return ret;
>>>> +
>>>>  	spin_lock(&kvm->mmu_lock);
>>>>  	if (ret)
>>>>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
>>>> -- 
>>>
>>>
>>> What I meant last time around concerning user_mem_abort was more
>>> something like this:
>>>
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index 1dc9778..38ea58e 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  		return -EFAULT;
>>>  	}
>>>  
>>> -	if (is_vm_hugetlb_page(vma)) {
>>> +	/*
>>> +	 * Writes to pages in a memslot with logging enabled are always logged
>>> +	 * on a singe page-by-page basis.
>>> +	 */
>>> +	if (memslot_is_logging(memslot) && write_fault)
>>> +		force_pte = true;
>>
>> If it's a write you take the pte route and
>> dissolves huge page, if it's a read you reconstruct the
>> THP that seems to yield pretty bad results.
> 
> ok, then remove the ' && write_fault' part of the clause.
Hi Christoffer,
 couple comments/questions.

 setting force_pte here disables huge pages for
non-writable regions.

> 
>>> +
>>> +	if (is_vm_hugetlb_page(vma) && !force_pte) {
>>>  		hugetlb = true;
>>>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>>>  	} else {
>>> @@ -976,6 +983,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  	if (is_error_pfn(pfn))
>>>  		return -EFAULT;
>>>  
Perhaps here we should check if writable is set
and handle logging; if not, do nothing.

>>> +	if (memslot_is_logging(memslot) && !write_fault)
>>> +		writable = false;
>> Ok reusing writable is better.
>>> +
>>>  	if (kvm_is_device_pfn(pfn))
>>>  		mem_type = PAGE_S2_DEVICE;

If we're not setting the IOMAP flag, do we still need
this, since we're forfeiting error checking later
in stage2_set_pte()?


Thanks,
  Mario

>>>  
>>> @@ -998,15 +1008,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  					  fault_ipa_uncached);
>>>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>>>  	} else {
>>> +		unsigned long flags = 0;
>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
>>> +
>>>  		if (writable) {
>>>  			kvm_set_s2pte_writable(&new_pte);
>>>  			kvm_set_pfn_dirty(pfn);
>>>  		}
>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>>>  					  fault_ipa_uncached);
>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
>>> +
>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
>>> +
>>> +		if (memslot_is_logging(memslot))
>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
>> Now that it either IOMAP or LOGGING_ACTIVE do we need to accumulate flags?
>> Although we don't know if device mappings will be handled here.
>>
> 
> so forget all I said about this in the past, I confused the code
> checking for !cache with the iomap flag.
> 
> So, I think you can always safely assume that stage2_get_pmd() gives you
> something valid back when you have the LOGGING flag set, because you
> always call the function with a valid cache when the LOGGING flag is
> set.  It could be worth adding the following to stage2_set_pte():
> 
> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
> 
> As for this code, the IOMAP flag's only effect is that we return -EFAULT
> if we are seeing an existing PTE for the faulting address.  This would
> no longer be valid if we allow logging dirty device memory pages, so we
> really need to think about if there's any conceivable use case for this?
> 
> It doesn't really make sense to me, so I would suggest that we never
> enable logging for pages that return kvm_is_device_pfn().
> 
> Thanks,
> -Christoffer
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-13 23:14         ` Mario Smarduch
@ 2015-01-14 10:32           ` Christoffer Dall
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-14 10:32 UTC (permalink / raw)
  To: Mario Smarduch; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On Tue, Jan 13, 2015 at 03:14:47PM -0800, Mario Smarduch wrote:

[...]

> >>>
> >>>
> >>> What I meant last time around concerning user_mem_abort was more
> >>> something like this:
> >>>
> >>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >>> index 1dc9778..38ea58e 100644
> >>> --- a/arch/arm/kvm/mmu.c
> >>> +++ b/arch/arm/kvm/mmu.c
> >>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>  		return -EFAULT;
> >>>  	}
> >>>  
> >>> -	if (is_vm_hugetlb_page(vma)) {
> >>> +	/*
> >>> +	 * Writes to pages in a memslot with logging enabled are always logged
> >>> +	 * on a single page-by-page basis.
> >>> +	 */
> >>> +	if (memslot_is_logging(memslot) && write_fault)
> >>> +		force_pte = true;
> >>
> >> If it's a write you take the pte route and
> >> dissolves huge page, if it's a read you reconstruct the
> >> THP that seems to yield pretty bad results.
> > 
> > ok, then remove the ' && write_fault' part of the clause.
> Hi Christoffer,
>  couple comments/questions.
> 
>  setting force_pte here, disables huge pages for
> non-writable regions.
> 

hmmm, by a non-writable region you mean a read-only memslot? Can you set
dirty page logging for such one?  That doesn't make much sense to me.

Note, that if you receive writable == false from gfn_to_pfn_prot() that
doesn't mean that the page can never be written to, it just means that
the current mapping of the page is not a writable one, you can call that
same function again later with write_fault=true, and you either get a
writable page back or you simply get an error.

[...]

> >>>  	if (kvm_is_device_pfn(pfn))
> >>>  		mem_type = PAGE_S2_DEVICE;
> 
> If we're not setting the IOMAP flag do we have need
> this, since we're forfeiting error checking later
> in stage2_set_pte()?
> 

we still need this, remember the error checking is about
cache == NULL, not about the IOMAP flag.  I think I address this in the
new proposal below, but please check carefully.

Take a look at this one:

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1dc9778..841e053 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -919,6 +919,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	unsigned long flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -976,8 +977,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_device_pfn(pfn))
+	if (kvm_is_device_pfn(pfn)) {
 		mem_type = PAGE_S2_DEVICE;
+		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+	} else if (memslot_is_logging(memslot)) {
+		/*
+		 * Faults on pages in a memslot with logging enabled
+		 * should not be mapped with huge pages (it introduces churn
+		 * and performance degradation), so force a pte mapping.
+		 */
+		force_pte = true;
+		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		/*
+		 * Only actually map the page as writable if this was a write
+		 * fault.
+		 */
+		if (!write_fault)
+			writable = false;
+
+	}
 
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
@@ -1002,13 +1021,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		if (writable) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
+			mark_page_dirty(kvm, gfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
-	}
 
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
+	}
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);

Thanks,
-Christoffer

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-14 10:32           ` Christoffer Dall
  0 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-14 10:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Jan 13, 2015 at 03:14:47PM -0800, Mario Smarduch wrote:

[...]

> >>>
> >>>
> >>> What I meant last time around concerning user_mem_abort was more
> >>> something like this:
> >>>
> >>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >>> index 1dc9778..38ea58e 100644
> >>> --- a/arch/arm/kvm/mmu.c
> >>> +++ b/arch/arm/kvm/mmu.c
> >>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>  		return -EFAULT;
> >>>  	}
> >>>  
> >>> -	if (is_vm_hugetlb_page(vma)) {
> >>> +	/*
> >>> +	 * Writes to pages in a memslot with logging enabled are always logged
> >>> +	 * on a single page-by-page basis.
> >>> +	 */
> >>> +	if (memslot_is_logging(memslot) && write_fault)
> >>> +		force_pte = true;
> >>
> >> If it's a write you take the pte route and
> >> dissolves huge page, if it's a read you reconstruct the
> >> THP that seems to yield pretty bad results.
> > 
> > ok, then remove the ' && write_fault' part of the clause.
> Hi Christoffer,
>  couple comments/questions.
> 
>  setting force_pte here, disables huge pages for
> non-writable regions.
> 

hmmm, by a non-writable region you mean a read-only memslot? Can you set
dirty page logging for such one?  That doesn't make much sense to me.

Note, that if you receive writable == false from gfn_to_pfn_prot() that
doesn't mean that the page can never be written to, it just means that
the current mapping of the page is not a writable one, you can call that
same function again later with write_fault=true, and you either get a
writable page back or you simply get an error.

[...]

> >>>  	if (kvm_is_device_pfn(pfn))
> >>>  		mem_type = PAGE_S2_DEVICE;
> 
> If we're not setting the IOMAP flag do we have need
> this, since we're forfeiting error checking later
> in stage2_set_pte()?
> 

we still need this, remember the error checking is about
cache == NULL, not about the IOMAP flag.  I think I address this in the
new proposal below, but please check carefully.

Take a look at this one:

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1dc9778..841e053 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -919,6 +919,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	unsigned long flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -976,8 +977,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_device_pfn(pfn))
+	if (kvm_is_device_pfn(pfn)) {
 		mem_type = PAGE_S2_DEVICE;
+		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+	} else if (memslot_is_logging(memslot)) {
+		/*
+		 * Faults on pages in a memslot with logging enabled
+		 * should not be mapped with huge pages (it introduces churn
+		 * and performance degradation), so force a pte mapping.
+		 */
+		force_pte = true;
+		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		/*
+		 * Only actually map the page as writable if this was a write
+		 * fault.
+		 */
+		if (!write_fault)
+			writable = false;
+
+	}
 
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
@@ -1002,13 +1021,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		if (writable) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
+			mark_page_dirty(kvm, gfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
-	}
 
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
+	}
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);

Thanks,
-Christoffer

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-13 17:42             ` Mario Smarduch
@ 2015-01-14 10:33               ` Christoffer Dall
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-14 10:33 UTC (permalink / raw)
  To: Mario Smarduch; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On Tue, Jan 13, 2015 at 09:42:34AM -0800, Mario Smarduch wrote:
> On 01/12/2015 11:43 AM, Christoffer Dall wrote:
> > On Mon, Jan 12, 2015 at 11:04:45AM -0800, Mario Smarduch wrote:
> > 
> > [...]
> > 
> >>>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>>>  	if (is_error_pfn(pfn))
> >>>>>>  		return -EFAULT;
> >>>>>>  
> >>>>>> -	if (kvm_is_device_pfn(pfn))
> >>>>>> +	if (kvm_is_device_pfn(pfn)) {
> >>>>>>  		mem_type = PAGE_S2_DEVICE;
> >>>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> >>>>>> +	}
> >>>>>>  
> >>>>>>  	spin_lock(&kvm->mmu_lock);
> >>>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
> >>>>>>  		goto out_unlock;
> >>>>>> +
> >>>>>> +	/*
> >>>>>> +	 * When logging is enabled general page fault handling changes:
> >>>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
> >>>>>
> >>>>> why dissolve huge pages on a read fault?
> >>>>
> >>>> What I noticed on write you would dissolve, on read you
> >>>> rebuild THPs, flip back and forth like that, performance
> >>>> & convergence was really bad.
> >>>
> >>> ah, that makes sense, we should probably indicate that reasoning
> >>> somehow.  In fact, what threw me off was the use of the word "dissolve
> >>> huge pages" which is not really what you're doing on a read fault, there
> >>> you are just never adjusting to huge pages.
> >>>
> >>> I'm wondering why that would slow things down much though, the only cost
> >>> would be the extra tlb invalidation and replacing the PMD on a
> >>> subsequent write fault, but I trust your numbers nevertheless.
> >>
> >> If I understand correctly -
> >> you do few writes, dissolve a huge page insert pte TLB entries,
> >> then a read page fault installs a pmd clears the TLB cache
> >> for that range, and it repeats over. Appears like you
> >> need to constantly re-fault pte TLBs on writes to huge
> >> page range.
> > 
> > that makes good sense, thanks for the explanation.
> > 
> > [...]
> > 
> >>>>>  	} else {
> >>>>> +		unsigned long flags = 0;
> >>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> >>>>> +
> >>>>>  		if (writable) {
> >>>>>  			kvm_set_s2pte_writable(&new_pte);
> >>>>>  			kvm_set_pfn_dirty(pfn);
> >>>>>  		}
> >>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >>>>>  					  fault_ipa_uncached);
> >>>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> >>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> >>>>> +
> >>>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> >>>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> >>>>> +
> >>>>> +		if (memslot_is_logging(memslot))
> >>>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> >>>> Now that it is either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
> >>>> Although we don't know if device mappings will be handled here.
> >>>>
> >>>
> >>> so forget all I said about this in the past, I confused the code
> >>> checking for !cache with the iomap flag.
> >>>
> >>> So, I think you can always safely assume that stage2_get_pmd() gives you
> >>> something valid back when you have the LOGGING flag set, because you
> >>> always call the function with a valid cache when the LOGGING flag is
> >>> set.  It could be worth adding the following to stage2_set_pte():
> >>>
> >>> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
> >>
> >> I see ok, thanks for clearing that up.
> >>
> >>>
> >>> As for this code, the IOMAP flag's only effect is that we return -EFAULT
> >>> if we are seeing an existing PTE for the faulting address.  This would
> >>> no longer be valid if we allow logging dirty device memory pages, so we
> >> Sorry, do you mean allow or disallow?
> > 
> > if we (by these patches) allow logging dirty pages for device memory,
> > then we...
> > 
> >>
> >>> really need to think about if there's any conceivable use case for this?
> 
> No I can't think of any use case to log Device address space.
> 
> So I could move forward - drop the IOMAP flag, and add the
> VM_BUG_ON to stage2_set_pte().
> 
add the VM_BUG_ON, but keep the IOMAP flag as a separate thing from page
logging (assuming we all agree they are orthogonal events), see other
mail thread.

-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-14 10:33               ` Christoffer Dall
  0 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-14 10:33 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Jan 13, 2015 at 09:42:34AM -0800, Mario Smarduch wrote:
> On 01/12/2015 11:43 AM, Christoffer Dall wrote:
> > On Mon, Jan 12, 2015 at 11:04:45AM -0800, Mario Smarduch wrote:
> > 
> > [...]
> > 
> >>>>>> @@ -1059,12 +1104,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>>>  	if (is_error_pfn(pfn))
> >>>>>>  		return -EFAULT;
> >>>>>>  
> >>>>>> -	if (kvm_is_device_pfn(pfn))
> >>>>>> +	if (kvm_is_device_pfn(pfn)) {
> >>>>>>  		mem_type = PAGE_S2_DEVICE;
> >>>>>> +		set_pte_flags = KVM_S2PTE_FLAG_IS_IOMAP;
> >>>>>> +	}
> >>>>>>  
> >>>>>>  	spin_lock(&kvm->mmu_lock);
> >>>>>>  	if (mmu_notifier_retry(kvm, mmu_seq))
> >>>>>>  		goto out_unlock;
> >>>>>> +
> >>>>>> +	/*
> >>>>>> +	 * When logging is enabled general page fault handling changes:
> >>>>>> +	 * -  Writable huge pages are dissolved on a read or write fault.
> >>>>>
> >>>>> why dissolve huge pages on a read fault?
> >>>>
> >>>> What I noticed on write you would dissolve, on read you
> >>>> rebuild THPs, flip back and forth like that, performance
> >>>> & convergence was really bad.
> >>>
> >>> ah, that makes sense, we should probably indicate that reasoning
> >>> somehow.  In fact, what threw me off was the use of the word "dissolve
> >>> huge pages" which is not really what you're doing on a read fault, there
> >>> you are just never adjusting to huge pages.
> >>>
> >>> I'm wondering why that would slow things down much though, the only cost
> >>> would be the extra tlb invalidation and replacing the PMD on a
> >>> subsequent write fault, but I trust your numbers nevertheless.
> >>
> >> If I understand correctly -
> >> you do few writes, dissolve a huge page insert pte TLB entries,
> >> then a read page fault installs a pmd clears the TLB cache
> >> for that range, and it repeats over. Appears like you
> >> need to constantly re-fault pte TLBs on writes to huge
> >> page range.
> > 
> > that makes good sense, thanks for the explanation.
> > 
> > [...]
> > 
> >>>>>  	} else {
> >>>>> +		unsigned long flags = 0;
> >>>>>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> >>>>> +
> >>>>>  		if (writable) {
> >>>>>  			kvm_set_s2pte_writable(&new_pte);
> >>>>>  			kvm_set_pfn_dirty(pfn);
> >>>>>  		}
> >>>>>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
> >>>>>  					  fault_ipa_uncached);
> >>>>> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> >>>>> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> >>>>> +
> >>>>> +		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
> >>>>> +			flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> >>>>> +
> >>>>> +		if (memslot_is_logging(memslot))
> >>>>> +			flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> >>>> Now that it is either IOMAP or LOGGING_ACTIVE, do we need to accumulate flags?
> >>>> Although we don't know if device mappings will be handled here.
> >>>>
> >>>
> >>> so forget all I said about this in the past, I confused the code
> >>> checking for !cache with the iomap flag.
> >>>
> >>> So, I think you can always safely assume that stage2_get_pmd() gives you
> >>> something valid back when you have the LOGGING flag set, because you
> >>> always call the function with a valid cache when the LOGGING flag is
> >>> set.  It could be worth adding the following to stage2_set_pte():
> >>>
> >>> VM_BUG_ON((flags & KVM_S2_FLAG_LOGGING_ACTIVE) && !cache)
> >>
> >> I see ok, thanks for clearing that up.
> >>
> >>>
> >>> As for this code, the IOMAP flag's only effect is that we return -EFAULT
> >>> if we are seeing an existing PTE for the faulting address.  This would
> >>> no longer be valid if we allow logging dirty device memory pages, so we
> >> Sorry, do you mean allow or disallow?
> > 
> > if we (by these patches) allow logging dirty pages for device memory,
> > then we...
> > 
> >>
> >>> really need to think about if there's any conceivable use case for this?
> 
> No I can't think of any use case to log Device address space.
> 
> So I could move forward - drop the IOMAP flag, and add the
> VM_BUG_ON to stage2_set_pte().
> 
add the VM_BUG_ON, but keep the IOMAP flag as a separate thing from page
logging (assuming we all agree they are orthogonal events), see other
mail thread.

-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-14 10:32           ` Christoffer Dall
@ 2015-01-14 23:10             ` Mario Smarduch
  -1 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-14 23:10 UTC (permalink / raw)
  To: Christoffer Dall; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On 01/14/2015 02:32 AM, Christoffer Dall wrote:
> On Tue, Jan 13, 2015 at 03:14:47PM -0800, Mario Smarduch wrote:
> 
> [...]
> 
>>>>>
>>>>>
>>>>> What I meant last time around concerning user_mem_abort was more
>>>>> something like this:
>>>>>
>>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>>> index 1dc9778..38ea58e 100644
>>>>> --- a/arch/arm/kvm/mmu.c
>>>>> +++ b/arch/arm/kvm/mmu.c
>>>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>>  		return -EFAULT;
>>>>>  	}
>>>>>  
>>>>> -	if (is_vm_hugetlb_page(vma)) {
>>>>> +	/*
>>>>> +	 * Writes to pages in a memslot with logging enabled are always logged
>>>>> +	 * on a single page-by-page basis.
>>>>> +	 */
>>>>> +	if (memslot_is_logging(memslot) && write_fault)
>>>>> +		force_pte = true;
>>>>
>>>> If it's a write you take the pte route and
>>>> dissolves huge page, if it's a read you reconstruct the
>>>> THP that seems to yield pretty bad results.
>>>
>>> ok, then remove the ' && write_fault' part of the clause.
>> Hi Christoffer,
>>  couple comments/questions.
>>
>>  setting force_pte here, disables huge pages for
>> non-writable regions.
>>
> 

Hi Christoffer,
 another round, although I'll go ahead and post another
iteration, sorry but as you mentioned this code is
important.

> hmmm, by a non-writable region you mean a read-only memslot? Can you set
> dirty page logging for such one?  That doesn't make much sense to me.

Come to think of it that's  true.

It's a bit fuzzy when I was looking at the API for KVM_MEM_LOG_DIRTY_PAGES,
it appears user space needs to check if region is read-only and set region
size to 0(qemu). I don't see any checks in kernel to disable logging if
region is read only and we're enabling dirty page logging. API doesn't say
anything else. You may be able to enable logging
for read-only region if you leave region size as is.

I guess this has been around for quite a while so we
can just assume read-only slots will have logging disabled.

> 
> Note, that if you receive writable == false from gfn_to_pfn_prot() that
> doesn't mean that the page can never be written to, it just means that
> the current mapping of the page is not a writable one, you can call that
> same function again later with write_fault=true, and you either get a
> writable page back or you simply get an error.

Yes that's true after studying hva_to_pfn_slow(),
and __get_user_pages_fast(), a lot of conditions
handled there.

> 
> [...]
> 
>>>>>  	if (kvm_is_device_pfn(pfn))
>>>>>  		mem_type = PAGE_S2_DEVICE;
>>
>> If we're not setting the IOMAP flag do we have need
>> this, since we're forfeiting error checking later
>> in stage2_set_pte()?
>>
> 
> we still need this, remember the error checking is about
> cache == NULL, not about the IOMAP flag.  I think I address this in the
> new proposal below, but please check carefully.

Ok so mmu notifier may call stage2_set_pte() with
null cache poiner and intermediate table entries may
not be there so stage2_get_pud() may return NULL.
With logging on it won't happen, but just in case
we check.

And we'll continue to handle Device faults until
further notice.

> 
> Take a look at this one:

Looks good to me, concise.

Thanks.
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 1dc9778..841e053 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -919,6 +919,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	pfn_t pfn;
>  	pgprot_t mem_type = PAGE_S2;
>  	bool fault_ipa_uncached;
> +	unsigned long flags = 0;
>  
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -976,8 +977,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> -	if (kvm_is_device_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn)) {
>  		mem_type = PAGE_S2_DEVICE;
> +		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> +	} else if (memslot_is_logging(memslot)) {
> +		/*
> +		 * Faults on pages in a memslot with logging enabled
> +		 * should not be mapped with huge pages (it introduces churn
> +		 * and performance degradation), so force a pte mapping.
> +		 */
> +		force_pte = true;
> +		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> +
> +		/*
> +		 * Only actually map the page as writable if this was a write
> +		 * fault.
> +		 */
> +		if (!write_fault)
> +			writable = false;
> +
> +	}
>  
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
> @@ -1002,13 +1021,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		if (writable) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
> +			mark_page_dirty(kvm, gfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> -	}
>  
> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
> +	}
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> 
> Thanks,
> -Christoffer
> 


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-14 23:10             ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-14 23:10 UTC (permalink / raw)
  To: linux-arm-kernel

On 01/14/2015 02:32 AM, Christoffer Dall wrote:
> On Tue, Jan 13, 2015 at 03:14:47PM -0800, Mario Smarduch wrote:
> 
> [...]
> 
>>>>>
>>>>>
>>>>> What I meant last time around concerning user_mem_abort was more
>>>>> something like this:
>>>>>
>>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>>> index 1dc9778..38ea58e 100644
>>>>> --- a/arch/arm/kvm/mmu.c
>>>>> +++ b/arch/arm/kvm/mmu.c
>>>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>>  		return -EFAULT;
>>>>>  	}
>>>>>  
>>>>> -	if (is_vm_hugetlb_page(vma)) {
>>>>> +	/*
>>>>> +	 * Writes to pages in a memslot with logging enabled are always logged
>>>>> +	 * on a singe page-by-page basis.
>>>>> +	 */
>>>>> +	if (memslot_is_logging(memslot) && write_fault)
>>>>> +		force_pte = true;
>>>>
>>>> If it's a write you take the pte route and
>>>> dissolves huge page, if it's a read you reconstruct the
>>>> THP that seems to yield pretty bad results.
>>>
>>> ok, then remove the ' && write_fault' part of the clause.
>> Hi Christoffer,
>>  couple comments/questions.
>>
>>  setting force_pte here, disables huge pages for
>> non-writable regions.
>>
> 

Hi Christoffer,
 another round, although I'll go ahead and post another
iteration, sorry but as you mentioned this code is
important.

> hmmm, by a non-writable region you mean a read-only memslot? Can you set
> dirty page logging for such one?  That doesn't make much sense to me.

Come to think of it that's  true.

It's a bit fuzzy when I was looking at the API for KVM_MEM_LOG_DIRTY_PAGES,
it appears user space needs to check if region is read-only and set region
size to 0(qemu). I don't see any checks in kernel to disable logging if
region is read only and we're enabling dirty page logging. API doesn't say
anything else. You may be able to enable logging
for read-only region if you leave region size as is.

I guess this has been around for quite a while so we
can just assume read-only slots will have logging disabled.

> 
> Note, that if you receive writable == false from gfn_to_pfn_prot() that
> doesn't mean that the page can never be written to, it just means that
> the current mapping of the page is not a writable one, you can call that
> same function again later with write_fault=true, and you either get a
> writable page back or you simply get an error.

Yes that's true after studying hva_to_pfn_slow(),
and __get_user_pages_fast(), a lot of conditions
handled there.

> 
> [...]
> 
>>>>>  	if (kvm_is_device_pfn(pfn))
>>>>>  		mem_type = PAGE_S2_DEVICE;
>>
>> If we're not setting the IOMAP flag do we have need
>> this, since we're forfeiting error checking later
>> in stage2_set_pte()?
>>
> 
> we still need this, remember the error checking is about
> cache == NULL, not about the IOMAP flag.  I think I address this in the
> new proposal below, but please check carefully.

Ok so mmu notifier may call stage2_set_pte() with
null cache pointer and intermediate table entries may
not be there so stage2_get_pud() may return NULL.
With logging on it won't happen, but just in case
we check.

And we'll continue to handle Device faults until
further notice.

> 
> Take a look at this one:

Looks good to me, concise.

Thanks.
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 1dc9778..841e053 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -919,6 +919,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	pfn_t pfn;
>  	pgprot_t mem_type = PAGE_S2;
>  	bool fault_ipa_uncached;
> +	unsigned long flags = 0;
>  
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -976,8 +977,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> -	if (kvm_is_device_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn)) {
>  		mem_type = PAGE_S2_DEVICE;
> +		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> +	} else if (memslot_is_logging(memslot)) {
> +		/*
> +		 * Faults on pages in a memslot with logging enabled
> +		 * should not be mapped with huge pages (it introduces churn
> +		 * and performance degradation), so force a pte mapping.
> +		 */
> +		force_pte = true;
> +		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> +
> +		/*
> +		 * Only actually map the page as writable if this was a write
> +		 * fault.
> +		 */
> +		if (!write_fault)
> +			writable = false;
> +
> +	}
>  
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
> @@ -1002,13 +1021,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		if (writable) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
> +			mark_page_dirty(kvm, gfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> -	}
>  
> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
> +	}
>  
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
> 
> Thanks,
> -Christoffer
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-14 23:10             ` Mario Smarduch
@ 2015-01-15 10:20               ` Christoffer Dall
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-15 10:20 UTC (permalink / raw)
  To: Mario Smarduch; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On Wed, Jan 14, 2015 at 03:10:11PM -0800, Mario Smarduch wrote:
> On 01/14/2015 02:32 AM, Christoffer Dall wrote:
> > On Tue, Jan 13, 2015 at 03:14:47PM -0800, Mario Smarduch wrote:
> > 
> > [...]
> > 
> >>>>>
> >>>>>
> >>>>> What I meant last time around concerning user_mem_abort was more
> >>>>> something like this:
> >>>>>
> >>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >>>>> index 1dc9778..38ea58e 100644
> >>>>> --- a/arch/arm/kvm/mmu.c
> >>>>> +++ b/arch/arm/kvm/mmu.c
> >>>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>>  		return -EFAULT;
> >>>>>  	}
> >>>>>  
> >>>>> -	if (is_vm_hugetlb_page(vma)) {
> >>>>> +	/*
> >>>>> +	 * Writes to pages in a memslot with logging enabled are always logged
> >>>>> +	 * on a single page-by-page basis.
> >>>>> +	 */
> >>>>> +	if (memslot_is_logging(memslot) && write_fault)
> >>>>> +		force_pte = true;
> >>>>
> >>>> If it's a write you take the pte route and
> >>>> dissolves huge page, if it's a read you reconstruct the
> >>>> THP that seems to yield pretty bad results.
> >>>
> >>> ok, then remove the ' && write_fault' part of the clause.
> >> Hi Christoffer,
> >>  couple comments/questions.
> >>
> >>  setting force_pte here, disables huge pages for
> >> non-writable regions.
> >>
> > 
> 
> Hi Christoffer,
>  another round, although I'll go ahead and post another
> iteration, sorry but as you mentioned this code is
> important.
> 
> > hmmm, by a non-writable region you mean a read-only memslot? Can you set
> > dirty page logging for such one?  That doesn't make much sense to me.
> 
> Come to think of it that's  true.
> 
> It's bit fuzzyy when I was looking at the API for KVM_MEM_LOG_DIRTY_PAGES,
> it appears user space needs to check if region is read-only and set region
> size to 0(qemu). I don't see any checks in kernel to disable logging if
> region is read only and we're enabling dirty page logging. API doesn't say
> anything else. You may be able to enable logging
> for read-only region if you leave region size as is.
> 
> I guess this has been around for quite a while so we
> can just assume read-only slots will have logging disabled.
> 

I'm a bit confused, IIUC we don't make any explicit checks in the code
right now, so either there is some generic code that never sets the
logging flag on a read-only memregion or you can change the
implementation of memslot_is_logging() to return false if the memslot is
read-only.

> > 
> > Note, that if you receive writable == false from gfn_to_pfn_prot() that
> > doesn't mean that the page can never be written to, it just means that
> > the current mapping of the page is not a writable one, you can call that
> > same function again later with write_fault=true, and you either get a
> > writable page back or you simply get an error.
> 
> Yes that's true after studying hva_to_pfn_slow(),
> and __get_user_pages_fast(), a lot of conditions
> handled there.
> 
> > 
> > [...]
> > 
> >>>>>  	if (kvm_is_device_pfn(pfn))
> >>>>>  		mem_type = PAGE_S2_DEVICE;
> >>
> >> If we're not setting the IOMAP flag do we have need
> >> this, since we're forfeiting error checking later
> >> in stage2_set_pte()?
> >>
> > 
> > we still need this, remember the error checking is about
> > cache == NULL, not about the IOMAP flag.  I think I address this in the
> > new proposal below, but please check carefully.
> 
> Ok so mmu notifier may call stage2_set_pte() with
> null cache poiner and intermediate table entries may
> not be there so stage2_get_pud() may return NULL.
> With logging on it won't happen, but just in case
> we check.

yes, to easily catch programming errors in the future, that's why if you
make it a VM_BUG_ON the check won't be compiled unless you have kernel
memory debugging enabled.

> 
> And we'll continue to handle Device faults until
> further notice.
> 

yes.

> > 
> > Take a look at this one:
> 
> Looks good to me, concise.
> 

Thanks,
-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-15 10:20               ` Christoffer Dall
  0 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-15 10:20 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Jan 14, 2015 at 03:10:11PM -0800, Mario Smarduch wrote:
> On 01/14/2015 02:32 AM, Christoffer Dall wrote:
> > On Tue, Jan 13, 2015 at 03:14:47PM -0800, Mario Smarduch wrote:
> > 
> > [...]
> > 
> >>>>>
> >>>>>
> >>>>> What I meant last time around concerning user_mem_abort was more
> >>>>> something like this:
> >>>>>
> >>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >>>>> index 1dc9778..38ea58e 100644
> >>>>> --- a/arch/arm/kvm/mmu.c
> >>>>> +++ b/arch/arm/kvm/mmu.c
> >>>>> @@ -935,7 +935,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>>>>  		return -EFAULT;
> >>>>>  	}
> >>>>>  
> >>>>> -	if (is_vm_hugetlb_page(vma)) {
> >>>>> +	/*
> >>>>> +	 * Writes to pages in a memslot with logging enabled are always logged
> >>>>> +	 * on a single page-by-page basis.
> >>>>> +	 */
> >>>>> +	if (memslot_is_logging(memslot) && write_fault)
> >>>>> +		force_pte = true;
> >>>>
> >>>> If it's a write you take the pte route and
> >>>> dissolves huge page, if it's a read you reconstruct the
> >>>> THP that seems to yield pretty bad results.
> >>>
> >>> ok, then remove the ' && write_fault' part of the clause.
> >> Hi Christoffer,
> >>  couple comments/questions.
> >>
> >>  setting force_pte here, disables huge pages for
> >> non-writable regions.
> >>
> > 
> 
> Hi Christoffer,
>  another round, although I'll go ahead and post another
> iteration, sorry but as you mentioned this code is
> important.
> 
> > hmmm, by a non-writable region you mean a read-only memslot? Can you set
> > dirty page logging for such one?  That doesn't make much sense to me.
> 
> Come to think of it that's  true.
> 
> It's a bit fuzzy when I was looking at the API for KVM_MEM_LOG_DIRTY_PAGES,
> it appears user space needs to check if region is read-only and set region
> size to 0(qemu). I don't see any checks in kernel to disable logging if
> region is read only and we're enabling dirty page logging. API doesn't say
> anything else. You may be able to enable logging
> for read-only region if you leave region size as is.
> 
> I guess this has been around for quite a while so we
> can just assume read-only slots will have logging disabled.
> 

I'm a bit confused, IIUC we don't make any explicit checks in the code
right now, so either there is some generic code that never sets the
logging flag on a read-only memregion or you can change the
implementation of memslot_is_logging() to return false if the memslot is
read-only.

> > 
> > Note, that if you receive writable == false from gfn_to_pfn_prot() that
> > doesn't mean that the page can never be written to, it just means that
> > the current mapping of the page is not a writable one, you can call that
> > same function again later with write_fault=true, and you either get a
> > writable page back or you simply get an error.
> 
> Yes that's true after studying hva_to_pfn_slow(),
> and __get_user_pages_fast(), a lot of conditions
> handled there.
> 
> > 
> > [...]
> > 
> >>>>>  	if (kvm_is_device_pfn(pfn))
> >>>>>  		mem_type = PAGE_S2_DEVICE;
> >>
> >> If we're not setting the IOMAP flag do we have need
> >> this, since we're forfeiting error checking later
> >> in stage2_set_pte()?
> >>
> > 
> > we still need this, remember the error checking is about
> > cache == NULL, not about the IOMAP flag.  I think I address this in the
> > new proposal below, but please check carefully.
> 
> Ok so mmu notifier may call stage2_set_pte() with
> null cache pointer and intermediate table entries may
> not be there so stage2_get_pud() may return NULL.
> With logging on it won't happen, but just in case
> we check.

yes, to easily catch programming errors in the future, that's why if you
make it a VM_BUG_ON the check won't be compiled unless you have kernel
memory debugging enabled.

> 
> And we'll continue to handle Device faults until
> further notice.
> 

yes.

> > 
> > Take a look at this one:
> 
> Looks good to me, concise.
> 

Thanks,
-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
  2015-01-15  2:51 ` Mario Smarduch
@ 2015-01-15 10:55   ` Christoffer Dall
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-15 10:55 UTC (permalink / raw)
  To: Mario Smarduch; +Cc: marc.zyngier, pbonzini, kvmarm, kvm, linux-arm-kernel

On Wed, Jan 14, 2015 at 06:51:50PM -0800, Mario Smarduch wrote:
> This patch adds support for 2nd stage page fault handling while dirty page
> logging. On huge page faults, huge pages are dissolved to normal pages, and
> rebuilding of 2nd stage huge pages is blocked. In case migration is
> canceled this restriction is removed and huge pages may be rebuilt again.
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
> 
> change Log since last RESEND v2 --> v3:
> - Handle read faults to writable regions properly
> - Along with Christoffers suggestions optimized user_mem_abort() while logging
> - Fix enable of dirty page logging to Device memory - reject request 
> 
> Change Log since last RESEND v1 --> v2:
> - Disallow dirty page logging of IO region - fail for initial write protect
>   and disable logging code in 2nd stage page fault handler.
> - Fixed auto spell correction errors
> 
> Change Log RESEND v0 --> v1:
> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>   writable, prevent write protection of pte on read fault
> - Removed marking entire huge page dirty on initial access
> - don't dissolve huge pages of non-writable regions
> - Made updates based on Christoffers comments
>   - renamed logging status function to memslot_is_logging()
>   - changed few values to bool from longs
>   - streamlined user_mem_abort() to eliminate extra conditional checks
> ---
>  arch/arm/kvm/mmu.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 88 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 73d506f..2e494ac 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>  #define kvm_pud_huge(_x)	pud_huge(_x)
>  
> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
> +#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
> +
> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> +{
> +#ifdef CONFIG_ARM
> +	return !!memslot->dirty_bitmap;

change this to:
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)

with the semantics that we only care about logging writes to the dirty
bitmap for things that will ever actually be written to.

> +#else
> +	return false;
> +#endif
> +}
> +
>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  {
>  	/*
> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>  }
>  
> +/**
> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
> + * @kvm:	pointer to kvm structure.
> + * @addr:	IPA
> + * @pmd:	pmd pointer for IPA
> + *
> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
> + * pages in the range dirty.
> + */
> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
> +{
> +	if (!kvm_pmd_huge(*pmd))
> +		return;
> +
> +	pmd_clear(pmd);
> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
> +	put_page(virt_to_page(pmd));
> +}
> +
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>  				  int min, int max)
>  {
> @@ -703,10 +734,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  }
>  
>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> +			  phys_addr_t addr, const pte_t *new_pte,
> +			  unsigned long flags)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, old_pte;
> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
> +	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
> +
> +	VM_BUG_ON(logging_active && !cache);
>  
>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>  	pmd = stage2_get_pmd(kvm, cache, addr);
> @@ -718,6 +754,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>  		return 0;
>  	}
>  
> +	/*
> +	 * While dirty page logging - dissolve huge PMD, then continue on to
> +	 * allocate page.
> +	 */
> +	if (logging_active)
> +		stage2_dissolve_pmd(kvm, addr, pmd);
> +
>  	/* Create stage-2 page mappings - Level 2 */
>  	if (pmd_none(*pmd)) {
>  		if (!cache)
> @@ -774,7 +817,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  		if (ret)
>  			goto out;
>  		spin_lock(&kvm->mmu_lock);
> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
> +						KVM_S2PTE_FLAG_IS_IOMAP);
>  		spin_unlock(&kvm->mmu_lock);
>  		if (ret)
>  			goto out;
> @@ -1002,6 +1046,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	pfn_t pfn;
>  	pgprot_t mem_type = PAGE_S2;
>  	bool fault_ipa_uncached;
> +	bool logging_active = memslot_is_logging(memslot);
> +	unsigned long flags = 0;
>  
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -1018,7 +1064,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> -	if (is_vm_hugetlb_page(vma)) {
> +	if (is_vm_hugetlb_page(vma) && !logging_active) {
>  		hugetlb = true;
>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>  	} else {
> @@ -1059,12 +1105,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> -	if (kvm_is_device_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn)) {
>  		mem_type = PAGE_S2_DEVICE;
> +		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> +	} else if (logging_active) {
> +		/*
> +		 * Faults on pages in a memslot with logging enabled
> +		 * should not be mapped with huge pages (it introduces churn
> +		 * and performance degradation), so force a pte mapping.
> +		 */
> +		force_pte = true;
> +		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> +
> +		/*
> +		 * Only actually map the page as writable if this was a write
> +		 * fault.
> +		 */
> +		if (!write_fault)
> +			writable = false;
> +	}
>  
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> +
>  	if (!hugetlb && !force_pte)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
> @@ -1082,17 +1146,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> +
>  		if (writable) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
> +			mark_page_dirty(kvm, gfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
>  	}
>  
> -
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
>  	kvm_release_pfn_clean(pfn);
> @@ -1242,7 +1306,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>  {
>  	pte_t *pte = (pte_t *)data;
>  
> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> +	/*
> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
> +	 * through this calling path.
> +	 */
> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>  }
>  
>  
> @@ -1396,7 +1467,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>  	int ret = 0;
>  
> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
> +			change != KVM_MR_FLAGS_ONLY)
>  		return 0;
>  
>  	/*
> @@ -1447,6 +1519,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>  					 vm_start - vma->vm_start;
>  
> +			/* IO region dirty page logging not allowed */
> +			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
> +				return -EINVAL;
> +
>  			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>  						    vm_end - vm_start,
>  						    writable);
> @@ -1456,6 +1532,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  		hva = vm_end;
>  	} while (hva < reg_end);
>  
> +	if (change == KVM_MR_FLAGS_ONLY)
> +		return ret;
> +
>  	spin_lock(&kvm->mmu_lock);
>  	if (ret)
>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
> -- 
> 1.7.9.5
> 

besides the final nit, this looks good!

Send out a new complete series, and I'll test it and we may be just in
time for the next merge window.

-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-15 10:55   ` Christoffer Dall
  0 siblings, 0 replies; 30+ messages in thread
From: Christoffer Dall @ 2015-01-15 10:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Jan 14, 2015 at 06:51:50PM -0800, Mario Smarduch wrote:
> This patch adds support for 2nd stage page fault handling while dirty page
> logging. On huge page faults, huge pages are dissolved to normal pages, and
> rebuilding of 2nd stage huge pages is blocked. In case migration is
> canceled this restriction is removed and huge pages may be rebuilt again.
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
> 
> change Log since last RESEND v2 --> v3:
> - Handle read faults to writable regions properly
> - Along with Christoffers suggestions optimized user_mem_abort() while logging
> - Fix enable of dirty page logging to Device memory - reject request 
> 
> Change Log since last RESEND v1 --> v2:
> - Disallow dirty page logging of IO region - fail for initial write protect
>   and disable logging code in 2nd stage page fault handler.
> - Fixed auto spell correction errors
> 
> Change Log RESEND v0 --> v1:
> - fixed bug exposed by new generic __get_user_pages_fast(), when region is 
>   writable, prevent write protection of pte on read fault
> - Removed marking entire huge page dirty on initial access
> - don't dissolve huge pages of non-writable regions
> - Made updates based on Christoffers comments
>   - renamed logging status function to memslot_is_logging()
>   - changed few values to bool from longs
>   - streamlined user_mem_abort() to eliminate extra conditional checks
> ---
>  arch/arm/kvm/mmu.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 88 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 73d506f..2e494ac 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
>  #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
>  #define kvm_pud_huge(_x)	pud_huge(_x)
>  
> +#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
> +#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
> +
> +static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> +{
> +#ifdef CONFIG_ARM
> +	return !!memslot->dirty_bitmap;

change this to:
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)

with the semantics that we only care about logging writes to the dirty
bitmap for things that will ever actually be written to.

> +#else
> +	return false;
> +#endif
> +}
> +
>  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  {
>  	/*
> @@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
>  		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
>  }
>  
> +/**
> + * stage2_dissolve_pmd() - clear and flush huge PMD entry
> + * @kvm:	pointer to kvm structure.
> + * @addr:	IPA
> + * @pmd:	pmd pointer for IPA
> + *
> + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
> + * pages in the range dirty.
> + */
> +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
> +{
> +	if (!kvm_pmd_huge(*pmd))
> +		return;
> +
> +	pmd_clear(pmd);
> +	kvm_tlb_flush_vmid_ipa(kvm, addr);
> +	put_page(virt_to_page(pmd));
> +}
> +
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>  				  int min, int max)
>  {
> @@ -703,10 +734,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
>  }
>  
>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> -			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
> +			  phys_addr_t addr, const pte_t *new_pte,
> +			  unsigned long flags)
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, old_pte;
> +	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
> +	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
> +
> +	VM_BUG_ON(logging_active && !cache);
>  
>  	/* Create stage-2 page table mapping - Levels 0 and 1 */
>  	pmd = stage2_get_pmd(kvm, cache, addr);
> @@ -718,6 +754,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
>  		return 0;
>  	}
>  
> +	/*
> +	 * While dirty page logging - dissolve huge PMD, then continue on to
> +	 * allocate page.
> +	 */
> +	if (logging_active)
> +		stage2_dissolve_pmd(kvm, addr, pmd);
> +
>  	/* Create stage-2 page mappings - Level 2 */
>  	if (pmd_none(*pmd)) {
>  		if (!cache)
> @@ -774,7 +817,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  		if (ret)
>  			goto out;
>  		spin_lock(&kvm->mmu_lock);
> -		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
> +		ret = stage2_set_pte(kvm, &cache, addr, &pte,
> +						KVM_S2PTE_FLAG_IS_IOMAP);
>  		spin_unlock(&kvm->mmu_lock);
>  		if (ret)
>  			goto out;
> @@ -1002,6 +1046,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	pfn_t pfn;
>  	pgprot_t mem_type = PAGE_S2;
>  	bool fault_ipa_uncached;
> +	bool logging_active = memslot_is_logging(memslot);
> +	unsigned long flags = 0;
>  
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (fault_status == FSC_PERM && !write_fault) {
> @@ -1018,7 +1064,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>  
> -	if (is_vm_hugetlb_page(vma)) {
> +	if (is_vm_hugetlb_page(vma) && !logging_active) {
>  		hugetlb = true;
>  		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
>  	} else {
> @@ -1059,12 +1105,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>  
> -	if (kvm_is_device_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn)) {
>  		mem_type = PAGE_S2_DEVICE;
> +		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
> +	} else if (logging_active) {
> +		/*
> +		 * Faults on pages in a memslot with logging enabled
> +		 * should not be mapped with huge pages (it introduces churn
> +		 * and performance degradation), so force a pte mapping.
> +		 */
> +		force_pte = true;
> +		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
> +
> +		/*
> +		 * Only actually map the page as writable if this was a write
> +		 * fault.
> +		 */
> +		if (!write_fault)
> +			writable = false;
> +	}
>  
>  	spin_lock(&kvm->mmu_lock);
>  	if (mmu_notifier_retry(kvm, mmu_seq))
>  		goto out_unlock;
> +
>  	if (!hugetlb && !force_pte)
>  		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>  
> @@ -1082,17 +1146,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>  	} else {
>  		pte_t new_pte = pfn_pte(pfn, mem_type);
> +
>  		if (writable) {
>  			kvm_set_s2pte_writable(&new_pte);
>  			kvm_set_pfn_dirty(pfn);
> +			mark_page_dirty(kvm, gfn);
>  		}
>  		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
>  					  fault_ipa_uncached);
> -		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
> -			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
> +		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
>  	}
>  
> -
>  out_unlock:
>  	spin_unlock(&kvm->mmu_lock);
>  	kvm_release_pfn_clean(pfn);
> @@ -1242,7 +1306,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
>  {
>  	pte_t *pte = (pte_t *)data;
>  
> -	stage2_set_pte(kvm, NULL, gpa, pte, false);
> +	/*
> +	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
> +	 * flag clear because MMU notifiers will have unmapped a huge PMD before
> +	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
> +	 * therefore stage2_set_pte() never needs to clear out a huge PMD
> +	 * through this calling path.
> +	 */
> +	stage2_set_pte(kvm, NULL, gpa, pte, 0);
>  }
>  
>  
> @@ -1396,7 +1467,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	bool writable = !(mem->flags & KVM_MEM_READONLY);
>  	int ret = 0;
>  
> -	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
> +	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
> +			change != KVM_MR_FLAGS_ONLY)
>  		return 0;
>  
>  	/*
> @@ -1447,6 +1519,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
>  					 vm_start - vma->vm_start;
>  
> +			/* IO region dirty page logging not allowed */
> +			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
> +				return -EINVAL;
> +
>  			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
>  						    vm_end - vm_start,
>  						    writable);
> @@ -1456,6 +1532,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  		hva = vm_end;
>  	} while (hva < reg_end);
>  
> +	if (change == KVM_MR_FLAGS_ONLY)
> +		return ret;
> +
>  	spin_lock(&kvm->mmu_lock);
>  	if (ret)
>  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
> -- 
> 1.7.9.5
> 

besides the final nit, this looks good!

Send out a new complete series, and I'll test it and we may be just in
time for the next merge window.

-Christoffer

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-15  2:51 ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-15  2:51 UTC (permalink / raw)
  To: christoffer.dall, marc.zyngier
  Cc: pbonzini, kvmarm, kvm, linux-arm-kernel, Mario Smarduch

This patch adds support for 2nd stage page fault handling while dirty page
logging. On huge page faults, huge pages are dissolved to normal pages, and
rebuilding of 2nd stage huge pages is blocked. In case migration is
canceled this restriction is removed and huge pages may be rebuilt again.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---

change Log since last RESEND v2 --> v3:
- Handle read faults to writable regions properly
- Along with Christoffers suggestions optimized user_mem_abort() while logging
- Fix enable of dirty page logging to Device memory - reject request 

Change Log since last RESEND v1 --> v2:
- Disallow dirty page logging of IO region - fail for initial write protect
  and disable logging code in 2nd stage page fault handler.
- Fixed auto spell correction errors

Change Log RESEND v0 --> v1:
- fixed bug exposed by new generic __get_user_pages_fast(), when region is 
  writable, prevent write protection of pte on read fault
- Removed marking entire huge page dirty on initial access
- don't dissolve huge pages of non-writable regions
- Made updates based on Christoffers comments
  - renamed logging status function to memslot_is_logging()
  - changed few values to bool from longs
  - streamlined user_mem_abort() to eliminate extra conditional checks
---
 arch/arm/kvm/mmu.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 9 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 73d506f..2e494ac 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
 #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
 #define kvm_pud_huge(_x)	pud_huge(_x)
 
+#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
+#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+#ifdef CONFIG_ARM
+	return !!memslot->dirty_bitmap;
+#else
+	return false;
+#endif
+}
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pmd:	pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+	if (!kvm_pmd_huge(*pmd))
+		return;
+
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pmd));
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -703,10 +734,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+			  phys_addr_t addr, const pte_t *new_pte,
+			  unsigned long flags)
 {
 	pmd_t *pmd;
 	pte_t *pte, old_pte;
+	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
+
+	VM_BUG_ON(logging_active && !cache);
 
 	/* Create stage-2 page table mapping - Levels 0 and 1 */
 	pmd = stage2_get_pmd(kvm, cache, addr);
@@ -718,6 +754,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		return 0;
 	}
 
+	/*
+	 * While dirty page logging - dissolve huge PMD, then continue on to
+	 * allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pmd(kvm, addr, pmd);
+
 	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
@@ -774,7 +817,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		if (ret)
 			goto out;
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte,
+						KVM_S2PTE_FLAG_IS_IOMAP);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			goto out;
@@ -1002,6 +1046,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	bool logging_active = memslot_is_logging(memslot);
+	unsigned long flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -1018,7 +1064,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	if (is_vm_hugetlb_page(vma) && !logging_active) {
 		hugetlb = true;
 		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
 	} else {
@@ -1059,12 +1105,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_device_pfn(pfn))
+	if (kvm_is_device_pfn(pfn)) {
 		mem_type = PAGE_S2_DEVICE;
+		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+	} else if (logging_active) {
+		/*
+		 * Faults on pages in a memslot with logging enabled
+		 * should not be mapped with huge pages (it introduces churn
+		 * and performance degradation), so force a pte mapping.
+		 */
+		force_pte = true;
+		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		/*
+		 * Only actually map the page as writable if this was a write
+		 * fault.
+		 */
+		if (!write_fault)
+			writable = false;
+	}
 
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
@@ -1082,17 +1146,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
+
 		if (writable) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
+			mark_page_dirty(kvm, gfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}
 
-
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -1242,7 +1306,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
 	pte_t *pte = (pte_t *)data;
 
-	stage2_set_pte(kvm, NULL, gpa, pte, false);
+	/*
+	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+	 * flag clear because MMU notifiers will have unmapped a huge PMD before
+	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore stage2_set_pte() never needs to clear out a huge PMD
+	 * through this calling path.
+	 */
+	stage2_set_pte(kvm, NULL, gpa, pte, 0);
 }
 
 
@@ -1396,7 +1467,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	bool writable = !(mem->flags & KVM_MEM_READONLY);
 	int ret = 0;
 
-	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
+	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+			change != KVM_MR_FLAGS_ONLY)
 		return 0;
 
 	/*
@@ -1447,6 +1519,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
 					 vm_start - vma->vm_start;
 
+			/* IO region dirty page logging not allowed */
+			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
+				return -EINVAL;
+
 			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
 						    vm_end - vm_start,
 						    writable);
@@ -1456,6 +1532,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		hva = vm_end;
 	} while (hva < reg_end);
 
+	if (change == KVM_MR_FLAGS_ONLY)
+		return ret;
+
 	spin_lock(&kvm->mmu_lock);
 	if (ret)
 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-15  2:51 ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-15  2:51 UTC (permalink / raw)
  To: linux-arm-kernel

This patch adds support for 2nd stage page fault handling while dirty page
logging. On huge page faults, huge pages are dissolved to normal pages, and
rebuilding of 2nd stage huge pages is blocked. In case migration is
canceled this restriction is removed and huge pages may be rebuilt again.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---

change Log since last RESEND v2 --> v3:
- Handle read faults to writable regions properly
- Along with Christoffers suggestions optimized user_mem_abort() while logging
- Fix enable of dirty page logging to Device memory - reject request 

Change Log since last RESEND v1 --> v2:
- Disallow dirty page logging of IO region - fail for initial write protect
  and disable logging code in 2nd stage page fault handler.
- Fixed auto spell correction errors

Change Log RESEND v0 --> v1:
- fixed bug exposed by new generic __get_user_pages_fast(), when region is 
  writable, prevent write protection of pte on read fault
- Removed marking entire huge page dirty on initial access
- don't dissolve huge pages of non-writable regions
- Made updates based on Christoffers comments
  - renamed logging status function to memslot_is_logging()
  - changed few values to bool from longs
  - streamlined user_mem_abort() to eliminate extra conditional checks
---
 arch/arm/kvm/mmu.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 9 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 73d506f..2e494ac 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
 #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
 #define kvm_pud_huge(_x)	pud_huge(_x)
 
+#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
+#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+#ifdef CONFIG_ARM
+	return !!memslot->dirty_bitmap;
+#else
+	return false;
+#endif
+}
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pmd:	pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+	if (!kvm_pmd_huge(*pmd))
+		return;
+
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pmd));
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -703,10 +734,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+			  phys_addr_t addr, const pte_t *new_pte,
+			  unsigned long flags)
 {
 	pmd_t *pmd;
 	pte_t *pte, old_pte;
+	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
+
+	VM_BUG_ON(logging_active && !cache);
 
 	/* Create stage-2 page table mapping - Levels 0 and 1 */
 	pmd = stage2_get_pmd(kvm, cache, addr);
@@ -718,6 +754,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		return 0;
 	}
 
+	/*
+	 * While dirty page logging - dissolve huge PMD, then continue on to
+	 * allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pmd(kvm, addr, pmd);
+
 	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
@@ -774,7 +817,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		if (ret)
 			goto out;
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte,
+						KVM_S2PTE_FLAG_IS_IOMAP);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			goto out;
@@ -1002,6 +1046,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	bool logging_active = memslot_is_logging(memslot);
+	unsigned long flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -1018,7 +1064,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	if (is_vm_hugetlb_page(vma) && !logging_active) {
 		hugetlb = true;
 		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
 	} else {
@@ -1059,12 +1105,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_device_pfn(pfn))
+	if (kvm_is_device_pfn(pfn)) {
 		mem_type = PAGE_S2_DEVICE;
+		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+	} else if (logging_active) {
+		/*
+		 * Faults on pages in a memslot with logging enabled
+		 * should not be mapped with huge pages (it introduces churn
+		 * and performance degradation), so force a pte mapping.
+		 */
+		force_pte = true;
+		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		/*
+		 * Only actually map the page as writable if this was a write
+		 * fault.
+		 */
+		if (!write_fault)
+			writable = false;
+	}
 
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
@@ -1082,17 +1146,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
+
 		if (writable) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
+			mark_page_dirty(kvm, gfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}
 
-
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -1242,7 +1306,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
 	pte_t *pte = (pte_t *)data;
 
-	stage2_set_pte(kvm, NULL, gpa, pte, false);
+	/*
+	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+	 * flag clear because MMU notifiers will have unmapped a huge PMD before
+	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore stage2_set_pte() never needs to clear out a huge PMD
+	 * through this calling path.
+	 */
+	stage2_set_pte(kvm, NULL, gpa, pte, 0);
 }
 
 
@@ -1396,7 +1467,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	bool writable = !(mem->flags & KVM_MEM_READONLY);
 	int ret = 0;
 
-	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
+	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+			change != KVM_MR_FLAGS_ONLY)
 		return 0;
 
 	/*
@@ -1447,6 +1519,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
 					 vm_start - vma->vm_start;
 
+			/* IO region dirty page logging not allowed */
+			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
+				return -EINVAL;
+
 			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
 						    vm_end - vm_start,
 						    writable);
@@ -1456,6 +1532,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		hva = vm_end;
 	} while (hva < reg_end);
 
+	if (change == KVM_MR_FLAGS_ONLY)
+		return ret;
+
 	spin_lock(&kvm->mmu_lock);
 	if (ret)
 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-09  1:42 ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-09  1:42 UTC (permalink / raw)
  To: christoffer.dall, marc.zyngier
  Cc: pbonzini, kvmarm, kvm, linux-arm-kernel, Mario Smarduch

This patch adds support for handling 2nd stage page faults during migration,
it disables faulting in huge pages, and dissolves huge pages to normal pages.
In case migration is canceled huge pages are used again, if memory conditions
permit it. It applies cleanly on top of the patch series posted Dec 15:
https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

Patch number #11 of the series has been dropped.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---

Change Log since last RESEND:
- fixed bug exposed __get_user_pages_fast(), when region is writable prevent
  write protection of pte so we can handle a future write fault and mark page
  dirty.
- Removed marking entire huge page dirty on initial dirty log read.
- don't dissolve non-writable huge pages
- Made updates based on Christoffers comments
  - renamed logging status function to memslot_is_logging()
  - changed a few values to bool from longs
  - streamlined user_mem_abort() to eliminate extra conditional checks
 
---
 arch/arm/kvm/mmu.c |   92 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 73d506f..2bfe22d 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
 #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
 #define kvm_pud_huge(_x)	pud_huge(_x)
 
+#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
+#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+#ifdef CONFIG_ARM
+	return !!memslot->dirty_bitmap;
+#else
+	return false;
+#endif
+}
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pmd:	pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+	if (!kvm_pmd_huge(*pmd))
+		return;
+
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pmd));
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+			  phys_addr_t addr, const pte_t *new_pte,
+			  unsigned long flags)
 {
 	pmd_t *pmd;
 	pte_t *pte, old_pte;
+	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
 
 	/* Create stage-2 page table mapping - Levels 0 and 1 */
 	pmd = stage2_get_pmd(kvm, cache, addr);
@@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		return 0;
 	}
 
+	/*
+	 * While dirty page logging - dissolve huge PMD, then continue on to
+	 * allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pmd(kvm, addr, pmd);
+
 	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
@@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		if (ret)
 			goto out;
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte,
+						KVM_S2PTE_FLAG_IS_IOMAP);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			goto out;
@@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	bool can_set_pte_rw = true;
+	unsigned long set_pte_flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1065,6 +1110,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+
+	/*
+	 * When logging is enabled general page fault handling changes:
+	 * -  Writable huge pages are dissolved on a read or write fault.
+	 * -  pte's are not allowed write permission on a read fault to
+	 *    writable region so future writes can be marked dirty
+	 * -  access to non-writable region is unchanged
+	 */
+	if (memslot_is_logging(memslot) && writable) {
+		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
+		if (hugetlb) {
+			gfn += pte_index(fault_ipa);
+			pfn += pte_index(fault_ipa);
+			hugetlb = false;
+		}
+		force_pte = true;
+		if (!write_fault)
+			can_set_pte_rw = false;
+	}
+
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
@@ -1082,16 +1147,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
-		if (writable) {
+
+		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
+			set_pte_flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+
+		/*
+		 * Don't set write permission, for non-writable region, and
+		 * for read fault to writable region while logging.
+		 */
+		if (writable && can_set_pte_rw) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+							set_pte_flags);
 	}
 
+	if (write_fault)
+		mark_page_dirty(kvm, gfn);
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
@@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
 	pte_t *pte = (pte_t *)data;
 
-	stage2_set_pte(kvm, NULL, gpa, pte, false);
+	/*
+	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+	 * flag clear because MMU notifiers will have unmapped a huge PMD before
+	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore stage2_set_pte() never needs to clear out a huge PMD
+	 * through this calling path.
+	 */
+	stage2_set_pte(kvm, NULL, gpa, pte, 0);
 }
 
 
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling
@ 2015-01-09  1:42 ` Mario Smarduch
  0 siblings, 0 replies; 30+ messages in thread
From: Mario Smarduch @ 2015-01-09  1:42 UTC (permalink / raw)
  To: linux-arm-kernel

This patch adds support for handling 2nd stage page faults during migration,
it disables faulting in huge pages, and dissolves huge pages to normal pages.
In case migration is canceled huge pages are used again, if memory conditions
permit it. It applies cleanly on top of the patch series posted Dec 15:
https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

Patch number #11 of the series has been dropped.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---

Change Log since last RESEND:
- fixed bug exposed __get_user_pages_fast(), when region is writable prevent
  write protection of pte so we can handle a future write fault and mark page
  dirty.
- Removed marking entire huge page dirty on initial dirty log read.
- don't dissolve non-writable huge pages
- Made updates based on Christoffers comments
  - renamed logging status function to memslot_is_logging()
  - changed a few values to bool from longs
  - streamlined user_mem_abort() to eliminate extra conditional checks
 
---
 arch/arm/kvm/mmu.c |   92 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 73d506f..2bfe22d 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
 #define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
 #define kvm_pud_huge(_x)	pud_huge(_x)
 
+#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
+#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+#ifdef CONFIG_ARM
+	return !!memslot->dirty_bitmap;
+#else
+	return false;
+#endif
+}
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	/*
@@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pmd:	pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+	if (!kvm_pmd_huge(*pmd))
+		return;
+
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pmd));
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
+			  phys_addr_t addr, const pte_t *new_pte,
+			  unsigned long flags)
 {
 	pmd_t *pmd;
 	pte_t *pte, old_pte;
+	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+	bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
 
 	/* Create stage-2 page table mapping - Levels 0 and 1 */
 	pmd = stage2_get_pmd(kvm, cache, addr);
@@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		return 0;
 	}
 
+	/*
+	 * While dirty page logging - dissolve huge PMD, then continue on to
+	 * allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pmd(kvm, addr, pmd);
+
 	/* Create stage-2 page mappings - Level 2 */
 	if (pmd_none(*pmd)) {
 		if (!cache)
@@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		if (ret)
 			goto out;
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte,
+						KVM_S2PTE_FLAG_IS_IOMAP);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
 			goto out;
@@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
+	bool can_set_pte_rw = true;
+	unsigned long set_pte_flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1065,6 +1110,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+
+	/*
+	 * When logging is enabled general page fault handling changes:
+	 * -  Writable huge pages are dissolved on a read or write fault.
+	 * -  pte's are not allowed write permission on a read fault to
+	 *    writable region so future writes can be marked dirty
+	 * -  access to non-writable region is unchanged
+	 */
+	if (memslot_is_logging(memslot) && writable) {
+		set_pte_flags = KVM_S2PTE_FLAG_LOGGING_ACTIVE;
+		if (hugetlb) {
+			gfn += pte_index(fault_ipa);
+			pfn += pte_index(fault_ipa);
+			hugetlb = false;
+		}
+		force_pte = true;
+		if (!write_fault)
+			can_set_pte_rw = false;
+	}
+
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
@@ -1082,16 +1147,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
-		if (writable) {
+
+		if (pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE))
+			set_pte_flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+
+		/*
+		 * Don't set write permission, for non-writable region, and
+		 * for read fault to writable region while logging.
+		 */
+		if (writable && can_set_pte_rw) {
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
 					  fault_ipa_uncached);
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
-			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
+							set_pte_flags);
 	}
 
+	if (write_fault)
+		mark_page_dirty(kvm, gfn);
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
@@ -1242,7 +1317,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
 	pte_t *pte = (pte_t *)data;
 
-	stage2_set_pte(kvm, NULL, gpa, pte, false);
+	/*
+	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+	 * flag clear because MMU notifiers will have unmapped a huge PMD before
+	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore stage2_set_pte() never needs to clear out a huge PMD
+	 * through this calling path.
+	 */
+	stage2_set_pte(kvm, NULL, gpa, pte, 0);
 }
 
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2015-01-15 10:55 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-10  4:17 [PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling Mario Smarduch
2015-01-10  4:17 ` Mario Smarduch
2015-01-11 14:00 ` Christoffer Dall
2015-01-11 14:00   ` Christoffer Dall
2015-01-12 16:27   ` Mario Smarduch
2015-01-12 16:27     ` Mario Smarduch
2015-01-12 17:49     ` Christoffer Dall
2015-01-12 17:49       ` Christoffer Dall
2015-01-12 19:04       ` Mario Smarduch
2015-01-12 19:04         ` Mario Smarduch
2015-01-12 19:43         ` Christoffer Dall
2015-01-12 19:43           ` Christoffer Dall
2015-01-13 17:42           ` Mario Smarduch
2015-01-13 17:42             ` Mario Smarduch
2015-01-14 10:33             ` Christoffer Dall
2015-01-14 10:33               ` Christoffer Dall
2015-01-13 23:14       ` Mario Smarduch
2015-01-13 23:14         ` Mario Smarduch
2015-01-14 10:32         ` Christoffer Dall
2015-01-14 10:32           ` Christoffer Dall
2015-01-14 23:10           ` Mario Smarduch
2015-01-14 23:10             ` Mario Smarduch
2015-01-15 10:20             ` Christoffer Dall
2015-01-15 10:20               ` Christoffer Dall
  -- strict thread matches above, loose matches on Subject: below --
2015-01-15  2:51 Mario Smarduch
2015-01-15  2:51 ` Mario Smarduch
2015-01-15 10:55 ` Christoffer Dall
2015-01-15 10:55   ` Christoffer Dall
2015-01-09  1:42 Mario Smarduch
2015-01-09  1:42 ` Mario Smarduch

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.