KVM ARM Archive on lore.kernel.org
From: Keqian Zhu <zhukeqian1@huawei.com>
To: <kvm@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<linux-arm-kernel@lists.infradead.org>,
	<kvmarm@lists.cs.columbia.edu>
Cc: Marc Zyngier <maz@kernel.org>,
	Sean Christopherson <sean.j.christopherson@intel.com>,
	Jay Zhou <jianjay.zhou@huawei.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Will Deacon <will@kernel.org>
Subject: [PATCH 3/3] KVM/arm64: Only set bits of dirty bitmap with valid translation entries
Date: Wed, 25 Mar 2020 12:24:23 +0800
Message-ID: <20200325042423.12181-4-zhukeqian1@huawei.com>
In-Reply-To: <20200325042423.12181-1-zhukeqian1@huawei.com>

When KVM_DIRTY_LOG_INITIALLY_SET is enabled, we only need to report pages
that have valid translation entries to userspace, so userspace does not
need to zero-check the remaining pages during VM migration.
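
For context, userspace opts in to this behaviour roughly as sketched below.
This is only an illustrative sketch, not part of this patch: it assumes the
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 capability and the
KVM_DIRTY_LOG_INITIALLY_SET flag definitions are available in <linux/kvm.h>,
and the helper name is made up for the example.

  /*
   * Illustrative only (not part of this patch): enable manual dirty log
   * protection with the "initially set" behaviour for a VM, assuming
   * vm_fd is an open KVM VM file descriptor.
   */
  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  static int enable_dirty_log_initially_set(int vm_fd)
  {
  	struct kvm_enable_cap cap = {
  		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
  		.args[0] = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
  			   KVM_DIRTY_LOG_INITIALLY_SET,
  	};

  	/*
  	 * On success, memslots created with KVM_MEM_LOG_DIRTY_PAGES start
  	 * with their dirty bitmap set (with this patch, only the bits whose
  	 * pages have valid stage2 translation entries).
  	 */
  	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }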

On a Huawei Kunpeng 920 (2.6GHz) platform, I ran some tests on a 128GB
Linux VM with different page sizes.

Time to enable dirty logging (memory pressure: 127GB):
Page size   Before      After
   4K        1.8ms      341ms
   2M        1.8ms       4ms
   1G        1.8ms       2ms

Migration time (memory pressure: 3GB, migration bandwidth: 500MB/s):
Page size   Before    After
   4K        21s       6s
   2M        21s       6s
   1G        21s       7s
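
During migration, the dirty log of each memslot is then read and cleared
roughly as below. Again, this is only an illustrative sketch:
sync_and_clear_slot, bitmap and nr_pages are example names rather than
kernel symbols, error handling is elided, and KVM_CLEAR_DIRTY_LOG has
alignment requirements on first_page/num_pages that are not shown.

  /*
   * Illustrative only: fetch and clear the dirty log of one memslot
   * during migration.
   */
  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  static void sync_and_clear_slot(int vm_fd, __u32 slot, void *bitmap,
  				__u32 nr_pages)
  {
  	struct kvm_dirty_log get = {
  		.slot = slot,
  		.dirty_bitmap = bitmap,
  	};
  	struct kvm_clear_dirty_log clear = {
  		.slot = slot,
  		.first_page = 0,
  		.num_pages = nr_pages,
  		.dirty_bitmap = bitmap,
  	};

  	/*
  	 * With KVM_DIRTY_LOG_INITIALLY_SET (and this patch), the first sync
  	 * only reports pages that had valid translation entries when dirty
  	 * logging was enabled.
  	 */
  	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);

  	/*
  	 * Clearing is what write protects the reported pages again; it can
  	 * be split into smaller chunks to spread the cost over time.
  	 */
  	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
  }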

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
---
 virt/kvm/arm/mmu.c | 161 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 137 insertions(+), 24 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 6c84de442a0e..0c7a5faf8609 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1413,34 +1413,85 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
 	return false;
 }
 
+enum s2_operation {
+	S2_OP_WP,  /* write protect page tables */
+	S2_OP_MD,  /* mark dirty bitmap in memslot */
+};
+
 /**
- * stage2_wp_ptes - write protect PMD range
+ * mark_range_dirty - mark a range in the dirty bitmap
+ * @kvm:	kvm instance for the VM
+ * @addr:	range start address
+ * @end:	range end address
+ *
+ * note: addr and end should belong to the same memslot.
+ */
+static void mark_range_dirty(struct kvm *kvm,
+			     phys_addr_t addr,
+			     phys_addr_t end)
+{
+	gfn_t gfn;
+	unsigned int start, nbits;
+	struct kvm_memory_slot *memslot = NULL;
+
+	gfn = addr >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	if (memslot && memslot->dirty_bitmap) {
+		start = gfn - memslot->base_gfn;
+		nbits = DIV_ROUND_UP(end, PAGE_SIZE) - gfn;
+		bitmap_set(memslot->dirty_bitmap, start, nbits);
+	}
+}
+
+/**
+ * stage2_op_ptes - do an operation on PMD range
+ * @kvm:	kvm instance for the VM
+ * @op: 	the operation wanted
  * @pmd:	pointer to pmd entry
  * @addr:	range start address
  * @end:	range end address
  */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+static void stage2_op_ptes(struct kvm *kvm,
+			   enum s2_operation op,
+			   pmd_t *pmd,
+			   phys_addr_t addr,
+			   phys_addr_t end)
 {
 	pte_t *pte;
 
 	pte = pte_offset_kernel(pmd, addr);
 	do {
-		if (!pte_none(*pte)) {
+		if (pte_none(*pte))
+			continue;
+
+		switch (op) {
+		case S2_OP_WP:
 			if (!kvm_s2pte_readonly(pte))
 				kvm_set_s2pte_readonly(pte);
+			break;
+		case S2_OP_MD:
+			mark_range_dirty(kvm, addr, addr + PAGE_SIZE);
+			break;
+		default:
+			break;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
 /**
- * stage2_wp_pmds - write protect PUD range
- * kvm:		kvm instance for the VM
+ * stage2_op_pmds - do an operation on PUD range
+ * @kvm:	kvm instance for the VM
+ * @op: 	the operation wanted
  * @pud:	pointer to pud entry
  * @addr:	range start address
  * @end:	range end address
  */
-static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
-			   phys_addr_t addr, phys_addr_t end)
+static void stage2_op_pmds(struct kvm *kvm,
+			   enum s2_operation op,
+			   pud_t *pud,
+			   phys_addr_t addr,
+			   phys_addr_t end)
 {
 	pmd_t *pmd;
 	phys_addr_t next;
@@ -1449,25 +1500,40 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
 
 	do {
 		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
+		if (pmd_none(*pmd))
+			continue;
+
+		if (pmd_thp_or_huge(*pmd)) {
+			switch (op) {
+			case S2_OP_WP:
 				if (!kvm_s2pmd_readonly(pmd))
 					kvm_set_s2pmd_readonly(pmd);
-			} else {
-				stage2_wp_ptes(pmd, addr, next);
+				break;
+			case S2_OP_MD:
+				mark_range_dirty(kvm, addr, next);
+				break;
+			default:
+				break;
 			}
+		} else {
+			stage2_op_ptes(kvm, op, pmd, addr, next);
 		}
 	} while (pmd++, addr = next, addr != end);
 }
 
 /**
- * stage2_wp_puds - write protect PGD range
+ * stage2_op_puds - do an operation on PGD range
+ * @kvm:	kvm instance for the VM
+ * @op: 	the operation wanted
  * @pgd:	pointer to pgd entry
  * @addr:	range start address
  * @end:	range end address
  */
-static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
-			    phys_addr_t addr, phys_addr_t end)
+static void  stage2_op_puds(struct kvm *kvm,
+			    enum s2_operation op,
+			    pgd_t *pgd,
+			    phys_addr_t addr,
+			    phys_addr_t end)
 {
 	pud_t *pud;
 	phys_addr_t next;
@@ -1475,24 +1541,38 @@ static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
 	pud = stage2_pud_offset(kvm, pgd, addr);
 	do {
 		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
+		if (stage2_pud_none(kvm, *pud))
+			continue;
+
+		if (stage2_pud_huge(kvm, *pud)) {
+			switch (op) {
+			case S2_OP_WP:
 				if (!kvm_s2pud_readonly(pud))
 					kvm_set_s2pud_readonly(pud);
-			} else {
-				stage2_wp_pmds(kvm, pud, addr, next);
+				break;
+			case S2_OP_MD:
+				mark_range_dirty(kvm, addr, next);
+				break;
+			default:
+				break;
 			}
+		} else {
+			stage2_op_pmds(kvm, op, pud, addr, next);
 		}
 	} while (pud++, addr = next, addr != end);
 }
 
 /**
- * stage2_wp_range() - write protect stage2 memory region range
+ * stage2_op_range() - do an operation on stage2 memory region range
  * @kvm:	The KVM pointer
+ * @op: 	The operation wanted
  * @addr:	Start address of range
  * @end:	End address of range
  */
-static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+static void stage2_op_range(struct kvm *kvm,
+			    enum s2_operation op,
+			    phys_addr_t addr,
+			    phys_addr_t end)
 {
 	pgd_t *pgd;
 	phys_addr_t next;
@@ -1513,7 +1593,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 			break;
 		next = stage2_pgd_addr_end(kvm, addr, end);
 		if (stage2_pgd_present(kvm, *pgd))
-			stage2_wp_puds(kvm, pgd, addr, next);
+			stage2_op_puds(kvm, op, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -1543,11 +1623,44 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
 
 	spin_lock(&kvm->mmu_lock);
-	stage2_wp_range(kvm, start, end);
+	stage2_op_range(kvm, S2_OP_WP, start, end);
 	spin_unlock(&kvm->mmu_lock);
 	kvm_flush_remote_tlbs(kvm);
 }
 
+/**
+ * kvm_mmu_md_memory_region() - mark dirty bitmap for memory slot
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot to mark dirty
+ *
+ * Called to mark the dirty bitmap after a memory region gets the
+ * KVM_MEM_LOG_DIRTY_PAGES flag and kvm_dirty_log_manual_protect_and_init_set
+ * is true. After this function returns, a bit in the dirty bitmap is set
+ * if its corresponding translation entry (PUD, PMD or PTE) is present.
+ *
+ * Afterwards the dirty page log can be read, and the present PUDs, PMDs
+ * and PTEs can be write protected manually by userspace.
+ *
+ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
+ */
+static void kvm_mmu_md_memory_region(struct kvm *kvm, int slot)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
+	phys_addr_t start, end;
+
+	if (WARN_ON_ONCE(!memslot))
+		return;
+
+	start = memslot->base_gfn << PAGE_SHIFT;
+	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+	spin_lock(&kvm->mmu_lock);
+	stage2_op_range(kvm, S2_OP_MD, start, end);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 /**
  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
  * @kvm:	The KVM pointer
@@ -1567,7 +1680,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
 
-	stage2_wp_range(kvm, start, end);
+	stage2_op_range(kvm, S2_OP_WP, start, end);
 }
 
 /*
@@ -2274,7 +2387,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 			 * write protect any pages because they're reported
 			 * as dirty here.
 			 */
-			bitmap_set(new->dirty_bitmap, 0, new->npages);
+			kvm_mmu_md_memory_region(kvm, mem->slot);
 		}
 	}
 }
-- 
2.19.1

Thread overview: 4+ messages
2020-03-25  4:24 [PATCH 0/3] KVM: arm64: Some optimizations about enabling dirty log Keqian Zhu
2020-03-25  4:24 ` [PATCH 1/3] KVM/memslot: Move the initial set of dirty bitmap to arch Keqian Zhu
2020-03-25  4:24 ` [PATCH 2/3] KVM/arm64: Support enabling dirty log gradually in small chunks Keqian Zhu
2020-03-25  4:24 ` Keqian Zhu [this message]
