linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 00/10] KVM: MMU: fast page fault
@ 2012-04-25  4:00 Xiao Guangrong
  2012-04-25  4:01 ` [PATCH v4 01/10] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
                   ` (9 more replies)
  0 siblings, 10 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:00 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Changelog:
- split the meaning of SPTE_ALLOW_WRITE and rename it to SPTE_MMU_WRITEABLE

- introduce an intermediate patch that updates the spte under the protection
  of mmu-lock, to make review easier.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v4 01/10] KVM: MMU: return bool in __rmap_write_protect
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
@ 2012-04-25  4:01 ` Xiao Guangrong
  2012-04-25  4:01 ` [PATCH v4 02/10] KVM: MMU: abstract spte write-protect Xiao Guangrong
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:01 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

The return value of __rmap_write_protect is either 1 or 0; use
true/false instead.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   13 +++++++------
 1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 29ad6f9..4a3cc18 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1041,11 +1041,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool
+__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int write_protected = 0;
+	bool write_protected = false;

 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
@@ -1066,7 +1067,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level
 			sptep = rmap_get_first(*rmapp, &iter);
 		}

-		write_protected = 1;
+		write_protected = true;
 	}

 	return write_protected;
@@ -1097,12 +1098,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	}
 }

-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	int i;
-	int write_protected = 0;
+	bool write_protected = false;

 	slot = gfn_to_memslot(kvm, gfn);

@@ -1691,7 +1692,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,

 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
-		int protected = 0;
+		bool protected = false;

 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 02/10] KVM: MMU: abstract spte write-protect
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
  2012-04-25  4:01 ` [PATCH v4 01/10] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
@ 2012-04-25  4:01 ` Xiao Guangrong
  2012-04-25  4:02 ` [PATCH v4 03/10] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:01 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

Introduce a common function that abstracts spte write-protection, to
clean up the code.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   60 ++++++++++++++++++++++++++++++---------------------
 1 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4a3cc18..e70ff38 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1041,6 +1041,34 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

+/* Return true if the spte is dropped. */
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
+			       bool *flush)
+{
+	u64 spte = *sptep;
+
+	if (!is_writable_pte(spte))
+		return false;
+
+	*flush |= true;
+
+	if (large) {
+		pgprintk("rmap_write_protect(large): spte %p %llx\n",
+			 spte, *spte);
+		BUG_ON(!is_large_pte(spte));
+
+		drop_spte(kvm, sptep);
+		--kvm->stat.lpages;
+		return true;
+	}
+
+	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+	spte = spte & ~PT_WRITABLE_MASK;
+	mmu_spte_update(sptep, spte);
+
+	return false;
+}
+
 static bool
 __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
@@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)

 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
-		if (!is_writable_pte(*sptep)) {
-			sptep = rmap_get_next(&iter);
-			continue;
-		}
-
-		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
-			sptep = rmap_get_next(&iter);
-		} else {
-			BUG_ON(!is_large_pte(*sptep));
-			drop_spte(kvm, sptep);
-			--kvm->stat.lpages;
+		if (spte_write_protect(kvm, sptep, level > PT_PAGE_TABLE_LEVEL,
+			  &write_protected)) {
 			sptep = rmap_get_first(*rmapp, &iter);
+			continue;
 		}

-		write_protected = true;
+		sptep = rmap_get_next(&iter);
 	}

 	return write_protected;
@@ -3902,6 +3919,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
+	bool flush = false;

 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
@@ -3916,16 +3934,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			      !is_last_spte(pt[i], sp->role.level))
 				continue;

-			if (is_large_pte(pt[i])) {
-				drop_spte(kvm, &pt[i]);
-				--kvm->stat.lpages;
-				continue;
-			}
-
-			/* avoid RMW */
-			if (is_writable_pte(pt[i]))
-				mmu_spte_update(&pt[i],
-						pt[i] & ~PT_WRITABLE_MASK);
+			spte_write_protect(kvm, &pt[i],
+					   is_large_pte(pt[i]), &flush);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 03/10] KVM: VMX: export PFEC.P bit on ept
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
  2012-04-25  4:01 ` [PATCH v4 01/10] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
  2012-04-25  4:01 ` [PATCH v4 02/10] KVM: MMU: abstract spte write-protect Xiao Guangrong
@ 2012-04-25  4:02 ` Xiao Guangrong
  2012-04-25  4:02 ` [PATCH v4 04/10] KVM: MMU: introduce SPTE_MMU_WRITEABLE bit Xiao Guangrong
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:02 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

Export the present bit of the page fault error code; a later patch
will use it.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/vmx.c |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 52f6856..2c98057 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4735,6 +4735,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
 	gpa_t gpa;
+	u32 error_code;
 	int gla_validity;

 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4759,7 +4760,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)

 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
-	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+	/* It is a write fault? */
+	error_code = exit_qualification & (1U << 1);
+	/* ept page table is present? */
+	error_code |= (exit_qualification >> 3) & 0x1;
+
+	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }

 static u64 ept_rsvd_mask(u64 spte, int level)
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 04/10] KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
                   ` (2 preceding siblings ...)
  2012-04-25  4:02 ` [PATCH v4 03/10] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
@ 2012-04-25  4:02 ` Xiao Guangrong
  2012-04-25  4:03 ` [PATCH v4 05/10] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:02 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

This bit indicates whether the gpte of this spte is writable

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   11 ++++++++---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e70ff38..46bde3f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"

-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

@@ -2290,8 +2291,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_x_mask;
 	else
 		spte |= shadow_nx_mask;
+
 	if (pte_access & ACC_USER_MASK)
 		spte |= shadow_user_mask;
+
+	if (pte_access & ACC_WRITE_MASK)
+		spte |= SPTE_MMU_WRITEABLE;
+
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
@@ -2345,8 +2351,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writable_pte(spte))
-				spte &= ~PT_WRITABLE_MASK;
+			spte &= ~PT_WRITABLE_MASK;
 		}
 	}

-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 05/10] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
                   ` (3 preceding siblings ...)
  2012-04-25  4:02 ` [PATCH v4 04/10] KVM: MMU: introduce SPTE_MMU_WRITEABLE bit Xiao Guangrong
@ 2012-04-25  4:03 ` Xiao Guangrong
  2012-04-25  4:03 ` [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:03 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

If this bit is set, it means the W bit of the spte has been cleared due
to shadow page table write protection.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   55 +++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 46bde3f..e7d8ffe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);

 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))

 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

@@ -1042,36 +1043,53 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

+static bool spte_wp_by_dirty_log(u64 spte)
+{
+	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
+
+	WARN_ON(is_writable_pte(spte));
+
+	return ((spte & mask) == mask) && !(spte & SPTE_WRITE_PROTECT);
+}
+
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
-			       bool *flush)
+			       bool *flush, bool page_table_protect)
 {
 	u64 spte = *sptep;

-	if (!is_writable_pte(spte))
-		return false;
+	if (is_writable_pte(spte)) {
+		*flush |= true;

-	*flush |= true;
+		if (large) {
+			pgprintk("rmap_write_protect(large): spte %p %llx\n",
+				 spte, *spte);
+			BUG_ON(!is_large_pte(spte));

-	if (large) {
-		pgprintk("rmap_write_protect(large): spte %p %llx\n",
-			 spte, *spte);
-		BUG_ON(!is_large_pte(spte));
+			drop_spte(kvm, sptep);
+			--kvm->stat.lpages;
+			return true;
+		}

-		drop_spte(kvm, sptep);
-		--kvm->stat.lpages;
-		return true;
+		goto reset_spte;
 	}

+	if (page_table_protect && spte_wp_by_dirty_log(spte))
+		goto reset_spte;
+
+	return false;
+
+reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 	spte = spte & ~PT_WRITABLE_MASK;
+	if (page_table_protect)
+		spte |= SPTE_WRITE_PROTECT;
 	mmu_spte_update(sptep, spte);
-
 	return false;
 }

-static bool
-__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+				 int level, bool page_table_protect)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1080,7 +1098,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
 		if (spte_write_protect(kvm, sptep, level > PT_PAGE_TABLE_LEVEL,
-			  &write_protected)) {
+			  &write_protected, page_table_protect)) {
 			sptep = rmap_get_first(*rmapp, &iter);
 			continue;
 		}
@@ -1109,7 +1127,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,

 	while (mask) {
 		rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
-		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);

 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1128,7 +1146,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, i);
+		write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
 	}

 	return write_protected;
@@ -2352,6 +2370,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
 			spte &= ~PT_WRITABLE_MASK;
+			spte |= SPTE_WRITE_PROTECT;
 		}
 	}

@@ -3940,7 +3959,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 				continue;

 			spte_write_protect(kvm, &pt[i],
-					   is_large_pte(pt[i]), &flush);
+					   is_large_pte(pt[i]), &flush, false);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
                   ` (4 preceding siblings ...)
  2012-04-25  4:03 ` [PATCH v4 05/10] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
@ 2012-04-25  4:03 ` Xiao Guangrong
  2012-04-26 23:45   ` Marcelo Tosatti
  2012-04-25  4:04 ` [PATCH v4 07/10] KVM: MMU: lockless update spte on fast page fault path Xiao Guangrong
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:03 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

If the present bit of the page fault error code is set, it indicates that
the shadow page table is populated on all levels; all we need to do is
modify the access bits, which can be done out of mmu-lock.

Currently, in order to simplify the code, we only fix page faults caused
by write protection on the fast path.

To make review easier, we hold mmu-lock to update the spte in this patch;
the concurrent (lockless) update will be introduced in a later patch.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c         |  113 ++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/paging_tmpl.h |    3 +
 2 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e7d8ffe..96a9d5b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2683,18 +2683,117 @@ exit:
 	return ret;
 }

+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
+				   u32 error_code)
+{
+	/*
+	 * #PF can be fast only if the shadow page table is present and it
+	 * is caused by write-protect, that means we just need change the
+	 * W bit of the spte which can be done out of mmu-lock.
+	 */
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	return true;
+}
+
+static bool
+fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+		  u64 *sptep, u64 spte)
+{
+	gfn_t gfn;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	/* The spte has been changed. */
+	if (*sptep != spte)
+		goto exit;
+
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	*sptep = spte | PT_WRITABLE_MASK;
+	mark_page_dirty(vcpu->kvm, gfn);
+
+exit:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+			    int level, u32 error_code)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
+		return false;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	if (!is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * Check if it is a spurious fault caused by TLB lazily flushed.
+	 *
+	 * Need not check the access of upper level table entries since
+	 * they are always ACC_ALL.
+	 */
+	 if (is_writable_pte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * Currently, to simplify the code, only the spte write-protected
+	 * by dirty-log can be fast fixed.
+	 */
+	if (!spte_wp_by_dirty_log(spte))
+		goto exit;
+
+	sp = page_header(__pa(iterator.sptep));
+
+	ret = fast_pf_fix_spte(vcpu, sp, iterator.sptep, spte);
+
+exit:
+	walk_shadow_page_lockless_end(vcpu);
+
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);

-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+			 gfn_t gfn, bool prefault)
 {
 	int r;
 	int level;
 	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, write = error_code & PFERR_WRITE_MASK;

 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
 	if (likely(!force_pt_level)) {
@@ -2711,6 +2810,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, v, gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

@@ -3099,7 +3201,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;

 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, prefault);
+			     error_code, gfn, prefault);
 }

 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3179,6 +3281,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, gpa, gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a703..80493fb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -617,6 +617,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}

+	if (fast_page_fault(vcpu, addr, walker.gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 07/10] KVM: MMU: lockless update spte on fast page fault path
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
                   ` (5 preceding siblings ...)
  2012-04-25  4:03 ` [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
@ 2012-04-25  4:04 ` Xiao Guangrong
  2012-04-25  4:04 ` [PATCH v4 08/10] KVM: MMU: trace fast page fault Xiao Guangrong
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:04 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

All the information we need on the fast page fault path is carried by the
SPTE_HOST_WRITEABLE bit, the SPTE_MMU_WRITEABLE bit and the SPTE_WRITE_PROTECT
bit of the spte; all of these bits can be protected by cmpxchg.

But we need to carefully handle the paths which depend on the spte.W bit;
this is documented in locking.txt in a later patch.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |  104 +++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 96a9d5b..15f01a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -446,6 +446,15 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif

+static bool spte_wp_by_dirty_log(u64 spte)
+{
+	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
+
+	WARN_ON(is_writable_pte(spte));
+
+	return ((spte & mask) == mask) && !(spte & SPTE_WRITE_PROTECT);
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
 	if (!shadow_accessed_mask)
@@ -454,9 +463,18 @@ static bool spte_has_volatile_bits(u64 spte)
 	if (!is_shadow_present_pte(spte))
 		return false;

-	if ((spte & shadow_accessed_mask) &&
-	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
-		return false;
+	if (spte & shadow_accessed_mask) {
+		if (is_writable_pte(spte))
+			return !(spte & shadow_dirty_mask);
+
+		/*
+		 * If the spte is write-protected by dirty-log, it may
+		 * be marked writable on fast page fault path, then CPU
+		 * can modify the Dirty bit.
+		 */
+		if (!spte_wp_by_dirty_log(spte))
+			return false;
+	}

 	return true;
 }
@@ -1043,15 +1061,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

-static bool spte_wp_by_dirty_log(u64 spte)
-{
-	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
-
-	WARN_ON(is_writable_pte(spte));
-
-	return ((spte & mask) == mask) && !(spte & SPTE_WRITE_PROTECT);
-}
-
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 			       bool *flush, bool page_table_protect)
@@ -1059,13 +1068,12 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 	u64 spte = *sptep;

 	if (is_writable_pte(spte)) {
-		*flush |= true;
-
 		if (large) {
 			pgprintk("rmap_write_protect(large): spte %p %llx\n",
 				 spte, *spte);
 			BUG_ON(!is_large_pte(spte));

+			*flush |= true;
 			drop_spte(kvm, sptep);
 			--kvm->stat.lpages;
 			return true;
@@ -1074,6 +1082,10 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 		goto reset_spte;
 	}

+	/*
+	 * We need flush tlbs in this case: the fast page fault path
+	 * can mark the spte writable after we read the sptep.
+	 */
 	if (page_table_protect && spte_wp_by_dirty_log(spte))
 		goto reset_spte;

@@ -1081,6 +1093,8 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,

 reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+
+	*flush |= true;
 	spte = spte & ~PT_WRITABLE_MASK;
 	if (page_table_protect)
 		spte |= SPTE_WRITE_PROTECT;
@@ -2699,24 +2713,60 @@ static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
 }

 static bool
-fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-		  u64 *sptep, u64 spte)
+fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			  u64 *sptep, u64 spte, gfn_t gfn)
 {
-	gfn_t gfn;
+	pfn_t pfn;
+	bool ret = false;

-	spin_lock(&vcpu->kvm->mmu_lock);
+	/*
+	 * For the indirect spte, it is hard to get a stable gfn since
+	 * we just use a cmpxchg to avoid all the races which is not
+	 * enough to avoid the ABA problem: the host can arbitrarily
+	 * change spte and the mapping from gfn to pfn.
+	 *
+	 * What we do is call gfn_to_pfn_atomic to bind the gfn and the
+	 * pfn because after the call:
+	 * - we have held the refcount of pfn that means the pfn can not
+	 *   be freed and be reused for another gfn.
+	 * - the pfn is writable that means it can not be shared by different
+	 *   gfn.
+	 */
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);

-	/* The spte has been changed. */
-	if (*sptep != spte)
+	/* The host page is swapped out or merged. */
+	if (mmu_invalid_pfn(pfn))
 		goto exit;

-	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	ret = true;
+
+	if (pfn != spte_to_pfn(spte))
+		goto exit;

-	*sptep = spte | PT_WRITABLE_MASK;
-	mark_page_dirty(vcpu->kvm, gfn);
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);

 exit:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return ret;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			u64 *sptep, u64 spte)
+{
+	gfn_t gfn;
+
+	WARN_ON(!sp->role.direct);
+
+	/*
+	 * The gfn of direct spte is stable since it is calculated
+	 * by sp->gfn.
+	 */
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);

 	return true;
 }
@@ -2774,7 +2824,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,

 	sp = page_header(__pa(iterator.sptep));

-	ret = fast_pf_fix_spte(vcpu, sp, iterator.sptep, spte);
+	if (sp->role.direct)
+		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
+	else
+		ret = fast_pf_fix_indirect_spte(vcpu, sp, iterator.sptep,
+						spte, gfn);

 exit:
 	walk_shadow_page_lockless_end(vcpu);
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 08/10] KVM: MMU: trace fast page fault
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
                   ` (6 preceding siblings ...)
  2012-04-25  4:04 ` [PATCH v4 07/10] KVM: MMU: lockless update spte on fast page fault path Xiao Guangrong
@ 2012-04-25  4:04 ` Xiao Guangrong
  2012-04-25  4:05 ` [PATCH v4 09/10] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
  2012-04-25  4:06 ` [PATCH v4 10/10] KVM: MMU: document mmu-lock and fast page fault Xiao Guangrong
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:04 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

To see what happens on this path and to help us optimize it.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c      |    2 ++
 arch/x86/kvm/mmutrace.h |   41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 15f01a3..f8d8375 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2831,6 +2831,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 						spte, gfn);

 exit:
+	trace_fast_page_fault(vcpu, gva, gfn, error_code, iterator.sptep,
+			      spte, ret);
 	walk_shadow_page_lockless_end(vcpu);

 	return ret;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e8..84da94f 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,6 +243,47 @@ TRACE_EVENT(
 	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
 		  __entry->access)
 );
+
+#define __spte_satisfied(__spte)				\
+	(__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+	fast_page_fault,
+	TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, u32 error_code,
+		 u64 *sptep, u64 old_spte, bool retry),
+	TP_ARGS(vcpu, gva, gfn, error_code, sptep, old_spte, retry),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(gva_t, gva)
+		__field(gfn_t, gfn)
+		__field(u32, error_code)
+		__field(u64 *, sptep)
+		__field(u64, old_spte)
+		__field(u64, new_spte)
+		__field(bool, retry)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->gva = gva;
+		__entry->gfn = gfn;
+		__entry->error_code = error_code;
+		__entry->sptep = sptep;
+		__entry->old_spte = old_spte;
+		__entry->new_spte = *sptep;
+		__entry->retry = retry;
+	),
+
+	TP_printk("vcpu %d gva %lx gfn %llx error_code %s sptep %p "
+		  "old %#llx new %llx spurious %d fixed %d", __entry->vcpu_id,
+		  __entry->gva, __entry->gfn,
+		  __print_flags(__entry->error_code, "|",
+				kvm_mmu_trace_pferr_flags),
+		  __entry->sptep, __entry->old_spte, __entry->new_spte,
+		  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+	)
+);
 #endif /* _TRACE_KVMMMU_H */

 #undef TRACE_INCLUDE_PATH
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 09/10] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
                   ` (7 preceding siblings ...)
  2012-04-25  4:04 ` [PATCH v4 08/10] KVM: MMU: trace fast page fault Xiao Guangrong
@ 2012-04-25  4:05 ` Xiao Guangrong
  2012-04-25  4:06 ` [PATCH v4 10/10] KVM: MMU: document mmu-lock and fast page fault Xiao Guangrong
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:05 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

The P bit of the page fault error code is missing from this tracepoint;
fix it by passing the full error code.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmutrace.h    |    7 +++----
 arch/x86/kvm/paging_tmpl.h |    3 +--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 84da94f..e762d35 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
  */
 TRACE_EVENT(
 	kvm_mmu_pagetable_walk,
-	TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
-	TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+	TP_PROTO(u64 addr, u32 pferr),
+	TP_ARGS(addr, pferr),

 	TP_STRUCT__entry(
 		__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(

 	TP_fast_assign(
 		__entry->addr = addr;
-		__entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
-		                 | (!!fetch_fault << 4);
+		__entry->pferr = pferr;
 	),

 	TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 80493fb..8e6aac9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	const int fetch_fault = access & PFERR_FETCH_MASK;
 	u16 errcode = 0;

-	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
-				     fetch_fault);
+	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
 	eperm = false;
 	walker->level = mmu->root_level;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH v4 10/10] KVM: MMU: document mmu-lock and fast page fault
  2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
                   ` (8 preceding siblings ...)
  2012-04-25  4:05 ` [PATCH v4 09/10] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
@ 2012-04-25  4:06 ` Xiao Guangrong
  9 siblings, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-25  4:06 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

Document fast page fault and mmu-lock in locking.txt

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 Documentation/virtual/kvm/locking.txt |  152 ++++++++++++++++++++++++++++++++-
 1 files changed, 151 insertions(+), 1 deletions(-)

diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 3b4cd3b..f2dbefb 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -6,7 +6,151 @@ KVM Lock Overview

 (to be written)

-2. Reference
+2: Exception
+------------
+
+Fast page fault:
+
+Fast page fault is the fast path which fixes the guest page fault out of
+the mmu-lock on x86. Currently, the page fault can be fast only if the
+shadow page table is present and the fault is caused by write protection;
+that means we just need to change the W bit of the spte.
+
+To avoid all the races we use the SPTE_HOST_WRITEABLE bit, the
+SPTE_MMU_WRITEABLE bit and the SPTE_WRITE_PROTECT bit on the spte:
+- SPTE_HOST_WRITEABLE means the gfn is writable on host.
+- SPTE_MMU_WRITEABLE means the gfn is writable on guest mmu.
+- SPTE_WRITE_PROTECT means the gfn is write-protected for shadow page
+  write protection.
+
+On the fast page fault path, we will use cmpxchg to atomically set the spte
+W bit if spte.SPTE_HOST_WRITEABLE = 1, spte.SPTE_MMU_WRITEABLE = 1 and
+spte.SPTE_WRITE_PROTECT = 0; this is safe because any change to these
+bits can be detected by cmpxchg.
+
+But we need to check these cases carefully:
+1): The mapping from gfn to pfn
+
+The mapping from gfn to pfn may be changed since we can only ensure the pfn
+is not changed during cmpxchg. This is an ABA problem; for example, the
+following case can happen:
+
+At the beginning:
+gpte = gfn1
+gfn1 is mapped to pfn1 on host
+spte is the shadow page table entry corresponding with gpte and
+spte = pfn1
+
+   VCPU 0                           VCPU 1
+on fast page fault path:
+
+   old_spte = *spte;
+                                 pfn1 is swapped out:
+                                    spte = 0;
+
+                                 pfn1 is re-alloced for gfn2.
+
+                                 gpte is changed to point to
+                                 gfn2 by the guest:
+                                    spte = pfn1;
+
+   if (cmpxchg(spte, old_spte, old_spte+W)
+	mark_page_dirty(vcpu->kvm, gfn1)
+             OOPS!!!
+
+We log the dirty bit for gfn1, which means gfn2 is lost from the dirty bitmap.
+
+For direct sp, we can easily avoid it since the spte of direct sp is fixed
+to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic()
+to pin gfn to pfn, because after gfn_to_pfn_atomic():
+- We hold a refcount on the pfn, which means the pfn can not be freed and
+  reused for another gfn.
+- The pfn is writable, which means it can not be shared between different gfns
+  by KSM.
+
+Then, we can ensure the dirty bitmap is correctly set for the gfn.
+
+2): flush tlbs due to shadow page table write-protected
+
+In rmap_write_protect(), we always need to flush TLBs if
+spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_MMU_WRITEABLE = 1,
+even if the current spte is read-only, because the fast page fault path
+may mark the spte writable and that writable spte can be cached in the TLB.
+Like below case:
+
+At the beginning:
+spte.W = 0
+spte.SPTE_WRITE_PROTECT = 0
+spte.SPTE_HOST_WRITEABLE = 1
+spte.SPTE_MMU_WRITEABLE = 1
+
+   VCPU 0                          VCPU 1
+In rmap_write_protect():
+
+   flush = false;
+
+   if (spte.W == 1)
+      flush = true;
+
+                               On fast page fault path:
+                                  old_spte = *spte
+                                  cmpxchg(spte, old_spte, old_spte + W)
+
+                               the spte is fetched/prefetched into
+                               tlb by CPU
+
+   spte = (spte | SPTE_WRITE_PROTECT) &
+                      ~PT_WRITABLE_MASK;
+
+   if (flush)
+         kvm_flush_remote_tlbs(vcpu->kvm)
+               OOPS!!!
+
+The TLBs are not flushed since the spte is read-only, but a stale writable
+spte has been cached in the TLBs by the fast page fault path.
+
+3): Dirty bit tracking
+In the original code, the spte can be updated fast (non-atomically) if the
+spte is read-only and the Accessed bit has already been set, since then the
+Accessed bit and Dirty bit can not be lost.
+
+But this is no longer true after fast page fault, since the spte can be
+marked writable between reading and updating the spte. Consider this case:
+
+At the beginning:
+spte.W = 0
+spte.Accessed = 1
+
+   VCPU 0                                       VCPU 1
+In mmu_spte_clear_track_bits():
+
+   old_spte = *spte;
+
+   /* 'if' condition is satisfied. */
+   if (old_spte.Accessed == 1 &&
+        old_spte.W == 0)
+      spte = 0ull;
+                                         on fast page fault path:
+                                             spte.W = 1
+                                         memory write on the spte:
+                                             spte.Dirty = 1
+
+
+   else
+      old_spte = xchg(spte, 0ull)
+
+
+   if (old_spte.Accessed == 1)
+      kvm_set_pfn_accessed(spte.pfn);
+   if (old_spte.Dirty == 1)
+      kvm_set_pfn_dirty(spte.pfn);
+      OOPS!!!
+
+The Dirty bit is lost in this case. We can call the slow path
+(__update_clear_spte_slow()) to update the spte if the spte can be changed
+by fast page fault.
+
+3. Reference
 ------------

 Name:		kvm_lock
@@ -23,3 +167,9 @@ Arch:		x86
 Protects:	- kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
 		- tsc offset in vmcb
 Comment:	'raw' because updating the tsc offsets must not be preempted.
+
+Name:		kvm->mmu_lock
+Type:		spinlock_t
+Arch:		any
+Protects:	-shadow page/shadow tlb entry
+Comment:	it is a spinlock since it will be used in mmu notifier.
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-25  4:03 ` [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
@ 2012-04-26 23:45   ` Marcelo Tosatti
  2012-04-27  5:53     ` Xiao Guangrong
  0 siblings, 1 reply; 31+ messages in thread
From: Marcelo Tosatti @ 2012-04-26 23:45 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Wed, Apr 25, 2012 at 12:03:48PM +0800, Xiao Guangrong wrote:
> If the present bit of the page fault error code is set, it indicates that
> the shadow page table is populated on all levels; all we need to do is
> modify the access bits, which can be done out of mmu-lock.
> 
> Currently, in order to simplify the code, we only fix page faults caused
> by write protection on the fast path.
> 
> To make review easier, we hold mmu-lock to update the spte in this patch;
> the concurrent (lockless) update will be introduced in a later patch.
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c         |  113 ++++++++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/paging_tmpl.h |    3 +
>  2 files changed, 112 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e7d8ffe..96a9d5b 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2683,18 +2683,117 @@ exit:
>  	return ret;
>  }
> 
> +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
> +				   u32 error_code)
> +{
> +	/*
> +	 * #PF can be fast only if the shadow page table is present and it
> +	 * is caused by write-protect, that means we just need change the
> +	 * W bit of the spte which can be done out of mmu-lock.
> +	 */
> +	if (!(error_code & PFERR_PRESENT_MASK) ||
> +	      !(error_code & PFERR_WRITE_MASK))
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool
> +fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> +		  u64 *sptep, u64 spte)
> +{
> +	gfn_t gfn;
> +
> +	spin_lock(&vcpu->kvm->mmu_lock);
> +
> +	/* The spte has been changed. */
> +	if (*sptep != spte)
> +		goto exit;
> +
> +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +
> +	*sptep = spte | PT_WRITABLE_MASK;
> +	mark_page_dirty(vcpu->kvm, gfn);
> +
> +exit:
> +	spin_unlock(&vcpu->kvm->mmu_lock);

There was a misunderstanding. The suggestion is to change codepaths that
today assume that a side effect of holding mmu_lock is that sptes are
not updated or read before attempting to introduce any lockless spte
updates.

example)

        u64 mask, old_spte = *sptep;

        if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
                __update_clear_spte_fast(sptep, new_spte);
        else
                old_spte = __update_clear_spte_slow(sptep, new_spte);

The local old_spte copy might be stale by the
time "spte_has_volatile_bits(old_spte)" reads the writable bit.

example)


VCPU0                                       VCPU1
set_spte
mmu_spte_update decides fast write 
mov newspte to sptep
(non-locked write instruction)
newspte in store buffer

                                            lockless spte read path reads stale value

spin_unlock(mmu_lock) 

Depending on what stale value CPU1 read and what decision it took
this can be a problem, say the new bits (and we do not want to verify
every single case). The spte write from inside mmu_lock should always be
atomic now?

So these details must be exposed to the reader, they are hidden now
(note mmu_spte_update is already a maze, it's getting too deep).
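
To make this concrete, something along these lines (only a sketch with an
invented name, not a patch):

	/*
	 * Once lockless readers and a lockless cmpxchg exist, the spte
	 * write done under mmu_lock should itself be a locked instruction:
	 * the new value becomes globally visible when the instruction
	 * completes (it cannot sit in the store buffer like a plain mov),
	 * and we learn the exact value we replaced instead of trusting an
	 * earlier local copy.
	 */
	static u64 example_spte_write(u64 *sptep, u64 new_spte)
	{
		return xchg(sptep, new_spte);
	}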

> +
> +	return true;
> +}
> +
> +/*
> + * Return value:
> + * - true: let the vcpu to access on the same address again.
> + * - false: let the real page fault path to fix it.
> + */
> +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
> +			    int level, u32 error_code)
> +{
> +	struct kvm_shadow_walk_iterator iterator;
> +	struct kvm_mmu_page *sp;
> +	bool ret = false;
> +	u64 spte = 0ull;
> +
> +	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
> +		return false;
> +
> +	walk_shadow_page_lockless_begin(vcpu);
> +	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
> +		if (!is_shadow_present_pte(spte) || iterator.level < level)
> +			break;
> +
> +	/*
> +	 * If the mapping has been changed, let the vcpu fault on the
> +	 * same address again.
> +	 */
> +	if (!is_rmap_spte(spte)) {
> +		ret = true;
> +		goto exit;
> +	}
> +
> +	if (!is_last_spte(spte, level))
> +		goto exit;
> +
> +	/*
> +	 * Check if it is a spurious fault caused by TLB lazily flushed.
> +	 *
> +	 * Need not check the access of upper level table entries since
> +	 * they are always ACC_ALL.
> +	 */
> +	 if (is_writable_pte(spte)) {
> +		ret = true;
> +		goto exit;
> +	}
> +
> +	/*
> +	 * Currently, to simplify the code, only the spte write-protected
> +	 * by dirty-log can be fast fixed.
> +	 */
> +	if (!spte_wp_by_dirty_log(spte))
> +		goto exit;
> +
> +	sp = page_header(__pa(iterator.sptep));
> +
> +	ret = fast_pf_fix_spte(vcpu, sp, iterator.sptep, spte);
> +
> +exit:
> +	walk_shadow_page_lockless_end(vcpu);
> +
> +	return ret;
> +}
> +
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
> 
> -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
> -			 bool prefault)
> +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
> +			 gfn_t gfn, bool prefault)
>  {
>  	int r;
>  	int level;
>  	int force_pt_level;
>  	pfn_t pfn;
>  	unsigned long mmu_seq;
> -	bool map_writable;
> +	bool map_writable, write = error_code & PFERR_WRITE_MASK;
> 
>  	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
>  	if (likely(!force_pt_level)) {
> @@ -2711,6 +2810,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
>  	} else
>  		level = PT_PAGE_TABLE_LEVEL;
> 
> +	if (fast_page_fault(vcpu, v, gfn, level, error_code))
> +		return 0;
> +
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
> @@ -3099,7 +3201,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>  	gfn = gva >> PAGE_SHIFT;
> 
>  	return nonpaging_map(vcpu, gva & PAGE_MASK,
> -			     error_code & PFERR_WRITE_MASK, gfn, prefault);
> +			     error_code, gfn, prefault);
>  }
> 
>  static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> @@ -3179,6 +3281,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
>  	} else
>  		level = PT_PAGE_TABLE_LEVEL;
> 
> +	if (fast_page_fault(vcpu, gpa, gfn, level, error_code))
> +		return 0;
> +
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index df5a703..80493fb 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -617,6 +617,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>  		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
>  	}
> 
> +	if (fast_page_fault(vcpu, addr, walker.gfn, level, error_code))
> +		return 0;
> +
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
> -- 
> 1.7.7.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-26 23:45   ` Marcelo Tosatti
@ 2012-04-27  5:53     ` Xiao Guangrong
  2012-04-27 14:52       ` Marcelo Tosatti
  0 siblings, 1 reply; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-27  5:53 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, LKML, KVM

On 04/27/2012 07:45 AM, Marcelo Tosatti wrote:


>> +static bool
>> +fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>> +		  u64 *sptep, u64 spte)
>> +{
>> +	gfn_t gfn;
>> +
>> +	spin_lock(&vcpu->kvm->mmu_lock);
>> +
>> +	/* The spte has been changed. */
>> +	if (*sptep != spte)
>> +		goto exit;
>> +
>> +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
>> +
>> +	*sptep = spte | PT_WRITABLE_MASK;
>> +	mark_page_dirty(vcpu->kvm, gfn);
>> +
>> +exit:
>> +	spin_unlock(&vcpu->kvm->mmu_lock);
> 
> There was a misunderstanding. 


Sorry, Marcelo! It is still not clear for me now. :(

> The suggestion is to change codepaths that
> today assume that a side effect of holding mmu_lock is that sptes are
> not updated or read before attempting to introduce any lockless spte
> updates.
> 
> example)
> 
>         u64 mask, old_spte = *sptep;
> 
>         if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
>                 __update_clear_spte_fast(sptep, new_spte);
>         else
>                 old_spte = __update_clear_spte_slow(sptep, new_spte);
> 
> The local old_spte copy might be stale by the
> time "spte_has_volatile_bits(old_spte)" reads the writable bit.
> 
> example)
> 
> 
> VCPU0                                       VCPU1
> set_spte
> mmu_spte_update decides fast write 
> mov newspte to sptep
> (non-locked write instruction)
> newspte in store buffer
> 
>                                             lockless spte read path reads stale value
> 
> spin_unlock(mmu_lock) 
> 
> Depending on what stale value CPU1 read and what decision it took
> this can be a problem, say the new bits (and we do not want to verify
> every single case). The spte write from inside mmu_lock should always be
> atomic now?
> 
> So these details must be exposed to the reader, they are hidden now
> (note mmu_spte_update is already a maze, it's getting too deep).
> 


Actually, in this patch, all the spte update is under mmu-lock, and we
lockless-ly read spte , but the spte will be verified again after holding
mmu-lock.

+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	/* The spte has been changed. */
+	if (*sptep != spte)
+		goto exit;
+
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	*sptep = spte | PT_WRITABLE_MASK;
+	mark_page_dirty(vcpu->kvm, gfn);
+
+exit:
+	spin_unlock(&vcpu->kvm->mmu_lock);

Is not the same as both read/update spte under mmu-lock?

Hmm, this is what you want?

+static bool
+fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+		  u64 *sptep, u64 spte)
+{
+	gfn_t gfn;
+
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	mmu_spte_update(sptep, spte | PT_WRITABLE_MASK);
+	mark_page_dirty(vcpu->kvm, gfn);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+			    int level, u32 error_code)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
+		return false;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	for_each_shadow_entry(vcpu, gva, iterator) {
+		spte = *iterator.sptep;
+
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+	}
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	if (!is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * Check if it is a spurious fault caused by TLB lazily flushed.
+	 *
+	 * Need not check the access of upper level table entries since
+	 * they are always ACC_ALL.
+	 */
+	 if (is_writable_pte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * Currently, to simplify the code, only the spte write-protected
+	 * by dirty-log can be fast fixed.
+	 */
+	if (!spte_wp_by_dirty_log(spte))
+		goto exit;
+
+	sp = page_header(__pa(iterator.sptep));
+
+	ret = fast_pf_fix_spte(vcpu, sp, iterator.sptep, spte);
+
+exit:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	return ret;
+}


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-27  5:53     ` Xiao Guangrong
@ 2012-04-27 14:52       ` Marcelo Tosatti
  2012-04-28  6:10         ` Xiao Guangrong
  2012-04-29  8:50         ` Takuya Yoshikawa
  0 siblings, 2 replies; 31+ messages in thread
From: Marcelo Tosatti @ 2012-04-27 14:52 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Fri, Apr 27, 2012 at 01:53:09PM +0800, Xiao Guangrong wrote:
> On 04/27/2012 07:45 AM, Marcelo Tosatti wrote:
> 
> 
> >> +static bool
> >> +fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >> +		  u64 *sptep, u64 spte)
> >> +{
> >> +	gfn_t gfn;
> >> +
> >> +	spin_lock(&vcpu->kvm->mmu_lock);
> >> +
> >> +	/* The spte has been changed. */
> >> +	if (*sptep != spte)
> >> +		goto exit;
> >> +
> >> +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> >> +
> >> +	*sptep = spte | PT_WRITABLE_MASK;
> >> +	mark_page_dirty(vcpu->kvm, gfn);
> >> +
> >> +exit:
> >> +	spin_unlock(&vcpu->kvm->mmu_lock);
> > 
> > There was a misunderstanding. 
> 
> 
> Sorry, Marcelo! It is still not clear for me now. :(
> 
> > The suggestion is to change codepaths that
> > today assume that a side effect of holding mmu_lock is that sptes are
> > not updated or read before attempting to introduce any lockless spte
> > updates.
> > 
> > example)
> > 
> >         u64 mask, old_spte = *sptep;
> > 
> >         if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
> >                 __update_clear_spte_fast(sptep, new_spte);
> >         else
> >                 old_spte = __update_clear_spte_slow(sptep, new_spte);
> > 
> > The local old_spte copy might be stale by the
> > time "spte_has_volatile_bits(old_spte)" reads the writable bit.
> > 
> > example)
> > 
> > 
> > VCPU0                                       VCPU1
> > set_spte
> > mmu_spte_update decides fast write 
> > mov newspte to sptep
> > (non-locked write instruction)
> > newspte in store buffer
> > 
> >                                             lockless spte read path reads stale value
> > 
> > spin_unlock(mmu_lock) 
> > 
> > Depending on what stale value CPU1 read and what decision it took
> > this can be a problem, say the new bits (and we do not want to verify
> > every single case). The spte write from inside mmu_lock should always be
> > atomic now?
> > 
> > So these details must be exposed to the reader, they are hidden now
> > (note mmu_spte_update is already a maze, it's getting too deep).
> > 
> 
> 
> Actually, in this patch, all the spte update is under mmu-lock, and we
> lockless-ly read spte , but the spte will be verified again after holding
> mmu-lock.

Yes but the objective you are aiming for is to read and write sptes
without mmu_lock. That is, i am not talking about this patch. 
Please read carefully the two examples i gave (separated by "example)").

> +	spin_lock(&vcpu->kvm->mmu_lock);
> +
> +	/* The spte has been changed. */
> +	if (*sptep != spte)
> +		goto exit;
> +
> +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +
> +	*sptep = spte | PT_WRITABLE_MASK;
> +	mark_page_dirty(vcpu->kvm, gfn);
> +
> +exit:
> +	spin_unlock(&vcpu->kvm->mmu_lock);
> 
> Is not the same as both read/update spte under mmu-lock?
> 
> Hmm, this is what you want?

The rules for code under mmu_lock should be:

1) Spte updates under mmu lock must always be atomic and 
with locked instructions.
2) Spte values must be read once, and appropriate action
must be taken when writing them back in case their value 
has changed (remote TLB flush might be required).
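
For instance, an update that follows both rules could look roughly like
this (a sketch only, with an invented name; not meant as the final
mmu_spte_update):

	static void example_update_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
	{
		/* rule 1: a locked write that returns the replaced value */
		u64 old_spte = xchg(sptep, new_spte);

		/*
		 * rule 2: act on old_spte rather than on a copy read
		 * earlier; if a writable spte may still be cached in some
		 * TLB, a remote flush is needed.
		 */
		if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
			kvm_flush_remote_tlbs(kvm);
	}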

The maintenance of:

- gpte writable bit 
- protected by dirty log

Bits is tricky. We should think of a way to simplify things 
and get rid of them (or at least one of them), if possible.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-27 14:52       ` Marcelo Tosatti
@ 2012-04-28  6:10         ` Xiao Guangrong
  2012-05-01  1:34           ` Marcelo Tosatti
  2012-04-29  8:50         ` Takuya Yoshikawa
  1 sibling, 1 reply; 31+ messages in thread
From: Xiao Guangrong @ 2012-04-28  6:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, LKML, KVM

On 04/27/2012 10:52 PM, Marcelo Tosatti wrote:


>> Actually, in this patch, all the spte update is under mmu-lock, and we
>> lockless-ly read spte , but the spte will be verified again after holding
>> mmu-lock.
> 
> Yes but the objective you are aiming for is to read and write sptes
> without mmu_lock. That is, i am not talking about this patch. 
> Please read carefully the two examples i gave (separated by "example)").
> 


Thanks for your patience, Marcelo!

>> +	spin_lock(&vcpu->kvm->mmu_lock);
>> +
>> +	/* The spte has been changed. */
>> +	if (*sptep != spte)
>> +		goto exit;
>> +
>> +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
>> +
>> +	*sptep = spte | PT_WRITABLE_MASK;
>> +	mark_page_dirty(vcpu->kvm, gfn);
>> +
>> +exit:
>> +	spin_unlock(&vcpu->kvm->mmu_lock);
>>
>> Is not the same as both read/update spte under mmu-lock?
>>
>> Hmm, this is what you want?
> 
> The rules for code under mmu_lock should be:
> 
> 1) Spte updates under mmu lock must always be atomic and 
> with locked instructions.


How about treating the spte as 'volatile' if the spte can be
updated out of mmu-lock? In this case, the update is always
atomic.

The piece of code:

+static bool spte_can_be_writable(u64 spte)
+{
+	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
+
+	return (spte & mask) == mask;
+}
+
+static bool spte_can_lockless_update(u64 spte)
+{
+	return  !is_writable_pte(spte) && spte_can_be_writable(spte);
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
+	/*
+	 * Always atomicly update spte if it can be updated
+	 * out of mmu-lock.
+	 */
+	if (spte_can_lockless_update(spte))
+		return true;
+

> 2) Spte values must be read once, and appropriate action
> must be taken when writing them back in case their value 
> has changed (remote TLB flush might be required).
> 


Okay, maybe I get your idea now. :)

I will fix mmu_spte_update to return the latest old value, which will be
checked in the caller before it is updated.
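
Roughly, a write-protect caller could then look like this (a sketch only,
not the real spte_write_protect):

	static void example_wp_spte(u64 *sptep, bool *flush)
	{
		u64 spte = *sptep;
		u64 old_spte = mmu_spte_update(sptep, spte & ~PT_WRITABLE_MASK);

		/*
		 * The fast page fault path may have set the W bit after
		 * 'spte' was read above, so the flush decision must be
		 * based on the value that was really replaced.
		 */
		if (is_writable_pte(old_spte))
			*flush = true;
	}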

> The maintenance of:
> 
> - gpte writable bit 
> - protected by dirty log
> 
> Bits is tricky. We should think of a way to simplify things 
> and get rid of them (or at least one of them), if possible.
> 

Maybe SPTE_MMU_WRITEABLE is sufficient; the second bit will be dropped.

Marcelo, are you satisfied with this patch?

[PATCH 6/6] KVM: MMU: fast path of handling guest page fault

If the present bit of the page fault error code is set, it indicates
that the shadow page table is populated on all levels; it means all we
need to do is modify the access bit, which can be done out of mmu-lock.

Currently, in order to simplify the code, we only fix page faults
caused by write-protection on the fast path.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c         |  259 +++++++++++++++++++++++++++++++++++---------
 arch/x86/kvm/paging_tmpl.h |    3 +
 2 files changed, 209 insertions(+), 53 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e7d8ffe..2c248b5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -446,8 +446,27 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif

+static bool spte_can_be_writable(u64 spte)
+{
+	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
+
+	return (spte & mask) == mask;
+}
+
+static bool spte_can_lockless_update(u64 spte)
+{
+	return  !is_writable_pte(spte) && spte_can_be_writable(spte);
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
+	/*
+	 * Always atomically update the spte if it can be updated
+	 * out of mmu-lock.
+	 */
+	if (spte_can_lockless_update(spte))
+		return true;
+
 	if (!shadow_accessed_mask)
 		return false;

@@ -480,34 +499,38 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)

 /* Rules for using mmu_spte_update:
  * Update the state bits, it means the mapped pfn is not changged.
+ *
+ * Note: it returns the latest old value before it is updated. If
+ * the caller depends on the old spte value, it should use the return
+ * value to check, since the spte may be changed on the fast page fault
+ * path, which is out of mmu-lock.
  */
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static u64 mmu_spte_update(u64 *sptep, u64 new_spte)
 {
-	u64 mask, old_spte = *sptep;
+	u64 old_spte = *sptep;

 	WARN_ON(!is_rmap_spte(new_spte));

-	if (!is_shadow_present_pte(old_spte))
-		return mmu_spte_set(sptep, new_spte);
-
-	new_spte |= old_spte & shadow_dirty_mask;
-
-	mask = shadow_accessed_mask;
-	if (is_writable_pte(old_spte))
-		mask |= shadow_dirty_mask;
+	if (!is_shadow_present_pte(old_spte)) {
+		mmu_spte_set(sptep, new_spte);
+		goto exit;
+	}

-	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+	if (!spte_has_volatile_bits(old_spte))
 		__update_clear_spte_fast(sptep, new_spte);
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);

 	if (!shadow_accessed_mask)
-		return;
+		goto exit;

 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+exit:
+	return old_spte;
 }

 /*
@@ -1043,48 +1066,37 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

-static bool spte_wp_by_dirty_log(u64 spte)
-{
-	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
-
-	WARN_ON(is_writable_pte(spte));
-
-	return ((spte & mask) == mask) && !(spte & SPTE_WRITE_PROTECT);
-}
-
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 			       bool *flush, bool page_table_protect)
 {
 	u64 spte = *sptep;

-	if (is_writable_pte(spte)) {
-		*flush |= true;
+	if (!spte_can_be_writable(spte))
+		return false;

-		if (large) {
-			pgprintk("rmap_write_protect(large): spte %p %llx\n",
-				 spte, *spte);
-			BUG_ON(!is_large_pte(spte));
+	if (large) {
+		pgprintk("rmap_write_protect(large): spte %p %llx\n",
+			 spte, *spte);
+		BUG_ON(!is_large_pte(spte));

-			drop_spte(kvm, sptep);
-			--kvm->stat.lpages;
-			return true;
-		}
-
-		goto reset_spte;
+		*flush |= true;
+		drop_spte(kvm, sptep);
+		--kvm->stat.lpages;
+		return true;
 	}

-	if (page_table_protect && spte_wp_by_dirty_log(spte))
-		goto reset_spte;
-
-	return false;
-
-reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-	spte = spte & ~PT_WRITABLE_MASK;
+
 	if (page_table_protect)
-		spte |= SPTE_WRITE_PROTECT;
-	mmu_spte_update(sptep, spte);
+		spte &= ~SPTE_MMU_WRITEABLE;
+	spte &= ~PT_WRITABLE_MASK;
+
+	spte = mmu_spte_update(sptep, spte);
+
+	if (is_writable_pte(spte))
+		*flush |= true;
+
 	return false;
 }

@@ -2313,9 +2325,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (pte_access & ACC_USER_MASK)
 		spte |= shadow_user_mask;

-	if (pte_access & ACC_WRITE_MASK)
-		spte |= SPTE_MMU_WRITEABLE;
-
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
@@ -2340,7 +2349,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			goto done;
 		}

-		spte |= PT_WRITABLE_MASK;
+		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

 		if (!vcpu->arch.mmu.direct_map
 		    && !(pte_access & ACC_WRITE_MASK)) {
@@ -2369,8 +2378,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			spte &= ~PT_WRITABLE_MASK;
-			spte |= SPTE_WRITE_PROTECT;
+			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
 		}
 	}

@@ -2378,7 +2386,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);

 set_pte:
-	mmu_spte_update(sptep, spte);
+	entry = mmu_spte_update(sptep, spte);
 	/*
 	 * If we overwrite a writable spte with a read-only one we
 	 * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2683,18 +2691,157 @@ exit:
 	return ret;
 }

+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
+				   u32 error_code)
+{
+	/*
+	 * #PF can be fast only if the shadow page table is present and it
+	 * is caused by write-protect, which means we just need to change the
+	 * W bit of the spte, and that can be done out of mmu-lock.
+	 */
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	return true;
+}
+
+static bool
+fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			  u64 *sptep, u64 spte, gfn_t gfn)
+{
+	pfn_t pfn;
+	bool ret = false;
+
+	/*
+	 * For an indirect spte, it is hard to get a stable gfn: we just
+	 * use cmpxchg to avoid the races, which is not enough to avoid
+	 * the ABA problem, since the host can arbitrarily change the spte
+	 * and the mapping from gfn to pfn.
+	 *
+	 * What we do is call gfn_to_pfn_atomic to bind the gfn and the
+	 * pfn, because after the call:
+	 * - we hold a refcount on the pfn, which means the pfn cannot
+	 *   be freed and reused for another gfn.
+	 * - the pfn is writable, which means it cannot be shared by
+	 *   different gfns.
+	 */
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+	/* The host page is swapped out or merged. */
+	if (mmu_invalid_pfn(pfn))
+		goto exit;
+
+	ret = true;
+
+	if (pfn != spte_to_pfn(spte))
+		goto exit;
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+exit:
+	kvm_release_pfn_clean(pfn);
+	return ret;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			u64 *sptep, u64 spte)
+{
+	gfn_t gfn;
+
+	WARN_ON(!sp->role.direct);
+
+	/*
+	 * The gfn of direct spte is stable since it is calculated
+	 * by sp->gfn.
+	 */
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+			    int level, u32 error_code)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
+		return false;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	if (!is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * Check if it is a spurious fault caused by a lazily flushed TLB.
+	 *
+	 * There is no need to check the access of upper level table entries
+	 * since they are always ACC_ALL.
+	 */
+	 if (is_writable_pte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * Currently, to simplify the code, only sptes write-protected
+	 * by dirty-log can be fixed on the fast path.
+	 */
+	if (!spte_can_be_writable(spte))
+		goto exit;
+
+	sp = page_header(__pa(iterator.sptep));
+
+	if (sp->role.direct)
+		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
+	else
+		ret = fast_pf_fix_indirect_spte(vcpu, sp, iterator.sptep,
+						spte, gfn);
+
+exit:
+	walk_shadow_page_lockless_end(vcpu);
+
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);

-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+			 gfn_t gfn, bool prefault)
 {
 	int r;
 	int level;
 	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, write = error_code & PFERR_WRITE_MASK;

 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
 	if (likely(!force_pt_level)) {
@@ -2711,6 +2858,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, v, gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

@@ -3099,7 +3249,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;

 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, prefault);
+			     error_code, gfn, prefault);
 }

 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3179,6 +3329,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, gpa, gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a703..80493fb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -617,6 +617,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}

+	if (fast_page_fault(vcpu, addr, walker.gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

-- 
1.7.7.6



^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-27 14:52       ` Marcelo Tosatti
  2012-04-28  6:10         ` Xiao Guangrong
@ 2012-04-29  8:50         ` Takuya Yoshikawa
  2012-05-01  2:31           ` Marcelo Tosatti
  2012-05-02  5:39           ` Xiao Guangrong
  1 sibling, 2 replies; 31+ messages in thread
From: Takuya Yoshikawa @ 2012-04-29  8:50 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Fri, 27 Apr 2012 11:52:13 -0300
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> Yes but the objective you are aiming for is to read and write sptes
> without mmu_lock. That is, i am not talking about this patch. 
> Please read carefully the two examples i gave (separated by "example)").

The real objective is still not clear.

The ~10% improvement reported before was on macro benchmarks during live
migration.  At least, that optimization was the initial objective.

But at some point, the objective suddenly changed to "lock-less" without
understanding what introduced the original improvement.

Was the problem really mmu_lock contention?

If the path being introduced by this patch is really fast, isn't it
possible to achieve the same improvement while still using mmu_lock?


Note: During live migration, the fact that the guest takes faults is
itself a limitation.  We could easily see a noticeable slowdown of a
program even if it runs only between two GET_DIRTY_LOGs.


> The rules for code under mmu_lock should be:
> 
> 1) Spte updates under mmu lock must always be atomic and 
> with locked instructions.
> 2) Spte values must be read once, and appropriate action
> must be taken when writing them back in case their value 
> has changed (remote TLB flush might be required).

Although I am not certain about what will really be needed in the
final form, if this kind of maybe-needed overhead is going to be
added little by little, I worry about possible regressions.

Thanks,
	Takuya

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-28  6:10         ` Xiao Guangrong
@ 2012-05-01  1:34           ` Marcelo Tosatti
  2012-05-02  5:28             ` Xiao Guangrong
  0 siblings, 1 reply; 31+ messages in thread
From: Marcelo Tosatti @ 2012-05-01  1:34 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Sat, Apr 28, 2012 at 02:10:33PM +0800, Xiao Guangrong wrote:
> On 04/27/2012 10:52 PM, Marcelo Tosatti wrote:
> 
> 
> >> Actually, in this patch, all the spte update is under mmu-lock, and we
> >> lockless-ly read spte , but the spte will be verified again after holding
> >> mmu-lock.
> > 
> > Yes but the objective you are aiming for is to read and write sptes
> > without mmu_lock. That is, i am not talking about this patch. 
> > Please read carefully the two examples i gave (separated by "example)").
> > 
> 
> 
> Thanks for your patience, Marcelo!
> 
> >> +	spin_lock(&vcpu->kvm->mmu_lock);
> >> +
> >> +	/* The spte has been changed. */
> >> +	if (*sptep != spte)
> >> +		goto exit;
> >> +
> >> +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> >> +
> >> +	*sptep = spte | PT_WRITABLE_MASK;
> >> +	mark_page_dirty(vcpu->kvm, gfn);
> >> +
> >> +exit:
> >> +	spin_unlock(&vcpu->kvm->mmu_lock);
> >>
> >> Is not the same as both read/update spte under mmu-lock?
> >>
> >> Hmm, this is what you want?
> > 
> > The rules for code under mmu_lock should be:
> > 
> > 1) Spte updates under mmu lock must always be atomic and 
> > with locked instructions.
> 
> 
> How about treating the spte is 'volatile' if the spte can be
> updated out of mmu-lock? In this case, the update is always
> atomic.
> 
> The piece of code:
> 
> +static bool spte_can_be_writable(u64 spte)
> +{
> +	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
> +
> +	return (spte & mask) == mask;
> +}
> +
> +static bool spte_can_lockless_update(u64 spte)
> +{
> +	return  !is_writable_pte(spte) && spte_can_be_writable(spte);
> +}
> +
>  static bool spte_has_volatile_bits(u64 spte)
>  {
> +	/*
> +	 * Always atomicly update spte if it can be updated
> +	 * out of mmu-lock.
> +	 */
> +	if (spte_can_lockless_update(spte))
> +		return true;
> +
> 
> > 2) Spte values must be read once, and appropriate action
> > must be taken when writing them back in case their value 
> > has changed (remote TLB flush might be required).
> > 
> 
> 
> Okay, may be i get your idea now. :)
> 
> I will fix mmu_spte_update, let it to return the latest old value which
> will be checked in the caller before it is updated.
> 
> > The maintenance of:
> > 
> > - gpte writable bit 
> > - protected by dirty log
> > 
> > Bits is tricky. We should think of a way to simplify things 
> > and get rid of them (or at least one of them), if possible.
> > 
> 
> Maybe SPTE_MMU_WRITEABLE is sufficient, the second bit will be dropped.
> 
> Marcelo, do you satisfied with this patch?

It is getting better, but not yet: there are still reads of sptep
scattered all over (as mentioned before, I think a pattern of read the
spte once, work on top of that, atomically write, and then deal with the
results _everywhere_ (where the mmu lock is held) is more consistent).

        /*
         * If we overwrite a writable spte with a read-only one we
         * should flush remote TLBs. Otherwise rmap_write_protect
         * will find a read-only spte, even though the writable spte
         * might be cached on a CPU's TLB.
         */
        if (is_writable_pte(entry) && !is_writable_pte(*sptep))
                kvm_flush_remote_tlbs(vcpu->kvm);

This is obviously inconsistent with the above.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-29  8:50         ` Takuya Yoshikawa
@ 2012-05-01  2:31           ` Marcelo Tosatti
  2012-05-02  5:39           ` Xiao Guangrong
  1 sibling, 0 replies; 31+ messages in thread
From: Marcelo Tosatti @ 2012-05-01  2:31 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Sun, Apr 29, 2012 at 05:50:04PM +0900, Takuya Yoshikawa wrote:
> On Fri, 27 Apr 2012 11:52:13 -0300
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > Yes but the objective you are aiming for is to read and write sptes
> > without mmu_lock. That is, i am not talking about this patch. 
> > Please read carefully the two examples i gave (separated by "example)").
> 
> The real objective is not still clear.
> 
> The ~10% improvement reported before was on macro benchmarks during live
> migration.  At least, that optimization was the initial objective.
> 
> But at some point, the objective suddenly changed to "lock-less" without
> understanding what introduced the original improvement.
> 
> Was the problem really mmu_lock contention?
> 
> If the path being introduced by this patch is really fast, isn't it
> possible to achieve the same improvement still using mmu_lock?

Right. Supposedly, mmu_lock cacheline bouncing is the problem. Hum:

$ pahole -C "kvm" /tmp/kvm.ko 
struct kvm {
	spinlock_t                 mmu_lock;             /*     0     2 */

	/* XXX 6 bytes hole, try to pack */

	struct mutex               slots_lock;           /*     8    32 */
	struct mm_struct *         mm;                   /*    40     8 */
	struct kvm_memslots *      memslots;             /*    48     8 */
	struct srcu_struct         srcu;                 /*    56    48 */
	/* --- cacheline 1 boundary (64 bytes) was 40 bytes ago --- */
	u32                        bsp_vcpu_id;          /*   104     4 */

	/* XXX 4 bytes hole, try to pack */

Oops. False sharing?
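
(To make that experiment concrete -- a sketch only, the layout below is
illustrative and not the real struct kvm -- one could give the lock its own
cache line with ____cacheline_aligned_in_smp from <linux/cache.h> and re-run
the benchmark:)

struct kvm_layout_sketch {
	/* writers bouncing mmu_lock no longer invalidate the cache line
	 * that holds the read-mostly memslots pointer */
	spinlock_t mmu_lock ____cacheline_aligned_in_smp;
	struct kvm_memslots *memslots ____cacheline_aligned_in_smp;
};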

> Note: During live migration, the fact that the guest gets faulted is
> itself a limitation.  We could easily see noticeable slowdown of a
> program even if it runs only between two GET_DIRTY_LOGs.
> 
> 
> > The rules for code under mmu_lock should be:
> > 
> > 1) Spte updates under mmu lock must always be atomic and 
> > with locked instructions.
> > 2) Spte values must be read once, and appropriate action
> > must be taken when writing them back in case their value
> > has changed (remote TLB flush might be required).
> 
> Although I am not certain about what will be really needed in the
> final form, if this kind of maybe-needed-overhead is going to be
> added little by little, I worry about possible regression.
> 
> Thanks,
> 	Takuya

Yes, that is a possibility.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-01  1:34           ` Marcelo Tosatti
@ 2012-05-02  5:28             ` Xiao Guangrong
  2012-05-02 21:07               ` Marcelo Tosatti
  0 siblings, 1 reply; 31+ messages in thread
From: Xiao Guangrong @ 2012-05-02  5:28 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, LKML, KVM

On 05/01/2012 09:34 AM, Marcelo Tosatti wrote:


> 
> It is getting better, but not yet, there are still reads of sptep
> scattered all over (as mentioned before, i think a pattern of read spte
> once, work on top of that, atomically write and then deal with results
> _everywhere_ (where mmu lock is held) is more consistent.
> 


But we only need to care about the paths which depend on is_writable_pte(), no?

So, the places that call is_writable_pte() are spte_has_volatile_bits(),
spte_write_protect() and set_spte().

I have changed these functions:
In spte_has_volatile_bits():
 static bool spte_has_volatile_bits(u64 spte)
 {
+	/*
+	 * Always atomically update the spte if it can be updated
+	 * out of mmu-lock.
+	 */
+	if (spte_can_lockless_update(spte))
+		return true;
+

In spte_write_protect():

+	spte = mmu_spte_update(sptep, spte);
+
+	if (is_writable_pte(spte))
+		*flush |= true;
+
The 'spte' comes from an atomic read-write (xchg).

in set_spte():
 set_pte:
-	mmu_spte_update(sptep, spte);
+	entry = mmu_spte_update(sptep, spte);
 	/*
 	 * If we overwrite a writable spte with a read-only one we
 	 * should flush remote TLBs. Otherwise rmap_write_protect
The 'entry' is also the latest value.

>         /*
>          * If we overwrite a writable spte with a read-only one we
>          * should flush remote TLBs. Otherwise rmap_write_protect
>          * will find a read-only spte, even though the writable spte
>          * might be cached on a CPU's TLB.
>          */
>         if (is_writable_pte(entry) && !is_writable_pte(*sptep))
>                 kvm_flush_remote_tlbs(vcpu->kvm);
> 
> This is inconsistent with the above obviously.
> 


'entry' is not a problem since it comes from an atomic read-write as
mentioned above; I need to change this code to:

		/*
		 * Optimization: for pte sync, if spte was writable the hash
		 * lookup is unnecessary (and expensive). Write protection
		 * is responsibility of mmu_get_page / kvm_sync_page.
		 * Same reasoning can be applied to dirty page accounting.
		 */
		if (!can_unsync && is_writable_pte(entry)) /* Use 'entry' instead of '*sptep'. */
			goto set_pte;
   ......


         if (is_writable_pte(entry) && !is_writable_pte(spte)) /* Use 'spte' instead of '*sptep'. */
                 kvm_flush_remote_tlbs(vcpu->kvm);


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-04-29  8:50         ` Takuya Yoshikawa
  2012-05-01  2:31           ` Marcelo Tosatti
@ 2012-05-02  5:39           ` Xiao Guangrong
  2012-05-02 21:10             ` Marcelo Tosatti
  2012-05-03  0:15             ` Takuya Yoshikawa
  1 sibling, 2 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-05-02  5:39 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Marcelo Tosatti, Avi Kivity, LKML, KVM

On 04/29/2012 04:50 PM, Takuya Yoshikawa wrote:

> On Fri, 27 Apr 2012 11:52:13 -0300
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
>> Yes but the objective you are aiming for is to read and write sptes
>> without mmu_lock. That is, i am not talking about this patch. 
>> Please read carefully the two examples i gave (separated by "example)").
> 
> The real objective is not still clear.
> 
> The ~10% improvement reported before was on macro benchmarks during live
> migration.  At least, that optimization was the initial objective.
> 
> But at some point, the objective suddenly changed to "lock-less" without
> understanding what introduced the original improvement.
> 
> Was the problem really mmu_lock contention?
> 


Takuya, I am so tired of arguing the advantages of lockless write-protect
and lockless O(1) dirty-log again and again.

> If the path being introduced by this patch is really fast, isn't it
> possible to achieve the same improvement still using mmu_lock?
> 
> 
> Note: During live migration, the fact that the guest gets faulted is
> itself a limitation.  We could easily see noticeable slowdown of a
> program even if it runs only between two GET_DIRTY_LOGs.
> 


Obviously no.

It depends on what the guest is doing. From my autotest run, it is very
easy to see that the huge improvement is on bench-migration, not
pure-migration.

> 
>> The rules for code under mmu_lock should be:
>>
>> 1) Spte updates under mmu lock must always be atomic and 
>> with locked instructions.
>> 2) Spte values must be read once, and appropriate action
>> must be taken when writing them back in case their value 
>> has changed (remote TLB flush might be required).
> 
> Although I am not certain about what will be really needed in the
> final form, if this kind of maybe-needed-overhead is going to be
> added little by little, I worry about possible regression.


Well, would you suggest that Linus reject all patches and stop
all discussion for the "possible regression" reason?


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-02  5:28             ` Xiao Guangrong
@ 2012-05-02 21:07               ` Marcelo Tosatti
  2012-05-03 11:26                 ` Xiao Guangrong
  0 siblings, 1 reply; 31+ messages in thread
From: Marcelo Tosatti @ 2012-05-02 21:07 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Wed, May 02, 2012 at 01:28:39PM +0800, Xiao Guangrong wrote:
> On 05/01/2012 09:34 AM, Marcelo Tosatti wrote:
> 
> 
> > 
> > It is getting better, but not yet, there are still reads of sptep
> > scattered all over (as mentioned before, i think a pattern of read spte
> > once, work on top of that, atomically write and then deal with results
> > _everywhere_ (where mmu lock is held) is more consistent.
> > 
> 
> 
> But we only need care the path which depends on is_writable_pte(), no?

Yes.

> So, where call is_writable_pte() are spte_has_volatile_bits(),
> spte_write_protect() and set_spte().
> 
> I have changed these functions:
> In spte_has_volatile_bits():
>  static bool spte_has_volatile_bits(u64 spte)
>  {
> +	/*
> +	 * Always atomicly update spte if it can be updated
> +	 * out of mmu-lock.
> +	 */
> +	if (spte_can_lockless_update(spte))
> +		return true;
> +
> 
> In spte_write_protect():
> 
> +	spte = mmu_spte_update(sptep, spte);
> +
> +	if (is_writable_pte(spte))
> +		*flush |= true;
> +
> The 'spte' is from atomically read-write (xchg).
> 
> in set_spte():
>  set_pte:
> -	mmu_spte_update(sptep, spte);
> +	entry = mmu_spte_update(sptep, spte);
>  	/*
>  	 * If we overwrite a writable spte with a read-only one we
>  	 * should flush remote TLBs. Otherwise rmap_write_protect
> The 'entry' is also the latest value.
> 
> >         /*
> >          * If we overwrite a writable spte with a read-only one we
> >          * should flush remote TLBs. Otherwise rmap_write_protect
> >          * will find a read-only spte, even though the writable spte
> >          * might be cached on a CPU's TLB.
> >          */
> >         if (is_writable_pte(entry) && !is_writable_pte(*sptep))
> >                 kvm_flush_remote_tlbs(vcpu->kvm);
> > 
> > This is inconsistent with the above obviously.
> > 
> 
> 
> 'entry' is not a problem since it is from atomically read-write as
> mentioned above, i need change this code to:
> 
> 		/*
> 		 * Optimization: for pte sync, if spte was writable the hash
> 		 * lookup is unnecessary (and expensive). Write protection
> 		 * is responsibility of mmu_get_page / kvm_sync_page.
> 		 * Same reasoning can be applied to dirty page accounting.
> 		 */
> 		if (!can_unsync && is_writable_pte(entry) /* Use 'entry' instead of '*sptep'. */
> 			goto set_pte
>    ......
> 
> 
>          if (is_writable_pte(entry) && !is_writable_pte(spte)) /* Use 'spte' instead of '*sptep'. */
>                  kvm_flush_remote_tlbs(vcpu->kvm);

What is of more importance than the ability to verify that this or that
particular case is OK at the moment is to write code in such a way that
it is easy to verify that it is correct.

Thus the suggestion above:

"scattered all over (as mentioned before, i think a pattern of read spte
once, work on top of that, atomically write and then deal with results
_everywhere_ (where mmu lock is held) is more consistent."



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-02  5:39           ` Xiao Guangrong
@ 2012-05-02 21:10             ` Marcelo Tosatti
  2012-05-03 12:09               ` Xiao Guangrong
  2012-05-03  0:15             ` Takuya Yoshikawa
  1 sibling, 1 reply; 31+ messages in thread
From: Marcelo Tosatti @ 2012-05-02 21:10 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Takuya Yoshikawa, Avi Kivity, LKML, KVM

On Wed, May 02, 2012 at 01:39:51PM +0800, Xiao Guangrong wrote:
> On 04/29/2012 04:50 PM, Takuya Yoshikawa wrote:
> 
> > On Fri, 27 Apr 2012 11:52:13 -0300
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
> >> Yes but the objective you are aiming for is to read and write sptes
> >> without mmu_lock. That is, i am not talking about this patch. 
> >> Please read carefully the two examples i gave (separated by "example)").
> > 
> > The real objective is not still clear.
> > 
> > The ~10% improvement reported before was on macro benchmarks during live
> > migration.  At least, that optimization was the initial objective.
> > 
> > But at some point, the objective suddenly changed to "lock-less" without
> > understanding what introduced the original improvement.
> > 
> > Was the problem really mmu_lock contention?
> > 
> 
> 
> Takuya, i am so tired to argue the advantage of lockless write-protect
> and lockless O(1) dirty-log again and again.

His point is valid: there is a lack of understanding of the details of
the improvement.

Did you see the pahole output on struct kvm? Apparently mmu_lock is
sharing a cacheline with the read-intensive memslots pointer. It would be
interesting to see what the effects of cacheline-aligning mmu_lock are.

> > If the path being introduced by this patch is really fast, isn't it
> > possible to achieve the same improvement still using mmu_lock?
> > 
> > 
> > Note: During live migration, the fact that the guest gets faulted is
> > itself a limitation.  We could easily see noticeable slowdown of a
> > program even if it runs only between two GET_DIRTY_LOGs.
> > 
> 
> 
> Obviously no.
> 
> It depends on what the guest is doing, from my autotest test, it very
> easily to see that, the huge improvement is on bench-migration not
> pure-migration.
> 
> > 
> >> The rules for code under mmu_lock should be:
> >>
> >> 1) Spte updates under mmu lock must always be atomic and 
> >> with locked instructions.
> >> 2) Spte values must be read once, and appropriate action
> >> must be taken when writing them back in case their value 
> >> has changed (remote TLB flush might be required).
> > 
> > Although I am not certain about what will be really needed in the
> > final form, if this kind of maybe-needed-overhead is going to be
> > added little by little, I worry about possible regression.
> 
> 
> Well, will you suggest Linus to reject all patches and stop
> all discussion for the "possible regression" reason?
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-02  5:39           ` Xiao Guangrong
  2012-05-02 21:10             ` Marcelo Tosatti
@ 2012-05-03  0:15             ` Takuya Yoshikawa
  2012-05-03 12:23               ` Xiao Guangrong
  1 sibling, 1 reply; 31+ messages in thread
From: Takuya Yoshikawa @ 2012-05-03  0:15 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, Avi Kivity, LKML, KVM

On Wed, 02 May 2012 13:39:51 +0800
Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> wrote:

> > Was the problem really mmu_lock contention?

> Takuya, i am so tired to argue the advantage of lockless write-protect
> and lockless O(1) dirty-log again and again.

You are missing my point.  Please do not take my comments as an objection
to your whole work: why do you feel that way?

I thought that your new fast-page-fault path was fast and optimized
the guest during dirty logging.

So in this v4, you might get a similar result even before dropping
mmu_lock (without 07/10?), if the problem Marcelo explained was not there.


Of course there is a problem of mmu_lock contention.  What I am suggesting
is to split that problem out and do the measurements separately, so that
part of your work can be merged soon.

Your guest size and workload were too small to make get_dirty hold mmu_lock
for a long time.  If you want to show the real value of lock-less, you need
to do another measurement.


But this is your work and it's up to you.  Although I was thinking of helping
with your measurements, I cannot do that knowing that you would not
welcome my help.


> > Although I am not certain about what will be really needed in the
> > final form, if this kind of maybe-needed-overhead is going to be
> > added little by little, I worry about possible regression.

> Well, will you suggest Linus to reject all patches and stop
> all discussion for the "possible regression" reason?

My concern was for Marcelo's examples, not your current implementation.
If you can show explicitly what will be needed in the final form,
I do not have any concern.


Sorry for disturbing.

Thanks,
	Takuya

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-02 21:07               ` Marcelo Tosatti
@ 2012-05-03 11:26                 ` Xiao Guangrong
  2012-05-05 14:08                   ` Marcelo Tosatti
  0 siblings, 1 reply; 31+ messages in thread
From: Xiao Guangrong @ 2012-05-03 11:26 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, LKML, KVM

On 05/03/2012 05:07 AM, Marcelo Tosatti wrote:


>> 'entry' is not a problem since it is from atomically read-write as
>> mentioned above, i need change this code to:
>>
>> 		/*
>> 		 * Optimization: for pte sync, if spte was writable the hash
>> 		 * lookup is unnecessary (and expensive). Write protection
>> 		 * is responsibility of mmu_get_page / kvm_sync_page.
>> 		 * Same reasoning can be applied to dirty page accounting.
>> 		 */
>> 		if (!can_unsync && is_writable_pte(entry) /* Use 'entry' instead of '*sptep'. */
>> 			goto set_pte
>>    ......
>>
>>
>>          if (is_writable_pte(entry) && !is_writable_pte(spte)) /* Use 'spte' instead of '*sptep'. */
>>                  kvm_flush_remote_tlbs(vcpu->kvm);
> 
> What is of more importance than the ability to verify that this or that
> particular case are ok at the moment is to write code in such a way that
> its easy to verify that it is correct.
> 
> Thus the suggestion above:
> 
> "scattered all over (as mentioned before, i think a pattern of read spte
> once, work on top of that, atomically write and then deal with results
> _everywhere_ (where mmu lock is held) is more consistent."
> 


Marcelo, thanks for taking the time to patiently review and reply to my mail.

I am confused by '_everywhere_'; does it mean every path that reads/updates
an spte? Why not only verify the paths which depend on is_writable_pte()?

For the reason of "it's easy to verify that it is correct"? But these
paths are safe since they do not care about PT_WRITABLE_MASK at all. What
these paths care about is that the Dirty bit and Accessed bit are not lost;
that is why we always treat the spte as "volatile" if it can be updated out
of mmu-lock.

For further development? We can add a comment to is_writable_pte() to warn
developers to use it more carefully.

It is also very hard to verify the spte everywhere. :(

Actually, the current code that cares about PT_WRITABLE_MASK is just for
tlb flushing; maybe we can fold it into mmu_spte_update.
[
  There are three ways to modify an spte: present -> nonpresent, nonpresent -> present,
  present -> present.

  But we only need to care about present -> present for lockless.
]

/*
 * return true means we need flush tlbs caused by changing spte from writeable
 * to read-only.
 */
bool mmu_update_spte(u64 *sptep, u64 spte)
{
	u64 last_spte, old_spte = *sptep;
	bool flush = false;

	last_spte = xchg(sptep, spte);

	if ((is_writable_pte(last_spte) ||
	      spte_has_updated_lockless(old_spte, last_spte)) &&
	         !is_writable_pte(spte))
		flush = true;

	.... track Dirty/Accessed bit ...


	return flush;
}

Furthermore, the style of "if (spte-has-changed) goto beginning" is feasible
in set_spte since this path is a fast path. (I can speed up mmu_need_write_protect.)



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-02 21:10             ` Marcelo Tosatti
@ 2012-05-03 12:09               ` Xiao Guangrong
  2012-05-03 12:13                 ` Avi Kivity
  0 siblings, 1 reply; 31+ messages in thread
From: Xiao Guangrong @ 2012-05-03 12:09 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Takuya Yoshikawa, Avi Kivity, LKML, KVM

On 05/03/2012 05:10 AM, Marcelo Tosatti wrote:

> On Wed, May 02, 2012 at 01:39:51PM +0800, Xiao Guangrong wrote:
>> On 04/29/2012 04:50 PM, Takuya Yoshikawa wrote:
>>
>>> On Fri, 27 Apr 2012 11:52:13 -0300
>>> Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>
>>>> Yes but the objective you are aiming for is to read and write sptes
>>>> without mmu_lock. That is, i am not talking about this patch. 
>>>> Please read carefully the two examples i gave (separated by "example)").
>>>
>>> The real objective is not still clear.
>>>
>>> The ~10% improvement reported before was on macro benchmarks during live
>>> migration.  At least, that optimization was the initial objective.
>>>
>>> But at some point, the objective suddenly changed to "lock-less" without
>>> understanding what introduced the original improvement.
>>>
>>> Was the problem really mmu_lock contention?
>>>
>>
>>
>> Takuya, i am so tired to argue the advantage of lockless write-protect
>> and lockless O(1) dirty-log again and again.
> 
> His point is valid: there is a lack of understanding on the details of
> the improvement.
> 


Actually, the improvement of lockless is that it lets the vcpus run in
parallel as much as possible.

From the test results, lockless gains little improvement for unix-migration;
in this case, the vcpus are almost idle (at least not busy).

The large improvement is from dbench-migration; in this case, all vcpus are
busy accessing memory which is write-protected by dirty-log. If you enable
the page-fault/fast-page-fault tracepoints, you can see a huge number of page
faults from different vcpus during the migration.

> Did you see the pahole output on struct kvm? Apparently mmu_lock is
> sharing a cacheline with read-intensive memslots pointer. It would be 
> interesting to see what are the effects of cacheline aligning mmu_lock.
> 


Yes, I see that. In my test .config, I have enabled
CONFIG_DEBUG_SPINLOCK/CONFIG_DEBUG_LOCK_ALLOC, so mmu_lock is not sharing a
cacheline with memslots. That means it was not a problem during my test.
(BTW, pahole does not work on my box; it shows:
......
DW_AT_<0x3c>=0x19
DW_AT_<0x3c>=0x19
DW_AT_<0x3c>=0x19
die__process_function: DW_TAG_INVALID (0x4109) @ <0x12886> not handled!
)

If we reorganize 'struct kvm', I guess it is good for kvm, but it cannot
improve migration too much. :)



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-03 12:09               ` Xiao Guangrong
@ 2012-05-03 12:13                 ` Avi Kivity
  0 siblings, 0 replies; 31+ messages in thread
From: Avi Kivity @ 2012-05-03 12:13 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, Takuya Yoshikawa, LKML, KVM

On 05/03/2012 03:09 PM, Xiao Guangrong wrote:
> Actually, the improvement of lockless is that it can let vcpu to be parallel
> as possible.
>
> From the test result, lockless gains little improvement for unix-migration,
> in this case, the vcpus are almost idle (at least not busy).
>
> The large improvement is from dbench-migration, in this case, all vcpus are
> busy accessing memory which is write-protected by dirty-log. If you enable
> page-fault/fast-page-fault tracepoints, you can see huge number of page fault
> from different vcpu during the migration.
>

We can kill the page faults completely by using A/D bits in shadow page
tables.  The latest version of the Intel SDM defines A/D bits for EPT,
and NPT has had them from day one.

Of course this comes at a cost: instead of traversing a bitmap we have
to read all the sptes, which are 64x as large.  But it's probably worthwhile
for large guests and perhaps also for smaller ones.
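
Roughly (a sketch only, not real KVM code; shadow_dirty_mask here stands for
whichever hardware dirty bit the spte format uses, e.g. bit 6 for NPT or
bit 9 for EPT with A/D enabled):

/* Harvesting dirty info from sptes reads 64 bits per page, versus one
 * bit per page when scanning the dirty bitmap. */
static unsigned long count_dirty_sptes(const u64 *sptes, unsigned long nr,
				       u64 shadow_dirty_mask)
{
	unsigned long i, dirty = 0;

	for (i = 0; i < nr; i++)
		if (sptes[i] & shadow_dirty_mask)
			dirty++;

	return dirty;
}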

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-03  0:15             ` Takuya Yoshikawa
@ 2012-05-03 12:23               ` Xiao Guangrong
  2012-05-03 12:40                 ` Takuya Yoshikawa
  0 siblings, 1 reply; 31+ messages in thread
From: Xiao Guangrong @ 2012-05-03 12:23 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Marcelo Tosatti, Avi Kivity, LKML, KVM

On 05/03/2012 08:15 AM, Takuya Yoshikawa wrote:

> On Wed, 02 May 2012 13:39:51 +0800
> Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> wrote:
> 
>>> Was the problem really mmu_lock contention?
> 
>> Takuya, i am so tired to argue the advantage of lockless write-protect
>> and lockless O(1) dirty-log again and again.
> 
> You are missing my point.  Please do not take my comments as an objection
> to your whole work: whey do you feel so?
> 


Takuya, I am sorry, please forgive my rudeness! My English is so poor
that it is easy for me to misunderstand mail. :(

> I thought that your new fast-page-fault path was fast and optimized
> the guest during dirty logging.
> 
> So in this v4, you might get a similar result even before dropping
> mmu_lock, without 07/10?, if the problem Marcelo explained was not there.
> 


Actually, the improvement is larger than in v2/v3 if EPT is enabled, but
it is lower with EPT disabled. This is because the fast check (the
rmap.WRITABLE bit) was dropped for easier review.

> 
> Of course there is a problem of mmu_lock contention.  What I am suggesting
> is to split that problem and do measurement separately so that part of
> your work can be merged soon.
> 
> Your guest size and workload was small to make get_dirty hold mmu_lock
> long time.  If you want to appeal the real value of lock-less, you need to
> do another measurment.
> 
> 
> But this is your work and it's up to you.  Although I was thinking to help
> your measurement, I cannot do that knowing the fact that you would not
> welcome my help.
> 


Of course, any measurement is appreciated!

> 
>>> Although I am not certain about what will be really needed in the
>>> final form, if this kind of maybe-needed-overhead is going to be
>>> added little by little, I worry about possible regression.
> 
>> Well, will you suggest Linus to reject all patches and stop
>> all discussion for the "possible regression" reason?
> 
> My concern was for Marcelo's examples, not your current implementation.
> If you can show explicitely what will be needed in the final form,
> I do not have any concern.
> 
> 
> Sorry for disturbing.


Sorry again.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-03 12:23               ` Xiao Guangrong
@ 2012-05-03 12:40                 ` Takuya Yoshikawa
  0 siblings, 0 replies; 31+ messages in thread
From: Takuya Yoshikawa @ 2012-05-03 12:40 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, Avi Kivity, LKML, KVM

On Thu, 03 May 2012 20:23:18 +0800
Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> wrote:

> Takuya, i am sorry, please forgive my rudeness! Since my English is
> so poor that it is easy for me to misunderstand the mail. :(

Me too, I am not good at reading/speaking English!
No problem.

> > But this is your work and it's up to you.  Although I was thinking to help
> > your measurement, I cannot do that knowing the fact that you would not
> > welcome my help.

> Of course, any measurement is appreciative!

I am interested in checking the improvement with my own eyes.

> > Sorry for disturbing.

> Sorry again.

No problem, really.
Let's discuss more from now on!

What I am really worried about is the lack of review in this area.
I am not a person who is good at MMU things.

We should collect more comments from other developers if possible.

Thanks,
	Takuya

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-03 11:26                 ` Xiao Guangrong
@ 2012-05-05 14:08                   ` Marcelo Tosatti
  2012-05-06  9:36                     ` Avi Kivity
  2012-05-07  6:52                     ` Xiao Guangrong
  0 siblings, 2 replies; 31+ messages in thread
From: Marcelo Tosatti @ 2012-05-05 14:08 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Thu, May 03, 2012 at 07:26:38PM +0800, Xiao Guangrong wrote:
> On 05/03/2012 05:07 AM, Marcelo Tosatti wrote:
> 
> 
> >> 'entry' is not a problem since it is from atomically read-write as
> >> mentioned above, i need change this code to:
> >>
> >> 		/*
> >> 		 * Optimization: for pte sync, if spte was writable the hash
> >> 		 * lookup is unnecessary (and expensive). Write protection
> >> 		 * is responsibility of mmu_get_page / kvm_sync_page.
> >> 		 * Same reasoning can be applied to dirty page accounting.
> >> 		 */
> >> 		if (!can_unsync && is_writable_pte(entry) /* Use 'entry' instead of '*sptep'. */
> >> 			goto set_pte
> >>    ......
> >>
> >>
> >>          if (is_writable_pte(entry) && !is_writable_pte(spte)) /* Use 'spte' instead of '*sptep'. */
> >>                  kvm_flush_remote_tlbs(vcpu->kvm);
> > 
> > What is of more importance than the ability to verify that this or that
> > particular case are ok at the moment is to write code in such a way that
> > its easy to verify that it is correct.
> > 
> > Thus the suggestion above:
> > 
> > "scattered all over (as mentioned before, i think a pattern of read spte
> > once, work on top of that, atomically write and then deal with results
> > _everywhere_ (where mmu lock is held) is more consistent."
> > 
> 
> 
> Marcelo, thanks for your time to patiently review/reply my mail.
> 
> I am confused with ' _everywhere_ ', it means all of the path read/update
> spte? why not only verify the path which depends on is_writable_pte()?

I meant any path that updates from present->present.

> For the reason of "its easy to verify that it is correct"? But these
> paths are safe since it is not care PT_WRITABLE_MASK at all. What these
> paths care is the Dirty-bit and Accessed-bit are not lost, that is why
> we always treat the spte as "volatile" if it is can be updated out of
> mmu-lock.
> 
> For the further development? We can add the delta comment for
> is_writable_pte() to warn the developer use it more carefully.
> 
> It is also very hard to verify spte everywhere. :(
> 
> Actually, the current code to care PT_WRITABLE_MASK is just for
> tlb flush, may be we can fold it into mmu_spte_update.
> [
>   There are tree ways to modify spte, present -> nonpresent, nonpresent -> present,
>   present -> present.
> 
>   But we only need care present -> present for lockless.
> ]

We also need to take memory ordering into account, which was not an issue
before. So it is not only the TLB flush.

> /*
>  * return true means we need flush tlbs caused by changing spte from writeable
>  * to read-only.
>  */
> bool mmu_update_spte(u64 *sptep, u64 spte)
> {
> 	u64 last_spte, old_spte = *sptep;
> 	bool flush = false;
> 
> 	last_spte = xchg(sptep, spte);
> 
> 	if ((is_writable_pte(last_spte) ||
> 	      spte_has_updated_lockless(old_spte, last_spte)) &&
> 	         !is_writable_pte(spte) )
> 		flush = true;
> 
> 	.... track Drity/Accessed bit ...
> 
> 
> 	return flush		
> }
> 
> Furthermore, the style of "if (spte-has-changed) goto beginning" is feasible
> in set_spte since this path is a fast path. (i can speed up mmu_need_write_protect)

What do you mean exactly?

It would be better if all these complications introduced by lockless
updates could be avoided, say by using A/D bits as Avi suggested.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-05 14:08                   ` Marcelo Tosatti
@ 2012-05-06  9:36                     ` Avi Kivity
  2012-05-07  6:52                     ` Xiao Guangrong
  1 sibling, 0 replies; 31+ messages in thread
From: Avi Kivity @ 2012-05-06  9:36 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, LKML, KVM

On 05/05/2012 05:08 PM, Marcelo Tosatti wrote:
> It would be better if all these complications introduced by lockless
> updates can be avoided, say using A/D bits as Avi suggested.

Note that using A/D bits introduces new tradeoffs (when just a few bits
are dirtied per iteration, we reduce guest overhead, but increase host
overhead, since it has to scan a large number of sptes), and also a
large fraction of deployed systems don't have A/D bits support.  But it
should affect our thinking - since in the long term all hosts will have it.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault
  2012-05-05 14:08                   ` Marcelo Tosatti
  2012-05-06  9:36                     ` Avi Kivity
@ 2012-05-07  6:52                     ` Xiao Guangrong
  1 sibling, 0 replies; 31+ messages in thread
From: Xiao Guangrong @ 2012-05-07  6:52 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, LKML, KVM

On 05/05/2012 10:08 PM, Marcelo Tosatti wrote:


>>
>> I am confused with ' _everywhere_ ', it means all of the path read/update
>> spte? why not only verify the path which depends on is_writable_pte()?
> 
> I meant any path that updates from present->present.
> 


OK, got it. So let us focus on mmu_spte_update() only. :)

>> For the reason of "its easy to verify that it is correct"? But these
>> paths are safe since it is not care PT_WRITABLE_MASK at all. What these
>> paths care is the Dirty-bit and Accessed-bit are not lost, that is why
>> we always treat the spte as "volatile" if it is can be updated out of
>> mmu-lock.
>>
>> For the further development? We can add the delta comment for
>> is_writable_pte() to warn the developer use it more carefully.
>>
>> It is also very hard to verify spte everywhere. :(
>>
>> Actually, the current code to care PT_WRITABLE_MASK is just for
>> tlb flush, may be we can fold it into mmu_spte_update.
>> [
>>   There are tree ways to modify spte, present -> nonpresent, nonpresent -> present,
>>   present -> present.
>>
>>   But we only need care present -> present for lockless.
>> ]
> 
> Also need to take memory ordering into account, which was not an issue
> before. So it is not only TLB flush.


It seems we do not need an explicit barrier; we always use an atomic xchg to
update the spte, which already guarantees the memory ordering.

In mmu_spte_update():

/* the return value indicates whether the tlb needs to be flushed. */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;
	bool flush = false;

	old_spte = xchg(sptep, new_spte);

	if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
		flush = true;

	.....
}

> 
>> /*
>>  * return true means we need flush tlbs caused by changing spte from writeable
>>  * to read-only.
>>  */
>> bool mmu_update_spte(u64 *sptep, u64 spte)
>> {
>> 	u64 last_spte, old_spte = *sptep;
>> 	bool flush = false;
>>
>> 	last_spte = xchg(sptep, spte);
>>
>> 	if ((is_writable_pte(last_spte) ||
>> 	      spte_has_updated_lockless(old_spte, last_spte)) &&
>> 	         !is_writable_pte(spte) )
>> 		flush = true;
>>
>> 	.... track Drity/Accessed bit ...
>>
>>
>> 	return flush		
>> }
>>
>> Furthermore, the style of "if (spte-has-changed) goto beginning" is feasible
>> in set_spte since this path is a fast path. (i can speed up mmu_need_write_protect)
> 
> What you mean exactly?
> 
> It would be better if all these complications introduced by lockless
> updates can be avoided, say using A/D bits as Avi suggested.


Anyway, I do not object if we have a better way to do these things, but ......


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2012-05-07  6:52 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
2012-04-25  4:01 ` [PATCH v4 01/10] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
2012-04-25  4:01 ` [PATCH v4 02/10] KVM: MMU: abstract spte write-protect Xiao Guangrong
2012-04-25  4:02 ` [PATCH v4 03/10] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
2012-04-25  4:02 ` [PATCH v4 04/10] KVM: MMU: introduce SPTE_MMU_WRITEABLE bit Xiao Guangrong
2012-04-25  4:03 ` [PATCH v4 05/10] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
2012-04-25  4:03 ` [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
2012-04-26 23:45   ` Marcelo Tosatti
2012-04-27  5:53     ` Xiao Guangrong
2012-04-27 14:52       ` Marcelo Tosatti
2012-04-28  6:10         ` Xiao Guangrong
2012-05-01  1:34           ` Marcelo Tosatti
2012-05-02  5:28             ` Xiao Guangrong
2012-05-02 21:07               ` Marcelo Tosatti
2012-05-03 11:26                 ` Xiao Guangrong
2012-05-05 14:08                   ` Marcelo Tosatti
2012-05-06  9:36                     ` Avi Kivity
2012-05-07  6:52                     ` Xiao Guangrong
2012-04-29  8:50         ` Takuya Yoshikawa
2012-05-01  2:31           ` Marcelo Tosatti
2012-05-02  5:39           ` Xiao Guangrong
2012-05-02 21:10             ` Marcelo Tosatti
2012-05-03 12:09               ` Xiao Guangrong
2012-05-03 12:13                 ` Avi Kivity
2012-05-03  0:15             ` Takuya Yoshikawa
2012-05-03 12:23               ` Xiao Guangrong
2012-05-03 12:40                 ` Takuya Yoshikawa
2012-04-25  4:04 ` [PATCH v4 07/10] KVM: MMU: lockless update spte on fast page fault path Xiao Guangrong
2012-04-25  4:04 ` [PATCH v4 08/10] KVM: MMU: trace fast page fault Xiao Guangrong
2012-04-25  4:05 ` [PATCH v4 09/10] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
2012-04-25  4:06 ` [PATCH v4 10/10] KVM: MMU: document mmu-lock and fast page fault Xiao Guangrong
