* [PATCH v3 0/9] KVM: MMU: fast page fault
@ 2012-04-20  8:16 Xiao Guangrong
  2012-04-20  8:17 ` [PATCH v3 1/9] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
                   ` (9 more replies)
  0 siblings, 10 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:16 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Changelog:

- split the long series for better review. This is the core
  implementation of fast page fault.
- document fast page fault in locking.txt



* [PATCH v3 1/9] KVM: MMU: return bool in __rmap_write_protect
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
@ 2012-04-20  8:17 ` Xiao Guangrong
  2012-04-20  8:17 ` [PATCH v3 2/9] KVM: MMU: abstract spte write-protect Xiao Guangrong
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:17 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

The return value of __rmap_write_protect() is either 1 or 0; use
true/false instead

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   13 +++++++------
 1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 29ad6f9..4a3cc18 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1041,11 +1041,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool
+__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int write_protected = 0;
+	bool write_protected = false;

 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
@@ -1066,7 +1067,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level
 			sptep = rmap_get_first(*rmapp, &iter);
 		}

-		write_protected = 1;
+		write_protected = true;
 	}

 	return write_protected;
@@ -1097,12 +1098,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	}
 }

-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	int i;
-	int write_protected = 0;
+	bool write_protected = false;

 	slot = gfn_to_memslot(kvm, gfn);

@@ -1691,7 +1692,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,

 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
-		int protected = 0;
+		bool protected = false;

 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
-- 
1.7.7.6



* [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
  2012-04-20  8:17 ` [PATCH v3 1/9] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
@ 2012-04-20  8:17 ` Xiao Guangrong
  2012-04-20 21:33   ` Marcelo Tosatti
  2012-04-20  8:18 ` [PATCH v3 3/9] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:17 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

Introduce a common function that abstracts spte write protection, to
clean up the code

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   60 ++++++++++++++++++++++++++++++---------------------
 1 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4a3cc18..e70ff38 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1041,6 +1041,34 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

+/* Return true if the spte is dropped. */
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
+			       bool *flush)
+{
+	u64 spte = *sptep;
+
+	if (!is_writable_pte(spte))
+		return false;
+
+	*flush |= true;
+
+	if (large) {
+		pgprintk("rmap_write_protect(large): spte %p %llx\n",
+			 spte, *spte);
+		BUG_ON(!is_large_pte(spte));
+
+		drop_spte(kvm, sptep);
+		--kvm->stat.lpages;
+		return true;
+	}
+
+	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+	spte = spte & ~PT_WRITABLE_MASK;
+	mmu_spte_update(sptep, spte);
+
+	return false;
+}
+
 static bool
 __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
@@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)

 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
-		if (!is_writable_pte(*sptep)) {
-			sptep = rmap_get_next(&iter);
-			continue;
-		}
-
-		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
-			sptep = rmap_get_next(&iter);
-		} else {
-			BUG_ON(!is_large_pte(*sptep));
-			drop_spte(kvm, sptep);
-			--kvm->stat.lpages;
+		if (spte_write_protect(kvm, sptep, level > PT_PAGE_TABLE_LEVEL,
+			  &write_protected)) {
 			sptep = rmap_get_first(*rmapp, &iter);
+			continue;
 		}

-		write_protected = true;
+		sptep = rmap_get_next(&iter);
 	}

 	return write_protected;
@@ -3902,6 +3919,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
+	bool flush = false;

 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
@@ -3916,16 +3934,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			      !is_last_spte(pt[i], sp->role.level))
 				continue;

-			if (is_large_pte(pt[i])) {
-				drop_spte(kvm, &pt[i]);
-				--kvm->stat.lpages;
-				continue;
-			}
-
-			/* avoid RMW */
-			if (is_writable_pte(pt[i]))
-				mmu_spte_update(&pt[i],
-						pt[i] & ~PT_WRITABLE_MASK);
+			spte_write_protect(kvm, &pt[i],
+					   is_large_pte(pt[i]), &flush);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);
-- 
1.7.7.6



* [PATCH v3 3/9] KVM: VMX: export PFEC.P bit on ept
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
  2012-04-20  8:17 ` [PATCH v3 1/9] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
  2012-04-20  8:17 ` [PATCH v3 2/9] KVM: MMU: abstract spte write-protect Xiao Guangrong
@ 2012-04-20  8:18 ` Xiao Guangrong
  2012-04-20  8:18 ` [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit Xiao Guangrong
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:18 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

Export the present bit of the page fault error code; a later patch
will use it
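
As an illustrative sketch (the constant names EPT_VIOLATION_WRITE and
EPT_VIOLATION_READABLE below are hypothetical; the patch open-codes the
bit numbers):

    /*
     * EPT violation exit qualification bits used here:
     *   bit 1 - the access causing the violation was a write
     *   bit 3 - the guest-physical address was mapped by a present
     *           EPT entry
     */
    #define EPT_VIOLATION_WRITE	(1U << 1)
    #define EPT_VIOLATION_READABLE	(1U << 3)

    /*
     * So a write to a present mapping yields
     * error_code = PFERR_WRITE_MASK | PFERR_PRESENT_MASK = 0x3,
     * matching the W/R and P bits of a normal #PF error code.
     */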

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/vmx.c |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 52f6856..2c98057 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4735,6 +4735,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
 	gpa_t gpa;
+	u32 error_code;
 	int gla_validity;

 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4759,7 +4760,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)

 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
-	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+	/* Is it a write fault? */
+	error_code = exit_qualification & (1U << 1);
+	/* is the ept page table present? */
+	error_code |= (exit_qualification >> 3) & 0x1;
+
+	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }

 static u64 ept_rsvd_mask(u64 spte, int level)
-- 
1.7.7.6



* [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
                   ` (2 preceding siblings ...)
  2012-04-20  8:18 ` [PATCH v3 3/9] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
@ 2012-04-20  8:18 ` Xiao Guangrong
  2012-04-20 21:39   ` Marcelo Tosatti
  2012-04-20  8:19 ` [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:18 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

This bit indicates whether the spte is allowed to be writable, which
means the gpte of this spte is writable and the pfn pointed to by
this spte is writable on the host
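
A hypothetical helper, as a sketch of the invariant this bit is meant
to capture (not part of the patch):

    /*
     * Illustration only: a read-only spte may later be made writable
     * again only if this bit survived, i.e. both the guest pte and
     * the host page allow writes.
     */
    static bool spte_can_be_writable(u64 spte)
    {
    	return spte & SPTE_ALLOW_WRITE;
    }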

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   13 ++++++-------
 1 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e70ff38..dd984b6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"

-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

@@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;

-			new_spte &= ~PT_WRITABLE_MASK;
-			new_spte &= ~SPTE_HOST_WRITEABLE;
-			new_spte &= ~shadow_accessed_mask;
+			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
+				      shadow_accessed_mask | SPTE_ALLOW_WRITE);

 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
@@ -2316,7 +2316,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			goto done;
 		}

-		spte |= PT_WRITABLE_MASK;
+		spte |= PT_WRITABLE_MASK | SPTE_ALLOW_WRITE;

 		if (!vcpu->arch.mmu.direct_map
 		    && !(pte_access & ACC_WRITE_MASK)) {
@@ -2345,8 +2345,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writable_pte(spte))
-				spte &= ~PT_WRITABLE_MASK;
+			spte &= ~PT_WRITABLE_MASK;
 		}
 	}

-- 
1.7.7.6



* [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
                   ` (3 preceding siblings ...)
  2012-04-20  8:18 ` [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit Xiao Guangrong
@ 2012-04-20  8:19 ` Xiao Guangrong
  2012-04-20 21:52   ` Marcelo Tosatti
  2012-04-20  8:19 ` [PATCH v3 6/9] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:19 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

If this bit is set, it means the W bit of the spte is cleared due
to shadow page table protection
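
A summary of the resulting spte states, as an editorial illustration
(not part of the patch):

    /*
     * W=1                                 : writable
     * W=0, ALLOW_WRITE=1, WRITE_PROTECT=0 : write-protected only for
     *                                       dirty logging; a candidate
     *                                       for a lockless fast fix
     * W=0, WRITE_PROTECT=1                : write-protected for shadow
     *                                       page table protection; must
     *                                       take the slow path
     * W=0, ALLOW_WRITE=0                  : the gpte or the host pte is
     *                                       read-only
     */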

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dd984b6..eb02fc4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);

 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))

 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

@@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

+static bool spte_wp_by_dirty_log(u64 spte)
+{
+	WARN_ON(is_writable_pte(spte));
+
+	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
+}
+
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
-			       bool *flush)
+			       bool *flush, bool page_table_protect)
 {
 	u64 spte = *sptep;

-	if (!is_writable_pte(spte))
-		return false;
+	if (is_writable_pte(spte)) {
+		*flush |= true;

-	*flush |= true;
+		if (large) {
+			pgprintk("rmap_write_protect(large): spte %p %llx\n",
+				 spte, *spte);
+			BUG_ON(!is_large_pte(spte));

-	if (large) {
-		pgprintk("rmap_write_protect(large): spte %p %llx\n",
-			 spte, *spte);
-		BUG_ON(!is_large_pte(spte));
+			drop_spte(kvm, sptep);
+			--kvm->stat.lpages;
+			return true;
+		}

-		drop_spte(kvm, sptep);
-		--kvm->stat.lpages;
-		return true;
+		goto reset_spte;
 	}

+	if (page_table_protect && spte_wp_by_dirty_log(spte))
+		goto reset_spte;
+
+	return false;
+
+reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 	spte = spte & ~PT_WRITABLE_MASK;
+	if (page_table_protect)
+		spte |= SPTE_WRITE_PROTECT;
 	mmu_spte_update(sptep, spte);
-
 	return false;
 }

-static bool
-__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+				 int level, bool page_table_protect)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1080,7 +1096,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
 		if (spte_write_protect(kvm, sptep, level > PT_PAGE_TABLE_LEVEL,
-			  &write_protected)) {
+			  &write_protected, page_table_protect)) {
 			sptep = rmap_get_first(*rmapp, &iter);
 			continue;
 		}
@@ -1109,7 +1125,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,

 	while (mask) {
 		rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
-		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);

 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1128,7 +1144,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, i);
+		write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
 	}

 	return write_protected;
@@ -1179,7 +1195,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;

 			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
-				      shadow_accessed_mask | SPTE_ALLOW_WRITE);
+				      shadow_accessed_mask | SPTE_ALLOW_WRITE |
+				      SPTE_WRITE_PROTECT);

 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
@@ -2346,6 +2363,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
 			spte &= ~PT_WRITABLE_MASK;
+			spte |= SPTE_WRITE_PROTECT;
 		}
 	}

@@ -3934,7 +3952,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 				continue;

 			spte_write_protect(kvm, &pt[i],
-					   is_large_pte(pt[i]), &flush);
+					   is_large_pte(pt[i]), &flush, false);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);
-- 
1.7.7.6



* [PATCH v3 6/9] KVM: MMU: fast path of handling guest page fault
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
                   ` (4 preceding siblings ...)
  2012-04-20  8:19 ` [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
@ 2012-04-20  8:19 ` Xiao Guangrong
  2012-04-20  8:20 ` [PATCH v3 7/9] KVM: MMU: trace fast " Xiao Guangrong
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:19 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

If the present bit of the page fault error code is set, it indicates
that the shadow page table is populated at all levels; we only need to
modify the access bits, which can be done outside of mmu-lock

Currently, to simplify the code, we only fix page faults caused by
write protection on the fast path
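
A minimal userspace sketch of the lockless idea, under simplified
assumptions (the bit positions and the helper name fast_fix() are made
up for this illustration; see fast_page_fault() in the diff for the
real code):

    /* gcc -std=c11 -O2 fastpf.c -o fastpf */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PT_WRITABLE_MASK	(1ULL << 1)
    #define SPTE_ALLOW_WRITE	(1ULL << 52)	/* illustrative position */
    #define SPTE_WRITE_PROTECT	(1ULL << 53)	/* illustrative position */

    /* Try to fix a write fault without taking any lock. */
    static bool fast_fix(_Atomic uint64_t *sptep)
    {
    	uint64_t spte = atomic_load(sptep);

    	if (spte & PT_WRITABLE_MASK)
    		return true;	/* spurious fault: TLB lazily flushed */

    	/* Only sptes write-protected for dirty logging qualify. */
    	if (!(spte & SPTE_ALLOW_WRITE) || (spte & SPTE_WRITE_PROTECT))
    		return false;	/* take the slow path under mmu-lock */

    	/*
    	 * The update succeeds only if nobody changed the spte between
    	 * our read and this point; otherwise the vcpu simply refaults
    	 * and retries.
    	 */
    	return atomic_compare_exchange_strong(sptep, &spte,
    					      spte | PT_WRITABLE_MASK);
    }

    int main(void)
    {
    	_Atomic uint64_t spte = SPTE_ALLOW_WRITE; /* wp by dirty log */

    	printf("fixed: %d, W set: %d\n", fast_fix(&spte),
    	       !!(atomic_load(&spte) & PT_WRITABLE_MASK));
    	return 0;
    }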

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c         |  197 ++++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/paging_tmpl.h |    3 +
 2 files changed, 184 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eb02fc4..7aea156 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -446,6 +446,13 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif

+static bool spte_wp_by_dirty_log(u64 spte)
+{
+	WARN_ON(is_writable_pte(spte));
+
+	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
 	if (!shadow_accessed_mask)
@@ -454,9 +461,18 @@ static bool spte_has_volatile_bits(u64 spte)
 	if (!is_shadow_present_pte(spte))
 		return false;

-	if ((spte & shadow_accessed_mask) &&
-	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
-		return false;
+	if (spte & shadow_accessed_mask) {
+		if (is_writable_pte(spte))
+			return !(spte & shadow_dirty_mask);
+
+		/*
+		 * If the spte is write-protected by dirty-log, it may
+		 * be marked writable on the fast page fault path, and then
+		 * the CPU can modify the Dirty bit.
+		 */
+		if (!spte_wp_by_dirty_log(spte))
+			return false;
+	}

 	return true;
 }
@@ -1043,13 +1059,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

-static bool spte_wp_by_dirty_log(u64 spte)
-{
-	WARN_ON(is_writable_pte(spte));
-
-	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
-}
-
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 			       bool *flush, bool page_table_protect)
@@ -1057,13 +1066,12 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 	u64 spte = *sptep;

 	if (is_writable_pte(spte)) {
-		*flush |= true;
-
 		if (large) {
 			pgprintk("rmap_write_protect(large): spte %p %llx\n",
 				 spte, *spte);
 			BUG_ON(!is_large_pte(spte));

+			*flush |= true;
 			drop_spte(kvm, sptep);
 			--kvm->stat.lpages;
 			return true;
@@ -1072,6 +1080,9 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 		goto reset_spte;
 	}

+	/* We need to flush TLBs in this case: the fast page fault path
+	 * can mark the spte writable after we read the sptep.
+	 */
 	if (page_table_protect && spte_wp_by_dirty_log(spte))
 		goto reset_spte;

@@ -1079,6 +1090,8 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,

 reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+
+	*flush |= true;
 	spte = spte & ~PT_WRITABLE_MASK;
 	if (page_table_protect)
 		spte |= SPTE_WRITE_PROTECT;
@@ -2676,18 +2689,164 @@ exit:
 	return ret;
 }

+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
+				   u32 error_code)
+{
+	/*
+	 * #PF can be fast only if the shadow page table is present and it
+	 * is caused by write-protect, which means we just need to change
+	 * the W bit of the spte, which can be done out of mmu-lock.
+	 */
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	return true;
+}
+
+static bool
+fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			  u64 *sptep, u64 spte, gfn_t gfn)
+{
+	pfn_t pfn;
+	bool ret = false;
+
+	/*
+	 * For the indirect spte, it is hard to get a stable gfn since
+	 * we just use a cmpxchg to avoid all the races which is not
+	 * enough to avoid the ABA problem: the host can arbitrarily
+	 * change spte and the mapping from gfn to pfn.
+	 *
+	 * What we do is call gfn_to_pfn_atomic to bind the gfn and the
+	 * pfn because after the call:
+	 * - we hold a refcount on the pfn, which means the pfn can not
+	 *   be freed and reused for another gfn.
+	 * - the pfn is writable, which means it can not be shared by
+	 *   different gfns.
+	 */
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+	/* The host page is swapped out or merged. */
+	if (mmu_invalid_pfn(pfn))
+		goto exit;
+
+	ret = true;
+
+	if (pfn != spte_to_pfn(spte))
+		goto exit;
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+exit:
+	kvm_release_pfn_clean(pfn);
+	return ret;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			u64 *sptep, u64 spte)
+{
+	gfn_t gfn;
+
+	WARN_ON(!sp->role.direct);
+
+	/*
+	 * The gfn of direct spte is stable since it is calculated
+	 * by sp->gfn.
+	 */
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+			    int level, u32 error_code)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
+		return false;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	if (!is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * Check if it is a spurious fault caused by a lazily flushed TLB.
+	 *
+	 * Need not check the access of upper level table entries since
+	 * they are always ACC_ALL.
+	 */
+	if (is_writable_pte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * If the page fault is caused by a write but the host does not
+	 * allow writing the page, we need to COW the host page.
+	 */
+	if (!(spte & SPTE_ALLOW_WRITE))
+		goto exit;
+
+	/*
+	 * Currently, to simplify the code, only the spte write-protected
+	 * by dirty-log can be fast fixed.
+	 */
+	if (!spte_wp_by_dirty_log(spte))
+		goto exit;
+
+	sp = page_header(__pa(iterator.sptep));
+
+	if (sp->role.direct)
+		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
+	else
+		ret = fast_pf_fix_indirect_spte(vcpu, sp, iterator.sptep,
+						spte, gfn);
+
+exit:
+	walk_shadow_page_lockless_end(vcpu);
+
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);

-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+			 gfn_t gfn, bool prefault)
 {
 	int r;
 	int level;
 	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, write = error_code & PFERR_WRITE_MASK;

 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
 	if (likely(!force_pt_level)) {
@@ -2704,6 +2863,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, v, gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

@@ -3092,7 +3254,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;

 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, prefault);
+			     error_code, gfn, prefault);
 }

 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3172,6 +3334,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, gpa, gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a703..80493fb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -617,6 +617,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}

+	if (fast_page_fault(vcpu, addr, walker.gfn, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

-- 
1.7.7.6



* [PATCH v3 7/9] KVM: MMU: trace fast page fault
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
                   ` (5 preceding siblings ...)
  2012-04-20  8:19 ` [PATCH v3 6/9] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
@ 2012-04-20  8:20 ` Xiao Guangrong
  2012-04-20  8:20 ` [PATCH v3 8/9] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:20 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

To see what happens on this path and to help us optimize it
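
(Assuming the usual tracefs layout and the kvmmmu trace system that
mmutrace.h belongs to, the new event should then be reachable under
/sys/kernel/debug/tracing/events/kvmmmu/fast_page_fault/.)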

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c      |    2 ++
 arch/x86/kvm/mmutrace.h |   41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7aea156..ac9c285 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2830,6 +2830,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 						spte, gfn);

 exit:
+	trace_fast_page_fault(vcpu, gva, gfn, error_code, iterator.sptep,
+			      spte, ret);
 	walk_shadow_page_lockless_end(vcpu);

 	return ret;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e8..84da94f 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,6 +243,47 @@ TRACE_EVENT(
 	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
 		  __entry->access)
 );
+
+#define __spte_satisfied(__spte)				\
+	(__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+	fast_page_fault,
+	TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, u32 error_code,
+		 u64 *sptep, u64 old_spte, bool retry),
+	TP_ARGS(vcpu, gva, gfn, error_code, sptep, old_spte, retry),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(gva_t, gva)
+		__field(gfn_t, gfn)
+		__field(u32, error_code)
+		__field(u64 *, sptep)
+		__field(u64, old_spte)
+		__field(u64, new_spte)
+		__field(bool, retry)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->gva = gva;
+		__entry->gfn = gfn;
+		__entry->error_code = error_code;
+		__entry->sptep = sptep;
+		__entry->old_spte = old_spte;
+		__entry->new_spte = *sptep;
+		__entry->retry = retry;
+	),
+
+	TP_printk("vcpu %d gva %lx gfn %llx error_code %s sptep %p "
+		  "old %#llx new %llx spurious %d fixed %d", __entry->vcpu_id,
+		  __entry->gva, __entry->gfn,
+		  __print_flags(__entry->error_code, "|",
+				kvm_mmu_trace_pferr_flags),
+		  __entry->sptep, __entry->old_spte, __entry->new_spte,
+		  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+	)
+);
 #endif /* _TRACE_KVMMMU_H */

 #undef TRACE_INCLUDE_PATH
-- 
1.7.7.6



* [PATCH v3 8/9] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
                   ` (6 preceding siblings ...)
  2012-04-20  8:20 ` [PATCH v3 7/9] KVM: MMU: trace fast " Xiao Guangrong
@ 2012-04-20  8:20 ` Xiao Guangrong
  2012-04-20  8:21 ` [PATCH v3 9/9] KVM: MMU: document mmu-lock and fast page fault Xiao Guangrong
  2012-04-21  0:59 ` [PATCH v3 0/9] KVM: MMU: " Marcelo Tosatti
  9 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:20 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

The P bit of the page fault error code is missing from this tracepoint;
fix it by passing the full error code
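
(Concretely: the old TP_fast_assign rebuilt pferr from the write, user
and fetch flags only, so bit 0 - PFERR_PRESENT_MASK - could never show
up in the trace; passing the full access word through preserves it.)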

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmutrace.h    |    7 +++----
 arch/x86/kvm/paging_tmpl.h |    3 +--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 84da94f..e762d35 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
  */
 TRACE_EVENT(
 	kvm_mmu_pagetable_walk,
-	TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
-	TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+	TP_PROTO(u64 addr, u32 pferr),
+	TP_ARGS(addr, pferr),

 	TP_STRUCT__entry(
 		__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(

 	TP_fast_assign(
 		__entry->addr = addr;
-		__entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
-		                 | (!!fetch_fault << 4);
+		__entry->pferr = pferr;
 	),

 	TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 80493fb..8e6aac9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	const int fetch_fault = access & PFERR_FETCH_MASK;
 	u16 errcode = 0;

-	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
-				     fetch_fault);
+	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
 	eperm = false;
 	walker->level = mmu->root_level;
-- 
1.7.7.6



* [PATCH v3 9/9] KVM: MMU: document mmu-lock and fast page fault
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
                   ` (7 preceding siblings ...)
  2012-04-20  8:20 ` [PATCH v3 8/9] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
@ 2012-04-20  8:21 ` Xiao Guangrong
  2012-04-21  0:59 ` [PATCH v3 0/9] KVM: MMU: " Marcelo Tosatti
  9 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-20  8:21 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, Marcelo Tosatti, LKML, KVM

Document fast page fault and mmu-lock in locking.txt

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 Documentation/virtual/kvm/locking.txt |  148 ++++++++++++++++++++++++++++++++-
 1 files changed, 147 insertions(+), 1 deletions(-)

diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 3b4cd3b..089d462 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -6,7 +6,147 @@ KVM Lock Overview

 (to be written)

-2. Reference
+2: Exception
+------------
+
+Fast page fault:
+
+Fast page fault is the fast path which fixes the guest page fault out of
+the mmu-lock on x86. Currently, the page fault can be fast only if the
+shadow page table is present and it is caused by write-protect, which
+means we just need to change the W bit of the spte.
+
+What we use to avoid all the races are the SPTE_ALLOW_WRITE bit and the
+SPTE_WRITE_PROTECT bit on the spte:
+- SPTE_ALLOW_WRITE means the gfn is writable on both guest and host.
+- SPTE_WRITE_PROTECT means the gfn is write-protected due to shadow page
+  table protection.
+
+On the fast page fault path, we use cmpxchg to atomically set the spte W
+bit if spte.SPTE_ALLOW_WRITE = 1 and spte.SPTE_WRITE_PROTECT = 0; this is
+safe because any change to these bits can be detected by the cmpxchg.
+
+But we need to carefully check these cases:
+1): The mapping from gfn to pfn
+
+The mapping from gfn to pfn may change, since we can only ensure that the
+pfn is not changed during the cmpxchg. This is an ABA problem; for example,
+the following case can happen:
+
+At the beginning:
+gpte = gfn1
+gfn1 is mapped to pfn1 on host
+spte is the shadow page table entry corresponding with gpte and
+spte = pfn1
+
+   VCPU 0                           VCPU 1
+on fast page fault path:
+
+   old_spte = *spte;
+                                 pfn1 is swapped out:
+                                    spte = 0;
+
+                                 pfn1 is re-alloced for gfn2.
+
+                                 gpte is changed to point to
+                                 gfn2 by the guest:
+                                    spte = pfn1;
+
+   if (cmpxchg(spte, old_spte, old_spte+W))
+	mark_page_dirty(vcpu->kvm, gfn1)
+             OOPS!!!
+
+We dirty-log for gfn1, which means gfn2 is lost from the dirty bitmap.
+
+For direct sp, we can easily avoid it since the spte of direct sp is fixed
+to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic()
+to pin gfn to pfn, because after gfn_to_pfn_atomic():
+- We hold a refcount on the pfn, which means the pfn can not be freed and
+  reused for another gfn.
+- The pfn is writable, which means it can not be shared between different gfns
+  by KSM.
+
+Then, we can ensure the dirty bitmap is correctly set for the gfn.
+
+2): Flushing TLBs due to shadow page table write protection
+
+In rmap_write_protect(), we always need to flush TLBs if spte.SPTE_ALLOW_WRITE = 1,
+even if the current spte is read-only. The reason is that the fast page fault
+path may mark the spte writable, and that writable spte may then be cached in
+the TLB. Consider the following case:
+
+At the beginning:
+spte.W = 0
+spte.SPTE_WRITE_PROTECT = 0
+spte.SPTE_ALLOW_WRITE = 1
+
+   VCPU 0                          VCPU 1
+In rmap_write_protect():
+
+   flush = false;
+
+   if (spte.W == 1)
+      flush = true;
+
+                               On fast page fault path:
+                                  old_spte = *spte
+                                  cmpxchg(spte, old_spte, old_spte + W)
+
+                               the spte is fetched/prefetched into
+                               tlb by CPU
+
+   spte = (spte | SPTE_WRITE_PROTECT) &
+                      ~PT_WRITABLE_MASK;
+
+   if (flush)
+         kvm_flush_remote_tlbs(vcpu->kvm)
+               OOPS!!!
+
+The TLBs are not flushed since the spte is read-only, but a stale writable
+spte has been cached in the TLBs by the fast page fault path.
+
+3): Dirty bit tracking
+In the original code, the spte can be updated fast (non-atomically) if the
+spte is read-only and the Accessed bit has already been set, since the
+Accessed bit and the Dirty bit can not be lost.
+
+But this no longer holds with fast page fault, since the spte can be marked
+writable between reading the spte and updating it. Consider the following case:
+
+At the beginning:
+spte.W = 0
+spte.Accessed = 1
+
+   VCPU 0                                       VCPU 1
+In mmu_spte_clear_track_bits():
+
+   old_spte = *spte;
+
+   /* 'if' condition is satisfied. */
+   if (old_spte.Accessed == 1 &&
+        old_spte.W == 0)
+      spte = 0ull;
+                                         on fast page fault path:
+                                             spte.W = 1
+                                         memory write on the spte:
+                                             spte.Dirty = 1
+
+
+   else
+      old_spte = xchg(spte, 0ull)
+
+
+   if (old_spte.Accessed == 1)
+      kvm_set_pfn_accessed(spte.pfn);
+   if (old_spte.Dirty == 1)
+      kvm_set_pfn_dirty(spte.pfn);
+      OOPS!!!
+
+The Dirty bit is lost in this case. We can call the slow path
+(__update_clear_spte_slow()) to update the spte if the spte can be changed
+by the fast page fault path.
+
+3. Reference
 ------------

 Name:		kvm_lock
@@ -23,3 +163,9 @@ Arch:		x86
 Protects:	- kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
 		- tsc offset in vmcb
 Comment:	'raw' because updating the tsc offsets must not be preempted.
+
+Name:		kvm->mmu_lock
+Type:		spinlock_t
+Arch:		any
+Protects:	- shadow page/shadow tlb entry
+Comment:	it is a spinlock since it will be used in mmu notifier.
-- 
1.7.7.6



* Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
  2012-04-20  8:17 ` [PATCH v3 2/9] KVM: MMU: abstract spte write-protect Xiao Guangrong
@ 2012-04-20 21:33   ` Marcelo Tosatti
  2012-04-21  1:10     ` Takuya Yoshikawa
  2012-04-21  3:24     ` Xiao Guangrong
  0 siblings, 2 replies; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-20 21:33 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Fri, Apr 20, 2012 at 04:17:47PM +0800, Xiao Guangrong wrote:
> Introduce a common function that abstracts spte write protection, to
> clean up the code
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c |   60 ++++++++++++++++++++++++++++++---------------------
>  1 files changed, 35 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4a3cc18..e70ff38 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1041,6 +1041,34 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
>  		rmap_remove(kvm, sptep);
>  }
> 
> +/* Return true if the spte is dropped. */
> +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
> +			       bool *flush)
> +{
> +	u64 spte = *sptep;
> +
> +	if (!is_writable_pte(spte))
> +		return false;
> +
> +	*flush |= true;
> +
> +	if (large) {
> +		pgprintk("rmap_write_protect(large): spte %p %llx\n",
> +			 spte, *spte);
> +		BUG_ON(!is_large_pte(spte));
> +
> +		drop_spte(kvm, sptep);
> +		--kvm->stat.lpages;
> +		return true;
> +	}
> +
> +	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
> +	spte = spte & ~PT_WRITABLE_MASK;
> +	mmu_spte_update(sptep, spte);
> +
> +	return false;
> +}
> +
>  static bool
>  __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
>  {
> @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
> 
>  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
>  		BUG_ON(!(*sptep & PT_PRESENT_MASK));
> -		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> -
> -		if (!is_writable_pte(*sptep)) {
> -			sptep = rmap_get_next(&iter);
> -			continue;
> -		}
> -
> -		if (level == PT_PAGE_TABLE_LEVEL) {
> -			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
> -			sptep = rmap_get_next(&iter);
> -		} else {
> -			BUG_ON(!is_large_pte(*sptep));
> -			drop_spte(kvm, sptep);
> -			--kvm->stat.lpages;

It is preferable to remove all large sptes including read-only ones, the
current behaviour, than to verify that no read->write transition can
occur in fault paths (fault paths which are increasing in number).



* Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
  2012-04-20  8:18 ` [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit Xiao Guangrong
@ 2012-04-20 21:39   ` Marcelo Tosatti
  2012-04-21  3:30     ` Xiao Guangrong
  0 siblings, 1 reply; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-20 21:39 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Fri, Apr 20, 2012 at 04:18:47PM +0800, Xiao Guangrong wrote:
> This bit indicates whether the spte is allowed to be writable, which
> means the gpte of this spte is writable and the pfn pointed to by
> this spte is writable on the host
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c |   13 ++++++-------
>  1 files changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e70ff38..dd984b6 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
>  #define CREATE_TRACE_POINTS
>  #include "mmutrace.h"
> 
> -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> +#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> +#define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> 
>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> 
> @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
>  			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
>  			new_spte |= (u64)new_pfn << PAGE_SHIFT;
> 
> -			new_spte &= ~PT_WRITABLE_MASK;
> -			new_spte &= ~SPTE_HOST_WRITEABLE;
> -			new_spte &= ~shadow_accessed_mask;
> +			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
> +				      shadow_accessed_mask | SPTE_ALLOW_WRITE);

Each bit should have a distinct meaning. Here the host pte is being
write-protected, which means only the SPTE_HOST_WRITEABLE bit
should be cleared.


* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-20  8:19 ` [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
@ 2012-04-20 21:52   ` Marcelo Tosatti
  2012-04-21  0:40     ` Marcelo Tosatti
  2012-04-21  3:47     ` Xiao Guangrong
  0 siblings, 2 replies; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-20 21:52 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
> If this bit is set, it means the W bit of the spte is cleared due
> to shadow page table protection
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
>  1 files changed, 37 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index dd984b6..eb02fc4 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
> 
>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
> 
>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> 
> @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
>  		rmap_remove(kvm, sptep);
>  }
> 
> +static bool spte_wp_by_dirty_log(u64 spte)
> +{
> +	WARN_ON(is_writable_pte(spte));
> +
> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> +}

Is the information accurate? Say:

- dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
- shadow gfn, rmap_write_protect finds page not WRITABLE.
- spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.

BTW,

"introduce SPTE_ALLOW_WRITE bit

This bit indicates whether the spte is allowed to be writable, which
means the gpte of this spte is writable and the pfn pointed to by
this spte is writable on the host"

Other than the fact that each bit should have one meaning, how
can this bit be accurate without write protection of the gpte?

As soon as guest writes to gpte, information in bit is outdated.



* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-20 21:52   ` Marcelo Tosatti
@ 2012-04-21  0:40     ` Marcelo Tosatti
  2012-04-21  0:55       ` Marcelo Tosatti
  2012-04-21  4:00       ` Xiao Guangrong
  2012-04-21  3:47     ` Xiao Guangrong
  1 sibling, 2 replies; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-21  0:40 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote:
> On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
> > If this bit is set, it means the W bit of the spte is cleared due
> > to shadow page table protection
> > 
> > Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> > ---
> >  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
> >  1 files changed, 37 insertions(+), 19 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index dd984b6..eb02fc4 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
> > 
> >  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> >  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> > +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
> > 
> >  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > 
> > @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> >  		rmap_remove(kvm, sptep);
> >  }
> > 
> > +static bool spte_wp_by_dirty_log(u64 spte)
> > +{
> > +	WARN_ON(is_writable_pte(spte));
> > +
> > +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> > +}
> 
> Is the information accurate? Say:
> 
> - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
> - shadow gfn, rmap_write_protect finds page not WRITABLE.
> - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
> 
> BTW,
> 
> "introduce SPTE_ALLOW_WRITE bit
> 
> This bit indicates whether the spte is allowed to be writable, which
> means the gpte of this spte is writable and the pfn pointed to by
> this spte is writable on the host"
> 
> Other than the fact that each bit should have one meaning, how
> can this bit be accurate without write protection of the gpte?
> 
> As soon as guest writes to gpte, information in bit is outdated.

Ok, i found one example where mmu_lock was expecting sptes not 
to change:


VCPU0				VCPU1

- read-only gpte
- read-only spte
- write fault
- spte = *sptep
				guest write to gpte, set writable bit
				spte writable
				parent page unsync
				guest write to gpte writable bit clear
				guest invlpg updates spte to RO
				sync_page
				enter set_spte from sync_page
- cmpxchg(spte) is now writable
[window where another vcpu can
cache spte with writable bit
set]

				if (is_writable_pte(entry) && !is_writable_pte(*sptep))
					kvm_flush_remote_tlbs(vcpu->kvm);

The flush is not executed because spte was read-only (which is 
a correct assumption as long as spte updates are protected
by mmu_lock).

So this is an example of implicit assumptions which break if you update
spte without mmu_lock. Certainly there are more cases. :(




* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  0:40     ` Marcelo Tosatti
@ 2012-04-21  0:55       ` Marcelo Tosatti
  2012-04-21  1:38         ` Takuya Yoshikawa
  2012-04-21  4:29         ` Xiao Guangrong
  2012-04-21  4:00       ` Xiao Guangrong
  1 sibling, 2 replies; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-21  0:55 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Fri, Apr 20, 2012 at 09:40:30PM -0300, Marcelo Tosatti wrote:
> On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote:
> > On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
> > > If this bit is set, it means the W bit of the spte is cleared due
> > > to shadow page table protection
> > > 
> > > Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> > > ---
> > >  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
> > >  1 files changed, 37 insertions(+), 19 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index dd984b6..eb02fc4 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
> > > 
> > >  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> > >  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> > > +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
> > > 
> > >  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > > 
> > > @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> > >  		rmap_remove(kvm, sptep);
> > >  }
> > > 
> > > +static bool spte_wp_by_dirty_log(u64 spte)
> > > +{
> > > +	WARN_ON(is_writable_pte(spte));
> > > +
> > > +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> > > +}
> > 
> > Is the information accurate? Say:
> > 
> > - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
> > - shadow gfn, rmap_write_protect finds page not WRITABLE.
> > - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
> > 
> > BTW,
> > 
> > "introduce SPTE_ALLOW_WRITE bit
> > 
> > This bit indicates whether the spte is allowed to be writable, which
> > means the gpte of this spte is writable and the pfn pointed to by
> > this spte is writable on the host"
> > 
> > Other than the fact that each bit should have one meaning, how
> > can this bit be accurate without write protection of the gpte?
> > 
> > As soon as guest writes to gpte, information in bit is outdated.
> 
> Ok, i found one example where mmu_lock was expecting sptes not 
> to change:
> 
> 
> VCPU0				VCPU1
> 
> - read-only gpte
> - read-only spte
> - write fault
> - spte = *sptep
> 				guest write to gpte, set writable bit
> 				spte writable
> 				parent page unsync
> 				guest write to gpte writable bit clear
> 				guest invlpg updates spte to RO
> 				sync_page
> 				enter set_spte from sync_page
> - cmpxchg(spte) is now writable
> [window where another vcpu can
> cache spte with writable bit
> set]
> 
> 				if (is_writable_pte(entry) && !is_writable_pte(*sptep))
> 					kvm_flush_remote_tlbs(vcpu->kvm);
> 
> The flush is not executed because spte was read-only (which is 
> a correct assumption as long as spte updates are protected
> by mmu_lock).
> 
> So this is an example of implicit assumptions which break if you update
> spte without mmu_lock. Certainly there are more cases. :(

OK, i now see you mentioned a similar case in the document, for
rmap_write_protect.

More important than the particular TLB flush case, the point is that
every piece of code that reads and writes sptes must now be aware that
mmu_lock alone does not guarantee stability. Everything must be audited.

Where does the bulk of the improvement come from, again? If there is little
or no mmu_lock contention (for which, to be honest, we have no consistent
data in your testcase), is it the bouncing of mmu_lock's cacheline that hurts?



* Re: [PATCH v3 0/9] KVM: MMU: fast page fault
  2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
                   ` (8 preceding siblings ...)
  2012-04-20  8:21 ` [PATCH v3 9/9] KVM: MMU: document mmu-lock and fast page fault Xiao Guangrong
@ 2012-04-21  0:59 ` Marcelo Tosatti
  9 siblings, 0 replies; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-21  0:59 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Fri, Apr 20, 2012 at 04:16:52PM +0800, Xiao Guangrong wrote:
> Changelog:
> 
> - split the long series for better review. This is the core
>   implementation of fast page fault.
> - document fast page fault in locking.txt

Great, this is a big improvement, thanks.



* Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
  2012-04-20 21:33   ` Marcelo Tosatti
@ 2012-04-21  1:10     ` Takuya Yoshikawa
  2012-04-21  4:34       ` Xiao Guangrong
  2012-04-21  3:24     ` Xiao Guangrong
  1 sibling, 1 reply; 35+ messages in thread
From: Takuya Yoshikawa @ 2012-04-21  1:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Fri, 20 Apr 2012 18:33:19 -0300
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> It is preferable to remove all large sptes including read-only ones, the
> current behaviour, than to verify that no read->write transition can
> occur in fault paths (fault paths which are increasing in number).
> 

I think we should use separate function than spte_write_protect() for
the large case.

	Takuya


* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  0:55       ` Marcelo Tosatti
@ 2012-04-21  1:38         ` Takuya Yoshikawa
  2012-04-21  4:29         ` Xiao Guangrong
  1 sibling, 0 replies; 35+ messages in thread
From: Takuya Yoshikawa @ 2012-04-21  1:38 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Fri, 20 Apr 2012 21:55:55 -0300
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> More important than the particular TLB flush case, the point is that
> every piece of code that reads and writes sptes must now be aware that
> mmu_lock alone does not guarantee stability. Everything must be audited.

In addition, please give me some stress-test cases to verify these in
real environments.  Live migration with KSM, with notifier calls, etc.?

Although the current logic is verified by the dirty-log api test, the new
logic may need another api test program.

Note: the problem is that live migration can fail silently.  We cannot know
whether the data loss is from a guest-side problem or from the get_dirty side.

> Where does the bulk of the improvement come from, again? If there is little
> or no mmu_lock contention (for which, to be honest, we have no consistent
> data in your testcase), is it the bouncing of mmu_lock's cacheline that hurts?

This week, I was doing some simplified "worst-latency-tests" for my work.
It was more difficult than I thought.

But Xiao's "lock-less" work should show the reduction in mmu_lock contention
more easily, if there really is some.

To make things simple, e.g., we can do the same kind of write-loop as the
XBZRLE people are doing in the guest - with more VCPUs if possible.

Thanks,
	Takuya


* Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
  2012-04-20 21:33   ` Marcelo Tosatti
  2012-04-21  1:10     ` Takuya Yoshikawa
@ 2012-04-21  3:24     ` Xiao Guangrong
  2012-04-21  4:18       ` Marcelo Tosatti
  1 sibling, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  3:24 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 05:33 AM, Marcelo Tosatti wrote:


>>  static bool
>>  __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
>>  {
>> @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
>>
>>  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
>>  		BUG_ON(!(*sptep & PT_PRESENT_MASK));
>> -		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
>> -
>> -		if (!is_writable_pte(*sptep)) {
>> -			sptep = rmap_get_next(&iter);
>> -			continue;
>> -		}
>> -
>> -		if (level == PT_PAGE_TABLE_LEVEL) {
>> -			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
>> -			sptep = rmap_get_next(&iter);
>> -		} else {
>> -			BUG_ON(!is_large_pte(*sptep));
>> -			drop_spte(kvm, sptep);
>> -			--kvm->stat.lpages;
> 
> It is preferable to remove all large sptes including read-only ones, the


It can cause page faults even when reading memory through these large sptes.

Actually, Avi suggested making large writable sptes read-only
(not dropped) on this path.

> current behaviour, than to verify that no read->write transition can
> occur in fault paths (fault paths which are increasing in number).


Yes, the small spte also has an issue (finding a write-protected spte in
fault paths). Later, the second part of this patchset will introduce the
rmap.WRITE_PROTECTED bit; then we can do a fast check before calling
fast page fault.



* Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
  2012-04-20 21:39   ` Marcelo Tosatti
@ 2012-04-21  3:30     ` Xiao Guangrong
  2012-04-21  4:22       ` Marcelo Tosatti
  0 siblings, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  3:30 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 05:39 AM, Marcelo Tosatti wrote:


>> @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
>>  			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
>>  			new_spte |= (u64)new_pfn << PAGE_SHIFT;
>>
>> -			new_spte &= ~PT_WRITABLE_MASK;
>> -			new_spte &= ~SPTE_HOST_WRITEABLE;
>> -			new_spte &= ~shadow_accessed_mask;
>> +			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
>> +				      shadow_accessed_mask | SPTE_ALLOW_WRITE);
> 
> Each bit should have a distinct meaning. Here the host pte is being
> write-protected, which means only the SPTE_HOST_WRITEABLE bit
> should be cleared.


Hmm, there is no problem if SPTE_ALLOW_WRITE is not cleared.

But the meaning of SPTE_ALLOW_WRITE would become strange: we would see a
spte with spte.SPTE_ALLOW_WRITE = 1 (meaning the spte is writable on host
and guest) and spte.SPTE_HOST_WRITEABLE = 0 (meaning the spte is read-only
on the host).


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-20 21:52   ` Marcelo Tosatti
  2012-04-21  0:40     ` Marcelo Tosatti
@ 2012-04-21  3:47     ` Xiao Guangrong
  2012-04-21  4:38       ` Marcelo Tosatti
  1 sibling, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  3:47 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 05:52 AM, Marcelo Tosatti wrote:

> On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
>> If this bit is set, it means the W bit of the spte is cleared due
>> to shadow page table protection
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
>>  1 files changed, 37 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index dd984b6..eb02fc4 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
>>
>>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>>  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
>> +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
>>
>>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>>
>> @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
>>  		rmap_remove(kvm, sptep);
>>  }
>>
>> +static bool spte_wp_by_dirty_log(u64 spte)
>> +{
>> +	WARN_ON(is_writable_pte(spte));
>> +
>> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
>> +}
> 
> Is the information accurate? Say:
> 
> - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
> - shadow gfn, rmap_write_protect finds page not WRITABLE.
> - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.


It cannot happen: rmap_write_protect will set SPTE_WRITE_PROTECT
even if the spte is not WRITABLE, please see:

+	if (page_table_protect && spte_wp_by_dirty_log(spte))
+		goto reset_spte;
+
+	return false;
+
+reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 	spte = spte & ~PT_WRITABLE_MASK;
+	if (page_table_protect)
+		spte |= SPTE_WRITE_PROTECT;
 	mmu_spte_update(sptep, spte);
-
 	return false;
 }

> 
> BTW,
> 
> "introduce SPTE_ALLOW_WRITE bit
> 
> This bit indicates whether the spte is allowed to be writable, which
> means the gpte of this spte is writable and the pfn pointed to by
> this spte is writable on the host"
> 
> Other than the fact that each bit should have one meaning, how
> can this bit be accurate without write protection of the gpte?
> 


Does the above explanation ensure that the meaning of this bit is accurate?
Or is there another case? :)

> As soon as guest writes to gpte, information in bit is outdated.
> 


The bit is updated whenever the spte is updated.

When the guest writes the gpte, the spte is not updated immediately;
yes, the bit is outdated at that point, but that is ok since the TLB
has not been flushed.

After the TLB flush, the bit is consistent with the gpte again.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  0:40     ` Marcelo Tosatti
  2012-04-21  0:55       ` Marcelo Tosatti
@ 2012-04-21  4:00       ` Xiao Guangrong
  2012-04-24  0:45         ` Marcelo Tosatti
  1 sibling, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  4:00 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 08:40 AM, Marcelo Tosatti wrote:

> On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote:
>> On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
>>> If this bit is set, it means the W bit of the spte is cleared due
>>> to shadow page table protection
>>>
>>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>>> ---
>>>  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
>>>  1 files changed, 37 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index dd984b6..eb02fc4 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
>>>
>>>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>>>  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
>>> +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
>>>
>>>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>>>
>>> @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
>>>  		rmap_remove(kvm, sptep);
>>>  }
>>>
>>> +static bool spte_wp_by_dirty_log(u64 spte)
>>> +{
>>> +	WARN_ON(is_writable_pte(spte));
>>> +
>>> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
>>> +}
>>
>> Is the information accurate? Say:
>>
>> - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
>> - shadow gfn, rmap_write_protect finds page not WRITABLE.
>> - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
>>
>> BTW,
>>
>> "introduce SPTE_ALLOW_WRITE bit
>>
>> This bit indicates whether the spte is allowed to be writable, which
>> means the gpte of this spte is writable and the pfn pointed to by
>> this spte is writable on the host"
>>
>> Other than the fact that each bit should have one meaning, how
>> can this bit be accurate without write protection of the gpte?
>>
>> As soon as guest writes to gpte, information in bit is outdated.
> 
> OK, I found one example where mmu_lock was expecting sptes not
> to change:
> 
> 
> VCPU0				VCPU1
> 
> - read-only gpte
> - read-only spte
> - write fault


It is not true: the gpte is read-only and it is a write fault, so we
should inject the page fault into the guest; the fast page fault path is not called. :)

> - spte = *sptep
> 				guest write to gpte, set writable bit
> 				spte writable
> 				parent page unsync
> 				guest write to gpte writable bit clear
> 				guest invlpg updates spte to RO
> 				sync_page
> 				enter set_spte from sync_page
> - cmpxchg(spte) is now writable
> [window where another vcpu can
> cache spte with writable bit
> set]
> 
> 				if (is_writable_pte(entry) && !is_writable_pte(*sptep))
> 					kvm_flush_remote_tlbs(vcpu->kvm);
> 
> The flush is not executed because spte was read-only (which is 
> a correct assumption as long as spte updates are protected
> by mmu_lock).
> 


It is also not true: the TLB flush in set_spte is there to ensure that
rmap_write_protect works correctly, but rmap_write_protect flushes TLBs itself
even if the spte can be changed by fast page fault.

> So this is an example of implicit assumptions which break if you update
> spte without mmu_lock. Certainly there are more cases. :(


We only need to care about the paths that depend on spte.WRITABLE == 0, since
only these sptes have a chance of being changed outside of mmu_lock.

The most troublesome case is rmap_write_protect, which needs to flush TLBs
to protect the shadow page table.

I think it is not too hard to check. :)



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
  2012-04-21  3:24     ` Xiao Guangrong
@ 2012-04-21  4:18       ` Marcelo Tosatti
  2012-04-21  6:52         ` Xiao Guangrong
  0 siblings, 1 reply; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-21  4:18 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Sat, Apr 21, 2012 at 11:24:54AM +0800, Xiao Guangrong wrote:
> On 04/21/2012 05:33 AM, Marcelo Tosatti wrote:
> 
> 
> >>  static bool
> >>  __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
> >>  {
> >> @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
> >>
> >>  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
> >>  		BUG_ON(!(*sptep & PT_PRESENT_MASK));
> >> -		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> >> -
> >> -		if (!is_writable_pte(*sptep)) {
> >> -			sptep = rmap_get_next(&iter);
> >> -			continue;
> >> -		}
> >> -
> >> -		if (level == PT_PAGE_TABLE_LEVEL) {
> >> -			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
> >> -			sptep = rmap_get_next(&iter);
> >> -		} else {
> >> -			BUG_ON(!is_large_pte(*sptep));
> >> -			drop_spte(kvm, sptep);
> >> -			--kvm->stat.lpages;
> > 
> > It is preferable to remove all large sptes including read-only ones, the
> 
> 
> It can cause page faults even when reading memory through these large sptes.
> 
> Actually, Avi suggested making large writable sptes read-only
> (not dropping them) on this path.

See commits e49146dce8c3dc6f4485c1904b6587855f393e71,
38187c830cab84daecb41169948467f1f19317e3 for issues
with large read-only sptes.

> > current behaviour, rather than to verify that no read->write transition can
> > occur in fault paths (fault paths which are increasing in number).
> 
> 
> Yes, small sptes also have this issue (finding a write-protected spte in
> the fault paths). Later, the second part of this patchset will introduce
> an rmap.WRITE_PROTECTED bit, so that we can do a fast check before calling
> the fast page fault path.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
  2012-04-21  3:30     ` Xiao Guangrong
@ 2012-04-21  4:22       ` Marcelo Tosatti
  2012-04-21  6:55         ` Xiao Guangrong
  2012-04-22 15:12         ` Avi Kivity
  0 siblings, 2 replies; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-21  4:22 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Sat, Apr 21, 2012 at 11:30:55AM +0800, Xiao Guangrong wrote:
> On 04/21/2012 05:39 AM, Marcelo Tosatti wrote:
> 
> 
> >> @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
> >>  			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
> >>  			new_spte |= (u64)new_pfn << PAGE_SHIFT;
> >>
> >> -			new_spte &= ~PT_WRITABLE_MASK;
> >> -			new_spte &= ~SPTE_HOST_WRITEABLE;
> >> -			new_spte &= ~shadow_accessed_mask;
> >> +			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
> >> +				      shadow_accessed_mask | SPTE_ALLOW_WRITE);
> > 
> > Each bit should have a distinct meaning. Here the host pte is being
> > write-protected, which means only the SPTE_HOST_WRITEABLE bit
> > should be cleared.
> 
> 
> Hmm, there is no problem if SPTE_ALLOW_WRITE is not cleared.
> 
> But the meaning of SPTE_ALLOW_WRITE would become strange: we would see a
> spte with spte.SPTE_ALLOW_WRITE = 1 (meaning the spte is writable on host
> and guest) and spte.SPTE_HOST_WRITEABLE = 0 (meaning the spte is read-only
> on the host).
 
You are combining the gpte writable bit and the host pte writable bit (which
are separate and independent of each other) into one bit.

SPTE_HOST_WRITEABLE already indicates whether the host pte is writable 
or not.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  0:55       ` Marcelo Tosatti
  2012-04-21  1:38         ` Takuya Yoshikawa
@ 2012-04-21  4:29         ` Xiao Guangrong
  1 sibling, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  4:29 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 08:55 AM, Marcelo Tosatti wrote:


>> So this is an example of implicit assumptions which break if you update
>> spte without mmu_lock. Certainly there are more cases. :(
> 
> OK, I now see you mentioned a similar case in the document, for
> rmap_write_protect.
> 
> More importantly than the particular flush TLB case, the point is
> every piece of code that reads and writes sptes must now be aware that
> mmu_lock alone does not guarantee stability. Everything must be audited.
> 


Yes, that is true, but it is not hard to audit the code, since we only
change the spte from read-only to writable; in addition, all the information
that fast page fault depends on comes from the spte itself.
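
For illustration, the lock-less update is essentially the cmpxchg sketch
below (a simplified user-space rendition of mine; the bit positions are
assumed, and the present/last-level checks done by the real fast page
fault path are elided):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define PT_WRITABLE_MASK	(1ULL << 1)	/* x86 PTE W bit */
#define PT_FIRST_AVAIL_BITS_SHIFT 9		/* x86 PTE AVL bits 9-11 */
#define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))

static bool fast_pf_fix_spte(_Atomic uint64_t *sptep)
{
	uint64_t spte = atomic_load(sptep);	/* snapshot the spte once */

	/* Only sptes write-protected for dirty logging qualify: an
	 * already-writable spte needs no fix, and sptes without
	 * ALLOW_WRITE or with WRITE_PROTECT set must take the slow
	 * path under mmu_lock. */
	if ((spte & PT_WRITABLE_MASK) ||
	    !(spte & SPTE_ALLOW_WRITE) ||
	    (spte & SPTE_WRITE_PROTECT))
		return false;

	/* If the spte changed under us, give up; the caller retries
	 * under mmu_lock.  On success the real code also marks the
	 * gfn dirty for the dirty log. */
	return atomic_compare_exchange_strong(sptep, &spte,
					      spte | PT_WRITABLE_MASK);
}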


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
  2012-04-21  1:10     ` Takuya Yoshikawa
@ 2012-04-21  4:34       ` Xiao Guangrong
  0 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  4:34 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Marcelo Tosatti, Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 09:10 AM, Takuya Yoshikawa wrote:

> On Fri, 20 Apr 2012 18:33:19 -0300
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
>> It is preferable to remove all large sptes including read-only ones, the
>> current behaviour, rather than to verify that no read->write transition can
>> occur in fault paths (fault paths which are increasing in number).
>>
> 
> I think we should use separate function than spte_write_protect() for
> the large case.


I will introduce a separate function to handle large sptes when I
implement the idea of making writable sptes read-only.

But let us keep it as it is in this patchset for now.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  3:47     ` Xiao Guangrong
@ 2012-04-21  4:38       ` Marcelo Tosatti
  2012-04-21  7:25         ` Xiao Guangrong
  0 siblings, 1 reply; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-21  4:38 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Sat, Apr 21, 2012 at 11:47:46AM +0800, Xiao Guangrong wrote:
> On 04/21/2012 05:52 AM, Marcelo Tosatti wrote:
> 
> > On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
> >> If this bit is set, it means the W bit of the spte is cleared due
> >> to shadow page table protection
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >> ---
> >>  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
> >>  1 files changed, 37 insertions(+), 19 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index dd984b6..eb02fc4 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
> >>
> >>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> >>  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> >> +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
> >>
> >>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> >>
> >> @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> >>  		rmap_remove(kvm, sptep);
> >>  }
> >>
> >> +static bool spte_wp_by_dirty_log(u64 spte)
> >> +{
> >> +	WARN_ON(is_writable_pte(spte));
> >> +
> >> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> >> +}
> > 
> > Is the information accurate? Say:
> > 
> > - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
> > - shadow gfn, rmap_write_protect finds page not WRITABLE.
> > - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
> 
> 
> It cannot happen: rmap_write_protect will set SPTE_WRITE_PROTECT
> even if the spte is not WRITABLE, please see:
> 
> +	if (page_table_protect && spte_wp_by_dirty_log(spte))
> +		goto reset_spte;
> +
> +	return false;
> +
> +reset_spte:
>  	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
>  	spte = spte & ~PT_WRITABLE_MASK;
> +	if (page_table_protect)
> +		spte |= SPTE_WRITE_PROTECT;
>  	mmu_spte_update(sptep, spte);
> -
>  	return false;
>  }

Right. What about the sync path, the fault path, and the prefault path:
do they update SPTE_WRITE_PROTECT properly?

> > BTW,
> > 
> > "introduce SPTE_ALLOW_WRITE bit
> > 
> > This bit indicates whether the spte is allowed to be writable, which
> > means the gpte of this spte is writable and the pfn pointed to by
> > this spte is writable on the host"
> > 
> > Other than the fact that each bit should have one meaning, how
> > can this bit be accurate without write protection of the gpte?
> > 
> 
> Does the above explanation ensure that the meaning of this bit is accurate?
> Or is there another case? :)

No, it is out of sync with guest pte.

> > As soon as guest writes to gpte, information in bit is outdated.
> > 
> 
> 
> The bit is updated whenever the spte is updated.
> 
> When the guest writes the gpte, the spte is not updated immediately;
> yes, the bit is outdated at that point, but that is ok since the TLB
> has not been flushed.

Page faults cause TLB flushes.

From the Intel manual:

"In addition to the instructions identified above, page faults
invalidate entries in the TLBs and paging-structure caches. In
particular, a page-fault exception resulting from an attempt to use
a linear address will invalidate any TLB entries that are for a page
number corresponding to that linear address and that are associated with
the current PCID."

> After the TLB flush, the bit is consistent with the gpte again.

You must read the gpte before updating from ro->rw, unless you write
protect gpte. IIRC you were doing that in previous patches?


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
  2012-04-21  4:18       ` Marcelo Tosatti
@ 2012-04-21  6:52         ` Xiao Guangrong
  0 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  6:52 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 12:18 PM, Marcelo Tosatti wrote:

> On Sat, Apr 21, 2012 at 11:24:54AM +0800, Xiao Guangrong wrote:
>> On 04/21/2012 05:33 AM, Marcelo Tosatti wrote:
>>
>>
>>>>  static bool
>>>>  __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
>>>>  {
>>>> @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
>>>>
>>>>  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
>>>>  		BUG_ON(!(*sptep & PT_PRESENT_MASK));
>>>> -		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
>>>> -
>>>> -		if (!is_writable_pte(*sptep)) {
>>>> -			sptep = rmap_get_next(&iter);
>>>> -			continue;
>>>> -		}
>>>> -
>>>> -		if (level == PT_PAGE_TABLE_LEVEL) {
>>>> -			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
>>>> -			sptep = rmap_get_next(&iter);
>>>> -		} else {
>>>> -			BUG_ON(!is_large_pte(*sptep));
>>>> -			drop_spte(kvm, sptep);
>>>> -			--kvm->stat.lpages;
>>>
>>> It is preferable to remove all large sptes including read-only ones, the
>>
>>
>> It can cause page faults even when reading memory through these large sptes.
>>
>> Actually, Avi suggested making large writable sptes read-only
>> (not dropping them) on this path.
> 
> See commits e49146dce8c3dc6f4485c1904b6587855f393e71,
> 38187c830cab84daecb41169948467f1f19317e3 for issues
> with large read-only sptes.
> 


Yes, we need to check the code carefully when changing writable sptes to
read-only; let us discuss it in a separate patchset later. :)


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
  2012-04-21  4:22       ` Marcelo Tosatti
@ 2012-04-21  6:55         ` Xiao Guangrong
  2012-04-22 15:12         ` Avi Kivity
  1 sibling, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  6:55 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 12:22 PM, Marcelo Tosatti wrote:

> On Sat, Apr 21, 2012 at 11:30:55AM +0800, Xiao Guangrong wrote:
>> On 04/21/2012 05:39 AM, Marcelo Tosatti wrote:
>>
>>
>>>> @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
>>>>  			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
>>>>  			new_spte |= (u64)new_pfn << PAGE_SHIFT;
>>>>
>>>> -			new_spte &= ~PT_WRITABLE_MASK;
>>>> -			new_spte &= ~SPTE_HOST_WRITEABLE;
>>>> -			new_spte &= ~shadow_accessed_mask;
>>>> +			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
>>>> +				      shadow_accessed_mask | SPTE_ALLOW_WRITE);
>>>
>>> Each bit should have a distinct meaning. Here the host pte is being
>>> write-protected, which means only the SPTE_HOST_WRITEABLE bit
>>> should be cleared.
>>
>>
>> Hmm, there is no problem if SPTE_ALLOW_WRITE is not cleared.
>>
>> But the meaning of SPTE_ALLOW_WRITE would become strange: we would see a
>> spte with spte.SPTE_ALLOW_WRITE = 1 (meaning the spte is writable on host
>> and guest) and spte.SPTE_HOST_WRITEABLE = 0 (meaning the spte is read-only
>> on the host).
>  
> You are combining the gpte writable bit and the host pte writable bit (which
> are separate and independent of each other) into one bit.
> 
> SPTE_HOST_WRITEABLE already indicates whether the host pte is writable 
> or not.


Okay, i will split the meaning in next version! Thank you, Marcelo!


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  4:38       ` Marcelo Tosatti
@ 2012-04-21  7:25         ` Xiao Guangrong
  2012-04-24  0:24           ` Marcelo Tosatti
  0 siblings, 1 reply; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-21  7:25 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/21/2012 12:38 PM, Marcelo Tosatti wrote:

> On Sat, Apr 21, 2012 at 11:47:46AM +0800, Xiao Guangrong wrote:
>> On 04/21/2012 05:52 AM, Marcelo Tosatti wrote:
>>
>>> On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
>>>> If this bit is set, it means the W bit of the spte is cleared due
>>>> to shadow page table protection
>>>>
>>>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>>>> ---
>>>>  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
>>>>  1 files changed, 37 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>>> index dd984b6..eb02fc4 100644
>>>> --- a/arch/x86/kvm/mmu.c
>>>> +++ b/arch/x86/kvm/mmu.c
>>>> @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
>>>>
>>>>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>>>>  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
>>>> +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
>>>>
>>>>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>>>>
>>>> @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
>>>>  		rmap_remove(kvm, sptep);
>>>>  }
>>>>
>>>> +static bool spte_wp_by_dirty_log(u64 spte)
>>>> +{
>>>> +	WARN_ON(is_writable_pte(spte));
>>>> +
>>>> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
>>>> +}
>>>
>>> Is the information accurate? Say:
>>>
>>> - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
>>> - shadow gfn, rmap_write_protect finds page not WRITABLE.
>>> - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
>>
>>
>> It cannot happen: rmap_write_protect will set SPTE_WRITE_PROTECT
>> even if the spte is not WRITABLE, please see:
>>
>> +	if (page_table_protect && spte_wp_by_dirty_log(spte))
>> +		goto reset_spte;
>> +
>> +	return false;
>> +
>> +reset_spte:
>>  	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
>>  	spte = spte & ~PT_WRITABLE_MASK;
>> +	if (page_table_protect)
>> +		spte |= SPTE_WRITE_PROTECT;
>>  	mmu_spte_update(sptep, spte);
>> -
>>  	return false;
>>  }
> 
> Right. What about sync path, fault path, prefault path, do they update
> SPTE_WRITE_PROTECT properly?
> 


All of these cases go through set_spte() to update the spte,

@@ -2346,6 +2363,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
 			spte &= ~PT_WRITABLE_MASK;
+			spte |= SPTE_WRITE_PROTECT;

The SPTE_WRITE_PROTECT bit is set whenever the page is write-protected.

>>> BTW,
>>>
>>> "introduce SPTE_ALLOW_WRITE bit
>>>
>>> This bit indicates whether the spte is allowed to be writable, which
>>> means the gpte of this spte is writable and the pfn pointed to by
>>> this spte is writable on the host"
>>>
>>> Other than the fact that each bit should have one meaning, how
>>> can this bit be accurate without write protection of the gpte?
>>>
>>
>> Does the above explanation ensure that the meaning of this bit is accurate?
>> Or is there another case? :)
> 
> No, it is out of sync with guest pte.
> 
>>> As soon as guest writes to gpte, information in bit is outdated.
>>>
>>
>>
>> The bit is updated whenever the spte is updated.
>>
>> When the guest writes the gpte, the spte is not updated immediately;
>> yes, the bit is outdated at that point, but that is ok since the TLB
>> has not been flushed.
> 
> Page faults cause TLB flushes.
> 
> From the Intel manual:
> 
> "In addition to the instructions identified above, page faults
> invalidate entries in the TLBs and paging-structure caches. In
> particular, a page-fault exception resulting from an attempt to use
> a linear address will invalidate any TLB entries that are for a page
> number corresponding to that linear address and that are associated with
> the current PCID."
> 


Yes, the stale TLB entries are removed _after_ the page fault. On the page
fault path, the spte is correctly updated (the SPTE_ALLOW_WRITE bit is
cleared), so the fast page fault path fails to update the spte (since
SPTE_ALLOW_WRITE is not set, or the cmpxchg fails).

There is a window between the guest write and the shadow page fault; in
this window, fast page fault can make the spte writable. That is ok,
since the guest write instruction has not completed yet.

>> After the TLB flush, the bit is consistent with the gpte again.
> 
> You must read the gpte before updating from ro->rw, unless you write
> protect gpte. IIRC you were doing that in previous patches?
> 


No need. Please see the sequence below:

gpte.W = 1
spte is the shadow page entry of gpte.
spte.W = 0


      VCPU 0                         VCPU 1
guest write gpte.W = 0
                                guest write memory through gpte
                                fast page fault:
                                     cmpxchg spte + W

SPTE.SPTE_ALLOW_WRITE = 0 when
host emulate the write or sync shadow pages
(spte is zapped or read-only)

flush_tlb

return to guest
the guest write operation is completed.

It does not break anything.

Marcelo, I guess you mixed up "the gpte being written" and "access through
the gpte", yes? A write operation changes the page that the pte points to;
it does not change the pte itself.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
  2012-04-21  4:22       ` Marcelo Tosatti
  2012-04-21  6:55         ` Xiao Guangrong
@ 2012-04-22 15:12         ` Avi Kivity
  2012-04-23  7:24           ` Xiao Guangrong
  1 sibling, 1 reply; 35+ messages in thread
From: Avi Kivity @ 2012-04-22 15:12 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Xiao Guangrong, LKML, KVM

On 04/21/2012 07:22 AM, Marcelo Tosatti wrote:
> On Sat, Apr 21, 2012 at 11:30:55AM +0800, Xiao Guangrong wrote:
> > On 04/21/2012 05:39 AM, Marcelo Tosatti wrote:
> > 
> > 
> > >> @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
> > >>  			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
> > >>  			new_spte |= (u64)new_pfn << PAGE_SHIFT;
> > >>
> > >> -			new_spte &= ~PT_WRITABLE_MASK;
> > >> -			new_spte &= ~SPTE_HOST_WRITEABLE;
> > >> -			new_spte &= ~shadow_accessed_mask;
> > >> +			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
> > >> +				      shadow_accessed_mask | SPTE_ALLOW_WRITE);
> > > 
> > > Each bit should have a distinct meaning. Here the host pte is being
> > > write-protected, which means only the SPTE_HOST_WRITEABLE bit
> > > should be cleared.
> > 
> > 
> > Hmm, there is no problem if SPTE_ALLOW_WRITE is not cleared.
> >
> > But the meaning of SPTE_ALLOW_WRITE would become strange: we would see a
> > spte with spte.SPTE_ALLOW_WRITE = 1 (meaning the spte is writable on host
> > and guest) and spte.SPTE_HOST_WRITEABLE = 0 (meaning the spte is read-only
> > on the host).
>  
> You are combining the gpte writable bit and the host pte writable bit (which
> are separate and independent of each other) into one bit.
>
> SPTE_HOST_WRITEABLE already indicates whether the host pte is writable 
> or not.

Maybe we should rename SPTE_ALLOW_WRITE to SPTE_NOT_SHADOWED (or
SPTE_SHADOWED with the opposite meaning).

Alternatively, SPTE_MMU_WRITEABLE (complements SPTE_HOST_WRITEABLE).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
  2012-04-22 15:12         ` Avi Kivity
@ 2012-04-23  7:24           ` Xiao Guangrong
  0 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-23  7:24 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Xiao Guangrong, LKML, KVM

On 04/22/2012 11:12 PM, Avi Kivity wrote:

> On 04/21/2012 07:22 AM, Marcelo Tosatti wrote:
>> On Sat, Apr 21, 2012 at 11:30:55AM +0800, Xiao Guangrong wrote:
>>> On 04/21/2012 05:39 AM, Marcelo Tosatti wrote:
>>>
>>>
>>>>> @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
>>>>>  			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
>>>>>  			new_spte |= (u64)new_pfn << PAGE_SHIFT;
>>>>>
>>>>> -			new_spte &= ~PT_WRITABLE_MASK;
>>>>> -			new_spte &= ~SPTE_HOST_WRITEABLE;
>>>>> -			new_spte &= ~shadow_accessed_mask;
>>>>> +			new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
>>>>> +				      shadow_accessed_mask | SPTE_ALLOW_WRITE);
>>>>
>>>> Each bit should have a distinct meaning. Here the host pte is being
>>>> write-protected, which means only the SPTE_HOST_WRITEABLE bit
>>>> should be cleared.
>>>
>>>
>>> Hmm, there is no problem if SPTE_ALLOW_WRITE is not cleared.
>>>
>>> But the meaning of SPTE_ALLOW_WRITE would become strange: we would see a
>>> spte with spte.SPTE_ALLOW_WRITE = 1 (meaning the spte is writable on host
>>> and guest) and spte.SPTE_HOST_WRITEABLE = 0 (meaning the spte is read-only
>>> on the host).
>>  
>> You are combining the gpte writable bit and the host pte writable bit (which
>> are separate and independent of each other) into one bit.
>>
>> SPTE_HOST_WRITEABLE already indicates whether the host pte is writable 
>> or not.
> 
> Maybe we should rename SPTE_ALLOW_WRITE to SPTE_NOT_SHADOWED (or
> SPTE_SHADOWED with the opposite meaning).
> 
> Alternatively, SPTE_MMU_WRITEABLE (complements SPTE_HOST_WRITEABLE).
> 


I like SPTE_MMU_WRITEABLE :)
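
Presumably the defines would then end up looking something like this
(only the rename; bit positions kept as in this patchset):

#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

That is, SPTE_HOST_WRITEABLE tracks whether the host pte allows writes,
and SPTE_MMU_WRITEABLE tracks whether the MMU (the gpte plus shadow page
table protection) allows the spte to be made writable.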


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  7:25         ` Xiao Guangrong
@ 2012-04-24  0:24           ` Marcelo Tosatti
  0 siblings, 0 replies; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-24  0:24 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Sat, Apr 21, 2012 at 03:25:47PM +0800, Xiao Guangrong wrote:
> On 04/21/2012 12:38 PM, Marcelo Tosatti wrote:
> 
> > On Sat, Apr 21, 2012 at 11:47:46AM +0800, Xiao Guangrong wrote:
> >> On 04/21/2012 05:52 AM, Marcelo Tosatti wrote:
> >>
> >>> On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
> >>>> If this bit is set, it means the W bit of the spte is cleared due
> >>>> to shadow page table protection
> >>>>
> >>>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >>>> ---
> >>>>  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
> >>>>  1 files changed, 37 insertions(+), 19 deletions(-)
> >>>>
> >>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >>>> index dd984b6..eb02fc4 100644
> >>>> --- a/arch/x86/kvm/mmu.c
> >>>> +++ b/arch/x86/kvm/mmu.c
> >>>> @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
> >>>>
> >>>>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> >>>>  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> >>>> +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
> >>>>
> >>>>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> >>>>
> >>>> @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> >>>>  		rmap_remove(kvm, sptep);
> >>>>  }
> >>>>
> >>>> +static bool spte_wp_by_dirty_log(u64 spte)
> >>>> +{
> >>>> +	WARN_ON(is_writable_pte(spte));
> >>>> +
> >>>> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> >>>> +}
> >>>
> >>> Is the information accurate? Say:
> >>>
> >>> - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
> >>> - shadow gfn, rmap_write_protect finds page not WRITABLE.
> >>> - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
> >>
> >>
> >> It cannot happen: rmap_write_protect will set SPTE_WRITE_PROTECT
> >> even if the spte is not WRITABLE, please see:
> >>
> >> +	if (page_table_protect && spte_wp_by_dirty_log(spte))
> >> +		goto reset_spte;
> >> +
> >> +	return false;
> >> +
> >> +reset_spte:
> >>  	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
> >>  	spte = spte & ~PT_WRITABLE_MASK;
> >> +	if (page_table_protect)
> >> +		spte |= SPTE_WRITE_PROTECT;
> >>  	mmu_spte_update(sptep, spte);
> >> -
> >>  	return false;
> >>  }
> > 
> > Right. What about the sync path, the fault path, and the prefault path:
> > do they update SPTE_WRITE_PROTECT properly?
> > 
> 
> 
> All of these cases go through set_spte() to update the spte,
> 
> @@ -2346,6 +2363,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  			ret = 1;
>  			pte_access &= ~ACC_WRITE_MASK;
>  			spte &= ~PT_WRITABLE_MASK;
> +			spte |= SPTE_WRITE_PROTECT;
> 
> The SPTE_WRITE_PROTECT bit is set whenever the page is write-protected.
> 
> >>> BTW,
> >>>
> >>> "introduce SPTE_ALLOW_WRITE bit
> >>>
> >>> This bit indicates whether the spte is allowed to be writable, which
> >>> means the gpte of this spte is writable and the pfn pointed to by
> >>> this spte is writable on the host"
> >>>
> >>> Other than the fact that each bit should have one meaning, how
> >>> can this bit be accurate without write protection of the gpte?
> >>>
> >>
> >> Does the above explanation ensure that the meaning of this bit is accurate?
> >> Or is there another case? :)
> > 
> > No, it is out of sync with guest pte.
> > 
> >>> As soon as guest writes to gpte, information in bit is outdated.
> >>>
> >>
> >>
> >> The bit is updated whenever the spte is updated.
> >>
> >> When the guest writes the gpte, the spte is not updated immediately;
> >> yes, the bit is outdated at that point, but that is ok since the TLB
> >> has not been flushed.
> > 
> > Page faults cause TLB flushes.
> > 
> > From the Intel manual:
> > 
> > "In addition to the instructions identified above, page faults
> > invalidate entries in the TLBs and paging-structure caches. In
> > particular, a page-fault exception resulting from an attempt to use
> > a linear address will invalidate any TLB entries that are for a page
> > number corresponding to that linear address and that are associated with
> > the current PCID."
> > 
> 
> 
> Yes, the stale TLB entries are removed _after_ the page fault. On the page
> fault path, the spte is correctly updated (the SPTE_ALLOW_WRITE bit is
> cleared), so the fast page fault path fails to update the spte (since
> SPTE_ALLOW_WRITE is not set, or the cmpxchg fails).
> 
> There is a window between the guest write and the shadow page fault; in
> this window, fast page fault can make the spte writable. That is ok,
> since the guest write instruction has not completed yet.

Yes, the TLB flush on a page fault happens after the page fault.

> >> After the TLB flush, the bit is consistent with the gpte again.
> > 
> > You must read the gpte before updating from ro->rw, unless you write
> > protect gpte. IIRC you were doing that in previous patches?
> > 
> 
> 
> No need. Please see the sequence below:
> 
> gpte.W = 1
> spte is the shadow page entry of gpte.
> spte.W = 0
> 
> 
>       VCPU 0                         VCPU 1
> guest write gpte.W = 0
>                                 guest write memory through gpte
>                                 fast page fault:
>                                      cmpxchg spte + W
> 
> SPTE.SPTE_ALLOW_WRITE = 0 when
> host emulate the write or sync shadow pages
> (spte is zapped or read-only)
> 
> flush_tlb
> 
> return to guest
> the guest write operation is completed.
> 
> It does not break anything.
> 
> Marcelo, I guess you mixed up "the gpte being written" and "access through
> the gpte", yes? A write operation changes the page that the pte points to;
> it does not change the pte itself.

No, but the TLB flush on page-fault is irrelevant because no software
should rely on it.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-21  4:00       ` Xiao Guangrong
@ 2012-04-24  0:45         ` Marcelo Tosatti
  2012-04-24  3:34           ` Xiao Guangrong
  0 siblings, 1 reply; 35+ messages in thread
From: Marcelo Tosatti @ 2012-04-24  0:45 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On Sat, Apr 21, 2012 at 12:00:00PM +0800, Xiao Guangrong wrote:
> On 04/21/2012 08:40 AM, Marcelo Tosatti wrote:
> 
> > On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote:
> >> On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
> >>> If this bit is set, it means the W bit of the spte is cleared due
> >>> to shadow page table protection
> >>>
> >>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >>> ---
> >>>  arch/x86/kvm/mmu.c |   56 ++++++++++++++++++++++++++++++++++-----------------
> >>>  1 files changed, 37 insertions(+), 19 deletions(-)
> >>>
> >>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >>> index dd984b6..eb02fc4 100644
> >>> --- a/arch/x86/kvm/mmu.c
> >>> +++ b/arch/x86/kvm/mmu.c
> >>> @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
> >>>
> >>>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> >>>  #define SPTE_ALLOW_WRITE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> >>> +#define SPTE_WRITE_PROTECT	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))
> >>>
> >>>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> >>>
> >>> @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> >>>  		rmap_remove(kvm, sptep);
> >>>  }
> >>>
> >>> +static bool spte_wp_by_dirty_log(u64 spte)
> >>> +{
> >>> +	WARN_ON(is_writable_pte(spte));
> >>> +
> >>> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> >>> +}
> >>
> >> Is the information accurate? Say:
> >>
> >> - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
> >> - shadow gfn, rmap_write_protect finds page not WRITABLE.
> >> - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
> >>
> >> BTW,
> >>
> >> "introduce SPTE_ALLOW_WRITE bit
> >>
> >> This bit indicates whether the spte is allowed to be writable, which
> >> means the gpte of this spte is writable and the pfn pointed to by
> >> this spte is writable on the host"
> >>
> >> Other than the fact that each bit should have one meaning, how
> >> can this bit be accurate without write protection of the gpte?
> >>
> >> As soon as guest writes to gpte, information in bit is outdated.
> > 
> > OK, I found one example where mmu_lock was expecting sptes not
> > to change:
> > 
> > 
> > VCPU0				VCPU1
> > 
> > - read-only gpte
> > - read-only spte
> > - write fault
> 
> 
> It is not true: the gpte is read-only and it is a write fault, so we
> should inject the page fault into the guest; the fast page fault path is not called. :)
> 
> > - spte = *sptep
> > 				guest write to gpte, set writable bit
> > 				spte writable
> > 				parent page unsync
> > 				guest write to gpte writable bit clear
> > 				guest invlpg updates spte to RO
> > 				sync_page
> > 				enter set_spte from sync_page
> > - cmpxchg(spte) is now writable
> > [window where another vcpu can
> > cache spte with writable bit
> > set]
> > 
> > 				if (is_writable_pte(entry) && !is_writable_pte(*sptep))
> > 					kvm_flush_remote_tlbs(vcpu->kvm);
> > 
> > The flush is not executed because spte was read-only (which is 
> > a correct assumption as long as spte updates are protected
> > by mmu_lock).
> 
> 
> It is also not true: the TLB flush in set_spte is there to ensure that
> rmap_write_protect works correctly, but rmap_write_protect flushes TLBs itself
> even if the spte can be changed by fast page fault.
> 
> > So this is an example of implicit assumptions which break if you update
> > spte without mmu_lock. Certainly there are more cases. :(
> 
> 
> We only need to care about the paths that depend on spte.WRITABLE == 0, since
> only these sptes have a chance of being changed outside of mmu_lock.
> 
> The most troublesome case is rmap_write_protect, which needs to flush TLBs
> to protect the shadow page table.
> 
> I think it is not too hard to check. :)

You are minimizing the possible impact these modifications have.

Perhaps you should prepare the code under mmu_lock to handle concurrent
spte R->W updates first, and only then introduce the concurrent updates,
in a way that makes it clear to somebody reading the code that parallel
updates can happen (say, read the spte once, work on a local copy, and
later re-read the spte).

I find it quite difficult to read the code as it is now. Introduce a
parallel operation on top, and complexity goes way up.
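
Concretely, something along these lines (an abstract sketch with
hypothetical names, not actual kernel code):

#include <stdatomic.h>
#include <stdint.h>

static void example_spte_access(_Atomic uint64_t *sptep)
{
	uint64_t spte = atomic_load(sptep);	/* read the spte once */

	/* ... make every decision on the local copy 'spte' ... */

	if (spte != atomic_load(sptep)) {
		/* The spte moved under us (e.g. the lock-less fast
		 * page fault set the W bit): redo the decision, or
		 * act conservatively, e.g. flush remote TLBs anyway. */
	}
}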




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
  2012-04-24  0:45         ` Marcelo Tosatti
@ 2012-04-24  3:34           ` Xiao Guangrong
  0 siblings, 0 replies; 35+ messages in thread
From: Xiao Guangrong @ 2012-04-24  3:34 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Avi Kivity, LKML, KVM

On 04/24/2012 08:45 AM, Marcelo Tosatti wrote:


>> I think it is not too hard to check. :)
> 
> You are minimizing the possible impact these modifications have.
> 
> Perhaps you should prepare the code under mmu_lock to handle concurrent
> spte R->W updates first, and only then introduce the concurrent updates,
> in a way that makes it clear to somebody reading the code that parallel
> updates can happen (say, read the spte once, work on a local copy, and
> later re-read the spte).
> 


Good idea. I will refine it in the next version. Thank you, Marcelo! :)


^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2012-04-24  3:34 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-20  8:16 [PATCH v3 0/9] KVM: MMU: fast page fault Xiao Guangrong
2012-04-20  8:17 ` [PATCH v3 1/9] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
2012-04-20  8:17 ` [PATCH v3 2/9] KVM: MMU: abstract spte write-protect Xiao Guangrong
2012-04-20 21:33   ` Marcelo Tosatti
2012-04-21  1:10     ` Takuya Yoshikawa
2012-04-21  4:34       ` Xiao Guangrong
2012-04-21  3:24     ` Xiao Guangrong
2012-04-21  4:18       ` Marcelo Tosatti
2012-04-21  6:52         ` Xiao Guangrong
2012-04-20  8:18 ` [PATCH v3 3/9] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
2012-04-20  8:18 ` [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit Xiao Guangrong
2012-04-20 21:39   ` Marcelo Tosatti
2012-04-21  3:30     ` Xiao Guangrong
2012-04-21  4:22       ` Marcelo Tosatti
2012-04-21  6:55         ` Xiao Guangrong
2012-04-22 15:12         ` Avi Kivity
2012-04-23  7:24           ` Xiao Guangrong
2012-04-20  8:19 ` [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
2012-04-20 21:52   ` Marcelo Tosatti
2012-04-21  0:40     ` Marcelo Tosatti
2012-04-21  0:55       ` Marcelo Tosatti
2012-04-21  1:38         ` Takuya Yoshikawa
2012-04-21  4:29         ` Xiao Guangrong
2012-04-21  4:00       ` Xiao Guangrong
2012-04-24  0:45         ` Marcelo Tosatti
2012-04-24  3:34           ` Xiao Guangrong
2012-04-21  3:47     ` Xiao Guangrong
2012-04-21  4:38       ` Marcelo Tosatti
2012-04-21  7:25         ` Xiao Guangrong
2012-04-24  0:24           ` Marcelo Tosatti
2012-04-20  8:19 ` [PATCH v3 6/9] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
2012-04-20  8:20 ` [PATCH v3 7/9] KVM: MMU: trace fast " Xiao Guangrong
2012-04-20  8:20 ` [PATCH v3 8/9] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
2012-04-20  8:21 ` [PATCH v3 9/9] KVM: MMU: document mmu-lock and fast page fault Xiao Guangrong
2012-04-21  0:59 ` [PATCH v3 0/9] KVM: MMU: " Marcelo Tosatti
