From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
To: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Avi Kivity <avi@redhat.com>,
	Marcelo Tosatti <mtosatti@redhat.com>,
	LKML <linux-kernel@vger.kernel.org>, KVM <kvm@vger.kernel.org>
Subject: [PATCH v4 07/10] KVM: MMU: lockless update spte on fast page fault path
Date: Wed, 25 Apr 2012 12:04:18 +0800
Message-ID: <4F9777C2.4090106@linux.vnet.ibm.com>
In-Reply-To: <4F9776D2.7020506@linux.vnet.ibm.com>

All the information we need on the fast page fault path is the
SPTE_HOST_WRITEABLE bit, the SPTE_MMU_WRITEABLE bit and the
SPTE_WRITE_PROTECT bit of the spte, and all of these bits can be
updated safely with cmpxchg.

However, the paths which depend on the spte.W bit need to be handled
carefully; they are documented in locking.txt in the next patch.
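
(Not part of the patch itself: as a standalone illustration of the cmpxchg
pattern described above, here is a minimal userspace sketch. The bit values
and the try_make_writable() helper are invented for the example; the real
code operates on shadow page table entries with cmpxchg64 inside the
lockless shadow-page walk.)

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Illustrative bit layout, not the kernel's actual spte bits. */
        #define HOST_WRITEABLE  (1ull << 0)
        #define MMU_WRITEABLE   (1ull << 1)
        #define WRITE_PROTECT   (1ull << 2)
        #define WRITABLE        (1ull << 3)

        static bool try_make_writable(_Atomic uint64_t *sptep)
        {
                uint64_t old = atomic_load(sptep);
                uint64_t need = HOST_WRITEABLE | MMU_WRITEABLE;

                /*
                 * Only sptes that were write-protected for dirty logging
                 * (host- and mmu-writable, not page-table-protected)
                 * qualify for the fast path.
                 */
                if ((old & need) != need || (old & WRITE_PROTECT))
                        return false;

                /*
                 * Publish the writable bit with a single cmpxchg: if anyone
                 * changed the spte since we read it (e.g. a concurrent
                 * write-protect), give up and fall back to the slow path.
                 */
                return atomic_compare_exchange_strong(sptep, &old,
                                                      old | WRITABLE);
        }

        int main(void)
        {
                _Atomic uint64_t spte = HOST_WRITEABLE | MMU_WRITEABLE;

                printf("fast fixed: %d\n", try_make_writable(&spte));
                printf("now writable: %d\n",
                       !!(atomic_load(&spte) & WRITABLE));
                return 0;
        }

The point of the cmpxchg is that a racing write-protect path either sees
the spte before the writable bit is set or after it, never a half-updated
value, so no lock is needed on the fast path.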

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |  104 +++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 96a9d5b..15f01a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -446,6 +446,15 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif

+static bool spte_wp_by_dirty_log(u64 spte)
+{
+	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
+
+	WARN_ON(is_writable_pte(spte));
+
+	return ((spte & mask) == mask) && !(spte & SPTE_WRITE_PROTECT);
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
 	if (!shadow_accessed_mask)
@@ -454,9 +463,18 @@ static bool spte_has_volatile_bits(u64 spte)
 	if (!is_shadow_present_pte(spte))
 		return false;

-	if ((spte & shadow_accessed_mask) &&
-	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
-		return false;
+	if (spte & shadow_accessed_mask) {
+		if (is_writable_pte(spte))
+			return !(spte & shadow_dirty_mask);
+
+		/*
+		 * If the spte is write-protected by dirty-log, it may
+		 * be marked writable on fast page fault path, then CPU
+		 * can modify the Dirty bit.
+		 */
+		if (!spte_wp_by_dirty_log(spte))
+			return false;
+	}

 	return true;
 }
@@ -1043,15 +1061,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }

-static bool spte_wp_by_dirty_log(u64 spte)
-{
-	u64 mask = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
-
-	WARN_ON(is_writable_pte(spte));
-
-	return ((spte & mask) == mask) && !(spte & SPTE_WRITE_PROTECT);
-}
-
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 			       bool *flush, bool page_table_protect)
@@ -1059,13 +1068,12 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 	u64 spte = *sptep;

 	if (is_writable_pte(spte)) {
-		*flush |= true;
-
 		if (large) {
 			pgprintk("rmap_write_protect(large): spte %p %llx\n",
 				 spte, *spte);
 			BUG_ON(!is_large_pte(spte));

+			*flush |= true;
 			drop_spte(kvm, sptep);
 			--kvm->stat.lpages;
 			return true;
@@ -1074,6 +1082,10 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 		goto reset_spte;
 	}

+	/*
+	 * We need to flush tlbs in this case: the fast page fault path
+	 * can mark the spte writable after we read the sptep.
+	 */
 	if (page_table_protect && spte_wp_by_dirty_log(spte))
 		goto reset_spte;

@@ -1081,6 +1093,8 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,

 reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+
+	*flush |= true;
 	spte = spte & ~PT_WRITABLE_MASK;
 	if (page_table_protect)
 		spte |= SPTE_WRITE_PROTECT;
@@ -2699,24 +2713,60 @@ static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
 }

 static bool
-fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-		  u64 *sptep, u64 spte)
+fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			  u64 *sptep, u64 spte, gfn_t gfn)
 {
-	gfn_t gfn;
+	pfn_t pfn;
+	bool ret = false;

-	spin_lock(&vcpu->kvm->mmu_lock);
+	/*
+	 * For the indirect spte, it is hard to get a stable gfn since
+	 * we just use a cmpxchg to avoid all the races which is not
+	 * enough to avoid the ABA problem: the host can arbitrarily
+	 * change spte and the mapping from gfn to pfn.
+	 *
+	 * What we do is call gfn_to_pfn_atomic to bind the gfn and the
+	 * pfn because after the call:
+	 * - we have held the refcount of pfn that means the pfn can not
+	 *   be freed and be reused for another gfn.
+	 * - the pfn is writable that means it can not be shared by different
+	 *   gfn.
+	 */
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);

-	/* The spte has been changed. */
-	if (*sptep != spte)
+	/* The host page is swapped out or merged. */
+	if (mmu_invalid_pfn(pfn))
 		goto exit;

-	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	ret = true;
+
+	if (pfn != spte_to_pfn(spte))
+		goto exit;

-	*sptep = spte | PT_WRITABLE_MASK;
-	mark_page_dirty(vcpu->kvm, gfn);
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);

 exit:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return ret;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			u64 *sptep, u64 spte)
+{
+	gfn_t gfn;
+
+	WARN_ON(!sp->role.direct);
+
+	/*
+	 * The gfn of direct spte is stable since it is calculated
+	 * by sp->gfn.
+	 */
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);

 	return true;
 }
@@ -2774,7 +2824,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,

 	sp = page_header(__pa(iterator.sptep));

-	ret = fast_pf_fix_spte(vcpu, sp, iterator.sptep, spte);
+	if (sp->role.direct)
+		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
+	else
+		ret = fast_pf_fix_indirect_spte(vcpu, sp, iterator.sptep,
+						spte, gfn);

 exit:
 	walk_shadow_page_lockless_end(vcpu);
-- 
1.7.7.6



Thread overview: 31+ messages
2012-04-25  4:00 [PATCH v4 00/10] KVM: MMU: fast page fault Xiao Guangrong
2012-04-25  4:01 ` [PATCH v4 01/10] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
2012-04-25  4:01 ` [PATCH v4 02/10] KVM: MMU: abstract spte write-protect Xiao Guangrong
2012-04-25  4:02 ` [PATCH v4 03/10] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
2012-04-25  4:02 ` [PATCH v4 04/10] KVM: MMU: introduce SPTE_MMU_WRITEABLE bit Xiao Guangrong
2012-04-25  4:03 ` [PATCH v4 05/10] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
2012-04-25  4:03 ` [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
2012-04-26 23:45   ` Marcelo Tosatti
2012-04-27  5:53     ` Xiao Guangrong
2012-04-27 14:52       ` Marcelo Tosatti
2012-04-28  6:10         ` Xiao Guangrong
2012-05-01  1:34           ` Marcelo Tosatti
2012-05-02  5:28             ` Xiao Guangrong
2012-05-02 21:07               ` Marcelo Tosatti
2012-05-03 11:26                 ` Xiao Guangrong
2012-05-05 14:08                   ` Marcelo Tosatti
2012-05-06  9:36                     ` Avi Kivity
2012-05-07  6:52                     ` Xiao Guangrong
2012-04-29  8:50         ` Takuya Yoshikawa
2012-05-01  2:31           ` Marcelo Tosatti
2012-05-02  5:39           ` Xiao Guangrong
2012-05-02 21:10             ` Marcelo Tosatti
2012-05-03 12:09               ` Xiao Guangrong
2012-05-03 12:13                 ` Avi Kivity
2012-05-03  0:15             ` Takuya Yoshikawa
2012-05-03 12:23               ` Xiao Guangrong
2012-05-03 12:40                 ` Takuya Yoshikawa
2012-04-25  4:04 ` Xiao Guangrong [this message]
2012-04-25  4:04 ` [PATCH v4 08/10] KVM: MMU: trace fast " Xiao Guangrong
2012-04-25  4:05 ` [PATCH v4 09/10] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
2012-04-25  4:06 ` [PATCH v4 10/10] KVM: MMU: document mmu-lock and fast page fault Xiao Guangrong
