From: Sean Christopherson <sean.j.christopherson@intel.com>
To: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	Eric van Tassell <Eric.VanTassell@amd.com>,
	Tom Lendacky <thomas.lendacky@amd.com>
Subject: [RFC PATCH 4/8] KVM: x86/mmu: Add infrastructure for pinning PFNs on demand
Date: Fri, 31 Jul 2020 14:23:19 -0700
Message-ID: <20200731212323.21746-5-sean.j.christopherson@intel.com>
In-Reply-To: <20200731212323.21746-1-sean.j.christopherson@intel.com>

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
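A note on the intended SPTE lifecycle, since the diff itself is terse: when a
pinned leaf SPTE is zapped, kvm_mmu_zap_pinned_spte() leaves SPTE_PINNED_MASK
and the PFN in the entry, so the SPTE is no longer shadow-present but still
remembers that its page is pinned.  A later fault on the same GFN hits that
state in mmu_set_spte(), which verifies the new PFN matches the recorded one,
keeps the pin without calling pin_spte() again, and returns RET_PF_UNZAPPED so
the optional unzap_pinned_spte() hook gets invoked.  A helper naming that
intermediate state could look like the sketch below; the helper is purely
illustrative and is not part of this patch:

	static inline bool is_zapped_pinned_spte(u64 spte)
	{
		/* Zapped while pinned: pin flag and PFN kept, present bits cleared. */
		return is_pinned_pte(spte) && !is_shadow_present_pte(spte);
	}
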
 arch/x86/include/asm/kvm_host.h |   7 ++
 arch/x86/kvm/mmu/mmu.c          | 111 ++++++++++++++++++++++++++------
 2 files changed, 99 insertions(+), 19 deletions(-)
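
For context, a minimal sketch of what a backend implementation of the new
kvm_x86_ops hooks might look like (patch 5/8 adds the real SVM/SEV version;
the function names and the refcount-based pinning below are assumptions made
for illustration only):

	/*
	 * Illustrative sketch: pin by taking a page reference so the PFN
	 * cannot be migrated or freed while a SPTE maps it, and drop that
	 * reference when the MMU removes the pinned SPTE for good.
	 */
	static bool svm_pin_spte(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
				 kvm_pfn_t pfn)
	{
		if (!pfn_valid(pfn))
			return false;

		get_page(pfn_to_page(pfn));
		return true;
	}

	static void svm_drop_pinned_spte(struct kvm *kvm, gfn_t gfn, int level,
					 kvm_pfn_t pfn)
	{
		put_page(pfn_to_page(pfn));
	}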

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1bab87a444d78..b14864f3e8e74 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1138,6 +1138,13 @@ struct kvm_x86_ops {
 
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
+	bool (*pin_spte)(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
+			 kvm_pfn_t pfn);
+	void (*drop_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level,
+				 kvm_pfn_t pfn);
+	void (*zap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+	void (*unzap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+
 	bool (*has_wbinvd_exit)(void);
 
 	/* Returns actual tsc_offset set in active VMCS */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 182f398036248..cab3b2f2f49c3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -133,6 +133,9 @@ module_param(dbg, bool, 0644);
 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
 #define SPTE_MMIO_MASK (3ULL << 52)
 
+/* Special SPTE flags that can only be used for non-MMIO SPTEs. */
+#define SPTE_PINNED_MASK	BIT_ULL(62)
+
 #define PT64_LEVEL_BITS 9
 
 #define PT64_LEVEL_SHIFT(level) \
@@ -211,6 +214,7 @@ enum {
 	RET_PF_EMULATE = 1,
 	RET_PF_INVALID = 2,
 	RET_PF_FIXED = 3,
+	RET_PF_UNZAPPED = 4,
 };
 
 struct pte_list_desc {
@@ -635,6 +639,11 @@ static bool is_shadow_present_pte(u64 pte)
 	return __is_shadow_present_pte(pte) && !is_mmio_spte(pte);
 }
 
+static bool is_pinned_pte(u64 pte)
+{
+	return !!(pte & SPTE_PINNED_MASK);
+}
+
 static int is_large_pte(u64 pte)
 {
 	return pte & PT_PAGE_SIZE_MASK;
@@ -937,15 +946,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
  * state bits, it is used to clear the last level sptep.
  * Returns the old PTE.
  */
-static u64 mmu_spte_clear_track_bits(u64 *sptep)
+static u64 __mmu_spte_clear_track_bits(u64 *sptep, u64 clear_value)
 {
 	kvm_pfn_t pfn;
 	u64 old_spte = *sptep;
 
 	if (!spte_has_volatile_bits(old_spte))
-		__update_clear_spte_fast(sptep, 0ull);
+		__update_clear_spte_fast(sptep, clear_value);
 	else
-		old_spte = __update_clear_spte_slow(sptep, 0ull);
+		old_spte = __update_clear_spte_slow(sptep, clear_value);
 
 	if (!is_shadow_present_pte(old_spte))
 		return old_spte;
@@ -968,6 +977,11 @@ static u64 mmu_spte_clear_track_bits(u64 *sptep)
 	return old_spte;
 }
 
+static inline u64 mmu_spte_clear_track_bits(u64 *sptep)
+{
+	return __mmu_spte_clear_track_bits(sptep, 0ull);
+}
+
 /*
  * Rules for using mmu_spte_clear_no_track:
  * Directly clear spte without caring the state bits of sptep,
@@ -1399,7 +1413,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmap_head);
 }
 
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void rmap_remove(struct kvm *kvm, u64 *spte, u64 old_spte)
 {
 	struct kvm_mmu_page *sp;
 	gfn_t gfn;
@@ -1409,6 +1423,10 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmap_head = gfn_to_rmap(kvm, gfn, sp);
 	__pte_list_remove(spte, rmap_head);
+
+	if (is_pinned_pte(old_spte))
+		kvm_x86_ops.drop_pinned_spte(kvm, gfn, sp->role.level - 1,
+					     spte_to_pfn(old_spte));
 }
 
 /*
@@ -1446,7 +1464,7 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 	iter->pos = 0;
 	sptep = iter->desc->sptes[iter->pos];
 out:
-	BUG_ON(!is_shadow_present_pte(*sptep));
+	BUG_ON(!is_shadow_present_pte(*sptep) && !is_pinned_pte(*sptep));
 	return sptep;
 }
 
@@ -1491,8 +1509,8 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	u64 old_spte = mmu_spte_clear_track_bits(sptep);
 
-	if (is_shadow_present_pte(old_spte))
-		rmap_remove(kvm, sptep);
+	if (is_shadow_present_pte(old_spte) || is_pinned_pte(old_spte))
+		rmap_remove(kvm, sptep, old_spte);
 }
 
 
@@ -1730,17 +1748,49 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
 }
 
+static bool kvm_mmu_zap_pinned_spte(struct kvm *kvm, u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+	kvm_pfn_t pfn;
+	gfn_t gfn;
+
+	if (!(*sptep & SPTE_PINNED_MASK))
+		return false;
+
+	sp = sptep_to_sp(sptep);
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	pfn = spte_to_pfn(*sptep);
+
+	if (kvm_x86_ops.zap_pinned_spte)
+		kvm_x86_ops.zap_pinned_spte(kvm, gfn, sp->role.level - 1);
+
+	__mmu_spte_clear_track_bits(sptep, SPTE_PINNED_MASK | pfn << PAGE_SHIFT);
+	return true;
+}
+
 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
 	bool flush = false;
 
-	while ((sptep = rmap_get_first(rmap_head, &iter))) {
+restart:
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
 		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
 
+		if (!is_shadow_present_pte(*sptep)) {
+			WARN_ON_ONCE(!is_pinned_pte(*sptep));
+			continue;
+		}
+
+		flush = true;
+
+		/* Keep the rmap if the SPTE is pinned. */
+		if (kvm_mmu_zap_pinned_spte(kvm, sptep))
+			continue;
+
 		pte_list_remove(rmap_head, sptep);
-		flush = true;
+		goto restart;
 	}
 
 	return flush;
@@ -1774,6 +1824,10 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 
 		need_flush = 1;
 
+		/* Pinned pages should not be relocated (obviously). */
+		if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+			continue;
+
 		if (pte_write(*ptep)) {
 			pte_list_remove(rmap_head, sptep);
 			goto restart;
@@ -2630,7 +2684,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 	struct kvm_mmu_page *child;
 
 	pte = *spte;
-	if (is_shadow_present_pte(pte)) {
+	if (is_shadow_present_pte(pte) || is_pinned_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level)) {
 			drop_spte(kvm, spte);
 			if (is_large_pte(pte))
@@ -2639,7 +2693,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-		return true;
+		return is_shadow_present_pte(pte);
 	}
 
 	if (is_mmio_spte(pte))
@@ -2987,10 +3041,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	u64 spte = 0;
 	int ret = 0;
 	struct kvm_mmu_page *sp;
+	bool is_mmio_pfn;
 
 	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
 		return 0;
 
+	is_mmio_pfn = kvm_is_mmio_pfn(pfn);
+
 	sp = sptep_to_sp(sptep);
 	if (sp_ad_disabled(sp))
 		spte |= SPTE_AD_DISABLED_MASK;
@@ -3023,15 +3080,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (level > PG_LEVEL_4K)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
-		spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+		spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, is_mmio_pfn);
 
 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
 	else
 		pte_access &= ~ACC_WRITE_MASK;
 
-	if (!kvm_is_mmio_pfn(pfn))
+	if (!is_mmio_pfn)
 		spte |= shadow_me_mask;
 
 	spte |= (u64)pfn << PAGE_SHIFT;
@@ -3065,6 +3121,12 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (speculative)
 		spte = mark_spte_for_access_track(spte);
 
+	if (is_pinned_pte(*sptep) ||
+	    (vcpu->arch.mmu->direct_map && !is_mmio_pfn &&
+	     kvm_x86_ops.pin_spte &&
+	     kvm_x86_ops.pin_spte(vcpu, gfn, level, pfn)))
+		spte |= SPTE_PINNED_MASK;
+
 set_pte:
 	if (mmu_spte_update(sptep, spte))
 		ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
@@ -3081,29 +3143,33 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	int set_spte_ret;
 	int ret = RET_PF_FIXED;
 	bool flush = false;
+	u64 pte = *sptep;
 
 	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
 		 *sptep, write_fault, gfn);
 
-	if (is_shadow_present_pte(*sptep)) {
+	if (is_shadow_present_pte(pte)) {
 		/*
 		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
 		 * the parent of the now unreachable PTE.
 		 */
-		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
+		if (level > PG_LEVEL_4K && !is_large_pte(pte)) {
 			struct kvm_mmu_page *child;
-			u64 pte = *sptep;
 
 			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, sptep);
 			flush = true;
-		} else if (pfn != spte_to_pfn(*sptep)) {
+		} else if (pfn != spte_to_pfn(pte)) {
 			pgprintk("hfn old %llx new %llx\n",
-				 spte_to_pfn(*sptep), pfn);
+				 spte_to_pfn(pte), pfn);
 			drop_spte(vcpu->kvm, sptep);
 			flush = true;
 		} else
 			was_rmapped = 1;
+	} else if (is_pinned_pte(pte)) {
+		WARN_ON_ONCE(pfn != spte_to_pfn(pte));
+		ret = RET_PF_UNZAPPED;
+		was_rmapped = 1;
 	}
 
 	set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
@@ -3136,6 +3202,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			rmap_recycle(vcpu, sptep, gfn);
 	}
 
+	if (ret == RET_PF_UNZAPPED && kvm_x86_ops.unzap_pinned_spte)
+		kvm_x86_ops.unzap_pinned_spte(vcpu->kvm, gfn, level - 1);
+
 	return ret;
 }
 
@@ -5921,6 +5990,10 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		sp = sptep_to_sp(sptep);
 		pfn = spte_to_pfn(*sptep);
 
+		/* Pinned page dirty logging is not supported. */
+		if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+			continue;
+
 		/*
 		 * We cannot do huge page mapping for indirect shadow pages,
 		 * which are found on the last rmap (level = 1) when not using
-- 
2.28.0



Thread overview: 15+ messages
2020-07-31 21:23 [RFC PATCH 0/8] KVM: x86/mmu: Introduce pinned SPTEs framework Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 1/8] KVM: x86/mmu: Return old SPTE from mmu_spte_clear_track_bits() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 2/8] KVM: x86/mmu: Use bits 2:0 to check for present SPTEs Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 3/8] KVM: x86/mmu: Refactor handling of not-present SPTEs in mmu_set_spte() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 4/8] KVM: x86/mmu: Add infrastructure for pinning PFNs on demand Sean Christopherson [this message]
2020-07-31 21:23 ` [RFC PATCH 5/8] KVM: SVM: Use the KVM MMU SPTE pinning hooks to pin pages on demand Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 6/8] KVM: x86/mmu: Move 'pfn' variable to caller of direct_page_fault() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 7/8] KVM: x86/mmu: Introduce kvm_mmu_map_tdp_page() for use by SEV Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 8/8] KVM: SVM: Pin SEV pages in MMU during sev_launch_update_data() Sean Christopherson
2020-08-03  3:00 ` [RFC PATCH 0/8] KVM: x86/mmu: Introduce pinned SPTEs framework Eric van Tassell
2020-08-03 15:00   ` Sean Christopherson
2020-08-03 15:52 ` Brijesh Singh
2020-08-03 17:16   ` Sean Christopherson
2020-08-04 19:40     ` Brijesh Singh
2020-10-27  3:22       ` Brijesh Singh
