From: Sean Christopherson <sean.j.christopherson@intel.com>
To: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	Eric van Tassell <Eric.VanTassell@amd.com>,
	Tom Lendacky <thomas.lendacky@amd.com>
Subject: [RFC PATCH 4/8] KVM: x86/mmu: Add infrastructure for pinning PFNs on demand
Date: Fri, 31 Jul 2020 14:23:19 -0700
Message-ID: <20200731212323.21746-5-sean.j.christopherson@intel.com>
In-Reply-To: <20200731212323.21746-1-sean.j.christopherson@intel.com>

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
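A note on the intended SPTE lifecycle, since the diff itself is terse: when a
pinned leaf SPTE is zapped, kvm_mmu_zap_pinned_spte() leaves SPTE_PINNED_MASK
and the PFN in the entry, so the SPTE is no longer shadow-present but still
remembers that its page is pinned.  A later fault on the same GFN hits that
state in mmu_set_spte(), which verifies the new PFN matches the recorded one,
keeps the pin without calling pin_spte() again, and returns RET_PF_UNZAPPED so
the optional unzap_pinned_spte() hook gets invoked.  A helper naming that
intermediate state could look like the sketch below; the helper is purely
illustrative and is not part of this patch:

	static inline bool is_zapped_pinned_spte(u64 spte)
	{
		/* Zapped while pinned: pin flag and PFN kept, present bits cleared. */
		return is_pinned_pte(spte) && !is_shadow_present_pte(spte);
	}
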
 arch/x86/include/asm/kvm_host.h |   7 ++
 arch/x86/kvm/mmu/mmu.c          | 111 ++++++++++++++++++++++++++------
 2 files changed, 99 insertions(+), 19 deletions(-)
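
For context, a minimal sketch of what a backend implementation of the new
kvm_x86_ops hooks might look like (patch 5/8 adds the real SVM/SEV version;
the function names and the refcount-based pinning below are assumptions made
for illustration only):

	/*
	 * Illustrative sketch: pin by taking a page reference so the PFN
	 * cannot be migrated or freed while a SPTE maps it, and drop that
	 * reference when the MMU removes the pinned SPTE for good.
	 */
	static bool svm_pin_spte(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
				 kvm_pfn_t pfn)
	{
		if (!pfn_valid(pfn))
			return false;

		get_page(pfn_to_page(pfn));
		return true;
	}

	static void svm_drop_pinned_spte(struct kvm *kvm, gfn_t gfn, int level,
					 kvm_pfn_t pfn)
	{
		put_page(pfn_to_page(pfn));
	}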

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1bab87a444d78..b14864f3e8e74 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1138,6 +1138,13 @@ struct kvm_x86_ops {
 
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
+	bool (*pin_spte)(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
+			 kvm_pfn_t pfn);
+	void (*drop_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level,
+				 kvm_pfn_t pfn);
+	void (*zap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+	void (*unzap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+
 	bool (*has_wbinvd_exit)(void);
 
 	/* Returns actual tsc_offset set in active VMCS */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 182f398036248..cab3b2f2f49c3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -133,6 +133,9 @@ module_param(dbg, bool, 0644);
 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
 #define SPTE_MMIO_MASK (3ULL << 52)
 
+/* Special SPTE flags that can only be used for non-MMIO SPTEs. */
+#define SPTE_PINNED_MASK	BIT_ULL(62)
+
 #define PT64_LEVEL_BITS 9
 
 #define PT64_LEVEL_SHIFT(level) \
@@ -211,6 +214,7 @@ enum {
 	RET_PF_EMULATE = 1,
 	RET_PF_INVALID = 2,
 	RET_PF_FIXED = 3,
+	RET_PF_UNZAPPED = 4,
 };
 
 struct pte_list_desc {
@@ -635,6 +639,11 @@ static bool is_shadow_present_pte(u64 pte)
 	return __is_shadow_present_pte(pte) && !is_mmio_spte(pte);
 }
 
+static bool is_pinned_pte(u64 pte)
+{
+	return !!(pte & SPTE_PINNED_MASK);
+}
+
 static int is_large_pte(u64 pte)
 {
 	return pte & PT_PAGE_SIZE_MASK;
@@ -937,15 +946,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
  * state bits, it is used to clear the last level sptep.
  * Returns the old PTE.
  */
-static u64 mmu_spte_clear_track_bits(u64 *sptep)
+static u64 __mmu_spte_clear_track_bits(u64 *sptep, u64 clear_value)
 {
 	kvm_pfn_t pfn;
 	u64 old_spte = *sptep;
 
 	if (!spte_has_volatile_bits(old_spte))
-		__update_clear_spte_fast(sptep, 0ull);
+		__update_clear_spte_fast(sptep, clear_value);
 	else
-		old_spte = __update_clear_spte_slow(sptep, 0ull);
+		old_spte = __update_clear_spte_slow(sptep, clear_value);
 
 	if (!is_shadow_present_pte(old_spte))
 		return old_spte;
@@ -968,6 +977,11 @@ static u64 mmu_spte_clear_track_bits(u64 *sptep)
 	return old_spte;
 }
 
+static inline u64 mmu_spte_clear_track_bits(u64 *sptep)
+{
+	return __mmu_spte_clear_track_bits(sptep, 0ull);
+}
+
 /*
  * Rules for using mmu_spte_clear_no_track:
  * Directly clear spte without caring the state bits of sptep,
@@ -1399,7 +1413,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmap_head);
 }
 
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void rmap_remove(struct kvm *kvm, u64 *spte, u64 old_spte)
 {
 	struct kvm_mmu_page *sp;
 	gfn_t gfn;
@@ -1409,6 +1423,10 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmap_head = gfn_to_rmap(kvm, gfn, sp);
 	__pte_list_remove(spte, rmap_head);
+
+	if (is_pinned_pte(old_spte))
+		kvm_x86_ops.drop_pinned_spte(kvm, gfn, sp->role.level - 1,
+					     spte_to_pfn(old_spte));
 }
 
 /*
@@ -1446,7 +1464,7 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 	iter->pos = 0;
 	sptep = iter->desc->sptes[iter->pos];
 out:
-	BUG_ON(!is_shadow_present_pte(*sptep));
+	BUG_ON(!is_shadow_present_pte(*sptep) && !is_pinned_pte(*sptep));
 	return sptep;
 }
 
@@ -1491,8 +1509,8 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	u64 old_spte = mmu_spte_clear_track_bits(sptep);
 
-	if (is_shadow_present_pte(old_spte))
-		rmap_remove(kvm, sptep);
+	if (is_shadow_present_pte(old_spte) || is_pinned_pte(old_spte))
+		rmap_remove(kvm, sptep, old_spte);
 }
 
 
@@ -1730,17 +1748,49 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
 }
 
+static bool kvm_mmu_zap_pinned_spte(struct kvm *kvm, u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+	kvm_pfn_t pfn;
+	gfn_t gfn;
+
+	if (!(*sptep & SPTE_PINNED_MASK))
+		return false;
+
+	sp = sptep_to_sp(sptep);
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	pfn = spte_to_pfn(*sptep);
+
+	if (kvm_x86_ops.zap_pinned_spte)
+		kvm_x86_ops.zap_pinned_spte(kvm, gfn, sp->role.level - 1);
+
+	__mmu_spte_clear_track_bits(sptep, SPTE_PINNED_MASK | pfn << PAGE_SHIFT);
+	return true;
+}
+
 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
 	bool flush = false;
 
-	while ((sptep = rmap_get_first(rmap_head, &iter))) {
+restart:
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
 		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
 
+		if (!is_shadow_present_pte(*sptep)) {
+			WARN_ON_ONCE(!is_pinned_pte(*sptep));
+			continue;
+		}
+
+		flush = true;
+
+		/* Keep the rmap if the SPTE is pinned. */
+		if (kvm_mmu_zap_pinned_spte(kvm, sptep))
+			continue;
+
 		pte_list_remove(rmap_head, sptep);
-		flush = true;
+		goto restart;
 	}
 
 	return flush;
@@ -1774,6 +1824,10 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 
 		need_flush = 1;
 
+		/* Pinned pages should not be relocated (obviously). */
+		if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+			continue;
+
 		if (pte_write(*ptep)) {
 			pte_list_remove(rmap_head, sptep);
 			goto restart;
@@ -2630,7 +2684,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 	struct kvm_mmu_page *child;
 
 	pte = *spte;
-	if (is_shadow_present_pte(pte)) {
+	if (is_shadow_present_pte(pte) || is_pinned_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level)) {
 			drop_spte(kvm, spte);
 			if (is_large_pte(pte))
@@ -2639,7 +2693,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-		return true;
+		return is_shadow_present_pte(pte);
 	}
 
 	if (is_mmio_spte(pte))
@@ -2987,10 +3041,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	u64 spte = 0;
 	int ret = 0;
 	struct kvm_mmu_page *sp;
+	bool is_mmio_pfn;
 
 	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
 		return 0;
 
+	is_mmio_pfn = kvm_is_mmio_pfn(pfn);
+
 	sp = sptep_to_sp(sptep);
 	if (sp_ad_disabled(sp))
 		spte |= SPTE_AD_DISABLED_MASK;
@@ -3023,15 +3080,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (level > PG_LEVEL_4K)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
-		spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+		spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, is_mmio_pfn);
 
 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
 	else
 		pte_access &= ~ACC_WRITE_MASK;
 
-	if (!kvm_is_mmio_pfn(pfn))
+	if (!is_mmio_pfn)
 		spte |= shadow_me_mask;
 
 	spte |= (u64)pfn << PAGE_SHIFT;
@@ -3065,6 +3121,12 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (speculative)
 		spte = mark_spte_for_access_track(spte);
 
+	if (is_pinned_pte(*sptep) ||
+	    (vcpu->arch.mmu->direct_map && !is_mmio_pfn &&
+	     kvm_x86_ops.pin_spte &&
+	     kvm_x86_ops.pin_spte(vcpu, gfn, level, pfn)))
+		spte |= SPTE_PINNED_MASK;
+
 set_pte:
 	if (mmu_spte_update(sptep, spte))
 		ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
@@ -3081,29 +3143,33 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	int set_spte_ret;
 	int ret = RET_PF_FIXED;
 	bool flush = false;
+	u64 pte = *sptep;
 
 	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
 		 *sptep, write_fault, gfn);
 
-	if (is_shadow_present_pte(*sptep)) {
+	if (is_shadow_present_pte(pte)) {
 		/*
 		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
 		 * the parent of the now unreachable PTE.
 		 */
-		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
+		if (level > PG_LEVEL_4K && !is_large_pte(pte)) {
 			struct kvm_mmu_page *child;
-			u64 pte = *sptep;
 
 			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, sptep);
 			flush = true;
-		} else if (pfn != spte_to_pfn(*sptep)) {
+		} else if (pfn != spte_to_pfn(pte)) {
 			pgprintk("hfn old %llx new %llx\n",
-				 spte_to_pfn(*sptep), pfn);
+				 spte_to_pfn(pte), pfn);
 			drop_spte(vcpu->kvm, sptep);
 			flush = true;
 		} else
 			was_rmapped = 1;
+	} else if (is_pinned_pte(pte)) {
+		WARN_ON_ONCE(pfn != spte_to_pfn(pte));
+		ret = RET_PF_UNZAPPED;
+		was_rmapped = 1;
 	}
 
 	set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
@@ -3136,6 +3202,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			rmap_recycle(vcpu, sptep, gfn);
 	}
 
+	if (ret == RET_PF_UNZAPPED && kvm_x86_ops.unzap_pinned_spte)
+		kvm_x86_ops.unzap_pinned_spte(vcpu->kvm, gfn, level - 1);
+
 	return ret;
 }
 
@@ -5921,6 +5990,10 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		sp = sptep_to_sp(sptep);
 		pfn = spte_to_pfn(*sptep);
 
+		/* Pinned page dirty logging is not supported. */
+		if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+			continue;
+
 		/*
 		 * We cannot do huge page mapping for indirect shadow pages,
 		 * which are found on the last rmap (level = 1) when not using
-- 
2.28.0



Thread overview: 15+ messages
2020-07-31 21:23 [RFC PATCH 0/8] KVM: x86/mmu: Introduce pinned SPTEs framework Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 1/8] KVM: x86/mmu: Return old SPTE from mmu_spte_clear_track_bits() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 2/8] KVM: x86/mmu: Use bits 2:0 to check for present SPTEs Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 3/8] KVM: x86/mmu: Refactor handling of not-present SPTEs in mmu_set_spte() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 4/8] KVM: x86/mmu: Add infrastructure for pinning PFNs on demand Sean Christopherson [this message]
2020-07-31 21:23 ` [RFC PATCH 5/8] KVM: SVM: Use the KVM MMU SPTE pinning hooks to pin pages on demand Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 6/8] KVM: x86/mmu: Move 'pfn' variable to caller of direct_page_fault() Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 7/8] KVM: x86/mmu: Introduce kvm_mmu_map_tdp_page() for use by SEV Sean Christopherson
2020-07-31 21:23 ` [RFC PATCH 8/8] KVM: SVM: Pin SEV pages in MMU during sev_launch_update_data() Sean Christopherson
2020-08-03  3:00 ` [RFC PATCH 0/8] KVM: x86/mmu: Introduce pinned SPTEs framework Eric van Tassell
2020-08-03 15:00   ` Sean Christopherson
2020-08-03 15:52 ` Brijesh Singh
2020-08-03 17:16   ` Sean Christopherson
2020-08-04 19:40     ` Brijesh Singh
2020-10-27  3:22       ` Brijesh Singh
