[RFC PATCH 16/28] kvm: mmu: Add direct MMU page fault handler

From: Ben Gardon <bgardon@google.com>
To: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
	Peter Feiner <pfeiner@google.com>,
	Peter Shier <pshier@google.com>,
	Junaid Shahid <junaids@google.com>,
	Jim Mattson <jmattson@google.com>,
	Ben Gardon <bgardon@google.com>
Subject: [RFC PATCH 16/28] kvm: mmu: Add direct MMU page fault handler
Date: Thu, 26 Sep 2019 16:18:12 -0700	[thread overview]
Message-ID: <20190926231824.149014-17-bgardon@google.com> (raw)
In-Reply-To: <20190926231824.149014-1-bgardon@google.com>

Adds handler functions to replace __direct_map in handling direct page
faults. These functions, unlike __direct_map can handle page faults on
multiple VCPUs simultaneously.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu.c | 192 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 179 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0696658b527c..f3a26a32c8174 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1117,6 +1117,24 @@ static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
+/*
+ * Return an unused object to the specified cache. The object's memory should
+ * be zeroed before being returned if that memory was modified after allocation
+ * from the cache.
+ */
+static void mmu_memory_cache_return(struct kvm_mmu_memory_cache *mc,
+				     void *obj)
+{
+	/*
+	 * Since this object was allocated from the cache, the cache should
+	 * have at least one spare capacity to put the object back.
+	 */
+	BUG_ON(mc->nobjs >= ARRAY_SIZE(mc->objects));
+
+	mc->objects[mc->nobjs] = obj;
+	mc->nobjs++;
+}
+
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
 	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
@@ -2426,6 +2444,21 @@ static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
 	return r;
 }
 
+static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
+{
+	u64 pte;
+
+	pte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
+
+	if (ad_disabled)
+		pte |= shadow_acc_track_value;
+	else
+		pte |= shadow_accessed_mask;
+
+	return pte;
+}
+
 /**
  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
  * @kvm: kvm instance
@@ -3432,13 +3465,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
-	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
-
-	if (sp_ad_disabled(sp))
-		spte |= shadow_acc_track_value;
-	else
-		spte |= shadow_accessed_mask;
+	spte = generate_nonleaf_pte(sp->spt, sp_ad_disabled(sp));
 
 	mmu_spte_set(sptep, spte);
 
@@ -4071,6 +4098,126 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 	return ret;
 }
 
+static int direct_page_fault_handle_target_level(struct kvm_vcpu *vcpu,
+		int write, int map_writable, struct direct_walk_iterator *iter,
+		kvm_pfn_t pfn, bool prefault)
+{
+	u64 new_pte;
+	int ret = 0;
+	int generate_pte_ret = 0;
+
+	if (unlikely(is_noslot_pfn(pfn)))
+		new_pte = generate_mmio_pte(vcpu, iter->pte_gfn_start, ACC_ALL);
+	else {
+		generate_pte_ret = generate_pte(vcpu, ACC_ALL, iter->level,
+						iter->pte_gfn_start, pfn,
+						iter->old_pte, prefault, false,
+						map_writable, false, &new_pte);
+		/* Failed to construct a PTE. Retry the page fault. */
+		if (!new_pte)
+			return RET_PF_RETRY;
+	}
+
+	/*
+	 * If the page fault was caused by a write but the page is write
+	 * protected, emulation is needed. If the emulation was skipped,
+	 * the vcpu would have the same fault again.
+	 */
+	if ((generate_pte_ret & SET_SPTE_WRITE_PROTECTED_PT) && write)
+		ret = RET_PF_EMULATE;
+
+	/* If an MMIO PTE was installed, the MMIO will need to be emulated. */
+	if (unlikely(is_mmio_spte(new_pte)))
+		ret = RET_PF_EMULATE;
+
+	/*
+	 * If this would not change the PTE then some other thread must have
+	 * already fixed the page fault and there's no need to proceed.
+	 */
+	if (iter->old_pte == new_pte)
+		return ret;
+
+	/*
+	 * If this warning were to trigger, it would indicate that there was a
+	 * missing MMU notifier or this thread raced with some notifier
+	 * handler. The page fault handler should never change a present, leaf
+	 * PTE to point to a differnt PFN. A notifier handler should have
+	 * zapped the PTE before the main MM's page table was changed.
+	 */
+	WARN_ON(is_present_direct_pte(iter->old_pte) &&
+		is_present_direct_pte(new_pte) &&
+		is_last_spte(iter->old_pte, iter->level) &&
+		is_last_spte(new_pte, iter->level) &&
+		spte_to_pfn(iter->old_pte) != spte_to_pfn(new_pte));
+
+	/*
+	 * If the page fault handler lost the race to set the PTE, retry the
+	 * page fault.
+	 */
+	if (!direct_walk_iterator_set_pte(iter, new_pte))
+		return RET_PF_RETRY;
+
+	/*
+	 * Update some stats for this page fault, if the page
+	 * fault was not speculative.
+	 */
+	if (!prefault)
+		vcpu->stat.pf_fixed++;
+
+	return ret;
+
+}
+
+static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
+		unsigned long mmu_seq, int write, int map_writable, int level,
+		gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+{
+	struct direct_walk_iterator iter;
+	struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
+	u64 *child_pt;
+	u64 new_pte;
+	int ret = RET_PF_RETRY;
+
+	direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
+			kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
+			(gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
+	while (direct_walk_iterator_next_pte(&iter)) {
+		if (iter.level == level) {
+			ret = direct_page_fault_handle_target_level(vcpu,
+					write, map_writable, &iter, pfn,
+					prefault);
+
+			break;
+		} else if (!is_present_direct_pte(iter.old_pte) ||
+			   is_large_pte(iter.old_pte)) {
+			/*
+			 * The leaf PTE for this fault must be mapped at a
+			 * lower level, so a non-leaf PTE must be inserted into
+			 * the paging structure. If the assignment below
+			 * succeeds, it will add the non-leaf PTE and a new
+			 * page of page table memory. Then the iterator can
+			 * traverse into that new page. If the atomic compare/
+			 * exchange fails, the iterator will repeat the current
+			 * PTE, so the only thing this function must do
+			 * differently is return the page table memory to the
+			 * vCPU's fault cache.
+			 */
+			child_pt = mmu_memory_cache_alloc(pf_pt_cache);
+			new_pte = generate_nonleaf_pte(child_pt, false);
+
+			if (!direct_walk_iterator_set_pte(&iter, new_pte))
+				mmu_memory_cache_return(pf_pt_cache, child_pt);
+		}
+	}
+	direct_walk_iterator_end_traversal(&iter);
+
+	/* If emulating, flush this vcpu's TLB. */
+	if (ret == RET_PF_EMULATE)
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+	return ret;
+}
+
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
 {
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
@@ -5014,7 +5161,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
-	bool map_writable;
+	bool map_writable = false;
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
@@ -5035,8 +5182,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
 
-	if (fast_page_fault(vcpu, gpa, level, error_code))
-		return RET_PF_RETRY;
+	if (!vcpu->kvm->arch.direct_mmu_enabled)
+		if (fast_page_fault(vcpu, gpa, level, error_code))
+			return RET_PF_RETRY;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -5048,17 +5196,31 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		return r;
 
 	r = RET_PF_RETRY;
-	write_lock(&vcpu->kvm->mmu_lock);
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		read_lock(&vcpu->kvm->mmu_lock);
+	else
+		write_lock(&vcpu->kvm->mmu_lock);
+
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		r = handle_direct_page_fault(vcpu, mmu_seq, write, map_writable,
+				level, gpa, gfn, pfn, prefault);
+	else
+		r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+				 prefault);
 
 out_unlock:
-	write_unlock(&vcpu->kvm->mmu_lock);
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		read_unlock(&vcpu->kvm->mmu_lock);
+	else
+		write_unlock(&vcpu->kvm->mmu_lock);
+
 	kvm_release_pfn_clean(pfn);
 	return r;
 }
@@ -6242,6 +6404,10 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 {
 	LIST_HEAD(invalid_list);
 
+	if (vcpu->arch.mmu->direct_map && vcpu->kvm->arch.direct_mmu_enabled)
+		/* Reclaim is a todo. */
+		return true;
+
 	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
 		return 0;
 
-- 
2.23.0.444.g18eeb5a265-goog