kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Ben Gardon <bgardon@google.com>
To: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
	Peter Feiner <pfeiner@google.com>,
	Peter Shier <pshier@google.com>,
	Junaid Shahid <junaids@google.com>,
	Jim Mattson <jmattson@google.com>,
	Ben Gardon <bgardon@google.com>
Subject: [RFC PATCH 16/28] kvm: mmu: Add direct MMU page fault handler
Date: Thu, 26 Sep 2019 16:18:12 -0700	[thread overview]
Message-ID: <20190926231824.149014-17-bgardon@google.com> (raw)
In-Reply-To: <20190926231824.149014-1-bgardon@google.com>

Adds handler functions to replace __direct_map in handling direct page
faults. These functions, unlike __direct_map, can handle page faults on
multiple VCPUs simultaneously.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu.c | 192 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 179 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0696658b527c..f3a26a32c8174 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1117,6 +1117,24 @@ static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
+/*
+ * Return an unused object to the specified cache. The object's memory should
+ * be zeroed before being returned if that memory was modified after allocation
+ * from the cache.
+ */
+static void mmu_memory_cache_return(struct kvm_mmu_memory_cache *mc,
+				     void *obj)
+{
+	/*
+	 * Since this object was allocated from the cache, the cache should
+	 * have at least one free slot to put the object back into.
+	 */
+	BUG_ON(mc->nobjs >= ARRAY_SIZE(mc->objects));
+
+	mc->objects[mc->nobjs] = obj;
+	mc->nobjs++;
+}
+
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
 	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
@@ -2426,6 +2444,21 @@ static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
 	return r;
 }
 
+static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
+{
+	u64 pte;	/* non-leaf PTE referencing child_pt via __pa() */
+
+	pte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
+
+	if (ad_disabled)
+		pte |= shadow_acc_track_value;
+	else
+		pte |= shadow_accessed_mask;
+
+	return pte;
+}
+
 /**
  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
  * @kvm: kvm instance
@@ -3432,13 +3465,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
-	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
-
-	if (sp_ad_disabled(sp))
-		spte |= shadow_acc_track_value;
-	else
-		spte |= shadow_accessed_mask;
+	spte = generate_nonleaf_pte(sp->spt, sp_ad_disabled(sp));
 
 	mmu_spte_set(sptep, spte);
 
@@ -4071,6 +4098,126 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 	return ret;
 }
 
+static int direct_page_fault_handle_target_level(struct kvm_vcpu *vcpu,
+		int write, int map_writable, struct direct_walk_iterator *iter,
+		kvm_pfn_t pfn, bool prefault)
+{
+	u64 new_pte;
+	int ret = 0;
+	int generate_pte_ret = 0;
+
+	if (unlikely(is_noslot_pfn(pfn)))
+		new_pte = generate_mmio_pte(vcpu, iter->pte_gfn_start, ACC_ALL);
+	else {
+		generate_pte_ret = generate_pte(vcpu, ACC_ALL, iter->level,
+						iter->pte_gfn_start, pfn,
+						iter->old_pte, prefault, false,
+						map_writable, false, &new_pte);
+		/* Failed to construct a PTE. Retry the page fault. */
+		if (!new_pte)
+			return RET_PF_RETRY;
+	}
+
+	/*
+	 * If the page fault was caused by a write but the page is write
+	 * protected, emulation is needed. If the emulation was skipped,
+	 * the vcpu would have the same fault again.
+	 */
+	if ((generate_pte_ret & SET_SPTE_WRITE_PROTECTED_PT) && write)
+		ret = RET_PF_EMULATE;
+
+	/* If an MMIO PTE was installed, the MMIO will need to be emulated. */
+	if (unlikely(is_mmio_spte(new_pte)))
+		ret = RET_PF_EMULATE;
+
+	/*
+	 * If this would not change the PTE then some other thread must have
+	 * already fixed the page fault and there's no need to proceed.
+	 */
+	if (iter->old_pte == new_pte)
+		return ret;
+
+	/*
+	 * If this warning were to trigger, it would indicate that there was a
+	 * missing MMU notifier or this thread raced with some notifier
+	 * handler. The page fault handler should never change a present, leaf
+	 * PTE to point to a different PFN. A notifier handler should have
+	 * zapped the PTE before the main MM's page table was changed.
+	 */
+	WARN_ON(is_present_direct_pte(iter->old_pte) &&
+		is_present_direct_pte(new_pte) &&
+		is_last_spte(iter->old_pte, iter->level) &&
+		is_last_spte(new_pte, iter->level) &&
+		spte_to_pfn(iter->old_pte) != spte_to_pfn(new_pte));
+
+	/*
+	 * If the page fault handler lost the race to set the PTE, retry the
+	 * page fault.
+	 */
+	if (!direct_walk_iterator_set_pte(iter, new_pte))
+		return RET_PF_RETRY;
+
+	/*
+	 * Count this page fault as fixed in the vcpu stats, unless it
+	 * was a prefault (i.e. speculative).
+	 */
+	if (!prefault)
+		vcpu->stat.pf_fixed++;
+
+	return ret;
+
+}
+
+static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
+		unsigned long mmu_seq, int write, int map_writable, int level,
+		gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+{
+	struct direct_walk_iterator iter;
+	struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
+	u64 *child_pt;
+	u64 new_pte;
+	int ret = RET_PF_RETRY;
+
+	direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
+			kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
+			(gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
+	while (direct_walk_iterator_next_pte(&iter)) {
+		if (iter.level == level) {
+			ret = direct_page_fault_handle_target_level(vcpu,
+					write, map_writable, &iter, pfn,
+					prefault);
+
+			break;
+		} else if (!is_present_direct_pte(iter.old_pte) ||
+			   is_large_pte(iter.old_pte)) {
+			/*
+			 * The leaf PTE for this fault must be mapped at a
+			 * lower level, so a non-leaf PTE must be inserted into
+			 * the paging structure. If the assignment below
+			 * succeeds, it will add the non-leaf PTE and a new
+			 * page of page table memory. Then the iterator can
+			 * traverse into that new page. If the atomic compare/
+			 * exchange fails, the iterator will repeat the current
+			 * PTE, so the only thing this function must do
+			 * differently is return the page table memory to the
+			 * vCPU's fault cache.
+			 */
+			child_pt = mmu_memory_cache_alloc(pf_pt_cache);
+			new_pte = generate_nonleaf_pte(child_pt, false);
+
+			if (!direct_walk_iterator_set_pte(&iter, new_pte))
+				mmu_memory_cache_return(pf_pt_cache, child_pt);
+		}
+	}
+	direct_walk_iterator_end_traversal(&iter);
+
+	/* If emulating, request a TLB flush for this vcpu. */
+	if (ret == RET_PF_EMULATE)
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+	return ret;
+}
+
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
 {
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
@@ -5014,7 +5161,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
-	bool map_writable;
+	bool map_writable = false;
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
@@ -5035,8 +5182,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
 
-	if (fast_page_fault(vcpu, gpa, level, error_code))
-		return RET_PF_RETRY;
+	if (!vcpu->kvm->arch.direct_mmu_enabled)
+		if (fast_page_fault(vcpu, gpa, level, error_code))
+			return RET_PF_RETRY;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -5048,17 +5196,31 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		return r;
 
 	r = RET_PF_RETRY;
-	write_lock(&vcpu->kvm->mmu_lock);
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		read_lock(&vcpu->kvm->mmu_lock);
+	else
+		write_lock(&vcpu->kvm->mmu_lock);
+
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		r = handle_direct_page_fault(vcpu, mmu_seq, write, map_writable,
+				level, gpa, gfn, pfn, prefault);
+	else
+		r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+				 prefault);
 
 out_unlock:
-	write_unlock(&vcpu->kvm->mmu_lock);
+	if (vcpu->kvm->arch.direct_mmu_enabled)
+		read_unlock(&vcpu->kvm->mmu_lock);
+	else
+		write_unlock(&vcpu->kvm->mmu_lock);
+
 	kvm_release_pfn_clean(pfn);
 	return r;
 }
@@ -6242,6 +6404,10 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 {
 	LIST_HEAD(invalid_list);
 
+	if (vcpu->arch.mmu->direct_map && vcpu->kvm->arch.direct_mmu_enabled)
+		/* Reclaim is a todo. */
+		return true;
+
 	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
 		return 0;
 
-- 
2.23.0.444.g18eeb5a265-goog


  parent reply	other threads:[~2019-09-26 23:19 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-26 23:17 [RFC PATCH 00/28] kvm: mmu: Rework the x86 TDP direct mapped case Ben Gardon
2019-09-26 23:17 ` [RFC PATCH 01/28] kvm: mmu: Separate generating and setting mmio ptes Ben Gardon
2019-11-27 18:15   ` Sean Christopherson
2019-09-26 23:17 ` [RFC PATCH 02/28] kvm: mmu: Separate pte generation from set_spte Ben Gardon
2019-11-27 18:25   ` Sean Christopherson
2019-09-26 23:17 ` [RFC PATCH 03/28] kvm: mmu: Zero page cache memory at allocation time Ben Gardon
2019-11-27 18:32   ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 04/28] kvm: mmu: Update the lpages stat atomically Ben Gardon
2019-11-27 18:39   ` Sean Christopherson
2019-12-06 20:10     ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 05/28] sched: Add cond_resched_rwlock Ben Gardon
2019-11-27 18:42   ` Sean Christopherson
2019-12-06 20:12     ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 06/28] kvm: mmu: Replace mmu_lock with a read/write lock Ben Gardon
2019-11-27 18:47   ` Sean Christopherson
2019-12-02 22:45     ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 07/28] kvm: mmu: Add functions for handling changed PTEs Ben Gardon
2019-11-27 19:04   ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 08/28] kvm: mmu: Init / Uninit the direct MMU Ben Gardon
2019-12-02 23:40   ` Sean Christopherson
2019-12-06 20:25     ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 09/28] kvm: mmu: Free direct MMU page table memory in an RCU callback Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 10/28] kvm: mmu: Flush TLBs before freeing direct MMU page table memory Ben Gardon
2019-12-02 23:46   ` Sean Christopherson
2019-12-06 20:31     ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 11/28] kvm: mmu: Optimize for freeing direct MMU PTs on teardown Ben Gardon
2019-12-02 23:54   ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 12/28] kvm: mmu: Set tlbs_dirty atomically Ben Gardon
2019-12-03  0:13   ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 13/28] kvm: mmu: Add an iterator for concurrent paging structure walks Ben Gardon
2019-12-03  2:15   ` Sean Christopherson
2019-12-18 18:25     ` Ben Gardon
2019-12-18 19:14       ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 14/28] kvm: mmu: Batch updates to the direct mmu disconnected list Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 15/28] kvm: mmu: Support invalidate_zap_all_pages Ben Gardon
2019-09-26 23:18 ` Ben Gardon [this message]
2020-01-08 17:20   ` [RFC PATCH 16/28] kvm: mmu: Add direct MMU page fault handler Peter Xu
2020-01-08 18:15     ` Ben Gardon
2020-01-08 19:00       ` Peter Xu
2019-09-26 23:18 ` [RFC PATCH 17/28] kvm: mmu: Add direct MMU fast " Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 18/28] kvm: mmu: Add an hva range iterator for memslot GFNs Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 19/28] kvm: mmu: Make address space ID a property of memslots Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 20/28] kvm: mmu: Implement the invalidation MMU notifiers for the direct MMU Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 21/28] kvm: mmu: Integrate the direct mmu with the changed pte notifier Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 22/28] kvm: mmu: Implement access tracking for the direct MMU Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 23/28] kvm: mmu: Make mark_page_dirty_in_slot usable from outside kvm_main Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 24/28] kvm: mmu: Support dirty logging in the direct MMU Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 25/28] kvm: mmu: Support kvm_zap_gfn_range " Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 26/28] kvm: mmu: Integrate direct MMU with nesting Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 27/28] kvm: mmu: Lazily allocate rmap when direct MMU is enabled Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 28/28] kvm: mmu: Support MMIO in the direct MMU Ben Gardon
2019-10-17 18:50 ` [RFC PATCH 00/28] kvm: mmu: Rework the x86 TDP direct mapped case Sean Christopherson
2019-10-18 13:42   ` Paolo Bonzini
2019-11-27 19:09 ` Sean Christopherson
2019-12-06 19:55   ` Ben Gardon
2019-12-06 19:57     ` Sean Christopherson
2019-12-06 20:42       ` Ben Gardon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190926231824.149014-17-bgardon@google.com \
    --to=bgardon@google.com \
    --cc=jmattson@google.com \
    --cc=junaids@google.com \
    --cc=kvm@vger.kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=pfeiner@google.com \
    --cc=pshier@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).