From: Ben Gardon <bgardon@google.com>
To: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
Peter Feiner <pfeiner@google.com>,
Peter Shier <pshier@google.com>,
Junaid Shahid <junaids@google.com>,
Jim Mattson <jmattson@google.com>,
Ben Gardon <bgardon@google.com>
Subject: [RFC PATCH 16/28] kvm: mmu: Add direct MMU page fault handler
Date: Thu, 26 Sep 2019 16:18:12 -0700
Message-ID: <20190926231824.149014-17-bgardon@google.com>
In-Reply-To: <20190926231824.149014-1-bgardon@google.com>
Add handler functions to replace __direct_map in handling direct page
faults. Unlike __direct_map, these functions can handle page faults on
multiple vCPUs simultaneously.
Signed-off-by: Ben Gardon <bgardon@google.com>
---
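Note for reviewers (not part of the commit message): the reason these
handlers can run on multiple vCPUs at once is that every PTE update goes
through a single atomic compare-exchange while only the read side of the
MMU lock is held. A racing vCPU either sees the winner's PTE and stops, or
loses the compare-exchange and retries, returning any speculatively
allocated page table page to its cache. The fragment below is a rough
userspace sketch of that pattern using C11 atomics; the names (fake_pte,
install_pte, vcpu_fault) are invented for illustration and none of this
code is taken from the series.

/*
 * Userspace sketch: two threads stand in for vCPUs faulting on the same
 * GFN. Each reads the PTE, and only one compare-exchange can succeed; the
 * loser sees that the fault was already fixed and backs off.
 * Build with: cc -std=c11 -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t fake_pte;	/* stands in for one PTE slot */

/* Roughly analogous to direct_walk_iterator_set_pte(): true if we won. */
static bool install_pte(uint64_t old_pte, uint64_t new_pte)
{
	return atomic_compare_exchange_strong(&fake_pte, &old_pte, new_pte);
}

static void *vcpu_fault(void *arg)
{
	uint64_t new_pte = (uint64_t)(uintptr_t)arg;
	uint64_t old_pte = atomic_load(&fake_pte);

	/* Another vCPU already fixed this fault; nothing to do. */
	if (old_pte != 0)
		return NULL;

	if (install_pte(old_pte, new_pte))
		printf("installed pte %#llx\n", (unsigned long long)new_pte);
	else
		printf("lost the race; would return the page table page to the cache\n");

	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, vcpu_fault, (void *)(uintptr_t)0x1000);
	pthread_create(&t2, NULL, vcpu_fault, (void *)(uintptr_t)0x2000);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

The same reasoning is what lets tdp_page_fault below take
read_lock(&vcpu->kvm->mmu_lock) instead of write_lock() when the direct
MMU is enabled.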
arch/x86/kvm/mmu.c | 192 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 179 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0696658b527c..f3a26a32c8174 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1117,6 +1117,24 @@ static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}
+/*
+ * Return an unused object to the specified cache. The object's memory should
+ * be zeroed before being returned if that memory was modified after allocation
+ * from the cache.
+ */
+static void mmu_memory_cache_return(struct kvm_mmu_memory_cache *mc,
+ void *obj)
+{
+ /*
+ * Since this object was allocated from the cache, the cache should
+ * have at least one empty slot in which to put the object back.
+ */
+ BUG_ON(mc->nobjs >= ARRAY_SIZE(mc->objects));
+
+ mc->objects[mc->nobjs] = obj;
+ mc->nobjs++;
+}
+
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
kmem_cache_free(pte_list_desc_cache, pte_list_desc);
@@ -2426,6 +2444,21 @@ static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
return r;
}
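+/*
+ * Build a non-leaf PTE that points to the page of page table memory at
+ * child_pt. Non-leaf PTEs in the direct MMU are always present, writable,
+ * executable, and user-accessible; access is constrained by the leaf PTEs
+ * beneath them.
+ */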
+static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
+{
+ u64 pte;
+
+ pte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
+
+ if (ad_disabled)
+ pte |= shadow_acc_track_value;
+ else
+ pte |= shadow_accessed_mask;
+
+ return pte;
+}
+
/**
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
* @kvm: kvm instance
@@ -3432,13 +3465,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
- spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_mask;
-
- if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
- else
- spte |= shadow_accessed_mask;
+ spte = generate_nonleaf_pte(sp->spt, sp_ad_disabled(sp));
mmu_spte_set(sptep, spte);
@@ -4071,6 +4098,126 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
return ret;
}
+static int direct_page_fault_handle_target_level(struct kvm_vcpu *vcpu,
+ int write, int map_writable, struct direct_walk_iterator *iter,
+ kvm_pfn_t pfn, bool prefault)
+{
+ u64 new_pte;
+ int ret = 0;
+ int generate_pte_ret = 0;
+
+ if (unlikely(is_noslot_pfn(pfn)))
+ new_pte = generate_mmio_pte(vcpu, iter->pte_gfn_start, ACC_ALL);
+ else {
+ generate_pte_ret = generate_pte(vcpu, ACC_ALL, iter->level,
+ iter->pte_gfn_start, pfn,
+ iter->old_pte, prefault, false,
+ map_writable, false, &new_pte);
+ /* Failed to construct a PTE. Retry the page fault. */
+ if (!new_pte)
+ return RET_PF_RETRY;
+ }
+
+ /*
+ * If the page fault was caused by a write but the page is write
+ * protected, emulation is needed. If emulation were skipped, the
+ * vCPU would simply take the same fault again.
+ */
+ if ((generate_pte_ret & SET_SPTE_WRITE_PROTECTED_PT) && write)
+ ret = RET_PF_EMULATE;
+
+ /* If an MMIO PTE was installed, the MMIO will need to be emulated. */
+ if (unlikely(is_mmio_spte(new_pte)))
+ ret = RET_PF_EMULATE;
+
+ /*
+ * If this would not change the PTE then some other thread must have
+ * already fixed the page fault and there's no need to proceed.
+ */
+ if (iter->old_pte == new_pte)
+ return ret;
+
+ /*
+ * If this warning were to trigger, it would indicate that there was a
+ * missing MMU notifier or this thread raced with some notifier
+ * handler. The page fault handler should never change a present, leaf
+ * PTE to point to a different PFN. A notifier handler should have
+ * zapped the PTE before the main MM's page table was changed.
+ */
+ WARN_ON(is_present_direct_pte(iter->old_pte) &&
+ is_present_direct_pte(new_pte) &&
+ is_last_spte(iter->old_pte, iter->level) &&
+ is_last_spte(new_pte, iter->level) &&
+ spte_to_pfn(iter->old_pte) != spte_to_pfn(new_pte));
+
+ /*
+ * If the page fault handler lost the race to set the PTE, retry the
+ * page fault.
+ */
+ if (!direct_walk_iterator_set_pte(iter, new_pte))
+ return RET_PF_RETRY;
+
+ /*
+ * Update statistics for this page fault if it was not
+ * speculative.
+ */
+ if (!prefault)
+ vcpu->stat.pf_fixed++;
+
+ return ret;
+}
+
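+/*
+ * Handle a direct (TDP) page fault while holding the MMU lock in read mode,
+ * installing any missing non-leaf PTEs on the walk down to the target level
+ * and then a leaf PTE mapping pfn at that level.
+ */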
+static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
+ unsigned long mmu_seq, int write, int map_writable, int level,
+ gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+{
+ struct direct_walk_iterator iter;
+ struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
+ u64 *child_pt;
+ u64 new_pte;
+ int ret = RET_PF_RETRY;
+
+ direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
+ kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
+ (gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
+ while (direct_walk_iterator_next_pte(&iter)) {
+ if (iter.level == level) {
+ ret = direct_page_fault_handle_target_level(vcpu,
+ write, map_writable, &iter, pfn,
+ prefault);
+
+ break;
+ } else if (!is_present_direct_pte(iter.old_pte) ||
+ is_large_pte(iter.old_pte)) {
+ /*
+ * The leaf PTE for this fault must be mapped at a
+ * lower level, so a non-leaf PTE must be inserted into
+ * the paging structure. If the atomic compare/exchange
+ * below succeeds, it will install the non-leaf PTE and a
+ * new page of page table memory. Then the iterator can
+ * traverse into that new page. If the atomic compare/
+ * exchange fails, the iterator will repeat the current
+ * PTE, so the only thing this function must do
+ * differently is return the page table memory to the
+ * vCPU's fault cache.
+ */
+ child_pt = mmu_memory_cache_alloc(pf_pt_cache);
+ new_pte = generate_nonleaf_pte(child_pt, false);
+
+ if (!direct_walk_iterator_set_pte(&iter, new_pte))
+ mmu_memory_cache_return(pf_pt_cache, child_pt);
+ }
+ }
+ direct_walk_iterator_end_traversal(&iter);
+
+ /* If emulating, flush this vcpu's TLB. */
+ if (ret == RET_PF_EMULATE)
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+ return ret;
+}
+
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
{
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
@@ -5014,7 +5161,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
+ bool map_writable = false;
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
@@ -5035,8 +5182,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
- if (fast_page_fault(vcpu, gpa, level, error_code))
- return RET_PF_RETRY;
+ if (!vcpu->kvm->arch.direct_mmu_enabled)
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -5048,17 +5196,31 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
return r;
r = RET_PF_RETRY;
- write_lock(&vcpu->kvm->mmu_lock);
+ if (vcpu->kvm->arch.direct_mmu_enabled)
+ read_lock(&vcpu->kvm->mmu_lock);
+ else
+ write_lock(&vcpu->kvm->mmu_lock);
+
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
if (make_mmu_pages_available(vcpu) < 0)
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+
+ if (vcpu->kvm->arch.direct_mmu_enabled)
+ r = handle_direct_page_fault(vcpu, mmu_seq, write, map_writable,
+ level, gpa, gfn, pfn, prefault);
+ else
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault);
out_unlock:
- write_unlock(&vcpu->kvm->mmu_lock);
+ if (vcpu->kvm->arch.direct_mmu_enabled)
+ read_unlock(&vcpu->kvm->mmu_lock);
+ else
+ write_unlock(&vcpu->kvm->mmu_lock);
+
kvm_release_pfn_clean(pfn);
return r;
}
@@ -6242,6 +6404,10 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
LIST_HEAD(invalid_list);
+ if (vcpu->arch.mmu->direct_map && vcpu->kvm->arch.direct_mmu_enabled)
+ /* Reclaim is a todo. */
+ return 0;
+
if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
return 0;
--
2.23.0.444.g18eeb5a265-goog