All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ben Gardon <bgardon@google.com>
To: linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>, Peter Xu <peterx@redhat.com>,
	Sean Christopherson <seanjc@google.com>,
	Peter Shier <pshier@google.com>,
	David Matlack <dmatlack@google.com>,
	Mingwei Zhang <mizhang@google.com>,
	Yulei Zhang <yulei.kernel@gmail.com>,
	Wanpeng Li <kernellwp@gmail.com>,
	Xiao Guangrong <xiaoguangrong.eric@gmail.com>,
	Kai Huang <kai.huang@intel.com>,
	Keqian Zhu <zhukeqian1@huawei.com>,
	David Hildenbrand <david@redhat.com>,
	Ben Gardon <bgardon@google.com>
Subject: [RFC 19/19] KVM: x86/mmu: Promote pages in-place when disabling dirty logging
Date: Wed, 10 Nov 2021 14:30:10 -0800	[thread overview]
Message-ID: <20211110223010.1392399-20-bgardon@google.com> (raw)
In-Reply-To: <20211110223010.1392399-1-bgardon@google.com>

When disabling dirty logging, the TDP MMU currently zaps each leaf entry
mapping memory in the relevant memslot. This is very slow. Doing the zaps
under the mmu read lock requires a TLB flush for every zap and the
zapping causes a storm of ETP/NPT violations.

Instead of zapping, replace the split large pages with large page
mappings directly. While this sort of operation has historically only
been done in the vCPU page fault handler context, refactorings earlier
in this series and the relative simplicity of the TDP MMU make it
possible here as well.

Running the dirty_log_perf_test on an Intel Skylake with 96 vCPUs and 1G
of memory per vCPU, this reduces the time required to disable dirty
logging from over 45 seconds to just over 1 second. It also avoids
provoking page faults, improving vCPU performance while disabling
dirty logging.


Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/mmu/mmu_internal.h |  4 ++
 arch/x86/kvm/mmu/tdp_mmu.c      | 69 ++++++++++++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ef7a84422463..add724aa9e8c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4449,7 +4449,7 @@ static inline bool boot_cpu_is_amd(void)
  * the direct page table on host, use as much mmu features as
  * possible, however, kvm currently does not do execution-protection.
  */
-static void
+void
 build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
 				int shadow_root_level)
 {
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 6563cce9c438..84d439432acf 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -161,4 +161,8 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+void
+build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
+				int shadow_root_level);
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 836eadd4e73a..77ff7f1d0d0a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1435,6 +1435,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
 }
 
+static void try_promote_lpage(struct kvm *kvm,
+			      const struct kvm_memory_slot *slot,
+			      struct tdp_iter *iter)
+{
+	struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
+	struct rsvd_bits_validate shadow_zero_check;
+	/*
+	 * Since the TDP  MMU doesn't manage nested PTs, there's no need to
+	 * write protect for a nested VM when PML is in use.
+	 */
+	bool ad_need_write_protect = false;
+	bool map_writable;
+	kvm_pfn_t pfn;
+	u64 new_spte;
+	u64 mt_mask;
+
+	/*
+	 * If addresses are being invalidated, don't do in-place promotion to
+	 * avoid accidentally mapping an invalidated address.
+	 */
+	if (unlikely(kvm->mmu_notifier_count))
+		return;
+
+	pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
+				   &map_writable, NULL);
+
+	/*
+	 * Can't reconstitute an lpage if the consituent pages can't be
+	 * mapped higher.
+	 */
+	if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
+						    pfn, PG_LEVEL_NUM))
+		return;
+
+	build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
+
+	/*
+	 * In some cases, a vCPU pointer is required to get the MT mask,
+	 * however in most cases it can be generated without one. If a
+	 * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
+	 * In that case, bail on in-place promotion.
+	 */
+	if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
+							   kvm_is_mmio_pfn(pfn),
+							   &mt_mask)))
+		return;
+
+	make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
+		  map_writable, ad_need_write_protect, mt_mask,
+		  &shadow_zero_check, &new_spte);
+
+	tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+
+	/*
+	 * Re-read the SPTE to avoid recursing into one of the removed child
+	 * page tables.
+	 */
+	iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+}
+
 /*
  * Clear leaf entries which could be replaced by large mappings, for
  * GFNs within the slot.
@@ -1455,9 +1515,14 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
 			continue;
 
-		if (!is_shadow_present_pte(iter.old_spte) ||
-		    !is_last_spte(iter.old_spte, iter.level))
+		if (!is_shadow_present_pte(iter.old_spte))
+			continue;
+
+		/* Try to promote the constitutent pages to an lpage. */
+		if (!is_last_spte(iter.old_spte, iter.level)) {
+			try_promote_lpage(kvm, slot, &iter);
 			continue;
+		}
 
 		pfn = spte_to_pfn(iter.old_spte);
 		if (kvm_is_reserved_pfn(pfn) ||
-- 
2.34.0.rc0.344.g81b53c2807-goog


  parent reply	other threads:[~2021-11-10 22:31 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-10 22:29 [RFC 00/19] KVM: x86/mmu: Optimize disabling dirty logging Ben Gardon
2021-11-10 22:29 ` [RFC 01/19] KVM: x86/mmu: Fix TLB flush range when handling disconnected pt Ben Gardon
2021-11-11 17:44   ` David Matlack
2021-11-10 22:29 ` [RFC 02/19] KVM: x86/mmu: Batch TLB flushes for a single zap Ben Gardon
2021-11-11 18:06   ` David Matlack
2021-11-12 23:53   ` Sean Christopherson
2021-11-10 22:29 ` [RFC 03/19] KVM: x86/mmu: Factor flush and free up when zapping under MMU write lock Ben Gardon
2021-11-11 18:31   ` David Matlack
2021-11-10 22:29 ` [RFC 04/19] KVM: x86/mmu: Yield while processing disconnected_sps Ben Gardon
2021-11-11 18:50   ` David Matlack
2021-11-10 22:29 ` [RFC 05/19] KVM: x86/mmu: Remove redundant flushes when disabling dirty logging Ben Gardon
2021-11-11 18:55   ` David Matlack
2021-11-10 22:29 ` [RFC 06/19] KVM: x86/mmu: Introduce vcpu_make_spte Ben Gardon
2021-11-10 22:29 ` [RFC 07/19] KVM: x86/mmu: Factor wrprot for nested PML out of make_spte Ben Gardon
2021-11-18  2:12   ` Sean Christopherson
2021-11-18 17:43     ` Ben Gardon
2021-11-18 18:04       ` Paolo Bonzini
2021-11-10 22:29 ` [RFC 08/19] KVM: x86/mmu: Factor mt_mask " Ben Gardon
2021-11-10 22:30 ` [RFC 09/19] KVM: x86/mmu: Remove need for a vcpu from kvm_slot_page_track_is_active Ben Gardon
2021-11-10 22:30 ` [RFC 10/19] KVM: x86/mmu: Remove need for a vcpu from mmu_try_to_unsync_pages Ben Gardon
2021-11-10 22:30 ` [RFC 11/19] KVM: x86/mmu: Factor shadow_zero_check out of make_spte Ben Gardon
2021-11-10 22:44   ` Paolo Bonzini
2021-11-10 23:49     ` Ben Gardon
2021-11-11  1:18       ` Sean Christopherson
2021-11-11  1:44         ` Sean Christopherson
2021-11-11  7:06         ` Paolo Bonzini
2021-11-18  2:05   ` Sean Christopherson
2021-11-18  3:29     ` Sean Christopherson
2021-11-18 16:37       ` Sean Christopherson
2021-11-18 17:19         ` Paolo Bonzini
2021-11-18 18:02           ` Sean Christopherson
2021-11-18 18:07             ` Paolo Bonzini
2021-11-18 18:14               ` Sean Christopherson
2021-11-10 22:30 ` [RFC 12/19] KVM: x86/mmu: Replace vcpu argument with kvm pointer in make_spte Ben Gardon
2021-11-10 22:30 ` [RFC 13/19] KVM: x86/mmu: Factor out the meat of reset_tdp_shadow_zero_bits_mask Ben Gardon
2021-11-10 22:30 ` [RFC 14/19] KVM: x86/mmu: Propagate memslot const qualifier Ben Gardon
2021-11-10 22:30 ` [RFC 15/19] KVM: x86/MMU: Refactor vmx_get_mt_mask Ben Gardon
2021-11-10 22:30 ` [RFC 16/19] KVM: x86/mmu: Factor out part of vmx_get_mt_mask which does not depend on vcpu Ben Gardon
2021-11-10 22:30 ` [RFC 17/19] KVM: x86/mmu: Add try_get_mt_mask to x86_ops Ben Gardon
2021-11-10 22:30 ` [RFC 18/19] KVM: x86/mmu: Make kvm_is_mmio_pfn usable outside of spte.c Ben Gardon
2021-11-10 22:30 ` Ben Gardon [this message]
2021-11-15 21:24 ` [RFC 00/19] KVM: x86/mmu: Optimize disabling dirty logging Ben Gardon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211110223010.1392399-20-bgardon@google.com \
    --to=bgardon@google.com \
    --cc=david@redhat.com \
    --cc=dmatlack@google.com \
    --cc=kai.huang@intel.com \
    --cc=kernellwp@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mizhang@google.com \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=pshier@google.com \
    --cc=seanjc@google.com \
    --cc=xiaoguangrong.eric@gmail.com \
    --cc=yulei.kernel@gmail.com \
    --cc=zhukeqian1@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.