From: Ben Gardon <bgardon@google.com>
To: linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>, Peter Xu <peterx@redhat.com>,
Sean Christopherson <seanjc@google.com>,
Peter Shier <pshier@google.com>,
Peter Feiner <pfeiner@google.com>,
Junaid Shahid <junaids@google.com>,
Jim Mattson <jmattson@google.com>,
Yulei Zhang <yulei.kernel@gmail.com>,
Wanpeng Li <kernellwp@gmail.com>,
Vitaly Kuznetsov <vkuznets@redhat.com>,
Xiao Guangrong <xiaoguangrong.eric@gmail.com>,
Ben Gardon <bgardon@google.com>
Subject: [PATCH 09/13] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock
Date: Wed, 31 Mar 2021 14:08:37 -0700 [thread overview]
Message-ID: <20210331210841.3996155-10-bgardon@google.com> (raw)
In-Reply-To: <20210331210841.3996155-1-bgardon@google.com>
To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function to zap a GFN range to operate under the MMU
read lock.
Signed-off-by: Ben Gardon <bgardon@google.com>
---
arch/x86/kvm/mmu/mmu.c | 15 ++++--
arch/x86/kvm/mmu/tdp_mmu.c | 102 ++++++++++++++++++++++++++-----------
arch/x86/kvm/mmu/tdp_mmu.h | 6 ++-
3 files changed, 87 insertions(+), 36 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d64daa82c..dcbfc784cf2f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3155,7 +3155,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
if (is_tdp_mmu_page(sp))
- kvm_tdp_mmu_put_root(kvm, sp);
+ kvm_tdp_mmu_put_root(kvm, sp, false);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -5514,13 +5514,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
}
+ write_unlock(&kvm->mmu_lock);
+
if (is_tdp_mmu_enabled(kvm)) {
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+ read_lock(&kvm->mmu_lock);
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end,
+ true);
if (flush)
kvm_flush_remote_tlbs(kvm);
- }
- write_unlock(&kvm->mmu_lock);
+ read_unlock(&kvm->mmu_lock);
+ }
}
static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5959,7 +5963,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
WARN_ON_ONCE(!sp->lpage_disallowed);
if (is_tdp_mmu_page(sp)) {
kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
- sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+ sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level),
+ false);
} else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d255125059c4..0e99e4675dd4 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
if (!kvm->arch.tdp_mmu_enabled)
@@ -42,7 +51,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
}
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield);
+ gfn_t start, gfn_t end, bool can_yield, bool shared);
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
@@ -66,11 +75,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp);
}
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- lockdep_assert_held_write(&kvm->mmu_lock);
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
@@ -81,7 +91,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
- zap_gfn_range(kvm, root, 0, max_gfn, false);
+ zap_gfn_range(kvm, root, 0, max_gfn, false, shared);
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -94,11 +104,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
* function will return NULL.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
- struct kvm_mmu_page *prev_root)
+ struct kvm_mmu_page *prev_root,
+ bool shared)
{
struct kvm_mmu_page *next_root;
- lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
rcu_read_unlock();
if (prev_root)
- kvm_tdp_mmu_put_root(kvm, prev_root);
+ kvm_tdp_mmu_put_root(kvm, prev_root, shared);
return next_root;
}
@@ -127,11 +137,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* This makes it safe to release the MMU lock and yield within the loop, but
* if exiting the loop early, the caller must drop the reference to the most
* recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
*/
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = tdp_mmu_next_root(_kvm, NULL); \
- _root; \
- _root = tdp_mmu_next_root(_kvm, _root))
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared))
/* Only safe under the MMU lock in write mode, without yielding. */
#define for_each_tdp_mmu_root(_kvm, _root) \
@@ -632,7 +646,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* Return false if a yield was not needed.
*/
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush)
+ struct tdp_iter *iter, bool flush,
+ bool shared)
{
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -644,7 +659,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
if (flush)
kvm_flush_remote_tlbs(kvm);
- cond_resched_rwlock_write(&kvm->mmu_lock);
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -662,23 +681,33 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
* operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
*/
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield)
+ gfn_t start, gfn_t end, bool can_yield, bool shared)
{
struct tdp_iter iter;
bool flush_needed = false;
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
+retry:
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed,
+ shared)) {
flush_needed = false;
continue;
}
@@ -696,8 +725,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
- flush_needed = true;
+ if (!shared) {
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ flush_needed = true;
+ } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
}
rcu_read_unlock();
@@ -709,14 +747,20 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU in write mode.
*/
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
+bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool shared)
{
struct kvm_mmu_page *root;
bool flush = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root)
- flush |= zap_gfn_range(kvm, root, start, end, true);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, shared)
+ flush |= zap_gfn_range(kvm, root, start, end, true, shared);
return flush;
}
@@ -726,7 +770,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
bool flush;
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn, false);
if (flush)
kvm_flush_remote_tlbs(kvm);
}
@@ -893,7 +937,7 @@ static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
int ret = 0;
int as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
as_id = kvm_mmu_page_as_id(root);
slots = __kvm_memslots(kvm, as_id);
kvm_for_each_memslot(memslot, slots) {
@@ -933,7 +977,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
struct kvm_mmu_page *root, gfn_t start,
gfn_t end, unsigned long unused)
{
- return zap_gfn_range(kvm, root, start, end, false);
+ return zap_gfn_range(kvm, root, start, end, false, false);
}
int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
@@ -1098,7 +1142,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
continue;
if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1128,7 +1172,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
@@ -1157,7 +1201,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
continue;
if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1193,7 +1237,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
@@ -1291,7 +1335,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set, false)) {
spte_set = false;
continue;
}
@@ -1326,7 +1370,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
struct kvm_mmu_page *root;
int root_as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 9961df505067..855e58856815 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -13,9 +13,11 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
+bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool shared);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
--
2.31.0.291.g576ba9dcdaf-goog
next prev parent reply other threads:[~2021-03-31 21:10 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-03-31 21:08 [PATCH 00/13] More parallel operations for the TDP MMU Ben Gardon
2021-03-31 21:08 ` [PATCH 01/13] KVM: x86/mmu: Re-add const qualifier in kvm_tdp_mmu_zap_collapsible_sptes Ben Gardon
2021-03-31 21:08 ` [PATCH 02/13] KVM: x86/mmu: Move kvm_mmu_(get|put)_root to TDP MMU Ben Gardon
2021-03-31 21:08 ` [PATCH 03/13] KVM: x86/mmu: use tdp_mmu_free_sp to free roots Ben Gardon
2021-03-31 21:08 ` [PATCH 04/13] KVM: x86/mmu: Merge TDP MMU put and free root Ben Gardon
2021-03-31 21:08 ` [PATCH 05/13] KVM: x86/mmu: comment for_each_tdp_mmu_root requires MMU write lock Ben Gardon
2021-03-31 21:08 ` [PATCH 06/13] KVM: x86/mmu: Refactor yield safe root iterator Ben Gardon
2021-03-31 21:08 ` [PATCH 07/13] KVM: x86/mmu: Make TDP MMU root refcount atomic Ben Gardon
2021-03-31 22:21 ` Sean Christopherson
2021-04-01 16:50 ` Ben Gardon
2021-03-31 21:08 ` [PATCH 08/13] KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU Ben Gardon
2021-04-01 9:37 ` Paolo Bonzini
2021-04-01 16:48 ` Ben Gardon
2021-04-01 13:16 ` kernel test robot
2021-04-01 16:50 ` Ben Gardon
2021-03-31 21:08 ` Ben Gardon [this message]
2021-04-01 9:58 ` [PATCH 09/13] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock Paolo Bonzini
2021-04-01 16:50 ` Ben Gardon
2021-03-31 21:08 ` [PATCH 10/13] KVM: x86/mmu: Allow zapping collapsible SPTEs to use MMU " Ben Gardon
2021-04-01 10:12 ` Paolo Bonzini
2021-03-31 21:08 ` [PATCH 11/13] KVM: x86/mmu: Allow enabling / disabling dirty logging under " Ben Gardon
2021-03-31 21:08 ` [PATCH 12/13] KVM: x86/mmu: Fast invalidation for TDP MMU Ben Gardon
2021-03-31 22:27 ` Sean Christopherson
2021-04-01 16:50 ` Ben Gardon
2021-04-01 10:36 ` Paolo Bonzini
2021-04-01 16:50 ` Ben Gardon
2021-04-01 17:02 ` Paolo Bonzini
2021-03-31 21:08 ` [PATCH 13/13] KVM: x86/mmu: Tear down roots in fast invalidation thread Ben Gardon
2021-03-31 22:29 ` Sean Christopherson
2021-04-01 10:41 ` Paolo Bonzini
2021-04-01 10:37 ` Paolo Bonzini
2021-04-01 10:43 ` [PATCH 00/13] More parallel operations for the TDP MMU Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210331210841.3996155-10-bgardon@google.com \
--to=bgardon@google.com \
--cc=jmattson@google.com \
--cc=junaids@google.com \
--cc=kernellwp@gmail.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=pfeiner@google.com \
--cc=pshier@google.com \
--cc=seanjc@google.com \
--cc=vkuznets@redhat.com \
--cc=xiaoguangrong.eric@gmail.com \
--cc=yulei.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).