All of lore.kernel.org
 help / color / mirror / Atom feed
From: Sean Christopherson <seanjc@google.com>
To: Marc Zyngier <maz@kernel.org>,
	Huacai Chen <chenhuacai@kernel.org>,
	Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>,
	Paul Mackerras <paulus@ozlabs.org>,
	Paolo Bonzini <pbonzini@redhat.com>
Cc: James Morse <james.morse@arm.com>,
	Julien Thierry <julien.thierry.kdev@gmail.com>,
	Suzuki K Poulose <suzuki.poulose@arm.com>,
	Sean Christopherson <seanjc@google.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	linux-arm-kernel@lists.infradead.org,
	kvmarm@lists.cs.columbia.edu, linux-mips@vger.kernel.org,
	kvm@vger.kernel.org, kvm-ppc@vger.kernel.org,
	linux-kernel@vger.kernel.org, Ben Gardon <bgardon@google.com>
Subject: [PATCH v2 09/10] KVM: Don't take mmu_lock for range invalidation unless necessary
Date: Thu,  1 Apr 2021 17:56:57 -0700	[thread overview]
Message-ID: <20210402005658.3024832-10-seanjc@google.com> (raw)
In-Reply-To: <20210402005658.3024832-1-seanjc@google.com>

Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}()
notifications.  Because mmu_notifier_count must be modified while holding
mmu_lock for write, and must always be paired across start->end to stay
balanced, lock elision must happen in both or none.  To meet that
requirement, add a rwsem to prevent memslot updates across range_start()
and range_end().

Use a rwsem instead of a rwlock since most notifiers _allow_ blocking,
and the lock will be endl across the entire start() ... end() sequence.
If anything in the sequence sleeps, including the caller or a different
notifier, holding the spinlock would be disastrous.

For notifiers that _disallow_ blocking, e.g. OOM reaping, simply go down
the slow path of unconditionally acquiring mmu_lock.  The sane
alternative would be to try to acquire the lock and force the notifier
to retry on failure.  But since OOM is currently the _only_ scenario
where blocking is disallowed attempting to optimize a guest that has been
marked for death is pointless.

Unconditionally define and use mmu_notifier_slots_lock in the memslots
code, purely to avoid more #ifdefs.  The overhead of acquiring the lock
is negligible when the lock is uncontested, which will always be the case
when the MMU notifiers are not used.

Note, technically flag-only memslot updates could be allowed in parallel,
but stalling a memslot update for a relatively short amount of time is
not a scalability issue, and this is all more than complex enough.

Based heavily on code from Ben Gardon.

Suggested-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h |  6 ++-
 virt/kvm/kvm_main.c      | 96 +++++++++++++++++++++++++++++++---------
 2 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 40ac2d40bb5a..bc3dd2838bb8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -472,6 +472,7 @@ struct kvm {
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
 	struct mutex slots_lock;
+	struct rw_semaphore mmu_notifier_slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
@@ -660,8 +661,9 @@ static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
 	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
 	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
-			lockdep_is_held(&kvm->slots_lock) ||
-			!refcount_read(&kvm->users_count));
+				      lockdep_is_held(&kvm->slots_lock) ||
+				      lockdep_is_held(&kvm->mmu_notifier_slots_lock) ||
+				      !refcount_read(&kvm->users_count));
 }
 
 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f6697ad741ed..af28f39817a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -462,6 +462,7 @@ struct kvm_hva_range {
 	pte_t pte;
 	hva_handler_t handler;
 	on_lock_fn_t on_lock;
+	bool must_lock;
 	bool flush_on_ret;
 	bool may_block;
 };
@@ -479,6 +480,25 @@ static void kvm_null_fn(void)
 }
 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 
+
+/* Acquire mmu_lock if necessary.  Returns %true if @handler is "null" */
+static __always_inline bool kvm_mmu_lock_and_check_handler(struct kvm *kvm,
+							   const struct kvm_hva_range *range,
+							   bool *locked)
+{
+	if (*locked)
+		return false;
+
+	*locked = true;
+
+	KVM_MMU_LOCK(kvm);
+
+	if (!IS_KVM_NULL_FN(range->on_lock))
+		range->on_lock(kvm, range->start, range->end);
+
+	return IS_KVM_NULL_FN(range->handler);
+}
+
 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 						  const struct kvm_hva_range *range)
 {
@@ -495,16 +515,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 
 	idx = srcu_read_lock(&kvm->srcu);
 
-	/* The on_lock() path does not yet support lock elision. */
-	if (!IS_KVM_NULL_FN(range->on_lock)) {
-		locked = true;
-		KVM_MMU_LOCK(kvm);
-
-		range->on_lock(kvm, range->start, range->end);
-
-		if (IS_KVM_NULL_FN(range->handler))
-			goto out_unlock;
-	}
+	if (range->must_lock &&
+	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+		goto out_unlock;
 
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
@@ -534,10 +547,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 			gfn_range.slot = slot;
 
-			if (!locked) {
-				locked = true;
-				KVM_MMU_LOCK(kvm);
-			}
+			if (kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+				goto out_unlock;
+
 			ret |= range->handler(kvm, &gfn_range);
 		}
 	}
@@ -568,6 +580,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 		.pte		= pte,
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= true,
 		.may_block	= false,
 	};
@@ -587,6 +600,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 		.pte		= __pte(0),
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= false,
 		.may_block	= false,
 	};
@@ -603,11 +617,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	trace_kvm_set_spte_hva(address);
 
 	/*
-	 * .change_pte() must be bookended by .invalidate_range_{start,end}(),
-	 * and so always runs with an elevated notifier count.  This obviates
-	 * the need to bump the sequence count.
+	 * .change_pte() must be bookended by .invalidate_range_{start,end}().
+	 * If mmu_notifier_count is zero, then start() didn't find a relevant
+	 * memslot and wasn't forced down the slow path; rechecking here is
+	 * unnecessary.  This can only occur if memslot updates are blocked.
 	 */
-	WARN_ON_ONCE(!kvm->mmu_notifier_count);
+	if (!kvm->mmu_notifier_count) {
+		lockdep_assert_held(&kvm->mmu_notifier_slots_lock);
+		return;
+	}
 
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
@@ -644,6 +662,7 @@ static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -651,12 +670,29 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= kvm_unmap_gfn_range,
 		.on_lock	= kvm_inc_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	trace_kvm_unmap_hva_range(range->start, range->end);
 
+	/*
+	 * Prevent memslot modification between range_start() and range_end()
+	 * so that conditionally locking provides the same result in both
+	 * functions.  Without that guarantee, the mmu_notifier_count
+	 * adjustments will be imbalanced.
+	 *
+	 * Skip the memslot-lookup lock elision (set @must_lock above) to avoid
+	 * having to take the semaphore on non-blockable calls, e.g. OOM kill.
+	 * The complexity required to handle conditional locking for this case
+	 * is not worth the marginal benefits, the VM is likely doomed anyways.
+	 *
+	 * Pairs with the unlock in range_end().
+	 */
+	if (blockable)
+		down_read(&kvm->mmu_notifier_slots_lock);
+
 	__kvm_handle_hva_range(kvm, &hva_range);
 
 	return 0;
@@ -683,6 +719,7 @@ static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -690,12 +727,17 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= (void *)kvm_null_fn,
 		.on_lock	= kvm_dec_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	/* Pairs with the lock in range_start(). */
+	if (blockable)
+		up_read(&kvm->mmu_notifier_slots_lock);
+
 	BUG_ON(kvm->mmu_notifier_count < 0);
 }
 
@@ -908,6 +950,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 	INIT_LIST_HEAD(&kvm->devices);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1028,6 +1071,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	/*
+	 * Reset the lock used to prevent memslot updates between MMU notifier
+	 * invalidate_range_start() and invalidate_range_end().  At this point,
+	 * no more MMU notifiers will run and pending calls to ...start() have
+	 * completed.  But, the lock could still be held if KVM's notifier was
+	 * removed between ...start() and ...end().  No threads can be waiting
+	 * on the lock as the last reference on KVM has been dropped.  If the
+	 * lock is still held, freeing memslots will deadlock.
+	 */
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 #else
 	kvm_arch_flush_shadow_all(kvm);
 #endif
@@ -1279,7 +1332,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
+	down_write(&kvm->mmu_notifier_slots_lock);
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+	up_write(&kvm->mmu_notifier_slots_lock);
+
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
-- 
2.31.0.208.g409f899ff0-goog


WARNING: multiple messages have this Message-ID (diff)
From: Sean Christopherson <seanjc@google.com>
To: Marc Zyngier <maz@kernel.org>,
	Huacai Chen <chenhuacai@kernel.org>,
	 Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>,
	Paul Mackerras <paulus@ozlabs.org>,
	Paolo Bonzini <pbonzini@redhat.com>
Cc: Wanpeng Li <wanpengli@tencent.com>,
	kvm@vger.kernel.org, Sean Christopherson <seanjc@google.com>,
	Joerg Roedel <joro@8bytes.org>,
	linux-mips@vger.kernel.org, kvm-ppc@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	Ben Gardon <bgardon@google.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	kvmarm@lists.cs.columbia.edu, Jim Mattson <jmattson@google.com>
Subject: [PATCH v2 09/10] KVM: Don't take mmu_lock for range invalidation unless necessary
Date: Thu,  1 Apr 2021 17:56:57 -0700	[thread overview]
Message-ID: <20210402005658.3024832-10-seanjc@google.com> (raw)
In-Reply-To: <20210402005658.3024832-1-seanjc@google.com>

Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}()
notifications.  Because mmu_notifier_count must be modified while holding
mmu_lock for write, and must always be paired across start->end to stay
balanced, lock elision must happen in both or none.  To meet that
requirement, add a rwsem to prevent memslot updates across range_start()
and range_end().

Use a rwsem instead of a rwlock since most notifiers _allow_ blocking,
and the lock will be endl across the entire start() ... end() sequence.
If anything in the sequence sleeps, including the caller or a different
notifier, holding the spinlock would be disastrous.

For notifiers that _disallow_ blocking, e.g. OOM reaping, simply go down
the slow path of unconditionally acquiring mmu_lock.  The sane
alternative would be to try to acquire the lock and force the notifier
to retry on failure.  But since OOM is currently the _only_ scenario
where blocking is disallowed attempting to optimize a guest that has been
marked for death is pointless.

Unconditionally define and use mmu_notifier_slots_lock in the memslots
code, purely to avoid more #ifdefs.  The overhead of acquiring the lock
is negligible when the lock is uncontested, which will always be the case
when the MMU notifiers are not used.

Note, technically flag-only memslot updates could be allowed in parallel,
but stalling a memslot update for a relatively short amount of time is
not a scalability issue, and this is all more than complex enough.

Based heavily on code from Ben Gardon.

Suggested-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h |  6 ++-
 virt/kvm/kvm_main.c      | 96 +++++++++++++++++++++++++++++++---------
 2 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 40ac2d40bb5a..bc3dd2838bb8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -472,6 +472,7 @@ struct kvm {
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
 	struct mutex slots_lock;
+	struct rw_semaphore mmu_notifier_slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
@@ -660,8 +661,9 @@ static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
 	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
 	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
-			lockdep_is_held(&kvm->slots_lock) ||
-			!refcount_read(&kvm->users_count));
+				      lockdep_is_held(&kvm->slots_lock) ||
+				      lockdep_is_held(&kvm->mmu_notifier_slots_lock) ||
+				      !refcount_read(&kvm->users_count));
 }
 
 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f6697ad741ed..af28f39817a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -462,6 +462,7 @@ struct kvm_hva_range {
 	pte_t pte;
 	hva_handler_t handler;
 	on_lock_fn_t on_lock;
+	bool must_lock;
 	bool flush_on_ret;
 	bool may_block;
 };
@@ -479,6 +480,25 @@ static void kvm_null_fn(void)
 }
 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 
+
+/* Acquire mmu_lock if necessary.  Returns %true if @handler is "null" */
+static __always_inline bool kvm_mmu_lock_and_check_handler(struct kvm *kvm,
+							   const struct kvm_hva_range *range,
+							   bool *locked)
+{
+	if (*locked)
+		return false;
+
+	*locked = true;
+
+	KVM_MMU_LOCK(kvm);
+
+	if (!IS_KVM_NULL_FN(range->on_lock))
+		range->on_lock(kvm, range->start, range->end);
+
+	return IS_KVM_NULL_FN(range->handler);
+}
+
 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 						  const struct kvm_hva_range *range)
 {
@@ -495,16 +515,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 
 	idx = srcu_read_lock(&kvm->srcu);
 
-	/* The on_lock() path does not yet support lock elision. */
-	if (!IS_KVM_NULL_FN(range->on_lock)) {
-		locked = true;
-		KVM_MMU_LOCK(kvm);
-
-		range->on_lock(kvm, range->start, range->end);
-
-		if (IS_KVM_NULL_FN(range->handler))
-			goto out_unlock;
-	}
+	if (range->must_lock &&
+	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+		goto out_unlock;
 
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
@@ -534,10 +547,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 			gfn_range.slot = slot;
 
-			if (!locked) {
-				locked = true;
-				KVM_MMU_LOCK(kvm);
-			}
+			if (kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+				goto out_unlock;
+
 			ret |= range->handler(kvm, &gfn_range);
 		}
 	}
@@ -568,6 +580,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 		.pte		= pte,
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= true,
 		.may_block	= false,
 	};
@@ -587,6 +600,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 		.pte		= __pte(0),
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= false,
 		.may_block	= false,
 	};
@@ -603,11 +617,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	trace_kvm_set_spte_hva(address);
 
 	/*
-	 * .change_pte() must be bookended by .invalidate_range_{start,end}(),
-	 * and so always runs with an elevated notifier count.  This obviates
-	 * the need to bump the sequence count.
+	 * .change_pte() must be bookended by .invalidate_range_{start,end}().
+	 * If mmu_notifier_count is zero, then start() didn't find a relevant
+	 * memslot and wasn't forced down the slow path; rechecking here is
+	 * unnecessary.  This can only occur if memslot updates are blocked.
 	 */
-	WARN_ON_ONCE(!kvm->mmu_notifier_count);
+	if (!kvm->mmu_notifier_count) {
+		lockdep_assert_held(&kvm->mmu_notifier_slots_lock);
+		return;
+	}
 
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
@@ -644,6 +662,7 @@ static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -651,12 +670,29 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= kvm_unmap_gfn_range,
 		.on_lock	= kvm_inc_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	trace_kvm_unmap_hva_range(range->start, range->end);
 
+	/*
+	 * Prevent memslot modification between range_start() and range_end()
+	 * so that conditionally locking provides the same result in both
+	 * functions.  Without that guarantee, the mmu_notifier_count
+	 * adjustments will be imbalanced.
+	 *
+	 * Skip the memslot-lookup lock elision (set @must_lock above) to avoid
+	 * having to take the semaphore on non-blockable calls, e.g. OOM kill.
+	 * The complexity required to handle conditional locking for this case
+	 * is not worth the marginal benefits, the VM is likely doomed anyways.
+	 *
+	 * Pairs with the unlock in range_end().
+	 */
+	if (blockable)
+		down_read(&kvm->mmu_notifier_slots_lock);
+
 	__kvm_handle_hva_range(kvm, &hva_range);
 
 	return 0;
@@ -683,6 +719,7 @@ static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -690,12 +727,17 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= (void *)kvm_null_fn,
 		.on_lock	= kvm_dec_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	/* Pairs with the lock in range_start(). */
+	if (blockable)
+		up_read(&kvm->mmu_notifier_slots_lock);
+
 	BUG_ON(kvm->mmu_notifier_count < 0);
 }
 
@@ -908,6 +950,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 	INIT_LIST_HEAD(&kvm->devices);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1028,6 +1071,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	/*
+	 * Reset the lock used to prevent memslot updates between MMU notifier
+	 * invalidate_range_start() and invalidate_range_end().  At this point,
+	 * no more MMU notifiers will run and pending calls to ...start() have
+	 * completed.  But, the lock could still be held if KVM's notifier was
+	 * removed between ...start() and ...end().  No threads can be waiting
+	 * on the lock as the last reference on KVM has been dropped.  If the
+	 * lock is still held, freeing memslots will deadlock.
+	 */
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 #else
 	kvm_arch_flush_shadow_all(kvm);
 #endif
@@ -1279,7 +1332,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
+	down_write(&kvm->mmu_notifier_slots_lock);
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+	up_write(&kvm->mmu_notifier_slots_lock);
+
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
-- 
2.31.0.208.g409f899ff0-goog

_______________________________________________
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm

WARNING: multiple messages have this Message-ID (diff)
From: Sean Christopherson <seanjc@google.com>
To: Marc Zyngier <maz@kernel.org>,
	Huacai Chen <chenhuacai@kernel.org>,
	 Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>,
	Paul Mackerras <paulus@ozlabs.org>,
	Paolo Bonzini <pbonzini@redhat.com>
Cc: James Morse <james.morse@arm.com>,
	Julien Thierry <julien.thierry.kdev@gmail.com>,
	 Suzuki K Poulose <suzuki.poulose@arm.com>,
	Sean Christopherson <seanjc@google.com>,
	 Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	 Jim Mattson <jmattson@google.com>,
	Joerg Roedel <joro@8bytes.org>,
	 linux-arm-kernel@lists.infradead.org,
	kvmarm@lists.cs.columbia.edu,  linux-mips@vger.kernel.org,
	kvm@vger.kernel.org, kvm-ppc@vger.kernel.org,
	 linux-kernel@vger.kernel.org, Ben Gardon <bgardon@google.com>
Subject: [PATCH v2 09/10] KVM: Don't take mmu_lock for range invalidation unless necessary
Date: Thu,  1 Apr 2021 17:56:57 -0700	[thread overview]
Message-ID: <20210402005658.3024832-10-seanjc@google.com> (raw)
In-Reply-To: <20210402005658.3024832-1-seanjc@google.com>

Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}()
notifications.  Because mmu_notifier_count must be modified while holding
mmu_lock for write, and must always be paired across start->end to stay
balanced, lock elision must happen in both or none.  To meet that
requirement, add a rwsem to prevent memslot updates across range_start()
and range_end().

Use a rwsem instead of a rwlock since most notifiers _allow_ blocking,
and the lock will be endl across the entire start() ... end() sequence.
If anything in the sequence sleeps, including the caller or a different
notifier, holding the spinlock would be disastrous.

For notifiers that _disallow_ blocking, e.g. OOM reaping, simply go down
the slow path of unconditionally acquiring mmu_lock.  The sane
alternative would be to try to acquire the lock and force the notifier
to retry on failure.  But since OOM is currently the _only_ scenario
where blocking is disallowed attempting to optimize a guest that has been
marked for death is pointless.

Unconditionally define and use mmu_notifier_slots_lock in the memslots
code, purely to avoid more #ifdefs.  The overhead of acquiring the lock
is negligible when the lock is uncontested, which will always be the case
when the MMU notifiers are not used.

Note, technically flag-only memslot updates could be allowed in parallel,
but stalling a memslot update for a relatively short amount of time is
not a scalability issue, and this is all more than complex enough.

Based heavily on code from Ben Gardon.

Suggested-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h |  6 ++-
 virt/kvm/kvm_main.c      | 96 +++++++++++++++++++++++++++++++---------
 2 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 40ac2d40bb5a..bc3dd2838bb8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -472,6 +472,7 @@ struct kvm {
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
 	struct mutex slots_lock;
+	struct rw_semaphore mmu_notifier_slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
@@ -660,8 +661,9 @@ static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
 	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
 	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
-			lockdep_is_held(&kvm->slots_lock) ||
-			!refcount_read(&kvm->users_count));
+				      lockdep_is_held(&kvm->slots_lock) ||
+				      lockdep_is_held(&kvm->mmu_notifier_slots_lock) ||
+				      !refcount_read(&kvm->users_count));
 }
 
 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f6697ad741ed..af28f39817a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -462,6 +462,7 @@ struct kvm_hva_range {
 	pte_t pte;
 	hva_handler_t handler;
 	on_lock_fn_t on_lock;
+	bool must_lock;
 	bool flush_on_ret;
 	bool may_block;
 };
@@ -479,6 +480,25 @@ static void kvm_null_fn(void)
 }
 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 
+
+/* Acquire mmu_lock if necessary.  Returns %true if @handler is "null" */
+static __always_inline bool kvm_mmu_lock_and_check_handler(struct kvm *kvm,
+							   const struct kvm_hva_range *range,
+							   bool *locked)
+{
+	if (*locked)
+		return false;
+
+	*locked = true;
+
+	KVM_MMU_LOCK(kvm);
+
+	if (!IS_KVM_NULL_FN(range->on_lock))
+		range->on_lock(kvm, range->start, range->end);
+
+	return IS_KVM_NULL_FN(range->handler);
+}
+
 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 						  const struct kvm_hva_range *range)
 {
@@ -495,16 +515,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 
 	idx = srcu_read_lock(&kvm->srcu);
 
-	/* The on_lock() path does not yet support lock elision. */
-	if (!IS_KVM_NULL_FN(range->on_lock)) {
-		locked = true;
-		KVM_MMU_LOCK(kvm);
-
-		range->on_lock(kvm, range->start, range->end);
-
-		if (IS_KVM_NULL_FN(range->handler))
-			goto out_unlock;
-	}
+	if (range->must_lock &&
+	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+		goto out_unlock;
 
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
@@ -534,10 +547,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 			gfn_range.slot = slot;
 
-			if (!locked) {
-				locked = true;
-				KVM_MMU_LOCK(kvm);
-			}
+			if (kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+				goto out_unlock;
+
 			ret |= range->handler(kvm, &gfn_range);
 		}
 	}
@@ -568,6 +580,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 		.pte		= pte,
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= true,
 		.may_block	= false,
 	};
@@ -587,6 +600,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 		.pte		= __pte(0),
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= false,
 		.may_block	= false,
 	};
@@ -603,11 +617,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	trace_kvm_set_spte_hva(address);
 
 	/*
-	 * .change_pte() must be bookended by .invalidate_range_{start,end}(),
-	 * and so always runs with an elevated notifier count.  This obviates
-	 * the need to bump the sequence count.
+	 * .change_pte() must be bookended by .invalidate_range_{start,end}().
+	 * If mmu_notifier_count is zero, then start() didn't find a relevant
+	 * memslot and wasn't forced down the slow path; rechecking here is
+	 * unnecessary.  This can only occur if memslot updates are blocked.
 	 */
-	WARN_ON_ONCE(!kvm->mmu_notifier_count);
+	if (!kvm->mmu_notifier_count) {
+		lockdep_assert_held(&kvm->mmu_notifier_slots_lock);
+		return;
+	}
 
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
@@ -644,6 +662,7 @@ static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -651,12 +670,29 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= kvm_unmap_gfn_range,
 		.on_lock	= kvm_inc_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	trace_kvm_unmap_hva_range(range->start, range->end);
 
+	/*
+	 * Prevent memslot modification between range_start() and range_end()
+	 * so that conditionally locking provides the same result in both
+	 * functions.  Without that guarantee, the mmu_notifier_count
+	 * adjustments will be imbalanced.
+	 *
+	 * Skip the memslot-lookup lock elision (set @must_lock above) to avoid
+	 * having to take the semaphore on non-blockable calls, e.g. OOM kill.
+	 * The complexity required to handle conditional locking for this case
+	 * is not worth the marginal benefits, the VM is likely doomed anyways.
+	 *
+	 * Pairs with the unlock in range_end().
+	 */
+	if (blockable)
+		down_read(&kvm->mmu_notifier_slots_lock);
+
 	__kvm_handle_hva_range(kvm, &hva_range);
 
 	return 0;
@@ -683,6 +719,7 @@ static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -690,12 +727,17 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= (void *)kvm_null_fn,
 		.on_lock	= kvm_dec_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	/* Pairs with the lock in range_start(). */
+	if (blockable)
+		up_read(&kvm->mmu_notifier_slots_lock);
+
 	BUG_ON(kvm->mmu_notifier_count < 0);
 }
 
@@ -908,6 +950,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 	INIT_LIST_HEAD(&kvm->devices);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1028,6 +1071,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	/*
+	 * Reset the lock used to prevent memslot updates between MMU notifier
+	 * invalidate_range_start() and invalidate_range_end().  At this point,
+	 * no more MMU notifiers will run and pending calls to ...start() have
+	 * completed.  But, the lock could still be held if KVM's notifier was
+	 * removed between ...start() and ...end().  No threads can be waiting
+	 * on the lock as the last reference on KVM has been dropped.  If the
+	 * lock is still held, freeing memslots will deadlock.
+	 */
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 #else
 	kvm_arch_flush_shadow_all(kvm);
 #endif
@@ -1279,7 +1332,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
+	down_write(&kvm->mmu_notifier_slots_lock);
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+	up_write(&kvm->mmu_notifier_slots_lock);
+
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
-- 
2.31.0.208.g409f899ff0-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

WARNING: multiple messages have this Message-ID (diff)
From: Sean Christopherson <seanjc@google.com>
To: Marc Zyngier <maz@kernel.org>,
	Huacai Chen <chenhuacai@kernel.org>,
	Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>,
	Paul Mackerras <paulus@ozlabs.org>,
	Paolo Bonzini <pbonzini@redhat.com>
Cc: James Morse <james.morse@arm.com>,
	Julien Thierry <julien.thierry.kdev@gmail.com>,
	Suzuki K Poulose <suzuki.poulose@arm.com>,
	Sean Christopherson <seanjc@google.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	linux-arm-kernel@lists.infradead.org,
	kvmarm@lists.cs.columbia.edu, linux-mips@vger.kernel.org,
	kvm@vger.kernel.org, kvm-ppc@vger.kernel.org,
	linux-kernel@vger.kernel.org, Ben Gardon <bgardon@google.com>
Subject: [PATCH v2 09/10] KVM: Don't take mmu_lock for range invalidation unless necessary
Date: Fri, 02 Apr 2021 00:56:57 +0000	[thread overview]
Message-ID: <20210402005658.3024832-10-seanjc@google.com> (raw)
In-Reply-To: <20210402005658.3024832-1-seanjc@google.com>

Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}()
notifications.  Because mmu_notifier_count must be modified while holding
mmu_lock for write, and must always be paired across start->end to stay
balanced, lock elision must happen in both or none.  To meet that
requirement, add a rwsem to prevent memslot updates across range_start()
and range_end().

Use a rwsem instead of a rwlock since most notifiers _allow_ blocking,
and the lock will be endl across the entire start() ... end() sequence.
If anything in the sequence sleeps, including the caller or a different
notifier, holding the spinlock would be disastrous.

For notifiers that _disallow_ blocking, e.g. OOM reaping, simply go down
the slow path of unconditionally acquiring mmu_lock.  The sane
alternative would be to try to acquire the lock and force the notifier
to retry on failure.  But since OOM is currently the _only_ scenario
where blocking is disallowed attempting to optimize a guest that has been
marked for death is pointless.

Unconditionally define and use mmu_notifier_slots_lock in the memslots
code, purely to avoid more #ifdefs.  The overhead of acquiring the lock
is negligible when the lock is uncontested, which will always be the case
when the MMU notifiers are not used.

Note, technically flag-only memslot updates could be allowed in parallel,
but stalling a memslot update for a relatively short amount of time is
not a scalability issue, and this is all more than complex enough.

Based heavily on code from Ben Gardon.

Suggested-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h |  6 ++-
 virt/kvm/kvm_main.c      | 96 +++++++++++++++++++++++++++++++---------
 2 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 40ac2d40bb5a..bc3dd2838bb8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -472,6 +472,7 @@ struct kvm {
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
 	struct mutex slots_lock;
+	struct rw_semaphore mmu_notifier_slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
@@ -660,8 +661,9 @@ static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
 	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
 	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
-			lockdep_is_held(&kvm->slots_lock) ||
-			!refcount_read(&kvm->users_count));
+				      lockdep_is_held(&kvm->slots_lock) ||
+				      lockdep_is_held(&kvm->mmu_notifier_slots_lock) ||
+				      !refcount_read(&kvm->users_count));
 }
 
 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f6697ad741ed..af28f39817a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -462,6 +462,7 @@ struct kvm_hva_range {
 	pte_t pte;
 	hva_handler_t handler;
 	on_lock_fn_t on_lock;
+	bool must_lock;
 	bool flush_on_ret;
 	bool may_block;
 };
@@ -479,6 +480,25 @@ static void kvm_null_fn(void)
 }
 #define IS_KVM_NULL_FN(fn) ((fn) = (void *)kvm_null_fn)
 
+
+/* Acquire mmu_lock if necessary.  Returns %true if @handler is "null" */
+static __always_inline bool kvm_mmu_lock_and_check_handler(struct kvm *kvm,
+							   const struct kvm_hva_range *range,
+							   bool *locked)
+{
+	if (*locked)
+		return false;
+
+	*locked = true;
+
+	KVM_MMU_LOCK(kvm);
+
+	if (!IS_KVM_NULL_FN(range->on_lock))
+		range->on_lock(kvm, range->start, range->end);
+
+	return IS_KVM_NULL_FN(range->handler);
+}
+
 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 						  const struct kvm_hva_range *range)
 {
@@ -495,16 +515,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 
 	idx = srcu_read_lock(&kvm->srcu);
 
-	/* The on_lock() path does not yet support lock elision. */
-	if (!IS_KVM_NULL_FN(range->on_lock)) {
-		locked = true;
-		KVM_MMU_LOCK(kvm);
-
-		range->on_lock(kvm, range->start, range->end);
-
-		if (IS_KVM_NULL_FN(range->handler))
-			goto out_unlock;
-	}
+	if (range->must_lock &&
+	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+		goto out_unlock;
 
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
@@ -534,10 +547,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 			gfn_range.slot = slot;
 
-			if (!locked) {
-				locked = true;
-				KVM_MMU_LOCK(kvm);
-			}
+			if (kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+				goto out_unlock;
+
 			ret |= range->handler(kvm, &gfn_range);
 		}
 	}
@@ -568,6 +580,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 		.pte		= pte,
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= true,
 		.may_block	= false,
 	};
@@ -587,6 +600,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 		.pte		= __pte(0),
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= false,
 		.may_block	= false,
 	};
@@ -603,11 +617,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	trace_kvm_set_spte_hva(address);
 
 	/*
-	 * .change_pte() must be bookended by .invalidate_range_{start,end}(),
-	 * and so always runs with an elevated notifier count.  This obviates
-	 * the need to bump the sequence count.
+	 * .change_pte() must be bookended by .invalidate_range_{start,end}().
+	 * If mmu_notifier_count is zero, then start() didn't find a relevant
+	 * memslot and wasn't forced down the slow path; rechecking here is
+	 * unnecessary.  This can only occur if memslot updates are blocked.
 	 */
-	WARN_ON_ONCE(!kvm->mmu_notifier_count);
+	if (!kvm->mmu_notifier_count) {
+		lockdep_assert_held(&kvm->mmu_notifier_slots_lock);
+		return;
+	}
 
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
@@ -644,6 +662,7 @@ static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -651,12 +670,29 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= kvm_unmap_gfn_range,
 		.on_lock	= kvm_inc_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	trace_kvm_unmap_hva_range(range->start, range->end);
 
+	/*
+	 * Prevent memslot modification between range_start() and range_end()
+	 * so that conditionally locking provides the same result in both
+	 * functions.  Without that guarantee, the mmu_notifier_count
+	 * adjustments will be imbalanced.
+	 *
+	 * Skip the memslot-lookup lock elision (set @must_lock above) to avoid
+	 * having to take the semaphore on non-blockable calls, e.g. OOM kill.
+	 * The complexity required to handle conditional locking for this case
+	 * is not worth the marginal benefits, the VM is likely doomed anyways.
+	 *
+	 * Pairs with the unlock in range_end().
+	 */
+	if (blockable)
+		down_read(&kvm->mmu_notifier_slots_lock);
+
 	__kvm_handle_hva_range(kvm, &hva_range);
 
 	return 0;
@@ -683,6 +719,7 @@ static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
@@ -690,12 +727,17 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 		.pte		= __pte(0),
 		.handler	= (void *)kvm_null_fn,
 		.on_lock	= kvm_dec_notifier_count,
+		.must_lock	= !blockable,
 		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
+		.may_block	= blockable,
 	};
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	/* Pairs with the lock in range_start(). */
+	if (blockable)
+		up_read(&kvm->mmu_notifier_slots_lock);
+
 	BUG_ON(kvm->mmu_notifier_count < 0);
 }
 
@@ -908,6 +950,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 	INIT_LIST_HEAD(&kvm->devices);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1028,6 +1071,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	/*
+	 * Reset the lock used to prevent memslot updates between MMU notifier
+	 * invalidate_range_start() and invalidate_range_end().  At this point,
+	 * no more MMU notifiers will run and pending calls to ...start() have
+	 * completed.  But, the lock could still be held if KVM's notifier was
+	 * removed between ...start() and ...end().  No threads can be waiting
+	 * on the lock as the last reference on KVM has been dropped.  If the
+	 * lock is still held, freeing memslots will deadlock.
+	 */
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 #else
 	kvm_arch_flush_shadow_all(kvm);
 #endif
@@ -1279,7 +1332,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
+	down_write(&kvm->mmu_notifier_slots_lock);
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+	up_write(&kvm->mmu_notifier_slots_lock);
+
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
-- 
2.31.0.208.g409f899ff0-goog

  parent reply	other threads:[~2021-04-02  0:57 UTC|newest]

Thread overview: 84+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-02  0:56 [PATCH v2 00/10] KVM: Consolidate and optimize MMU notifiers Sean Christopherson
2021-04-02  0:56 ` Sean Christopherson
2021-04-02  0:56 ` Sean Christopherson
2021-04-02  0:56 ` Sean Christopherson
2021-04-02  0:56 ` [PATCH v2 01/10] KVM: Assert that notifier count is elevated in .change_pte() Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02 11:08   ` Paolo Bonzini
2021-04-02 11:08     ` Paolo Bonzini
2021-04-02 11:08     ` Paolo Bonzini
2021-04-02 11:08     ` Paolo Bonzini
2021-04-02  0:56 ` [PATCH v2 02/10] KVM: Move x86's MMU notifier memslot walkers to generic code Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56 ` [PATCH v2 03/10] KVM: arm64: Convert to the gfn-based MMU notifier callbacks Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-12 10:12   ` Marc Zyngier
2021-04-12 10:12     ` Marc Zyngier
2021-04-12 10:12     ` Marc Zyngier
2021-04-12 10:12     ` Marc Zyngier
2021-04-02  0:56 ` [PATCH v2 04/10] KVM: MIPS/MMU: " Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56 ` [PATCH v2 05/10] KVM: PPC: " Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56 ` [PATCH v2 06/10] KVM: Kill off the old hva-based " Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56 ` [PATCH v2 07/10] KVM: Move MMU notifier's mmu_lock acquisition into common helper Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  9:35   ` Paolo Bonzini
2021-04-02  9:35     ` Paolo Bonzini
2021-04-02  9:35     ` Paolo Bonzini
2021-04-02  9:35     ` Paolo Bonzini
2021-04-02 14:59     ` Sean Christopherson
2021-04-02 14:59       ` Sean Christopherson
2021-04-02 14:59       ` Sean Christopherson
2021-04-02 14:59       ` Sean Christopherson
2021-04-02  0:56 ` [PATCH v2 08/10] KVM: Take mmu_lock when handling MMU notifier iff the hva hits a memslot Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56 ` Sean Christopherson [this message]
2021-04-02  0:56   ` [PATCH v2 09/10] KVM: Don't take mmu_lock for range invalidation unless necessary Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  9:34   ` Paolo Bonzini
2021-04-02  9:34     ` Paolo Bonzini
2021-04-02  9:34     ` Paolo Bonzini
2021-04-02  9:34     ` Paolo Bonzini
2021-04-02 14:59     ` Sean Christopherson
2021-04-02 14:59       ` Sean Christopherson
2021-04-02 14:59       ` Sean Christopherson
2021-04-02 14:59       ` Sean Christopherson
2021-04-19  8:49   ` Wanpeng Li
2021-04-19  8:49     ` Wanpeng Li
2021-04-19  8:49     ` Wanpeng Li
2021-04-19  8:49     ` Wanpeng Li
2021-04-19 13:50     ` Paolo Bonzini
2021-04-19 15:09       ` Sean Christopherson
2021-04-19 22:09         ` Paolo Bonzini
2021-04-20  1:17           ` Sean Christopherson
2021-04-02  0:56 ` [PATCH v2 10/10] KVM: x86/mmu: Allow yielding during MMU notifier unmap/zap, if possible Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02  0:56   ` Sean Christopherson
2021-04-02 12:17 ` [PATCH v2 00/10] KVM: Consolidate and optimize MMU notifiers Paolo Bonzini
2021-04-02 12:17   ` Paolo Bonzini
2021-04-02 12:17   ` Paolo Bonzini
2021-04-02 12:17   ` Paolo Bonzini
2021-04-12 10:27   ` Marc Zyngier
2021-04-12 10:27     ` Marc Zyngier
2021-04-12 10:27     ` Marc Zyngier
2021-04-12 10:27     ` Marc Zyngier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210402005658.3024832-10-seanjc@google.com \
    --to=seanjc@google.com \
    --cc=aleksandar.qemu.devel@gmail.com \
    --cc=bgardon@google.com \
    --cc=chenhuacai@kernel.org \
    --cc=james.morse@arm.com \
    --cc=jmattson@google.com \
    --cc=joro@8bytes.org \
    --cc=julien.thierry.kdev@gmail.com \
    --cc=kvm-ppc@vger.kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=kvmarm@lists.cs.columbia.edu \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mips@vger.kernel.org \
    --cc=maz@kernel.org \
    --cc=paulus@ozlabs.org \
    --cc=pbonzini@redhat.com \
    --cc=suzuki.poulose@arm.com \
    --cc=vkuznets@redhat.com \
    --cc=wanpengli@tencent.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.