[16/18] KVM: Don't take mmu_lock for range invalidation unless necessary
diff mbox series

Message ID 20210326021957.1424875-17-seanjc@google.com
State New, archived
Headers show
Series
  • KVM: Consolidate and optimize MMU notifiers
Related show

Commit Message

Sean Christopherson March 26, 2021, 2:19 a.m. UTC
Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}()
notifications.  Because mmu_notifier_count must be modified while holding
mmu_lock for write, and must always be paired across start->end to stay
balanced, lock elision must happen in both or none.  To meet that
requirement, add a rwsem to prevent memslot updates across range_start()
and range_end().

For notifiers that disallow blocking, e.g. OOM reaping, simply go down
the slow path of unconditionally acquiring mmu_lock.  The sane
alternative would be to try to acquire the lock and force the notifier
to retry on failure.  But since OOM is currently the _only_ scenario
where blocking is disallowed attempting to optimize a guest that has been
marked for death is pointless.

Note, technically flag-only memslot updates could be allowed in parallel,
but stalling a memslot update for a relatively short amount of time is
not a scalability issue, and this is all more than complex enough.

Based heavily on code from Ben Gardon.

Suggested-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h |   8 +-
 virt/kvm/kvm_main.c      | 174 ++++++++++++++++++++++++++++++---------
 2 files changed, 142 insertions(+), 40 deletions(-)

Comments

Paolo Bonzini March 31, 2021, 7:52 a.m. UTC | #1
On 26/03/21 03:19, Sean Christopherson wrote:
> Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}()
> notifications.  Because mmu_notifier_count must be modified while holding
> mmu_lock for write, and must always be paired across start->end to stay
> balanced, lock elision must happen in both or none.  To meet that
> requirement, add a rwsem to prevent memslot updates across range_start()
> and range_end().
> 
> For notifiers that disallow blocking, e.g. OOM reaping, simply go down
> the slow path of unconditionally acquiring mmu_lock.  The sane
> alternative would be to try to acquire the lock and force the notifier
> to retry on failure.  But since OOM is currently the _only_ scenario
> where blocking is disallowed attempting to optimize a guest that has been
> marked for death is pointless.
> 
> Note, technically flag-only memslot updates could be allowed in parallel,
> but stalling a memslot update for a relatively short amount of time is
> not a scalability issue, and this is all more than complex enough.
> 
> Based heavily on code from Ben Gardon.
> 
> Suggested-by: Ben Gardon <bgardon@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Please submit this as a separate patch.

Paolo

> ---
>   include/linux/kvm_host.h |   8 +-
>   virt/kvm/kvm_main.c      | 174 ++++++++++++++++++++++++++++++---------
>   2 files changed, 142 insertions(+), 40 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 40ac2d40bb5a..2cc0f87d936e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -523,6 +523,7 @@ struct kvm {
>   	long mmu_notifier_count;
>   	unsigned long mmu_notifier_range_start;
>   	unsigned long mmu_notifier_range_end;
> +	struct rw_semaphore mmu_notifier_slots_lock;
>   #endif
>   	long tlbs_dirty;
>   	struct list_head devices;
> @@ -660,8 +661,11 @@ static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
>   {
>   	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
>   	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
> -			lockdep_is_held(&kvm->slots_lock) ||
> -			!refcount_read(&kvm->users_count));
> +				      lockdep_is_held(&kvm->slots_lock) ||
> +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> +				      lockdep_is_held(&kvm->mmu_notifier_slots_lock) ||
> +#endif
> +				      !refcount_read(&kvm->users_count));
>   }
>   
>   static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0c2aff8a4aa1..9ebc6d3e4a21 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -453,20 +453,56 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
>   
>   typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
>   
> +typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
> +			     unsigned long end);
> +
>   struct kvm_hva_range {
>   	unsigned long start;
>   	unsigned long end;
>   	pte_t pte;
>   	hva_handler_t handler;
> -	bool caller_locked;
> +	on_lock_fn_t on_lock;
> +	bool must_lock;
>   	bool flush_on_ret;
>   	bool may_block;
>   };
>   
> +/*
> + * Use a dedicated stub instead of NULL to indicate that there is no callback
> + * function/handler.  The compiler technically can't guarantee that a real
> + * function will have a non-zero address, and so it will generate code to
> + * check for !NULL, whereas comparing against a stub will be elided at compile
> + * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
> + */
> +static void kvm_null_fn(void)
> +{
> +
> +}
> +#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
> +
> +
> +/* Acquire mmu_lock if necessary.  Returns %true if @handler is "null" */
> +static __always_inline bool kvm_mmu_lock_and_check_handler(struct kvm *kvm,
> +							   const struct kvm_hva_range *range,
> +							   bool *locked)
> +{
> +	if (*locked)
> +		return false;
> +
> +	*locked = true;
> +
> +	KVM_MMU_LOCK(kvm);
> +
> +	if (!IS_KVM_NULL_FN(range->on_lock))
> +		range->on_lock(kvm, range->start, range->end);
> +
> +	return IS_KVM_NULL_FN(range->handler);
> +}
> +
>   static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>   						  const struct kvm_hva_range *range)
>   {
> -	bool ret = false, locked = range->caller_locked;
> +	bool ret = false, locked = false;
>   	struct kvm_gfn_range gfn_range;
>   	struct kvm_memory_slot *slot;
>   	struct kvm_memslots *slots;
> @@ -474,6 +510,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>   
>   	idx = srcu_read_lock(&kvm->srcu);
>   
> +	if (range->must_lock &&
> +	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
> +		goto out_unlock;
> +
>   	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>   		slots = __kvm_memslots(kvm, i);
>   		kvm_for_each_memslot(slot, slots) {
> @@ -502,10 +542,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>   			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
>   			gfn_range.slot = slot;
>   
> -			if (!locked) {
> -				locked = true;
> -				KVM_MMU_LOCK(kvm);
> -			}
> +			if (kvm_mmu_lock_and_check_handler(kvm, range, &locked))
> +				goto out_unlock;
> +
>   			ret |= range->handler(kvm, &gfn_range);
>   		}
>   	}
> @@ -513,7 +552,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>   	if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
>   		kvm_flush_remote_tlbs(kvm);
>   
> -	if (locked && !range->caller_locked)
> +out_unlock:
> +	if (locked)
>   		KVM_MMU_UNLOCK(kvm);
>   
>   	srcu_read_unlock(&kvm->srcu, idx);
> @@ -534,10 +574,12 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
>   		.end		= end,
>   		.pte		= pte,
>   		.handler	= handler,
> -		.caller_locked	= false,
> +		.on_lock	= (void *)kvm_null_fn,
> +		.must_lock	= false,
>   		.flush_on_ret	= true,
>   		.may_block	= false,
>   	};
> +
>   	return __kvm_handle_hva_range(kvm, &range);
>   }
>   
> @@ -552,7 +594,8 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
>   		.end		= end,
>   		.pte		= __pte(0),
>   		.handler	= handler,
> -		.caller_locked	= false,
> +		.on_lock	= (void *)kvm_null_fn,
> +		.must_lock	= false,
>   		.flush_on_ret	= false,
>   		.may_block	= false,
>   	};
> @@ -569,23 +612,9 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
>   	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
>   }
>   
> -static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> -					const struct mmu_notifier_range *range)
> +static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
> +				   unsigned long end)
>   {
> -	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> -	const struct kvm_hva_range hva_range = {
> -		.start		= range->start,
> -		.end		= range->end,
> -		.pte		= __pte(0),
> -		.handler	= kvm_unmap_gfn_range,
> -		.caller_locked	= true,
> -		.flush_on_ret	= true,
> -		.may_block	= mmu_notifier_range_blockable(range),
> -	};
> -
> -	trace_kvm_unmap_hva_range(range->start, range->end);
> -
> -	KVM_MMU_LOCK(kvm);
>   	/*
>   	 * The count increase must become visible at unlock time as no
>   	 * spte can be established without taking the mmu_lock and
> @@ -593,8 +622,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>   	 */
>   	kvm->mmu_notifier_count++;
>   	if (likely(kvm->mmu_notifier_count == 1)) {
> -		kvm->mmu_notifier_range_start = range->start;
> -		kvm->mmu_notifier_range_end = range->end;
> +		kvm->mmu_notifier_range_start = start;
> +		kvm->mmu_notifier_range_end = end;
>   	} else {
>   		/*
>   		 * Fully tracking multiple concurrent ranges has dimishing
> @@ -606,24 +635,54 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>   		 * complete.
>   		 */
>   		kvm->mmu_notifier_range_start =
> -			min(kvm->mmu_notifier_range_start, range->start);
> +			min(kvm->mmu_notifier_range_start, start);
>   		kvm->mmu_notifier_range_end =
> -			max(kvm->mmu_notifier_range_end, range->end);
> +			max(kvm->mmu_notifier_range_end, end);
>   	}
> -
> -	__kvm_handle_hva_range(kvm, &hva_range);
> -
> -	KVM_MMU_UNLOCK(kvm);
> -
> -	return 0;
>   }
>   
> -static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>   					const struct mmu_notifier_range *range)
>   {
> +	bool blockable = mmu_notifier_range_blockable(range);
>   	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	const struct kvm_hva_range hva_range = {
> +		.start		= range->start,
> +		.end		= range->end,
> +		.pte		= __pte(0),
> +		.handler	= kvm_unmap_gfn_range,
> +		.on_lock	= kvm_inc_notifier_count,
> +		.must_lock	= !blockable,
> +		.flush_on_ret	= true,
> +		.may_block	= blockable,
> +	};
>   
> -	KVM_MMU_LOCK(kvm);
> +	trace_kvm_unmap_hva_range(range->start, range->end);
> +
> +	/*
> +	 * Prevent memslot modification between range_start() and range_end()
> +	 * so that conditionally locking provides the same result in both
> +	 * functions.  Without that guarantee, the mmu_notifier_count
> +	 * adjustments will be imbalanced.
> +	 *
> +	 * Skip the memslot-lookup lock elision (set @must_lock above) to avoid
> +	 * having to take the semaphore on non-blockable calls, e.g. OOM kill.
> +	 * The complexity required to handle conditional locking for this case
> +	 * is not worth the marginal benefits, the VM is likely doomed anyways.
> +	 *
> +	 * Pairs with the unlock in range_end().
> +	 */
> +	if (blockable)
> +		down_read(&kvm->mmu_notifier_slots_lock);
> +
> +	__kvm_handle_hva_range(kvm, &hva_range);
> +
> +	return 0;
> +}
> +
> +static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
> +				   unsigned long end)
> +{
>   	/*
>   	 * This sequence increase will notify the kvm page fault that
>   	 * the page that is going to be mapped in the spte could have
> @@ -637,7 +696,29 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
>   	 * in conjunction with the smp_rmb in mmu_notifier_retry().
>   	 */
>   	kvm->mmu_notifier_count--;
> -	KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> +					const struct mmu_notifier_range *range)
> +{
> +	bool blockable = mmu_notifier_range_blockable(range);
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	const struct kvm_hva_range hva_range = {
> +		.start		= range->start,
> +		.end		= range->end,
> +		.pte		= __pte(0),
> +		.handler	= (void *)kvm_null_fn,
> +		.on_lock	= kvm_dec_notifier_count,
> +		.must_lock	= !blockable,
> +		.flush_on_ret	= true,
> +		.may_block	= blockable,
> +	};
> +
> +	__kvm_handle_hva_range(kvm, &hva_range);
> +
> +	/* Pairs with the lock in range_start(). */
> +	if (blockable)
> +		up_read(&kvm->mmu_notifier_slots_lock);
>   
>   	BUG_ON(kvm->mmu_notifier_count < 0);
>   }
> @@ -709,6 +790,8 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
>   
>   static int kvm_init_mmu_notifier(struct kvm *kvm)
>   {
> +	init_rwsem(&kvm->mmu_notifier_slots_lock);
> +
>   	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
>   	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
>   }
> @@ -971,6 +1054,15 @@ static void kvm_destroy_vm(struct kvm *kvm)
>   	kvm_coalesced_mmio_free(kvm);
>   #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>   	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
> +	/*
> +	 * Reset the lock used to prevent memslot updates between MMU notifier
> +	 * range_start and range_end.  At this point no more MMU notifiers will
> +	 * run, but the lock could still be held if KVM's notifier was removed
> +	 * between range_start and range_end.  No threads can be waiting on the
> +	 * lock as the last reference on KVM has been dropped.  If the lock is
> +	 * still held, freeing memslots will deadlock.
> +	 */
> +	init_rwsem(&kvm->mmu_notifier_slots_lock);
>   #else
>   	kvm_arch_flush_shadow_all(kvm);
>   #endif
> @@ -1222,7 +1314,13 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
>   	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
>   	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
>   
> +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> +	down_write(&kvm->mmu_notifier_slots_lock);
> +#endif
>   	rcu_assign_pointer(kvm->memslots[as_id], slots);
> +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> +	up_write(&kvm->mmu_notifier_slots_lock);
> +#endif
>   	synchronize_srcu_expedited(&kvm->srcu);
>   
>   	/*
>
Paolo Bonzini March 31, 2021, 8:35 a.m. UTC | #2
On 26/03/21 03:19, Sean Christopherson wrote:
> +	/*
> +	 * Reset the lock used to prevent memslot updates between MMU notifier
> +	 * range_start and range_end.  At this point no more MMU notifiers will
> +	 * run, but the lock could still be held if KVM's notifier was removed
> +	 * between range_start and range_end.  No threads can be waiting on the
> +	 * lock as the last reference on KVM has been dropped.  If the lock is
> +	 * still held, freeing memslots will deadlock.
> +	 */
> +	init_rwsem(&kvm->mmu_notifier_slots_lock);

I was going to say that this is nasty, then I noticed that 
mmu_notifier_unregister uses SRCU to ensure completion of concurrent 
calls to the MMU notifier.  So I guess it's fine, but it's better to 
point it out:

	/*
	 * At this point no more MMU notifiers will run and pending
	 * calls to range_start have completed, but the lock would
	 * still be held and never released if the MMU notifier was
	 * removed between range_start and range_end.  Since the last
	 * reference to the struct kvm has been dropped, no threads can
	 * be waiting on the lock, but we might still end up taking it
	 * when freeing memslots in kvm_arch_destroy_vm.  Reset the lock
	 * to avoid deadlocks.
	 */

That said, the easiest way to avoid this would be to always update 
mmu_notifier_count.  I don't mind the rwsem, but at least I suggest that 
you split the patch in two---the first one keeping the 
mmu_notifier_count update unconditional, and the second one introducing 
the rwsem and the on_lock function kvm_inc_notifier_count.  Please 
document the new lock in Documentation/virt/kvm/locking.rst too.

Also, related to the first part of the series, perhaps you could 
structure the series in a slightly different way:

1) introduce the HVA walking API in common code, complete with on_lock 
and patch 15, so that you can use on_lock to increase mmu_notifier_seq

2) then migrate all architectures including x86 to the new API

IOW, first half of patch 10 and all of patch 15; then the second half of 
patch 10; then patches 11-14.

> +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> +	down_write(&kvm->mmu_notifier_slots_lock);
> +#endif
>  	rcu_assign_pointer(kvm->memslots[as_id], slots);
> +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> +	up_write(&kvm->mmu_notifier_slots_lock);
> +#endif

Please do this unconditionally, the cost is minimal if the rwsem is not 
contended (as is the case if the architecture doesn't use MMU notifiers 
at all).

Paolo
Sean Christopherson March 31, 2021, 4:41 p.m. UTC | #3
On Wed, Mar 31, 2021, Paolo Bonzini wrote:
> On 26/03/21 03:19, Sean Christopherson wrote:
> > +	/*
> > +	 * Reset the lock used to prevent memslot updates between MMU notifier
> > +	 * range_start and range_end.  At this point no more MMU notifiers will
> > +	 * run, but the lock could still be held if KVM's notifier was removed
> > +	 * between range_start and range_end.  No threads can be waiting on the
> > +	 * lock as the last reference on KVM has been dropped.  If the lock is
> > +	 * still held, freeing memslots will deadlock.
> > +	 */
> > +	init_rwsem(&kvm->mmu_notifier_slots_lock);
> 
> I was going to say that this is nasty,

Heh, I still think it's nasty.

> then I noticed that
> mmu_notifier_unregister uses SRCU to ensure completion of concurrent calls
> to the MMU notifier.  So I guess it's fine, but it's better to point it out:
> 
> 	/*
> 	 * At this point no more MMU notifiers will run and pending
> 	 * calls to range_start have completed, but the lock would
> 	 * still be held and never released if the MMU notifier was
> 	 * removed between range_start and range_end.  Since the last
> 	 * reference to the struct kvm has been dropped, no threads can
> 	 * be waiting on the lock, but we might still end up taking it
> 	 * when freeing memslots in kvm_arch_destroy_vm.  Reset the lock
> 	 * to avoid deadlocks.
> 	 */
> 
> That said, the easiest way to avoid this would be to always update
> mmu_notifier_count.

Updating mmu_notifier_count requires taking mmu_lock, which would defeat the
purpose of these shenanigans.  I think it could be made atomic, since mmu_lock
would be taken before the elevated count _must_ be visible, but that would
break the mmu_notifier_range_{start,end} optimization that was recently added.

Or did I completely misunderstand what you're suggesting?

> I don't mind the rwsem, but at least I suggest that you
> split the patch in two---the first one keeping the mmu_notifier_count update
> unconditional, and the second one introducing the rwsem and the on_lock
> function kvm_inc_notifier_count.  Please document the new lock in
> Documentation/virt/kvm/locking.rst too.

Note, will update docs.

> Also, related to the first part of the series, perhaps you could structure
> the series in a slightly different way:
> 
> 1) introduce the HVA walking API in common code, complete with on_lock and
> patch 15, so that you can use on_lock to increase mmu_notifier_seq
> 
> 2) then migrate all architectures including x86 to the new API
> 
> IOW, first half of patch 10 and all of patch 15; then the second half of
> patch 10; then patches 11-14.
> 
> > +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > +	down_write(&kvm->mmu_notifier_slots_lock);
> > +#endif
> >  	rcu_assign_pointer(kvm->memslots[as_id], slots);
> > +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > +	up_write(&kvm->mmu_notifier_slots_lock);
> > +#endif
> 
> Please do this unconditionally, the cost is minimal if the rwsem is not
> contended (as is the case if the architecture doesn't use MMU notifiers at
> all).

It's not the cost, it's that mmu_notifier_slots_lock doesn't exist.  That's an
easily solved problem, but then the lock wouldn't be initialized since
kvm_init_mmu_notifier() is a nop.  That's again easy to solve, but IMO would
look rather weird.  I guess the counter argument is that __kvm_memslots()
wouldn't need #ifdeffery.

These are the to ideas I've come up with:

Option 1:
	static int kvm_init_mmu_notifier(struct kvm *kvm)
	{
		init_rwsem(&kvm->mmu_notifier_slots_lock);

	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
		return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
	#else
		return 0;
	#endif
	}


Option 2:
	kvm_mmu_notifier_lock(kvm);
	rcu_assign_pointer(kvm->memslots[as_id], slots);
	kvm_mmu_notifier_unlock(kvm);
Paolo Bonzini March 31, 2021, 4:47 p.m. UTC | #4
On 31/03/21 18:41, Sean Christopherson wrote:
>> That said, the easiest way to avoid this would be to always update
>> mmu_notifier_count.
> Updating mmu_notifier_count requires taking mmu_lock, which would defeat the
> purpose of these shenanigans.

Okay; I wasn't sure if the problem was contention with page faults in 
general, or just the long critical sections from the MMU notifier 
callbacks.  Still updating mmu_notifier_count unconditionally is a good 
way to break up the patch in two and keep one commit just for the rwsem 
nastiness.

>>> +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>>> +	down_write(&kvm->mmu_notifier_slots_lock);
>>> +#endif
>>>   	rcu_assign_pointer(kvm->memslots[as_id], slots);
>>> +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>>> +	up_write(&kvm->mmu_notifier_slots_lock);
>>> +#endif
>> Please do this unconditionally, the cost is minimal if the rwsem is not
>> contended (as is the case if the architecture doesn't use MMU notifiers at
>> all).
> It's not the cost, it's that mmu_notifier_slots_lock doesn't exist.  That's an
> easily solved problem, but then the lock wouldn't be initialized since
> kvm_init_mmu_notifier() is a nop.  That's again easy to solve, but IMO would
> look rather weird.  I guess the counter argument is that __kvm_memslots()
> wouldn't need #ifdeffery.

Yep.  Less #ifdefs usually wins. :)

> These are the to ideas I've come up with:
> 
> Option 1:
> 	static int kvm_init_mmu_notifier(struct kvm *kvm)
> 	{
> 		init_rwsem(&kvm->mmu_notifier_slots_lock);
> 
> 	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> 		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
> 		return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
> 	#else
> 		return 0;
> 	#endif
> 	}

Option 2 is also okay I guess, but the simplest is option 1 + just init 
it in kvm_create_vm.

Paolo
Sean Christopherson March 31, 2021, 7:47 p.m. UTC | #5
On Wed, Mar 31, 2021, Paolo Bonzini wrote:
> On 31/03/21 18:41, Sean Christopherson wrote:
> > > That said, the easiest way to avoid this would be to always update
> > > mmu_notifier_count.
> > Updating mmu_notifier_count requires taking mmu_lock, which would defeat the
> > purpose of these shenanigans.
> 
> Okay; I wasn't sure if the problem was contention with page faults in
> general, or just the long critical sections from the MMU notifier callbacks.
> Still updating mmu_notifier_count unconditionally is a good way to break up
> the patch in two and keep one commit just for the rwsem nastiness.

Rereading things, a small chunk of the rwsem nastiness can go away.  I don't see
any reason to use rw_semaphore instead of rwlock_t.  install_new_memslots() only
holds the lock for a handful of instructions.  Readers could get queued up
behind a writer, but since install_new_memslots() is serialized by slots_lock
(the existing mutex), there is no chance of multiple writers, i.e. the worst
case wait duration is bounded at the length of an in-flight notification.  And
that's _already_ the worst case since notifications are currently serialized by
mmu_lock.  In practice, the existing worst case is probably far worse since
there can be far more writers trying to acquire mmu_lock.

In other words, there's no strong argument for sleeping instead of busy waiting
in the notifiers.

By switching to rwlock_t, taking mmu_notifier_slots_lock doesn't have to depend
on mmu_notifier_range_blockable(), and the must_lock path also goes away.

> > > > +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > > > +	down_write(&kvm->mmu_notifier_slots_lock);
> > > > +#endif
> > > >   	rcu_assign_pointer(kvm->memslots[as_id], slots);
> > > > +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > > > +	up_write(&kvm->mmu_notifier_slots_lock);
> > > > +#endif
> > > Please do this unconditionally, the cost is minimal if the rwsem is not
> > > contended (as is the case if the architecture doesn't use MMU notifiers at
> > > all).
> > It's not the cost, it's that mmu_notifier_slots_lock doesn't exist.  That's an
> > easily solved problem, but then the lock wouldn't be initialized since
> > kvm_init_mmu_notifier() is a nop.  That's again easy to solve, but IMO would
> > look rather weird.  I guess the counter argument is that __kvm_memslots()
> > wouldn't need #ifdeffery.
> 
> Yep.  Less #ifdefs usually wins. :)
> 
> > These are the to ideas I've come up with:
> > 
> > Option 1:
> > 	static int kvm_init_mmu_notifier(struct kvm *kvm)
> > 	{
> > 		init_rwsem(&kvm->mmu_notifier_slots_lock);
> > 
> > 	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> > 		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
> > 		return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
> > 	#else
> > 		return 0;
> > 	#endif
> > 	}
> 
> Option 2 is also okay I guess, but the simplest is option 1 + just init it
> in kvm_create_vm.

Arr.  I'll play around with it to try and purge the #ifdefs.
Sean Christopherson March 31, 2021, 8:15 p.m. UTC | #6
On Wed, Mar 31, 2021, Paolo Bonzini wrote:
> On 26/03/21 03:19, Sean Christopherson wrote:
> > +	/*
> > +	 * Reset the lock used to prevent memslot updates between MMU notifier
> > +	 * range_start and range_end.  At this point no more MMU notifiers will
> > +	 * run, but the lock could still be held if KVM's notifier was removed
> > +	 * between range_start and range_end.  No threads can be waiting on the
> > +	 * lock as the last reference on KVM has been dropped.  If the lock is
> > +	 * still held, freeing memslots will deadlock.
> > +	 */
> > +	init_rwsem(&kvm->mmu_notifier_slots_lock);
> 
> I was going to say that this is nasty, then I noticed that
> mmu_notifier_unregister uses SRCU to ensure completion of concurrent calls
> to the MMU notifier.  So I guess it's fine, but it's better to point it out:
> 
> 	/*
> 	 * At this point no more MMU notifiers will run and pending
> 	 * calls to range_start have completed, but the lock would
> 	 * still be held and never released if the MMU notifier was
> 	 * removed between range_start and range_end.  Since the last
> 	 * reference to the struct kvm has been dropped, no threads can
> 	 * be waiting on the lock, but we might still end up taking it
> 	 * when freeing memslots in kvm_arch_destroy_vm.  Reset the lock
> 	 * to avoid deadlocks.
> 	 */

An alternative would be to not take the lock in install_new_memslots() if
kvm->users_count == 0.  It'd be weirder to document, and the conditional locking
would still be quite ugly.  Not sure if that's better than blasting a lock
during destruction?
Paolo Bonzini March 31, 2021, 8:30 p.m. UTC | #7
On 31/03/21 22:15, Sean Christopherson wrote:
> On Wed, Mar 31, 2021, Paolo Bonzini wrote:
>> On 26/03/21 03:19, Sean Christopherson wrote:
>>> +	/*
>>> +	 * Reset the lock used to prevent memslot updates between MMU notifier
>>> +	 * range_start and range_end.  At this point no more MMU notifiers will
>>> +	 * run, but the lock could still be held if KVM's notifier was removed
>>> +	 * between range_start and range_end.  No threads can be waiting on the
>>> +	 * lock as the last reference on KVM has been dropped.  If the lock is
>>> +	 * still held, freeing memslots will deadlock.
>>> +	 */
>>> +	init_rwsem(&kvm->mmu_notifier_slots_lock);
>>
>> I was going to say that this is nasty, then I noticed that
>> mmu_notifier_unregister uses SRCU to ensure completion of concurrent calls
>> to the MMU notifier.  So I guess it's fine, but it's better to point it out:
>>
>> 	/*
>> 	 * At this point no more MMU notifiers will run and pending
>> 	 * calls to range_start have completed, but the lock would
>> 	 * still be held and never released if the MMU notifier was
>> 	 * removed between range_start and range_end.  Since the last
>> 	 * reference to the struct kvm has been dropped, no threads can
>> 	 * be waiting on the lock, but we might still end up taking it
>> 	 * when freeing memslots in kvm_arch_destroy_vm.  Reset the lock
>> 	 * to avoid deadlocks.
>> 	 */
> 
> An alternative would be to not take the lock in install_new_memslots() if
> kvm->users_count == 0.  It'd be weirder to document, and the conditional locking
> would still be quite ugly.  Not sure if that's better than blasting a lock
> during destruction?

No, that's worse...

Paolo
Paolo Bonzini March 31, 2021, 8:42 p.m. UTC | #8
On 31/03/21 21:47, Sean Christopherson wrote:
> Rereading things, a small chunk of the rwsem nastiness can go away.  I don't see
> any reason to use rw_semaphore instead of rwlock_t.

Wouldn't it be incorrect to lock a mutex (e.g. inside *another* MMU 
notifier's invalidate callback) while holding an rwlock_t?  That makes 
sense because anybody that's busy waiting in write_lock potentially 
cannot be preempted until the other task gets the mutex.  This is a 
potential deadlock.

I also thought of busy waiting on down_read_trylock if the MMU notifier 
cannot block, but that would also be invalid for the opposite reason 
(the down_write task might be asleep, waiting for other readers to 
release the task, and the down_read_trylock busy loop might not let that 
task run).

> And that's _already_ the worst case since notifications are currently
> serialized by mmu_lock.

But right now notifications are not a single critical section, they're 
two, aren't they?

Paolo
Sean Christopherson March 31, 2021, 8:52 p.m. UTC | #9
On Wed, Mar 31, 2021, Paolo Bonzini wrote:
> On 26/03/21 03:19, Sean Christopherson wrote:
> Also, related to the first part of the series, perhaps you could structure
> the series in a slightly different way:
> 
> 1) introduce the HVA walking API in common code, complete with on_lock and
> patch 15, so that you can use on_lock to increase mmu_notifier_seq
> 
> 2) then migrate all architectures including x86 to the new API
> 
> IOW, first half of patch 10 and all of patch 15; then the second half of
> patch 10; then patches 11-14.

100% agree with introducing on_lock separately from the conditional locking.

Not so sure about introducing conditional locking and then converting non-x86
archs.  I'd prefer to keep the conditional locking after arch conversion.
If something does go awry, it would be nice to be able to preciesly bisect to
the conditional locking.  Ditto if it needs to be reverted because it breaks an
arch.
Paolo Bonzini March 31, 2021, 9 p.m. UTC | #10
On 31/03/21 22:52, Sean Christopherson wrote:
> 100% agree with introducing on_lock separately from the conditional locking.
> 
> Not so sure about introducing conditional locking and then converting non-x86
> archs.  I'd prefer to keep the conditional locking after arch conversion.
> If something does go awry, it would be nice to be able to preciesly bisect to
> the conditional locking.  Ditto if it needs to be reverted because it breaks an
> arch.

Ok, that sounds good too.

Paolo
Sean Christopherson March 31, 2021, 9:05 p.m. UTC | #11
On Wed, Mar 31, 2021, Paolo Bonzini wrote:
> On 31/03/21 21:47, Sean Christopherson wrote:
> > Rereading things, a small chunk of the rwsem nastiness can go away.  I don't see
> > any reason to use rw_semaphore instead of rwlock_t.
> 
> Wouldn't it be incorrect to lock a mutex (e.g. inside *another* MMU
> notifier's invalidate callback) while holding an rwlock_t?  That makes sense
> because anybody that's busy waiting in write_lock potentially cannot be
> preempted until the other task gets the mutex.  This is a potential
> deadlock.

Yes?  I don't think I follow your point though.  Nesting a spinlock or rwlock
inside a rwlock is ok, so long as the locks are always taken in the same order,
i.e. it's never mmu_lock -> mmu_notifier_slots_lock.

> I also thought of busy waiting on down_read_trylock if the MMU notifier
> cannot block, but that would also be invalid for the opposite reason (the
> down_write task might be asleep, waiting for other readers to release the
> task, and the down_read_trylock busy loop might not let that task run).
> 
> > And that's _already_ the worst case since notifications are currently
> > serialized by mmu_lock.
> 
> But right now notifications are not a single critical section, they're two,
> aren't they?

Ah, crud, yes.  Holding a spinlock across the entire start() ... end() would be
bad, especially when the notifier can block since that opens up the possibility
of the task sleeping/blocking/yielding while the spinlock is held.  Bummer.
Sean Christopherson March 31, 2021, 9:22 p.m. UTC | #12
On Wed, Mar 31, 2021, Sean Christopherson wrote:
> On Wed, Mar 31, 2021, Paolo Bonzini wrote:
> > On 31/03/21 21:47, Sean Christopherson wrote:
> > I also thought of busy waiting on down_read_trylock if the MMU notifier
> > cannot block, but that would also be invalid for the opposite reason (the
> > down_write task might be asleep, waiting for other readers to release the
> > task, and the down_read_trylock busy loop might not let that task run).
> > 
> > > And that's _already_ the worst case since notifications are currently
> > > serialized by mmu_lock.
> > 
> > But right now notifications are not a single critical section, they're two,
> > aren't they?
> 
> Ah, crud, yes.  Holding a spinlock across the entire start() ... end() would be
> bad, especially when the notifier can block since that opens up the possibility
> of the task sleeping/blocking/yielding while the spinlock is held.  Bummer.

On a related topic, any preference on whether to have an explicit "must_lock"
flag (what I posted), or derive the logic based on other params?

The helper I posted does:

	if (range->must_lock &&
	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
		goto out_unlock;

but it could be:

	if (!IS_KVM_NULL_FN(range->on_lock) && !range->may_block &&
	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
		goto out_unlock;

The generated code should be nearly identical on a modern compiler, so it's
purely a question of aesthetics.  I slightly prefer the explicit "must_lock" to
avoid spreading out the logic too much, but it also feels a bit superfluous.
Paolo Bonzini March 31, 2021, 9:35 p.m. UTC | #13
On 31/03/21 23:05, Sean Christopherson wrote:
>> Wouldn't it be incorrect to lock a mutex (e.g. inside*another*  MMU
>> notifier's invalidate callback) while holding an rwlock_t?  That makes sense
>> because anybody that's busy waiting in write_lock potentially cannot be
>> preempted until the other task gets the mutex.  This is a potential
>> deadlock.
>
> Yes?  I don't think I follow your point though.  Nesting a spinlock or rwlock
> inside a rwlock is ok, so long as the locks are always taken in the same order,
> i.e. it's never mmu_lock -> mmu_notifier_slots_lock.

*Another* MMU notifier could nest a mutex inside KVM's rwlock.

But... is it correct that the MMU notifier invalidate callbacks are 
always called with the mmap_sem taken (sometimes for reading, e.g. 
try_to_merge_with_ksm_page->try_to_merge_one_page->write_protect_page)? 
  We could take it temporarily in install_memslots, since the MMU 
notifier's mm is stored in kvm->mm.

In this case, a pair of kvm_mmu_notifier_lock/unlock functions would be 
the best way to abstract it.

Paolo
Paolo Bonzini March 31, 2021, 9:36 p.m. UTC | #14
On 31/03/21 23:22, Sean Christopherson wrote:
> On a related topic, any preference on whether to have an explicit "must_lock"
> flag (what I posted), or derive the logic based on other params?
> 
> The helper I posted does:
> 
> 	if (range->must_lock &&
> 	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
> 		goto out_unlock;
> 
> but it could be:
> 
> 	if (!IS_KVM_NULL_FN(range->on_lock) && !range->may_block &&
> 	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
> 		goto out_unlock;
> 
> The generated code should be nearly identical on a modern compiler, so it's
> purely a question of aesthetics.  I slightly prefer the explicit "must_lock" to
> avoid spreading out the logic too much, but it also feels a bit superfluous.

I do as well, but I hope we don't need any lock after all as in the 
email I've just sent.

Paolo
Sean Christopherson March 31, 2021, 9:47 p.m. UTC | #15
On Wed, Mar 31, 2021, Paolo Bonzini wrote:
> On 31/03/21 23:05, Sean Christopherson wrote:
> > > Wouldn't it be incorrect to lock a mutex (e.g. inside*another*  MMU
> > > notifier's invalidate callback) while holding an rwlock_t?  That makes sense
> > > because anybody that's busy waiting in write_lock potentially cannot be
> > > preempted until the other task gets the mutex.  This is a potential
> > > deadlock.
> > 
> > Yes?  I don't think I follow your point though.  Nesting a spinlock or rwlock
> > inside a rwlock is ok, so long as the locks are always taken in the same order,
> > i.e. it's never mmu_lock -> mmu_notifier_slots_lock.
> 
> *Another* MMU notifier could nest a mutex inside KVM's rwlock.
> 
> But... is it correct that the MMU notifier invalidate callbacks are always
> called with the mmap_sem taken (sometimes for reading, e.g.
> try_to_merge_with_ksm_page->try_to_merge_one_page->write_protect_page)?

No :-(

File-based invalidations through the rmaps do not take mmap_sem.  They get at
the VMAs via the address_space's interval tree, which is protected by its own
i_mmap_rwsem.

E.g. try_to_unmap() -> rmap_walk_file() -> try_to_unmap_one() 

> We could take it temporarily in install_memslots, since the MMU notifier's mm
> is stored in kvm->mm.
> 
> In this case, a pair of kvm_mmu_notifier_lock/unlock functions would be the
> best way to abstract it.
> 
> Paolo
>

Patch
diff mbox series

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 40ac2d40bb5a..2cc0f87d936e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -523,6 +523,7 @@  struct kvm {
 	long mmu_notifier_count;
 	unsigned long mmu_notifier_range_start;
 	unsigned long mmu_notifier_range_end;
+	struct rw_semaphore mmu_notifier_slots_lock;
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
@@ -660,8 +661,11 @@  static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
 	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
 	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
-			lockdep_is_held(&kvm->slots_lock) ||
-			!refcount_read(&kvm->users_count));
+				      lockdep_is_held(&kvm->slots_lock) ||
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+				      lockdep_is_held(&kvm->mmu_notifier_slots_lock) ||
+#endif
+				      !refcount_read(&kvm->users_count));
 }
 
 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0c2aff8a4aa1..9ebc6d3e4a21 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -453,20 +453,56 @@  static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 
 typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 
+typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
+			     unsigned long end);
+
 struct kvm_hva_range {
 	unsigned long start;
 	unsigned long end;
 	pte_t pte;
 	hva_handler_t handler;
-	bool caller_locked;
+	on_lock_fn_t on_lock;
+	bool must_lock;
 	bool flush_on_ret;
 	bool may_block;
 };
 
+/*
+ * Use a dedicated stub instead of NULL to indicate that there is no callback
+ * function/handler.  The compiler technically can't guarantee that a real
+ * function will have a non-zero address, and so it will generate code to
+ * check for !NULL, whereas comparing against a stub will be elided at compile
+ * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
+ */
+static void kvm_null_fn(void)
+{
+
+}
+#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
+
+
+/* Acquire mmu_lock if necessary.  Returns %true if @handler is "null" */
+static __always_inline bool kvm_mmu_lock_and_check_handler(struct kvm *kvm,
+							   const struct kvm_hva_range *range,
+							   bool *locked)
+{
+	if (*locked)
+		return false;
+
+	*locked = true;
+
+	KVM_MMU_LOCK(kvm);
+
+	if (!IS_KVM_NULL_FN(range->on_lock))
+		range->on_lock(kvm, range->start, range->end);
+
+	return IS_KVM_NULL_FN(range->handler);
+}
+
 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 						  const struct kvm_hva_range *range)
 {
-	bool ret = false, locked = range->caller_locked;
+	bool ret = false, locked = false;
 	struct kvm_gfn_range gfn_range;
 	struct kvm_memory_slot *slot;
 	struct kvm_memslots *slots;
@@ -474,6 +510,10 @@  static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 
 	idx = srcu_read_lock(&kvm->srcu);
 
+	if (range->must_lock &&
+	    kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+		goto out_unlock;
+
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
 		kvm_for_each_memslot(slot, slots) {
@@ -502,10 +542,9 @@  static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 			gfn_range.slot = slot;
 
-			if (!locked) {
-				locked = true;
-				KVM_MMU_LOCK(kvm);
-			}
+			if (kvm_mmu_lock_and_check_handler(kvm, range, &locked))
+				goto out_unlock;
+
 			ret |= range->handler(kvm, &gfn_range);
 		}
 	}
@@ -513,7 +552,8 @@  static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 	if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
 		kvm_flush_remote_tlbs(kvm);
 
-	if (locked && !range->caller_locked)
+out_unlock:
+	if (locked)
 		KVM_MMU_UNLOCK(kvm);
 
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -534,10 +574,12 @@  static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 		.end		= end,
 		.pte		= pte,
 		.handler	= handler,
-		.caller_locked	= false,
+		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= true,
 		.may_block	= false,
 	};
+
 	return __kvm_handle_hva_range(kvm, &range);
 }
 
@@ -552,7 +594,8 @@  static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 		.end		= end,
 		.pte		= __pte(0),
 		.handler	= handler,
-		.caller_locked	= false,
+		.on_lock	= (void *)kvm_null_fn,
+		.must_lock	= false,
 		.flush_on_ret	= false,
 		.may_block	= false,
 	};
@@ -569,23 +612,9 @@  static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
 
-static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-					const struct mmu_notifier_range *range)
+static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+				   unsigned long end)
 {
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	const struct kvm_hva_range hva_range = {
-		.start		= range->start,
-		.end		= range->end,
-		.pte		= __pte(0),
-		.handler	= kvm_unmap_gfn_range,
-		.caller_locked	= true,
-		.flush_on_ret	= true,
-		.may_block	= mmu_notifier_range_blockable(range),
-	};
-
-	trace_kvm_unmap_hva_range(range->start, range->end);
-
-	KVM_MMU_LOCK(kvm);
 	/*
 	 * The count increase must become visible at unlock time as no
 	 * spte can be established without taking the mmu_lock and
@@ -593,8 +622,8 @@  static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	kvm->mmu_notifier_count++;
 	if (likely(kvm->mmu_notifier_count == 1)) {
-		kvm->mmu_notifier_range_start = range->start;
-		kvm->mmu_notifier_range_end = range->end;
+		kvm->mmu_notifier_range_start = start;
+		kvm->mmu_notifier_range_end = end;
 	} else {
 		/*
 		 * Fully tracking multiple concurrent ranges has dimishing
@@ -606,24 +635,54 @@  static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		 * complete.
 		 */
 		kvm->mmu_notifier_range_start =
-			min(kvm->mmu_notifier_range_start, range->start);
+			min(kvm->mmu_notifier_range_start, start);
 		kvm->mmu_notifier_range_end =
-			max(kvm->mmu_notifier_range_end, range->end);
+			max(kvm->mmu_notifier_range_end, end);
 	}
-
-	__kvm_handle_hva_range(kvm, &hva_range);
-
-	KVM_MMU_UNLOCK(kvm);
-
-	return 0;
 }
 
-static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
+	bool blockable = mmu_notifier_range_blockable(range);
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	const struct kvm_hva_range hva_range = {
+		.start		= range->start,
+		.end		= range->end,
+		.pte		= __pte(0),
+		.handler	= kvm_unmap_gfn_range,
+		.on_lock	= kvm_inc_notifier_count,
+		.must_lock	= !blockable,
+		.flush_on_ret	= true,
+		.may_block	= blockable,
+	};
 
-	KVM_MMU_LOCK(kvm);
+	trace_kvm_unmap_hva_range(range->start, range->end);
+
+	/*
+	 * Prevent memslot modification between range_start() and range_end()
+	 * so that conditionally locking provides the same result in both
+	 * functions.  Without that guarantee, the mmu_notifier_count
+	 * adjustments will be imbalanced.
+	 *
+	 * Skip the memslot-lookup lock elision (set @must_lock above) to avoid
+	 * having to take the semaphore on non-blockable calls, e.g. OOM kill.
+	 * The complexity required to handle conditional locking for this case
+	 * is not worth the marginal benefits, the VM is likely doomed anyways.
+	 *
+	 * Pairs with the unlock in range_end().
+	 */
+	if (blockable)
+		down_read(&kvm->mmu_notifier_slots_lock);
+
+	__kvm_handle_hva_range(kvm, &hva_range);
+
+	return 0;
+}
+
+static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+				   unsigned long end)
+{
 	/*
 	 * This sequence increase will notify the kvm page fault that
 	 * the page that is going to be mapped in the spte could have
@@ -637,7 +696,29 @@  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	 * in conjunction with the smp_rmb in mmu_notifier_retry().
 	 */
 	kvm->mmu_notifier_count--;
-	KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+					const struct mmu_notifier_range *range)
+{
+	bool blockable = mmu_notifier_range_blockable(range);
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	const struct kvm_hva_range hva_range = {
+		.start		= range->start,
+		.end		= range->end,
+		.pte		= __pte(0),
+		.handler	= (void *)kvm_null_fn,
+		.on_lock	= kvm_dec_notifier_count,
+		.must_lock	= !blockable,
+		.flush_on_ret	= true,
+		.may_block	= blockable,
+	};
+
+	__kvm_handle_hva_range(kvm, &hva_range);
+
+	/* Pairs with the lock in range_start(). */
+	if (blockable)
+		up_read(&kvm->mmu_notifier_slots_lock);
 
 	BUG_ON(kvm->mmu_notifier_count < 0);
 }
@@ -709,6 +790,8 @@  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
 {
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
+
 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }
@@ -971,6 +1054,15 @@  static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	/*
+	 * Reset the lock used to prevent memslot updates between MMU notifier
+	 * range_start and range_end.  At this point no more MMU notifiers will
+	 * run, but the lock could still be held if KVM's notifier was removed
+	 * between range_start and range_end.  No threads can be waiting on the
+	 * lock as the last reference on KVM has been dropped.  If the lock is
+	 * still held, freeing memslots will deadlock.
+	 */
+	init_rwsem(&kvm->mmu_notifier_slots_lock);
 #else
 	kvm_arch_flush_shadow_all(kvm);
 #endif
@@ -1222,7 +1314,13 @@  static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	down_write(&kvm->mmu_notifier_slots_lock);
+#endif
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	up_write(&kvm->mmu_notifier_slots_lock);
+#endif
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*