* [PATCH] kvm: x86: mmu: Always flush TLBs when enabling dirty logging
@ 2022-07-23  2:41 Junaid Shahid
  2022-07-25 17:27 ` David Matlack
  0 siblings, 1 reply; 4+ messages in thread
From: Junaid Shahid @ 2022-07-23  2:41 UTC (permalink / raw)
  To: kvm, pbonzini; +Cc: seanjc, dmatlack

When A/D bits are not available, KVM uses a software access tracking
mechanism, which involves making the SPTEs inaccessible. However,
the clear_young() MMU notifier does not flush TLBs. So it is possible
that there may still be stale, potentially writable, TLB entries.
This is usually fine, but can be problematic when enabling dirty
logging, because it currently only does a TLB flush if any SPTEs were
modified. But if all SPTEs are in access-tracked state, then there
won't be a TLB flush, which means that the guest could still possibly
write to memory and not have it reflected in the dirty bitmap.

So just unconditionally flush the TLBs when enabling dirty logging.
We could also do something more sophisticated if needed, but given
that a flush almost always happens anyway, just making it
unconditional doesn't seem too bad.
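
For illustration, the problematic interleaving looks roughly like this (a
comment-style sketch; the step numbering is mine, not from the code):

	/*
	 * 1. clear_young() MMU notifier: the SPTE is put into the
	 *    access-tracked state, clearing its W (and R/X) bits, with no
	 *    TLB flush, so a vCPU can still hold a stale, writable TLB
	 *    entry for the GFN.
	 * 2. Dirty logging is enabled on the memslot. The write-protection
	 *    pass finds no writable SPTEs to change, so the "flush" flag
	 *    stays false and no TLB flush is issued.
	 * 3. The guest keeps writing through the stale TLB entry and the
	 *    writes never show up in the dirty bitmap.
	 */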

Signed-off-by: Junaid Shahid <junaids@google.com>
---
 arch/x86/kvm/mmu/mmu.c  | 28 ++++++++++------------------
 arch/x86/kvm/mmu/spte.h |  9 +++++++--
 arch/x86/kvm/x86.c      |  7 +++++++
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 52664c3caaab..f0d7193db455 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6058,27 +6058,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 				      const struct kvm_memory_slot *memslot,
 				      int start_level)
 {
-	bool flush = false;
-
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
-		flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
-					  start_level, KVM_MAX_HUGEPAGE_LEVEL,
-					  false);
+		slot_handle_level(kvm, memslot, slot_rmap_write_protect,
+				  start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
 		write_unlock(&kvm->mmu_lock);
 	}
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		read_lock(&kvm->mmu_lock);
-		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+		kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
 		read_unlock(&kvm->mmu_lock);
 	}
 
 	/*
-	 * Flush TLBs if any SPTEs had to be write-protected to ensure that
-	 * guest writes are reflected in the dirty bitmap before the memslot
-	 * update completes, i.e. before enabling dirty logging is visible to
-	 * userspace.
+	 * The caller will flush TLBs to ensure that guest writes are reflected
+	 * in the dirty bitmap before the memslot update completes, i.e. before
+	 * enabling dirty logging is visible to userspace.
 	 *
 	 * Perform the TLB flush outside the mmu_lock to reduce the amount of
 	 * time the lock is held. However, this does mean that another CPU can
@@ -6097,8 +6093,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 	 *
 	 * See is_writable_pte() for more details.
 	 */
-	if (flush)
-		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
 static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
@@ -6468,32 +6462,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot)
 {
-	bool flush = false;
-
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
 		/*
 		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
 		 * support dirty logging at a 4k granularity.
 		 */
-		flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
+		slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
 		write_unlock(&kvm->mmu_lock);
 	}
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		read_lock(&kvm->mmu_lock);
-		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+		kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
 		read_unlock(&kvm->mmu_lock);
 	}
 
 	/*
+	 * The caller will flush the TLBs after this function returns.
+	 *
 	 * It's also safe to flush TLBs out of mmu lock here as currently this
 	 * function is only used for dirty logging, in which case flushing TLB
 	 * out of mmu lock also guarantees no dirty pages will be lost in
 	 * dirty_bitmap.
 	 */
-	if (flush)
-		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index ba3dccb202bc..ec3e79ac4449 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -330,7 +330,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
 }
 
 /*
- * An shadow-present leaf SPTE may be non-writable for 3 possible reasons:
+ * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
  *
  *  1. To intercept writes for dirty logging. KVM write-protects huge pages
  *     so that they can be split be split down into the dirty logging
@@ -348,6 +348,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
  *     read-only memslot or guest memory backed by a read-only VMA. Writes to
  *     such pages are disallowed entirely.
  *
+ *  4. To track the Accessed status for SPTEs without A/D bits.
+ *
  * To keep track of why a given SPTE is write-protected, KVM uses 2
  * software-only bits in the SPTE:
  *
@@ -358,6 +360,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
  *  shadow_host_writable_mask, aka Host-writable -
  *    Cleared on SPTEs that are not host-writable (case 3 above)
  *
+ * In addition, is_acc_track_spte() is true in the case 4 above.
+ *
  * Note, not all possible combinations of PT_WRITABLE_MASK,
  * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
  * SPTE can be in only one of the following states, which map to the
@@ -378,7 +382,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
  * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
  * (which does not clear the MMU-writable bit), does not flush TLBs before
  * dropping the lock, as it only needs to synchronize guest writes with the
- * dirty bitmap.
+ * dirty bitmap. Similarly, the clear_young() MMU notifier also does not flush
+ * TLBs even though the SPTE can become non-writable because of case 4.
  *
  * So, there is the problem: clearing the MMU-writable bit can encounter a
  * write-protected SPTE while CPUs still have writable mappings for that SPTE
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f389691d8c04..8e33e35e4da4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12448,6 +12448,13 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		} else {
 			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
 		}
+
+		/*
+		 * Always flush the TLB even if no PTEs were modified above,
+		 * because it is possible that there may still be stale writable
+		 * TLB entries for non-AD PTEs from a prior clear_young().
+		 */
+		kvm_arch_flush_remote_tlbs_memslot(kvm, new);
 	}
 }
 

base-commit: a4850b5590d01bf3fb19fda3fc5d433f7382a974
-- 
2.37.1.359.gd136c6c3e2-goog


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] kvm: x86: mmu: Always flush TLBs when enabling dirty logging
  2022-07-23  2:41 [PATCH] kvm: x86: mmu: Always flush TLBs when enabling dirty logging Junaid Shahid
@ 2022-07-25 17:27 ` David Matlack
  2022-07-25 22:32   ` Sean Christopherson
  0 siblings, 1 reply; 4+ messages in thread
From: David Matlack @ 2022-07-25 17:27 UTC (permalink / raw)
  To: Junaid Shahid; +Cc: kvm, pbonzini, seanjc

On Fri, Jul 22, 2022 at 07:41:16PM -0700, Junaid Shahid wrote:
> When A/D bits are not available, KVM uses a software access tracking
> mechanism, which involves making the SPTEs inaccessible. However,
> the clear_young() MMU notifier does not flush TLBs. So it is possible
> that there may still be stale, potentially writable, TLB entries.
> This is usually fine, but can be problematic when enabling dirty
> logging, because it currently only does a TLB flush if any SPTEs were
> modified. But if all SPTEs are in access-tracked state, then there
> won't be a TLB flush, which means that the guest could still possibly
> write to memory and not have it reflected in the dirty bitmap.
> 
> So just unconditionally flush the TLBs when enabling dirty logging.
> We could also do something more sophisticated if needed, but given
> that a flush almost always happens anyway, just making it
> unconditional doesn't seem too bad.
> 
> Signed-off-by: Junaid Shahid <junaids@google.com>

Wow, nice catch. I have some suggestions to word-smith the comments, but
otherwise:

Reviewed-by: David Matlack <dmatlack@google.com>

> ---
>  arch/x86/kvm/mmu/mmu.c  | 28 ++++++++++------------------
>  arch/x86/kvm/mmu/spte.h |  9 +++++++--
>  arch/x86/kvm/x86.c      |  7 +++++++
>  3 files changed, 24 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 52664c3caaab..f0d7193db455 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6058,27 +6058,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>  				      const struct kvm_memory_slot *memslot,
>  				      int start_level)
>  {
> -	bool flush = false;
> -
>  	if (kvm_memslots_have_rmaps(kvm)) {
>  		write_lock(&kvm->mmu_lock);
> -		flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
> -					  start_level, KVM_MAX_HUGEPAGE_LEVEL,
> -					  false);
> +		slot_handle_level(kvm, memslot, slot_rmap_write_protect,
> +				  start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
>  		write_unlock(&kvm->mmu_lock);
>  	}
>  
>  	if (is_tdp_mmu_enabled(kvm)) {
>  		read_lock(&kvm->mmu_lock);
> -		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
> +		kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
>  		read_unlock(&kvm->mmu_lock);
>  	}
>  
>  	/*
> -	 * Flush TLBs if any SPTEs had to be write-protected to ensure that
> -	 * guest writes are reflected in the dirty bitmap before the memslot
> -	 * update completes, i.e. before enabling dirty logging is visible to
> -	 * userspace.
> +	 * The caller will flush TLBs to ensure that guest writes are reflected
> +	 * in the dirty bitmap before the memslot update completes, i.e. before
> +	 * enabling dirty logging is visible to userspace.
>  	 *
>  	 * Perform the TLB flush outside the mmu_lock to reduce the amount of
>  	 * time the lock is held. However, this does mean that another CPU can
> @@ -6097,8 +6093,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>  	 *
>  	 * See is_writable_pte() for more details.
>  	 */
> -	if (flush)
> -		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
>  }
>  
>  static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
> @@ -6468,32 +6462,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
>  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
>  				   const struct kvm_memory_slot *memslot)
>  {
> -	bool flush = false;
> -
>  	if (kvm_memslots_have_rmaps(kvm)) {
>  		write_lock(&kvm->mmu_lock);
>  		/*
>  		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
>  		 * support dirty logging at a 4k granularity.
>  		 */
> -		flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
> +		slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
>  		write_unlock(&kvm->mmu_lock);
>  	}
>  
>  	if (is_tdp_mmu_enabled(kvm)) {
>  		read_lock(&kvm->mmu_lock);
> -		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
> +		kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
>  		read_unlock(&kvm->mmu_lock);
>  	}
>  
>  	/*
> +	 * The caller will flush the TLBs after this function returns.
> +	 *
>  	 * It's also safe to flush TLBs out of mmu lock here as currently this
>  	 * function is only used for dirty logging, in which case flushing TLB
>  	 * out of mmu lock also guarantees no dirty pages will be lost in
>  	 * dirty_bitmap.
>  	 */
> -	if (flush)
> -		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
>  }
>  
>  void kvm_mmu_zap_all(struct kvm *kvm)
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index ba3dccb202bc..ec3e79ac4449 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -330,7 +330,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
>  }
>  
>  /*
> - * An shadow-present leaf SPTE may be non-writable for 3 possible reasons:
> + * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
>   *
>   *  1. To intercept writes for dirty logging. KVM write-protects huge pages
>   *     so that they can be split be split down into the dirty logging
> @@ -348,6 +348,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
>   *     read-only memslot or guest memory backed by a read-only VMA. Writes to
>   *     such pages are disallowed entirely.
>   *
> + *  4. To track the Accessed status for SPTEs without A/D bits.
> + *
>   * To keep track of why a given SPTE is write-protected, KVM uses 2
>   * software-only bits in the SPTE:

Drop the "2"  now that we're also covering access-tracking here.

>   *
> @@ -358,6 +360,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
>   *  shadow_host_writable_mask, aka Host-writable -
>   *    Cleared on SPTEs that are not host-writable (case 3 above)
>   *
> + * In addition, is_acc_track_spte() is true in the case 4 above.

The section describes the PTE bits so I think it'd be useful to
explicitly call out SHADOW_ACC_TRACK_SAVED_MASK here, e.g.

  SHADOW_ACC_TRACK_SAVED_MASK, aka access-tracked -
    Contains the R/X bits from the SPTE when it is being access-tracked,
    otherwise 0. Note that the W-bit is also cleared when an SPTE is being
    access-tracked, but it is not saved in the saved-mask. The W-bit is
    restored based on the Host/MMU-writable bits when a write is attempted.
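
(For context, a paraphrased sketch of the mechanics behind the above, based
on mark_spte_for_access_track(), restore_acc_track_spte() and the fast page
fault path; exact names and details may differ slightly between kernel
versions:)

	/*
	 * Entering the access-tracked state: stash R/X in the saved-bits
	 * field, then clear R/W/X so the next access faults.
	 */
	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
	spte &= ~shadow_acc_track_mask;

	/*
	 * Leaving it: restore R/X from the saved bits. W is deliberately
	 * not saved; on a write fault, the lockless fast-path restores it
	 * based on the Host/MMU-writable bits and publishes the new SPTE
	 * with a cmpxchg.
	 */
	saved = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) &
		SHADOW_ACC_TRACK_SAVED_BITS_MASK;
	spte &= ~SHADOW_ACC_TRACK_SAVED_MASK;
	spte |= saved;
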

> + *
>   * Note, not all possible combinations of PT_WRITABLE_MASK,
>   * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
>   * SPTE can be in only one of the following states, which map to the
> @@ -378,7 +382,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
>   * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
>   * (which does not clear the MMU-writable bit), does not flush TLBs before
>   * dropping the lock, as it only needs to synchronize guest writes with the
> - * dirty bitmap.
> + * dirty bitmap. Similarly, the clear_young() MMU notifier also does not flush
> + * TLBs even though the SPTE can become non-writable because of case 4.

The reader here may not be familiar with clear_young() and the rest of
this comment does not explain what clear_young() has to do with
access-tracking. So I would suggest tweaking the wording here to make
things a bit more clear:

  Write-protecting an SPTE for access-tracking via the clear_young()
  MMU notifier also does not flush the TLB under the MMU lock.

>   *
>   * So, there is the problem: clearing the MMU-writable bit can encounter a
>   * write-protected SPTE while CPUs still have writable mappings for that SPTE
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f389691d8c04..8e33e35e4da4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12448,6 +12448,13 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
>  		} else {
>  			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
>  		}
> +
> +		/*
> +		 * Always flush the TLB even if no PTEs were modified above,
> +		 * because it is possible that there may still be stale writable
> +		 * TLB entries for non-AD PTEs from a prior clear_young().

s/non-AD/access-tracked/ and s/PTE/SPTE/

> +		 */
> +		kvm_arch_flush_remote_tlbs_memslot(kvm, new);
>  	}
>  }
>  
> 
> base-commit: a4850b5590d01bf3fb19fda3fc5d433f7382a974
> -- 
> 2.37.1.359.gd136c6c3e2-goog
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] kvm: x86: mmu: Always flush TLBs when enabling dirty logging
  2022-07-25 17:27 ` David Matlack
@ 2022-07-25 22:32   ` Sean Christopherson
  2022-07-26 17:23     ` Junaid Shahid
  0 siblings, 1 reply; 4+ messages in thread
From: Sean Christopherson @ 2022-07-25 22:32 UTC (permalink / raw)
  To: David Matlack; +Cc: Junaid Shahid, kvm, pbonzini

On Mon, Jul 25, 2022, David Matlack wrote:
> On Fri, Jul 22, 2022 at 07:41:16PM -0700, Junaid Shahid wrote:
> > When A/D bits are not available, KVM uses a software access tracking
> > mechanism, which involves making the SPTEs inaccessible. However,
> > the clear_young() MMU notifier does not flush TLBs. So it is possible
> > that there may still be stale, potentially writable, TLB entries.
> > This is usually fine, but can be problematic when enabling dirty
> > logging, because it currently only does a TLB flush if any SPTEs were
> > modified. But if all SPTEs are in access-tracked state, then there
> > won't be a TLB flush, which means that the guest could still possibly
> > write to memory and not have it reflected in the dirty bitmap.
> > 
> > So just unconditionally flush the TLBs when enabling dirty logging.
> > We could also do something more sophisticated if needed, but given
> > that a flush almost always happens anyway, just making it
> > unconditional doesn't seem too bad.
> > 
> > Signed-off-by: Junaid Shahid <junaids@google.com>

...

> > @@ -6097,8 +6093,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> >  	 *
> >  	 * See is_writable_pte() for more details.
> >  	 */
> > -	if (flush)
> > -		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
> >  }
> >  
> >  static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
> > @@ -6468,32 +6462,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
> >  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
> >  				   const struct kvm_memory_slot *memslot)
> >  {
> > -	bool flush = false;
> > -
> >  	if (kvm_memslots_have_rmaps(kvm)) {
> >  		write_lock(&kvm->mmu_lock);
> >  		/*
> >  		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
> >  		 * support dirty logging at a 4k granularity.
> >  		 */
> > -		flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
> > +		slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
> >  		write_unlock(&kvm->mmu_lock);
> >  	}
> >  
> >  	if (is_tdp_mmu_enabled(kvm)) {
> >  		read_lock(&kvm->mmu_lock);
> > -		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
> > +		kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
> >  		read_unlock(&kvm->mmu_lock);
> >  	}
> >  
> >  	/*
> > +	 * The caller will flush the TLBs after this function returns.
> > +	 *
> >  	 * It's also safe to flush TLBs out of mmu lock here as currently this
> >  	 * function is only used for dirty logging, in which case flushing TLB
> >  	 * out of mmu lock also guarantees no dirty pages will be lost in
> >  	 * dirty_bitmap.
> >  	 */
> > -	if (flush)
> > -		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
> >  }
> >  
> >  void kvm_mmu_zap_all(struct kvm *kvm)
> > diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> > index ba3dccb202bc..ec3e79ac4449 100644
> > --- a/arch/x86/kvm/mmu/spte.h
> > +++ b/arch/x86/kvm/mmu/spte.h
> > @@ -330,7 +330,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
> >  }
> >  
> >  /*
> > - * An shadow-present leaf SPTE may be non-writable for 3 possible reasons:
> > + * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
> >   *
> >   *  1. To intercept writes for dirty logging. KVM write-protects huge pages
> >   *     so that they can be split be split down into the dirty logging
> > @@ -348,6 +348,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
> >   *     read-only memslot or guest memory backed by a read-only VMA. Writes to
> >   *     such pages are disallowed entirely.
> >   *
> > + *  4. To track the Accessed status for SPTEs without A/D bits.
> > + *
> >   * To keep track of why a given SPTE is write-protected, KVM uses 2
> >   * software-only bits in the SPTE:
> 
> Drop the "2"  now that we're also covering access-tracking here.

Hmm, I would reword the whole comment.  If a SPTE is write-protected for dirty
logging, and then access-protected for access-tracking, the SPTE is "write-protected"
for two separate reasons.

 *  4. To emulate the Accessed bit for SPTEs without A/D bits.  Note, in this
 *     case, the SPTE is access-protected, not just write-protected!
 *
 * For cases #1 and #4, KVM can safely make such SPTEs writable without taking
 * mmu_lock as capturing the Accessed/Dirty state doesn't require taking mmu_lock.
 * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits
 * in the SPTE:

> > @@ -358,6 +360,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
> >   *  shadow_host_writable_mask, aka Host-writable -
> >   *    Cleared on SPTEs that are not host-writable (case 3 above)
> >   *
> > + * In addition, is_acc_track_spte() is true in the case 4 above.
> 
> The section describes the PTE bits so I think it'd be useful to
> explicitly call out SHADOW_ACC_TRACK_SAVED_MASK here, e.g.
> 
>   SHADOW_ACC_TRACK_SAVED_MASK, aka access-tracked -
>     Contains the R/X bits from the SPTE when it is being access-tracked,
>     otherwise 0. Note that the W-bit is also cleared when an SPTE is being
>     access-tracked, but it is not saved in the saved-mask. The W-bit is
>     restored based on the Host/MMU-writable bits when a write is attempted.
> 
> > + *
> >   * Note, not all possible combinations of PT_WRITABLE_MASK,
> >   * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
> >   * SPTE can be in only one of the following states, which map to the
> > @@ -378,7 +382,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
> >   * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
> >   * (which does not clear the MMU-writable bit), does not flush TLBs before
> >   * dropping the lock, as it only needs to synchronize guest writes with the
> > - * dirty bitmap.
> > + * dirty bitmap. Similarly, the clear_young() MMU notifier also does not flush
> > + * TLBs even though the SPTE can become non-writable because of case 4.
> 
> The reader here may not be familiar with clear_young() and the rest of
> this comment does not explain what clear_young() has to do with
> access-tracking. So I would suggest tweaking the wording here to make
> things a bit more clear:
> 
>   Write-protecting an SPTE for access-tracking via the clear_young()
>   MMU notifier also does not flush the TLB under the MMU lock.

As above, the "write-protecting" part is going to confuse readers though.  I like
Junaid's wording of "can become non-writable".

> >   * So, there is the problem: clearing the MMU-writable bit can encounter a
> >   * write-protected SPTE while CPUs still have writable mappings for that SPTE
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index f389691d8c04..8e33e35e4da4 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -12448,6 +12448,13 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
> >  		} else {
> >  			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
> >  		}
> > +
> > +		/*
> > +		 * Always flush the TLB even if no PTEs were modified above,
> > +		 * because it is possible that there may still be stale writable
> > +		 * TLB entries for non-AD PTEs from a prior clear_young().
> 
> s/non-AD/access-tracked/ and s/PTE/SPTE/

If we go the "always flush" route, I would word the comment to explicitly call out
that the alternative would be to check if the SPTE is MMU-writable.

But my preference would actually be to keep the conditional flushing.  Not because
I think it will provide better performance (probably the opposite if anything),
but because it documents the dependencies/rules in code, and because "always flush"
reads like it's working around a KVM bug.  It's not a super strong preference though.

Partially, I think it'd be this?

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8e477333a263..23cb23e0913c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1169,8 +1169,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
        u64 spte = *sptep;

-       if (!is_writable_pte(spte) &&
-           !(pt_protect && is_mmu_writable_spte(spte)))
+       /* <comment about MMU-writable and access-tracking?> */
+       if (!is_mmu_writable_spte(spte))
                return false;

        rmap_printk("spte %p %llx\n", sptep, *sptep);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 40ccb5fba870..e217a8481686 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1350,15 +1350,14 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)

 /*
  * Remove write access from all SPTEs at or above min_level that map GFNs
- * [start, end). Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
+ * [start, end). Returns true if TLBs need to be flushed.
  */
 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                             gfn_t start, gfn_t end, int min_level)
 {
        struct tdp_iter iter;
        u64 new_spte;
-       bool spte_set = false;
+       bool flush = false;

        rcu_read_lock();

@@ -1371,39 +1370,43 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level) ||
-                   !(iter.old_spte & PT_WRITABLE_MASK))
+                   !is_mmu_writable_spte(iter.old_spte))
+                       continue;
+
+               /* <comment about access-tracking> */
+               flush = true;
+
+               if (!(iter.old_spte & PT_WRITABLE_MASK))
                        continue;

                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

                if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
                        goto retry;
-
-               spte_set = true;
        }

        rcu_read_unlock();
-       return spte_set;
+       return flush;
 }

 /*
  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
  * only affect leaf SPTEs down to min_level.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ * Returns true if the TLBs need to be flushed.
  */
 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
                             const struct kvm_memory_slot *slot, int min_level)
 {
        struct kvm_mmu_page *root;
-       bool spte_set = false;
+       bool flush = false;

        lockdep_assert_held_read(&kvm->mmu_lock);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
-               spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
+               flush |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);

-       return spte_set;
+       return flush;
 }

 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] kvm: x86: mmu: Always flush TLBs when enabling dirty logging
  2022-07-25 22:32   ` Sean Christopherson
@ 2022-07-26 17:23     ` Junaid Shahid
  0 siblings, 0 replies; 4+ messages in thread
From: Junaid Shahid @ 2022-07-26 17:23 UTC (permalink / raw)
  To: Sean Christopherson, David Matlack; +Cc: kvm, pbonzini

On 7/25/22 15:32, Sean Christopherson wrote:
> ...
> 
> If we go the "always flush" route, I would word the comment to explicitly call out
> that the alternative would be to check if the SPTE is MMU-writable.
> 
> But my preference would actually be to keep the conditional flushing.  Not because
> I think it will provide better performance (probably the opposite if anything),
> but because it documents the dependencies/rules in code, and because "always flush"
> reads like it's working around a KVM bug.  It's not a super strong preference though.
> 
> Partially, I think it'd be this?
> 

This would work, but I am slightly leaning away from it because it could 
increase CPU overhead in some cases. If you don't have a strong preference for 
it, then I think we could just do an unconditional flush with a more detailed 
comment explaining the interaction with clear_young() as well as the alternative 
of checking the MMU-writable bit.
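
(For illustration, such a comment in kvm_mmu_slot_apply_flags() might read
roughly as follows; this is a sketch, not necessarily the wording that
eventually landed upstream:)

	/*
	 * Unconditionally flush the TLBs even if no SPTEs were modified
	 * above: SPTEs that were put into the access-tracked state by the
	 * clear_young() MMU notifier became non-writable without a TLB
	 * flush, so vCPUs may still have stale, writable TLB entries for
	 * them.  The alternative would be to flush whenever an
	 * MMU-writable SPTE is encountered while write-protecting the slot.
	 */
	kvm_arch_flush_remote_tlbs_memslot(kvm, new);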

Thanks,
Junaid

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread

Thread overview: 4+ messages
2022-07-23  2:41 [PATCH] kvm: x86: mmu: Always flush TLBs when enabling dirty logging Junaid Shahid
2022-07-25 17:27 ` David Matlack
2022-07-25 22:32   ` Sean Christopherson
2022-07-26 17:23     ` Junaid Shahid
