* [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
@ 2022-03-18 16:48 Paolo Bonzini
  2022-03-21  9:13 ` Paolo Bonzini
  0 siblings, 1 reply; 8+ messages in thread
From: Paolo Bonzini @ 2022-03-18 16:48 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: Vitaly Kuznetsov

This reverts commit cf3e26427c08ad9015956293ab389004ac6a338e.

Multi-vCPU Hyper-V guests started crashing randomly on boot with the
latest kvm/queue, and the problem can be bisected to this particular
patch. Basically, I'm no longer able to boot e.g. a 16-vCPU guest
successfully. Both Intel and AMD hosts seem to be affected. Reverting
the commit saves the day.

Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c     |  4 ++--
 arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++++++++++++++++++++++----------
 arch/x86/kvm/mmu/tdp_mmu.h |  8 +++++++-
 3 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3b8da8b0745e..51671cb34fb6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5842,8 +5842,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 	if (is_tdp_mmu_enabled(kvm)) {
 		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-			flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
-						      gfn_end, true, flush);
+			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+							  gfn_end, flush);
 	}
 
 	if (flush)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index af60922906ef..87d8910c9ac2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -906,8 +906,10 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 
 /*
- * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
- * have been cleared and a TLB flush is needed before releasing the MMU lock.
+ * Tears down the mappings for the range of gfns, [start, end), and frees the
+ * non-root pages mapping GFNs strictly within that range. Returns true if
+ * SPTEs have been cleared and a TLB flush is needed before releasing the
+ * MMU lock.
  *
  * If can_yield is true, will release the MMU lock and reschedule if the
  * scheduler needs the CPU or there is contention on the MMU lock. If this
@@ -915,25 +917,42 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
  * the caller must ensure it does not supply too large a GFN range, or the
  * operation can cause a soft lockup.
  */
-static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
-			      gfn_t start, gfn_t end, bool can_yield, bool flush)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+			  gfn_t start, gfn_t end, bool can_yield, bool flush)
 {
+	bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host());
 	struct tdp_iter iter;
 
+	/*
+	 * No need to try to step down in the iterator when zapping all SPTEs,
+	 * zapping the top-level non-leaf SPTEs will recurse on their children.
+	 */
+	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
+
 	end = min(end, tdp_mmu_max_gfn_host());
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
 	rcu_read_lock();
 
-	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
+	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
 		if (can_yield &&
 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 			flush = false;
 			continue;
 		}
 
-		if (!is_shadow_present_pte(iter.old_spte) ||
+		if (!is_shadow_present_pte(iter.old_spte))
+			continue;
+
+		/*
+		 * If this is a non-last-level SPTE that covers a larger range
+		 * than should be zapped, continue, and zap the mappings at a
+		 * lower level, except when zapping all SPTEs.
+		 */
+		if (!zap_all &&
+		    (iter.gfn < start ||
+		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
 		    !is_last_spte(iter.old_spte, iter.level))
 			continue;
 
@@ -956,13 +975,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
  */
-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
-			   bool can_yield, bool flush)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+				 gfn_t end, bool can_yield, bool flush)
 {
 	struct kvm_mmu_page *root;
 
 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
-		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
+		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
 
 	return flush;
 }
@@ -1210,8 +1229,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 				 bool flush)
 {
-	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
-				     range->end, range->may_block, flush);
+	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
+					   range->end, range->may_block, flush);
 }
 
 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 54bc8118c40a..5e5ef2576c81 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -15,8 +15,14 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
 			  bool shared);
 
-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
 				 gfn_t end, bool can_yield, bool flush);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
+					     gfn_t start, gfn_t end, bool flush)
+{
+	return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
+}
+
 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
-- 
2.31.1



* Re: [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
  2022-03-18 16:48 [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()" Paolo Bonzini
@ 2022-03-21  9:13 ` Paolo Bonzini
  2022-03-24 23:57   ` Sean Christopherson
  0 siblings, 1 reply; 8+ messages in thread
From: Paolo Bonzini @ 2022-03-21  9:13 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: Vitaly Kuznetsov, Sean Christopherson

On 3/18/22 17:48, Paolo Bonzini wrote:
> This reverts commit cf3e26427c08ad9015956293ab389004ac6a338e.
> 
> Multi-vCPU Hyper-V guests started crashing randomly on boot with the
> latest kvm/queue, and the problem can be bisected to this particular
> patch. Basically, I'm no longer able to boot e.g. a 16-vCPU guest
> successfully. Both Intel and AMD hosts seem to be affected. Reverting
> the commit saves the day.
> 
> Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

This is not enough; the following is also needed to account
for "KVM: x86/mmu: Defer TLB flush to caller when freeing TDP MMU shadow
pages":

------------------- 8< ----------------
From: Paolo Bonzini <pbonzini@redhat.com>
Subject: [PATCH] kvm: x86/mmu: Flush TLB before zap_gfn_range releases RCU

Since "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
is going to be reverted, it's not going to be true anymore that
the zap-page flow does not free any 'struct kvm_mmu_page'.  Introduce
an early flush before tdp_mmu_zap_leafs() returns, to preserve
bisectability.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index af60922906ef..7f63e1a704e3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -941,13 +941,17 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
  		flush = true;
  	}
  
+	/*
+	 * Need to flush before releasing RCU.  TODO: do it only if intermediate
+	 * page tables were zapped; there is no need to flush under RCU protection
+	 * if no 'struct kvm_mmu_page' is freed.
+	 */
+	if (flush)
+		kvm_flush_remote_tlbs_with_address(kvm, start, end - start);
+
  	rcu_read_unlock();
  
-	/*
-	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
-	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
-	 */
-	return flush;
+	return false;
  }
  
  /*



* Re: [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
  2022-03-21  9:13 ` Paolo Bonzini
@ 2022-03-24 23:57   ` Sean Christopherson
  2022-03-25 10:38     ` Vitaly Kuznetsov
                       ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Sean Christopherson @ 2022-03-24 23:57 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, Vitaly Kuznetsov, Sean Christopherson

On Mon, Mar 21, 2022, Paolo Bonzini wrote:
> On 3/18/22 17:48, Paolo Bonzini wrote:
> > This reverts commit cf3e26427c08ad9015956293ab389004ac6a338e.
> > 
> > Multi-vCPU Hyper-V guests started crashing randomly on boot with the
> > latest kvm/queue, and the problem can be bisected to this particular
> > patch. Basically, I'm no longer able to boot e.g. a 16-vCPU guest
> > successfully. Both Intel and AMD hosts seem to be affected. Reverting
> > the commit saves the day.
> > 
> > Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> This is not enough; the following is also needed to account
> for "KVM: x86/mmu: Defer TLB flush to caller when freeing TDP MMU shadow
> pages":
> 
> ------------------- 8< ----------------
> From: Paolo Bonzini <pbonzini@redhat.com>
> Subject: [PATCH] kvm: x86/mmu: Flush TLB before zap_gfn_range releases RCU
> 
> Since "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
> is going to be reverted, it's not going to be true anymore that
> the zap-page flow does not free any 'struct kvm_mmu_page'.  Introduce
> an early flush before tdp_mmu_zap_leafs() returns, to preserve
> bisectability.

Can I have 1-2 weeks to try and root cause and fix the underlying issue before
sending reverts to Linus?  I really don't want to paper over a TLB flushing bug
or an off-by-one bug, and I really, really don't want to end up with another
scenario where KVM zaps everything just because.

Vitaly, can you provide repro instructions?  A nearly-complete QEMU command line
would be wonderful :-)  Is the issue unique to any particular guest kernel?  I've
been unable to repro with a 112 vCPU Linux guest with these Hyper-V enlightenments:

$ : dm | grep -i hyper-v
[    0.000000] Hypervisor detected: Microsoft Hyper-V
[    0.000000] Hyper-V: privilege flags low 0x2aff, high 0x830, hints 0x4e2c, misc 0x80d12
[    0.000000] Hyper-V Host Build:14393-10.0-0-0.0
[    0.000000] Hyper-V: Nested features: 0x80101
[    0.000000] Hyper-V: LAPIC Timer Frequency: 0x3d0900
[    0.000000] Hyper-V: Using hypercall for remote TLB flush
[    0.000004] tsc: Marking TSC unstable due to running on Hyper-V
[    0.129376] Booting paravirtualized kernel on Hyper-V
[    0.140419] Hyper-V: PV spinlocks disabled
[    0.247500] Hyper-V: Using IPI hypercalls
[    0.247502] Hyper-V: Using enlightened APIC (x2apic mode)

Actually, since this is apparently specific to kvm_zap_gfn_range(), can you add
printk "tracing" in update_mtrr(), kvm_post_set_cr0(), and __kvm_request_apicv_update()
to see what is actually triggering zaps?  Capturing the start and end GFNs would be very
helpful for the MTRR case.

The APICv update seems unlikely to affect only Hyper-V guests, though there is the auto
EOI crud.  And the other two only come into play with non-coherent DMA.  In other words,
figuring out exactly what sequence leads to failure should be straightforward.
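
For the MTRR case, a minimal sketch of the instrumentation meant above,
dropped next to the kvm_zap_gfn_range() call in update_mtrr(); the
gfn_start/gfn_end names here are assumptions, not the actual mtrr.c locals:

	/* Log which GFN range an MTRR update is about to zap. */
	trace_printk("MTRR zap: vCPU %d gfn %llx-%llx\n",
		     vcpu->vcpu_id, gfn_start, gfn_end);
	kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);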


* Re: [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
  2022-03-24 23:57   ` Sean Christopherson
@ 2022-03-25 10:38     ` Vitaly Kuznetsov
  2022-03-25 11:21     ` Paolo Bonzini
  2022-03-25 15:03     ` Vitaly Kuznetsov
  2 siblings, 0 replies; 8+ messages in thread
From: Vitaly Kuznetsov @ 2022-03-25 10:38 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, Sean Christopherson, Paolo Bonzini

Sean Christopherson <seanjc@google.com> writes:

> On Mon, Mar 21, 2022, Paolo Bonzini wrote:
>> On 3/18/22 17:48, Paolo Bonzini wrote:
>> > This reverts commit cf3e26427c08ad9015956293ab389004ac6a338e.
>> > 
>> > Multi-vCPU Hyper-V guests started crashing randomly on boot with the
>> > latest kvm/queue, and the problem can be bisected to this
>> > particular patch.

...

>
> Vitaly, can you provide repro instructions?  A nearly-complete QEMU command line
> would be wonderful :-)  

The issue was observed with genuine Hyper-V guests, with or without any
Hyper-V enlightenments (not with Linux guests using Hyper-V enlightenments).
The QEMU command line is nothing special, e.g.:

~/qemu/build/qemu-system-x86_64 \
  -machine q35,accel=kvm,kernel-irqchip=split \
  -name guest=win2019 -cpu host -smp 16 -m 16384 \
  -drive file=/home/VMs/ws2019_gen1.qcow2,format=qcow2,if=none,id=drive-ide0-0-0 \
  -device ide-hd,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0,bootindex=1 \
  -vnc :0 -rtc base=localtime,driftfix=slew --no-hpet -monitor stdio \
  --no-reboot

I'm also pretty sure I saw this on both AMD and Intel hosts; I can try
reproducing if needed.

-- 
Vitaly



* Re: [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
  2022-03-24 23:57   ` Sean Christopherson
  2022-03-25 10:38     ` Vitaly Kuznetsov
@ 2022-03-25 11:21     ` Paolo Bonzini
  2022-03-25 20:22       ` Sean Christopherson
  2022-03-25 15:03     ` Vitaly Kuznetsov
  2 siblings, 1 reply; 8+ messages in thread
From: Paolo Bonzini @ 2022-03-25 11:21 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: linux-kernel, kvm, Vitaly Kuznetsov, Sean Christopherson

On 3/25/22 00:57, Sean Christopherson wrote:
> Can I have 1-2 weeks to try and root cause and fix the underlying issue before
> sending reverts to Linus?  I really don't want to paper over a TLB flushing bug
> or an off-by-one bug, and I really, really don't want to end up with another
> scenario where KVM zaps everything just because.

Well, too late...  I didn't want to send a pull request that was broken, 
and Mingwei provided a convincing reason for the breakage.

Paolo



* Re: [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
  2022-03-24 23:57   ` Sean Christopherson
  2022-03-25 10:38     ` Vitaly Kuznetsov
  2022-03-25 11:21     ` Paolo Bonzini
@ 2022-03-25 15:03     ` Vitaly Kuznetsov
  2022-03-25 20:18       ` Sean Christopherson
  2 siblings, 1 reply; 8+ messages in thread
From: Vitaly Kuznetsov @ 2022-03-25 15:03 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, Paolo Bonzini

Sean Christopherson <seanjc@google.com> writes:

...

So I went back to "KVM: x86/mmu: Zap only TDP MMU leafs in
kvm_zap_gfn_range()" and confirmed that with the patch in place Hyper-V
always crashes, sooner or later. With the patch reverted (as well as
with current 'kvm/queue') it boots.

>
> Actually, since this is apparently specific to kvm_zap_gfn_range(), can you add
> printk "tracing" in update_mtrr(), kvm_post_set_cr0(), and __kvm_request_apicv_update()
> to see what is actually triggering zaps?  Capturing the start and end GFNs would be very
> helpful for the MTRR case.
>
> The APICv update seems unlikely to affect only Hyper-V guests, though there is the auto
> EOI crud.  And the other two only come into play with non-coherent DMA.  In other words,
> figuring out exactly what sequence leads to failure should be straightforward.

The tricky part here is that Hyper-V doesn't crash immediately; the
crash is always different (if you look at the BSOD) and happens at
different times. The crashes mention various things, like trying to
execute non-executable memory, ...

I've added tracing you've suggested:
- __kvm_request_apicv_update() happens only once in the very beginning.

- update_mtrr() never actually reaches kvm_zap_gfn_range()

- kvm_post_set_cr0() calls happen in early boot but the crash happens much,
  much later. E.g.:
...
 qemu-system-x86-117525  [019] .....  4738.682954: kvm_post_set_cr0: vCPU 12 10 11
 qemu-system-x86-117525  [019] .....  4738.682997: kvm_post_set_cr0: vCPU 12 11 80000011
 qemu-system-x86-117525  [019] .....  4738.683053: kvm_post_set_cr0: vCPU 12 80000011 c0000011
 qemu-system-x86-117525  [019] .....  4738.683059: kvm_post_set_cr0: vCPU 12 c0000011 80010031
 qemu-system-x86-117526  [005] .....  4738.812107: kvm_post_set_cr0: vCPU 13 10 11
 qemu-system-x86-117526  [005] .....  4738.812148: kvm_post_set_cr0: vCPU 13 11 80000011
 qemu-system-x86-117526  [005] .....  4738.812198: kvm_post_set_cr0: vCPU 13 80000011 c0000011
 qemu-system-x86-117526  [005] .....  4738.812205: kvm_post_set_cr0: vCPU 13 c0000011 80010031
 qemu-system-x86-117527  [003] .....  4738.941004: kvm_post_set_cr0: vCPU 14 10 11
 qemu-system-x86-117527  [003] .....  4738.941107: kvm_post_set_cr0: vCPU 14 11 80000011
 qemu-system-x86-117527  [003] .....  4738.941218: kvm_post_set_cr0: vCPU 14 80000011 c0000011
 qemu-system-x86-117527  [003] .....  4738.941235: kvm_post_set_cr0: vCPU 14 c0000011 80010031
 qemu-system-x86-117528  [035] .....  4739.070338: kvm_post_set_cr0: vCPU 15 10 11
 qemu-system-x86-117528  [035] .....  4739.070428: kvm_post_set_cr0: vCPU 15 11 80000011
 qemu-system-x86-117528  [035] .....  4739.070539: kvm_post_set_cr0: vCPU 15 80000011 c0000011
 qemu-system-x86-117528  [035] .....  4739.070557: kvm_post_set_cr0: vCPU 15 c0000011 80010031
##### CPU 8 buffer started ####
 qemu-system-x86-117528  [008] .....  4760.099532: kvm_hv_set_msr_pw: 15

The debug patch for kvm_post_set_cr0() is:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4fa4d8269e5b..db7c5a05e574 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -870,6 +870,8 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
 
 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
 {
+       trace_printk("vCPU %d %lx %lx\n", vcpu->vcpu_id, old_cr0, cr0);
+
        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_async_pf_hash_reset(vcpu);

The kvm_hv_set_msr_pw() call happens when Hyper-V writes to
HV_X64_MSR_CRASH_CTL (the 'hv-crash' QEMU flag is needed to enable the
feature). The debug patch is:

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index a32f54ab84a2..59a72f6ced99 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1391,6 +1391,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 
                        /* Send notification about crash to user space */
                        kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+                       trace_printk("%d\n", vcpu->vcpu_id);
                }
                break;
        case HV_X64_MSR_RESET:
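
(Usage note: on the QEMU side the Hyper-V crash MSRs are exposed with the
'hv-crash' enlightenment, e.g. by adding something like "-cpu host,hv-crash"
to the command line above; the exact property spelling can differ between
QEMU versions.)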

So it's 20 seconds (!) between the last kvm_post_set_cr0() call and the
crash. My (disappointing) conclusion is: the problem can be anywhere, and
Hyper-V detects it much, much later.

-- 
Vitaly



* Re: [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
  2022-03-25 15:03     ` Vitaly Kuznetsov
@ 2022-03-25 20:18       ` Sean Christopherson
  0 siblings, 0 replies; 8+ messages in thread
From: Sean Christopherson @ 2022-03-25 20:18 UTC (permalink / raw)
  To: Vitaly Kuznetsov; +Cc: linux-kernel, kvm, Paolo Bonzini

On Fri, Mar 25, 2022, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> > Actually, since this is apparently specific to kvm_zap_gfn_range(), can you add
> > printk "tracing" in update_mtrr(), kvm_post_set_cr0(), and __kvm_request_apicv_update()
> > to see what is actually triggering zaps?  Capturing the start and end GFNs would be very
> > helpful for the MTRR case.
> >
> > The APICv update seems unlikely to affect only Hyper-V guests, though there is the auto
> > EOI crud.  And the other two only come into play with non-coherent DMA.  In other words,
> > figuring out exactly what sequence leads to failure should be straightforward.
> 
> The tricky part here is that Hyper-V doesn't crash immediately; the
> crash is always different (if you look at the BSOD) and happens at
> different times. The crashes mention various things, like trying to
> execute non-executable memory, ...
> 
> I've added tracing you've suggested:
> - __kvm_request_apicv_update() happens only once in the very beginning.

And thinking through this again, APICv changes should never result in a shadow
page being zapped as they'll only zap a 4k range.

> - update_mtrr() never actually reaches kvm_zap_gfn_range()
> 
> - kvm_post_set_cr0() calls happen in early boot but the crash happens much,
>   much later. E.g.:

Ah rats, I got the sequencing of the revert messed up.  mmu_notifier is also in
play, via kvm_tdp_mmu_unmap_gfn_range().

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 4fa4d8269e5b..db7c5a05e574 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -870,6 +870,8 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
>  
>  void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
>  {
> +       trace_printk("vCPU %d %lx %lx\n", vcpu->vcpu_id, old_cr0, cr0);

This doesn't guarantee kvm_zap_gfn_range() will be reached.  The guest has to
have non-coherent DMA and be running with the CD/NW memtype quirk.  Moving the
print inside the if statement would show if KVM is actually zapping in those
cases.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d3a9ce07a565..25c7d8fc3287 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -887,8 +887,10 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon

        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
            kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
-           !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+           !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) {
+               trace_printk("vCPU %d %lx %lx\n", vcpu->vcpu_id, old_cr0, cr0);
                kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+       }
 }
 EXPORT_SYMBOL_GPL(kvm_post_set_cr0);

> The kvm_hv_set_msr_pw() call happens when Hyper-V writes to
> HV_X64_MSR_CRASH_CTL (the 'hv-crash' QEMU flag is needed to enable the
> feature). The debug patch is:
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index a32f54ab84a2..59a72f6ced99 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1391,6 +1391,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
>  
>                         /* Send notification about crash to user space */
>                         kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
> +                       trace_printk("%d\n", vcpu->vcpu_id);
>                 }
>                 break;
>         case HV_X64_MSR_RESET:
> 
> So it's 20 seconds (!) between the last kvm_post_set_cr0() call and the
> crash. My (disappointing) conclusion is: the problem can be anywhere, and
> Hyper-V detects it much, much later.

And reproduced... 'twas indeed the mmu_notifier.  Hyper-V 2019 booted just fine,
until I turned on KSM and cranked up the scanning.

The bug has nothing to do with zapping only leafs; it's a simple goof where the
TLB flush gets lost.  Not sure why only Hyper-V detects the issue; maybe because
it maintains a pool of zeroed pages that are KSM-friendly?

I'll send a patch to reintroduce the reverted code.

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c71debdbc732..a641737725d1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -885,7 +885,7 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
        struct kvm_mmu_page *root;

        for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
-               flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false);
+               flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

        return flush;
 }


* Re: [FYI PATCH] Revert "KVM: x86/mmu: Zap only TDP MMU leafs in kvm_zap_gfn_range()"
  2022-03-25 11:21     ` Paolo Bonzini
@ 2022-03-25 20:22       ` Sean Christopherson
  0 siblings, 0 replies; 8+ messages in thread
From: Sean Christopherson @ 2022-03-25 20:22 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, Vitaly Kuznetsov, Sean Christopherson

On Fri, Mar 25, 2022, Paolo Bonzini wrote:
> On 3/25/22 00:57, Sean Christopherson wrote:
> > Can I have 1-2 weeks to try and root cause and fix the underlying issue before
> > sending reverts to Linus?  I really don't want to paper over a TLB flushing bug
> > or an off-by-one bug, and I really, really don't want to end up with another
> > scenario where KVM zaps everything just because.
> 
> Well, too late...  I didn't want to send a pull request that was broken,

Ah, I didn't see that it was in the initial pull request; I thought it was only
in kvm/next.  I'll send a full patch.

> Mingwei provided a convincing reason for the breakage.

No, the side effects are completely benign, and arguably desirable.  The issue is
that KVM loses a pending TLB flush if there are multiple roots.
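
A standalone sketch (not the KVM code) of the lost-flush pattern: when each
per-root call is handed literal false instead of the value accumulated so far,
a flush owed by an earlier root is silently dropped as soon as a later root has
nothing left to zap.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-root zap: returns true if a TLB flush is now owed. */
static bool zap_root(bool zapped_something, bool flush)
{
	return flush || zapped_something;
}

int main(void)
{
	/* First root zaps something, the second has nothing left to zap. */
	bool roots[] = { true, false };
	bool flush_buggy = false, flush_fixed = false;
	int i;

	for (i = 0; i < 2; i++) {
		flush_buggy = zap_root(roots[i], false);        /* overwrites the earlier result */
		flush_fixed = zap_root(roots[i], flush_fixed);  /* accumulates across roots */
	}

	/* Prints "buggy=0 fixed=1": the buggy variant forgot the pending flush. */
	printf("buggy=%d fixed=%d\n", flush_buggy, flush_fixed);
	return 0;
}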

