[2/3] kvm: x86/mmu: Ensure TDP MMU roots are freed after yield
diff mbox series

Message ID 20210105233136.2140335-2-bgardon@google.com
State New, archived
Headers show
Series
  • [1/3] kvm: x86/mmu: Clarify TDP MMU page list invariants
Related show

Commit Message

Ben Gardon Jan. 5, 2021, 11:31 p.m. UTC
Many TDP MMU functions which need to perform some action on all TDP MMU
roots hold a reference on that root so that they can safely drop the MMU
lock in order to yield to other threads. However, when releasing the
reference on the root, there is a bug: the root will not be freed even
if its reference count (root_count) is reduced to 0. Ensure that these
roots are properly freed.

Reported-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU")
Fixes: 063afacd8730 ("kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU")
Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU")
Fixes: 14881998566d ("kvm: x86/mmu: Support disabling dirty logging for the tdp MMU")
Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

Comments

Ben Gardon Jan. 5, 2021, 11:38 p.m. UTC | #1
On Tue, Jan 5, 2021 at 3:31 PM Ben Gardon <bgardon@google.com> wrote:
>
> Many TDP MMU functions which need to perform some action on all TDP MMU
> roots hold a reference on that root so that they can safely drop the MMU
> lock in order to yield to other threads. However, when releasing the
> reference on the root, there is a bug: the root will not be freed even
> if its reference count (root_count) is reduced to 0. Ensure that these
> roots are properly freed.
>
> Reported-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU")
> Fixes: 063afacd8730 ("kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU")
> Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU")
> Fixes: 14881998566d ("kvm: x86/mmu: Support disabling dirty logging for the tdp MMU")
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 75db27fda8f3..5ec6fae36e33 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -83,6 +83,12 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
>         kmem_cache_free(mmu_page_header_cache, root);
>  }
>
> +static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +       if (kvm_mmu_put_root(kvm, root))
> +               kvm_tdp_mmu_free_root(kvm, root);
> +}
> +
>  static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
>                                                    int level)
>  {
> @@ -456,7 +462,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
>
>                 flush |= zap_gfn_range(kvm, root, start, end, true);
>
> -               kvm_mmu_put_root(kvm, root);
> +               tdp_mmu_put_root(kvm, root);
>         }
>
>         return flush;
> @@ -648,7 +654,7 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
>                                        gfn_end, data);
>                 }
>
> -               kvm_mmu_put_root(kvm, root);
> +               tdp_mmu_put_root(kvm, root);
>         }
>
>         return ret;
> @@ -852,7 +858,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
>                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
>                              slot->base_gfn + slot->npages, min_level);
>
> -               kvm_mmu_put_root(kvm, root);
> +               tdp_mmu_put_root(kvm, root);
>         }
>
>         return spte_set;
> @@ -920,7 +926,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
>                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
>                                 slot->base_gfn + slot->npages);
>
> -               kvm_mmu_put_root(kvm, root);
> +               tdp_mmu_put_root(kvm, root);
>         }
>
>         return spte_set;
> @@ -1043,7 +1049,7 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
>                 spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
>                                 slot->base_gfn + slot->npages);
>
> -               kvm_mmu_put_root(kvm, root);
> +               tdp_mmu_put_root(kvm, root);
>         }
>         return spte_set;
>  }
> @@ -1103,7 +1109,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
>                 zap_collapsible_spte_range(kvm, root, slot->base_gfn,
>                                            slot->base_gfn + slot->npages);
>
> -               kvm_mmu_put_root(kvm, root);
> +               tdp_mmu_put_root(kvm, root);
>         }
>  }
>
> --
> 2.29.2.729.g45daf8777d-goog
>

+Sean Christopherson, for whom I used a stale email address.
.
I tested this series by running kvm-unit-tests on an Intel Skylake
machine. It did not introduce any new failures. I also ran the
set_memory_region_test, but was unable to reproduce Maciej's problem.
Maciej, if you'd be willing to confirm this series solves the problem
you observed, or provide more details on the setup in which you
observed it, I'd appreciate it.
Maciej S. Szmigiero Jan. 6, 2021, 9:26 a.m. UTC | #2
Thanks for looking at it Ben.

On 06.01.2021 00:38, Ben Gardon wrote:
(..)
> 
> +Sean Christopherson, for whom I used a stale email address.
> .
> I tested this series by running kvm-unit-tests on an Intel Skylake
> machine. It did not introduce any new failures. I also ran the
> set_memory_region_test

It's "memslot_move_test" that is crashing the kernel - a memslot
move test based on "set_memory_region_test".

>, but was unable to reproduce Maciej's problem.
> Maciej, if you'd be willing to confirm this series solves the problem
> you observed, or provide more details on the setup in which you
> observed it, I'd appreciate it.
> 

I've applied your patches and now are getting a slightly
different backtrace for the same test:
[  534.768212] general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP PTI
[  534.887969] CPU: 97 PID: 4651 Comm: memslot_move_te Not tainted 5.11.0-rc2+ #81
[  534.975465] Hardware name: Oracle Corporation ORACLE SERVER X7-2c/SERVER MODULE ASSY, , BIOS 46070300 12/20/2019
[  535.097288] RIP: 0010:kvm_tdp_mmu_zap_gfn_range+0x70/0xb0 [kvm]
[  535.168199] Code: b8 01 00 00 00 4c 89 f1 41 89 45 50 4c 89 ee 48 89 df e8 a3 f3 ff ff 41 09 c4 41 83 6d 50 01 74 13 4d 8b 6d 00 4d 39 fd 74 1e <41> 8b 45 50 85 c0 75 c6 0f 0b 4c 89 ee 48 89 df e8 0b fc ff ff 4d
[  535.393005] RSP: 0018:ffffbded19083b90 EFLAGS: 00010297
[  535.455533] RAX: 0000000000000001 RBX: ffffbded1a27d000 RCX: 000000008030000e
[  535.540945] RDX: 000000008030000f RSI: ffffffffc0ad5453 RDI: ffff9cd72a00d300
[  535.626358] RBP: ffffbded19083bc0 R08: 0000000000000001 R09: ffffffffc0ad5400
[  535.711769] R10: ffff9d370acf31b8 R11: 0000000000000001 R12: 0000000000000001
[  535.797181] R13: dead000000000100 R14: 0000000400000000 R15: ffffbded1a292418
[  535.882590] FS:  00007ff50312e740(0000) GS:ffff9d947fb40000(0000) knlGS:0000000000000000
[  535.979443] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  536.048211] CR2: 0000000001e02fe0 CR3: 00000060a78e8003 CR4: 00000000007726e0
[  536.133628] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  536.219043] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  536.304452] PKRU: 55555554
[  536.336813] Call Trace:
[  536.366057]  kvm_tdp_mmu_zap_all+0x26/0x40 [kvm]
[  536.421357]  kvm_mmu_zap_all_fast+0x167/0x180 [kvm]
[  536.479767]  kvm_mmu_invalidate_zap_pages_in_memslot+0xe/0x10 [kvm]
[  536.554817]  kvm_page_track_flush_slot+0x5a/0x90 [kvm]
[  536.616344]  kvm_arch_flush_shadow_memslot+0xe/0x10 [kvm]
[  536.680986]  kvm_set_memslot+0x18f/0x690 [kvm]
[  536.734186]  __kvm_set_memory_region+0x41f/0x580 [kvm]
[  536.795705]  kvm_set_memory_region+0x2b/0x40 [kvm]
[  536.853062]  kvm_vm_ioctl+0x216/0x1060 [kvm]
[  536.904182]  ? irqtime_account_irq+0x40/0xc0
[  536.955270]  ? irq_exit_rcu+0x55/0xf0
[  536.999079]  ? sysvec_apic_timer_interrupt+0x45/0x90
[  537.058485]  ? asm_sysvec_apic_timer_interrupt+0x12/0x20
[  537.122058]  ? __audit_syscall_entry+0xdd/0x130
[  537.176267]  __x64_sys_ioctl+0x92/0xd0
[  537.221114]  do_syscall_64+0x37/0x50
[  537.263878]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  537.324324] RIP: 0033:0x7ff502a27307
[  537.367882] Code: 44 00 00 48 8b 05 69 1b 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 39 1b 2d 00 f7 d8 64 89 01 48
[  537.594221] RSP: 002b:00007fffde6b2d38 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[  537.685616] RAX: ffffffffffffffda RBX: 0000000001de8000 RCX: 00007ff502a27307
[  537.771797] RDX: 0000000001e02fe0 RSI: 000000004020ae46 RDI: 0000000000000004
[  537.857967] RBP: 00000000000001fc R08: 00007fffde74b090 R09: 000000000005af86
[  537.944110] R10: 000000000005af86 R11: 0000000000000246 R12: 0000000050000000
[  538.030236] R13: 0000000000000000 R14: 0000000000000000 R15: 00000000000001fb
[  538.116345] Modules linked in: kvm_intel kvm xt_comment xt_owner ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_nat iptable_mangle iptable_security iptable_raw nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter rpcrdma ib_isert iscsi_target_mod ib_iser ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_umad iw_cxgb4 rdma_cm iw_cm ib_cm intel_rapl_msr intel_rapl_common skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp mgag200 drm_kms_helper iTCO_wdt bnxt_re cec iTCO_vendor_support drm ib_uverbs syscopyarea sysfillrect ib_core sg irqbypass sysimgblt pcspkr ioatdma i2c_i801 fb_sys_fops joydev lpc_ich intel_pch_thermal i2c_smbus i2c_algo_bit dca ip_tables vfat fat xfs sd_mod t10_pi be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls cxgb3i cxgb3 mdio libcxgbi
[  538.116423]  libcxgb qla4xxx iscsi_boot_sysfs crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper bnxt_en wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi [last unloaded: kvm]
[  539.450863] ---[ end trace 7c17f445a2093145 ]---
[  539.623473] RIP: 0010:kvm_tdp_mmu_zap_gfn_range+0x70/0xb0 [kvm]
[  539.695136] Code: b8 01 00 00 00 4c 89 f1 41 89 45 50 4c 89 ee 48 89 df e8 a3 f3 ff ff 41 09 c4 41 83 6d 50 01 74 13 4d 8b 6d 00 4d 39 fd 74 1e <41> 8b 45 50 85 c0 75 c6 0f 0b 4c 89 ee 48 89 df e8 0b fc ff ff 4d
[  539.921479] RSP: 0018:ffffbded19083b90 EFLAGS: 00010297
[  539.984788] RAX: 0000000000000001 RBX: ffffbded1a27d000 RCX: 000000008030000e
[  540.070982] RDX: 000000008030000f RSI: ffffffffc0ad5453 RDI: ffff9cd72a00d300
[  540.157173] RBP: ffffbded19083bc0 R08: 0000000000000001 R09: ffffffffc0ad5400
[  540.243372] R10: ffff9d370acf31b8 R11: 0000000000000001 R12: 0000000000000001
[  540.329567] R13: dead000000000100 R14: 0000000400000000 R15: ffffbded1a292418
[  540.415772] FS:  00007ff50312e740(0000) GS:ffff9d947fb40000(0000) knlGS:0000000000000000
[  540.513427] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  540.583005] CR2: 0000000001e02fe0 CR3: 00000060a78e8003 CR4: 00000000007726e0
[  540.669228] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  540.755448] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  540.841659] PKRU: 55555554
[  540.874826] Kernel panic - not syncing: Fatal exception
[  540.938269] Kernel Offset: 0xe200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[  542.097054] ---[ end Kernel panic - not syncing: Fatal exception ]---

The code that is crashing is:
# arch/x86/kvm/mmu/mmu_internal.h:100:  BUG_ON(!sp->root_count);
         movl    80(%r13), %eax  # MEM[(int *)__mptr_14 + 80B], _17 <- here
         testl   %eax, %eax      # _17
         jne     .L421   #,

So it looks like it now crashes in the same BUG_ON() but when trying to
deference the "dead" sp pointer instead.

It's bad that you can't reproduce the issue, however, as this would
probably make the root causing process much more effective.
Are you testing on bare metal like me or while running nested?

My test machine has Xeon Platinum 8167M CPU, so it's a Skylake, too.
It has 754G RAM + 8G swap, running just the test program.

I've uploaded the kernel that I've used for testing here:
https://github.com/maciejsszmigiero/linux/tree/tdp_mmu_bug

It is basically a 5.11.0-rc2 kernel with
"KVM: x86/mmu: Bug fixes and cleanups in get_mmio_spte()" series and
your fixes applied on top of it.

In addition to that, I've updated
https://gist.github.com/maciejsszmigiero/890218151c242d99f63ea0825334c6c0
with the kernel .config file that was used.

The compiler that I've used to compile the test kernel was:
"gcc version 8.3.1 20190311 (Red Hat 8.3.1-3.2.0.1)"

Thanks,
Maciej
Ben Gardon Jan. 6, 2021, 5:28 p.m. UTC | #3
On Wed, Jan 6, 2021 at 1:26 AM Maciej S. Szmigiero
<maciej.szmigiero@oracle.com> wrote:
>
> Thanks for looking at it Ben.
>
> On 06.01.2021 00:38, Ben Gardon wrote:
> (..)
> >
> > +Sean Christopherson, for whom I used a stale email address.
> > .
> > I tested this series by running kvm-unit-tests on an Intel Skylake
> > machine. It did not introduce any new failures. I also ran the
> > set_memory_region_test
>
> It's "memslot_move_test" that is crashing the kernel - a memslot
> move test based on "set_memory_region_test".

I apologize if I'm being very dense, but I can't find this test
anywhere. Is this something you have in-house but haven't upstreamed
or just the test_move_memory_region(); testcase from
set_memory_region_test? I have a similar memslot-moving-stress-test in
the pipeline I need to send out, but I didn't think such a test
existed yet and my test hadn't caught this issue.

>
> >, but was unable to reproduce Maciej's problem.
> > Maciej, if you'd be willing to confirm this series solves the problem
> > you observed, or provide more details on the setup in which you
> > observed it, I'd appreciate it.
> >
>
> I've applied your patches and now are getting a slightly
> different backtrace for the same test:
> [  534.768212] general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP PTI
> [  534.887969] CPU: 97 PID: 4651 Comm: memslot_move_te Not tainted 5.11.0-rc2+ #81
> [  534.975465] Hardware name: Oracle Corporation ORACLE SERVER X7-2c/SERVER MODULE ASSY, , BIOS 46070300 12/20/2019
> [  535.097288] RIP: 0010:kvm_tdp_mmu_zap_gfn_range+0x70/0xb0 [kvm]
> [  535.168199] Code: b8 01 00 00 00 4c 89 f1 41 89 45 50 4c 89 ee 48 89 df e8 a3 f3 ff ff 41 09 c4 41 83 6d 50 01 74 13 4d 8b 6d 00 4d 39 fd 74 1e <41> 8b 45 50 85 c0 75 c6 0f 0b 4c 89 ee 48 89 df e8 0b fc ff ff 4d
> [  535.393005] RSP: 0018:ffffbded19083b90 EFLAGS: 00010297
> [  535.455533] RAX: 0000000000000001 RBX: ffffbded1a27d000 RCX: 000000008030000e
> [  535.540945] RDX: 000000008030000f RSI: ffffffffc0ad5453 RDI: ffff9cd72a00d300
> [  535.626358] RBP: ffffbded19083bc0 R08: 0000000000000001 R09: ffffffffc0ad5400
> [  535.711769] R10: ffff9d370acf31b8 R11: 0000000000000001 R12: 0000000000000001
> [  535.797181] R13: dead000000000100 R14: 0000000400000000 R15: ffffbded1a292418
> [  535.882590] FS:  00007ff50312e740(0000) GS:ffff9d947fb40000(0000) knlGS:0000000000000000
> [  535.979443] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  536.048211] CR2: 0000000001e02fe0 CR3: 00000060a78e8003 CR4: 00000000007726e0
> [  536.133628] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  536.219043] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  536.304452] PKRU: 55555554
> [  536.336813] Call Trace:
> [  536.366057]  kvm_tdp_mmu_zap_all+0x26/0x40 [kvm]
> [  536.421357]  kvm_mmu_zap_all_fast+0x167/0x180 [kvm]
> [  536.479767]  kvm_mmu_invalidate_zap_pages_in_memslot+0xe/0x10 [kvm]
> [  536.554817]  kvm_page_track_flush_slot+0x5a/0x90 [kvm]
> [  536.616344]  kvm_arch_flush_shadow_memslot+0xe/0x10 [kvm]
> [  536.680986]  kvm_set_memslot+0x18f/0x690 [kvm]
> [  536.734186]  __kvm_set_memory_region+0x41f/0x580 [kvm]
> [  536.795705]  kvm_set_memory_region+0x2b/0x40 [kvm]
> [  536.853062]  kvm_vm_ioctl+0x216/0x1060 [kvm]
> [  536.904182]  ? irqtime_account_irq+0x40/0xc0
> [  536.955270]  ? irq_exit_rcu+0x55/0xf0
> [  536.999079]  ? sysvec_apic_timer_interrupt+0x45/0x90
> [  537.058485]  ? asm_sysvec_apic_timer_interrupt+0x12/0x20
> [  537.122058]  ? __audit_syscall_entry+0xdd/0x130
> [  537.176267]  __x64_sys_ioctl+0x92/0xd0
> [  537.221114]  do_syscall_64+0x37/0x50
> [  537.263878]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  537.324324] RIP: 0033:0x7ff502a27307
> [  537.367882] Code: 44 00 00 48 8b 05 69 1b 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 39 1b 2d 00 f7 d8 64 89 01 48
> [  537.594221] RSP: 002b:00007fffde6b2d38 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [  537.685616] RAX: ffffffffffffffda RBX: 0000000001de8000 RCX: 00007ff502a27307
> [  537.771797] RDX: 0000000001e02fe0 RSI: 000000004020ae46 RDI: 0000000000000004
> [  537.857967] RBP: 00000000000001fc R08: 00007fffde74b090 R09: 000000000005af86
> [  537.944110] R10: 000000000005af86 R11: 0000000000000246 R12: 0000000050000000
> [  538.030236] R13: 0000000000000000 R14: 0000000000000000 R15: 00000000000001fb
> [  538.116345] Modules linked in: kvm_intel kvm xt_comment xt_owner ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_nat iptable_mangle iptable_security iptable_raw nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter rpcrdma ib_isert iscsi_target_mod ib_iser ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_umad iw_cxgb4 rdma_cm iw_cm ib_cm intel_rapl_msr intel_rapl_common skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp mgag200 drm_kms_helper iTCO_wdt bnxt_re cec iTCO_vendor_support drm ib_uverbs syscopyarea sysfillrect ib_core sg irqbypass sysimgblt pcspkr ioatdma i2c_i801 fb_sys_fops joydev lpc_ich intel_pch_thermal i2c_smbus i2c_algo_bit dca ip_tables vfat fat xfs sd_mod t10_pi be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls cxgb3i cxgb3 mdio libcxgbi
> [  538.116423]  libcxgb qla4xxx iscsi_boot_sysfs crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper bnxt_en wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi [last unloaded: kvm]
> [  539.450863] ---[ end trace 7c17f445a2093145 ]---
> [  539.623473] RIP: 0010:kvm_tdp_mmu_zap_gfn_range+0x70/0xb0 [kvm]
> [  539.695136] Code: b8 01 00 00 00 4c 89 f1 41 89 45 50 4c 89 ee 48 89 df e8 a3 f3 ff ff 41 09 c4 41 83 6d 50 01 74 13 4d 8b 6d 00 4d 39 fd 74 1e <41> 8b 45 50 85 c0 75 c6 0f 0b 4c 89 ee 48 89 df e8 0b fc ff ff 4d
> [  539.921479] RSP: 0018:ffffbded19083b90 EFLAGS: 00010297
> [  539.984788] RAX: 0000000000000001 RBX: ffffbded1a27d000 RCX: 000000008030000e
> [  540.070982] RDX: 000000008030000f RSI: ffffffffc0ad5453 RDI: ffff9cd72a00d300
> [  540.157173] RBP: ffffbded19083bc0 R08: 0000000000000001 R09: ffffffffc0ad5400
> [  540.243372] R10: ffff9d370acf31b8 R11: 0000000000000001 R12: 0000000000000001
> [  540.329567] R13: dead000000000100 R14: 0000000400000000 R15: ffffbded1a292418
> [  540.415772] FS:  00007ff50312e740(0000) GS:ffff9d947fb40000(0000) knlGS:0000000000000000
> [  540.513427] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  540.583005] CR2: 0000000001e02fe0 CR3: 00000060a78e8003 CR4: 00000000007726e0
> [  540.669228] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  540.755448] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  540.841659] PKRU: 55555554
> [  540.874826] Kernel panic - not syncing: Fatal exception
> [  540.938269] Kernel Offset: 0xe200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> [  542.097054] ---[ end Kernel panic - not syncing: Fatal exception ]---
>
> The code that is crashing is:
> # arch/x86/kvm/mmu/mmu_internal.h:100:  BUG_ON(!sp->root_count);
>          movl    80(%r13), %eax  # MEM[(int *)__mptr_14 + 80B], _17 <- here
>          testl   %eax, %eax      # _17
>          jne     .L421   #,
>
> So it looks like it now crashes in the same BUG_ON() but when trying to
> deference the "dead" sp pointer instead.

Hmm thanks for testing the patches, I'll take another try at
reproducing the issue and amend the commits.

>
> It's bad that you can't reproduce the issue, however, as this would
> probably make the root causing process much more effective.
> Are you testing on bare metal like me or while running nested?

I'm running on bare metal too.

>
> My test machine has Xeon Platinum 8167M CPU, so it's a Skylake, too.
> It has 754G RAM + 8G swap, running just the test program.
>
> I've uploaded the kernel that I've used for testing here:
> https://github.com/maciejsszmigiero/linux/tree/tdp_mmu_bug
>
> It is basically a 5.11.0-rc2 kernel with
> "KVM: x86/mmu: Bug fixes and cleanups in get_mmio_spte()" series and
> your fixes applied on top of it.
>
> In addition to that, I've updated
> https://gist.github.com/maciejsszmigiero/890218151c242d99f63ea0825334c6c0
> with the kernel .config file that was used.
>
> The compiler that I've used to compile the test kernel was:
> "gcc version 8.3.1 20190311 (Red Hat 8.3.1-3.2.0.1)"

Thank you for the details, hopefully those can shed some light on why
I wasn't able to reproduce the issue.

>
> Thanks,
> Maciej
Maciej S. Szmigiero Jan. 6, 2021, 5:37 p.m. UTC | #4
On 06.01.2021 18:28, Ben Gardon wrote:
> On Wed, Jan 6, 2021 at 1:26 AM Maciej S. Szmigiero
> <maciej.szmigiero@oracle.com> wrote:
>>
>> Thanks for looking at it Ben.
>>
>> On 06.01.2021 00:38, Ben Gardon wrote:
>> (..)
>>>
>>> +Sean Christopherson, for whom I used a stale email address.
>>> .
>>> I tested this series by running kvm-unit-tests on an Intel Skylake
>>> machine. It did not introduce any new failures. I also ran the
>>> set_memory_region_test
>>
>> It's "memslot_move_test" that is crashing the kernel - a memslot
>> move test based on "set_memory_region_test".
> 
> I apologize if I'm being very dense, but I can't find this test
> anywhere.

No problem, the reproducer is available here:
https://gist.github.com/maciejsszmigiero/890218151c242d99f63ea0825334c6c0
as I stated in my original report.

> Is this something you have in-house but haven't upstreamed
> or just the test_move_memory_region(); testcase from
> set_memory_region_test? I have a similar memslot-moving-stress-test in
> the pipeline I need to send out, but I didn't think such a test
> existed yet and my test hadn't caught this issue.

The reproducer at that GitHub link is taken from my KVM memslot
test mini-set, itself based on set_memory_region_test.c from
KVM selftests.
The full mini-set will be posted as soon as I finish it :)

Thanks,
Maciej
Ben Gardon Jan. 6, 2021, 5:56 p.m. UTC | #5
On Wed, Jan 6, 2021 at 9:37 AM Maciej S. Szmigiero
<maciej.szmigiero@oracle.com> wrote:
>
> On 06.01.2021 18:28, Ben Gardon wrote:
> > On Wed, Jan 6, 2021 at 1:26 AM Maciej S. Szmigiero
> > <maciej.szmigiero@oracle.com> wrote:
> >>
> >> Thanks for looking at it Ben.
> >>
> >> On 06.01.2021 00:38, Ben Gardon wrote:
> >> (..)
> >>>
> >>> +Sean Christopherson, for whom I used a stale email address.
> >>> .
> >>> I tested this series by running kvm-unit-tests on an Intel Skylake
> >>> machine. It did not introduce any new failures. I also ran the
> >>> set_memory_region_test
> >>
> >> It's "memslot_move_test" that is crashing the kernel - a memslot
> >> move test based on "set_memory_region_test".
> >
> > I apologize if I'm being very dense, but I can't find this test
> > anywhere.
>
> No problem, the reproducer is available here:
> https://gist.github.com/maciejsszmigiero/890218151c242d99f63ea0825334c6c0
> as I stated in my original report.

Ah, that makes sense now. I didn't realize there were more files below
the .config. I added your test and can now reproduce the issue!

>
> > Is this something you have in-house but haven't upstreamed
> > or just the test_move_memory_region(); testcase from
> > set_memory_region_test? I have a similar memslot-moving-stress-test in
> > the pipeline I need to send out, but I didn't think such a test
> > existed yet and my test hadn't caught this issue.
>
> The reproducer at that GitHub link is taken from my KVM memslot
> test mini-set, itself based on set_memory_region_test.c from
> KVM selftests.
> The full mini-set will be posted as soon as I finish it :)

Awesome, thank you for writing this test!

>
> Thanks,
> Maciej
Maciej S. Szmigiero Jan. 6, 2021, 6:02 p.m. UTC | #6
On 06.01.2021 18:56, Ben Gardon wrote:
> On Wed, Jan 6, 2021 at 9:37 AM Maciej S. Szmigiero
> <maciej.szmigiero@oracle.com> wrote:
>>
>> On 06.01.2021 18:28, Ben Gardon wrote:
>>> On Wed, Jan 6, 2021 at 1:26 AM Maciej S. Szmigiero
>>> <maciej.szmigiero@oracle.com> wrote:
>>>>
>>>> Thanks for looking at it Ben.
>>>>
>>>> On 06.01.2021 00:38, Ben Gardon wrote:
>>>> (..)
>>>>>
>>>>> +Sean Christopherson, for whom I used a stale email address.
>>>>> .
>>>>> I tested this series by running kvm-unit-tests on an Intel Skylake
>>>>> machine. It did not introduce any new failures. I also ran the
>>>>> set_memory_region_test
>>>>
>>>> It's "memslot_move_test" that is crashing the kernel - a memslot
>>>> move test based on "set_memory_region_test".
>>>
>>> I apologize if I'm being very dense, but I can't find this test
>>> anywhere.
>>
>> No problem, the reproducer is available here:
>> https://urldefense.com/v3/__https://gist.github.com/maciejsszmigiero/890218151c242d99f63ea0825334c6c0__;!!GqivPVa7Brio!N4KXBbUfHtzFMD33RIWVJQmeTtSrFm4n-Ve84kFEkuewBJNuaOpsdmdoqTGnw8DT_EktHA$
>> as I stated in my original report.
> 
> Ah, that makes sense now. I didn't realize there were more files below
> the .config. I added your test and can now reproduce the issue!

That's great!
Thanks for looking at it once again.

Maciej

Patch
diff mbox series

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 75db27fda8f3..5ec6fae36e33 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -83,6 +83,12 @@  void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 	kmem_cache_free(mmu_page_header_cache, root);
 }
 
+static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+	if (kvm_mmu_put_root(kvm, root))
+		kvm_tdp_mmu_free_root(kvm, root);
+}
+
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
 						   int level)
 {
@@ -456,7 +462,7 @@  bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
 
 		flush |= zap_gfn_range(kvm, root, start, end, true);
 
-		kvm_mmu_put_root(kvm, root);
+		tdp_mmu_put_root(kvm, root);
 	}
 
 	return flush;
@@ -648,7 +654,7 @@  static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
 				       gfn_end, data);
 		}
 
-		kvm_mmu_put_root(kvm, root);
+		tdp_mmu_put_root(kvm, root);
 	}
 
 	return ret;
@@ -852,7 +858,7 @@  bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
 			     slot->base_gfn + slot->npages, min_level);
 
-		kvm_mmu_put_root(kvm, root);
+		tdp_mmu_put_root(kvm, root);
 	}
 
 	return spte_set;
@@ -920,7 +926,7 @@  bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
 				slot->base_gfn + slot->npages);
 
-		kvm_mmu_put_root(kvm, root);
+		tdp_mmu_put_root(kvm, root);
 	}
 
 	return spte_set;
@@ -1043,7 +1049,7 @@  bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
 		spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
 				slot->base_gfn + slot->npages);
 
-		kvm_mmu_put_root(kvm, root);
+		tdp_mmu_put_root(kvm, root);
 	}
 	return spte_set;
 }
@@ -1103,7 +1109,7 @@  void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
 		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
 					   slot->base_gfn + slot->npages);
 
-		kvm_mmu_put_root(kvm, root);
+		tdp_mmu_put_root(kvm, root);
 	}
 }