All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [Bug 1807052] [NEW] Qemu hangs during migration
@ 2018-12-05 23:54 Matthew Schumacher
  2018-12-05 23:57 ` [Qemu-devel] [Bug 1807052] " Matthew Schumacher
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Matthew Schumacher @ 2018-12-05 23:54 UTC (permalink / raw)
  To: qemu-devel

Public bug reported:

Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9
Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9

When this VM is running on source server:

/usr/bin/qemu-system-x86_64 -name guest=testvm,debug-threads=on -S
-object
secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-13-testvm
/master-key.aes -machine pc-q35-3.0,accel=kvm,usb=off,dump-guest-
core=off -cpu Skylake-Server-
IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer,hv_reset,hv_vendor_id=KVM
Hv -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1
-object iothread,id=iothread1 -uuid 3b00b788-ee91-4e45-80a6-c7319da71225
-no-user-config -nodefaults -chardev
socket,id=charmonitor,fd=23,server,nowait -mon
chardev=charmonitor,id=monitor,mode=control -rtc
base=localtime,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-
hpet -no-shutdown -boot strict=on -device pcie-root-
port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2
-device pcie-root-
port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 -device pcie-
pci-bridge,id=pci.3,bus=pci.1,addr=0x0 -device pcie-root-
port,port=0x12,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x2 -device pcie-
root-port,port=0x13,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x3 -device
piix3-usb-uhci,id=usb,bus=pci.3,addr=0x1 -device virtio-scsi-
pci,iothread=iothread1,id=scsi0,bus=pci.4,addr=0x0 -drive
file=/dev/zvol/datastore/vm/testvm-vda,format=raw,if=none,id=drive-
scsi0-0-0-0,cache=writeback,aio=threads -device scsi-
hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-
scsi0-0-0-0,id=scsi0-0-0-0,bootindex=2,write-cache=on -drive if=none,id
=drive-sata0-0-4,media=cdrom,readonly=on -device ide-cd,bus=ide.4,drive
=drive-sata0-0-4,id=sata0-0-4,bootindex=1 -netdev
tap,fd=25,id=hostnet0,vhost=on,vhostfd=26 -device virtio-net-
pci,netdev=hostnet0,id=net0,mac=52:54:00:a2:b7:a1,bus=pci.2,addr=0x0
-device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device
cirrus-vga,id=video0,bus=pcie.0,addr=0x1 -s -sandbox
on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny
-msg timestamp=on

I try to migrate it and the disks to the other side:

virsh migrate --live --undefinesource --persistent --verbose --copy-
storage-all testvm qemu+ssh://wasvirt1/system

We get to 99% then hang with both sides in the pause state.

Source server is stuck here:
(gdb) bt full
#0  0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6
#1  0x000000000086167b in qemu_poll_ns (fds=<optimized out>, nfds=nfds@entry=1, timeout=<optimized out>) at util/qemu-timer.c:322
#2  0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=blocking@entry=true) at util/aio-posix.c:629
        node = <optimized out>
        i = <optimized out>
        ret = 0
        progress = <optimized out>
        timeout = <optimized out>
        start = <optimized out>
        __PRETTY_FUNCTION__ = "aio_poll"
#3  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:62
        waited_ = <optimized out>
        wait_ = 0x2ba563c
        ctx_ = 0x2109bb0
        bs_ = 0x2ba2400
        client = 0x31287e0
        client = <optimized out>
        request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
#4  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:965
        client = <optimized out>
        request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
#5  0x00000000007de5ca in nbd_close (bs=<optimized out>) at block/nbd.c:491
        s = 0x31287e0
#6  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352
        ban = <optimized out>
        ban_next = <optimized out>
        child = <optimized out>
        next = <optimized out>
#7  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560
#8  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616
#9  0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359
        ban = <optimized out>
        ban_next = <optimized out>
        child = <optimized out>
        next = <optimized out>
#10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560
#11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616
#12 0x0000000000785784 in block_job_remove_all_bdrv (job=job@entry=0x2f32570) at blockjob.c:200
        c = 0x23bac30
        l = 0x20dd330 = {0x23bac30, 0x2b89410}
#13 0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=0x7f326407a350) at block/mirror.c:700
        s = 0x2f32570
        bjob = 0x2f32570
        data = 0x7f326407a350
        bs_opaque = 0x30d5600
        replace_aio_context = <optimized out>
        src = 0x2131080
        target_bs = 0x2af96f0
        mirror_top_bs = 0x210eb70
        local_err = 0x0
#14 0x0000000000786452 in job_defer_to_main_loop_bh (opaque=0x7f32640786a0) at job.c:973
        data = 0x7f32640786a0
        job = <optimized out>
        aio_context = 0x2109bb0
#15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@entry=0x21044e0) at util/async.c:118
---Type <return> to continue, or q <return> to quit---
        bh = <optimized out>
        bhp = <optimized out>
        next = 0x2ea86e0
        ret = 1
        deleted = false
#16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-posix.c:436
#17 0x000000000085fc1e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at util/async.c:261
        ctx = <optimized out>
#18 0x00007f327f17d797 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0
#19 0x00000000008622ed in main_loop_wait () at util/main-loop.c:215
        context = 0x2104900
        pfds = <optimized out>
        context = 0x2104900
        ret = 1
        ret = 1
        timeout = 4294967295
        timeout_ns = <optimized out>
#20 0x00000000008622ed in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:238
        context = 0x2104900
        ret = 1
        ret = 1
        timeout = 4294967295
        timeout_ns = <optimized out>
#21 0x00000000008622ed in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
        ret = 1
        timeout = 4294967295
        timeout_ns = <optimized out>
#22 0x0000000000595dee in main_loop () at vl.c:1866
#23 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
        i = <optimized out>
        snapshot = 0
        linux_boot = <optimized out>
        initrd_filename = 0x0
        kernel_filename = <optimized out>
        kernel_cmdline = <optimized out>
        boot_order = 0x918f44 "cad"
        boot_once = 0x0
        ds = <optimized out>
        opts = <optimized out>
        machine_opts = <optimized out>
        icount_opts = <optimized out>
        accel_opts = 0x0
        olist = <optimized out>
        optind = 71
        optarg = 0x7ffdfc94df69 "timestamp=on"
        loadvm = 0x0
        machine_class = 0x0
        cpu_model = 0x7ffdfc94d864 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
        vga_model = 0x0
        qtest_chrdev = 0x0
        qtest_log = 0x0
        pid_file = <optimized out>
        incoming = 0x0
        userconfig = <optimized out>
---Type <return> to continue, or q <return> to quit---
        nographic = false
        display_remote = <optimized out>
        log_mask = <optimized out>
        log_file = <optimized out>
        trace_file = <optimized out>
        maxram_size = 4294967296
        ram_slots = 0
        vmstate_dump_file = 0x0
        main_loop_err = 0x0
        err = 0x0
        list_data_dirs = false
        dir = <optimized out>
        dirs = <optimized out>
        bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170}
        __func__ = "main"

Strace shows:
ppoll([{fd=9, events=POLLIN|POLLERR|POLLHUP}], 1, NULL, NULL, 8

Which points to this:

ls -al /proc/2286/fd/9
lrwx------    1 root     users           64 Dec  5 13:04 /proc/2286/fd/9 -> anon_inode:[eventfd]

The dest side is stuck here:

(gdb) bt full
#0  0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6
#1  0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999926258) at util/qemu-timer.c:334
        ts = {tv_sec = 2, tv_nsec = 999926258}
Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
#2  0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233
        context = 0x2142900
        ret = <optimized out>
        ret = -1295041038
        timeout = 4294967295
        timeout_ns = <optimized out>
#3  0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
        ret = -1295041038
        timeout = 4294967295
        timeout_ns = <optimized out>
#4  0x0000000000595dee in main_loop () at vl.c:1866
#5  0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
        i = <optimized out>
        snapshot = 0
        linux_boot = <optimized out>
        initrd_filename = 0x0
        kernel_filename = <optimized out>
        kernel_cmdline = <optimized out>
        boot_order = 0x918f44 "cad"
        boot_once = 0x0
        ds = <optimized out>
        opts = <optimized out>
        machine_opts = <optimized out>
        icount_opts = <optimized out>
        accel_opts = 0x0
        olist = <optimized out>
        optind = 73
        optarg = 0x7ffdd6ee8f69 "timestamp=on"
        loadvm = 0x0
        machine_class = 0x0
        cpu_model = 0x7ffdd6ee8854 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
        vga_model = 0x0
        qtest_chrdev = 0x0
        qtest_log = 0x0
        pid_file = <optimized out>
        incoming = 0x7ffdd6ee8f0a "defer"
        userconfig = <optimized out>
        nographic = false
        display_remote = <optimized out>
        log_mask = <optimized out>
        log_file = <optimized out>
        trace_file = <optimized out>
        maxram_size = 4294967296
        ram_slots = 0
        vmstate_dump_file = 0x0
        main_loop_err = 0x0
        err = 0x0
        list_data_dirs = false
        dir = <optimized out>
        dirs = <optimized out>
        bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630}
---Type <return> to continue, or q <return> to quit---
        __func__ = "main"

Strace shows this over and over:
ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout)

lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/10 -> anon_inode:[eventfd]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/21 -> socket:[42631161]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/22 -> socket:[42631165]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/23 -> socket:[42631167]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/24 -> socket:[42631168]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/27 -> socket:[42690422]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/6 -> anon_inode:[eventfd]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/7 -> anon_inode:[signalfd]
lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/9 -> anon_inode:[eventfd]

** Affects: qemu
     Importance: Undecided
         Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1807052

Title:
  Qemu hangs during migration

Status in QEMU:
  New

Bug description:
  Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9
  Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9

  When this VM is running on source server:

  /usr/bin/qemu-system-x86_64 -name guest=testvm,debug-threads=on -S
  -object
  secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-13-testvm
  /master-key.aes -machine pc-q35-3.0,accel=kvm,usb=off,dump-guest-
  core=off -cpu Skylake-Server-
  IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer,hv_reset,hv_vendor_id=KVM
  Hv -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1
  -object iothread,id=iothread1 -uuid
  3b00b788-ee91-4e45-80a6-c7319da71225 -no-user-config -nodefaults
  -chardev socket,id=charmonitor,fd=23,server,nowait -mon
  chardev=charmonitor,id=monitor,mode=control -rtc
  base=localtime,driftfix=slew -global kvm-pit.lost_tick_policy=delay
  -no-hpet -no-shutdown -boot strict=on -device pcie-root-
  port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2
  -device pcie-root-
  port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 -device
  pcie-pci-bridge,id=pci.3,bus=pci.1,addr=0x0 -device pcie-root-
  port,port=0x12,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x2 -device
  pcie-root-port,port=0x13,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x3
  -device piix3-usb-uhci,id=usb,bus=pci.3,addr=0x1 -device virtio-scsi-
  pci,iothread=iothread1,id=scsi0,bus=pci.4,addr=0x0 -drive
  file=/dev/zvol/datastore/vm/testvm-vda,format=raw,if=none,id=drive-
  scsi0-0-0-0,cache=writeback,aio=threads -device scsi-
  hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-
  scsi0-0-0-0,id=scsi0-0-0-0,bootindex=2,write-cache=on -drive
  if=none,id=drive-sata0-0-4,media=cdrom,readonly=on -device ide-
  cd,bus=ide.4,drive=drive-sata0-0-4,id=sata0-0-4,bootindex=1 -netdev
  tap,fd=25,id=hostnet0,vhost=on,vhostfd=26 -device virtio-net-
  pci,netdev=hostnet0,id=net0,mac=52:54:00:a2:b7:a1,bus=pci.2,addr=0x0
  -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device
  cirrus-vga,id=video0,bus=pcie.0,addr=0x1 -s -sandbox
  on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny
  -msg timestamp=on

  I try to migrate it and the disks to the other side:

  virsh migrate --live --undefinesource --persistent --verbose --copy-
  storage-all testvm qemu+ssh://wasvirt1/system

  We get to 99% then hang with both sides in the pause state.

  Source server is stuck here:
  (gdb) bt full
  #0  0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6
  #1  0x000000000086167b in qemu_poll_ns (fds=<optimized out>, nfds=nfds@entry=1, timeout=<optimized out>) at util/qemu-timer.c:322
  #2  0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=blocking@entry=true) at util/aio-posix.c:629
          node = <optimized out>
          i = <optimized out>
          ret = 0
          progress = <optimized out>
          timeout = <optimized out>
          start = <optimized out>
          __PRETTY_FUNCTION__ = "aio_poll"
  #3  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:62
          waited_ = <optimized out>
          wait_ = 0x2ba563c
          ctx_ = 0x2109bb0
          bs_ = 0x2ba2400
          client = 0x31287e0
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #4  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:965
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #5  0x00000000007de5ca in nbd_close (bs=<optimized out>) at block/nbd.c:491
          s = 0x31287e0
  #6  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #7  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560
  #8  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616
  #9  0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560
  #11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616
  #12 0x0000000000785784 in block_job_remove_all_bdrv (job=job@entry=0x2f32570) at blockjob.c:200
          c = 0x23bac30
          l = 0x20dd330 = {0x23bac30, 0x2b89410}
  #13 0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=0x7f326407a350) at block/mirror.c:700
          s = 0x2f32570
          bjob = 0x2f32570
          data = 0x7f326407a350
          bs_opaque = 0x30d5600
          replace_aio_context = <optimized out>
          src = 0x2131080
          target_bs = 0x2af96f0
          mirror_top_bs = 0x210eb70
          local_err = 0x0
  #14 0x0000000000786452 in job_defer_to_main_loop_bh (opaque=0x7f32640786a0) at job.c:973
          data = 0x7f32640786a0
          job = <optimized out>
          aio_context = 0x2109bb0
  #15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@entry=0x21044e0) at util/async.c:118
  ---Type <return> to continue, or q <return> to quit---
          bh = <optimized out>
          bhp = <optimized out>
          next = 0x2ea86e0
          ret = 1
          deleted = false
  #16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-posix.c:436
  #17 0x000000000085fc1e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at util/async.c:261
          ctx = <optimized out>
  #18 0x00007f327f17d797 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0
  #19 0x00000000008622ed in main_loop_wait () at util/main-loop.c:215
          context = 0x2104900
          pfds = <optimized out>
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #20 0x00000000008622ed in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:238
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #21 0x00000000008622ed in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #22 0x0000000000595dee in main_loop () at vl.c:1866
  #23 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 71
          optarg = 0x7ffdfc94df69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdfc94d864 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x0
          userconfig = <optimized out>
  ---Type <return> to continue, or q <return> to quit---
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170}
          __func__ = "main"

  Strace shows:
  ppoll([{fd=9, events=POLLIN|POLLERR|POLLHUP}], 1, NULL, NULL, 8

  Which points to this:

  ls -al /proc/2286/fd/9
  lrwx------    1 root     users           64 Dec  5 13:04 /proc/2286/fd/9 -> anon_inode:[eventfd]

  The dest side is stuck here:

  (gdb) bt full
  #0  0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6
  #1  0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999926258) at util/qemu-timer.c:334
          ts = {tv_sec = 2, tv_nsec = 999926258}
  Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
  #2  0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233
          context = 0x2142900
          ret = <optimized out>
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #3  0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #4  0x0000000000595dee in main_loop () at vl.c:1866
  #5  0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 73
          optarg = 0x7ffdd6ee8f69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdd6ee8854 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x7ffdd6ee8f0a "defer"
          userconfig = <optimized out>
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630}
  ---Type <return> to continue, or q <return> to quit---
          __func__ = "main"

  Strace shows this over and over:
  ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout)

  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/10 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/21 -> socket:[42631161]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/22 -> socket:[42631165]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/23 -> socket:[42631167]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/24 -> socket:[42631168]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/27 -> socket:[42690422]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/6 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/7 -> anon_inode:[signalfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/9 -> anon_inode:[eventfd]

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1807052/+subscriptions

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Qemu-devel] [Bug 1807052] Re: Qemu hangs during migration
  2018-12-05 23:54 [Qemu-devel] [Bug 1807052] [NEW] Qemu hangs during migration Matthew Schumacher
@ 2018-12-05 23:57 ` Matthew Schumacher
  2021-04-20  6:49 ` Thomas Huth
  2021-06-20  4:17 ` Launchpad Bug Tracker
  2 siblings, 0 replies; 4+ messages in thread
From: Matthew Schumacher @ 2018-12-05 23:57 UTC (permalink / raw)
  To: qemu-devel

If I remove iothreads and writeback caching, it seems more reliable, but
I can still get it to hang.

This time the source server shows the VM as running, backtrace looks
like:

(gdb) bt full
#0  0x00007f27eab0028c in __lll_lock_wait () at /lib64/libpthread.so.0
#1  0x00007f27eaaf9d35 in pthread_mutex_lock () at /lib64/libpthread.so.0
#2  0x0000000000865419 in qemu_mutex_lock_impl (mutex=mutex@entry=0x115b8e0 <qemu_global_mutex>, file=file@entry=0x8fdf14 "/tmp/qemu-3.0.0/cpus.c", line=line@entry=1768)
    at util/qemu-thread-posix.c:66
        err = <optimized out>
        __PRETTY_FUNCTION__ = "qemu_mutex_lock_impl"
        __func__ = "qemu_mutex_lock_impl"
#3  0x0000000000477578 in qemu_mutex_lock_iothread () at /tmp/qemu-3.0.0/cpus.c:1768
#4  0x00000000008622b0 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:236
        context = 0x1e72810
        ret = 1
        ret = 1
        timeout = 4294967295
        timeout_ns = <optimized out>
#5  0x00000000008622b0 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
        ret = 1
        timeout = 4294967295
        timeout_ns = <optimized out>
#6  0x0000000000595dee in main_loop () at vl.c:1866
#7  0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
        i = <optimized out>
        snapshot = 0
        linux_boot = <optimized out>
        initrd_filename = 0x0
        kernel_filename = <optimized out>
        kernel_cmdline = <optimized out>
        boot_order = 0x918f44 "cad"
        boot_once = 0x0
        ds = <optimized out>
        opts = <optimized out>
        machine_opts = <optimized out>
        icount_opts = <optimized out>
        accel_opts = 0x0
        olist = <optimized out>
        optind = 71
        optarg = 0x7fff5edcff69 "timestamp=on"
        loadvm = 0x0
        machine_class = 0x0
        cpu_model = 0x7fff5edcf88a "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
        vga_model = 0x0
        qtest_chrdev = 0x0
        qtest_log = 0x0
        pid_file = <optimized out>
        incoming = 0x7fff5edcff0a "defer"
        userconfig = <optimized out>
        nographic = false
        display_remote = <optimized out>
        log_mask = <optimized out>
        log_file = <optimized out>
        trace_file = <optimized out>
        maxram_size = 4294967296
        ram_slots = 0
        vmstate_dump_file = 0x0
        main_loop_err = 0x0
---Type <return> to continue, or q <return> to quit---
        err = 0x0
        list_data_dirs = false
        dir = <optimized out>
        dirs = <optimized out>
        bdo_queue = {sqh_first = 0x0, sqh_last = 0x7fff5edcd670}
        __func__ = "main"


Dest server is paused, and looks like this:

#0  0x00007f11c48bc3c1 in ppoll () at /lib64/libc.so.6
#1  0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999892383) at util/qemu-timer.c:334
        ts = {tv_sec = 2, tv_nsec = 999892383}
Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
#2  0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233
        context = 0x2342810
        ret = <optimized out>
        ret = -1295074913
        timeout = 4294967295
        timeout_ns = <optimized out>
#3  0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
        ret = -1295074913
        timeout = 4294967295
        timeout_ns = <optimized out>
#4  0x0000000000595dee in main_loop () at vl.c:1866
#5  0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
        i = <optimized out>
        snapshot = 0
        linux_boot = <optimized out>
        initrd_filename = 0x0
        kernel_filename = <optimized out>
        kernel_cmdline = <optimized out>
        boot_order = 0x918f44 "cad"
        boot_once = 0x0
        ds = <optimized out>
        opts = <optimized out>
        machine_opts = <optimized out>
        icount_opts = <optimized out>
        accel_opts = 0x0
        olist = <optimized out>
        optind = 71
        optarg = 0x7ffe6b899f69 "timestamp=on"
        loadvm = 0x0
        machine_class = 0x0
        cpu_model = 0x7ffe6b89988a "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
        vga_model = 0x0
        qtest_chrdev = 0x0
        qtest_log = 0x0
        pid_file = <optimized out>
        incoming = 0x7ffe6b899f0a "defer"
        userconfig = <optimized out>
        nographic = false
        display_remote = <optimized out>
        log_mask = <optimized out>
        log_file = <optimized out>
        trace_file = <optimized out>
        maxram_size = 4294967296
        ram_slots = 0
        vmstate_dump_file = 0x0
        main_loop_err = 0x0
        err = 0x0
        list_data_dirs = false
        dir = <optimized out>
        dirs = <optimized out>
        bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffe6b8988e0}
---Type <return> to continue, or q <return> to quit---
        __func__ = "main"

Honestly, this looks pretty much like the same bug.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1807052

Title:
  Qemu hangs during migration

Status in QEMU:
  New

Bug description:
  Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9
  Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9

  When this VM is running on source server:

  /usr/bin/qemu-system-x86_64 -name guest=testvm,debug-threads=on -S
  -object
  secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-13-testvm
  /master-key.aes -machine pc-q35-3.0,accel=kvm,usb=off,dump-guest-
  core=off -cpu Skylake-Server-
  IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer,hv_reset,hv_vendor_id=KVM
  Hv -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1
  -object iothread,id=iothread1 -uuid
  3b00b788-ee91-4e45-80a6-c7319da71225 -no-user-config -nodefaults
  -chardev socket,id=charmonitor,fd=23,server,nowait -mon
  chardev=charmonitor,id=monitor,mode=control -rtc
  base=localtime,driftfix=slew -global kvm-pit.lost_tick_policy=delay
  -no-hpet -no-shutdown -boot strict=on -device pcie-root-
  port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2
  -device pcie-root-
  port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 -device
  pcie-pci-bridge,id=pci.3,bus=pci.1,addr=0x0 -device pcie-root-
  port,port=0x12,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x2 -device
  pcie-root-port,port=0x13,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x3
  -device piix3-usb-uhci,id=usb,bus=pci.3,addr=0x1 -device virtio-scsi-
  pci,iothread=iothread1,id=scsi0,bus=pci.4,addr=0x0 -drive
  file=/dev/zvol/datastore/vm/testvm-vda,format=raw,if=none,id=drive-
  scsi0-0-0-0,cache=writeback,aio=threads -device scsi-
  hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-
  scsi0-0-0-0,id=scsi0-0-0-0,bootindex=2,write-cache=on -drive
  if=none,id=drive-sata0-0-4,media=cdrom,readonly=on -device ide-
  cd,bus=ide.4,drive=drive-sata0-0-4,id=sata0-0-4,bootindex=1 -netdev
  tap,fd=25,id=hostnet0,vhost=on,vhostfd=26 -device virtio-net-
  pci,netdev=hostnet0,id=net0,mac=52:54:00:a2:b7:a1,bus=pci.2,addr=0x0
  -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device
  cirrus-vga,id=video0,bus=pcie.0,addr=0x1 -s -sandbox
  on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny
  -msg timestamp=on

  I try to migrate it and the disks to the other side:

  virsh migrate --live --undefinesource --persistent --verbose --copy-
  storage-all testvm qemu+ssh://wasvirt1/system

  We get to 99%, then it hangs with both sides in the paused state.

  Source server is stuck here:
  (gdb) bt full
  #0  0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6
  #1  0x000000000086167b in qemu_poll_ns (fds=<optimized out>, nfds=nfds@entry=1, timeout=<optimized out>) at util/qemu-timer.c:322
  #2  0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=blocking@entry=true) at util/aio-posix.c:629
          node = <optimized out>
          i = <optimized out>
          ret = 0
          progress = <optimized out>
          timeout = <optimized out>
          start = <optimized out>
          __PRETTY_FUNCTION__ = "aio_poll"
  #3  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:62
          waited_ = <optimized out>
          wait_ = 0x2ba563c
          ctx_ = 0x2109bb0
          bs_ = 0x2ba2400
          client = 0x31287e0
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #4  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:965
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #5  0x00000000007de5ca in nbd_close (bs=<optimized out>) at block/nbd.c:491
          s = 0x31287e0
  #6  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #7  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560
  #8  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616
  #9  0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560
  #11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616
  #12 0x0000000000785784 in block_job_remove_all_bdrv (job=job@entry=0x2f32570) at blockjob.c:200
          c = 0x23bac30
          l = 0x20dd330 = {0x23bac30, 0x2b89410}
  #13 0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=0x7f326407a350) at block/mirror.c:700
          s = 0x2f32570
          bjob = 0x2f32570
          data = 0x7f326407a350
          bs_opaque = 0x30d5600
          replace_aio_context = <optimized out>
          src = 0x2131080
          target_bs = 0x2af96f0
          mirror_top_bs = 0x210eb70
          local_err = 0x0
  #14 0x0000000000786452 in job_defer_to_main_loop_bh (opaque=0x7f32640786a0) at job.c:973
          data = 0x7f32640786a0
          job = <optimized out>
          aio_context = 0x2109bb0
  #15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@entry=0x21044e0) at util/async.c:118
  ---Type <return> to continue, or q <return> to quit---
          bh = <optimized out>
          bhp = <optimized out>
          next = 0x2ea86e0
          ret = 1
          deleted = false
  #16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-posix.c:436
  #17 0x000000000085fc1e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at util/async.c:261
          ctx = <optimized out>
  #18 0x00007f327f17d797 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0
  #19 0x00000000008622ed in main_loop_wait () at util/main-loop.c:215
          context = 0x2104900
          pfds = <optimized out>
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #20 0x00000000008622ed in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:238
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #21 0x00000000008622ed in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #22 0x0000000000595dee in main_loop () at vl.c:1866
  #23 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 71
          optarg = 0x7ffdfc94df69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdfc94d864 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x0
          userconfig = <optimized out>
  ---Type <return> to continue, or q <return> to quit---
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170}
          __func__ = "main"

  Strace shows:
  ppoll([{fd=9, events=POLLIN|POLLERR|POLLHUP}], 1, NULL, NULL, 8

  Which points to this:

  ls -al /proc/2286/fd/9
  lrwx------    1 root     users           64 Dec  5 13:04 /proc/2286/fd/9 -> anon_inode:[eventfd]

  The dest side is stuck here:

  (gdb) bt full
  #0  0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6
  #1  0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999926258) at util/qemu-timer.c:334
          ts = {tv_sec = 2, tv_nsec = 999926258}
  Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
  #2  0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233
          context = 0x2142900
          ret = <optimized out>
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #3  0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #4  0x0000000000595dee in main_loop () at vl.c:1866
  #5  0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 73
          optarg = 0x7ffdd6ee8f69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdd6ee8854 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x7ffdd6ee8f0a "defer"
          userconfig = <optimized out>
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630}
  ---Type <return> to continue, or q <return> to quit---
          __func__ = "main"

  Strace shows this over and over:
  ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout)

  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/10 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/21 -> socket:[42631161]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/22 -> socket:[42631165]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/23 -> socket:[42631167]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/24 -> socket:[42631168]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/27 -> socket:[42690422]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/6 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/7 -> anon_inode:[signalfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/9 -> anon_inode:[eventfd]

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1807052/+subscriptions

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug 1807052] Re: Qemu hangs during migration
  2018-12-05 23:54 [Qemu-devel] [Bug 1807052] [NEW] Qemu hangs during migration Matthew Schumacher
  2018-12-05 23:57 ` [Qemu-devel] [Bug 1807052] " Matthew Schumacher
@ 2021-04-20  6:49 ` Thomas Huth
  2021-06-20  4:17 ` Launchpad Bug Tracker
  2 siblings, 0 replies; 4+ messages in thread
From: Thomas Huth @ 2021-04-20  6:49 UTC (permalink / raw)
  To: qemu-devel

The QEMU project is currently considering moving its bug tracking to another system. For this we need to know which bugs are still valid and which could be closed already. Thus we are setting older bugs to "Incomplete" now.
If you still think this bug report here is valid, then please switch the state back to "New" within the next 60 days, otherwise this report will be marked as "Expired". Or mark it as "Fix Released" if the problem has been solved with a newer version of QEMU already. Thank you and sorry for the inconvenience.

** Changed in: qemu
       Status: New => Incomplete

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1807052

Title:
  Qemu hangs during migration

Status in QEMU:
  Incomplete

Bug description:
  Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9
  Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9

  When this VM is running on source server:

  /usr/bin/qemu-system-x86_64 -name guest=testvm,debug-threads=on -S
  -object
  secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-13-testvm
  /master-key.aes -machine pc-q35-3.0,accel=kvm,usb=off,dump-guest-
  core=off -cpu Skylake-Server-
  IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer,hv_reset,hv_vendor_id=KVM
  Hv -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1
  -object iothread,id=iothread1 -uuid
  3b00b788-ee91-4e45-80a6-c7319da71225 -no-user-config -nodefaults
  -chardev socket,id=charmonitor,fd=23,server,nowait -mon
  chardev=charmonitor,id=monitor,mode=control -rtc
  base=localtime,driftfix=slew -global kvm-pit.lost_tick_policy=delay
  -no-hpet -no-shutdown -boot strict=on -device pcie-root-
  port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2
  -device pcie-root-
  port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 -device
  pcie-pci-bridge,id=pci.3,bus=pci.1,addr=0x0 -device pcie-root-
  port,port=0x12,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x2 -device
  pcie-root-port,port=0x13,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x3
  -device piix3-usb-uhci,id=usb,bus=pci.3,addr=0x1 -device virtio-scsi-
  pci,iothread=iothread1,id=scsi0,bus=pci.4,addr=0x0 -drive
  file=/dev/zvol/datastore/vm/testvm-vda,format=raw,if=none,id=drive-
  scsi0-0-0-0,cache=writeback,aio=threads -device scsi-
  hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-
  scsi0-0-0-0,id=scsi0-0-0-0,bootindex=2,write-cache=on -drive
  if=none,id=drive-sata0-0-4,media=cdrom,readonly=on -device ide-
  cd,bus=ide.4,drive=drive-sata0-0-4,id=sata0-0-4,bootindex=1 -netdev
  tap,fd=25,id=hostnet0,vhost=on,vhostfd=26 -device virtio-net-
  pci,netdev=hostnet0,id=net0,mac=52:54:00:a2:b7:a1,bus=pci.2,addr=0x0
  -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device
  cirrus-vga,id=video0,bus=pcie.0,addr=0x1 -s -sandbox
  on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny
  -msg timestamp=on

  I try to migrate it and the disks to the other side:

  virsh migrate --live --undefinesource --persistent --verbose --copy-
  storage-all testvm qemu+ssh://wasvirt1/system

  We get to 99%, then it hangs with both sides in the paused state.

  Source server is stuck here:
  (gdb) bt full
  #0  0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6
  #1  0x000000000086167b in qemu_poll_ns (fds=<optimized out>, nfds=nfds@entry=1, timeout=<optimized out>) at util/qemu-timer.c:322
  #2  0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=blocking@entry=true) at util/aio-posix.c:629
          node = <optimized out>
          i = <optimized out>
          ret = 0
          progress = <optimized out>
          timeout = <optimized out>
          start = <optimized out>
          __PRETTY_FUNCTION__ = "aio_poll"
  #3  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:62
          waited_ = <optimized out>
          wait_ = 0x2ba563c
          ctx_ = 0x2109bb0
          bs_ = 0x2ba2400
          client = 0x31287e0
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #4  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:965
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #5  0x00000000007de5ca in nbd_close (bs=<optimized out>) at block/nbd.c:491
          s = 0x31287e0
  #6  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #7  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560
  #8  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616
  #9  0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560
  #11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616
  #12 0x0000000000785784 in block_job_remove_all_bdrv (job=job@entry=0x2f32570) at blockjob.c:200
          c = 0x23bac30
          l = 0x20dd330 = {0x23bac30, 0x2b89410}
  #13 0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=0x7f326407a350) at block/mirror.c:700
          s = 0x2f32570
          bjob = 0x2f32570
          data = 0x7f326407a350
          bs_opaque = 0x30d5600
          replace_aio_context = <optimized out>
          src = 0x2131080
          target_bs = 0x2af96f0
          mirror_top_bs = 0x210eb70
          local_err = 0x0
  #14 0x0000000000786452 in job_defer_to_main_loop_bh (opaque=0x7f32640786a0) at job.c:973
          data = 0x7f32640786a0
          job = <optimized out>
          aio_context = 0x2109bb0
  #15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@entry=0x21044e0) at util/async.c:118
  ---Type <return> to continue, or q <return> to quit---
          bh = <optimized out>
          bhp = <optimized out>
          next = 0x2ea86e0
          ret = 1
          deleted = false
  #16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-posix.c:436
  #17 0x000000000085fc1e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at util/async.c:261
          ctx = <optimized out>
  #18 0x00007f327f17d797 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0
  #19 0x00000000008622ed in main_loop_wait () at util/main-loop.c:215
          context = 0x2104900
          pfds = <optimized out>
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #20 0x00000000008622ed in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:238
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #21 0x00000000008622ed in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #22 0x0000000000595dee in main_loop () at vl.c:1866
  #23 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 71
          optarg = 0x7ffdfc94df69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdfc94d864 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x0
          userconfig = <optimized out>
  ---Type <return> to continue, or q <return> to quit---
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170}
          __func__ = "main"

  Strace shows:
  ppoll([{fd=9, events=POLLIN|POLLERR|POLLHUP}], 1, NULL, NULL, 8

  Which points to this:

  ls -al /proc/2286/fd/9
  lrwx------    1 root     users           64 Dec  5 13:04 /proc/2286/fd/9 -> anon_inode:[eventfd]

  The dest side is stuck here:

  (gdb) bt full
  #0  0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6
  #1  0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999926258) at util/qemu-timer.c:334
          ts = {tv_sec = 2, tv_nsec = 999926258}
  Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
  #2  0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233
          context = 0x2142900
          ret = <optimized out>
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #3  0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #4  0x0000000000595dee in main_loop () at vl.c:1866
  #5  0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 73
          optarg = 0x7ffdd6ee8f69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdd6ee8854 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x7ffdd6ee8f0a "defer"
          userconfig = <optimized out>
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630}
  ---Type <return> to continue, or q <return> to quit---
          __func__ = "main"

  Strace shows this over and over:
  ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout)

  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/10 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/21 -> socket:[42631161]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/22 -> socket:[42631165]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/23 -> socket:[42631167]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/24 -> socket:[42631168]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/27 -> socket:[42690422]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/6 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/7 -> anon_inode:[signalfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/9 -> anon_inode:[eventfd]

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1807052/+subscriptions


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Bug 1807052] Re: Qemu hangs during migration
  2018-12-05 23:54 [Qemu-devel] [Bug 1807052] [NEW] Qemu hangs during migration Matthew Schumacher
  2018-12-05 23:57 ` [Qemu-devel] [Bug 1807052] " Matthew Schumacher
  2021-04-20  6:49 ` Thomas Huth
@ 2021-06-20  4:17 ` Launchpad Bug Tracker
  2 siblings, 0 replies; 4+ messages in thread
From: Launchpad Bug Tracker @ 2021-06-20  4:17 UTC (permalink / raw)
  To: qemu-devel

[Expired for QEMU because there has been no activity for 60 days.]

** Changed in: qemu
       Status: Incomplete => Expired

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1807052

Title:
  Qemu hangs during migration

Status in QEMU:
  Expired

Bug description:
  Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9
  Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9

  When this VM is running on source server:

  /usr/bin/qemu-system-x86_64 -name guest=testvm,debug-threads=on -S
  -object
  secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-13-testvm
  /master-key.aes -machine pc-q35-3.0,accel=kvm,usb=off,dump-guest-
  core=off -cpu Skylake-Server-
  IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer,hv_reset,hv_vendor_id=KVM
  Hv -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1
  -object iothread,id=iothread1 -uuid
  3b00b788-ee91-4e45-80a6-c7319da71225 -no-user-config -nodefaults
  -chardev socket,id=charmonitor,fd=23,server,nowait -mon
  chardev=charmonitor,id=monitor,mode=control -rtc
  base=localtime,driftfix=slew -global kvm-pit.lost_tick_policy=delay
  -no-hpet -no-shutdown -boot strict=on -device pcie-root-
  port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2
  -device pcie-root-
  port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 -device
  pcie-pci-bridge,id=pci.3,bus=pci.1,addr=0x0 -device pcie-root-
  port,port=0x12,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x2 -device
  pcie-root-port,port=0x13,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x3
  -device piix3-usb-uhci,id=usb,bus=pci.3,addr=0x1 -device virtio-scsi-
  pci,iothread=iothread1,id=scsi0,bus=pci.4,addr=0x0 -drive
  file=/dev/zvol/datastore/vm/testvm-vda,format=raw,if=none,id=drive-
  scsi0-0-0-0,cache=writeback,aio=threads -device scsi-
  hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-
  scsi0-0-0-0,id=scsi0-0-0-0,bootindex=2,write-cache=on -drive
  if=none,id=drive-sata0-0-4,media=cdrom,readonly=on -device ide-
  cd,bus=ide.4,drive=drive-sata0-0-4,id=sata0-0-4,bootindex=1 -netdev
  tap,fd=25,id=hostnet0,vhost=on,vhostfd=26 -device virtio-net-
  pci,netdev=hostnet0,id=net0,mac=52:54:00:a2:b7:a1,bus=pci.2,addr=0x0
  -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device
  cirrus-vga,id=video0,bus=pcie.0,addr=0x1 -s -sandbox
  on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny
  -msg timestamp=on

  I try to migrate it and the disks to the other side:

  virsh migrate --live --undefinesource --persistent --verbose --copy-
  storage-all testvm qemu+ssh://wasvirt1/system

  The migration gets to 99% and then hangs, with both sides in the paused state.

  Source server is stuck here:
  (gdb) bt full
  #0  0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6
  #1  0x000000000086167b in qemu_poll_ns (fds=<optimized out>, nfds=nfds@entry=1, timeout=<optimized out>) at util/qemu-timer.c:322
  #2  0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=blocking@entry=true) at util/aio-posix.c:629
          node = <optimized out>
          i = <optimized out>
          ret = 0
          progress = <optimized out>
          timeout = <optimized out>
          start = <optimized out>
          __PRETTY_FUNCTION__ = "aio_poll"
  #3  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:62
          waited_ = <optimized out>
          wait_ = 0x2ba563c
          ctx_ = 0x2109bb0
          bs_ = 0x2ba2400
          client = 0x31287e0
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #4  0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-client.c:965
          client = <optimized out>
          request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
  #5  0x00000000007de5ca in nbd_close (bs=<optimized out>) at block/nbd.c:491
          s = 0x31287e0
  #6  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #7  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560
  #8  0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616
  #9  0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359
          ban = <optimized out>
          ban_next = <optimized out>
          child = <optimized out>
          next = <optimized out>
  #10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560
  #11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616
  #12 0x0000000000785784 in block_job_remove_all_bdrv (job=job@entry=0x2f32570) at blockjob.c:200
          c = 0x23bac30
          l = 0x20dd330 = {0x23bac30, 0x2b89410}
  #13 0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=0x7f326407a350) at block/mirror.c:700
          s = 0x2f32570
          bjob = 0x2f32570
          data = 0x7f326407a350
          bs_opaque = 0x30d5600
          replace_aio_context = <optimized out>
          src = 0x2131080
          target_bs = 0x2af96f0
          mirror_top_bs = 0x210eb70
          local_err = 0x0
  #14 0x0000000000786452 in job_defer_to_main_loop_bh (opaque=0x7f32640786a0) at job.c:973
          data = 0x7f32640786a0
          job = <optimized out>
          aio_context = 0x2109bb0
  #15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@entry=0x21044e0) at util/async.c:118
  ---Type <return> to continue, or q <return> to quit---
          bh = <optimized out>
          bhp = <optimized out>
          next = 0x2ea86e0
          ret = 1
          deleted = false
  #16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-posix.c:436
  #17 0x000000000085fc1e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at util/async.c:261
          ctx = <optimized out>
  #18 0x00007f327f17d797 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0
  #19 0x00000000008622ed in main_loop_wait () at util/main-loop.c:215
          context = 0x2104900
          pfds = <optimized out>
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #20 0x00000000008622ed in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:238
          context = 0x2104900
          ret = 1
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #21 0x00000000008622ed in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = 1
          timeout = 4294967295
          timeout_ns = <optimized out>
  #22 0x0000000000595dee in main_loop () at vl.c:1866
  #23 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 71
          optarg = 0x7ffdfc94df69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdfc94d864 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x0
          userconfig = <optimized out>
  ---Type <return> to continue, or q <return> to quit---
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170}
          __func__ = "main"

  Strace shows:
  ppoll([{fd=9, events=POLLIN|POLLERR|POLLHUP}], 1, NULL, NULL, 8

  Which points to this:

  ls -al /proc/2286/fd/9
  lrwx------    1 root     users           64 Dec  5 13:04 /proc/2286/fd/9 -> anon_inode:[eventfd]

  The dest side is stuck here:

  (gdb) bt full
  #0  0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6
  #1  0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999926258) at util/qemu-timer.c:334
          ts = {tv_sec = 2, tv_nsec = 999926258}
  Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
  #2  0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233
          context = 0x2142900
          ret = <optimized out>
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #3  0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497
          ret = -1295041038
          timeout = 4294967295
          timeout_ns = <optimized out>
  #4  0x0000000000595dee in main_loop () at vl.c:1866
  #5  0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
          i = <optimized out>
          snapshot = 0
          linux_boot = <optimized out>
          initrd_filename = 0x0
          kernel_filename = <optimized out>
          kernel_cmdline = <optimized out>
          boot_order = 0x918f44 "cad"
          boot_once = 0x0
          ds = <optimized out>
          opts = <optimized out>
          machine_opts = <optimized out>
          icount_opts = <optimized out>
          accel_opts = 0x0
          olist = <optimized out>
          optind = 73
          optarg = 0x7ffdd6ee8f69 "timestamp=on"
          loadvm = 0x0
          machine_class = 0x0
          cpu_model = 0x7ffdd6ee8854 "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"...
          vga_model = 0x0
          qtest_chrdev = 0x0
          qtest_log = 0x0
          pid_file = <optimized out>
          incoming = 0x7ffdd6ee8f0a "defer"
          userconfig = <optimized out>
          nographic = false
          display_remote = <optimized out>
          log_mask = <optimized out>
          log_file = <optimized out>
          trace_file = <optimized out>
          maxram_size = 4294967296
          ram_slots = 0
          vmstate_dump_file = 0x0
          main_loop_err = 0x0
          err = 0x0
          list_data_dirs = false
          dir = <optimized out>
          dirs = <optimized out>
          bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630}
  ---Type <return> to continue, or q <return> to quit---
          __func__ = "main"

  Strace shows this over and over:
  ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout)

  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/10 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/21 -> socket:[42631161]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/22 -> socket:[42631165]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/23 -> socket:[42631167]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/24 -> socket:[42631168]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/27 -> socket:[42690422]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/6 -> anon_inode:[eventfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/7 -> anon_inode:[signalfd]
  lrwx------    1 root     users           64 Dec  5 13:15 /proc/20170/fd/9 -> anon_inode:[eventfd]

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1807052/+subscriptions


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-06-20  4:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-05 23:54 [Qemu-devel] [Bug 1807052] [NEW] Qemu hangs during migration Matthew Schumacher
2018-12-05 23:57 ` [Qemu-devel] [Bug 1807052] " Matthew Schumacher
2021-04-20  6:49 ` Thomas Huth
2021-06-20  4:17 ` Launchpad Bug Tracker

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.