All of lore.kernel.org
 help / color / mirror / Atom feed
From: Michael Roth <mdroth@linux.vnet.ibm.com>
To: libvir-list@redhat.com
Cc: qemu-devel@nongnu.org, qemu-ppc@nongnu.org, aik@ozlabs.ru,
	alex.williamson@redhat.com, abologna@redhat.com, laine@laine.org,
	pkrempa@redhat.com, berrange@redhat.com, mprivozn@redhat.com,
	sbhat@linux.vnet.ibm.com
Subject: [Qemu-devel] [RFC PATCH 5/5] qemu: hotplug: wait for VFIO group FD close before unbind
Date: Wed, 28 Jun 2017 19:25:00 -0500	[thread overview]
Message-ID: <1498695900-1648-6-git-send-email-mdroth@linux.vnet.ibm.com> (raw)
In-Reply-To: <1498695900-1648-1-git-send-email-mdroth@linux.vnet.ibm.com>

QEMU emits DEVICE_DELETED events during a device's "unparent"
callback, but some additional cleanup occurs afterward via
"finalize". In most cases libvirt can ignore the latter, but in the
case of VFIO the closing of a device's group FD happens here, which
is something libvirt needs to wait for before attempting to bind a
hostdev back to a host driver. In the case of powernv, and possibly
other host archs as well, failing to do this can lead to the host
device driver crashing due to necessary setup (like restoring default
DMA windows for the IOMMU group) not being completed yet.

We attempt to avoid this here by polling the QEMU process for open
FDs referencing /dev/vfio/<group num> and waiting for a certain period
of time. In practice the delay between the DEVICE_DELETED
event and closing of the group FD seems to be around 6 seconds, so we
set the max wait time at 15 seconds. If we time out we leave the
device in the inactiveList and bound to VFIO. We only attempt the
wait if the last hostdev from an IOMMU group is being detached and
there's reasonable expectation that the group FD will be closed soon.

There are alternatives to this approach, like adding a specific
group delete event to QEMU and handling this cleanup via and
asynchronous event handler, nut since we do a similar poll-wait for
things like KVM device passthrough this simple approach is hopefully
a reasonable starting point at least.

Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 src/libvirt_private.syms |  1 +
 src/qemu/qemu_hotplug.c  | 34 +++++++++++++++++++++++++++++--
 src/util/virfile.c       | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/util/virfile.h       |  1 +
 4 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index ba7fa39..787267c 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1657,6 +1657,7 @@ virFileIsDir;
 virFileIsExecutable;
 virFileIsLink;
 virFileIsMountPoint;
+virFileIsOpenByPid;
 virFileIsSharedFS;
 virFileIsSharedFSType;
 virFileLength;
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index af5ee6f..d200bab 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -68,6 +68,8 @@ VIR_LOG_INIT("qemu.qemu_hotplug");
 /* Wait up to 5 seconds for device removal to finish. */
 unsigned long long qemuDomainRemoveDeviceWaitTime = 1000ull * 5;
 
+/* Wait up to 15 seconds for iommu group close */
+unsigned long long qemuDomainRemoveDeviceGroupWaitTime = 1000ull * 15;
 
 /**
  * qemuDomainPrepareDisk:
@@ -3830,6 +3832,32 @@ qemuDomainRemoveSCSIVHostDevice(virQEMUDriverPtr driver,
 }
 
 static int
+qemuDomainWaitForDeviceGroupClose(virDomainObjPtr vm, int iommu_group)
+{
+    char *group_path;
+    unsigned long long remaining_ms = qemuDomainRemoveDeviceGroupWaitTime;
+    int rc = -1;
+
+    if (virAsprintf(&group_path, "/dev/vfio/%d", iommu_group) < 0)
+        return -1;
+
+    while ((rc = virFileIsOpenByPid(group_path, vm->pid)) == 1) {
+        if (remaining_ms <= 0)
+            break;
+        usleep(100*1000);
+        remaining_ms -= 100;
+    }
+
+    VIR_DEBUG("IOMMU group %d FD status: %d, wait time: %llu ms",
+              iommu_group, rc,
+              qemuDomainRemoveDeviceGroupWaitTime - remaining_ms);
+
+    VIR_FREE(group_path);
+    return rc;
+}
+
+
+static int
 qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
                            virDomainObjPtr vm,
                            virDomainHostdevDefPtr hostdev)
@@ -3933,8 +3961,10 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
             virPCIDeviceAddressGetIOMMUGroupNum(&hostdev->source.subsys.u.pci.addr);
         if (virHostdevPCIDeviceGroupUnbindable(driver->hostdevMgr,
                                                iommu_group)) {
-            virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr,
-                                           iommu_group);
+            if (qemuDomainWaitForDeviceGroupClose(vm, iommu_group) == 0) {
+                virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr,
+                                               iommu_group);
+            }
         }
     }
 
diff --git a/src/util/virfile.c b/src/util/virfile.c
index d444b32..29b762f 100644
--- a/src/util/virfile.c
+++ b/src/util/virfile.c
@@ -4162,3 +4162,55 @@ virFileReadValueString(char **value, const char *format, ...)
     VIR_FREE(str);
     return ret;
 }
+
+int
+virFileIsOpenByPid(const char *path, pid_t pid)
+{
+    struct dirent *ent;
+    DIR *filelist_dir;
+    char *filelist_path;
+    bool found = false;
+    int rc = -1;
+
+    if (!path || !IS_ABSOLUTE_FILE_NAME(path)) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("invalid path: %s"), path ? path : "null");
+        goto error;
+    }
+
+    if (virAsprintf(&filelist_path, "/proc/%d/fd", pid) < 0)
+        goto error;
+
+    if (virDirOpen(&filelist_dir, filelist_path) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("unable to open directory: %s"), filelist_path);
+        goto error;
+    }
+
+    while (!found &&
+           (rc = virDirRead(filelist_dir, &ent, filelist_path)) == 1) {
+        char *resolved_path = NULL;
+        char *link_path;
+        if ((rc = virAsprintf(&link_path, "%s/%s", filelist_path, ent->d_name)) < 0)
+            break;
+        if (virFileResolveLink(link_path, &resolved_path) == 0) {
+            if (resolved_path) {
+                VIR_DEBUG("checking absolute path for match (need: %s, got: %s)",
+                          path, resolved_path);
+                if (STREQ(resolved_path, path))
+                    found = true;
+                VIR_FREE(resolved_path);
+            }
+        }
+    }
+
+    VIR_DIR_CLOSE(filelist_dir);
+ error:
+    VIR_FREE(filelist_path);
+
+    VIR_DEBUG("returning, rc: %d, found: %d", rc, found);
+    if (rc < 0)
+        return rc;
+
+    return found ? 1 : 0;
+}
diff --git a/src/util/virfile.h b/src/util/virfile.h
index 57ceb80..fb86786 100644
--- a/src/util/virfile.h
+++ b/src/util/virfile.h
@@ -347,6 +347,7 @@ int virFileReadValueScaledInt(unsigned long long *value, const char *format, ...
 int virFileReadValueString(char **value, const char *format, ...)
  ATTRIBUTE_FMT_PRINTF(2, 3);
 
+int virFileIsOpenByPid(const char *path, pid_t pid);
 
 int virFileInData(int fd,
                   int *inData,
-- 
2.7.4

  parent reply	other threads:[~2017-06-29  0:26 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-29  0:24 [Qemu-devel] [RFC PATCH 0/5] hotplug: fix premature rebinding of VFIO devices to host Michael Roth
2017-06-29  0:24 ` [Qemu-devel] [RFC PATCH 1/5] virhostdev: factor release out from reattach and export it for use later Michael Roth
2017-06-29  0:24 ` [Qemu-devel] [RFC PATCH 2/5] qemu_hotplug: squash qemuDomainRemovePCIHostDevice into caller Michael Roth
2017-06-29  0:24 ` [Qemu-devel] [RFC PATCH 3/5] virpci: introduce virPCIIOMMUGroupIterate() Michael Roth
2017-06-29  0:24 ` [Qemu-devel] [RFC PATCH 4/5] qemu: hotplug: unbind VFIO devices as a group Michael Roth
2017-06-29  0:25 ` Michael Roth [this message]
2017-06-29  8:33 ` [Qemu-devel] [RFC PATCH 0/5] hotplug: fix premature rebinding of VFIO devices to host Daniel P. Berrange
2017-06-29 18:22   ` Michael Roth
2017-06-29 19:31     ` Alex Williamson
2017-06-29 19:28   ` Alex Williamson
2017-06-29 20:21     ` Michael Roth
2017-06-29 18:50 ` Laine Stump
2017-06-29 19:44   ` Alex Williamson
2017-06-30  2:27     ` Laine Stump
2017-06-29 20:59   ` Michael Roth
2017-06-30  6:59     ` Andrea Bolognani

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1498695900-1648-6-git-send-email-mdroth@linux.vnet.ibm.com \
    --to=mdroth@linux.vnet.ibm.com \
    --cc=abologna@redhat.com \
    --cc=aik@ozlabs.ru \
    --cc=alex.williamson@redhat.com \
    --cc=berrange@redhat.com \
    --cc=laine@laine.org \
    --cc=libvir-list@redhat.com \
    --cc=mprivozn@redhat.com \
    --cc=pkrempa@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=qemu-ppc@nongnu.org \
    --cc=sbhat@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.