[Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2
@ 2016-03-23 10:11 Cao jin
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
                   ` (11 more replies)
  0 siblings, 12 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:11 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

v4-v5:
   1. add back the common function 0 hotplug code in pci core.
   2. fix a sporadic device stuck on D3 problem when doing aer recovery.
   3. fix patches 5/12 ~ 9/12 as Alex sugguestion.

v3-v4:
   1. rebase patchset to fit latest master branch.
   2. modifying patches 5/10 ~ 8/10 as Alex sugguestion(Thanks).

v2-v3:
   1. fix patch 4/9, 5/9 as Alex sugguestion.
   2. patches 5/9 ~ 8/9 are made to force limiting that all vfio functions
      are combined in the same way as on the host.

v1-v2:
   1. limit all devices on same bus in guest are on same bus in host in patch 5/11.
   2. patch 05/11 ~ 09/11 has been changed.


Chen Fan (12):
  vfio: extract vfio_get_hot_reset_info as a single function
  vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  vfio: add pcie extended capability support
  vfio: add aer support for vfio device
  vfio: refine function vfio_pci_host_match
  vfio: add check host bus reset is support or not
  pci: add a pci_function_is_valid callback to check function if valid
  vfio: add check aer functionality for hotplug device
  vfio: vote the function 0 to do host bus reset when aer occurred
  vfio-pci: pass the aer error to guest
  vfio: device may stuck in D3 when doing aer recovery
  vfio: add 'aer' property to expose aercap

 hw/pci/pci.c         |  32 +++
 hw/vfio/pci.c        | 648 +++++++++++++++++++++++++++++++++++++++++++++------
 hw/vfio/pci.h        |   5 +
 include/hw/pci/pci.h |   1 +
 4 files changed, 620 insertions(+), 66 deletions(-)

-- 
1.9.3

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 01/12] vfio: extract vfio_get_hot_reset_info as a single function
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
@ 2016-03-23 10:11 ` Cao jin
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:11 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

the function is used to get affected devices by bus reset.
so here extract it, and can used for aer soon.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 66 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d091d8c..cf40f9e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1701,6 +1701,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
     }
 }
 
+/*
+ * return negative with errno, return 0 on success.
+ * if success, the point of ret_info fill with the affected device reset info.
+ *
+ */
+static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev,
+                                   struct vfio_pci_hot_reset_info **ret_info)
+{
+    struct vfio_pci_hot_reset_info *info;
+    int ret, count;
+
+    *ret_info = NULL;
+
+    info = g_malloc0(sizeof(*info));
+    info->argsz = sizeof(*info);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+    if (ret && errno != ENOSPC) {
+        ret = -errno;
+        goto error;
+    }
+
+    count = info->count;
+
+    info = g_realloc(info, sizeof(*info) +
+                     (count * sizeof(struct vfio_pci_dependent_device)));
+    info->argsz = sizeof(*info) +
+                  (count * sizeof(struct vfio_pci_dependent_device));
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+    if (ret) {
+        ret = -errno;
+        error_report("vfio: hot reset info failed: %m");
+        goto error;
+    }
+
+    *ret_info = info;
+    info = NULL;
+
+    return 0;
+error:
+    g_free(info);
+    return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1842,7 +1887,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
-    struct vfio_pci_hot_reset_info *info;
+    struct vfio_pci_hot_reset_info *info = NULL;
     struct vfio_pci_dependent_device *devices;
     struct vfio_pci_hot_reset *reset;
     int32_t *fds;
@@ -1854,12 +1899,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
     vfio_pci_pre_reset(vdev);
     vdev->vbasedev.needs_reset = false;
 
-    info = g_malloc0(sizeof(*info));
-    info->argsz = sizeof(*info);
-
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-    if (ret && errno != ENOSPC) {
-        ret = -errno;
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret) {
         if (!vdev->has_pm_reset) {
             error_report("vfio: Cannot reset device %s, "
                          "no available reset mechanism.", vdev->vbasedev.name);
@@ -1867,18 +1908,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
         goto out_single;
     }
 
-    count = info->count;
-    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
-    info->argsz = sizeof(*info) + (count * sizeof(*devices));
     devices = &info->devices[0];
-
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-    if (ret) {
-        ret = -errno;
-        error_report("vfio: hot reset info failed: %m");
-        goto out_single;
-    }
-
     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
 
     /* Verify that we have all the groups required */
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
@ 2016-03-23 10:11 ` Cao jin
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 03/12] vfio: add pcie extended capability support Cao jin
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:11 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

squeeze out vfio_pci_do_hot_reset to do host bus reset when AER recovery.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 75 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index cf40f9e..1ad47ef 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1746,6 +1746,48 @@ error:
     return ret;
 }
 
+static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev,
+                                 struct vfio_pci_hot_reset_info *info)
+{
+    VFIOGroup *group;
+    struct vfio_pci_hot_reset *reset;
+    int32_t *fds;
+    int ret, i, count;
+    struct vfio_pci_dependent_device *devices;
+
+    /* Determine how many group fds need to be passed */
+    count = 0;
+    devices = &info->devices[0];
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        for (i = 0; i < info->count; i++) {
+            if (group->groupid == devices[i].group_id) {
+                count++;
+                break;
+            }
+        }
+    }
+
+    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+    fds = &reset->group_fds[0];
+
+    /* Fill in group fds */
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        for (i = 0; i < info->count; i++) {
+            if (group->groupid == devices[i].group_id) {
+                fds[reset->count++] = group->fd;
+                break;
+            }
+        }
+    }
+
+    /* Bus reset! */
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+    g_free(reset);
+
+    return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1889,9 +1931,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
     VFIOGroup *group;
     struct vfio_pci_hot_reset_info *info = NULL;
     struct vfio_pci_dependent_device *devices;
-    struct vfio_pci_hot_reset *reset;
-    int32_t *fds;
-    int ret, i, count;
+    int ret, i;
     bool multi = false;
 
     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
@@ -1969,34 +2009,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
         goto out_single;
     }
 
-    /* Determine how many group fds need to be passed */
-    count = 0;
-    QLIST_FOREACH(group, &vfio_group_list, next) {
-        for (i = 0; i < info->count; i++) {
-            if (group->groupid == devices[i].group_id) {
-                count++;
-                break;
-            }
-        }
-    }
-
-    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
-    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
-    fds = &reset->group_fds[0];
-
-    /* Fill in group fds */
-    QLIST_FOREACH(group, &vfio_group_list, next) {
-        for (i = 0; i < info->count; i++) {
-            if (group->groupid == devices[i].group_id) {
-                fds[reset->count++] = group->fd;
-                break;
-            }
-        }
-    }
-
-    /* Bus reset! */
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
-    g_free(reset);
+    ret = vfio_pci_do_hot_reset(vdev, info);
 
     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                     ret ? "%m" : "Success");
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 03/12] vfio: add pcie extended capability support
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
@ 2016-03-23 10:11 ` Cao jin
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 04/12] vfio: add aer support for vfio device Cao jin
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:11 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

For vfio pcie device, we could expose the extended capability on
PCIE bus. due to add a new pcie capability at the tail of the chain,
in order to avoid config space overwritten, we introduce a copy config
for parsing extended caps. and rebuild the pcie extended config space.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 1ad47ef..ff14af0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1528,6 +1528,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
     return next - pos;
 }
 
+
+static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
+{
+    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
+
+    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
+        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
+        if (tmp > pos && tmp < next) {
+            next = tmp;
+        }
+    }
+
+    return next - pos;
+}
+
 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
 {
     pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
@@ -1862,16 +1877,71 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint32_t header;
+    uint16_t cap_id, next, size;
+    uint8_t cap_ver;
+    uint8_t *config;
+
+    /*
+     * pcie_add_capability always inserts the new capability at the tail
+     * of the chain.  Therefore to end up with a chain that matches the
+     * physical device, we cache the config space to avoid overwriting
+     * the original config space when we parse the extended capabilities.
+     */
+    config = g_memdup(pdev->config, vdev->config_size);
+
+    for (next = PCI_CONFIG_SPACE_SIZE; next;
+         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
+        header = pci_get_long(config + next);
+        cap_id = PCI_EXT_CAP_ID(header);
+        cap_ver = PCI_EXT_CAP_VER(header);
+
+        /*
+         * If it becomes important to configure extended capabilities to their
+         * actual size, use this as the default when it's something we don't
+         * recognize. Since QEMU doesn't actually handle many of the config
+         * accesses, exact size doesn't seem worthwhile.
+         */
+        size = vfio_ext_cap_max_size(config, next);
+
+        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
+
+        /* Use emulated next pointer to allow dropping extended caps */
+        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
+                                   PCI_EXT_CAP_NEXT_MASK);
+    }
+
+    g_free(config);
+    return 0;
+}
+
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
+    int ret;
 
     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
         !pdev->config[PCI_CAPABILITY_LIST]) {
         return 0; /* Nothing to add */
     }
 
-    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    if (ret) {
+        return ret;
+    }
+
+    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
+    if (!pci_is_express(pdev) ||
+        !pci_bus_is_express(pdev->bus) ||
+        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
+        return 0;
+    }
+
+    return vfio_add_ext_cap(vdev);
 }
 
 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 04/12] vfio: add aer support for vfio device
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (2 preceding siblings ...)
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 03/12] vfio: add pcie extended capability support Cao jin
@ 2016-03-23 10:11 ` Cao jin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 05/12] vfio: refine function vfio_pci_host_match Cao jin
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:11 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Calling pcie_aer_init to initilize aer related registers for
vfio device, then reload physical related registers to expose
device capability.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 hw/vfio/pci.h |  3 +++
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ff14af0..0516d94 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1877,6 +1877,66 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
+                          int pos, uint16_t size)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    PCIDevice *dev_iter;
+    uint8_t type;
+    uint32_t errcap;
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+        pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR,
+                            cap_ver, pos, size);
+        return 0;
+    }
+
+    dev_iter = pci_bridge_get_device(pdev->bus);
+    if (!dev_iter) {
+        goto error;
+    }
+
+    while (dev_iter) {
+        if (!pci_is_express(dev_iter)) {
+            goto error;
+        }
+
+        type = pcie_cap_get_type(dev_iter);
+        if ((type != PCI_EXP_TYPE_ROOT_PORT &&
+             type != PCI_EXP_TYPE_UPSTREAM &&
+             type != PCI_EXP_TYPE_DOWNSTREAM)) {
+            goto error;
+        }
+
+        if (!dev_iter->exp.aer_cap) {
+            goto error;
+        }
+
+        dev_iter = pci_bridge_get_device(dev_iter->bus);
+    }
+
+    errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4);
+    /*
+     * The ability to record multiple headers is depending on
+     * the state of the Multiple Header Recording Capable bit and
+     * enabled by the Multiple Header Recording Enable bit.
+     */
+    if ((errcap & PCI_ERR_CAP_MHRC) &&
+        (errcap & PCI_ERR_CAP_MHRE)) {
+        pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
+    } else {
+        pdev->exp.aer_log.log_max = 0;
+    }
+
+    pcie_cap_deverr_init(pdev);
+    return pcie_aer_init(pdev, pos, size);
+
+error:
+    error_report("vfio: Unable to enable AER for device %s, parent bus "
+                 "does not support AER signaling", vdev->vbasedev.name);
+    return -1;
+}
+
 static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1884,6 +1944,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
     uint16_t cap_id, next, size;
     uint8_t cap_ver;
     uint8_t *config;
+    int ret = 0;
 
     /*
      * pcie_add_capability always inserts the new capability at the tail
@@ -1907,16 +1968,29 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
          */
         size = vfio_ext_cap_max_size(config, next);
 
-        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
-        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
+        switch (cap_id) {
+        case PCI_EXT_CAP_ID_ERR:
+            ret = vfio_setup_aer(vdev, cap_ver, next, size);
+            break;
+        default:
+            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+            break;
+        }
+
+        if (ret) {
+            goto out;
+        }
+
+        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
 
         /* Use emulated next pointer to allow dropping extended caps */
         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                    PCI_EXT_CAP_NEXT_MASK);
     }
 
+out:
     g_free(config);
-    return 0;
+    return ret;
 }
 
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
@@ -2673,6 +2747,11 @@ static int vfio_initfn(PCIDevice *pdev)
         goto out_teardown;
     }
 
+    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        !pdev->exp.aer_cap) {
+        goto out_teardown;
+    }
+
     /* QEMU emulates all of MSI & MSIX */
     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3976f68..7b3924e 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
 #include "qemu/queue.h"
@@ -128,6 +129,8 @@ typedef struct VFIOPCIDevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
 #define VFIO_FEATURE_ENABLE_REQ_BIT 1
 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
+#define VFIO_FEATURE_ENABLE_AER_BIT 2
+#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT)
     int32_t bootindex;
     uint8_t pm_cap;
     bool has_vga;
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 05/12] vfio: refine function vfio_pci_host_match
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (3 preceding siblings ...)
  2016-03-23 10:11 ` [Qemu-devel] [patch v5 04/12] vfio: add aer support for vfio device Cao jin
@ 2016-03-23 10:12 ` Cao jin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 06/12] vfio: add check host bus reset is support or not Cao jin
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0516d94..5b23a86 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2060,14 +2060,27 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     vfio_intx_enable(vdev);
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+    if (strlen(name) != 12 ||
+        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+               &addr->bus, &addr->slot, &addr->function) != 4) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
 {
-    char tmp[13];
+    PCIHostDeviceAddress tmp;
 
-    sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
-            addr->bus, addr->slot, addr->function);
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
 
-    return (strcmp(tmp, name) == 0);
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot && tmp.function == addr->function);
 }
 
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 06/12] vfio: add check host bus reset is support or not
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (4 preceding siblings ...)
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 05/12] vfio: refine function vfio_pci_host_match Cao jin
@ 2016-03-23 10:12 ` Cao jin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

When assigning a vfio device with AER enabled, we must check whether
the device supports a host bus reset (ie. hot reset) as this may be
used by the guest OS in order to recover the device from an AER
error.  QEMU must therefore have the ability to perform a physical
host bus reset using the existing vfio APIs in response to a virtual
bus reset in the VM.  A physical bus reset affects all of the devices
on the host bus, therefore we place a few simplifying configuration
restriction on the VM:

 - All physical devices affected by a bus reset must be assigned to
   the VM with AER enabled on each and be configured on the same
   virtual bus in the VM.

 - No devices unaffected by the bus reset, be they physical, emulated,
   or paravirtual may be configured on the same virtual bus as a
   device supporting AER signaling through vfio.

In other words users wishing to enable AER on a multifunction device
need to assign all functions of the device to the same virtual bus
and enable AER support for each device.  The easiest way to
accomplish this is to identity map the physical functions to virtual
functions with multifunction enabled on the virtual device.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/vfio/pci.h |   1 +
 2 files changed, 254 insertions(+), 23 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5b23a86..939b764 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1716,6 +1716,41 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
     }
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+    if (strlen(name) != 12 ||
+        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+               &addr->bus, &addr->slot, &addr->function) != 4) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
+{
+    PCIHostDeviceAddress tmp;
+
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
+
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot && tmp.function == addr->function);
+}
+
+static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr, const char *name)
+{
+    PCIHostDeviceAddress tmp;
+
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
+
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot);
+}
+
 /*
  * return negative with errno, return 0 on success.
  * if success, the point of ret_info fill with the affected device reset info.
@@ -1877,6 +1912,199 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_device_range_limit(PCIBus *bus)
+{
+    PCIDevice *br;
+
+    br = pci_bridge_get_device(bus);
+    if (!br ||
+        !pci_is_express(br) ||
+        !(br->exp.exp_cap) ||
+        pcie_cap_is_arifwd_enabled(br)) {
+        return 255;
+    }
+
+    return 8;
+}
+
+static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
+{
+    PCIBus *bus = vdev->pdev.bus;
+    struct vfio_pci_hot_reset_info *info = NULL;
+    struct vfio_pci_dependent_device *devices;
+    VFIOGroup *group;
+    int ret, i, devfn, range_limit;
+
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret) {
+        error_setg(errp, "vfio: Cannot enable AER for device %s,"
+                   " device does not support hot reset.",
+                   vdev->vbasedev.name);
+        return;
+    }
+
+    /* List all affected devices by bus reset */
+    devices = &info->devices[0];
+
+    /* Verify that we have all the groups required */
+    for (i = 0; i < info->count; i++) {
+        PCIHostDeviceAddress host;
+        VFIOPCIDevice *tmp;
+        VFIODevice *vbasedev_iter;
+        bool found = false;
+
+        host.domain = devices[i].segment;
+        host.bus = devices[i].bus;
+        host.slot = PCI_SLOT(devices[i].devfn);
+        host.function = PCI_FUNC(devices[i].devfn);
+
+        /* Skip the current device */
+        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
+            continue;
+        }
+
+        /* Ensure we own the group of the affected device */
+        QLIST_FOREACH(group, &vfio_group_list, next) {
+            if (group->groupid == devices[i].group_id) {
+                break;
+            }
+        }
+
+        if (!group) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                       "depends on group %d which is not owned.",
+                       vdev->vbasedev.name, devices[i].group_id);
+            goto out;
+        }
+
+        /* Ensure affected devices for reset on the same bus */
+        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+                /*
+                 * AER errors may be broadcast to all functions of a multi-
+                 * function endpoint.  If any of those sibling functions are
+                 * also assigned, they need to have AER enabled or else an
+                 * error may continue to cause a vm_stop condition.  IOW,
+                 * AER setup of this function would be pointless.
+                 */
+                if (vfio_pci_host_match_slot(&host, vdev->vbasedev.name) &&
+                    !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
+                    error_setg(errp, "vfio: Cannot enable AER for device %s, on same slot"
+                               " the dependent device %s which does not enable AER.",
+                               vdev->vbasedev.name, tmp->vbasedev.name);
+                    goto out;
+                }
+
+                if (tmp->pdev.bus != bus) {
+                    error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                               "the dependent device %s is not on the same bus",
+                               vdev->vbasedev.name, tmp->vbasedev.name);
+                    goto out;
+                }
+                found = true;
+                break;
+            }
+        }
+
+        /* Ensure all affected devices assigned to VM */
+        if (!found) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                       "the dependent device %04x:%02x:%02x.%x "
+                       "is not assigned to VM.",
+                       vdev->vbasedev.name, host.domain, host.bus,
+                       host.slot, host.function);
+            goto out;
+        }
+    }
+
+    /*
+     * The above code verified that all devices affected by a bus reset
+     * exist on the same bus in the VM.  To further simplify, we also
+     * require that there are no additional devices beyond those existing on
+     * the VM bus.
+     */
+    range_limit = vfio_device_range_limit(bus);
+    for (devfn = 0; devfn < range_limit; devfn++) {
+        VFIOPCIDevice *tmp;
+        PCIDevice *dev;
+        bool found = false;
+
+        dev = pci_find_device(bus, pci_bus_num(bus),
+                  PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn));
+
+        if (!dev) {
+            continue;
+        }
+
+        if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, device"
+                             " %s: slot %d function%d cannot be configured"
+                             " on the same virtual bus",
+                             vdev->vbasedev.name, dev->name,
+                             PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+            goto out;
+        }
+
+        tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+        for (i = 0; i < info->count; i++) {
+            PCIHostDeviceAddress host;
+
+            host.domain = devices[i].segment;
+            host.bus = devices[i].bus;
+            host.slot = PCI_SLOT(devices[i].devfn);
+            host.function = PCI_FUNC(devices[i].devfn);
+
+            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+                found = true;
+                break;
+            }
+        }
+
+        if (!found) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, affected"
+                             " device %s does not be configured on the same"
+                             " virtual bus",
+                       vdev->vbasedev.name, tmp->vbasedev.name);
+            goto out;
+        }
+    }
+
+out:
+    g_free(info);
+    return;
+}
+
+static void vfio_aer_check_host_bus_reset(Error **errp)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+    VFIOPCIDevice *vdev;
+    Error *local_err = NULL;
+
+    /* Check All vfio-pci devices if have bus reset capability */
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+                vfio_check_hot_bus_reset(vdev, &local_err);
+                if (local_err) {
+                    error_propagate(errp, local_err);
+                    return;
+                }
+            }
+        }
+    }
+
+    return;
+}
+
 static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
                           int pos, uint16_t size)
 {
@@ -2060,29 +2288,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     vfio_intx_enable(vdev);
 }
 
-static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
-{
-    if (strlen(name) != 12 ||
-        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
-               &addr->bus, &addr->slot, &addr->function) != 4) {
-        return -EINVAL;
-    }
-
-    return 0;
-}
-
-static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
-{
-    PCIHostDeviceAddress tmp;
-
-    if (vfio_pci_name_to_addr(name, &tmp)) {
-        return false;
-    }
-
-    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
-            tmp.slot == addr->slot && tmp.function == addr->function);
-}
-
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
@@ -2589,6 +2794,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused)
+{
+    Error *local_err = NULL;
+
+    vfio_aer_check_host_bus_reset(&local_err);
+    if (local_err) {
+        fprintf(stderr, "%s\n", error_get_pretty(local_err));
+        error_free(local_err);
+        exit(1);
+    }
+}
+
+static Notifier machine_notifier = {
+    .notify = vfio_pci_machine_done_notify,
+};
+
 static int vfio_initfn(PCIDevice *pdev)
 {
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
@@ -2934,6 +3155,15 @@ static const TypeInfo vfio_pci_dev_info = {
 static void register_vfio_pci_dev_type(void)
 {
     type_register_static(&vfio_pci_dev_info);
+
+    /*
+     * The AER configuration may depend on multiple devices, so we cannot
+     * validate consistency after each device is initialized.  We can only
+     * depend on function initialization order (function 0 last) for hotplug
+     * devices, therefore a machine-init-done notifier is used to validate
+     * the configuration after all cold-plug devices are processed.
+     */
+     qemu_add_machine_init_done_notifier(&machine_notifier);
 }
 
 type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 7b3924e..db7c6d5 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (5 preceding siblings ...)
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 06/12] vfio: add check host bus reset is support or not Cao jin
@ 2016-03-23 10:12 ` Cao jin
  2016-03-24 22:54   ` Alex Williamson
  2016-03-27 12:52   ` Michael S. Tsirkin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 08/12] vfio: add check aer functionality for hotplug device Cao jin
                   ` (4 subsequent siblings)
  11 siblings, 2 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

PCI hotplug requires that function 0 is added last to close the
slot.  Since vfio supporting AER, we require that the VM bus
contains the same set of devices as the host bus to support AER,
we can perform an AER validation test whenever a function 0 in
the VM is hot-added.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/pci/pci.c         | 32 ++++++++++++++++++++++++++++++++
 include/hw/pci/pci.h |  1 +
 2 files changed, 33 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e67664d..2a5291a 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1840,6 +1840,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
     return bus->devices[devfn];
 }
 
+static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque)
+{
+    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
+    Error **errp = opaque;
+
+    if (*errp) {
+        return;
+    }
+
+    if (!pc->is_valid_func) {
+        return;
+    }
+
+    pc->is_valid_func(d, errp);
+}
+
 static void pci_qdev_realize(DeviceState *qdev, Error **errp)
 {
     PCIDevice *pci_dev = (PCIDevice *)qdev;
@@ -1882,6 +1898,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
         pci_qdev_unrealize(DEVICE(pci_dev), NULL);
         return;
     }
+
+    /*
+     *  If the function number is 0, indicate the closure of the slot.
+     *  then we get the chance to check all functions on same device
+     *  if valid.
+     */
+    if (DEVICE(pci_dev)->hotplugged &&
+        pci_get_function_0(pci_dev) == pci_dev) {
+        pci_for_each_device(bus, pci_bus_num(bus),
+                            pci_function_is_valid, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            pci_qdev_unrealize(DEVICE(pci_dev), NULL);
+            return;
+        }
+    }
 }
 
 static void pci_default_realize(PCIDevice *dev, Error **errp)
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 0be07c8..4a2f7d4 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -190,6 +190,7 @@ typedef struct PCIDeviceClass {
 
     void (*realize)(PCIDevice *dev, Error **errp);
     int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
+    void (*is_valid_func)(PCIDevice *dev, Error **errp);
     PCIUnregisterFunc *exit;
     PCIConfigReadFunc *config_read;
     PCIConfigWriteFunc *config_write;
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 08/12] vfio: add check aer functionality for hotplug device
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (6 preceding siblings ...)
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
@ 2016-03-23 10:12 ` Cao jin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 09/12] vfio: vote the function 0 to do host bus reset when aer occurred Cao jin
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

when function 0 is hot-added, we can check the vfio device
whether support hot bus reset.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 939b764..90447ac 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3084,6 +3084,19 @@ post_reset:
     vfio_pci_post_reset(vdev);
 }
 
+static void vfio_pci_is_valid(PCIDevice *dev, Error **errp)
+{
+    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+    Error *local_err = NULL;
+
+    if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+        vfio_check_hot_bus_reset(vdev, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+        }
+    }
+}
+
 static void vfio_instance_init(Object *obj)
 {
     PCIDevice *pci_dev = PCI_DEVICE(obj);
@@ -3138,6 +3151,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     pdc->init = vfio_initfn;
     pdc->exit = vfio_exitfn;
+    pdc->is_valid_func = vfio_pci_is_valid;
     pdc->config_read = vfio_pci_read_config;
     pdc->config_write = vfio_pci_write_config;
     pdc->is_express = 1; /* We might be */
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 09/12] vfio: vote the function 0 to do host bus reset when aer occurred
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (7 preceding siblings ...)
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 08/12] vfio: add check aer functionality for hotplug device Cao jin
@ 2016-03-23 10:12 ` Cao jin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 10/12] vfio-pci: pass the aer error to guest Cao jin
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Due to all devices assigned to VM on the same way as host if enable
aer, so we can easily do the hot reset by selecting the function #0
to do the hot reset.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 14 ++++++++++++++
 hw/vfio/pci.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 90447ac..d83d9bd 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1946,6 +1946,8 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
     /* List all affected devices by bus reset */
     devices = &info->devices[0];
 
+    vdev->single_depend_dev = (info->count == 1);
+
     /* Verify that we have all the groups required */
     for (i = 0; i < info->count; i++) {
         PCIHostDeviceAddress host;
@@ -3055,6 +3057,18 @@ static void vfio_pci_reset(DeviceState *dev)
 
     trace_vfio_pci_reset(vdev->vbasedev.name);
 
+    if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+        PCIDevice *br = pci_bridge_get_device(pdev->bus);
+
+        if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
+             PCI_BRIDGE_CTL_BUS_RESET)) {
+            if (pci_get_function_0(pdev) == pdev) {
+                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+            }
+            return;
+        }
+    }
+
     vfio_pci_pre_reset(vdev);
 
     if (vdev->resetfn && !vdev->resetfn(vdev)) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index db7c6d5..9fb0206 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -143,6 +143,7 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_intx;
     bool no_kvm_msi;
     bool no_kvm_msix;
+    bool single_depend_dev;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 10/12] vfio-pci: pass the aer error to guest
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (8 preceding siblings ...)
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 09/12] vfio: vote the function 0 to do host bus reset when aer occurred Cao jin
@ 2016-03-23 10:12 ` Cao jin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery Cao jin
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 12/12] vfio: add 'aer' property to expose aercap Cao jin
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

when the vfio device encounters an uncorrectable error in host,
the vfio_pci driver will signal the eventfd registered by this
vfio device, resulting in the qemu eventfd handler getting
invoked.

this patch is to pass the error to guest and let the guest driver
recover from the error.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d83d9bd..25fc095 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2606,18 +2606,66 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    Error *local_err = NULL;
+    PCIEAERMsg msg = {
+        .severity = 0,
+        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+    };
 
     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
         return;
     }
 
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+        goto stop;
+    }
+
+    /*
+     * in case the real hardware configuration has been changed,
+     * here we should recheck the bus reset capability.
+     */
+    vfio_check_hot_bus_reset(vdev, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        goto stop;
+    }
+
+    /*
+     * we should read the error details from the real hardware
+     * configuration spaces, here we only need to do is signaling
+     * to guest an uncorrectable error has occurred.
+     */
+    if (dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+        /*
+         * if the error is not emitted by this device, we can
+         * just ignore it.
+         */
+        if (!(uncor_status & ~0UL)) {
+            return;
+        }
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+        pcie_aer_msg(dev, &msg);
+        return;
+    }
+
+stop:
     /*
-     * TBD. Retrieve the error details and decide what action
-     * needs to be taken. One of the actions could be to pass
-     * the error to the guest and have the guest driver recover
-     * from the error. This requires that PCIe capabilities be
-     * exposed to the guest. For now, we just terminate the
-     * guest to contain the error.
+     * If the aer capability is not exposed to the guest. we just
+     * terminate the guest to contain the error.
      */
 
     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (9 preceding siblings ...)
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 10/12] vfio-pci: pass the aer error to guest Cao jin
@ 2016-03-23 10:12 ` Cao jin
  2016-03-24 22:54   ` Alex Williamson
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 12/12] vfio: add 'aer' property to expose aercap Cao jin
  11 siblings, 1 reply; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

when a physical device aer occurred, the device state probably
is not in D0 in a short time, if we recover the device quickly.
we may stuck in D3 state when force to change device state to D0.
we may need to wait for a short time to inject the error to guest.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 25fc095..5216e7f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void *opaque)
         msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
                                  PCI_ERR_ROOT_CMD_NONFATAL_EN;
 
+        /* wait a bit to ensure aer device is ready */
+        usleep(2 * 1000);
+
         pcie_aer_msg(dev, &msg);
         return;
     }
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [Qemu-devel] [patch v5 12/12] vfio: add 'aer' property to expose aercap
  2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
                   ` (10 preceding siblings ...)
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery Cao jin
@ 2016-03-23 10:12 ` Cao jin
  11 siblings, 0 replies; 24+ messages in thread
From: Cao jin @ 2016-03-23 10:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

add 'aer' property to let user able to decide whether expose
the aer capability. by default we should disable aer feature,
because it needs configuration restrictions.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5216e7f..a80fa4b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3191,6 +3191,8 @@ static Property vfio_pci_dev_properties[] = {
                        sub_vendor_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                        sub_device_id, PCI_ANY_ID),
+    DEFINE_PROP_BIT("aer", VFIOPCIDevice, features,
+                    VFIO_FEATURE_ENABLE_AER_BIT, false),
     /*
      * TODO - support passed fds... is this necessary?
      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery Cao jin
@ 2016-03-24 22:54   ` Alex Williamson
  2016-03-25  1:38     ` Chen Fan
  0 siblings, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2016-03-24 22:54 UTC (permalink / raw)
  To: Cao jin; +Cc: chen.fan.fnst, izumi.taku, qemu-devel, mst

On Wed, 23 Mar 2016 18:12:06 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> when a physical device aer occurred, the device state probably
> is not in D0 in a short time, if we recover the device quickly.
> we may stuck in D3 state when force to change device state to D0.
> we may need to wait for a short time to inject the error to guest.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 25fc095..5216e7f 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void *opaque)
>          msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>                                   PCI_ERR_ROOT_CMD_NONFATAL_EN;
>  
> +        /* wait a bit to ensure aer device is ready */
> +        usleep(2 * 1000);

Where does this number come from?  Why would the device be in D3?  I
don't understand this at all.

> +
>          pcie_aer_msg(dev, &msg);
>          return;
>      }

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
@ 2016-03-24 22:54   ` Alex Williamson
  2016-03-27 12:19     ` Michael S. Tsirkin
  2016-03-27 12:52   ` Michael S. Tsirkin
  1 sibling, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2016-03-24 22:54 UTC (permalink / raw)
  To: Cao jin; +Cc: chen.fan.fnst, izumi.taku, qemu-devel, mst

On Wed, 23 Mar 2016 18:12:02 +0800
Cao jin <caoj.fnst@cn.fujitsu.com> wrote:

> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> PCI hotplug requires that function 0 is added last to close the
> slot.  Since vfio supporting AER, we require that the VM bus
> contains the same set of devices as the host bus to support AER,
> we can perform an AER validation test whenever a function 0 in
> the VM is hot-added.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/pci/pci.c         | 32 ++++++++++++++++++++++++++++++++
>  include/hw/pci/pci.h |  1 +
>  2 files changed, 33 insertions(+)


This would of course need Michael's ack.

Michael, we can't do this in vfio-pci code because we can't guarantee
that function 0 is a vfio-pci device.  That would allow users to bypass
our configuration requirements by putting any random non-vfio-pci
device at function 0 of a hot-added multifunction device.

> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e67664d..2a5291a 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1840,6 +1840,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
>      return bus->devices[devfn];
>  }
>  
> +static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque)
> +{
> +    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
> +    Error **errp = opaque;
> +
> +    if (*errp) {
> +        return;
> +    }
> +
> +    if (!pc->is_valid_func) {
> +        return;
> +    }
> +
> +    pc->is_valid_func(d, errp);
> +}
> +
>  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>  {
>      PCIDevice *pci_dev = (PCIDevice *)qdev;
> @@ -1882,6 +1898,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>          pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>          return;
>      }
> +
> +    /*
> +     *  If the function number is 0, indicate the closure of the slot.
> +     *  then we get the chance to check all functions on same device
> +     *  if valid.
> +     */
> +    if (DEVICE(pci_dev)->hotplugged &&
> +        pci_get_function_0(pci_dev) == pci_dev) {
> +        pci_for_each_device(bus, pci_bus_num(bus),
> +                            pci_function_is_valid, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> +            return;
> +        }
> +    }
>  }
>  
>  static void pci_default_realize(PCIDevice *dev, Error **errp)
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 0be07c8..4a2f7d4 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -190,6 +190,7 @@ typedef struct PCIDeviceClass {
>  
>      void (*realize)(PCIDevice *dev, Error **errp);
>      int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
> +    void (*is_valid_func)(PCIDevice *dev, Error **errp);
>      PCIUnregisterFunc *exit;
>      PCIConfigReadFunc *config_read;
>      PCIConfigWriteFunc *config_write;

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-03-24 22:54   ` Alex Williamson
@ 2016-03-25  1:38     ` Chen Fan
  2016-03-25  2:22       ` Alex Williamson
  0 siblings, 1 reply; 24+ messages in thread
From: Chen Fan @ 2016-03-25  1:38 UTC (permalink / raw)
  To: Alex Williamson, Cao jin; +Cc: izumi.taku, qemu-devel, mst


On 03/25/2016 06:54 AM, Alex Williamson wrote:
> On Wed, 23 Mar 2016 18:12:06 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> when a physical device aer occurred, the device state probably
>> is not in D0 in a short time, if we recover the device quickly.
>> we may stuck in D3 state when force to change device state to D0.
>> we may need to wait for a short time to inject the error to guest.
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 25fc095..5216e7f 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void *opaque)
>>           msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>>                                    PCI_ERR_ROOT_CMD_NONFATAL_EN;
>>   
>> +        /* wait a bit to ensure aer device is ready */
>> +        usleep(2 * 1000);
> Where does this number come from?  Why would the device be in D3?  I
> don't understand this at all.
Hi Alex,

     when I tested the code in my environment, I found that when I used
the aer-inject module to inject a fake aer error to device on host, the qemu
would throw out the message "vfio: Unable to power on device, stuck in D3"
on and off. if I use "gdb" to debug the vfio_pci_pre_reset, the phenomenon
would not appearance, I just thought it should be some timing race issue,
so I use a sleep() to wait 2ms (double the reset time of 1ms) to ensure the
device state is ready. maybe the root reason still need to be 
investigated deeply.

Thanks,
Chen


>
>> +
>>           pcie_aer_msg(dev, &msg);
>>           return;
>>       }
>
>
> .
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-03-25  1:38     ` Chen Fan
@ 2016-03-25  2:22       ` Alex Williamson
  2016-03-31  6:55         ` Chen Fan
  0 siblings, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2016-03-25  2:22 UTC (permalink / raw)
  To: Chen Fan; +Cc: izumi.taku, Cao jin, qemu-devel, mst

On Fri, 25 Mar 2016 09:38:09 +0800
Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:

> On 03/25/2016 06:54 AM, Alex Williamson wrote:
> > On Wed, 23 Mar 2016 18:12:06 +0800
> > Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >  
> >> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>
> >> when a physical device aer occurred, the device state probably
> >> is not in D0 in a short time, if we recover the device quickly.
> >> we may stuck in D3 state when force to change device state to D0.
> >> we may need to wait for a short time to inject the error to guest.
> >>
> >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >> ---
> >>   hw/vfio/pci.c | 3 +++
> >>   1 file changed, 3 insertions(+)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index 25fc095..5216e7f 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void *opaque)
> >>           msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> >>                                    PCI_ERR_ROOT_CMD_NONFATAL_EN;
> >>   
> >> +        /* wait a bit to ensure aer device is ready */
> >> +        usleep(2 * 1000);  
> > Where does this number come from?  Why would the device be in D3?  I
> > don't understand this at all.  
> Hi Alex,
> 
>      when I tested the code in my environment, I found that when I used
> the aer-inject module to inject a fake aer error to device on host, the qemu
> would throw out the message "vfio: Unable to power on device, stuck in D3"
> on and off. if I use "gdb" to debug the vfio_pci_pre_reset, the phenomenon
> would not appearance, I just thought it should be some timing race issue,
> so I use a sleep() to wait 2ms (double the reset time of 1ms) to ensure the
> device state is ready. maybe the root reason still need to be 
> investigated deeply.

Yes, it sounds like you need to investigate this further, the delay is
arbitrary and perhaps suggests a race that needs to be fixed
correctly.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid
  2016-03-24 22:54   ` Alex Williamson
@ 2016-03-27 12:19     ` Michael S. Tsirkin
  2016-03-31  7:23       ` Chen Fan
  0 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2016-03-27 12:19 UTC (permalink / raw)
  To: Alex Williamson; +Cc: chen.fan.fnst, izumi.taku, Cao jin, qemu-devel

On Thu, Mar 24, 2016 at 04:54:33PM -0600, Alex Williamson wrote:
> On Wed, 23 Mar 2016 18:12:02 +0800
> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> 
> > From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > 
> > PCI hotplug requires that function 0 is added last to close the
> > slot.  Since vfio supporting AER, we require that the VM bus
> > contains the same set of devices as the host bus to support AER,
> > we can perform an AER validation test whenever a function 0 in
> > the VM is hot-added.
> > 
> > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > ---
> >  hw/pci/pci.c         | 32 ++++++++++++++++++++++++++++++++
> >  include/hw/pci/pci.h |  1 +
> >  2 files changed, 33 insertions(+)
> 
> 
> This would of course need Michael's ack.
> 
> Michael, we can't do this in vfio-pci code because we can't guarantee
> that function 0 is a vfio-pci device.  That would allow users to bypass
> our configuration requirements by putting any random non-vfio-pci
> device at function 0 of a hot-added multifunction device.

I got that. What I don't understand is how is the non hotplug
variant handled, given that is_valid_func is not called
in that case.

Also, why does it scan all devices on bus, not all functions
of the device?

> > 
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index e67664d..2a5291a 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -1840,6 +1840,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
> >      return bus->devices[devfn];
> >  }
> >  
> > +static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque)
> > +{
> > +    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
> > +    Error **errp = opaque;
> > +
> > +    if (*errp) {
> > +        return;
> > +    }
> > +
> > +    if (!pc->is_valid_func) {
> > +        return;
> > +    }
> > +
> > +    pc->is_valid_func(d, errp);
> > +}
> > +
> >  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
> >  {
> >      PCIDevice *pci_dev = (PCIDevice *)qdev;
> > @@ -1882,6 +1898,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
> >          pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> >          return;
> >      }
> > +
> > +    /*
> > +     *  If the function number is 0, indicate the closure of the slot.
> > +     *  then we get the chance to check all functions on same device
> > +     *  if valid.
> > +     */
> > +    if (DEVICE(pci_dev)->hotplugged &&
> > +        pci_get_function_0(pci_dev) == pci_dev) {
> > +        pci_for_each_device(bus, pci_bus_num(bus),
> > +                            pci_function_is_valid, &local_err);
> > +        if (local_err) {
> > +            error_propagate(errp, local_err);
> > +            pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> > +            return;
> > +        }
> > +    }
> >  }
> >  
> >  static void pci_default_realize(PCIDevice *dev, Error **errp)
> > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > index 0be07c8..4a2f7d4 100644
> > --- a/include/hw/pci/pci.h
> > +++ b/include/hw/pci/pci.h
> > @@ -190,6 +190,7 @@ typedef struct PCIDeviceClass {
> >  
> >      void (*realize)(PCIDevice *dev, Error **errp);
> >      int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
> > +    void (*is_valid_func)(PCIDevice *dev, Error **errp);
> >      PCIUnregisterFunc *exit;
> >      PCIConfigReadFunc *config_read;
> >      PCIConfigWriteFunc *config_write;

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid
  2016-03-23 10:12 ` [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
  2016-03-24 22:54   ` Alex Williamson
@ 2016-03-27 12:52   ` Michael S. Tsirkin
  1 sibling, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2016-03-27 12:52 UTC (permalink / raw)
  To: Cao jin; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, qemu-devel

On Wed, Mar 23, 2016 at 06:12:02PM +0800, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> PCI hotplug requires that function 0 is added last to close the
> slot.  Since vfio supporting AER, we require that the VM bus
> contains the same set of devices as the host bus to support AER,
> we can perform an AER validation test whenever a function 0 in
> the VM is hot-added.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

So I sent some comments, but I do not want to hold up this series.
If there's a respin anyway, pls try to address my comments.
Alex, if you want to merge this in this shape, you can include my

Reviewed-by: Michael S. Tsirkin <mst@redhat.com>

for now, and we can clean up the issues afterwards.


> ---
>  hw/pci/pci.c         | 32 ++++++++++++++++++++++++++++++++
>  include/hw/pci/pci.h |  1 +
>  2 files changed, 33 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e67664d..2a5291a 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1840,6 +1840,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
>      return bus->devices[devfn];
>  }
>  
> +static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque)
> +{
> +    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
> +    Error **errp = opaque;
> +
> +    if (*errp) {
> +        return;
> +    }
> +
> +    if (!pc->is_valid_func) {
> +        return;
> +    }
> +
> +    pc->is_valid_func(d, errp);
> +}
> +
>  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>  {
>      PCIDevice *pci_dev = (PCIDevice *)qdev;
> @@ -1882,6 +1898,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>          pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>          return;
>      }
> +
> +    /*
> +     *  If the function number is 0, indicate the closure of the slot.
> +     *  then we get the chance to check all functions on same device
> +     *  if valid.
> +     */
> +    if (DEVICE(pci_dev)->hotplugged &&
> +        pci_get_function_0(pci_dev) == pci_dev) {
> +        pci_for_each_device(bus, pci_bus_num(bus),
> +                            pci_function_is_valid, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> +            return;
> +        }
> +    }
>  }
>  
>  static void pci_default_realize(PCIDevice *dev, Error **errp)
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 0be07c8..4a2f7d4 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -190,6 +190,7 @@ typedef struct PCIDeviceClass {
>  
>      void (*realize)(PCIDevice *dev, Error **errp);
>      int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
> +    void (*is_valid_func)(PCIDevice *dev, Error **errp);
>      PCIUnregisterFunc *exit;
>      PCIConfigReadFunc *config_read;
>      PCIConfigWriteFunc *config_write;
> -- 
> 1.9.3
> 
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-03-25  2:22       ` Alex Williamson
@ 2016-03-31  6:55         ` Chen Fan
  2016-03-31 15:44           ` Alex Williamson
  0 siblings, 1 reply; 24+ messages in thread
From: Chen Fan @ 2016-03-31  6:55 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, Cao jin, qemu-devel, mst


On 03/25/2016 10:22 AM, Alex Williamson wrote:
> On Fri, 25 Mar 2016 09:38:09 +0800
> Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
>
>> On 03/25/2016 06:54 AM, Alex Williamson wrote:
>>> On Wed, 23 Mar 2016 18:12:06 +0800
>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>   
>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>
>>>> when a physical device aer occurred, the device state probably
>>>> is not in D0 in a short time, if we recover the device quickly.
>>>> we may stuck in D3 state when force to change device state to D0.
>>>> we may need to wait for a short time to inject the error to guest.
>>>>
>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>> ---
>>>>    hw/vfio/pci.c | 3 +++
>>>>    1 file changed, 3 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 25fc095..5216e7f 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void *opaque)
>>>>            msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>>>>                                     PCI_ERR_ROOT_CMD_NONFATAL_EN;
>>>>    
>>>> +        /* wait a bit to ensure aer device is ready */
>>>> +        usleep(2 * 1000);
>>> Where does this number come from?  Why would the device be in D3?  I
>>> don't understand this at all.
>> Hi Alex,
>>
>>       when I tested the code in my environment, I found that when I used
>> the aer-inject module to inject a fake aer error to device on host, the qemu
>> would throw out the message "vfio: Unable to power on device, stuck in D3"
>> on and off. if I use "gdb" to debug the vfio_pci_pre_reset, the phenomenon
>> would not appearance, I just thought it should be some timing race issue,
>> so I use a sleep() to wait 2ms (double the reset time of 1ms) to ensure the
>> device state is ready. maybe the root reason still need to be
>> investigated deeply.
> Yes, it sounds like you need to investigate this further, the delay is
> arbitrary and perhaps suggests a race that needs to be fixed
> correctly.  Thanks,
Hi Alex,

     after done some investigation of the problem, I found that only 
when the injected
  error is fatal, the problem will appear. because in aer do_recovery, 
host will call reset_link
on the root port, which would invoke pci_reset_bridge_secondary_bus in 
aer_root_reset,
that would reset the bridge and all the device under that. so when qemu 
receive the aer
notification, then propagate the error to guest, guest does the same way 
to perform the
recovery, if the guest `reset_link` that will call the vfio_pre_reset 
done at the stage of host
bridge reset, the device status would probable stick in D3.

so I think after qemu receive the aer notification, we should wait for 
enough time to
ensure the bridge has been reset completely. I just use sleep <=10ms to 
test the code,
seems still appear the message "vfio: Unable to power on device, stuck 
in D3". so I think
we should sleep 100ms to ensure the delay sufficient. I have tested that 
code 100+ times
by inject aer error. the issue no longer appears.

Thanks,
Chen

>
> Alex
>
>
> .
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid
  2016-03-27 12:19     ` Michael S. Tsirkin
@ 2016-03-31  7:23       ` Chen Fan
  0 siblings, 0 replies; 24+ messages in thread
From: Chen Fan @ 2016-03-31  7:23 UTC (permalink / raw)
  To: Michael S. Tsirkin, Alex Williamson; +Cc: izumi.taku, Cao jin, qemu-devel


On 03/27/2016 08:19 PM, Michael S. Tsirkin wrote:
> On Thu, Mar 24, 2016 at 04:54:33PM -0600, Alex Williamson wrote:
>> On Wed, 23 Mar 2016 18:12:02 +0800
>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>
>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>
>>> PCI hotplug requires that function 0 is added last to close the
>>> slot.  Since vfio supporting AER, we require that the VM bus
>>> contains the same set of devices as the host bus to support AER,
>>> we can perform an AER validation test whenever a function 0 in
>>> the VM is hot-added.
>>>
>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>> ---
>>>   hw/pci/pci.c         | 32 ++++++++++++++++++++++++++++++++
>>>   include/hw/pci/pci.h |  1 +
>>>   2 files changed, 33 insertions(+)
>>
>> This would of course need Michael's ack.
>>
>> Michael, we can't do this in vfio-pci code because we can't guarantee
>> that function 0 is a vfio-pci device.  That would allow users to bypass
>> our configuration requirements by putting any random non-vfio-pci
>> device at function 0 of a hot-added multifunction device.
> I got that. What I don't understand is how is the non hotplug
> variant handled, given that is_valid_func is not called
> in that case.
for non-hotplug case, we used machine_done_notifier to check
the bus reset functionality. because we can't ensure the function 0
was initialized at last when cold boot up.
>
> Also, why does it scan all devices on bus, not all functions
> of the device?
I can fix this by testing the bridge whether support ARI.

Thanks.
Chen

>
>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>> index e67664d..2a5291a 100644
>>> --- a/hw/pci/pci.c
>>> +++ b/hw/pci/pci.c
>>> @@ -1840,6 +1840,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
>>>       return bus->devices[devfn];
>>>   }
>>>   
>>> +static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque)
>>> +{
>>> +    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
>>> +    Error **errp = opaque;
>>> +
>>> +    if (*errp) {
>>> +        return;
>>> +    }
>>> +
>>> +    if (!pc->is_valid_func) {
>>> +        return;
>>> +    }
>>> +
>>> +    pc->is_valid_func(d, errp);
>>> +}
>>> +
>>>   static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>>>   {
>>>       PCIDevice *pci_dev = (PCIDevice *)qdev;
>>> @@ -1882,6 +1898,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>>>           pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>>>           return;
>>>       }
>>> +
>>> +    /*
>>> +     *  If the function number is 0, indicate the closure of the slot.
>>> +     *  then we get the chance to check all functions on same device
>>> +     *  if valid.
>>> +     */
>>> +    if (DEVICE(pci_dev)->hotplugged &&
>>> +        pci_get_function_0(pci_dev) == pci_dev) {
>>> +        pci_for_each_device(bus, pci_bus_num(bus),
>>> +                            pci_function_is_valid, &local_err);
>>> +        if (local_err) {
>>> +            error_propagate(errp, local_err);
>>> +            pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>>> +            return;
>>> +        }
>>> +    }
>>>   }
>>>   
>>>   static void pci_default_realize(PCIDevice *dev, Error **errp)
>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>>> index 0be07c8..4a2f7d4 100644
>>> --- a/include/hw/pci/pci.h
>>> +++ b/include/hw/pci/pci.h
>>> @@ -190,6 +190,7 @@ typedef struct PCIDeviceClass {
>>>   
>>>       void (*realize)(PCIDevice *dev, Error **errp);
>>>       int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
>>> +    void (*is_valid_func)(PCIDevice *dev, Error **errp);
>>>       PCIUnregisterFunc *exit;
>>>       PCIConfigReadFunc *config_read;
>>>       PCIConfigWriteFunc *config_write;
>
> .
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-03-31  6:55         ` Chen Fan
@ 2016-03-31 15:44           ` Alex Williamson
  2016-04-01  1:40             ` Chen Fan
  0 siblings, 1 reply; 24+ messages in thread
From: Alex Williamson @ 2016-03-31 15:44 UTC (permalink / raw)
  To: Chen Fan; +Cc: izumi.taku, Cao jin, qemu-devel, mst

On Thu, 31 Mar 2016 14:55:07 +0800
Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:

> On 03/25/2016 10:22 AM, Alex Williamson wrote:
> > On Fri, 25 Mar 2016 09:38:09 +0800
> > Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
> >  
> >> On 03/25/2016 06:54 AM, Alex Williamson wrote:  
> >>> On Wed, 23 Mar 2016 18:12:06 +0800
> >>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
> >>>     
> >>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>>
> >>>> when a physical device aer occurred, the device state probably
> >>>> is not in D0 in a short time, if we recover the device quickly.
> >>>> we may stuck in D3 state when force to change device state to D0.
> >>>> we may need to wait for a short time to inject the error to guest.
> >>>>
> >>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>> ---
> >>>>    hw/vfio/pci.c | 3 +++
> >>>>    1 file changed, 3 insertions(+)
> >>>>
> >>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >>>> index 25fc095..5216e7f 100644
> >>>> --- a/hw/vfio/pci.c
> >>>> +++ b/hw/vfio/pci.c
> >>>> @@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void *opaque)
> >>>>            msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> >>>>                                     PCI_ERR_ROOT_CMD_NONFATAL_EN;
> >>>>    
> >>>> +        /* wait a bit to ensure aer device is ready */
> >>>> +        usleep(2 * 1000);  
> >>> Where does this number come from?  Why would the device be in D3?  I
> >>> don't understand this at all.  
> >> Hi Alex,
> >>
> >>       when I tested the code in my environment, I found that when I used
> >> the aer-inject module to inject a fake aer error to device on host, the qemu
> >> would throw out the message "vfio: Unable to power on device, stuck in D3"
> >> on and off. if I use "gdb" to debug the vfio_pci_pre_reset, the phenomenon
> >> would not appearance, I just thought it should be some timing race issue,
> >> so I use a sleep() to wait 2ms (double the reset time of 1ms) to ensure the
> >> device state is ready. maybe the root reason still need to be
> >> investigated deeply.  
> > Yes, it sounds like you need to investigate this further, the delay is
> > arbitrary and perhaps suggests a race that needs to be fixed
> > correctly.  Thanks,  
> Hi Alex,
> 
>      after done some investigation of the problem, I found that only 
> when the injected
>   error is fatal, the problem will appear. because in aer do_recovery, 
> host will call reset_link
> on the root port, which would invoke pci_reset_bridge_secondary_bus in 
> aer_root_reset,
> that would reset the bridge and all the device under that. so when qemu 
> receive the aer
> notification, then propagate the error to guest, guest does the same way 
> to perform the
> recovery, if the guest `reset_link` that will call the vfio_pre_reset 
> done at the stage of host
> bridge reset, the device status would probable stick in D3.
> 
> so I think after qemu receive the aer notification, we should wait for 
> enough time to
> ensure the bridge has been reset completely. I just use sleep <=10ms to 
> test the code,
> seems still appear the message "vfio: Unable to power on device, stuck 
> in D3". so I think
> we should sleep 100ms to ensure the delay sufficient. I have tested that 
> code 100+ times
> by inject aer error. the issue no longer appears.

I'm not satisfied with this.  pci_reset_bridge_secondary_bus() is
invoked by both the host AER code and the guest AER code, the latter
via the vfio PCI hot reset interface.  The
pci_reset_bridge_secondary_bus() function includes the spec defined
delay by which point all the devices should be operational again.  The
spec also defines that devices are in D0 after reset, which implies
that the only reason we would ever be seeing a device in D3 is if we're
reading the device while it is still in reset or before it has
recovered from reset.  That implies that either
pci_reset_bridge_secondary_bus() is not waiting long enough or QEMU is
allowing one device to call vfio_pre_reset while another device is
still in reset.  I suspect QEMU serializes reset such that the latter
case is not possible, which means that you might have a device that
takes longer to reset than the spec defines.  Such a quirk should be
handled in the host kernel reset, not by adding arbitrary delays in
userspace code.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-03-31 15:44           ` Alex Williamson
@ 2016-04-01  1:40             ` Chen Fan
  2016-04-01  1:53               ` Chen Fan
  0 siblings, 1 reply; 24+ messages in thread
From: Chen Fan @ 2016-04-01  1:40 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, Cao jin, qemu-devel, mst


On 03/31/2016 11:44 PM, Alex Williamson wrote:
> On Thu, 31 Mar 2016 14:55:07 +0800
> Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
>
>> On 03/25/2016 10:22 AM, Alex Williamson wrote:
>>> On Fri, 25 Mar 2016 09:38:09 +0800
>>> Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
>>>   
>>>> On 03/25/2016 06:54 AM, Alex Williamson wrote:
>>>>> On Wed, 23 Mar 2016 18:12:06 +0800
>>>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>>>      
>>>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>>>
>>>>>> when a physical device aer occurred, the device state probably
>>>>>> is not in D0 in a short time, if we recover the device quickly.
>>>>>> we may stuck in D3 state when force to change device state to D0.
>>>>>> we may need to wait for a short time to inject the error to guest.
>>>>>>
>>>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>>> ---
>>>>>>     hw/vfio/pci.c | 3 +++
>>>>>>     1 file changed, 3 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>> index 25fc095..5216e7f 100644
>>>>>> --- a/hw/vfio/pci.c
>>>>>> +++ b/hw/vfio/pci.c
>>>>>> @@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void *opaque)
>>>>>>             msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>>>>>>                                      PCI_ERR_ROOT_CMD_NONFATAL_EN;
>>>>>>     
>>>>>> +        /* wait a bit to ensure aer device is ready */
>>>>>> +        usleep(2 * 1000);
>>>>> Where does this number come from?  Why would the device be in D3?  I
>>>>> don't understand this at all.
>>>> Hi Alex,
>>>>
>>>>        when I tested the code in my environment, I found that when I used
>>>> the aer-inject module to inject a fake aer error to device on host, the qemu
>>>> would throw out the message "vfio: Unable to power on device, stuck in D3"
>>>> on and off. if I use "gdb" to debug the vfio_pci_pre_reset, the phenomenon
>>>> would not appearance, I just thought it should be some timing race issue,
>>>> so I use a sleep() to wait 2ms (double the reset time of 1ms) to ensure the
>>>> device state is ready. maybe the root reason still need to be
>>>> investigated deeply.
>>> Yes, it sounds like you need to investigate this further, the delay is
>>> arbitrary and perhaps suggests a race that needs to be fixed
>>> correctly.  Thanks,
>> Hi Alex,
>>
>>       after done some investigation of the problem, I found that only
>> when the injected
>>    error is fatal, the problem will appear. because in aer do_recovery,
>> host will call reset_link
>> on the root port, which would invoke pci_reset_bridge_secondary_bus in
>> aer_root_reset,
>> that would reset the bridge and all the device under that. so when qemu
>> receive the aer
>> notification, then propagate the error to guest, guest does the same way
>> to perform the
>> recovery, if the guest `reset_link` that will call the vfio_pre_reset
>> done at the stage of host
>> bridge reset, the device status would probable stick in D3.
>>
>> so I think after qemu receive the aer notification, we should wait for
>> enough time to
>> ensure the bridge has been reset completely. I just use sleep <=10ms to
>> test the code,
>> seems still appear the message "vfio: Unable to power on device, stuck
>> in D3". so I think
>> we should sleep 100ms to ensure the delay sufficient. I have tested that
>> code 100+ times
>> by inject aer error. the issue no longer appears.
> I'm not satisfied with this.  pci_reset_bridge_secondary_bus() is
> invoked by both the host AER code and the guest AER code, the latter
> via the vfio PCI hot reset interface.  The
> pci_reset_bridge_secondary_bus() function includes the spec defined
> delay by which point all the devices should be operational again.  The
> spec also defines that devices are in D0 after reset, which implies
> that the only reason we would ever be seeing a device in D3 is if we're
> reading the device while it is still in reset or before it has
> recovered from reset.  That implies that either
> pci_reset_bridge_secondary_bus() is not waiting long enough or QEMU is
> allowing one device to call vfio_pre_reset while another device is
> still in reset.  I suspect QEMU serializes reset such that the latter
> case is not possible, which means that you might have a device that
> takes longer to reset than the spec defines.  Such a quirk should be
> handled in the host kernel reset, not by adding arbitrary delays in
> userspace code.  Thanks,
maybe it is not the delay issue actually. let me analyze the aer process
in do_recovery:

                host                                      qemu          
      guest

         broadcast: error_detected
         +--> error_detected (VFIO)
           +--> eventfd_signal   ----->    inject_aer ---------> 
broadcast: error_detected
*reset_link* +--> error_detected (real driver)
*reset_link*
         broadcast: mmio/slot_reset                                     
broadcast: mmio/slot_reset
         broadcast: 
resume                                                  broadcast: resume

we can see that the reset_link in host and in guest are not called in 
order. the reset_link
in guest may execute during the host broadcast "error_detected" or 
"reset_link" as long
as there are many devices need to walk on the bus. so in the case qemu 
has the opportunity
to call vfio_pre_reset while host is still in reset.

so can we consider moving the eventfd_signal in error_detected to 
resume/slot_reset callback
in vfio driver, then we can assure that the host reset_link finish 
before guest do reset_link.
or adding a new resume/slot_reset event between vfio driver and qemu to 
delay the guest reset_link.

Thanks,
Chen











>
> Alex
>
>
> .
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery
  2016-04-01  1:40             ` Chen Fan
@ 2016-04-01  1:53               ` Chen Fan
  0 siblings, 0 replies; 24+ messages in thread
From: Chen Fan @ 2016-04-01  1:53 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, Cao jin, qemu-devel, mst


On 04/01/2016 09:40 AM, Chen Fan wrote:
>
> On 03/31/2016 11:44 PM, Alex Williamson wrote:
>> On Thu, 31 Mar 2016 14:55:07 +0800
>> Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
>>
>>> On 03/25/2016 10:22 AM, Alex Williamson wrote:
>>>> On Fri, 25 Mar 2016 09:38:09 +0800
>>>> Chen Fan <chen.fan.fnst@cn.fujitsu.com> wrote:
>>>>> On 03/25/2016 06:54 AM, Alex Williamson wrote:
>>>>>> On Wed, 23 Mar 2016 18:12:06 +0800
>>>>>> Cao jin <caoj.fnst@cn.fujitsu.com> wrote:
>>>>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>>>>
>>>>>>> when a physical device aer occurred, the device state probably
>>>>>>> is not in D0 in a short time, if we recover the device quickly.
>>>>>>> we may stuck in D3 state when force to change device state to D0.
>>>>>>> we may need to wait for a short time to inject the error to guest.
>>>>>>>
>>>>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>>>> ---
>>>>>>>     hw/vfio/pci.c | 3 +++
>>>>>>>     1 file changed, 3 insertions(+)
>>>>>>>
>>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>>> index 25fc095..5216e7f 100644
>>>>>>> --- a/hw/vfio/pci.c
>>>>>>> +++ b/hw/vfio/pci.c
>>>>>>> @@ -2658,6 +2658,9 @@ static void vfio_err_notifier_handler(void 
>>>>>>> *opaque)
>>>>>>>             msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>>>>>>> PCI_ERR_ROOT_CMD_NONFATAL_EN;
>>>>>>>     +        /* wait a bit to ensure aer device is ready */
>>>>>>> +        usleep(2 * 1000);
>>>>>> Where does this number come from?  Why would the device be in D3?  I
>>>>>> don't understand this at all.
>>>>> Hi Alex,
>>>>>
>>>>>        when I tested the code in my environment, I found that when 
>>>>> I used
>>>>> the aer-inject module to inject a fake aer error to device on 
>>>>> host, the qemu
>>>>> would throw out the message "vfio: Unable to power on device, 
>>>>> stuck in D3"
>>>>> on and off. if I use "gdb" to debug the vfio_pci_pre_reset, the 
>>>>> phenomenon
>>>>> would not appearance, I just thought it should be some timing race 
>>>>> issue,
>>>>> so I use a sleep() to wait 2ms (double the reset time of 1ms) to 
>>>>> ensure the
>>>>> device state is ready. maybe the root reason still need to be
>>>>> investigated deeply.
>>>> Yes, it sounds like you need to investigate this further, the delay is
>>>> arbitrary and perhaps suggests a race that needs to be fixed
>>>> correctly.  Thanks,
>>> Hi Alex,
>>>
>>>       after done some investigation of the problem, I found that only
>>> when the injected
>>>    error is fatal, the problem will appear. because in aer do_recovery,
>>> host will call reset_link
>>> on the root port, which would invoke pci_reset_bridge_secondary_bus in
>>> aer_root_reset,
>>> that would reset the bridge and all the device under that. so when qemu
>>> receive the aer
>>> notification, then propagate the error to guest, guest does the same 
>>> way
>>> to perform the
>>> recovery, if the guest `reset_link` that will call the vfio_pre_reset
>>> done at the stage of host
>>> bridge reset, the device status would probable stick in D3.
>>>
>>> so I think after qemu receive the aer notification, we should wait for
>>> enough time to
>>> ensure the bridge has been reset completely. I just use sleep <=10ms to
>>> test the code,
>>> seems still appear the message "vfio: Unable to power on device, stuck
>>> in D3". so I think
>>> we should sleep 100ms to ensure the delay sufficient. I have tested 
>>> that
>>> code 100+ times
>>> by inject aer error. the issue no longer appears.
>> I'm not satisfied with this.  pci_reset_bridge_secondary_bus() is
>> invoked by both the host AER code and the guest AER code, the latter
>> via the vfio PCI hot reset interface.  The
>> pci_reset_bridge_secondary_bus() function includes the spec defined
>> delay by which point all the devices should be operational again.  The
>> spec also defines that devices are in D0 after reset, which implies
>> that the only reason we would ever be seeing a device in D3 is if we're
>> reading the device while it is still in reset or before it has
>> recovered from reset.  That implies that either
>> pci_reset_bridge_secondary_bus() is not waiting long enough or QEMU is
>> allowing one device to call vfio_pre_reset while another device is
>> still in reset.  I suspect QEMU serializes reset such that the latter
>> case is not possible, which means that you might have a device that
>> takes longer to reset than the spec defines.  Such a quirk should be
>> handled in the host kernel reset, not by adding arbitrary delays in
>> userspace code.  Thanks,
> maybe it is not the delay issue actually. let me analyze the aer process
> in do_recovery:
>
>                host qemu               guest
>
>         broadcast: error_detected
>         +--> error_detected (VFIO)
>           +--> eventfd_signal   ----->    inject_aer ---------> 
> broadcast: error_detected
> *reset_link* +--> error_detected (real driver)
> *reset_link*
>         broadcast: mmio/slot_reset                                     
> broadcast: mmio/slot_reset
>         broadcast: 
> resume                                                  broadcast: resume
apologies to all, the above diagram was chaotic. I redraw it.

                host                             qemu        guest

broadcast: error_detected (host)
  +--> error_detected (VFIO)
    +--> eventfd_signal   ----->  inject_aer  -----> broadcast: 
error_detected (gust)
*reset_link* (host) +--> error_detected (real driver) (gust)
*reset_link* (gust)
broadcast: mmio/slot_reset (host)                      broadcast: 
mmio/slot_reset (gust)
broadcast: resume (host) broadcast: resume (gust)

Thanks,
Chen

>
> we can see that the reset_link in host and in guest are not called in 
> order. the reset_link
> in guest may execute during the host broadcast "error_detected" or 
> "reset_link" as long
> as there are many devices need to walk on the bus. so in the case qemu 
> has the opportunity
> to call vfio_pre_reset while host is still in reset.
>
> so can we consider moving the eventfd_signal in error_detected to 
> resume/slot_reset callback
> in vfio driver, then we can assure that the host reset_link finish 
> before guest do reset_link.
> or adding a new resume/slot_reset event between vfio driver and qemu 
> to delay the guest reset_link.
>
> Thanks,
> Chen
>
>
>
>
>
>
>
>
>
>
>
>>
>> Alex
>>
>>
>> .
>>
>
>
>
>
> .
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2016-04-01  1:59 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-23 10:11 [Qemu-devel] [patch v5 00/12] vfio-pci: pass the aer error to guest, part2 Cao jin
2016-03-23 10:11 ` [Qemu-devel] [patch v5 01/12] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
2016-03-23 10:11 ` [Qemu-devel] [patch v5 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
2016-03-23 10:11 ` [Qemu-devel] [patch v5 03/12] vfio: add pcie extended capability support Cao jin
2016-03-23 10:11 ` [Qemu-devel] [patch v5 04/12] vfio: add aer support for vfio device Cao jin
2016-03-23 10:12 ` [Qemu-devel] [patch v5 05/12] vfio: refine function vfio_pci_host_match Cao jin
2016-03-23 10:12 ` [Qemu-devel] [patch v5 06/12] vfio: add check host bus reset is support or not Cao jin
2016-03-23 10:12 ` [Qemu-devel] [patch v5 07/12] pci: add a pci_function_is_valid callback to check function if valid Cao jin
2016-03-24 22:54   ` Alex Williamson
2016-03-27 12:19     ` Michael S. Tsirkin
2016-03-31  7:23       ` Chen Fan
2016-03-27 12:52   ` Michael S. Tsirkin
2016-03-23 10:12 ` [Qemu-devel] [patch v5 08/12] vfio: add check aer functionality for hotplug device Cao jin
2016-03-23 10:12 ` [Qemu-devel] [patch v5 09/12] vfio: vote the function 0 to do host bus reset when aer occurred Cao jin
2016-03-23 10:12 ` [Qemu-devel] [patch v5 10/12] vfio-pci: pass the aer error to guest Cao jin
2016-03-23 10:12 ` [Qemu-devel] [patch v5 11/12] vfio: device may stuck in D3 when doing aer recovery Cao jin
2016-03-24 22:54   ` Alex Williamson
2016-03-25  1:38     ` Chen Fan
2016-03-25  2:22       ` Alex Williamson
2016-03-31  6:55         ` Chen Fan
2016-03-31 15:44           ` Alex Williamson
2016-04-01  1:40             ` Chen Fan
2016-04-01  1:53               ` Chen Fan
2016-03-23 10:12 ` [Qemu-devel] [patch v5 12/12] vfio: add 'aer' property to expose aercap Cao jin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).