[Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
@ 2016-01-12  2:43 Cao jin
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 01/14] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
                   ` (14 more replies)
  0 siblings, 15 replies; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

For now, for vfio pci passthough devices when qemu receives
an error from host aer report, currentlly just terminate the guest,
but usually user want to know what error occurred but stopping the
guest, so this patches add aer capability support for vfio device,
and pass the error to guest, and have guest driver to recover
from the error.

v15-v16:
   10/14, 11/14 are new to introduce a reset sequence id to specify the
   vfio devices has been reset for that reset. other patches aren't modified.

v14-v15:
   1. add device hot reset callback
   2. add bus_in_reset for vfio device to avoid multi do host bus reset

v13-v14:
   1. for multifunction device, requiring all functions enable AER.(9/13)
   2. due to all affected functions receive error signal, ignore no
      error occurred function. (12/13)

v12-v13:
   1. since support multifuncion hotplug, here add callback to enable aer.
   2. add pci device pre+post reset for aer host reset.

Chen Fan (14):
  vfio: extract vfio_get_hot_reset_info as a single function
  vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  pcie: modify the capability size assert
  vfio: make the 4 bytes aligned for capability size
  vfio: add pcie extanded capability support
  aer: impove pcie_aer_init to support vfio device
  vfio: add aer support for vfio device
  vfio: add check host bus reset is support or not
  add check reset mechanism when hotplug vfio device
  pci: introduce pci bus pre reset
  vfio: introduce last reset sequence id
  pcie_aer: expose pcie_aer_msg() interface
  vfio-pci: pass the aer error to guest
  vfio: add 'aer' property to expose aercap

 hw/pci-bridge/ioh3420.c            |   2 +-
 hw/pci-bridge/xio3130_downstream.c |   2 +-
 hw/pci-bridge/xio3130_upstream.c   |   2 +-
 hw/pci/pci.c                       |  42 +++
 hw/pci/pci_bridge.c                |   3 +
 hw/pci/pcie.c                      |   2 +-
 hw/pci/pcie_aer.c                  |   6 +-
 hw/vfio/pci.c                      | 616 +++++++++++++++++++++++++++++++++----
 hw/vfio/pci.h                      |   9 +
 include/hw/pci/pci.h               |   1 +
 include/hw/pci/pci_bus.h           |   8 +
 include/hw/pci/pcie_aer.h          |   3 +-
 12 files changed, 624 insertions(+), 72 deletions(-)

-- 
1.9.3

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 01/14] vfio: extract vfio_get_hot_reset_info as a single function
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-17 12:48   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 02/14] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
                   ` (13 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

the function is used to get affected devices by bus reset.
so here extract it, and can used for aer soon.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 66 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 1fb868c..efcd3cd 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1654,6 +1654,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
     }
 }
 
+/*
+ * return negative with errno, return 0 on success.
+ * if success, the point of ret_info fill with the affected device reset info.
+ *
+ */
+static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev,
+                                   struct vfio_pci_hot_reset_info **ret_info)
+{
+    struct vfio_pci_hot_reset_info *info;
+    int ret, count;
+
+    *ret_info = NULL;
+
+    info = g_malloc0(sizeof(*info));
+    info->argsz = sizeof(*info);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+    if (ret && errno != ENOSPC) {
+        ret = -errno;
+        goto error;
+    }
+
+    count = info->count;
+
+    info = g_realloc(info, sizeof(*info) +
+                     (count * sizeof(struct vfio_pci_dependent_device)));
+    info->argsz = sizeof(*info) +
+                  (count * sizeof(struct vfio_pci_dependent_device));
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+    if (ret) {
+        ret = -errno;
+        error_report("vfio: hot reset info failed: %m");
+        goto error;
+    }
+
+    *ret_info = info;
+    info = NULL;
+
+    return 0;
+error:
+    g_free(info);
+    return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1793,7 +1838,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
-    struct vfio_pci_hot_reset_info *info;
+    struct vfio_pci_hot_reset_info *info = NULL;
     struct vfio_pci_dependent_device *devices;
     struct vfio_pci_hot_reset *reset;
     int32_t *fds;
@@ -1805,12 +1850,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
     vfio_pci_pre_reset(vdev);
     vdev->vbasedev.needs_reset = false;
 
-    info = g_malloc0(sizeof(*info));
-    info->argsz = sizeof(*info);
-
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-    if (ret && errno != ENOSPC) {
-        ret = -errno;
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret) {
         if (!vdev->has_pm_reset) {
             error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
                          "no available reset mechanism.", vdev->host.domain,
@@ -1819,18 +1860,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
         goto out_single;
     }
 
-    count = info->count;
-    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
-    info->argsz = sizeof(*info) + (count * sizeof(*devices));
     devices = &info->devices[0];
-
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-    if (ret) {
-        ret = -errno;
-        error_report("vfio: hot reset info failed: %m");
-        goto out_single;
-    }
-
     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
 
     /* Verify that we have all the groups required */
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 02/14] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 01/14] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-17 13:01   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 03/14] pcie: modify the capability size assert Cao jin
                   ` (12 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

squeeze out vfio_pci_do_hot_reset to do host bus reset when AER recovery.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 75 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index efcd3cd..a63cf85 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1699,6 +1699,48 @@ error:
     return ret;
 }
 
+static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev,
+                                 struct vfio_pci_hot_reset_info *info)
+{
+    VFIOGroup *group;
+    struct vfio_pci_hot_reset *reset;
+    int32_t *fds;
+    int ret, i, count;
+    struct vfio_pci_dependent_device *devices;
+
+    /* Determine how many group fds need to be passed */
+    count = 0;
+    devices = &info->devices[0];
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        for (i = 0; i < info->count; i++) {
+            if (group->groupid == devices[i].group_id) {
+                count++;
+                break;
+            }
+        }
+    }
+
+    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+    fds = &reset->group_fds[0];
+
+    /* Fill in group fds */
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        for (i = 0; i < info->count; i++) {
+            if (group->groupid == devices[i].group_id) {
+                fds[reset->count++] = group->fd;
+                break;
+            }
+        }
+    }
+
+    /* Bus reset! */
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+    g_free(reset);
+
+    return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1840,9 +1882,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
     VFIOGroup *group;
     struct vfio_pci_hot_reset_info *info = NULL;
     struct vfio_pci_dependent_device *devices;
-    struct vfio_pci_hot_reset *reset;
-    int32_t *fds;
-    int ret, i, count;
+    int ret, i;
     bool multi = false;
 
     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
@@ -1921,34 +1961,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
         goto out_single;
     }
 
-    /* Determine how many group fds need to be passed */
-    count = 0;
-    QLIST_FOREACH(group, &vfio_group_list, next) {
-        for (i = 0; i < info->count; i++) {
-            if (group->groupid == devices[i].group_id) {
-                count++;
-                break;
-            }
-        }
-    }
-
-    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
-    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
-    fds = &reset->group_fds[0];
-
-    /* Fill in group fds */
-    QLIST_FOREACH(group, &vfio_group_list, next) {
-        for (i = 0; i < info->count; i++) {
-            if (group->groupid == devices[i].group_id) {
-                fds[reset->count++] = group->fd;
-                break;
-            }
-        }
-    }
-
-    /* Bus reset! */
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
-    g_free(reset);
+    ret = vfio_pci_do_hot_reset(vdev, info);
 
     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                     ret ? "%m" : "Success");
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 03/14] pcie: modify the capability size assert
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 01/14] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 02/14] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 04/14] vfio: make the 4 bytes aligned for capability size Cao jin
                   ` (11 subsequent siblings)
  14 siblings, 0 replies; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

 Device's Offset and size can reach PCIE_CONFIG_SPACE_SIZE,
 fix the corresponding assert.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci/pcie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 0eab29d..8f4c0e5 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -607,7 +607,7 @@ void pcie_add_capability(PCIDevice *dev,
 
     assert(offset >= PCI_CONFIG_SPACE_SIZE);
     assert(offset < offset + size);
-    assert(offset + size < PCIE_CONFIG_SPACE_SIZE);
+    assert(offset + size <= PCIE_CONFIG_SPACE_SIZE);
     assert(size >= 8);
     assert(pci_is_express(dev));
 
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 04/14] vfio: make the 4 bytes aligned for capability size
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (2 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 03/14] pcie: modify the capability size assert Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-17 12:30   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 05/14] vfio: add pcie extanded capability support Cao jin
                   ` (10 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

this function search the capability from the end, the last
size should 0x100 - pos, not 0xff - pos.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a63cf85..288f2c7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1469,7 +1469,8 @@ static void vfio_unmap_bars(VFIOPCIDevice *vdev)
  */
 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
 {
-    uint8_t tmp, next = 0xff;
+    uint8_t tmp;
+    uint16_t next = PCI_CONFIG_SPACE_SIZE;
 
     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
          tmp = pdev->config[tmp + 1]) {
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 05/14] vfio: add pcie extanded capability support
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (3 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 04/14] vfio: make the 4 bytes aligned for capability size Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-17 13:22   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 06/14] aer: impove pcie_aer_init to support vfio device Cao jin
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

For vfio pcie device, we could expose the extended capability on
PCIE bus. in order to avoid config space broken, we introduce
a copy config for parsing extended caps. and rebuild the pcie
extended config space.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 288f2c7..64b0867 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1482,6 +1482,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
     return next - pos;
 }
 
+
+static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
+{
+    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
+
+    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
+        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
+        if (tmp > pos && tmp < next) {
+            next = tmp;
+        }
+    }
+
+    return next - pos;
+}
+
 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
 {
     pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
@@ -1817,16 +1832,69 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint32_t header;
+    uint16_t cap_id, next, size;
+    uint8_t cap_ver;
+    uint8_t *config;
+
+    /*
+     * In order to avoid breaking config space, create a copy to
+     * use for parsing extended capabilities.
+     */
+    config = g_memdup(pdev->config, vdev->config_size);
+
+    for (next = PCI_CONFIG_SPACE_SIZE; next;
+         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
+        header = pci_get_long(config + next);
+        cap_id = PCI_EXT_CAP_ID(header);
+        cap_ver = PCI_EXT_CAP_VER(header);
+
+        /*
+         * If it becomes important to configure extended capabilities to their
+         * actual size, use this as the default when it's something we don't
+         * recognize. Since QEMU doesn't actually handle many of the config
+         * accesses, exact size doesn't seem worthwhile.
+         */
+        size = vfio_ext_cap_max_size(config, next);
+
+        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
+
+        /* Use emulated next pointer to allow dropping extended caps */
+        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
+                                   PCI_EXT_CAP_NEXT_MASK);
+    }
+
+    g_free(config);
+    return 0;
+}
+
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
+    int ret;
 
     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
         !pdev->config[PCI_CAPABILITY_LIST]) {
         return 0; /* Nothing to add */
     }
 
-    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+    if (ret) {
+        return ret;
+    }
+
+    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
+    if (!pci_is_express(pdev) ||
+        !pci_bus_is_express(pdev->bus) ||
+        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
+        return 0;
+    }
+
+    return vfio_add_ext_cap(vdev);
 }
 
 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 06/14] aer: impove pcie_aer_init to support vfio device
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (4 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 05/14] vfio: add pcie extanded capability support Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 07/14] vfio: add aer support for " Cao jin
                   ` (8 subsequent siblings)
  14 siblings, 0 replies; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

pcie_aer_init was used to emulate an aer capability for pcie device,
but for vfio device, the aer config space size is mutable and is not
always equal to PCI_ERR_SIZEOF(0x48). it depends on where the TLP Prefix
register required, so here we add a size argument.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci-bridge/ioh3420.c            | 2 +-
 hw/pci-bridge/xio3130_downstream.c | 2 +-
 hw/pci-bridge/xio3130_upstream.c   | 2 +-
 hw/pci/pcie_aer.c                  | 4 ++--
 include/hw/pci/pcie_aer.h          | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/pci-bridge/ioh3420.c b/hw/pci-bridge/ioh3420.c
index cce2fdd..4d9cd3f 100644
--- a/hw/pci-bridge/ioh3420.c
+++ b/hw/pci-bridge/ioh3420.c
@@ -129,7 +129,7 @@ static int ioh3420_initfn(PCIDevice *d)
         goto err_pcie_cap;
     }
     pcie_cap_root_init(d);
-    rc = pcie_aer_init(d, IOH_EP_AER_OFFSET);
+    rc = pcie_aer_init(d, IOH_EP_AER_OFFSET, PCI_ERR_SIZEOF);
     if (rc < 0) {
         goto err;
     }
diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c
index b3a6479..9737041 100644
--- a/hw/pci-bridge/xio3130_downstream.c
+++ b/hw/pci-bridge/xio3130_downstream.c
@@ -92,7 +92,7 @@ static int xio3130_downstream_initfn(PCIDevice *d)
         goto err_pcie_cap;
     }
     pcie_cap_arifwd_init(d);
-    rc = pcie_aer_init(d, XIO3130_AER_OFFSET);
+    rc = pcie_aer_init(d, XIO3130_AER_OFFSET, PCI_ERR_SIZEOF);
     if (rc < 0) {
         goto err;
     }
diff --git a/hw/pci-bridge/xio3130_upstream.c b/hw/pci-bridge/xio3130_upstream.c
index eada582..4d7f894 100644
--- a/hw/pci-bridge/xio3130_upstream.c
+++ b/hw/pci-bridge/xio3130_upstream.c
@@ -81,7 +81,7 @@ static int xio3130_upstream_initfn(PCIDevice *d)
     }
     pcie_cap_flr_init(d);
     pcie_cap_deverr_init(d);
-    rc = pcie_aer_init(d, XIO3130_AER_OFFSET);
+    rc = pcie_aer_init(d, XIO3130_AER_OFFSET, PCI_ERR_SIZEOF);
     if (rc < 0) {
         goto err;
     }
diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
index 98d2c18..45f351b 100644
--- a/hw/pci/pcie_aer.c
+++ b/hw/pci/pcie_aer.c
@@ -94,12 +94,12 @@ static void aer_log_clear_all_err(PCIEAERLog *aer_log)
     aer_log->log_num = 0;
 }
 
-int pcie_aer_init(PCIDevice *dev, uint16_t offset)
+int pcie_aer_init(PCIDevice *dev, uint16_t offset, uint16_t size)
 {
     PCIExpressDevice *exp;
 
     pcie_add_capability(dev, PCI_EXT_CAP_ID_ERR, PCI_ERR_VER,
-                        offset, PCI_ERR_SIZEOF);
+                        offset, size);
     exp = &dev->exp;
     exp->aer_cap = offset;
 
diff --git a/include/hw/pci/pcie_aer.h b/include/hw/pci/pcie_aer.h
index 2fb8388..156acb0 100644
--- a/include/hw/pci/pcie_aer.h
+++ b/include/hw/pci/pcie_aer.h
@@ -87,7 +87,7 @@ struct PCIEAERErr {
 
 extern const VMStateDescription vmstate_pcie_aer_log;
 
-int pcie_aer_init(PCIDevice *dev, uint16_t offset);
+int pcie_aer_init(PCIDevice *dev, uint16_t offset, uint16_t size);
 void pcie_aer_exit(PCIDevice *dev);
 void pcie_aer_write_config(PCIDevice *dev,
                            uint32_t addr, uint32_t val, int len);
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 07/14] vfio: add aer support for vfio device
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (5 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 06/14] aer: impove pcie_aer_init to support vfio device Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-18  9:12   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 08/14] vfio: add check host bus reset is support or not Cao jin
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Calling pcie_aer_init to initilize aer related registers for
vfio device, then reload physical related registers to expose
device capability.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 hw/vfio/pci.h |  3 +++
 2 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 64b0867..38b0aa5 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1832,6 +1832,62 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
+                          int pos, uint16_t size)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    PCIDevice *dev_iter;
+    uint8_t type;
+    uint32_t errcap;
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+        pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR,
+                            cap_ver, pos, size);
+        return 0;
+    }
+
+    dev_iter = pci_bridge_get_device(pdev->bus);
+    if (!dev_iter) {
+        goto error;
+    }
+
+    while (dev_iter) {
+        type = pcie_cap_get_type(dev_iter);
+        if ((type != PCI_EXP_TYPE_ROOT_PORT &&
+             type != PCI_EXP_TYPE_UPSTREAM &&
+             type != PCI_EXP_TYPE_DOWNSTREAM)) {
+            goto error;
+        }
+
+        if (!dev_iter->exp.aer_cap) {
+            goto error;
+        }
+
+        dev_iter = pci_bridge_get_device(dev_iter->bus);
+    }
+
+    errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4);
+    /*
+     * The ability to record multiple headers is depending on
+     * the state of the Multiple Header Recording Capable bit and
+     * enabled by the Multiple Header Recording Enable bit.
+     */
+    if ((errcap & PCI_ERR_CAP_MHRC) &&
+        (errcap & PCI_ERR_CAP_MHRE)) {
+        pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
+    } else {
+        pdev->exp.aer_log.log_max = 0;
+    }
+
+    pcie_cap_deverr_init(pdev);
+    return pcie_aer_init(pdev, pos, size);
+
+error:
+    error_report("vfio: Unable to enable AER for device %s, parent bus "
+                 "does not support AER signaling", vdev->vbasedev.name);
+    return -1;
+}
+
 static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -1839,6 +1895,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
     uint16_t cap_id, next, size;
     uint8_t cap_ver;
     uint8_t *config;
+    int ret = 0;
 
     /*
      * In order to avoid breaking config space, create a copy to
@@ -1860,16 +1917,29 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
          */
         size = vfio_ext_cap_max_size(config, next);
 
-        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
-        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
+        switch (cap_id) {
+        case PCI_EXT_CAP_ID_ERR:
+            ret = vfio_setup_aer(vdev, cap_ver, next, size);
+            break;
+        default:
+            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+            break;
+        }
+
+        if (ret) {
+            goto out;
+        }
+
+        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
 
         /* Use emulated next pointer to allow dropping extended caps */
         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
                                    PCI_EXT_CAP_NEXT_MASK);
     }
 
+out:
     g_free(config);
-    return 0;
+    return ret;
 }
 
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
@@ -2624,6 +2694,11 @@ static int vfio_initfn(PCIDevice *pdev)
         goto out_teardown;
     }
 
+    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        !pdev->exp.aer_cap) {
+        goto out_teardown;
+    }
+
     /* QEMU emulates all of MSI & MSIX */
     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index f004d52..48c1f69 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
 #include "qemu/queue.h"
@@ -127,6 +128,8 @@ typedef struct VFIOPCIDevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
 #define VFIO_FEATURE_ENABLE_REQ_BIT 1
 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
+#define VFIO_FEATURE_ENABLE_AER_BIT 2
+#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT)
     int32_t bootindex;
     uint8_t pm_cap;
     bool has_vga;
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 08/14] vfio: add check host bus reset is support or not
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (6 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 07/14] vfio: add aer support for " Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-18 10:32   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 09/14] add check reset mechanism when hotplug vfio device Cao jin
                   ` (6 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

when init vfio devices done, we should test all the devices supported
aer whether conflict with others. For each one, get the hot reset
info for the affected device list.  For each affected device, all
should attach to the VM and on/below the same bus. also, we should test
all of the non-AER supporting vfio-pci devices on or below the target
bus to verify they have a reset mechanism.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 hw/vfio/pci.h |   1 +
 2 files changed, 232 insertions(+), 7 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 38b0aa5..16ab0e3 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1832,6 +1832,218 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static bool vfio_pci_host_slot_match(PCIHostDeviceAddress *host1,
+                                     PCIHostDeviceAddress *host2)
+{
+    return (host1->domain == host2->domain && host1->bus == host2->bus &&
+            host1->slot == host2->slot);
+}
+
+static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
+                                PCIHostDeviceAddress *host2)
+{
+    return (vfio_pci_host_slot_match(host1, host2) &&
+            host1->function == host2->function);
+}
+
+struct VFIODeviceFind {
+    PCIDevice *pdev;
+    bool found;
+};
+
+static void vfio_check_device_noreset(PCIBus *bus, PCIDevice *pdev,
+                                      void *opaque)
+{
+    DeviceState *dev = DEVICE(pdev);
+    DeviceClass *dc = DEVICE_GET_CLASS(dev);
+    VFIOPCIDevice *vdev;
+    struct VFIODeviceFind *find = opaque;
+
+    if (find->found) {
+        return;
+    }
+
+    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
+        if (!dc->reset) {
+            goto found;
+        }
+        return;
+    }
+    vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        !vdev->vbasedev.reset_works) {
+        goto found;
+    }
+
+    return;
+found:
+    find->pdev = pdev;
+    find->found = true;
+}
+
+static void device_find(PCIBus *bus, PCIDevice *pdev, void *opaque)
+{
+    struct VFIODeviceFind *find = opaque;
+
+    if (find->found) {
+        return;
+    }
+
+    if (pdev == find->pdev) {
+        find->found = true;
+    }
+}
+
+static int vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
+{
+    PCIBus *bus = vdev->pdev.bus;
+    struct vfio_pci_hot_reset_info *info = NULL;
+    struct vfio_pci_dependent_device *devices;
+    VFIOGroup *group;
+    struct VFIODeviceFind find;
+    int ret, i;
+
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret) {
+        error_report("vfio: Cannot enable AER for device %s,"
+                     " device does not support hot reset.",
+                     vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* List all affected devices by bus reset */
+    devices = &info->devices[0];
+
+    /* Verify that we have all the groups required */
+    for (i = 0; i < info->count; i++) {
+        PCIHostDeviceAddress host;
+        VFIOPCIDevice *tmp;
+        VFIODevice *vbasedev_iter;
+        bool found = false;
+
+        host.domain = devices[i].segment;
+        host.bus = devices[i].bus;
+        host.slot = PCI_SLOT(devices[i].devfn);
+        host.function = PCI_FUNC(devices[i].devfn);
+
+        /* Skip the current device */
+        if (vfio_pci_host_match(&host, &vdev->host)) {
+            continue;
+        }
+
+        /* Ensure we own the group of the affected device */
+        QLIST_FOREACH(group, &vfio_group_list, next) {
+            if (group->groupid == devices[i].group_id) {
+                break;
+            }
+        }
+
+        if (!group) {
+            error_report("vfio: Cannot enable AER for device %s, "
+                         "depends on group %d which is not owned.",
+                         vdev->vbasedev.name, devices[i].group_id);
+            ret = -1;
+            goto out;
+        }
+
+        /* Ensure affected devices for reset on/blow the bus */
+        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+            if (vfio_pci_host_match(&host, &tmp->host)) {
+                PCIDevice *pci = PCI_DEVICE(tmp);
+
+                /*
+                 * AER errors may be broadcast to all functions of a multi-
+                 * function endpoint.  If any of those sibling functions are
+                 * also assigned, they need to have AER enabled or else an
+                 * error may continue to cause a vm_stop condition.  IOW,
+                 * AER setup of this function would be pointless.
+                 */
+                if (vfio_pci_host_slot_match(&vdev->host, &tmp->host) &&
+                    !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
+                    error_report("vfio: Cannot enable AER for device %s, on same slot"
+                                 " the dependent device %s which does not enable AER.",
+                                 vdev->vbasedev.name, tmp->vbasedev.name);
+                    ret = -1;
+                    goto out;
+                }
+
+                find.pdev = pci;
+                find.found = false;
+                pci_for_each_device(bus, pci_bus_num(bus),
+                                    device_find, &find);
+                if (!find.found) {
+                    error_report("vfio: Cannot enable AER for device %s, "
+                                 "the dependent device %s is not under the same bus",
+                                 vdev->vbasedev.name, tmp->vbasedev.name);
+                    ret = -1;
+                    goto out;
+                }
+                found = true;
+                break;
+            }
+        }
+
+        /* Ensure all affected devices assigned to VM */
+        if (!found) {
+            error_report("vfio: Cannot enable AER for device %s, "
+                         "the dependent device %04x:%02x:%02x.%x "
+                         "is not assigned to VM.",
+                         vdev->vbasedev.name, host.domain, host.bus,
+                         host.slot, host.function);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /*
+     * Check the all pci devices on or below the target bus
+     * have a reset mechanism at least.
+     */
+    find.pdev = NULL;
+    find.found = false;
+    pci_for_each_device(bus, pci_bus_num(bus),
+                        vfio_check_device_noreset, &find);
+    if (find.found) {
+        error_report("vfio: Cannot enable AER for device %s, "
+                     "the affected device %s does not have a reset mechanism.",
+                     vdev->vbasedev.name, find.pdev->name);
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    g_free(info);
+    return ret;
+}
+
+static int vfio_check_devices_host_bus_reset(void)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+    VFIOPCIDevice *vdev;
+
+    /* Check All vfio-pci devices if have bus reset capability */
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+                vfio_check_host_bus_reset(vdev)) {
+                return -1;
+            }
+        }
+    }
+
+    return 0;
+}
+
 static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
                           int pos, uint16_t size)
 {
@@ -2009,13 +2221,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     vfio_intx_enable(vdev);
 }
 
-static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
-                                PCIHostDeviceAddress *host2)
-{
-    return (host1->domain == host2->domain && host1->bus == host2->bus &&
-            host1->slot == host2->slot && host1->function == host2->function);
-}
-
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
@@ -2521,6 +2726,20 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused)
+{
+    int ret;
+
+    ret = vfio_check_devices_host_bus_reset();
+    if (ret) {
+        exit(1);
+    }
+}
+
+static Notifier machine_notifier = {
+    .notify = vfio_pci_machine_done_notify,
+};
+
 static int vfio_initfn(PCIDevice *pdev)
 {
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
@@ -2867,6 +3086,11 @@ static const TypeInfo vfio_pci_dev_info = {
 static void register_vfio_pci_dev_type(void)
 {
     type_register_static(&vfio_pci_dev_info);
+    /*
+     * Register notifier when machine init is done, since we need
+     * check the configration manner after all vfio device are inited.
+     */
+    qemu_add_machine_init_done_notifier(&machine_notifier);
 }
 
 type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 48c1f69..59ae194 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 09/14] add check reset mechanism when hotplug vfio device
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (7 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 08/14] vfio: add check host bus reset is support or not Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-18 11:03   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 10/14] pci: introduce pci bus pre reset Cao jin
                   ` (5 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Since we support multi-function hotplug. the function 0 indicate
the closure of the slot, so we have the chance to do the check.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/pci/pci.c             | 29 +++++++++++++++++++++++++++++
 hw/vfio/pci.c            | 19 +++++++++++++++++++
 hw/vfio/pci.h            |  2 ++
 include/hw/pci/pci_bus.h |  5 +++++
 4 files changed, 55 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 168b9cc..f6ca6ef 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -81,6 +81,7 @@ static void pci_bus_realize(BusState *qbus, Error **errp)
     PCIBus *bus = PCI_BUS(qbus);
 
     vmstate_register(NULL, -1, &vmstate_pcibus, bus);
+    notifier_with_return_list_init(&bus->hotplug_notifiers);
 }
 
 static void pci_bus_unrealize(BusState *qbus, Error **errp)
@@ -1835,6 +1836,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
     return bus->devices[devfn];
 }
 
+void pci_bus_add_hotplug_notifier(PCIBus *bus, NotifierWithReturn *notify)
+{
+    notifier_with_return_list_add(&bus->hotplug_notifiers, notify);
+}
+
+void pci_bus_remove_hotplug_notifier(NotifierWithReturn *notifier)
+{
+    notifier_with_return_remove(notifier);
+}
+
+static int pci_bus_hotplug_notifier(PCIBus *bus, void *opaque)
+{
+    return notifier_with_return_list_notify(&bus->hotplug_notifiers,
+                                            opaque);
+}
+
 static void pci_qdev_realize(DeviceState *qdev, Error **errp)
 {
     PCIDevice *pci_dev = (PCIDevice *)qdev;
@@ -1877,6 +1894,18 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
         pci_qdev_unrealize(DEVICE(pci_dev), NULL);
         return;
     }
+
+    /*
+     *  If the function is func 0, indicate the closure of the slot.
+     *  signal the callback.
+     */
+    if (DEVICE(pci_dev)->hotplugged &&
+        pci_get_function_0(pci_dev) == pci_dev &&
+        pci_bus_hotplug_notifier(bus, pci_dev)) {
+        error_setg(errp, "failed to hotplug function 0");
+        pci_qdev_unrealize(DEVICE(pci_dev), NULL);
+        return;
+    }
 }
 
 static void pci_default_realize(PCIDevice *dev, Error **errp)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 16ab0e3..ff25c9b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2044,6 +2044,19 @@ static int vfio_check_devices_host_bus_reset(void)
     return 0;
 }
 
+static int vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
+{
+    VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, hotplug_notifier);
+    PCIDevice *pci_dev = PCI_DEVICE(vdev);
+    PCIDevice *pci_func0 = opaque;
+
+    if (pci_get_function_0(pci_dev) != pci_func0) {
+        return 0;
+    }
+
+    return vfio_check_host_bus_reset(vdev);
+}
+
 static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
                           int pos, uint16_t size)
 {
@@ -2091,6 +2104,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
         pdev->exp.aer_log.log_max = 0;
     }
 
+    vdev->hotplug_notifier.notify = vfio_check_bus_reset;
+    pci_bus_add_hotplug_notifier(pdev->bus, &vdev->hotplug_notifier);
+
     pcie_cap_deverr_init(pdev);
     return pcie_aer_init(pdev, pos, size);
 
@@ -2972,6 +2988,9 @@ static void vfio_exitfn(PCIDevice *pdev)
     vfio_unregister_req_notifier(vdev);
     vfio_unregister_err_notifier(vdev);
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+    if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+        pci_bus_remove_hotplug_notifier(&vdev->hotplug_notifier);
+    }
     vfio_disable_interrupts(vdev);
     if (vdev->intx.mmap_timer) {
         timer_free(vdev->intx.mmap_timer);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 59ae194..b385f07 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -142,6 +142,8 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_intx;
     bool no_kvm_msi;
     bool no_kvm_msix;
+
+    NotifierWithReturn hotplug_notifier;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
index 403fec6..7812fa9 100644
--- a/include/hw/pci/pci_bus.h
+++ b/include/hw/pci/pci_bus.h
@@ -39,8 +39,13 @@ struct PCIBus {
        Keep a count of the number of devices with raised IRQs.  */
     int nirq;
     int *irq_count;
+
+    NotifierWithReturnList hotplug_notifiers;
 };
 
+void pci_bus_add_hotplug_notifier(PCIBus *bus, NotifierWithReturn *notify);
+void pci_bus_remove_hotplug_notifier(NotifierWithReturn *notify);
+
 typedef struct PCIBridgeWindows PCIBridgeWindows;
 
 /*
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 10/14] pci: introduce pci bus pre reset
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (8 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 09/14] add check reset mechanism when hotplug vfio device Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-14 20:36   ` Alex Williamson
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 11/14] vfio: introduce last reset sequence id Cao jin
                   ` (4 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

avoid repeat bus reset, here introduce a sequence ID for each time
bus hot reset, so each vfio device could know whether they've already
been reset for that sequence ID.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/pci/pci.c             | 13 +++++++++++++
 hw/pci/pci_bridge.c      |  3 +++
 include/hw/pci/pci.h     |  1 +
 include/hw/pci/pci_bus.h |  3 +++
 4 files changed, 20 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index f6ca6ef..ceb72d5 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -91,6 +91,18 @@ static void pci_bus_unrealize(BusState *qbus, Error **errp)
     vmstate_unregister(NULL, &vmstate_pcibus, bus);
 }
 
+void pci_bus_pre_reset(PCIBus *bus, uint32_t seqid)
+{
+    PCIBus *sec;
+
+    bus->in_reset = true;
+    bus->reset_seqid = seqid;
+
+    QLIST_FOREACH(sec, &bus->child, sibling) {
+        pci_bus_pre_reset(sec, seqid);
+    }
+}
+
 static bool pcibus_is_root(PCIBus *bus)
 {
     return !bus->parent_dev;
@@ -276,6 +288,7 @@ static void pcibus_reset(BusState *qbus)
     for (i = 0; i < bus->nirq; i++) {
         assert(bus->irq_count[i] == 0);
     }
+    bus->in_reset = false;
 }
 
 static void pci_host_bus_register(PCIBus *bus, DeviceState *parent)
diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 40c97b1..c7f15a1 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -268,6 +268,9 @@ void pci_bridge_write_config(PCIDevice *d,
     newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
     if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
         /* Trigger hot reset on 0->1 transition. */
+        uint32_t seqid = s->sec_bus.reset_seqid++;
+
+        pci_bus_pre_reset(&s->sec_bus, seqid ? seqid : 1);
         qbus_reset_all(&s->sec_bus.qbus);
     }
 }
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 379b6e1..b811279 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -381,6 +381,7 @@ void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
 void pci_device_set_intx_routing_notifier(PCIDevice *dev,
                                           PCIINTxRoutingNotifier notifier);
 void pci_device_reset(PCIDevice *dev);
+void pci_bus_pre_reset(PCIBus *bus, uint32_t seqid);
 
 PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus *rootbus,
                                const char *default_model,
diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
index 7812fa9..dd6aaf1 100644
--- a/include/hw/pci/pci_bus.h
+++ b/include/hw/pci/pci_bus.h
@@ -40,6 +40,9 @@ struct PCIBus {
     int nirq;
     int *irq_count;
 
+    bool in_reset;
+    uint32_t reset_seqid;
+
     NotifierWithReturnList hotplug_notifiers;
 };
 
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 11/14] vfio: introduce last reset sequence id
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (9 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 10/14] pci: introduce pci bus pre reset Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-14 20:43   ` Alex Williamson
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 12/14] pcie_aer: expose pcie_aer_msg() interface Cao jin
                   ` (3 subsequent siblings)
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

avoid multi-reset host bus, we introduce sequence id to secify which
bus is resetting. and if one of the dependent devices has been do reset.
the others should skip.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 15 +++++++++++++++
 hw/vfio/pci.h |  3 +++
 2 files changed, 18 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ff25c9b..da4815e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1914,6 +1914,8 @@ static int vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
     /* List all affected devices by bus reset */
     devices = &info->devices[0];
 
+    vdev->single_depend_dev = (info->count == 1);
+
     /* Verify that we have all the groups required */
     for (i = 0; i < info->count; i++) {
         PCIHostDeviceAddress host;
@@ -2249,6 +2251,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 
     vfio_pci_pre_reset(vdev);
     vdev->vbasedev.needs_reset = false;
+    vdev->last_bus_reset_seqid = 0;
 
     ret = vfio_get_hot_reset_info(vdev, &info);
     if (ret) {
@@ -2310,6 +2313,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
                 }
                 vfio_pci_pre_reset(tmp);
                 tmp->vbasedev.needs_reset = false;
+                tmp->last_bus_reset_seqid = vdev->pdev.bus->reset_seqid;
                 multi = true;
                 break;
             }
@@ -3006,6 +3010,17 @@ static void vfio_pci_reset(DeviceState *dev)
 
     trace_vfio_pci_reset(vdev->vbasedev.name);
 
+    /* if need to do a hot reset */
+    if (pdev->bus->in_reset) {
+        if (vdev->last_bus_reset_seqid == pdev->bus->reset_seqid) {
+            vdev->last_bus_reset_seqid = 0;
+            return;
+        } else if ((vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+            vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+            return;
+        }
+    }
+
     vfio_pci_pre_reset(vdev);
 
     if (vdev->resetfn && !vdev->resetfn(vdev)) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index b385f07..728cbb8 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -143,6 +143,9 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_msi;
     bool no_kvm_msix;
 
+    bool single_depend_dev;
+    uint32_t last_bus_reset_seqid;
+
     NotifierWithReturn hotplug_notifier;
 } VFIOPCIDevice;
 
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 12/14] pcie_aer: expose pcie_aer_msg() interface
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (10 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 11/14] vfio: introduce last reset sequence id Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 13/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (2 subsequent siblings)
  14 siblings, 0 replies; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

For vfio device, we need to propagate the aer error to
Guest OS. we use the pcie_aer_msg() to send aer error
to guest.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci/pcie_aer.c         | 2 +-
 include/hw/pci/pcie_aer.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
index 45f351b..fbbd7d2 100644
--- a/hw/pci/pcie_aer.c
+++ b/hw/pci/pcie_aer.c
@@ -370,7 +370,7 @@ static void pcie_aer_msg_root_port(PCIDevice *dev, const PCIEAERMsg *msg)
  *
  * Walk up the bus tree from the device, propagate the error message.
  */
-static void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg)
+void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg)
 {
     uint8_t type;
 
diff --git a/include/hw/pci/pcie_aer.h b/include/hw/pci/pcie_aer.h
index 156acb0..c2ee4e2 100644
--- a/include/hw/pci/pcie_aer.h
+++ b/include/hw/pci/pcie_aer.h
@@ -102,5 +102,6 @@ void pcie_aer_root_write_config(PCIDevice *dev,
 
 /* error injection */
 int pcie_aer_inject_error(PCIDevice *dev, const PCIEAERErr *err);
+void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg);
 
 #endif /* QEMU_PCIE_AER_H */
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 13/14] vfio-pci: pass the aer error to guest
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (11 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 12/14] pcie_aer: expose pcie_aer_msg() interface Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-18 10:45   ` Marcel Apfelbaum
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 14/14] vfio: add 'aer' property to expose aercap Cao jin
  2016-01-16 18:34 ` [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Michael S. Tsirkin
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

when the vfio device encounters an uncorrectable error in host,
the vfio_pci driver will signal the eventfd registered by this
vfio device, the results in the qemu eventfd handler getting
invoked.

this patch is to pass the error to guest and have the guest driver
recover from the error.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index da4815e..efa5e01 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2553,18 +2553,59 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    PCIEAERMsg msg = {
+        .severity = 0,
+        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+    };
 
     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
         return;
     }
 
     /*
-     * TBD. Retrieve the error details and decide what action
-     * needs to be taken. One of the actions could be to pass
-     * the error to the guest and have the guest driver recover
-     * from the error. This requires that PCIe capabilities be
-     * exposed to the guest. For now, we just terminate the
-     * guest to contain the error.
+     * in case the real hardware configration has been changed,
+     * here we should recheck the bus reset capability.
+     */
+    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        vfio_check_host_bus_reset(vdev)) {
+        goto stop;
+    }
+    /*
+     * we should read the error details from the real hardware
+     * configuration spaces, here we only need to do is signaling
+     * to guest an uncorrectable error has occurred.
+     */
+    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+        dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+        /*
+         * if we receive the error signal but not this device, we can
+         * just ignore it.
+         */
+        if (!(uncor_status & ~0UL)) {
+            return;
+        }
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+        pcie_aer_msg(dev, &msg);
+        return;
+    }
+
+stop:
+    /*
+     * If the aer capability is not exposed to the guest. we just
+     * terminate the guest to contain the error.
      */
 
     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH v16 14/14] vfio: add 'aer' property to expose aercap
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (12 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 13/14] vfio-pci: pass the aer error to guest Cao jin
@ 2016-01-12  2:43 ` Cao jin
  2016-01-18 10:46   ` Marcel Apfelbaum
  2016-01-16 18:34 ` [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Michael S. Tsirkin
  14 siblings, 1 reply; 42+ messages in thread
From: Cao jin @ 2016-01-12  2:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

add 'aer' property to let user able to decide whether expose
the aer capability. by default we should disable aer feature,
because it needs configuration restrictions.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index efa5e01..9afb5e0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3119,6 +3119,8 @@ static Property vfio_pci_dev_properties[] = {
                        sub_vendor_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                        sub_device_id, PCI_ANY_ID),
+    DEFINE_PROP_BIT("aer", VFIOPCIDevice, features,
+                    VFIO_FEATURE_ENABLE_AER_BIT, false),
     /*
      * TODO - support passed fds... is this necessary?
      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 10/14] pci: introduce pci bus pre reset
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 10/14] pci: introduce pci bus pre reset Cao jin
@ 2016-01-14 20:36   ` Alex Williamson
  2016-01-19 10:15     ` Chen Fan
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2016-01-14 20:36 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, mst

On Tue, 2016-01-12 at 10:43 +0800, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> avoid repeat bus reset, here introduce a sequence ID for each time
> bus hot reset, so each vfio device could know whether they've already
> been reset for that sequence ID.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/pci/pci.c             | 13 +++++++++++++
>  hw/pci/pci_bridge.c      |  3 +++
>  include/hw/pci/pci.h     |  1 +
>  include/hw/pci/pci_bus.h |  3 +++
>  4 files changed, 20 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index f6ca6ef..ceb72d5 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -91,6 +91,18 @@ static void pci_bus_unrealize(BusState *qbus,
> Error **errp)
>      vmstate_unregister(NULL, &vmstate_pcibus, bus);
>  }
>  
> +void pci_bus_pre_reset(PCIBus *bus, uint32_t seqid)
> +{
> +    PCIBus *sec;
> +
> +    bus->in_reset = true;
> +    bus->reset_seqid = seqid;
> +
> +    QLIST_FOREACH(sec, &bus->child, sibling) {
> +        pci_bus_pre_reset(sec, seqid);
> +    }
> +}
> +
>  static bool pcibus_is_root(PCIBus *bus)
>  {
>      return !bus->parent_dev;
> @@ -276,6 +288,7 @@ static void pcibus_reset(BusState *qbus)
>      for (i = 0; i < bus->nirq; i++) {
>          assert(bus->irq_count[i] == 0);
>      }
> +    bus->in_reset = false;
>  }
>  
>  static void pci_host_bus_register(PCIBus *bus, DeviceState *parent)
> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> index 40c97b1..c7f15a1 100644
> --- a/hw/pci/pci_bridge.c
> +++ b/hw/pci/pci_bridge.c
> @@ -268,6 +268,9 @@ void pci_bridge_write_config(PCIDevice *d,
>      newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>      if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
>          /* Trigger hot reset on 0->1 transition. */
> +        uint32_t seqid = s->sec_bus.reset_seqid++;

Doesn't this need to come from a global sequence ID?  Imagine the case
of a nested bus, the leaf bus is reset incrementing the sequence ID.
The devices on that bus store that sequence ID as they're reset.  The
parent bus is then reset, but all the devices on the leaf bus have
already been reset for that sequence ID and ignore the reset.

> +
> +        pci_bus_pre_reset(&s->sec_bus, seqid ? seqid : 1);

Does this work?  Seems like this would make devices ignore the second
bus reset after the VM is instantiated.  ie.  the first bus reset seqid
is 0, so we call pre_reset with 1, the second time we call it with 1
again.

>          qbus_reset_all(&s->sec_bus.qbus);

I'd be tempted to call qbus_walk_children() directly, it already has a
pre_busfn callback hook.

>      }
>  }
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 379b6e1..b811279 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -381,6 +381,7 @@ void pci_bus_fire_intx_routing_notifier(PCIBus
> *bus);
>  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>                                            PCIINTxRoutingNotifier
> notifier);
>  void pci_device_reset(PCIDevice *dev);
> +void pci_bus_pre_reset(PCIBus *bus, uint32_t seqid);
>  
>  PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus *rootbus,
>                                 const char *default_model,
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 7812fa9..dd6aaf1 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -40,6 +40,9 @@ struct PCIBus {
>      int nirq;
>      int *irq_count;
>  
> +    bool in_reset;
> +    uint32_t reset_seqid;
> +
>      NotifierWithReturnList hotplug_notifiers;
>  };
>  

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 11/14] vfio: introduce last reset sequence id
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 11/14] vfio: introduce last reset sequence id Cao jin
@ 2016-01-14 20:43   ` Alex Williamson
  0 siblings, 0 replies; 42+ messages in thread
From: Alex Williamson @ 2016-01-14 20:43 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, mst

On Tue, 2016-01-12 at 10:43 +0800, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> avoid multi-reset host bus, we introduce sequence id to secify which
> bus is resetting. and if one of the dependent devices has been do reset.
> the others should skip.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c | 15 +++++++++++++++
>  hw/vfio/pci.h |  3 +++
>  2 files changed, 18 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ff25c9b..da4815e 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1914,6 +1914,8 @@ static int vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>      /* List all affected devices by bus reset */
>      devices = &info->devices[0];
>  
> +    vdev->single_depend_dev = (info->count == 1);
> +
>      /* Verify that we have all the groups required */
>      for (i = 0; i < info->count; i++) {
>          PCIHostDeviceAddress host;
> @@ -2249,6 +2251,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>  
>      vfio_pci_pre_reset(vdev);
>      vdev->vbasedev.needs_reset = false;
> +    vdev->last_bus_reset_seqid = 0;
>  
>      ret = vfio_get_hot_reset_info(vdev, &info);
>      if (ret) {
> @@ -2310,6 +2313,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>                  }
>                  vfio_pci_pre_reset(tmp);
>                  tmp->vbasedev.needs_reset = false;
> +                tmp->last_bus_reset_seqid = vdev->pdev.bus->reset_seqid;
>                  multi = true;
>                  break;
>              }
> @@ -3006,6 +3010,17 @@ static void vfio_pci_reset(DeviceState *dev)
>  
>      trace_vfio_pci_reset(vdev->vbasedev.name);
>  
> +    /* if need to do a hot reset */
> +    if (pdev->bus->in_reset) {
> +        if (vdev->last_bus_reset_seqid == pdev->bus->reset_seqid) {
> +            vdev->last_bus_reset_seqid = 0;
> +            return;

Why would we ever clear this?  You're assuming there will only ever be
one reset callback per device per bus reset.  But we really don't need
to make that assumption.

> +        } else if ((vdev->features & VFIO_FEATURE_ENABLE_AER)) {
> +            vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +            return;
> +        }
> +    }
> +
>      vfio_pci_pre_reset(vdev);
>  
>      if (vdev->resetfn && !vdev->resetfn(vdev)) {
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index b385f07..728cbb8 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -143,6 +143,9 @@ typedef struct VFIOPCIDevice {
>      bool no_kvm_msi;
>      bool no_kvm_msix;
>  
> +    bool single_depend_dev;
> +    uint32_t last_bus_reset_seqid;
> +
>      NotifierWithReturn hotplug_notifier;
>  } VFIOPCIDevice;
>  

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
                   ` (13 preceding siblings ...)
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 14/14] vfio: add 'aer' property to expose aercap Cao jin
@ 2016-01-16 18:34 ` Michael S. Tsirkin
  2016-01-19  9:09   ` Chen Fan
  2016-02-03  8:54   ` Chen Fan
  14 siblings, 2 replies; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-01-16 18:34 UTC (permalink / raw)
  To: Cao jin; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, qemu-devel

On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> For now, for vfio pci passthough devices when qemu receives
> an error from host aer report, currentlly just terminate the guest,
> but usually user want to know what error occurred but stopping the
> guest, so this patches add aer capability support for vfio device,
> and pass the error to guest, and have guest driver to recover
> from the error.

I would like to see a version of this patchset that doesn't
depend on pci core changes.
I think that if you make this simplifying assumption:

- all devices on same bus in guest are on same bus in host

then you can handle both reset and hotplug simply in function 0
since it will belong to vfio.

So we can have a version without pci core changes that simply assumes
this, and things will just work.


Now, if we wanted to enforce this limitation, I think the
cleanest way would be to add a callback in struct PCIDevice:

	bool is_valid_function(PCIDevice *newfunction)

and call it as each function is added.
This way aer function can validate that each function
added shares the same bus.
And this way issues will be detected directly and not when
function 0 is added.

I would prefer this validation code to be a patch on top so we can merge
the functionality directly and avoid blocking it while we figure out the
best api to validate things.

I don't see why making guest topology match host would
ever be a problem, but if it's required to support
configurations where these differ, I'd like to see
an attempt to address that be split out, after aer
is supported.



> v15-v16:
>    10/14, 11/14 are new to introduce a reset sequence id to specify the
>    vfio devices has been reset for that reset. other patches aren't modified.
> 
> v14-v15:
>    1. add device hot reset callback
>    2. add bus_in_reset for vfio device to avoid multi do host bus reset
> 
> v13-v14:
>    1. for multifunction device, requiring all functions enable AER.(9/13)
>    2. due to all affected functions receive error signal, ignore no
>       error occurred function. (12/13)
> 
> v12-v13:
>    1. since support multifuncion hotplug, here add callback to enable aer.
>    2. add pci device pre+post reset for aer host reset.
> 
> Chen Fan (14):
>   vfio: extract vfio_get_hot_reset_info as a single function
>   vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
>   pcie: modify the capability size assert
>   vfio: make the 4 bytes aligned for capability size
>   vfio: add pcie extanded capability support
>   aer: impove pcie_aer_init to support vfio device
>   vfio: add aer support for vfio device
>   vfio: add check host bus reset is support or not
>   add check reset mechanism when hotplug vfio device
>   pci: introduce pci bus pre reset
>   vfio: introduce last reset sequence id
>   pcie_aer: expose pcie_aer_msg() interface
>   vfio-pci: pass the aer error to guest
>   vfio: add 'aer' property to expose aercap
> 
>  hw/pci-bridge/ioh3420.c            |   2 +-
>  hw/pci-bridge/xio3130_downstream.c |   2 +-
>  hw/pci-bridge/xio3130_upstream.c   |   2 +-
>  hw/pci/pci.c                       |  42 +++
>  hw/pci/pci_bridge.c                |   3 +
>  hw/pci/pcie.c                      |   2 +-
>  hw/pci/pcie_aer.c                  |   6 +-
>  hw/vfio/pci.c                      | 616 +++++++++++++++++++++++++++++++++----
>  hw/vfio/pci.h                      |   9 +
>  include/hw/pci/pci.h               |   1 +
>  include/hw/pci/pci_bus.h           |   8 +
>  include/hw/pci/pcie_aer.h          |   3 +-
>  12 files changed, 624 insertions(+), 72 deletions(-)
> 
> -- 
> 1.9.3
> 
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 04/14] vfio: make the 4 bytes aligned for capability size
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 04/14] vfio: make the 4 bytes aligned for capability size Cao jin
@ 2016-01-17 12:30   ` Marcel Apfelbaum
  0 siblings, 0 replies; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-17 12:30 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> this function search the capability from the end, the last
> size should 0x100 - pos, not 0xff - pos.

Indeed, "next" should be the first address of the next capability.


Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>

>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a63cf85..288f2c7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1469,7 +1469,8 @@ static void vfio_unmap_bars(VFIOPCIDevice *vdev)
>    */
>   static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
>   {
> -    uint8_t tmp, next = 0xff;
> +    uint8_t tmp;
> +    uint16_t next = PCI_CONFIG_SPACE_SIZE;
>
>       for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
>            tmp = pdev->config[tmp + 1]) {
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 01/14] vfio: extract vfio_get_hot_reset_info as a single function
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 01/14] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
@ 2016-01-17 12:48   ` Marcel Apfelbaum
  0 siblings, 0 replies; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-17 12:48 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> the function is used to get affected devices by bus reset.
> so here extract it, and can used for aer soon.
>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 66 +++++++++++++++++++++++++++++++++++++++++++----------------
>   1 file changed, 48 insertions(+), 18 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 1fb868c..efcd3cd 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1654,6 +1654,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
>       }
>   }
>
> +/*
> + * return negative with errno, return 0 on success.
> + * if success, the point of ret_info fill with the affected device reset info.
> + *
> + */
> +static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev,
> +                                   struct vfio_pci_hot_reset_info **ret_info)
> +{
> +    struct vfio_pci_hot_reset_info *info;
> +    int ret, count;
> +
> +    *ret_info = NULL;
> +
> +    info = g_malloc0(sizeof(*info));
> +    info->argsz = sizeof(*info);
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
> +    if (ret && errno != ENOSPC) {
> +        ret = -errno;
> +        goto error;
> +    }
> +
> +    count = info->count;
> +
> +    info = g_realloc(info, sizeof(*info) +
> +                     (count * sizeof(struct vfio_pci_dependent_device)));
> +    info->argsz = sizeof(*info) +
> +                  (count * sizeof(struct vfio_pci_dependent_device));
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
> +    if (ret) {
> +        ret = -errno;
> +        error_report("vfio: hot reset info failed: %m");
> +        goto error;
> +    }
> +
> +    *ret_info = info;
> +    info = NULL;
> +
> +    return 0;
> +error:
> +    g_free(info);
> +    return ret;
> +}
> +
>   static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>   {
>       PCIDevice *pdev = &vdev->pdev;
> @@ -1793,7 +1838,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
>   static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>   {
>       VFIOGroup *group;
> -    struct vfio_pci_hot_reset_info *info;
> +    struct vfio_pci_hot_reset_info *info = NULL;
>       struct vfio_pci_dependent_device *devices;
>       struct vfio_pci_hot_reset *reset;
>       int32_t *fds;
> @@ -1805,12 +1850,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>       vfio_pci_pre_reset(vdev);
>       vdev->vbasedev.needs_reset = false;
>
> -    info = g_malloc0(sizeof(*info));
> -    info->argsz = sizeof(*info);
> -
> -    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
> -    if (ret && errno != ENOSPC) {
> -        ret = -errno;
> +    ret = vfio_get_hot_reset_info(vdev, &info);
> +    if (ret) {
>           if (!vdev->has_pm_reset) {
>               error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
>                            "no available reset mechanism.", vdev->host.domain,


Hi,

I don't know how important this is, however if the second call to ioctl
fails (int the new vfio_get_hot_reset_info function) we will get both error
messages, the last one "no available reset mechanism"  being unnecessary/(wrong?).

You may want to move the error message to the new function (again, not sure
if it worth doing it)

Thanks,
Marcel

> @@ -1819,18 +1860,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>           goto out_single;
>       }
>
> -    count = info->count;
> -    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
> -    info->argsz = sizeof(*info) + (count * sizeof(*devices));
>       devices = &info->devices[0];
> -
> -    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
> -    if (ret) {
> -        ret = -errno;
> -        error_report("vfio: hot reset info failed: %m");
> -        goto out_single;
> -    }
> -
>       trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
>
>       /* Verify that we have all the groups required */
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 02/14] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 02/14] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
@ 2016-01-17 13:01   ` Marcel Apfelbaum
  0 siblings, 0 replies; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-17 13:01 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> squeeze out vfio_pci_do_hot_reset to do host bus reset when AER recovery.
>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 75 +++++++++++++++++++++++++++++++++++------------------------
>   1 file changed, 44 insertions(+), 31 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index efcd3cd..a63cf85 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1699,6 +1699,48 @@ error:
>       return ret;
>   }
>
> +static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev,
> +                                 struct vfio_pci_hot_reset_info *info)
> +{
> +    VFIOGroup *group;
> +    struct vfio_pci_hot_reset *reset;
> +    int32_t *fds;
> +    int ret, i, count;
> +    struct vfio_pci_dependent_device *devices;
> +
> +    /* Determine how many group fds need to be passed */
> +    count = 0;
> +    devices = &info->devices[0];
> +    QLIST_FOREACH(group, &vfio_group_list, next) {
> +        for (i = 0; i < info->count; i++) {
> +            if (group->groupid == devices[i].group_id) {
> +                count++;
> +                break;
> +            }
> +        }
> +    }
> +
> +    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
> +    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
> +    fds = &reset->group_fds[0];
> +
> +    /* Fill in group fds */
> +    QLIST_FOREACH(group, &vfio_group_list, next) {
> +        for (i = 0; i < info->count; i++) {
> +            if (group->groupid == devices[i].group_id) {
> +                fds[reset->count++] = group->fd;
> +                break;
> +            }
> +        }
> +    }
> +
> +    /* Bus reset! */
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
> +    g_free(reset);
> +
> +    return ret;
> +}
> +
>   static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>   {
>       PCIDevice *pdev = &vdev->pdev;
> @@ -1840,9 +1882,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>       VFIOGroup *group;
>       struct vfio_pci_hot_reset_info *info = NULL;
>       struct vfio_pci_dependent_device *devices;
> -    struct vfio_pci_hot_reset *reset;
> -    int32_t *fds;
> -    int ret, i, count;
> +    int ret, i;
>       bool multi = false;
>
>       trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
> @@ -1921,34 +1961,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>           goto out_single;
>       }
>
> -    /* Determine how many group fds need to be passed */
> -    count = 0;
> -    QLIST_FOREACH(group, &vfio_group_list, next) {
> -        for (i = 0; i < info->count; i++) {
> -            if (group->groupid == devices[i].group_id) {
> -                count++;
> -                break;
> -            }
> -        }
> -    }
> -
> -    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
> -    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
> -    fds = &reset->group_fds[0];
> -
> -    /* Fill in group fds */
> -    QLIST_FOREACH(group, &vfio_group_list, next) {
> -        for (i = 0; i < info->count; i++) {
> -            if (group->groupid == devices[i].group_id) {
> -                fds[reset->count++] = group->fd;
> -                break;
> -            }
> -        }
> -    }
> -
> -    /* Bus reset! */
> -    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
> -    g_free(reset);
> +    ret = vfio_pci_do_hot_reset(vdev, info);
>
>       trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
>                                       ret ? "%m" : "Success");
>


The commit message may be improved, other than that it looks OK to me.

Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>

Thanks,
Marcel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 05/14] vfio: add pcie extanded capability support
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 05/14] vfio: add pcie extanded capability support Cao jin
@ 2016-01-17 13:22   ` Marcel Apfelbaum
  2016-01-19  9:44     ` Chen Fan
  0 siblings, 1 reply; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-17 13:22 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>

Hi,

I noticed a type in the subject, extanded -> extended

> For vfio pcie device, we could expose the extended capability on
> PCIE bus. in order to avoid config space broken, we introduce
> a copy config for parsing extended caps. and rebuild the pcie
> extended config space.

Maybe we can re-word this. Will someone with better English skills
advice :) ?

>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 69 insertions(+), 1 deletion(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 288f2c7..64b0867 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1482,6 +1482,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
>       return next - pos;
>   }
>
> +
> +static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
> +{
> +    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
> +
> +    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
> +        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
> +        if (tmp > pos && tmp < next) {
> +            next = tmp;
> +        }
> +    }
> +
> +    return next - pos;
> +}

Can't we reuse vfio_std_cap_max_size here? if only the config size differs,
we can pass it as parameter.

> +
>   static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
>   {
>       pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
> @@ -1817,16 +1832,69 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>       return 0;
>   }
>
> +static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint32_t header;
> +    uint16_t cap_id, next, size;
> +    uint8_t cap_ver;
> +    uint8_t *config;
> +
> +    /*
> +     * In order to avoid breaking config space, create a copy to
> +     * use for parsing extended capabilities.

It will be nice to know *how* do we break/*what* will break the config
space, I confess that I didn't see it :(.

> +     */
> +    config = g_memdup(pdev->config, vdev->config_size);
> +
> +    for (next = PCI_CONFIG_SPACE_SIZE; next;
> +         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
> +        header = pci_get_long(config + next);
> +        cap_id = PCI_EXT_CAP_ID(header);
> +        cap_ver = PCI_EXT_CAP_VER(header);
> +
> +        /*
> +         * If it becomes important to configure extended capabilities to their
> +         * actual size, use this as the default when it's something we don't
> +         * recognize. Since QEMU doesn't actually handle many of the config
> +         * accesses, exact size doesn't seem worthwhile.
> +         */
> +        size = vfio_ext_cap_max_size(config, next);
> +
> +        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> +        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
> +
> +        /* Use emulated next pointer to allow dropping extended caps */
> +        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
> +                                   PCI_EXT_CAP_NEXT_MASK);
> +    }
> +
> +    g_free(config);
> +    return 0;
> +}
> +
>   static int vfio_add_capabilities(VFIOPCIDevice *vdev)
>   {
>       PCIDevice *pdev = &vdev->pdev;
> +    int ret;
>
>       if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
>           !pdev->config[PCI_CAPABILITY_LIST]) {
>           return 0; /* Nothing to add */
>       }
>
> -    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
> +    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    /* on PCI bus, it doesn't make sense to expose extended capabilities. */
> +    if (!pci_is_express(pdev) ||
> +        !pci_bus_is_express(pdev->bus) ||
> +        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {

I am curious about the last check, "!pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)",
can you please explain?

Thank you,
Marcel

> +        return 0;
> +    }
> +
> +    return vfio_add_ext_cap(vdev);
>   }
>
>   static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 07/14] vfio: add aer support for vfio device
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 07/14] vfio: add aer support for " Cao jin
@ 2016-01-18  9:12   ` Marcel Apfelbaum
  2016-01-19  9:47     ` Chen Fan
  0 siblings, 1 reply; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-18  9:12 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> Calling pcie_aer_init to initilize aer related registers for
> vfio device, then reload physical related registers to expose
> device capability.
>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>   hw/vfio/pci.h |  3 +++
>   2 files changed, 81 insertions(+), 3 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 64b0867..38b0aa5 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1832,6 +1832,62 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>       return 0;
>   }
>
> +static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
> +                          int pos, uint16_t size)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    PCIDevice *dev_iter;
> +    uint8_t type;
> +    uint32_t errcap;
> +
> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
> +        pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR,
> +                            cap_ver, pos, size);
> +        return 0;
> +    }
> +
> +    dev_iter = pci_bridge_get_device(pdev->bus);
> +    if (!dev_iter) {
> +        goto error;
> +    }
> +
> +    while (dev_iter) {
> +        type = pcie_cap_get_type(dev_iter);
> +        if ((type != PCI_EXP_TYPE_ROOT_PORT &&
> +             type != PCI_EXP_TYPE_UPSTREAM &&
> +             type != PCI_EXP_TYPE_DOWNSTREAM)) {
> +            goto error;
> +        }
> +
> +        if (!dev_iter->exp.aer_cap) {
> +            goto error;
> +        }
> +
> +        dev_iter = pci_bridge_get_device(dev_iter->bus);
> +    }
> +
> +    errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4);
> +    /*
> +     * The ability to record multiple headers is depending on
> +     * the state of the Multiple Header Recording Capable bit and
> +     * enabled by the Multiple Header Recording Enable bit.
> +     */
> +    if ((errcap & PCI_ERR_CAP_MHRC) &&
> +        (errcap & PCI_ERR_CAP_MHRE)) {
> +        pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
> +    } else {
> +        pdev->exp.aer_log.log_max = 0;
> +    }
> +
> +    pcie_cap_deverr_init(pdev);
> +    return pcie_aer_init(pdev, pos, size);
> +
> +error:
> +    error_report("vfio: Unable to enable AER for device %s, parent bus "
> +                 "does not support AER signaling", vdev->vbasedev.name);
> +    return -1;
> +}
> +
>   static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>   {
>       PCIDevice *pdev = &vdev->pdev;
> @@ -1839,6 +1895,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>       uint16_t cap_id, next, size;
>       uint8_t cap_ver;
>       uint8_t *config;
> +    int ret = 0;
>
>       /*
>        * In order to avoid breaking config space, create a copy to
> @@ -1860,16 +1917,29 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>            */
>           size = vfio_ext_cap_max_size(config, next);
>
> -        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> -        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
> +        switch (cap_id) {
> +        case PCI_EXT_CAP_ID_ERR:
> +            ret = vfio_setup_aer(vdev, cap_ver, next, size);
> +            break;
> +        default:
> +            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> +            break;
> +        }
> +
> +        if (ret) {
> +            goto out;
> +        }
> +
> +        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
>
>           /* Use emulated next pointer to allow dropping extended caps */
>           pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
>                                      PCI_EXT_CAP_NEXT_MASK);
>       }
>
> +out:
>       g_free(config);
> -    return 0;
> +    return ret;
>   }
>
>   static int vfio_add_capabilities(VFIOPCIDevice *vdev)
> @@ -2624,6 +2694,11 @@ static int vfio_initfn(PCIDevice *pdev)
>           goto out_teardown;
>       }
>
> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
> +        !pdev->exp.aer_cap) {

Hi,

I think we need an error_report here, otherwise the init
will fail without knowing the reason.


Thanks,
Marcel


> +        goto out_teardown;
> +    }
> +
>       /* QEMU emulates all of MSI & MSIX */
>       if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
>           memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index f004d52..48c1f69 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -15,6 +15,7 @@
>   #include "qemu-common.h"
>   #include "exec/memory.h"
>   #include "hw/pci/pci.h"
> +#include "hw/pci/pci_bridge.h"
>   #include "hw/vfio/vfio-common.h"
>   #include "qemu/event_notifier.h"
>   #include "qemu/queue.h"
> @@ -127,6 +128,8 @@ typedef struct VFIOPCIDevice {
>   #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
>   #define VFIO_FEATURE_ENABLE_REQ_BIT 1
>   #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
> +#define VFIO_FEATURE_ENABLE_AER_BIT 2
> +#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT)
>       int32_t bootindex;
>       uint8_t pm_cap;
>       bool has_vga;
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 08/14] vfio: add check host bus reset is support or not
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 08/14] vfio: add check host bus reset is support or not Cao jin
@ 2016-01-18 10:32   ` Marcel Apfelbaum
  2016-01-19  9:55     ` Chen Fan
  0 siblings, 1 reply; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-18 10:32 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>

Hi,

I think the subject should be rephrased.

> when init vfio devices done, we should test all the devices supported
> aer whether conflict with others. For each one, get the hot reset
> info for the affected device list.  For each affected device, all
> should attach to the VM and on/below the same bus. also, we should test
> all of the non-AER supporting vfio-pci devices on or below the target
> bus to verify they have a reset mechanism.


Maybe instead of explaining what this patch does you can simply
say what are the requirements:

Something like: Check there are no AER conflicts by making sure the
devices are behind on/below the same bus... (someone with better English may help :) )

>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>   hw/vfio/pci.h |   1 +
>   2 files changed, 232 insertions(+), 7 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 38b0aa5..16ab0e3 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1832,6 +1832,218 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>       return 0;
>   }
>
> +static bool vfio_pci_host_slot_match(PCIHostDeviceAddress *host1,
> +                                     PCIHostDeviceAddress *host2)
> +{
> +    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> +            host1->slot == host2->slot);
> +}
> +
> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> +                                PCIHostDeviceAddress *host2)
> +{
> +    return (vfio_pci_host_slot_match(host1, host2) &&
> +            host1->function == host2->function);
> +}
> +
> +struct VFIODeviceFind {
> +    PCIDevice *pdev;
> +    bool found;
> +};
> +
> +static void vfio_check_device_noreset(PCIBus *bus, PCIDevice *pdev,
> +                                      void *opaque)
> +{
> +    DeviceState *dev = DEVICE(pdev);
> +    DeviceClass *dc = DEVICE_GET_CLASS(dev);
> +    VFIOPCIDevice *vdev;
> +    struct VFIODeviceFind *find = opaque;
> +
> +    if (find->found) {
> +        return;
> +    }
> +
> +    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
> +        if (!dc->reset) {
> +            goto found;
> +        }
> +        return;
> +    }
> +    vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER) &&
> +        !vdev->vbasedev.reset_works) {
> +        goto found;
> +    }
> +
> +    return;
> +found:
> +    find->pdev = pdev;
> +    find->found = true;
> +}
> +
> +static void device_find(PCIBus *bus, PCIDevice *pdev, void *opaque)
> +{
> +    struct VFIODeviceFind *find = opaque;
> +
> +    if (find->found) {
> +        return;
> +    }
> +
> +    if (pdev == find->pdev) {
> +        find->found = true;
> +    }
> +}
> +
> +static int vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
> +{
> +    PCIBus *bus = vdev->pdev.bus;
> +    struct vfio_pci_hot_reset_info *info = NULL;
> +    struct vfio_pci_dependent_device *devices;
> +    VFIOGroup *group;
> +    struct VFIODeviceFind find;
> +    int ret, i;
> +
> +    ret = vfio_get_hot_reset_info(vdev, &info);
> +    if (ret) {
> +        error_report("vfio: Cannot enable AER for device %s,"
> +                     " device does not support hot reset.",
> +                     vdev->vbasedev.name);
> +        goto out;



"return" is enough here in case of an error, info is released inside vfio_get_hot_reset_info

> +    }
> +
> +    /* List all affected devices by bus reset */
> +    devices = &info->devices[0];
> +
> +    /* Verify that we have all the groups required */
> +    for (i = 0; i < info->count; i++) {
> +        PCIHostDeviceAddress host;
> +        VFIOPCIDevice *tmp;
> +        VFIODevice *vbasedev_iter;
> +        bool found = false;
> +
> +        host.domain = devices[i].segment;
> +        host.bus = devices[i].bus;
> +        host.slot = PCI_SLOT(devices[i].devfn);
> +        host.function = PCI_FUNC(devices[i].devfn);
> +
> +        /* Skip the current device */
> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> +            continue;
> +        }
> +
> +        /* Ensure we own the group of the affected device */
> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> +            if (group->groupid == devices[i].group_id) {
> +                break;
> +            }
> +        }
> +
> +        if (!group) {
> +            error_report("vfio: Cannot enable AER for device %s, "
> +                         "depends on group %d which is not owned.",
> +                         vdev->vbasedev.name, devices[i].group_id);
> +            ret = -1;

You can use error codes on return, or you can init ret to -1 at declaration,
and delete some code lines.

> +            goto out;
> +        }
> +
> +        /* Ensure affected devices for reset on/blow the bus */

I think you meant below, not blow.

> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
> +                continue;
> +            }
> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
> +            if (vfio_pci_host_match(&host, &tmp->host)) {
> +                PCIDevice *pci = PCI_DEVICE(tmp);
> +
> +                /*
> +                 * AER errors may be broadcast to all functions of a multi-
> +                 * function endpoint.  If any of those sibling functions are
> +                 * also assigned, they need to have AER enabled or else an
> +                 * error may continue to cause a vm_stop condition.  IOW,
> +                 * AER setup of this function would be pointless.
> +                 */
> +                if (vfio_pci_host_slot_match(&vdev->host, &tmp->host) &&
> +                    !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
> +                    error_report("vfio: Cannot enable AER for device %s, on same slot"
> +                                 " the dependent device %s which does not enable AER.",
> +                                 vdev->vbasedev.name, tmp->vbasedev.name);
> +                    ret = -1;
> +                    goto out;
> +                }
> +
> +                find.pdev = pci;
> +                find.found = false;
> +                pci_for_each_device(bus, pci_bus_num(bus),
> +                                    device_find, &find);
> +                if (!find.found) {
> +                    error_report("vfio: Cannot enable AER for device %s, "
> +                                 "the dependent device %s is not under the same bus",
> +                                 vdev->vbasedev.name, tmp->vbasedev.name);
> +                    ret = -1;
> +                    goto out;
> +                }
> +                found = true;
> +                break;
> +            }
> +        }
> +
> +        /* Ensure all affected devices assigned to VM */
> +        if (!found) {
> +            error_report("vfio: Cannot enable AER for device %s, "
> +                         "the dependent device %04x:%02x:%02x.%x "
> +                         "is not assigned to VM.",
> +                         vdev->vbasedev.name, host.domain, host.bus,
> +                         host.slot, host.function);
> +            ret = -1;
> +            goto out;
> +        }
> +    }
> +
> +    /*
> +     * Check the all pci devices on or below the target bus
> +     * have a reset mechanism at least.
> +     */
> +    find.pdev = NULL;
> +    find.found = false;
> +    pci_for_each_device(bus, pci_bus_num(bus),
> +                        vfio_check_device_noreset, &find);
> +    if (find.found) {
> +        error_report("vfio: Cannot enable AER for device %s, "
> +                     "the affected device %s does not have a reset mechanism.",
> +                     vdev->vbasedev.name, find.pdev->name);
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    ret = 0;
> +out:
> +    g_free(info);
> +    return ret;
> +}
> +
> +static int vfio_check_devices_host_bus_reset(void)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev;
> +    VFIOPCIDevice *vdev;
> +
> +    /* Check All vfio-pci devices if have bus reset capability */
> +    QLIST_FOREACH(group, &vfio_group_list, next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
> +                continue;
> +            }
> +            vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +            if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
> +                vfio_check_host_bus_reset(vdev)) {
> +                return -1;
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
>   static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
>                             int pos, uint16_t size)
>   {
> @@ -2009,13 +2221,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
>       vfio_intx_enable(vdev);
>   }
>
> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> -                                PCIHostDeviceAddress *host2)
> -{
> -    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> -            host1->slot == host2->slot && host1->function == host2->function);
> -}
> -
>   static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>   {
>       VFIOGroup *group;
> @@ -2521,6 +2726,20 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>       vdev->req_enabled = false;
>   }
>
> +static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused)
> +{
> +    int ret;
> +
> +    ret = vfio_check_devices_host_bus_reset();
> +    if (ret) {
> +        exit(1);
> +    }
> +}
> +
> +static Notifier machine_notifier = {
> +    .notify = vfio_pci_machine_done_notify,
> +};
> +
>   static int vfio_initfn(PCIDevice *pdev)
>   {
>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> @@ -2867,6 +3086,11 @@ static const TypeInfo vfio_pci_dev_info = {
>   static void register_vfio_pci_dev_type(void)
>   {
>       type_register_static(&vfio_pci_dev_info);
> +    /*
> +     * Register notifier when machine init is done, since we need
> +     * check the configration manner after all vfio device are inited.
> +     */

I think you meant configuration and you can skip "manner".


Thanks,
Marcel

> +    qemu_add_machine_init_done_notifier(&machine_notifier);
>   }
>
>   type_init(register_vfio_pci_dev_type)
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 48c1f69..59ae194 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -15,6 +15,7 @@
>   #include "qemu-common.h"
>   #include "exec/memory.h"
>   #include "hw/pci/pci.h"
> +#include "hw/pci/pci_bus.h"
>   #include "hw/pci/pci_bridge.h"
>   #include "hw/vfio/vfio-common.h"
>   #include "qemu/event_notifier.h"
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 13/14] vfio-pci: pass the aer error to guest
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 13/14] vfio-pci: pass the aer error to guest Cao jin
@ 2016-01-18 10:45   ` Marcel Apfelbaum
  2016-01-19  9:27     ` Chen Fan
  0 siblings, 1 reply; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-18 10:45 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> when the vfio device encounters an uncorrectable error in host,
> the vfio_pci driver will signal the eventfd registered by this
> vfio device, the results in the qemu eventfd handler getting

Maybe "the results in" -> resulting in

> invoked.
>
> this patch is to pass the error to guest and have the guest driver
> recover from the error.

Maybe "Pass the error to... and let the ... "

>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++------
>   1 file changed, 47 insertions(+), 6 deletions(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index da4815e..efa5e01 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2553,18 +2553,59 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>   static void vfio_err_notifier_handler(void *opaque)
>   {
>       VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIEAERMsg msg = {
> +        .severity = 0,
> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
> +    };
>
>       if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>           return;
>       }
>
>       /*
> -     * TBD. Retrieve the error details and decide what action
> -     * needs to be taken. One of the actions could be to pass
> -     * the error to the guest and have the guest driver recover
> -     * from the error. This requires that PCIe capabilities be
> -     * exposed to the guest. For now, we just terminate the
> -     * guest to contain the error.
> +     * in case the real hardware configration has been changed,

configration -> configuration


> +     * here we should recheck the bus reset capability.
> +     */
> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
> +        vfio_check_host_bus_reset(vdev)) {
> +        goto stop;
> +    }
> +    /*
> +     * we should read the error details from the real hardware
> +     * configuration spaces, here we only need to do is signaling
> +     * to guest an uncorrectable error has occurred.
> +     */
> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
> +        dev->exp.aer_cap) {

Why do we need dev->exp.aer_cap check here? In patch 7/14 we fail the device init
process if this happens, right?

> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +        uint32_t uncor_status;
> +        bool isfatal;
> +
> +        uncor_status = vfio_pci_read_config(dev,
> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> +
> +        /*
> +         * if we receive the error signal but not this device, we can

maybe "if the error is not emitted by this device..."


Thanks,
Marcel

> +         * just ignore it.
> +         */
> +        if (!(uncor_status & ~0UL)) {
> +            return;
> +        }
> +
> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +
> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
> +
> +        pcie_aer_msg(dev, &msg);
> +        return;
> +    }
> +
> +stop:
> +    /*
> +     * If the aer capability is not exposed to the guest. we just
> +     * terminate the guest to contain the error.
>        */
>
>       error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 14/14] vfio: add 'aer' property to expose aercap
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 14/14] vfio: add 'aer' property to expose aercap Cao jin
@ 2016-01-18 10:46   ` Marcel Apfelbaum
  0 siblings, 0 replies; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-18 10:46 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> add 'aer' property to let user able to decide whether expose
> the aer capability. by default we should disable aer feature,
> because it needs configuration restrictions.
>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c | 2 ++
>   1 file changed, 2 insertions(+)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index efa5e01..9afb5e0 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3119,6 +3119,8 @@ static Property vfio_pci_dev_properties[] = {
>                          sub_vendor_id, PCI_ANY_ID),
>       DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
>                          sub_device_id, PCI_ANY_ID),
> +    DEFINE_PROP_BIT("aer", VFIOPCIDevice, features,
> +                    VFIO_FEATURE_ENABLE_AER_BIT, false),
>       /*
>        * TODO - support passed fds... is this necessary?
>        * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
>


Reviewed-by: Marcel Apfelbaum <marcel@redhat.com>

Thanks,
Marcel

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 09/14] add check reset mechanism when hotplug vfio device
  2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 09/14] add check reset mechanism when hotplug vfio device Cao jin
@ 2016-01-18 11:03   ` Marcel Apfelbaum
  2016-01-19  1:46     ` Cao jin
  0 siblings, 1 reply; 42+ messages in thread
From: Marcel Apfelbaum @ 2016-01-18 11:03 UTC (permalink / raw)
  To: Cao jin, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst

On 01/12/2016 04:43 AM, Cao jin wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> Since we support multi-function hotplug. the function 0 indicate

I think you wanted , instead of ., also I would suggest
"..., function 0 indicates..."

> the closure of the slot, so we have the chance to do the check.
>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>   hw/pci/pci.c             | 29 +++++++++++++++++++++++++++++
>   hw/vfio/pci.c            | 19 +++++++++++++++++++
>   hw/vfio/pci.h            |  2 ++
>   include/hw/pci/pci_bus.h |  5 +++++
>   4 files changed, 55 insertions(+)
>
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 168b9cc..f6ca6ef 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -81,6 +81,7 @@ static void pci_bus_realize(BusState *qbus, Error **errp)
>       PCIBus *bus = PCI_BUS(qbus);
>
>       vmstate_register(NULL, -1, &vmstate_pcibus, bus);
> +    notifier_with_return_list_init(&bus->hotplug_notifiers);
>   }
>
>   static void pci_bus_unrealize(BusState *qbus, Error **errp)
> @@ -1835,6 +1836,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn)
>       return bus->devices[devfn];
>   }
>
> +void pci_bus_add_hotplug_notifier(PCIBus *bus, NotifierWithReturn *notify)
> +{
> +    notifier_with_return_list_add(&bus->hotplug_notifiers, notify);
> +}
> +
> +void pci_bus_remove_hotplug_notifier(NotifierWithReturn *notifier)
> +{
> +    notifier_with_return_remove(notifier);
> +}
> +
> +static int pci_bus_hotplug_notifier(PCIBus *bus, void *opaque)

Maybe  pci_bus_hotplug_notify instead of pci_bus_hotplug_notifier


Thanks,
Marcel

> +{
> +    return notifier_with_return_list_notify(&bus->hotplug_notifiers,
> +                                            opaque);
> +}
> +
>   static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>   {
>       PCIDevice *pci_dev = (PCIDevice *)qdev;
> @@ -1877,6 +1894,18 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>           pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>           return;
>       }
> +
> +    /*
> +     *  If the function is func 0, indicate the closure of the slot.
> +     *  signal the callback.
> +     */
> +    if (DEVICE(pci_dev)->hotplugged &&
> +        pci_get_function_0(pci_dev) == pci_dev &&
> +        pci_bus_hotplug_notifier(bus, pci_dev)) {
> +        error_setg(errp, "failed to hotplug function 0");
> +        pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> +        return;
> +    }
>   }
>
>   static void pci_default_realize(PCIDevice *dev, Error **errp)
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 16ab0e3..ff25c9b 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2044,6 +2044,19 @@ static int vfio_check_devices_host_bus_reset(void)
>       return 0;
>   }
>
> +static int vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
> +{
> +    VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, hotplug_notifier);
> +    PCIDevice *pci_dev = PCI_DEVICE(vdev);
> +    PCIDevice *pci_func0 = opaque;
> +
> +    if (pci_get_function_0(pci_dev) != pci_func0) {
> +        return 0;
> +    }
> +
> +    return vfio_check_host_bus_reset(vdev);
> +}
> +
>   static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
>                             int pos, uint16_t size)
>   {
> @@ -2091,6 +2104,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
>           pdev->exp.aer_log.log_max = 0;
>       }
>
> +    vdev->hotplug_notifier.notify = vfio_check_bus_reset;
> +    pci_bus_add_hotplug_notifier(pdev->bus, &vdev->hotplug_notifier);
> +
>       pcie_cap_deverr_init(pdev);
>       return pcie_aer_init(pdev, pos, size);
>
> @@ -2972,6 +2988,9 @@ static void vfio_exitfn(PCIDevice *pdev)
>       vfio_unregister_req_notifier(vdev);
>       vfio_unregister_err_notifier(vdev);
>       pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> +    if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
> +        pci_bus_remove_hotplug_notifier(&vdev->hotplug_notifier);
> +    }
>       vfio_disable_interrupts(vdev);
>       if (vdev->intx.mmap_timer) {
>           timer_free(vdev->intx.mmap_timer);
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 59ae194..b385f07 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -142,6 +142,8 @@ typedef struct VFIOPCIDevice {
>       bool no_kvm_intx;
>       bool no_kvm_msi;
>       bool no_kvm_msix;
> +
> +    NotifierWithReturn hotplug_notifier;
>   } VFIOPCIDevice;
>
>   uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 403fec6..7812fa9 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -39,8 +39,13 @@ struct PCIBus {
>          Keep a count of the number of devices with raised IRQs.  */
>       int nirq;
>       int *irq_count;
> +
> +    NotifierWithReturnList hotplug_notifiers;
>   };
>
> +void pci_bus_add_hotplug_notifier(PCIBus *bus, NotifierWithReturn *notify);
> +void pci_bus_remove_hotplug_notifier(NotifierWithReturn *notify);
> +
>   typedef struct PCIBridgeWindows PCIBridgeWindows;
>
>   /*
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 09/14] add check reset mechanism when hotplug vfio device
  2016-01-18 11:03   ` Marcel Apfelbaum
@ 2016-01-19  1:46     ` Cao jin
  0 siblings, 0 replies; 42+ messages in thread
From: Cao jin @ 2016-01-19  1:46 UTC (permalink / raw)
  To: marcel, qemu-devel; +Cc: chen.fan.fnst, izumi.taku, alex.williamson, mst



On 01/18/2016 07:03 PM, Marcel Apfelbaum wrote:
> On 01/12/2016 04:43 AM, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> Since we support multi-function hotplug. the function 0 indicate
>
> I think you wanted , instead of ., also I would suggest
> "..., function 0 indicates..."
>
>> the closure of the slot, so we have the chance to do the check.


>> +static int pci_bus_hotplug_notifier(PCIBus *bus, void *opaque)
>
> Maybe  pci_bus_hotplug_notify instead of pci_bus_hotplug_notifier
>
>
> Thanks,
> Marcel
>

Sure, thanks for all the correction, will update it

-- 
Yours Sincerely,

Cao jin

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-01-16 18:34 ` [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Michael S. Tsirkin
@ 2016-01-19  9:09   ` Chen Fan
  2016-02-03  8:54   ` Chen Fan
  1 sibling, 0 replies; 42+ messages in thread
From: Chen Fan @ 2016-01-19  9:09 UTC (permalink / raw)
  To: Michael S. Tsirkin, Cao jin; +Cc: izumi.taku, alex.williamson, qemu-devel


On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:
> On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> For now, for vfio pci passthough devices when qemu receives
>> an error from host aer report, currentlly just terminate the guest,
>> but usually user want to know what error occurred but stopping the
>> guest, so this patches add aer capability support for vfio device,
>> and pass the error to guest, and have guest driver to recover
>> from the error.
> I would like to see a version of this patchset that doesn't
> depend on pci core changes.
> I think that if you make this simplifying assumption:
>
> - all devices on same bus in guest are on same bus in host
>
> then you can handle both reset and hotplug simply in function 0
> since it will belong to vfio.
>
> So we can have a version without pci core changes that simply assumes
> this, and things will just work.
>
>
> Now, if we wanted to enforce this limitation, I think the
> cleanest way would be to add a callback in struct PCIDevice:
>
> 	bool is_valid_function(PCIDevice *newfunction)
>
> and call it as each function is added.
> This way aer function can validate that each function
> added shares the same bus.
> And this way issues will be detected directly and not when
> function 0 is added.
>
> I would prefer this validation code to be a patch on top so we can merge
> the functionality directly and avoid blocking it while we figure out the
> best api to validate things.
>
> I don't see why making guest topology match host would
> ever be a problem, but if it's required to support
> configurations where these differ, I'd like to see
> an attempt to address that be split out, after aer
> is supported.
Hi Michael,

    it's a good idea. we should simplify the implementation of the aer 
function first
without more affect on pci core code.

Thanks,
Chen

>
>
>> v15-v16:
>>     10/14, 11/14 are new to introduce a reset sequence id to specify the
>>     vfio devices has been reset for that reset. other patches aren't modified.
>>
>> v14-v15:
>>     1. add device hot reset callback
>>     2. add bus_in_reset for vfio device to avoid multi do host bus reset
>>
>> v13-v14:
>>     1. for multifunction device, requiring all functions enable AER.(9/13)
>>     2. due to all affected functions receive error signal, ignore no
>>        error occurred function. (12/13)
>>
>> v12-v13:
>>     1. since support multifuncion hotplug, here add callback to enable aer.
>>     2. add pci device pre+post reset for aer host reset.
>>
>> Chen Fan (14):
>>    vfio: extract vfio_get_hot_reset_info as a single function
>>    vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
>>    pcie: modify the capability size assert
>>    vfio: make the 4 bytes aligned for capability size
>>    vfio: add pcie extanded capability support
>>    aer: impove pcie_aer_init to support vfio device
>>    vfio: add aer support for vfio device
>>    vfio: add check host bus reset is support or not
>>    add check reset mechanism when hotplug vfio device
>>    pci: introduce pci bus pre reset
>>    vfio: introduce last reset sequence id
>>    pcie_aer: expose pcie_aer_msg() interface
>>    vfio-pci: pass the aer error to guest
>>    vfio: add 'aer' property to expose aercap
>>
>>   hw/pci-bridge/ioh3420.c            |   2 +-
>>   hw/pci-bridge/xio3130_downstream.c |   2 +-
>>   hw/pci-bridge/xio3130_upstream.c   |   2 +-
>>   hw/pci/pci.c                       |  42 +++
>>   hw/pci/pci_bridge.c                |   3 +
>>   hw/pci/pcie.c                      |   2 +-
>>   hw/pci/pcie_aer.c                  |   6 +-
>>   hw/vfio/pci.c                      | 616 +++++++++++++++++++++++++++++++++----
>>   hw/vfio/pci.h                      |   9 +
>>   include/hw/pci/pci.h               |   1 +
>>   include/hw/pci/pci_bus.h           |   8 +
>>   include/hw/pci/pcie_aer.h          |   3 +-
>>   12 files changed, 624 insertions(+), 72 deletions(-)
>>
>> -- 
>> 1.9.3
>>
>>
>
> .
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 13/14] vfio-pci: pass the aer error to guest
  2016-01-18 10:45   ` Marcel Apfelbaum
@ 2016-01-19  9:27     ` Chen Fan
  0 siblings, 0 replies; 42+ messages in thread
From: Chen Fan @ 2016-01-19  9:27 UTC (permalink / raw)
  To: marcel, Cao jin, qemu-devel; +Cc: izumi.taku, alex.williamson, mst


On 01/18/2016 06:45 PM, Marcel Apfelbaum wrote:
> On 01/12/2016 04:43 AM, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> when the vfio device encounters an uncorrectable error in host,
>> the vfio_pci driver will signal the eventfd registered by this
>> vfio device, the results in the qemu eventfd handler getting
>
> Maybe "the results in" -> resulting in
>
>> invoked.
>>
>> this patch is to pass the error to guest and have the guest driver
>> recover from the error.
>
> Maybe "Pass the error to... and let the ... "
>
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 53 
>> +++++++++++++++++++++++++++++++++++++++++++++++------
>>   1 file changed, 47 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index da4815e..efa5e01 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2553,18 +2553,59 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>   static void vfio_err_notifier_handler(void *opaque)
>>   {
>>       VFIOPCIDevice *vdev = opaque;
>> +    PCIDevice *dev = &vdev->pdev;
>> +    PCIEAERMsg msg = {
>> +        .severity = 0,
>> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>> +    };
>>
>>       if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>           return;
>>       }
>>
>>       /*
>> -     * TBD. Retrieve the error details and decide what action
>> -     * needs to be taken. One of the actions could be to pass
>> -     * the error to the guest and have the guest driver recover
>> -     * from the error. This requires that PCIe capabilities be
>> -     * exposed to the guest. For now, we just terminate the
>> -     * guest to contain the error.
>> +     * in case the real hardware configration has been changed,
>
> configration -> configuration
>
>
>> +     * here we should recheck the bus reset capability.
>> +     */
>> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
>> +        vfio_check_host_bus_reset(vdev)) {
>> +        goto stop;
>> +    }
>> +    /*
>> +     * we should read the error details from the real hardware
>> +     * configuration spaces, here we only need to do is signaling
>> +     * to guest an uncorrectable error has occurred.
>> +     */
>> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
>> +        dev->exp.aer_cap) {
>
> Why do we need dev->exp.aer_cap check here? In patch 7/14 we fail the 
> device init
> process if this happens, right?

the property FEATURE_ENABLE_AER can't represent the vfio device actually 
has the aer
capability. so here we should check it.

>
>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>> +        uint32_t uncor_status;
>> +        bool isfatal;
>> +
>> +        uncor_status = vfio_pci_read_config(dev,
>> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>> +
>> +        /*
>> +         * if we receive the error signal but not this device, we can
>
> maybe "if the error is not emitted by this device..."

thank you for your careful review for my bad english description in the 
patchset,
I will update them in the next version.

Thanks,
Chen

>
>
> Thanks,
> Marcel
>
>> +         * just ignore it.
>> +         */
>> +        if (!(uncor_status & ~0UL)) {
>> +            return;
>> +        }
>> +
>> +        isfatal = uncor_status & pci_get_long(aer_cap + 
>> PCI_ERR_UNCOR_SEVER);
>> +
>> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
>> +
>> +        pcie_aer_msg(dev, &msg);
>> +        return;
>> +    }
>> +
>> +stop:
>> +    /*
>> +     * If the aer capability is not exposed to the guest. we just
>> +     * terminate the guest to contain the error.
>>        */
>>
>>       error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error 
>> detected.  "
>>
>
>
>
> .
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 05/14] vfio: add pcie extanded capability support
  2016-01-17 13:22   ` Marcel Apfelbaum
@ 2016-01-19  9:44     ` Chen Fan
  0 siblings, 0 replies; 42+ messages in thread
From: Chen Fan @ 2016-01-19  9:44 UTC (permalink / raw)
  To: marcel, Cao jin, qemu-devel; +Cc: izumi.taku, alex.williamson, mst


On 01/17/2016 09:22 PM, Marcel Apfelbaum wrote:
> On 01/12/2016 04:43 AM, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>
> Hi,
>
> I noticed a type in the subject, extanded -> extended
>
>> For vfio pcie device, we could expose the extended capability on
>> PCIE bus. in order to avoid config space broken, we introduce
>> a copy config for parsing extended caps. and rebuild the pcie
>> extended config space.
>
> Maybe we can re-word this. Will someone with better English skills
> advice :) ?
>
that will be helpful. ;)

>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 70 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 69 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 288f2c7..64b0867 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -1482,6 +1482,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice 
>> *pdev, uint8_t pos)
>>       return next - pos;
>>   }
>>
>> +
>> +static uint16_t vfio_ext_cap_max_size(const uint8_t *config, 
>> uint16_t pos)
>> +{
>> +    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
>> +
>> +    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
>> +        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
>> +        if (tmp > pos && tmp < next) {
>> +            next = tmp;
>> +        }
>> +    }
>> +
>> +    return next - pos;
>> +}
>
> Can't we reuse vfio_std_cap_max_size here? if only the config size 
> differs,
> we can pass it as parameter.
not only the config size differ, but also the PCI express Extended 
Capability header,
the pci express head use 16bit to store the cap id and 12bit to store 
the next offset.

>
>> +
>>   static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t 
>> mask)
>>   {
>>       pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
>> @@ -1817,16 +1832,69 @@ static int vfio_add_std_cap(VFIOPCIDevice 
>> *vdev, uint8_t pos)
>>       return 0;
>>   }
>>
>> +static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>> +{
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    uint32_t header;
>> +    uint16_t cap_id, next, size;
>> +    uint8_t cap_ver;
>> +    uint8_t *config;
>> +
>> +    /*
>> +     * In order to avoid breaking config space, create a copy to
>> +     * use for parsing extended capabilities.
>
> It will be nice to know *how* do we break/*what* will break the config
> space, I confess that I didn't see it :(.
I will improve it.

>
>> +     */
>> +    config = g_memdup(pdev->config, vdev->config_size);
>> +
>> +    for (next = PCI_CONFIG_SPACE_SIZE; next;
>> +         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
>> +        header = pci_get_long(config + next);
>> +        cap_id = PCI_EXT_CAP_ID(header);
>> +        cap_ver = PCI_EXT_CAP_VER(header);
>> +
>> +        /*
>> +         * If it becomes important to configure extended 
>> capabilities to their
>> +         * actual size, use this as the default when it's something 
>> we don't
>> +         * recognize. Since QEMU doesn't actually handle many of the 
>> config
>> +         * accesses, exact size doesn't seem worthwhile.
>> +         */
>> +        size = vfio_ext_cap_max_size(config, next);
>> +
>> +        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>> +        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, 
>> cap_ver, 0));
>> +
>> +        /* Use emulated next pointer to allow dropping extended caps */
>> + pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
>> +                                   PCI_EXT_CAP_NEXT_MASK);
>> +    }
>> +
>> +    g_free(config);
>> +    return 0;
>> +}
>> +
>>   static int vfio_add_capabilities(VFIOPCIDevice *vdev)
>>   {
>>       PCIDevice *pdev = &vdev->pdev;
>> +    int ret;
>>
>>       if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
>>           !pdev->config[PCI_CAPABILITY_LIST]) {
>>           return 0; /* Nothing to add */
>>       }
>>
>> -    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
>> +    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    /* on PCI bus, it doesn't make sense to expose extended 
>> capabilities. */
>> +    if (!pci_is_express(pdev) ||
>> +        !pci_bus_is_express(pdev->bus) ||
>> +        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
>
> I am curious about the last check, "!pci_get_long(pdev->config + 
> PCI_CONFIG_SPACE_SIZE)",
> can you please explain?
the pcie spec 3.0 defines that:

7.9.1. Extended Capabilities in Configuration Space
Extended Capabilities in Configuration Space always begin at offset 100h 
with a PCI Express
Extended Capability header (Section 7.9.3). Absence of any Extended 
Capabilities is required to be
indicated by an Extended Capability header with a Capability ID of 
0000h, a Capability Version of
0h, and a Next Capability Offset of 000h.

so here we test whether the offset 100h is zero.

Thanks,
Chen


>
> Thank you,
> Marcel
>
>> +        return 0;
>> +    }
>> +
>> +    return vfio_add_ext_cap(vdev);
>>   }
>>
>>   static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
>>
>
>
>
> .
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 07/14] vfio: add aer support for vfio device
  2016-01-18  9:12   ` Marcel Apfelbaum
@ 2016-01-19  9:47     ` Chen Fan
  0 siblings, 0 replies; 42+ messages in thread
From: Chen Fan @ 2016-01-19  9:47 UTC (permalink / raw)
  To: marcel, Cao jin, qemu-devel; +Cc: izumi.taku, alex.williamson, mst


On 01/18/2016 05:12 PM, Marcel Apfelbaum wrote:
> On 01/12/2016 04:43 AM, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> Calling pcie_aer_init to initilize aer related registers for
>> vfio device, then reload physical related registers to expose
>> device capability.
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 81 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   hw/vfio/pci.h |  3 +++
>>   2 files changed, 81 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 64b0867..38b0aa5 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -1832,6 +1832,62 @@ static int vfio_add_std_cap(VFIOPCIDevice 
>> *vdev, uint8_t pos)
>>       return 0;
>>   }
>>
>> +static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
>> +                          int pos, uint16_t size)
>> +{
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    PCIDevice *dev_iter;
>> +    uint8_t type;
>> +    uint32_t errcap;
>> +
>> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
>> +        pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR,
>> +                            cap_ver, pos, size);
>> +        return 0;
>> +    }
>> +
>> +    dev_iter = pci_bridge_get_device(pdev->bus);
>> +    if (!dev_iter) {
>> +        goto error;
>> +    }
>> +
>> +    while (dev_iter) {
>> +        type = pcie_cap_get_type(dev_iter);
>> +        if ((type != PCI_EXP_TYPE_ROOT_PORT &&
>> +             type != PCI_EXP_TYPE_UPSTREAM &&
>> +             type != PCI_EXP_TYPE_DOWNSTREAM)) {
>> +            goto error;
>> +        }
>> +
>> +        if (!dev_iter->exp.aer_cap) {
>> +            goto error;
>> +        }
>> +
>> +        dev_iter = pci_bridge_get_device(dev_iter->bus);
>> +    }
>> +
>> +    errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4);
>> +    /*
>> +     * The ability to record multiple headers is depending on
>> +     * the state of the Multiple Header Recording Capable bit and
>> +     * enabled by the Multiple Header Recording Enable bit.
>> +     */
>> +    if ((errcap & PCI_ERR_CAP_MHRC) &&
>> +        (errcap & PCI_ERR_CAP_MHRE)) {
>> +        pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
>> +    } else {
>> +        pdev->exp.aer_log.log_max = 0;
>> +    }
>> +
>> +    pcie_cap_deverr_init(pdev);
>> +    return pcie_aer_init(pdev, pos, size);
>> +
>> +error:
>> +    error_report("vfio: Unable to enable AER for device %s, parent 
>> bus "
>> +                 "does not support AER signaling", 
>> vdev->vbasedev.name);
>> +    return -1;
>> +}
>> +
>>   static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>>   {
>>       PCIDevice *pdev = &vdev->pdev;
>> @@ -1839,6 +1895,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>>       uint16_t cap_id, next, size;
>>       uint8_t cap_ver;
>>       uint8_t *config;
>> +    int ret = 0;
>>
>>       /*
>>        * In order to avoid breaking config space, create a copy to
>> @@ -1860,16 +1917,29 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
>>            */
>>           size = vfio_ext_cap_max_size(config, next);
>>
>> -        pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>> -        pci_set_long(dev->config + next, PCI_EXT_CAP(cap_id, 
>> cap_ver, 0));
>> +        switch (cap_id) {
>> +        case PCI_EXT_CAP_ID_ERR:
>> +            ret = vfio_setup_aer(vdev, cap_ver, next, size);
>> +            break;
>> +        default:
>> +            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
>> +            break;
>> +        }
>> +
>> +        if (ret) {
>> +            goto out;
>> +        }
>> +
>> +        pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, 
>> cap_ver, 0));
>>
>>           /* Use emulated next pointer to allow dropping extended 
>> caps */
>> pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
>>                                      PCI_EXT_CAP_NEXT_MASK);
>>       }
>>
>> +out:
>>       g_free(config);
>> -    return 0;
>> +    return ret;
>>   }
>>
>>   static int vfio_add_capabilities(VFIOPCIDevice *vdev)
>> @@ -2624,6 +2694,11 @@ static int vfio_initfn(PCIDevice *pdev)
>>           goto out_teardown;
>>       }
>>
>> +    if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
>> +        !pdev->exp.aer_cap) {
>
> Hi,
>
> I think we need an error_report here, otherwise the init
> will fail without knowing the reason.
maybe we need to exclude this case. if the device hasn't the aer cap,
the ENABLE_AER would be not affected.

Thanks,
Chen

>
>
> Thanks,
> Marcel
>
>
>> +        goto out_teardown;
>> +    }
>> +
>>       /* QEMU emulates all of MSI & MSIX */
>>       if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
>>           memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index f004d52..48c1f69 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -15,6 +15,7 @@
>>   #include "qemu-common.h"
>>   #include "exec/memory.h"
>>   #include "hw/pci/pci.h"
>> +#include "hw/pci/pci_bridge.h"
>>   #include "hw/vfio/vfio-common.h"
>>   #include "qemu/event_notifier.h"
>>   #include "qemu/queue.h"
>> @@ -127,6 +128,8 @@ typedef struct VFIOPCIDevice {
>>   #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
>>   #define VFIO_FEATURE_ENABLE_REQ_BIT 1
>>   #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
>> +#define VFIO_FEATURE_ENABLE_AER_BIT 2
>> +#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT)
>>       int32_t bootindex;
>>       uint8_t pm_cap;
>>       bool has_vga;
>>
>
>
>
> .
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 08/14] vfio: add check host bus reset is support or not
  2016-01-18 10:32   ` Marcel Apfelbaum
@ 2016-01-19  9:55     ` Chen Fan
  0 siblings, 0 replies; 42+ messages in thread
From: Chen Fan @ 2016-01-19  9:55 UTC (permalink / raw)
  To: marcel, Cao jin, qemu-devel; +Cc: izumi.taku, alex.williamson, mst


On 01/18/2016 06:32 PM, Marcel Apfelbaum wrote:
> On 01/12/2016 04:43 AM, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>
> Hi,
>
> I think the subject should be rephrased.
>
>> when init vfio devices done, we should test all the devices supported
>> aer whether conflict with others. For each one, get the hot reset
>> info for the affected device list.  For each affected device, all
>> should attach to the VM and on/below the same bus. also, we should test
>> all of the non-AER supporting vfio-pci devices on or below the target
>> bus to verify they have a reset mechanism.
>
>
> Maybe instead of explaining what this patch does you can simply
> say what are the requirements:
>
> Something like: Check there are no AER conflicts by making sure the
> devices are behind on/below the same bus... (someone with better 
> English may help :) )
>
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 238 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>   hw/vfio/pci.h |   1 +
>>   2 files changed, 232 insertions(+), 7 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 38b0aa5..16ab0e3 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -1832,6 +1832,218 @@ static int vfio_add_std_cap(VFIOPCIDevice 
>> *vdev, uint8_t pos)
>>       return 0;
>>   }
>>
>> +static bool vfio_pci_host_slot_match(PCIHostDeviceAddress *host1,
>> +                                     PCIHostDeviceAddress *host2)
>> +{
>> +    return (host1->domain == host2->domain && host1->bus == 
>> host2->bus &&
>> +            host1->slot == host2->slot);
>> +}
>> +
>> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
>> +                                PCIHostDeviceAddress *host2)
>> +{
>> +    return (vfio_pci_host_slot_match(host1, host2) &&
>> +            host1->function == host2->function);
>> +}
>> +
>> +struct VFIODeviceFind {
>> +    PCIDevice *pdev;
>> +    bool found;
>> +};
>> +
>> +static void vfio_check_device_noreset(PCIBus *bus, PCIDevice *pdev,
>> +                                      void *opaque)
>> +{
>> +    DeviceState *dev = DEVICE(pdev);
>> +    DeviceClass *dc = DEVICE_GET_CLASS(dev);
>> +    VFIOPCIDevice *vdev;
>> +    struct VFIODeviceFind *find = opaque;
>> +
>> +    if (find->found) {
>> +        return;
>> +    }
>> +
>> +    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
>> +        if (!dc->reset) {
>> +            goto found;
>> +        }
>> +        return;
>> +    }
>> +    vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER) &&
>> +        !vdev->vbasedev.reset_works) {
>> +        goto found;
>> +    }
>> +
>> +    return;
>> +found:
>> +    find->pdev = pdev;
>> +    find->found = true;
>> +}
>> +
>> +static void device_find(PCIBus *bus, PCIDevice *pdev, void *opaque)
>> +{
>> +    struct VFIODeviceFind *find = opaque;
>> +
>> +    if (find->found) {
>> +        return;
>> +    }
>> +
>> +    if (pdev == find->pdev) {
>> +        find->found = true;
>> +    }
>> +}
>> +
>> +static int vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>> +{
>> +    PCIBus *bus = vdev->pdev.bus;
>> +    struct vfio_pci_hot_reset_info *info = NULL;
>> +    struct vfio_pci_dependent_device *devices;
>> +    VFIOGroup *group;
>> +    struct VFIODeviceFind find;
>> +    int ret, i;
>> +
>> +    ret = vfio_get_hot_reset_info(vdev, &info);
>> +    if (ret) {
>> +        error_report("vfio: Cannot enable AER for device %s,"
>> +                     " device does not support hot reset.",
>> +                     vdev->vbasedev.name);
>> +        goto out;
>
>
>
> "return" is enough here in case of an error, info is released inside 
> vfio_get_hot_reset_info

indeed.

>
>> +    }
>> +
>> +    /* List all affected devices by bus reset */
>> +    devices = &info->devices[0];
>> +
>> +    /* Verify that we have all the groups required */
>> +    for (i = 0; i < info->count; i++) {
>> +        PCIHostDeviceAddress host;
>> +        VFIOPCIDevice *tmp;
>> +        VFIODevice *vbasedev_iter;
>> +        bool found = false;
>> +
>> +        host.domain = devices[i].segment;
>> +        host.bus = devices[i].bus;
>> +        host.slot = PCI_SLOT(devices[i].devfn);
>> +        host.function = PCI_FUNC(devices[i].devfn);
>> +
>> +        /* Skip the current device */
>> +        if (vfio_pci_host_match(&host, &vdev->host)) {
>> +            continue;
>> +        }
>> +
>> +        /* Ensure we own the group of the affected device */
>> +        QLIST_FOREACH(group, &vfio_group_list, next) {
>> +            if (group->groupid == devices[i].group_id) {
>> +                break;
>> +            }
>> +        }
>> +
>> +        if (!group) {
>> +            error_report("vfio: Cannot enable AER for device %s, "
>> +                         "depends on group %d which is not owned.",
>> +                         vdev->vbasedev.name, devices[i].group_id);
>> +            ret = -1;
>
> You can use error codes on return, or you can init ret to -1 at 
> declaration,
> and delete some code lines.
>
>> +            goto out;
>> +        }
>> +
>> +        /* Ensure affected devices for reset on/blow the bus */
>
> I think you meant below, not blow.
>
>> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
>> +                continue;
>> +            }
>> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
>> +            if (vfio_pci_host_match(&host, &tmp->host)) {
>> +                PCIDevice *pci = PCI_DEVICE(tmp);
>> +
>> +                /*
>> +                 * AER errors may be broadcast to all functions of a 
>> multi-
>> +                 * function endpoint.  If any of those sibling 
>> functions are
>> +                 * also assigned, they need to have AER enabled or 
>> else an
>> +                 * error may continue to cause a vm_stop condition.  
>> IOW,
>> +                 * AER setup of this function would be pointless.
>> +                 */
>> +                if (vfio_pci_host_slot_match(&vdev->host, 
>> &tmp->host) &&
>> +                    !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
>> +                    error_report("vfio: Cannot enable AER for device 
>> %s, on same slot"
>> +                                 " the dependent device %s which 
>> does not enable AER.",
>> +                                 vdev->vbasedev.name, 
>> tmp->vbasedev.name);
>> +                    ret = -1;
>> +                    goto out;
>> +                }
>> +
>> +                find.pdev = pci;
>> +                find.found = false;
>> +                pci_for_each_device(bus, pci_bus_num(bus),
>> +                                    device_find, &find);
>> +                if (!find.found) {
>> +                    error_report("vfio: Cannot enable AER for device 
>> %s, "
>> +                                 "the dependent device %s is not 
>> under the same bus",
>> +                                 vdev->vbasedev.name, 
>> tmp->vbasedev.name);
>> +                    ret = -1;
>> +                    goto out;
>> +                }
>> +                found = true;
>> +                break;
>> +            }
>> +        }
>> +
>> +        /* Ensure all affected devices assigned to VM */
>> +        if (!found) {
>> +            error_report("vfio: Cannot enable AER for device %s, "
>> +                         "the dependent device %04x:%02x:%02x.%x "
>> +                         "is not assigned to VM.",
>> +                         vdev->vbasedev.name, host.domain, host.bus,
>> +                         host.slot, host.function);
>> +            ret = -1;
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Check the all pci devices on or below the target bus
>> +     * have a reset mechanism at least.
>> +     */
>> +    find.pdev = NULL;
>> +    find.found = false;
>> +    pci_for_each_device(bus, pci_bus_num(bus),
>> +                        vfio_check_device_noreset, &find);
>> +    if (find.found) {
>> +        error_report("vfio: Cannot enable AER for device %s, "
>> +                     "the affected device %s does not have a reset 
>> mechanism.",
>> +                     vdev->vbasedev.name, find.pdev->name);
>> +        ret = -1;
>> +        goto out;
>> +    }
>> +
>> +    ret = 0;
>> +out:
>> +    g_free(info);
>> +    return ret;
>> +}
>> +
>> +static int vfio_check_devices_host_bus_reset(void)
>> +{
>> +    VFIOGroup *group;
>> +    VFIODevice *vbasedev;
>> +    VFIOPCIDevice *vdev;
>> +
>> +    /* Check All vfio-pci devices if have bus reset capability */
>> +    QLIST_FOREACH(group, &vfio_group_list, next) {
>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>> +                continue;
>> +            }
>> +            vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +            if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
>> +                vfio_check_host_bus_reset(vdev)) {
>> +                return -1;
>> +            }
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
>>                             int pos, uint16_t size)
>>   {
>> @@ -2009,13 +2221,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice 
>> *vdev)
>>       vfio_intx_enable(vdev);
>>   }
>>
>> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
>> -                                PCIHostDeviceAddress *host2)
>> -{
>> -    return (host1->domain == host2->domain && host1->bus == 
>> host2->bus &&
>> -            host1->slot == host2->slot && host1->function == 
>> host2->function);
>> -}
>> -
>>   static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>>   {
>>       VFIOGroup *group;
>> @@ -2521,6 +2726,20 @@ static void 
>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>       vdev->req_enabled = false;
>>   }
>>
>> +static void vfio_pci_machine_done_notify(Notifier *notifier, void 
>> *unused)
>> +{
>> +    int ret;
>> +
>> +    ret = vfio_check_devices_host_bus_reset();
>> +    if (ret) {
>> +        exit(1);
>> +    }
>> +}
>> +
>> +static Notifier machine_notifier = {
>> +    .notify = vfio_pci_machine_done_notify,
>> +};
>> +
>>   static int vfio_initfn(PCIDevice *pdev)
>>   {
>>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>> @@ -2867,6 +3086,11 @@ static const TypeInfo vfio_pci_dev_info = {
>>   static void register_vfio_pci_dev_type(void)
>>   {
>>       type_register_static(&vfio_pci_dev_info);
>> +    /*
>> +     * Register notifier when machine init is done, since we need
>> +     * check the configration manner after all vfio device are inited.
>> +     */
>
> I think you meant configuration and you can skip "manner".

right, I will fix them in next version.

Thanks,
Chen

>
>
> Thanks,
> Marcel
>
>> + qemu_add_machine_init_done_notifier(&machine_notifier);
>>   }
>>
>>   type_init(register_vfio_pci_dev_type)
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 48c1f69..59ae194 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -15,6 +15,7 @@
>>   #include "qemu-common.h"
>>   #include "exec/memory.h"
>>   #include "hw/pci/pci.h"
>> +#include "hw/pci/pci_bus.h"
>>   #include "hw/pci/pci_bridge.h"
>>   #include "hw/vfio/vfio-common.h"
>>   #include "qemu/event_notifier.h"
>>
>
>
>
> .
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 10/14] pci: introduce pci bus pre reset
  2016-01-14 20:36   ` Alex Williamson
@ 2016-01-19 10:15     ` Chen Fan
  0 siblings, 0 replies; 42+ messages in thread
From: Chen Fan @ 2016-01-19 10:15 UTC (permalink / raw)
  To: Alex Williamson, Cao jin, qemu-devel; +Cc: izumi.taku, mst


On 01/15/2016 04:36 AM, Alex Williamson wrote:
> On Tue, 2016-01-12 at 10:43 +0800, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> avoid repeat bus reset, here introduce a sequence ID for each time
>> bus hot reset, so each vfio device could know whether they've already
>> been reset for that sequence ID.
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/pci/pci.c             | 13 +++++++++++++
>>   hw/pci/pci_bridge.c      |  3 +++
>>   include/hw/pci/pci.h     |  1 +
>>   include/hw/pci/pci_bus.h |  3 +++
>>   4 files changed, 20 insertions(+)
>>
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index f6ca6ef..ceb72d5 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -91,6 +91,18 @@ static void pci_bus_unrealize(BusState *qbus,
>> Error **errp)
>>       vmstate_unregister(NULL, &vmstate_pcibus, bus);
>>   }
>>   
>> +void pci_bus_pre_reset(PCIBus *bus, uint32_t seqid)
>> +{
>> +    PCIBus *sec;
>> +
>> +    bus->in_reset = true;
>> +    bus->reset_seqid = seqid;
>> +
>> +    QLIST_FOREACH(sec, &bus->child, sibling) {
>> +        pci_bus_pre_reset(sec, seqid);
>> +    }
>> +}
>> +
>>   static bool pcibus_is_root(PCIBus *bus)
>>   {
>>       return !bus->parent_dev;
>> @@ -276,6 +288,7 @@ static void pcibus_reset(BusState *qbus)
>>       for (i = 0; i < bus->nirq; i++) {
>>           assert(bus->irq_count[i] == 0);
>>       }
>> +    bus->in_reset = false;
>>   }
>>   
>>   static void pci_host_bus_register(PCIBus *bus, DeviceState *parent)
>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>> index 40c97b1..c7f15a1 100644
>> --- a/hw/pci/pci_bridge.c
>> +++ b/hw/pci/pci_bridge.c
>> @@ -268,6 +268,9 @@ void pci_bridge_write_config(PCIDevice *d,
>>       newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>>       if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
>>           /* Trigger hot reset on 0->1 transition. */
>> +        uint32_t seqid = s->sec_bus.reset_seqid++;
> Doesn't this need to come from a global sequence ID?  Imagine the case
> of a nested bus, the leaf bus is reset incrementing the sequence ID.
> The devices on that bus store that sequence ID as they're reset.  The
> parent bus is then reset, but all the devices on the leaf bus have
> already been reset for that sequence ID and ignore the reset.
>
>> +
>> +        pci_bus_pre_reset(&s->sec_bus, seqid ? seqid : 1);
> Does this work?  Seems like this would make devices ignore the second
> bus reset after the VM is instantiated.  ie.  the first bus reset seqid
> is 0, so we call pre_reset with 1, the second time we call it with 1
> again.
>
>>           qbus_reset_all(&s->sec_bus.qbus);
> I'd be tempted to call qbus_walk_children() directly, it already has a
> pre_busfn callback hook.
Hi Alex,

this looks like need to change much pci core code,  as Michael suggested 
in 00/14,
maybe we should simply the aer implementation. what do you think of that?

Thanks,
Chen


>
>>       }
>>   }
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index 379b6e1..b811279 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -381,6 +381,7 @@ void pci_bus_fire_intx_routing_notifier(PCIBus
>> *bus);
>>   void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>>                                             PCIINTxRoutingNotifier
>> notifier);
>>   void pci_device_reset(PCIDevice *dev);
>> +void pci_bus_pre_reset(PCIBus *bus, uint32_t seqid);
>>   
>>   PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus *rootbus,
>>                                  const char *default_model,
>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
>> index 7812fa9..dd6aaf1 100644
>> --- a/include/hw/pci/pci_bus.h
>> +++ b/include/hw/pci/pci_bus.h
>> @@ -40,6 +40,9 @@ struct PCIBus {
>>       int nirq;
>>       int *irq_count;
>>   
>> +    bool in_reset;
>> +    uint32_t reset_seqid;
>> +
>>       NotifierWithReturnList hotplug_notifiers;
>>   };
>>   
>
>
> .
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-01-16 18:34 ` [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Michael S. Tsirkin
  2016-01-19  9:09   ` Chen Fan
@ 2016-02-03  8:54   ` Chen Fan
  2016-02-03 13:57     ` Michael S. Tsirkin
  1 sibling, 1 reply; 42+ messages in thread
From: Chen Fan @ 2016-02-03  8:54 UTC (permalink / raw)
  To: Michael S. Tsirkin, Cao jin; +Cc: izumi.taku, alex.williamson, qemu-devel


On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:
> On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> For now, for vfio pci passthough devices when qemu receives
>> an error from host aer report, currentlly just terminate the guest,
>> but usually user want to know what error occurred but stopping the
>> guest, so this patches add aer capability support for vfio device,
>> and pass the error to guest, and have guest driver to recover
>> from the error.
> I would like to see a version of this patchset that doesn't
> depend on pci core changes.
> I think that if you make this simplifying assumption:
>
> - all devices on same bus in guest are on same bus in host
>
> then you can handle both reset and hotplug simply in function 0
> since it will belong to vfio.
>
> So we can have a version without pci core changes that simply assumes
> this, and things will just work.
>
>
> Now, if we wanted to enforce this limitation, I think the
> cleanest way would be to add a callback in struct PCIDevice:
>
> 	bool is_valid_function(PCIDevice *newfunction)
>
> and call it as each function is added.
> This way aer function can validate that each function
> added shares the same bus.
> And this way issues will be detected directly and not when
> function 0 is added.
>
> I would prefer this validation code to be a patch on top so we can merge
> the functionality directly and avoid blocking it while we figure out the
> best api to validate things.
>
> I don't see why making guest topology match host would
> ever be a problem, but if it's required to support
> configurations where these differ, I'd like to see
> an attempt to address that be split out, after aer
> is supported.
Hi Michael,

Just think about this more,  I think we also should check the vfio
devices whether on the same bus at the time of function 0 is added.
because we don't know the affected devices by a bus reset have
already all been assigned to VM. for example, the multi-function's hotplug.
devices on same bus in host are added to VM one by one. when we
test one device, we haven't yet added the other devices. so I think
the patch should like below. then we could add a vfio_is_valid_function 
in vfio
to test each device whether the affected devices on the same bus.

Thanks,
Chen

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index d940f79..7163b56 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus, int 
bus_num, uint8_t devfn)
      return bus->devices[devfn];
  }

+static int pci_bus_check_devices(PCIBus *bus)
+{
+    PCIDeviceClass *pc;
+    int i, ret = 0;
+
+    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
+        if (!bus->devices[i]) {
+            continue;
+        }
+
+        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
+        if (!pc->is_valid_func) {
+            continue;
+        }
+
+        ret = pc->is_valid_func(bus->devices[i], bus);
+        if (!ret) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
+{
+    if (pdev->bus == bus) {
+        return true;
+    }
+
+    return false;
+}
+
  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
  {
      PCIDevice *pci_dev = (PCIDevice *)qdev;
@@ -1878,6 +1910,14 @@ static void pci_qdev_realize(DeviceState *qdev, 
Error **errp)
          pci_qdev_unrealize(DEVICE(pci_dev), NULL);
          return;
      }
+
+    if (DEVICE(pci_dev)->hotplugged &&
+        pci_get_function_0(pci_dev) == pci_dev &&
+        pci_bus_check_devices(bus)) {
+        error_setg(errp, "failed to hotplug function 0");
+        pci_qdev_unrealize(DEVICE(pci_dev), NULL);
+        return;
+    }
  }

  static void pci_default_realize(PCIDevice *dev, Error **errp)
@@ -2390,6 +2430,7 @@ static void pci_device_class_init(ObjectClass 
*klass, void *data)
      k->bus_type = TYPE_PCI_BUS;
      k->props = pci_props;
      pc->realize = pci_default_realize;
+    pc->is_valid_func = pci_is_valid_function;
  }

  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index dedf277..a89580f 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -191,6 +191,7 @@ typedef struct PCIDeviceClass {

      void (*realize)(PCIDevice *dev, Error **errp);
      int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
+    bool (*is_valid_func)(PCIDevice *dev, PCIBus *bus);
      PCIUnregisterFunc *exit;
      PCIConfigReadFunc *config_read;
      PCIConfigWriteFunc *config_write;



>
>
>
>> v15-v16:
>>     10/14, 11/14 are new to introduce a reset sequence id to specify the
>>     vfio devices has been reset for that reset. other patches aren't modified.
>>
>> v14-v15:
>>     1. add device hot reset callback
>>     2. add bus_in_reset for vfio device to avoid multi do host bus reset
>>
>> v13-v14:
>>     1. for multifunction device, requiring all functions enable AER.(9/13)
>>     2. due to all affected functions receive error signal, ignore no
>>        error occurred function. (12/13)
>>
>> v12-v13:
>>     1. since support multifuncion hotplug, here add callback to enable aer.
>>     2. add pci device pre+post reset for aer host reset.
>>
>> Chen Fan (14):
>>    vfio: extract vfio_get_hot_reset_info as a single function
>>    vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
>>    pcie: modify the capability size assert
>>    vfio: make the 4 bytes aligned for capability size
>>    vfio: add pcie extanded capability support
>>    aer: impove pcie_aer_init to support vfio device
>>    vfio: add aer support for vfio device
>>    vfio: add check host bus reset is support or not
>>    add check reset mechanism when hotplug vfio device
>>    pci: introduce pci bus pre reset
>>    vfio: introduce last reset sequence id
>>    pcie_aer: expose pcie_aer_msg() interface
>>    vfio-pci: pass the aer error to guest
>>    vfio: add 'aer' property to expose aercap
>>
>>   hw/pci-bridge/ioh3420.c            |   2 +-
>>   hw/pci-bridge/xio3130_downstream.c |   2 +-
>>   hw/pci-bridge/xio3130_upstream.c   |   2 +-
>>   hw/pci/pci.c                       |  42 +++
>>   hw/pci/pci_bridge.c                |   3 +
>>   hw/pci/pcie.c                      |   2 +-
>>   hw/pci/pcie_aer.c                  |   6 +-
>>   hw/vfio/pci.c                      | 616 +++++++++++++++++++++++++++++++++----
>>   hw/vfio/pci.h                      |   9 +
>>   include/hw/pci/pci.h               |   1 +
>>   include/hw/pci/pci_bus.h           |   8 +
>>   include/hw/pci/pcie_aer.h          |   3 +-
>>   12 files changed, 624 insertions(+), 72 deletions(-)
>>
>> -- 
>> 1.9.3
>>
>>
>
> .
>

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-02-03  8:54   ` Chen Fan
@ 2016-02-03 13:57     ` Michael S. Tsirkin
  2016-02-04  2:04       ` Chen Fan
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-02-03 13:57 UTC (permalink / raw)
  To: Chen Fan; +Cc: izumi.taku, Cao jin, alex.williamson, qemu-devel

On Wed, Feb 03, 2016 at 04:54:01PM +0800, Chen Fan wrote:
> 
> On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:
> >On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
> >>From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>
> >>For now, for vfio pci passthough devices when qemu receives
> >>an error from host aer report, currentlly just terminate the guest,
> >>but usually user want to know what error occurred but stopping the
> >>guest, so this patches add aer capability support for vfio device,
> >>and pass the error to guest, and have guest driver to recover
> >>from the error.
> >I would like to see a version of this patchset that doesn't
> >depend on pci core changes.
> >I think that if you make this simplifying assumption:
> >
> >- all devices on same bus in guest are on same bus in host
> >
> >then you can handle both reset and hotplug simply in function 0
> >since it will belong to vfio.
> >
> >So we can have a version without pci core changes that simply assumes
> >this, and things will just work.
> >
> >
> >Now, if we wanted to enforce this limitation, I think the
> >cleanest way would be to add a callback in struct PCIDevice:
> >
> >	bool is_valid_function(PCIDevice *newfunction)
> >
> >and call it as each function is added.
> >This way aer function can validate that each function
> >added shares the same bus.
> >And this way issues will be detected directly and not when
> >function 0 is added.
> >
> >I would prefer this validation code to be a patch on top so we can merge
> >the functionality directly and avoid blocking it while we figure out the
> >best api to validate things.
> >
> >I don't see why making guest topology match host would
> >ever be a problem, but if it's required to support
> >configurations where these differ, I'd like to see
> >an attempt to address that be split out, after aer
> >is supported.
> Hi Michael,
> 
> Just think about this more,  I think we also should check the vfio
> devices whether on the same bus at the time of function 0 is added.
> because we don't know the affected devices by a bus reset have
> already all been assigned to VM.

This is something vfio in kernel should check.
You can't rely on qemu being well behaved, so don't
even try to catch cases which would break host in userspace.

qemu should only worry about not breaking guest.


> for example, the multi-function's hotplug.
> devices on same bus in host are added to VM one by one. when we
> test one device, we haven't yet added the other devices.
> so I think
> the patch should like below. then we could add a vfio_is_valid_function in
> vfio
> to test each device whether the affected devices on the same bus.
> 
> Thanks,
> Chen
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index d940f79..7163b56 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num,
> uint8_t devfn)
>      return bus->devices[devfn];
>  }
> 
> +static int pci_bus_check_devices(PCIBus *bus)
> +{
> +    PCIDeviceClass *pc;
> +    int i, ret = 0;
> +
> +    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
> +        if (!bus->devices[i]) {
> +            continue;
> +        }
> +
> +        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
> +        if (!pc->is_valid_func) {
> +            continue;
> +        }
> +
> +        ret = pc->is_valid_func(bus->devices[i], bus);
> +        if (!ret) {
> +            return -1;
> +        }
> +    }
> +    return 0;
> +}
> +
> +static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
> +{
> +    if (pdev->bus == bus) {
> +        return true;
> +    }
> +
> +    return false;
> +}
> +

I don't really understand what is this one doing.
Why do we need a default function?

>  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>  {
>      PCIDevice *pci_dev = (PCIDevice *)qdev;
> @@ -1878,6 +1910,14 @@ static void pci_qdev_realize(DeviceState *qdev, Error
> **errp)
>          pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>          return;
>      }
> +
> +    if (DEVICE(pci_dev)->hotplugged &&
> +        pci_get_function_0(pci_dev) == pci_dev &&
> +        pci_bus_check_devices(bus)) {
> +        error_setg(errp, "failed to hotplug function 0");
> +        pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> +        return;
> +    }
>  }
> 
>  static void pci_default_realize(PCIDevice *dev, Error **errp)
> @@ -2390,6 +2430,7 @@ static void pci_device_class_init(ObjectClass *klass,
> void *data)
>      k->bus_type = TYPE_PCI_BUS;
>      k->props = pci_props;
>      pc->realize = pci_default_realize;
> +    pc->is_valid_func = pci_is_valid_function;
>  }
> 
>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index dedf277..a89580f 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -191,6 +191,7 @@ typedef struct PCIDeviceClass {
> 
>      void (*realize)(PCIDevice *dev, Error **errp);
>      int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
> +    bool (*is_valid_func)(PCIDevice *dev, PCIBus *bus);
>      PCIUnregisterFunc *exit;
>      PCIConfigReadFunc *config_read;
>      PCIConfigWriteFunc *config_write;
> 
> 
> 
> >
> >
> >
> >>v15-v16:
> >>    10/14, 11/14 are new to introduce a reset sequence id to specify the
> >>    vfio devices has been reset for that reset. other patches aren't modified.
> >>
> >>v14-v15:
> >>    1. add device hot reset callback
> >>    2. add bus_in_reset for vfio device to avoid multi do host bus reset
> >>
> >>v13-v14:
> >>    1. for multifunction device, requiring all functions enable AER.(9/13)
> >>    2. due to all affected functions receive error signal, ignore no
> >>       error occurred function. (12/13)
> >>
> >>v12-v13:
> >>    1. since support multifuncion hotplug, here add callback to enable aer.
> >>    2. add pci device pre+post reset for aer host reset.
> >>
> >>Chen Fan (14):
> >>   vfio: extract vfio_get_hot_reset_info as a single function
> >>   vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
> >>   pcie: modify the capability size assert
> >>   vfio: make the 4 bytes aligned for capability size
> >>   vfio: add pcie extanded capability support
> >>   aer: impove pcie_aer_init to support vfio device
> >>   vfio: add aer support for vfio device
> >>   vfio: add check host bus reset is support or not
> >>   add check reset mechanism when hotplug vfio device
> >>   pci: introduce pci bus pre reset
> >>   vfio: introduce last reset sequence id
> >>   pcie_aer: expose pcie_aer_msg() interface
> >>   vfio-pci: pass the aer error to guest
> >>   vfio: add 'aer' property to expose aercap
> >>
> >>  hw/pci-bridge/ioh3420.c            |   2 +-
> >>  hw/pci-bridge/xio3130_downstream.c |   2 +-
> >>  hw/pci-bridge/xio3130_upstream.c   |   2 +-
> >>  hw/pci/pci.c                       |  42 +++
> >>  hw/pci/pci_bridge.c                |   3 +
> >>  hw/pci/pcie.c                      |   2 +-
> >>  hw/pci/pcie_aer.c                  |   6 +-
> >>  hw/vfio/pci.c                      | 616 +++++++++++++++++++++++++++++++++----
> >>  hw/vfio/pci.h                      |   9 +
> >>  include/hw/pci/pci.h               |   1 +
> >>  include/hw/pci/pci_bus.h           |   8 +
> >>  include/hw/pci/pcie_aer.h          |   3 +-
> >>  12 files changed, 624 insertions(+), 72 deletions(-)
> >>
> >>-- 
> >>1.9.3
> >>
> >>
> >
> >.
> >
> 
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-02-03 13:57     ` Michael S. Tsirkin
@ 2016-02-04  2:04       ` Chen Fan
  2016-02-04 11:21         ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: Chen Fan @ 2016-02-04  2:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: izumi.taku, Cao jin, alex.williamson, qemu-devel


On 02/03/2016 09:57 PM, Michael S. Tsirkin wrote:
> On Wed, Feb 03, 2016 at 04:54:01PM +0800, Chen Fan wrote:
>> On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:
>>> On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>
>>>> For now, for vfio pci passthough devices when qemu receives
>>>> an error from host aer report, currentlly just terminate the guest,
>>>> but usually user want to know what error occurred but stopping the
>>>> guest, so this patches add aer capability support for vfio device,
>>>> and pass the error to guest, and have guest driver to recover
>>> >from the error.
>>> I would like to see a version of this patchset that doesn't
>>> depend on pci core changes.
>>> I think that if you make this simplifying assumption:
>>>
>>> - all devices on same bus in guest are on same bus in host
>>>
>>> then you can handle both reset and hotplug simply in function 0
>>> since it will belong to vfio.
>>>
>>> So we can have a version without pci core changes that simply assumes
>>> this, and things will just work.
>>>
>>>
>>> Now, if we wanted to enforce this limitation, I think the
>>> cleanest way would be to add a callback in struct PCIDevice:
>>>
>>> 	bool is_valid_function(PCIDevice *newfunction)
>>>
>>> and call it as each function is added.
>>> This way aer function can validate that each function
>>> added shares the same bus.
>>> And this way issues will be detected directly and not when
>>> function 0 is added.
>>>
>>> I would prefer this validation code to be a patch on top so we can merge
>>> the functionality directly and avoid blocking it while we figure out the
>>> best api to validate things.
>>>
>>> I don't see why making guest topology match host would
>>> ever be a problem, but if it's required to support
>>> configurations where these differ, I'd like to see
>>> an attempt to address that be split out, after aer
>>> is supported.
>> Hi Michael,
>>
>> Just think about this more,  I think we also should check the vfio
>> devices whether on the same bus at the time of function 0 is added.
>> because we don't know the affected devices by a bus reset have
>> already all been assigned to VM.
> This is something vfio in kernel should check.
> You can't rely on qemu being well behaved, so don't
> even try to catch cases which would break host in userspace.
>
> qemu should only worry about not breaking guest.
>
>
>> for example, the multi-function's hotplug.
>> devices on same bus in host are added to VM one by one. when we
>> test one device, we haven't yet added the other devices.
>> so I think
>> the patch should like below. then we could add a vfio_is_valid_function in
>> vfio
>> to test each device whether the affected devices on the same bus.
>>
>> Thanks,
>> Chen
>>
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index d940f79..7163b56 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num,
>> uint8_t devfn)
>>       return bus->devices[devfn];
>>   }
>>
>> +static int pci_bus_check_devices(PCIBus *bus)
>> +{
>> +    PCIDeviceClass *pc;
>> +    int i, ret = 0;
>> +
>> +    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
>> +        if (!bus->devices[i]) {
>> +            continue;
>> +        }
>> +
>> +        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
>> +        if (!pc->is_valid_func) {
>> +            continue;
>> +        }
>> +
>> +        ret = pc->is_valid_func(bus->devices[i], bus);
>> +        if (!ret) {
>> +            return -1;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>> +static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
>> +{
>> +    if (pdev->bus == bus) {
>> +        return true;
>> +    }
>> +
>> +    return false;
>> +}
>> +
> I don't really understand what is this one doing.
> Why do we need a default function?
if the vfio driver in kernel can handle the bus reset for any one
device in qemu without the affected devices assigned. I think
we don't need this default one.
BTW, IIRC at present the devices on the same bus in host can
be assigned to different VM, so if we want to support this kind of
bus reset for an independent device when enable aer, aren't we
limiting the case that others devices on the same bus must be
assigned to current VM?

Thanks,
Chen
>>   static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>>   {
>>       PCIDevice *pci_dev = (PCIDevice *)qdev;
>> @@ -1878,6 +1910,14 @@ static void pci_qdev_realize(DeviceState *qdev, Error
>> **errp)
>>           pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>>           return;
>>       }
>> +
>> +    if (DEVICE(pci_dev)->hotplugged &&
>> +        pci_get_function_0(pci_dev) == pci_dev &&
>> +        pci_bus_check_devices(bus)) {
>> +        error_setg(errp, "failed to hotplug function 0");
>> +        pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>> +        return;
>> +    }
>>   }
>>
>>   static void pci_default_realize(PCIDevice *dev, Error **errp)
>> @@ -2390,6 +2430,7 @@ static void pci_device_class_init(ObjectClass *klass,
>> void *data)
>>       k->bus_type = TYPE_PCI_BUS;
>>       k->props = pci_props;
>>       pc->realize = pci_default_realize;
>> +    pc->is_valid_func = pci_is_valid_function;
>>   }
>>
>>   AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index dedf277..a89580f 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -191,6 +191,7 @@ typedef struct PCIDeviceClass {
>>
>>       void (*realize)(PCIDevice *dev, Error **errp);
>>       int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
>> +    bool (*is_valid_func)(PCIDevice *dev, PCIBus *bus);
>>       PCIUnregisterFunc *exit;
>>       PCIConfigReadFunc *config_read;
>>       PCIConfigWriteFunc *config_write;
>>
>>
>>
>>>
>>>
>>>> v15-v16:
>>>>     10/14, 11/14 are new to introduce a reset sequence id to specify the
>>>>     vfio devices has been reset for that reset. other patches aren't modified.
>>>>
>>>> v14-v15:
>>>>     1. add device hot reset callback
>>>>     2. add bus_in_reset for vfio device to avoid multi do host bus reset
>>>>
>>>> v13-v14:
>>>>     1. for multifunction device, requiring all functions enable AER.(9/13)
>>>>     2. due to all affected functions receive error signal, ignore no
>>>>        error occurred function. (12/13)
>>>>
>>>> v12-v13:
>>>>     1. since support multifuncion hotplug, here add callback to enable aer.
>>>>     2. add pci device pre+post reset for aer host reset.
>>>>
>>>> Chen Fan (14):
>>>>    vfio: extract vfio_get_hot_reset_info as a single function
>>>>    vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
>>>>    pcie: modify the capability size assert
>>>>    vfio: make the 4 bytes aligned for capability size
>>>>    vfio: add pcie extanded capability support
>>>>    aer: impove pcie_aer_init to support vfio device
>>>>    vfio: add aer support for vfio device
>>>>    vfio: add check host bus reset is support or not
>>>>    add check reset mechanism when hotplug vfio device
>>>>    pci: introduce pci bus pre reset
>>>>    vfio: introduce last reset sequence id
>>>>    pcie_aer: expose pcie_aer_msg() interface
>>>>    vfio-pci: pass the aer error to guest
>>>>    vfio: add 'aer' property to expose aercap
>>>>
>>>>   hw/pci-bridge/ioh3420.c            |   2 +-
>>>>   hw/pci-bridge/xio3130_downstream.c |   2 +-
>>>>   hw/pci-bridge/xio3130_upstream.c   |   2 +-
>>>>   hw/pci/pci.c                       |  42 +++
>>>>   hw/pci/pci_bridge.c                |   3 +
>>>>   hw/pci/pcie.c                      |   2 +-
>>>>   hw/pci/pcie_aer.c                  |   6 +-
>>>>   hw/vfio/pci.c                      | 616 +++++++++++++++++++++++++++++++++----
>>>>   hw/vfio/pci.h                      |   9 +
>>>>   include/hw/pci/pci.h               |   1 +
>>>>   include/hw/pci/pci_bus.h           |   8 +
>>>>   include/hw/pci/pcie_aer.h          |   3 +-
>>>>   12 files changed, 624 insertions(+), 72 deletions(-)
>>>>
>>>> -- 
>>>> 1.9.3
>>>>
>>>>
>>> .
>>>
>>
>
> .
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-02-04  2:04       ` Chen Fan
@ 2016-02-04 11:21         ` Michael S. Tsirkin
  2016-02-04 17:46           ` Alex Williamson
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-02-04 11:21 UTC (permalink / raw)
  To: Chen Fan; +Cc: izumi.taku, Cao jin, alex.williamson, qemu-devel

On Thu, Feb 04, 2016 at 10:04:01AM +0800, Chen Fan wrote:
> 
> On 02/03/2016 09:57 PM, Michael S. Tsirkin wrote:
> >On Wed, Feb 03, 2016 at 04:54:01PM +0800, Chen Fan wrote:
> >>On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:
> >>>On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
> >>>>From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>>
> >>>>For now, for vfio pci passthough devices when qemu receives
> >>>>an error from host aer report, currentlly just terminate the guest,
> >>>>but usually user want to know what error occurred but stopping the
> >>>>guest, so this patches add aer capability support for vfio device,
> >>>>and pass the error to guest, and have guest driver to recover
> >>>>from the error.
> >>>I would like to see a version of this patchset that doesn't
> >>>depend on pci core changes.
> >>>I think that if you make this simplifying assumption:
> >>>
> >>>- all devices on same bus in guest are on same bus in host
> >>>
> >>>then you can handle both reset and hotplug simply in function 0
> >>>since it will belong to vfio.
> >>>
> >>>So we can have a version without pci core changes that simply assumes
> >>>this, and things will just work.
> >>>
> >>>
> >>>Now, if we wanted to enforce this limitation, I think the
> >>>cleanest way would be to add a callback in struct PCIDevice:
> >>>
> >>>	bool is_valid_function(PCIDevice *newfunction)
> >>>
> >>>and call it as each function is added.
> >>>This way aer function can validate that each function
> >>>added shares the same bus.
> >>>And this way issues will be detected directly and not when
> >>>function 0 is added.
> >>>
> >>>I would prefer this validation code to be a patch on top so we can merge
> >>>the functionality directly and avoid blocking it while we figure out the
> >>>best api to validate things.
> >>>
> >>>I don't see why making guest topology match host would
> >>>ever be a problem, but if it's required to support
> >>>configurations where these differ, I'd like to see
> >>>an attempt to address that be split out, after aer
> >>>is supported.
> >>Hi Michael,
> >>
> >>Just think about this more,  I think we also should check the vfio
> >>devices whether on the same bus at the time of function 0 is added.
> >>because we don't know the affected devices by a bus reset have
> >>already all been assigned to VM.
> >This is something vfio in kernel should check.
> >You can't rely on qemu being well behaved, so don't
> >even try to catch cases which would break host in userspace.
> >
> >qemu should only worry about not breaking guest.
> >
> >
> >>for example, the multi-function's hotplug.
> >>devices on same bus in host are added to VM one by one. when we
> >>test one device, we haven't yet added the other devices.
> >>so I think
> >>the patch should like below. then we could add a vfio_is_valid_function in
> >>vfio
> >>to test each device whether the affected devices on the same bus.
> >>
> >>Thanks,
> >>Chen
> >>
> >>diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >>index d940f79..7163b56 100644
> >>--- a/hw/pci/pci.c
> >>+++ b/hw/pci/pci.c
> >>@@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num,
> >>uint8_t devfn)
> >>      return bus->devices[devfn];
> >>  }
> >>
> >>+static int pci_bus_check_devices(PCIBus *bus)
> >>+{
> >>+    PCIDeviceClass *pc;
> >>+    int i, ret = 0;
> >>+
> >>+    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
> >>+        if (!bus->devices[i]) {
> >>+            continue;
> >>+        }
> >>+
> >>+        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
> >>+        if (!pc->is_valid_func) {
> >>+            continue;
> >>+        }
> >>+
> >>+        ret = pc->is_valid_func(bus->devices[i], bus);
> >>+        if (!ret) {
> >>+            return -1;
> >>+        }
> >>+    }
> >>+    return 0;
> >>+}
> >>+
> >>+static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
> >>+{
> >>+    if (pdev->bus == bus) {
> >>+        return true;
> >>+    }
> >>+
> >>+    return false;
> >>+}
> >>+
> >I don't really understand what is this one doing.
> >Why do we need a default function?
> if the vfio driver in kernel can handle the bus reset for any one
> device in qemu without the affected devices assigned. I think
> we don't need this default one.
> BTW, IIRC at present the devices on the same bus in host can
> be assigned to different VM, so if we want to support this kind of
> bus reset for an independent device when enable aer, aren't we
> limiting the case that others devices on the same bus must be
> assigned to current VM?
> 
> Thanks,
> Chen

I don't believe this works at the moment, and
I'd expect kernel to prevent this,
so we should not rely on userspace code for this.
Alex, could you comment please?


> >>  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
> >>  {
> >>      PCIDevice *pci_dev = (PCIDevice *)qdev;
> >>@@ -1878,6 +1910,14 @@ static void pci_qdev_realize(DeviceState *qdev, Error
> >>**errp)
> >>          pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> >>          return;
> >>      }
> >>+
> >>+    if (DEVICE(pci_dev)->hotplugged &&
> >>+        pci_get_function_0(pci_dev) == pci_dev &&
> >>+        pci_bus_check_devices(bus)) {
> >>+        error_setg(errp, "failed to hotplug function 0");
> >>+        pci_qdev_unrealize(DEVICE(pci_dev), NULL);
> >>+        return;
> >>+    }
> >>  }
> >>
> >>  static void pci_default_realize(PCIDevice *dev, Error **errp)
> >>@@ -2390,6 +2430,7 @@ static void pci_device_class_init(ObjectClass *klass,
> >>void *data)
> >>      k->bus_type = TYPE_PCI_BUS;
> >>      k->props = pci_props;
> >>      pc->realize = pci_default_realize;
> >>+    pc->is_valid_func = pci_is_valid_function;
> >>  }
> >>
> >>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> >>diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >>index dedf277..a89580f 100644
> >>--- a/include/hw/pci/pci.h
> >>+++ b/include/hw/pci/pci.h
> >>@@ -191,6 +191,7 @@ typedef struct PCIDeviceClass {
> >>
> >>      void (*realize)(PCIDevice *dev, Error **errp);
> >>      int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
> >>+    bool (*is_valid_func)(PCIDevice *dev, PCIBus *bus);
> >>      PCIUnregisterFunc *exit;
> >>      PCIConfigReadFunc *config_read;
> >>      PCIConfigWriteFunc *config_write;
> >>
> >>
> >>
> >>>
> >>>
> >>>>v15-v16:
> >>>>    10/14, 11/14 are new to introduce a reset sequence id to specify the
> >>>>    vfio devices has been reset for that reset. other patches aren't modified.
> >>>>
> >>>>v14-v15:
> >>>>    1. add device hot reset callback
> >>>>    2. add bus_in_reset for vfio device to avoid multi do host bus reset
> >>>>
> >>>>v13-v14:
> >>>>    1. for multifunction device, requiring all functions enable AER.(9/13)
> >>>>    2. due to all affected functions receive error signal, ignore no
> >>>>       error occurred function. (12/13)
> >>>>
> >>>>v12-v13:
> >>>>    1. since support multifuncion hotplug, here add callback to enable aer.
> >>>>    2. add pci device pre+post reset for aer host reset.
> >>>>
> >>>>Chen Fan (14):
> >>>>   vfio: extract vfio_get_hot_reset_info as a single function
> >>>>   vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
> >>>>   pcie: modify the capability size assert
> >>>>   vfio: make the 4 bytes aligned for capability size
> >>>>   vfio: add pcie extanded capability support
> >>>>   aer: impove pcie_aer_init to support vfio device
> >>>>   vfio: add aer support for vfio device
> >>>>   vfio: add check host bus reset is support or not
> >>>>   add check reset mechanism when hotplug vfio device
> >>>>   pci: introduce pci bus pre reset
> >>>>   vfio: introduce last reset sequence id
> >>>>   pcie_aer: expose pcie_aer_msg() interface
> >>>>   vfio-pci: pass the aer error to guest
> >>>>   vfio: add 'aer' property to expose aercap
> >>>>
> >>>>  hw/pci-bridge/ioh3420.c            |   2 +-
> >>>>  hw/pci-bridge/xio3130_downstream.c |   2 +-
> >>>>  hw/pci-bridge/xio3130_upstream.c   |   2 +-
> >>>>  hw/pci/pci.c                       |  42 +++
> >>>>  hw/pci/pci_bridge.c                |   3 +
> >>>>  hw/pci/pcie.c                      |   2 +-
> >>>>  hw/pci/pcie_aer.c                  |   6 +-
> >>>>  hw/vfio/pci.c                      | 616 +++++++++++++++++++++++++++++++++----
> >>>>  hw/vfio/pci.h                      |   9 +
> >>>>  include/hw/pci/pci.h               |   1 +
> >>>>  include/hw/pci/pci_bus.h           |   8 +
> >>>>  include/hw/pci/pcie_aer.h          |   3 +-
> >>>>  12 files changed, 624 insertions(+), 72 deletions(-)
> >>>>
> >>>>-- 
> >>>>1.9.3
> >>>>
> >>>>
> >>>.
> >>>
> >>
> >
> >.
> >
> 
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-02-04 11:21         ` Michael S. Tsirkin
@ 2016-02-04 17:46           ` Alex Williamson
  2016-02-04 18:09             ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2016-02-04 17:46 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Chen Fan, izumi.taku, Cao jin, qemu-devel

On Thu, 4 Feb 2016 13:21:57 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Feb 04, 2016 at 10:04:01AM +0800, Chen Fan wrote:
> > 
> > On 02/03/2016 09:57 PM, Michael S. Tsirkin wrote:  
> > >On Wed, Feb 03, 2016 at 04:54:01PM +0800, Chen Fan wrote:  
> > >>On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:  
> > >>>On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:  
> > >>>>From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > >>>>
> > >>>>For now, for vfio pci passthough devices when qemu receives
> > >>>>an error from host aer report, currentlly just terminate the
> > >>>>guest, but usually user want to know what error occurred but
> > >>>>stopping the guest, so this patches add aer capability support
> > >>>>for vfio device, and pass the error to guest, and have guest
> > >>>>driver to recover from the error.  
> > >>>I would like to see a version of this patchset that doesn't
> > >>>depend on pci core changes.
> > >>>I think that if you make this simplifying assumption:
> > >>>
> > >>>- all devices on same bus in guest are on same bus in host
> > >>>
> > >>>then you can handle both reset and hotplug simply in function 0
> > >>>since it will belong to vfio.
> > >>>
> > >>>So we can have a version without pci core changes that simply
> > >>>assumes this, and things will just work.
> > >>>
> > >>>
> > >>>Now, if we wanted to enforce this limitation, I think the
> > >>>cleanest way would be to add a callback in struct PCIDevice:
> > >>>
> > >>>	bool is_valid_function(PCIDevice *newfunction)
> > >>>
> > >>>and call it as each function is added.
> > >>>This way aer function can validate that each function
> > >>>added shares the same bus.
> > >>>And this way issues will be detected directly and not when
> > >>>function 0 is added.
> > >>>
> > >>>I would prefer this validation code to be a patch on top so we
> > >>>can merge the functionality directly and avoid blocking it while
> > >>>we figure out the best api to validate things.
> > >>>
> > >>>I don't see why making guest topology match host would
> > >>>ever be a problem, but if it's required to support
> > >>>configurations where these differ, I'd like to see
> > >>>an attempt to address that be split out, after aer
> > >>>is supported.  
> > >>Hi Michael,
> > >>
> > >>Just think about this more,  I think we also should check the vfio
> > >>devices whether on the same bus at the time of function 0 is
> > >>added. because we don't know the affected devices by a bus reset
> > >>have already all been assigned to VM.  
> > >This is something vfio in kernel should check.
> > >You can't rely on qemu being well behaved, so don't
> > >even try to catch cases which would break host in userspace.
> > >
> > >qemu should only worry about not breaking guest.
> > >
> > >  
> > >>for example, the multi-function's hotplug.
> > >>devices on same bus in host are added to VM one by one. when we
> > >>test one device, we haven't yet added the other devices.
> > >>so I think
> > >>the patch should like below. then we could add a
> > >>vfio_is_valid_function in vfio
> > >>to test each device whether the affected devices on the same bus.
> > >>
> > >>Thanks,
> > >>Chen
> > >>
> > >>diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > >>index d940f79..7163b56 100644
> > >>--- a/hw/pci/pci.c
> > >>+++ b/hw/pci/pci.c
> > >>@@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus,
> > >>int bus_num, uint8_t devfn)
> > >>      return bus->devices[devfn];
> > >>  }
> > >>
> > >>+static int pci_bus_check_devices(PCIBus *bus)
> > >>+{
> > >>+    PCIDeviceClass *pc;
> > >>+    int i, ret = 0;
> > >>+
> > >>+    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
> > >>+        if (!bus->devices[i]) {
> > >>+            continue;
> > >>+        }
> > >>+
> > >>+        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
> > >>+        if (!pc->is_valid_func) {
> > >>+            continue;
> > >>+        }
> > >>+
> > >>+        ret = pc->is_valid_func(bus->devices[i], bus);
> > >>+        if (!ret) {
> > >>+            return -1;
> > >>+        }
> > >>+    }
> > >>+    return 0;
> > >>+}
> > >>+
> > >>+static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
> > >>+{
> > >>+    if (pdev->bus == bus) {
> > >>+        return true;
> > >>+    }
> > >>+
> > >>+    return false;
> > >>+}
> > >>+  
> > >I don't really understand what is this one doing.
> > >Why do we need a default function?  
> > if the vfio driver in kernel can handle the bus reset for any one
> > device in qemu without the affected devices assigned. I think
> > we don't need this default one.
> > BTW, IIRC at present the devices on the same bus in host can
> > be assigned to different VM, so if we want to support this kind of
> > bus reset for an independent device when enable aer, aren't we
> > limiting the case that others devices on the same bus must be
> > assigned to current VM?
> > 
> > Thanks,
> > Chen  
> 
> I don't believe this works at the moment, and
> I'd expect kernel to prevent this,
> so we should not rely on userspace code for this.
> Alex, could you comment please?

DMA isolation and bus isolation are separate things.  So long as
devices on the same bus are DMA isolated then they can be assigned to
separate VMs or split between host and VM.  However, certain features
like bus reset are not available to the user unless they "own" all of
the DMA isolated sets affected by a bus reset.  The kernel doesn't care
how the user is using them, but they must prove they own them through
vfio group file descriptors.

I thought in previous discussions we decided that unused devices made
the problem set more complicated for userspace so we simplified by
requiring them to be assigned.  For instance imagine a two function
device, with DMA isolation between functions, where we only want one
function assigned to the VM. QEMU would need to learn to take ownership
of the other function without exposing it to the VM simply for the
purpose of being able to perform a bus reset.  Another simplification
was to expose them in the same slot, so we don't need to worry about VM
configurations where one device could be hot-unplugged and the
ownership released, breaking QEMU's ability to do a bus reset on the
remaining device.  There are a lot of configuration restrictions
imposed by adding the requirement that QEMU needs to be able to perform
a host bus reset on a device in order to support this feature.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-02-04 17:46           ` Alex Williamson
@ 2016-02-04 18:09             ` Michael S. Tsirkin
  2016-02-04 20:15               ` Alex Williamson
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-02-04 18:09 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Chen Fan, izumi.taku, Cao jin, qemu-devel

On Thu, Feb 04, 2016 at 10:46:52AM -0700, Alex Williamson wrote:
> On Thu, 4 Feb 2016 13:21:57 +0200
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Thu, Feb 04, 2016 at 10:04:01AM +0800, Chen Fan wrote:
> > > 
> > > On 02/03/2016 09:57 PM, Michael S. Tsirkin wrote:  
> > > >On Wed, Feb 03, 2016 at 04:54:01PM +0800, Chen Fan wrote:  
> > > >>On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:  
> > > >>>On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:  
> > > >>>>From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > >>>>
> > > >>>>For now, for vfio pci passthough devices when qemu receives
> > > >>>>an error from host aer report, currentlly just terminate the
> > > >>>>guest, but usually user want to know what error occurred but
> > > >>>>stopping the guest, so this patches add aer capability support
> > > >>>>for vfio device, and pass the error to guest, and have guest
> > > >>>>driver to recover from the error.  
> > > >>>I would like to see a version of this patchset that doesn't
> > > >>>depend on pci core changes.
> > > >>>I think that if you make this simplifying assumption:
> > > >>>
> > > >>>- all devices on same bus in guest are on same bus in host
> > > >>>
> > > >>>then you can handle both reset and hotplug simply in function 0
> > > >>>since it will belong to vfio.
> > > >>>
> > > >>>So we can have a version without pci core changes that simply
> > > >>>assumes this, and things will just work.
> > > >>>
> > > >>>
> > > >>>Now, if we wanted to enforce this limitation, I think the
> > > >>>cleanest way would be to add a callback in struct PCIDevice:
> > > >>>
> > > >>>	bool is_valid_function(PCIDevice *newfunction)
> > > >>>
> > > >>>and call it as each function is added.
> > > >>>This way aer function can validate that each function
> > > >>>added shares the same bus.
> > > >>>And this way issues will be detected directly and not when
> > > >>>function 0 is added.
> > > >>>
> > > >>>I would prefer this validation code to be a patch on top so we
> > > >>>can merge the functionality directly and avoid blocking it while
> > > >>>we figure out the best api to validate things.
> > > >>>
> > > >>>I don't see why making guest topology match host would
> > > >>>ever be a problem, but if it's required to support
> > > >>>configurations where these differ, I'd like to see
> > > >>>an attempt to address that be split out, after aer
> > > >>>is supported.  
> > > >>Hi Michael,
> > > >>
> > > >>Just think about this more,  I think we also should check the vfio
> > > >>devices whether on the same bus at the time of function 0 is
> > > >>added. because we don't know the affected devices by a bus reset
> > > >>have already all been assigned to VM.  
> > > >This is something vfio in kernel should check.
> > > >You can't rely on qemu being well behaved, so don't
> > > >even try to catch cases which would break host in userspace.
> > > >
> > > >qemu should only worry about not breaking guest.
> > > >
> > > >  
> > > >>for example, the multi-function's hotplug.
> > > >>devices on same bus in host are added to VM one by one. when we
> > > >>test one device, we haven't yet added the other devices.
> > > >>so I think
> > > >>the patch should like below. then we could add a
> > > >>vfio_is_valid_function in vfio
> > > >>to test each device whether the affected devices on the same bus.
> > > >>
> > > >>Thanks,
> > > >>Chen
> > > >>
> > > >>diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > >>index d940f79..7163b56 100644
> > > >>--- a/hw/pci/pci.c
> > > >>+++ b/hw/pci/pci.c
> > > >>@@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus,
> > > >>int bus_num, uint8_t devfn)
> > > >>      return bus->devices[devfn];
> > > >>  }
> > > >>
> > > >>+static int pci_bus_check_devices(PCIBus *bus)
> > > >>+{
> > > >>+    PCIDeviceClass *pc;
> > > >>+    int i, ret = 0;
> > > >>+
> > > >>+    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
> > > >>+        if (!bus->devices[i]) {
> > > >>+            continue;
> > > >>+        }
> > > >>+
> > > >>+        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
> > > >>+        if (!pc->is_valid_func) {
> > > >>+            continue;
> > > >>+        }
> > > >>+
> > > >>+        ret = pc->is_valid_func(bus->devices[i], bus);
> > > >>+        if (!ret) {
> > > >>+            return -1;
> > > >>+        }
> > > >>+    }
> > > >>+    return 0;
> > > >>+}
> > > >>+
> > > >>+static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
> > > >>+{
> > > >>+    if (pdev->bus == bus) {
> > > >>+        return true;
> > > >>+    }
> > > >>+
> > > >>+    return false;
> > > >>+}
> > > >>+  
> > > >I don't really understand what is this one doing.
> > > >Why do we need a default function?  
> > > if the vfio driver in kernel can handle the bus reset for any one
> > > device in qemu without the affected devices assigned. I think
> > > we don't need this default one.
> > > BTW, IIRC at present the devices on the same bus in host can
> > > be assigned to different VM, so if we want to support this kind of
> > > bus reset for an independent device when enable aer, aren't we
> > > limiting the case that others devices on the same bus must be
> > > assigned to current VM?
> > > 
> > > Thanks,
> > > Chen  
> > 
> > I don't believe this works at the moment, and
> > I'd expect kernel to prevent this,
> > so we should not rely on userspace code for this.
> > Alex, could you comment please?
> 
> DMA isolation and bus isolation are separate things.  So long as
> devices on the same bus are DMA isolated then they can be assigned to
> separate VMs or split between host and VM.  However, certain features
> like bus reset are not available to the user unless they "own" all of
> the DMA isolated sets affected by a bus reset.  The kernel doesn't care
> how the user is using them, but they must prove they own them through
> vfio group file descriptors.

OK this makes sense.
So I think the solution is for userspace to make sure bus reset
is available before exposing aer to guest.
For example, attempt a bus reset.


> I thought in previous discussions we decided that unused devices made
> the problem set more complicated for userspace so we simplified by
> requiring them to be assigned.  For instance imagine a two function
> device, with DMA isolation between functions, where we only want one
> function assigned to the VM. QEMU would need to learn to take ownership
> of the other function without exposing it to the VM simply for the
> purpose of being able to perform a bus reset.

Hmm this might be a security problem.
Ideally we should not touch devices we don't *need* to touch.
But given kernel requires this at the moment (IIUC)
QEMU could open the device
but not expose it to guest until actually asked.

>  Another simplification
> was to expose them in the same slot, so we don't need to worry about VM
> configurations where one device could be hot-unplugged and the
> ownership released, breaking QEMU's ability to do a bus reset on the
> remaining device.

Same would apply here.

>  There are a lot of configuration restrictions
> imposed by adding the requirement that QEMU needs to be able to perform
> a host bus reset on a device in order to support this feature.  Thanks,
> 
> Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-02-04 18:09             ` Michael S. Tsirkin
@ 2016-02-04 20:15               ` Alex Williamson
  2016-02-04 21:58                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2016-02-04 20:15 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Chen Fan, izumi taku, Cao jin, qemu-devel



----- Original Message -----
> On Thu, Feb 04, 2016 at 10:46:52AM -0700, Alex Williamson wrote:
> > On Thu, 4 Feb 2016 13:21:57 +0200
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > 
> > > On Thu, Feb 04, 2016 at 10:04:01AM +0800, Chen Fan wrote:
> > > > 
> > > > On 02/03/2016 09:57 PM, Michael S. Tsirkin wrote:
> > > > >On Wed, Feb 03, 2016 at 04:54:01PM +0800, Chen Fan wrote:
> > > > >>On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:
> > > > >>>On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
> > > > >>>>From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > >>>>
> > > > >>>>For now, for vfio pci passthough devices when qemu receives
> > > > >>>>an error from host aer report, currentlly just terminate the
> > > > >>>>guest, but usually user want to know what error occurred but
> > > > >>>>stopping the guest, so this patches add aer capability support
> > > > >>>>for vfio device, and pass the error to guest, and have guest
> > > > >>>>driver to recover from the error.
> > > > >>>I would like to see a version of this patchset that doesn't
> > > > >>>depend on pci core changes.
> > > > >>>I think that if you make this simplifying assumption:
> > > > >>>
> > > > >>>- all devices on same bus in guest are on same bus in host
> > > > >>>
> > > > >>>then you can handle both reset and hotplug simply in function 0
> > > > >>>since it will belong to vfio.
> > > > >>>
> > > > >>>So we can have a version without pci core changes that simply
> > > > >>>assumes this, and things will just work.
> > > > >>>
> > > > >>>
> > > > >>>Now, if we wanted to enforce this limitation, I think the
> > > > >>>cleanest way would be to add a callback in struct PCIDevice:
> > > > >>>
> > > > >>>	bool is_valid_function(PCIDevice *newfunction)
> > > > >>>
> > > > >>>and call it as each function is added.
> > > > >>>This way aer function can validate that each function
> > > > >>>added shares the same bus.
> > > > >>>And this way issues will be detected directly and not when
> > > > >>>function 0 is added.
> > > > >>>
> > > > >>>I would prefer this validation code to be a patch on top so we
> > > > >>>can merge the functionality directly and avoid blocking it while
> > > > >>>we figure out the best api to validate things.
> > > > >>>
> > > > >>>I don't see why making guest topology match host would
> > > > >>>ever be a problem, but if it's required to support
> > > > >>>configurations where these differ, I'd like to see
> > > > >>>an attempt to address that be split out, after aer
> > > > >>>is supported.
> > > > >>Hi Michael,
> > > > >>
> > > > >>Just think about this more,  I think we also should check the vfio
> > > > >>devices whether on the same bus at the time of function 0 is
> > > > >>added. because we don't know the affected devices by a bus reset
> > > > >>have already all been assigned to VM.
> > > > >This is something vfio in kernel should check.
> > > > >You can't rely on qemu being well behaved, so don't
> > > > >even try to catch cases which would break host in userspace.
> > > > >
> > > > >qemu should only worry about not breaking guest.
> > > > >
> > > > >  
> > > > >>for example, the multi-function's hotplug.
> > > > >>devices on same bus in host are added to VM one by one. when we
> > > > >>test one device, we haven't yet added the other devices.
> > > > >>so I think
> > > > >>the patch should like below. then we could add a
> > > > >>vfio_is_valid_function in vfio
> > > > >>to test each device whether the affected devices on the same bus.
> > > > >>
> > > > >>Thanks,
> > > > >>Chen
> > > > >>
> > > > >>diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > > >>index d940f79..7163b56 100644
> > > > >>--- a/hw/pci/pci.c
> > > > >>+++ b/hw/pci/pci.c
> > > > >>@@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus,
> > > > >>int bus_num, uint8_t devfn)
> > > > >>      return bus->devices[devfn];
> > > > >>  }
> > > > >>
> > > > >>+static int pci_bus_check_devices(PCIBus *bus)
> > > > >>+{
> > > > >>+    PCIDeviceClass *pc;
> > > > >>+    int i, ret = 0;
> > > > >>+
> > > > >>+    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
> > > > >>+        if (!bus->devices[i]) {
> > > > >>+            continue;
> > > > >>+        }
> > > > >>+
> > > > >>+        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
> > > > >>+        if (!pc->is_valid_func) {
> > > > >>+            continue;
> > > > >>+        }
> > > > >>+
> > > > >>+        ret = pc->is_valid_func(bus->devices[i], bus);
> > > > >>+        if (!ret) {
> > > > >>+            return -1;
> > > > >>+        }
> > > > >>+    }
> > > > >>+    return 0;
> > > > >>+}
> > > > >>+
> > > > >>+static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
> > > > >>+{
> > > > >>+    if (pdev->bus == bus) {
> > > > >>+        return true;
> > > > >>+    }
> > > > >>+
> > > > >>+    return false;
> > > > >>+}
> > > > >>+
> > > > >I don't really understand what is this one doing.
> > > > >Why do we need a default function?
> > > > if the vfio driver in kernel can handle the bus reset for any one
> > > > device in qemu without the affected devices assigned. I think
> > > > we don't need this default one.
> > > > BTW, IIRC at present the devices on the same bus in host can
> > > > be assigned to different VM, so if we want to support this kind of
> > > > bus reset for an independent device when enable aer, aren't we
> > > > limiting the case that others devices on the same bus must be
> > > > assigned to current VM?
> > > > 
> > > > Thanks,
> > > > Chen
> > > 
> > > I don't believe this works at the moment, and
> > > I'd expect kernel to prevent this,
> > > so we should not rely on userspace code for this.
> > > Alex, could you comment please?
> > 
> > DMA isolation and bus isolation are separate things.  So long as
> > devices on the same bus are DMA isolated then they can be assigned to
> > separate VMs or split between host and VM.  However, certain features
> > like bus reset are not available to the user unless they "own" all of
> > the DMA isolated sets affected by a bus reset.  The kernel doesn't care
> > how the user is using them, but they must prove they own them through
> > vfio group file descriptors.
> 
> OK this makes sense.
> So I think the solution is for userspace to make sure bus reset
> is available before exposing aer to guest.
> For example, attempt a bus reset.

The API includes a mechanism for retrieving the affected devices without
making it necessary to actually perform a bus reset.

> > I thought in previous discussions we decided that unused devices made
> > the problem set more complicated for userspace so we simplified by
> > requiring them to be assigned.  For instance imagine a two function
> > device, with DMA isolation between functions, where we only want one
> > function assigned to the VM. QEMU would need to learn to take ownership
> > of the other function without exposing it to the VM simply for the
> > purpose of being able to perform a bus reset.
> 
> Hmm this might be a security problem.

How so?  We don't simply simply trust that the user previously validated
the ability to do a bus reset and allow it any time they ask, each reset
needs to pass the file descriptors related to the affected devices.

> Ideally we should not touch devices we don't *need* to touch.
> But given kernel requires this at the moment (IIUC)
> QEMU could open the device
> but not expose it to guest until actually asked.

Well, a bus reset counts as needing to touch the device.  I don't see
how it could not.  Requiring QEMU to add a new mode of holding a device
only for ownership purposes sounds like an expansion of the scope of
these changes, which is exactly what we don't need at this point.

> >  Another simplification
> > was to expose them in the same slot, so we don't need to worry about VM
> > configurations where one device could be hot-unplugged and the
> > ownership released, breaking QEMU's ability to do a bus reset on the
> > remaining device.
> 
> Same would apply here.

Again, expanding scope versus configuration requirement, I take the
latter.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest
  2016-02-04 20:15               ` Alex Williamson
@ 2016-02-04 21:58                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-02-04 21:58 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Chen Fan, izumi taku, Cao jin, qemu-devel

On Thu, Feb 04, 2016 at 03:15:39PM -0500, Alex Williamson wrote:
> 
> 
> ----- Original Message -----
> > On Thu, Feb 04, 2016 at 10:46:52AM -0700, Alex Williamson wrote:
> > > On Thu, 4 Feb 2016 13:21:57 +0200
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > 
> > > > On Thu, Feb 04, 2016 at 10:04:01AM +0800, Chen Fan wrote:
> > > > > 
> > > > > On 02/03/2016 09:57 PM, Michael S. Tsirkin wrote:
> > > > > >On Wed, Feb 03, 2016 at 04:54:01PM +0800, Chen Fan wrote:
> > > > > >>On 01/17/2016 02:34 AM, Michael S. Tsirkin wrote:
> > > > > >>>On Tue, Jan 12, 2016 at 10:43:01AM +0800, Cao jin wrote:
> > > > > >>>>From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > >>>>
> > > > > >>>>For now, for vfio pci passthough devices when qemu receives
> > > > > >>>>an error from host aer report, currentlly just terminate the
> > > > > >>>>guest, but usually user want to know what error occurred but
> > > > > >>>>stopping the guest, so this patches add aer capability support
> > > > > >>>>for vfio device, and pass the error to guest, and have guest
> > > > > >>>>driver to recover from the error.
> > > > > >>>I would like to see a version of this patchset that doesn't
> > > > > >>>depend on pci core changes.
> > > > > >>>I think that if you make this simplifying assumption:
> > > > > >>>
> > > > > >>>- all devices on same bus in guest are on same bus in host
> > > > > >>>
> > > > > >>>then you can handle both reset and hotplug simply in function 0
> > > > > >>>since it will belong to vfio.
> > > > > >>>
> > > > > >>>So we can have a version without pci core changes that simply
> > > > > >>>assumes this, and things will just work.
> > > > > >>>
> > > > > >>>
> > > > > >>>Now, if we wanted to enforce this limitation, I think the
> > > > > >>>cleanest way would be to add a callback in struct PCIDevice:
> > > > > >>>
> > > > > >>>	bool is_valid_function(PCIDevice *newfunction)
> > > > > >>>
> > > > > >>>and call it as each function is added.
> > > > > >>>This way aer function can validate that each function
> > > > > >>>added shares the same bus.
> > > > > >>>And this way issues will be detected directly and not when
> > > > > >>>function 0 is added.
> > > > > >>>
> > > > > >>>I would prefer this validation code to be a patch on top so we
> > > > > >>>can merge the functionality directly and avoid blocking it while
> > > > > >>>we figure out the best api to validate things.
> > > > > >>>
> > > > > >>>I don't see why making guest topology match host would
> > > > > >>>ever be a problem, but if it's required to support
> > > > > >>>configurations where these differ, I'd like to see
> > > > > >>>an attempt to address that be split out, after aer
> > > > > >>>is supported.
> > > > > >>Hi Michael,
> > > > > >>
> > > > > >>Just think about this more,  I think we also should check the vfio
> > > > > >>devices whether on the same bus at the time of function 0 is
> > > > > >>added. because we don't know the affected devices by a bus reset
> > > > > >>have already all been assigned to VM.
> > > > > >This is something vfio in kernel should check.
> > > > > >You can't rely on qemu being well behaved, so don't
> > > > > >even try to catch cases which would break host in userspace.
> > > > > >
> > > > > >qemu should only worry about not breaking guest.
> > > > > >
> > > > > >  
> > > > > >>for example, the multi-function's hotplug.
> > > > > >>devices on same bus in host are added to VM one by one. when we
> > > > > >>test one device, we haven't yet added the other devices.
> > > > > >>so I think
> > > > > >>the patch should like below. then we could add a
> > > > > >>vfio_is_valid_function in vfio
> > > > > >>to test each device whether the affected devices on the same bus.
> > > > > >>
> > > > > >>Thanks,
> > > > > >>Chen
> > > > > >>
> > > > > >>diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > > > >>index d940f79..7163b56 100644
> > > > > >>--- a/hw/pci/pci.c
> > > > > >>+++ b/hw/pci/pci.c
> > > > > >>@@ -1836,6 +1836,38 @@ PCIDevice *pci_find_device(PCIBus *bus,
> > > > > >>int bus_num, uint8_t devfn)
> > > > > >>      return bus->devices[devfn];
> > > > > >>  }
> > > > > >>
> > > > > >>+static int pci_bus_check_devices(PCIBus *bus)
> > > > > >>+{
> > > > > >>+    PCIDeviceClass *pc;
> > > > > >>+    int i, ret = 0;
> > > > > >>+
> > > > > >>+    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
> > > > > >>+        if (!bus->devices[i]) {
> > > > > >>+            continue;
> > > > > >>+        }
> > > > > >>+
> > > > > >>+        pc = PCI_DEVICE_GET_CLASS(bus->devices[i]);
> > > > > >>+        if (!pc->is_valid_func) {
> > > > > >>+            continue;
> > > > > >>+        }
> > > > > >>+
> > > > > >>+        ret = pc->is_valid_func(bus->devices[i], bus);
> > > > > >>+        if (!ret) {
> > > > > >>+            return -1;
> > > > > >>+        }
> > > > > >>+    }
> > > > > >>+    return 0;
> > > > > >>+}
> > > > > >>+
> > > > > >>+static bool pci_is_valid_function(PCIDevice *pdev, PCIBus *bus)
> > > > > >>+{
> > > > > >>+    if (pdev->bus == bus) {
> > > > > >>+        return true;
> > > > > >>+    }
> > > > > >>+
> > > > > >>+    return false;
> > > > > >>+}
> > > > > >>+
> > > > > >I don't really understand what is this one doing.
> > > > > >Why do we need a default function?
> > > > > if the vfio driver in kernel can handle the bus reset for any one
> > > > > device in qemu without the affected devices assigned. I think
> > > > > we don't need this default one.
> > > > > BTW, IIRC at present the devices on the same bus in host can
> > > > > be assigned to different VM, so if we want to support this kind of
> > > > > bus reset for an independent device when enable aer, aren't we
> > > > > limiting the case that others devices on the same bus must be
> > > > > assigned to current VM?
> > > > > 
> > > > > Thanks,
> > > > > Chen
> > > > 
> > > > I don't believe this works at the moment, and
> > > > I'd expect kernel to prevent this,
> > > > so we should not rely on userspace code for this.
> > > > Alex, could you comment please?
> > > 
> > > DMA isolation and bus isolation are separate things.  So long as
> > > devices on the same bus are DMA isolated then they can be assigned to
> > > separate VMs or split between host and VM.  However, certain features
> > > like bus reset are not available to the user unless they "own" all of
> > > the DMA isolated sets affected by a bus reset.  The kernel doesn't care
> > > how the user is using them, but they must prove they own them through
> > > vfio group file descriptors.
> > 
> > OK this makes sense.
> > So I think the solution is for userspace to make sure bus reset
> > is available before exposing aer to guest.
> > For example, attempt a bus reset.
> 
> The API includes a mechanism for retrieving the affected devices without
> making it necessary to actually perform a bus reset.
> 
> > > I thought in previous discussions we decided that unused devices made
> > > the problem set more complicated for userspace so we simplified by
> > > requiring them to be assigned.  For instance imagine a two function
> > > device, with DMA isolation between functions, where we only want one
> > > function assigned to the VM. QEMU would need to learn to take ownership
> > > of the other function without exposing it to the VM simply for the
> > > purpose of being able to perform a bus reset.
> > 
> > Hmm this might be a security problem.
> 
> How so?  We don't simply simply trust that the user previously validated
> the ability to do a bus reset and allow it any time they ask, each reset
> needs to pass the file descriptors related to the affected devices.

Not a problem. A risk.  Simply passing in
each extra function to guest increases an attack surface.  We should not
expose devices to guest if it's not strictly required.

> > Ideally we should not touch devices we don't *need* to touch.
> > But given kernel requires this at the moment (IIUC)
> > QEMU could open the device
> > but not expose it to guest until actually asked.
> 
> Well, a bus reset counts as needing to touch the device.  I don't see
> how it could not.  Requiring QEMU to add a new mode of holding a device
> only for ownership purposes sounds like an expansion of the scope of
> these changes, which is exactly what we don't need at this point.

All this can be done gradually, I have no problem with that.

> > >  Another simplification
> > > was to expose them in the same slot, so we don't need to worry about VM
> > > configurations where one device could be hot-unplugged and the
> > > ownership released, breaking QEMU's ability to do a bus reset on the
> > > remaining device.
> > 
> > Same would apply here.
> 
> Again, expanding scope versus configuration requirement, I take the
> latter.  Thanks,
> 
> Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2016-02-04 21:59 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-12  2:43 [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Cao jin
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 01/14] vfio: extract vfio_get_hot_reset_info as a single function Cao jin
2016-01-17 12:48   ` Marcel Apfelbaum
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 02/14] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset Cao jin
2016-01-17 13:01   ` Marcel Apfelbaum
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 03/14] pcie: modify the capability size assert Cao jin
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 04/14] vfio: make the 4 bytes aligned for capability size Cao jin
2016-01-17 12:30   ` Marcel Apfelbaum
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 05/14] vfio: add pcie extanded capability support Cao jin
2016-01-17 13:22   ` Marcel Apfelbaum
2016-01-19  9:44     ` Chen Fan
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 06/14] aer: impove pcie_aer_init to support vfio device Cao jin
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 07/14] vfio: add aer support for " Cao jin
2016-01-18  9:12   ` Marcel Apfelbaum
2016-01-19  9:47     ` Chen Fan
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 08/14] vfio: add check host bus reset is support or not Cao jin
2016-01-18 10:32   ` Marcel Apfelbaum
2016-01-19  9:55     ` Chen Fan
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 09/14] add check reset mechanism when hotplug vfio device Cao jin
2016-01-18 11:03   ` Marcel Apfelbaum
2016-01-19  1:46     ` Cao jin
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 10/14] pci: introduce pci bus pre reset Cao jin
2016-01-14 20:36   ` Alex Williamson
2016-01-19 10:15     ` Chen Fan
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 11/14] vfio: introduce last reset sequence id Cao jin
2016-01-14 20:43   ` Alex Williamson
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 12/14] pcie_aer: expose pcie_aer_msg() interface Cao jin
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 13/14] vfio-pci: pass the aer error to guest Cao jin
2016-01-18 10:45   ` Marcel Apfelbaum
2016-01-19  9:27     ` Chen Fan
2016-01-12  2:43 ` [Qemu-devel] [PATCH v16 14/14] vfio: add 'aer' property to expose aercap Cao jin
2016-01-18 10:46   ` Marcel Apfelbaum
2016-01-16 18:34 ` [Qemu-devel] [PATCH v16 00/14] vfio-pci: pass the aer error to guest Michael S. Tsirkin
2016-01-19  9:09   ` Chen Fan
2016-02-03  8:54   ` Chen Fan
2016-02-03 13:57     ` Michael S. Tsirkin
2016-02-04  2:04       ` Chen Fan
2016-02-04 11:21         ` Michael S. Tsirkin
2016-02-04 17:46           ` Alex Williamson
2016-02-04 18:09             ` Michael S. Tsirkin
2016-02-04 20:15               ` Alex Williamson
2016-02-04 21:58                 ` Michael S. Tsirkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).