All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
@ 2016-05-27  2:12 Zhou Jie
  2016-05-27 16:06 ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-05-27  2:12 UTC (permalink / raw)
  To: qemu-devel, alex.williamson, fan.chen
  Cc: izumi.taku, caoj.fnst, mst, Chen Fan, Zhou Jie

From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we introduce a resmue notifier to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for resume notification
     If host vfio driver does not support for resume notification,
     directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Delay the guest directed bus reset.
  4. Wait for resume notification.
     If we don't get the resume notification from the host after
     some timeout, we would abort the guest directed bus reset
     altogether and unplug of the device to prevent it from further
     interacting with the VM.
  5. After get the resume notification reset bus.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
---
v7->v8
 *Add a comment for why VFIO_RESET_TIMEOUT value is 1000.
 *change vfio_resume_cap to pci_aer_has_resume.
 *change vfio_resume_notifier_handler to vfio_aer_resume_notifier_handler.
 *change reset_timer to pci_aer_reset_blocked_timer.
 *Remove error_report for not supporting resume notification in
  vfio_populate_device function.
 *All error and resume tracking is done with atomic bitmap on function 0.
 *Remove stalling any access to the device until resume is signaled.
  Because guest OS need read configure space to know the reason of error.
  And it should up to guest OS to decide to stop access BAR region.
 *Still use timer to delay reset.
  Because vfio_aer_resume_notifier_handler cann't be invoked when
  vfio_pci_reset is blocked.

 hw/vfio/pci.c              | 223 ++++++++++++++++++++++++++++++++++++++++++++-
 hw/vfio/pci.h              |   5 +
 linux-headers/linux/vfio.h |   1 +
 3 files changed, 228 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6877a3d..77d86d8 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,12 @@
 #include "trace.h"
 
 #define MSIX_CAP_LENGTH 12
+/*
+ * Timeout for waiting resume notification, it is 1 second.
+ * The resume notificaton will be sent after host aer error recovery.
+ * For hardware bus reset 1 second will be enough.
+ */
+#define VFIO_RESET_TIMEOUT 1000
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
@@ -1937,6 +1943,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
     VFIOGroup *group;
     int ret, i, devfn, range_limit;
 
+    if (!vdev->pci_aer_has_resume) {
+        error_setg(errp, "vfio: Cannot enable AER for device %s,"
+                   " host vfio driver does not support for"
+                   " resume notification",
+                   vdev->vbasedev.name);
+        return;
+    }
+
     ret = vfio_get_hot_reset_info(vdev, &info);
     if (ret) {
         error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2594,6 +2608,17 @@ static int vfio_populate_device(VFIOPCIDevice *vdev)
                      vbasedev->name);
     }
 
+    irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX;
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+    if (ret) {
+        /* This can fail for an old kernel or legacy PCI dev */
+        trace_vfio_populate_device_get_irq_info_failure();
+        ret = 0;
+    } else if (irq_info.count == 1) {
+        vdev->pci_aer_has_resume = true;
+    }
+
 error:
     return ret;
 }
@@ -2606,6 +2631,72 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
     vfio_put_base_device(&vdev->vbasedev);
 }
 
+static void vfio_aer_error_set_signaled(VFIOPCIDevice *vdev)
+{
+    PCIDevice *dev = &vdev->pdev;
+    PCIDevice *dev_0 = pci_get_function_0(dev);
+    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+    unsigned int func = PCI_FUNC(dev->devfn);
+
+    bitmap_set_atomic(vdev_0->pci_aer_error_signaled_bitmap, func, 1);
+}
+
+static void vfio_aer_resume_set_signaled(VFIOPCIDevice *vdev)
+{
+    PCIDevice *dev = &vdev->pdev;
+    PCIDevice *dev_0 = pci_get_function_0(dev);
+    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+    unsigned int func = PCI_FUNC(dev->devfn);
+
+    bitmap_set_atomic(vdev_0->pci_aer_resume_signaled_bitmap, func, 1);
+}
+
+static void vfio_aer_error_clear_all_signaled(VFIOPCIDevice *vdev)
+{
+    PCIDevice *dev = &vdev->pdev;
+    PCIDevice *dev_0 = pci_get_function_0(dev);
+    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+
+    bitmap_test_and_clear_atomic(vdev_0->pci_aer_error_signaled_bitmap,
+                                 0, PCI_FUNC_MAX);
+}
+
+static void vfio_aer_resume_clear_all_signaled(VFIOPCIDevice *vdev)
+{
+    PCIDevice *dev = &vdev->pdev;
+    PCIDevice *dev_0 = pci_get_function_0(dev);
+    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+
+    bitmap_test_and_clear_atomic(vdev_0->pci_aer_resume_signaled_bitmap,
+                                 0, PCI_FUNC_MAX);
+}
+
+static int vfio_aer_error_is_not_signaled(VFIOPCIDevice *vdev)
+{
+    PCIDevice *dev = &vdev->pdev;
+    PCIDevice *dev_0 = pci_get_function_0(dev);
+    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+
+    return slow_bitmap_empty(vdev_0->pci_aer_error_signaled_bitmap,
+                             PCI_FUNC_MAX);
+}
+
+static int vfio_aer_resume_match_error_signaled(VFIOPCIDevice *vdev)
+{
+    PCIDevice *dev = &vdev->pdev;
+    PCIDevice *dev_0 = pci_get_function_0(dev);
+    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+    DECLARE_BITMAP(resume_bitmap, PCI_FUNC_MAX);
+
+    slow_bitmap_and(resume_bitmap,
+                    vdev_0->pci_aer_error_signaled_bitmap,
+                    vdev_0->pci_aer_resume_signaled_bitmap,
+                    PCI_FUNC_MAX);
+    return slow_bitmap_equal(resume_bitmap,
+                             vdev_0->pci_aer_error_signaled_bitmap,
+                             PCI_FUNC_MAX);
+}
+
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
@@ -2661,6 +2752,12 @@ static void vfio_err_notifier_handler(void *opaque)
         msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
                                  PCI_ERR_ROOT_CMD_NONFATAL_EN;
 
+        if (isfatal) {
+            if (vfio_aer_error_is_not_signaled(vdev)) {
+                vfio_aer_resume_clear_all_signaled(vdev);
+            }
+            vfio_aer_error_set_signaled(vdev);
+        }
         pcie_aer_msg(dev, &msg);
         return;
     }
@@ -2756,6 +2853,98 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
     event_notifier_cleanup(&vdev->err_notifier);
 }
 
+static void vfio_aer_resume_notifier_handler(void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->resume_notifier)) {
+        return;
+    }
+
+    vfio_aer_resume_set_signaled(vdev);
+    if (vfio_aer_resume_match_error_signaled(vdev) &&
+        vdev->pci_aer_reset_blocked_timer != NULL) {
+        timer_mod(vdev->pci_aer_reset_blocked_timer,
+                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL));
+    }
+}
+
+static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev)
+{
+    int ret;
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER) ||
+        !vdev->pci_aer_has_resume) {
+        return;
+    }
+
+    if (event_notifier_init(&vdev->resume_notifier, 0)) {
+        error_report("vfio: Unable to init event notifier for"
+                     " resume notification");
+        vdev->pci_aer_has_resume = false;
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = event_notifier_get_fd(&vdev->resume_notifier);
+    qemu_set_fd_handler(*pfd, vfio_aer_resume_notifier_handler, NULL, vdev);
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to set up resume notification");
+        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->resume_notifier);
+        vdev->pci_aer_has_resume = false;
+    }
+    g_free(irq_set);
+}
+
+static void vfio_unregister_aer_resume_notifier(VFIOPCIDevice *vdev)
+{
+    int argsz;
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret;
+
+    if (!vdev->pci_aer_has_resume) {
+        return;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = -1;
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        error_report("vfio: Failed to de-assign error fd: %m");
+    }
+    g_free(irq_set);
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->resume_notifier),
+                        NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->resume_notifier);
+}
+
 static void vfio_req_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
@@ -3061,6 +3250,7 @@ static int vfio_initfn(PCIDevice *pdev)
     }
 
     vfio_register_err_notifier(vdev);
+    vfio_register_aer_resume_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
 
@@ -3091,6 +3281,7 @@ static void vfio_exitfn(PCIDevice *pdev)
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 
     vfio_unregister_req_notifier(vdev);
+    vfio_unregister_aer_resume_notifier(vdev);
     vfio_unregister_err_notifier(vdev);
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
     vfio_disable_interrupts(vdev);
@@ -3101,6 +3292,22 @@ static void vfio_exitfn(PCIDevice *pdev)
     vfio_bars_exit(vdev);
 }
 
+static void vfio_pci_delayed_reset(void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+    QEMUTimer *reset_timer = vdev->pci_aer_reset_blocked_timer;
+
+    vdev->pci_aer_reset_blocked_timer = NULL;
+    timer_free(reset_timer);
+
+    if (vfio_aer_resume_match_error_signaled(vdev)) {
+        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+        vfio_aer_error_clear_all_signaled(vdev);
+    } else {
+        qdev_unplug(&vdev->pdev.qdev, NULL);
+    }
+}
+
 static void vfio_pci_reset(DeviceState *dev)
 {
     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
@@ -3114,7 +3321,21 @@ static void vfio_pci_reset(DeviceState *dev)
         if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
              PCI_BRIDGE_CTL_BUS_RESET)) {
             if (pci_get_function_0(pdev) == pdev) {
-                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+                if (vfio_aer_error_is_not_signaled(vdev)) {
+                    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+                } else {
+                    if (vfio_aer_resume_match_error_signaled(vdev)) {
+                        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+                        vfio_aer_error_clear_all_signaled(vdev);
+                    } else {
+                        vdev->pci_aer_reset_blocked_timer = timer_new_ms(
+                            QEMU_CLOCK_VIRTUAL,
+                            vfio_pci_delayed_reset, vdev);
+                        timer_mod(vdev->pci_aer_reset_blocked_timer,
+                            qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL)
+                            + VFIO_RESET_TIMEOUT);
+                    }
+                }
             }
             return;
         }
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 9fb0206..be3b883 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -119,6 +119,7 @@ typedef struct VFIOPCIDevice {
     VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
     PCIHostDeviceAddress host;
     EventNotifier err_notifier;
+    EventNotifier resume_notifier;
     EventNotifier req_notifier;
     int (*resetfn)(struct VFIOPCIDevice *);
     uint32_t vendor_id;
@@ -144,6 +145,10 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_msi;
     bool no_kvm_msix;
     bool single_depend_dev;
+    bool pci_aer_has_resume;
+    DECLARE_BITMAP(pci_aer_error_signaled_bitmap, PCI_FUNC_MAX);
+    DECLARE_BITMAP(pci_aer_resume_signaled_bitmap, PCI_FUNC_MAX);
+    QEMUTimer *pci_aer_reset_blocked_timer;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 759b850..01dfd5d 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -433,6 +433,7 @@ enum {
 	VFIO_PCI_MSIX_IRQ_INDEX,
 	VFIO_PCI_ERR_IRQ_INDEX,
 	VFIO_PCI_REQ_IRQ_INDEX,
+        VFIO_PCI_RESUME_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-27  2:12 [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume Zhou Jie
@ 2016-05-27 16:06 ` Alex Williamson
  2016-06-12  2:38   ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-05-27 16:06 UTC (permalink / raw)
  To: Zhou Jie; +Cc: qemu-devel, fan.chen, izumi.taku, caoj.fnst, mst, Chen Fan

On Fri, 27 May 2016 10:12:11 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> For supporting aer recovery, host and guest would run the same aer
> recovery code, that would do the secondary bus reset if the error
> is fatal, the aer recovery process:
>   1. error_detected
>   2. reset_link (if fatal)
>   3. slot_reset/mmio_enabled
>   4. resume
> 
> It indicates that host will do secondary bus reset to reset
> the physical devices under bus in step 2, that would cause
> devices in D3 status in a short time. But in qemu, we register
> an error detected handler, that would be invoked as host broadcasts
> the error-detected event in step 1, in order to avoid guest do
> reset_link when host do reset_link simultaneously. it may cause
> fatal error. we introduce a resmue notifier to assure host reset
> completely.
> In qemu, the aer recovery process:
>   1. Detect support for resume notification
>      If host vfio driver does not support for resume notification,
>      directly fail to boot up VM as with aer enabled.
>   2. Immediately notify the VM on error detected.
>   3. Delay the guest directed bus reset.
>   4. Wait for resume notification.
>      If we don't get the resume notification from the host after
>      some timeout, we would abort the guest directed bus reset
>      altogether and unplug of the device to prevent it from further
>      interacting with the VM.
>   5. After get the resume notification reset bus.


It seems like we have a number of questions open in the thread with MST
from the previous version, particularly whether we should actually drop
the resume notifier and block the reset in the kernel.  The concern
being that it's not very well specified what we can and cannot do
between the error interrupt and the resume interrupt.  We'd probably
need some other indicate of whether the host has this capability,
perhaps a flag in vfio_device_info.  Appreciate your opinions there.
Thanks,

Alex

 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
> ---
> v7->v8
>  *Add a comment for why VFIO_RESET_TIMEOUT value is 1000.
>  *change vfio_resume_cap to pci_aer_has_resume.
>  *change vfio_resume_notifier_handler to vfio_aer_resume_notifier_handler.
>  *change reset_timer to pci_aer_reset_blocked_timer.
>  *Remove error_report for not supporting resume notification in
>   vfio_populate_device function.
>  *All error and resume tracking is done with atomic bitmap on function 0.
>  *Remove stalling any access to the device until resume is signaled.
>   Because guest OS need read configure space to know the reason of error.
>   And it should up to guest OS to decide to stop access BAR region.
>  *Still use timer to delay reset.
>   Because vfio_aer_resume_notifier_handler cann't be invoked when
>   vfio_pci_reset is blocked.
> 
>  hw/vfio/pci.c              | 223 ++++++++++++++++++++++++++++++++++++++++++++-
>  hw/vfio/pci.h              |   5 +
>  linux-headers/linux/vfio.h |   1 +
>  3 files changed, 228 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 6877a3d..77d86d8 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -35,6 +35,12 @@
>  #include "trace.h"
>  
>  #define MSIX_CAP_LENGTH 12
> +/*
> + * Timeout for waiting resume notification, it is 1 second.
> + * The resume notificaton will be sent after host aer error recovery.
> + * For hardware bus reset 1 second will be enough.
> + */
> +#define VFIO_RESET_TIMEOUT 1000
>  
>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> @@ -1937,6 +1943,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
>      VFIOGroup *group;
>      int ret, i, devfn, range_limit;
>  
> +    if (!vdev->pci_aer_has_resume) {
> +        error_setg(errp, "vfio: Cannot enable AER for device %s,"
> +                   " host vfio driver does not support for"
> +                   " resume notification",
> +                   vdev->vbasedev.name);
> +        return;
> +    }
> +
>      ret = vfio_get_hot_reset_info(vdev, &info);
>      if (ret) {
>          error_setg(errp, "vfio: Cannot enable AER for device %s,"
> @@ -2594,6 +2608,17 @@ static int vfio_populate_device(VFIOPCIDevice *vdev)
>                       vbasedev->name);
>      }
>  
> +    irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> +    if (ret) {
> +        /* This can fail for an old kernel or legacy PCI dev */
> +        trace_vfio_populate_device_get_irq_info_failure();
> +        ret = 0;
> +    } else if (irq_info.count == 1) {
> +        vdev->pci_aer_has_resume = true;
> +    }
> +
>  error:
>      return ret;
>  }
> @@ -2606,6 +2631,72 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>      vfio_put_base_device(&vdev->vbasedev);
>  }
>  
> +static void vfio_aer_error_set_signaled(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIDevice *dev_0 = pci_get_function_0(dev);
> +    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +    unsigned int func = PCI_FUNC(dev->devfn);
> +
> +    bitmap_set_atomic(vdev_0->pci_aer_error_signaled_bitmap, func, 1);
> +}
> +
> +static void vfio_aer_resume_set_signaled(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIDevice *dev_0 = pci_get_function_0(dev);
> +    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +    unsigned int func = PCI_FUNC(dev->devfn);
> +
> +    bitmap_set_atomic(vdev_0->pci_aer_resume_signaled_bitmap, func, 1);
> +}
> +
> +static void vfio_aer_error_clear_all_signaled(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIDevice *dev_0 = pci_get_function_0(dev);
> +    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +
> +    bitmap_test_and_clear_atomic(vdev_0->pci_aer_error_signaled_bitmap,
> +                                 0, PCI_FUNC_MAX);
> +}
> +
> +static void vfio_aer_resume_clear_all_signaled(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIDevice *dev_0 = pci_get_function_0(dev);
> +    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +
> +    bitmap_test_and_clear_atomic(vdev_0->pci_aer_resume_signaled_bitmap,
> +                                 0, PCI_FUNC_MAX);
> +}
> +
> +static int vfio_aer_error_is_not_signaled(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIDevice *dev_0 = pci_get_function_0(dev);
> +    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +
> +    return slow_bitmap_empty(vdev_0->pci_aer_error_signaled_bitmap,
> +                             PCI_FUNC_MAX);
> +}
> +
> +static int vfio_aer_resume_match_error_signaled(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIDevice *dev_0 = pci_get_function_0(dev);
> +    VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +    DECLARE_BITMAP(resume_bitmap, PCI_FUNC_MAX);
> +
> +    slow_bitmap_and(resume_bitmap,
> +                    vdev_0->pci_aer_error_signaled_bitmap,
> +                    vdev_0->pci_aer_resume_signaled_bitmap,
> +                    PCI_FUNC_MAX);
> +    return slow_bitmap_equal(resume_bitmap,
> +                             vdev_0->pci_aer_error_signaled_bitmap,
> +                             PCI_FUNC_MAX);
> +}
> +
>  static void vfio_err_notifier_handler(void *opaque)
>  {
>      VFIOPCIDevice *vdev = opaque;
> @@ -2661,6 +2752,12 @@ static void vfio_err_notifier_handler(void *opaque)
>          msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>                                   PCI_ERR_ROOT_CMD_NONFATAL_EN;
>  
> +        if (isfatal) {
> +            if (vfio_aer_error_is_not_signaled(vdev)) {
> +                vfio_aer_resume_clear_all_signaled(vdev);
> +            }
> +            vfio_aer_error_set_signaled(vdev);
> +        }
>          pcie_aer_msg(dev, &msg);
>          return;
>      }
> @@ -2756,6 +2853,98 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
>      event_notifier_cleanup(&vdev->err_notifier);
>  }
>  
> +static void vfio_aer_resume_notifier_handler(void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +
> +    if (!event_notifier_test_and_clear(&vdev->resume_notifier)) {
> +        return;
> +    }
> +
> +    vfio_aer_resume_set_signaled(vdev);
> +    if (vfio_aer_resume_match_error_signaled(vdev) &&
> +        vdev->pci_aer_reset_blocked_timer != NULL) {
> +        timer_mod(vdev->pci_aer_reset_blocked_timer,
> +                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL));
> +    }
> +}
> +
> +static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev)
> +{
> +    int ret;
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +
> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER) ||
> +        !vdev->pci_aer_has_resume) {
> +        return;
> +    }
> +
> +    if (event_notifier_init(&vdev->resume_notifier, 0)) {
> +        error_report("vfio: Unable to init event notifier for"
> +                     " resume notification");
> +        vdev->pci_aer_has_resume = false;
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = event_notifier_get_fd(&vdev->resume_notifier);
> +    qemu_set_fd_handler(*pfd, vfio_aer_resume_notifier_handler, NULL, vdev);
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to set up resume notification");
> +        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
> +        event_notifier_cleanup(&vdev->resume_notifier);
> +        vdev->pci_aer_has_resume = false;
> +    }
> +    g_free(irq_set);
> +}
> +
> +static void vfio_unregister_aer_resume_notifier(VFIOPCIDevice *vdev)
> +{
> +    int argsz;
> +    struct vfio_irq_set *irq_set;
> +    int32_t *pfd;
> +    int ret;
> +
> +    if (!vdev->pci_aer_has_resume) {
> +        return;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +    *pfd = -1;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    if (ret) {
> +        error_report("vfio: Failed to de-assign error fd: %m");
> +    }
> +    g_free(irq_set);
> +    qemu_set_fd_handler(event_notifier_get_fd(&vdev->resume_notifier),
> +                        NULL, NULL, vdev);
> +    event_notifier_cleanup(&vdev->resume_notifier);
> +}
> +
>  static void vfio_req_notifier_handler(void *opaque)
>  {
>      VFIOPCIDevice *vdev = opaque;
> @@ -3061,6 +3250,7 @@ static int vfio_initfn(PCIDevice *pdev)
>      }
>  
>      vfio_register_err_notifier(vdev);
> +    vfio_register_aer_resume_notifier(vdev);
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
>  
> @@ -3091,6 +3281,7 @@ static void vfio_exitfn(PCIDevice *pdev)
>      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>  
>      vfio_unregister_req_notifier(vdev);
> +    vfio_unregister_aer_resume_notifier(vdev);
>      vfio_unregister_err_notifier(vdev);
>      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
>      vfio_disable_interrupts(vdev);
> @@ -3101,6 +3292,22 @@ static void vfio_exitfn(PCIDevice *pdev)
>      vfio_bars_exit(vdev);
>  }
>  
> +static void vfio_pci_delayed_reset(void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    QEMUTimer *reset_timer = vdev->pci_aer_reset_blocked_timer;
> +
> +    vdev->pci_aer_reset_blocked_timer = NULL;
> +    timer_free(reset_timer);
> +
> +    if (vfio_aer_resume_match_error_signaled(vdev)) {
> +        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +        vfio_aer_error_clear_all_signaled(vdev);
> +    } else {
> +        qdev_unplug(&vdev->pdev.qdev, NULL);
> +    }
> +}
> +
>  static void vfio_pci_reset(DeviceState *dev)
>  {
>      PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> @@ -3114,7 +3321,21 @@ static void vfio_pci_reset(DeviceState *dev)
>          if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
>               PCI_BRIDGE_CTL_BUS_RESET)) {
>              if (pci_get_function_0(pdev) == pdev) {
> -                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                if (vfio_aer_error_is_not_signaled(vdev)) {
> +                    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                } else {
> +                    if (vfio_aer_resume_match_error_signaled(vdev)) {
> +                        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                        vfio_aer_error_clear_all_signaled(vdev);
> +                    } else {
> +                        vdev->pci_aer_reset_blocked_timer = timer_new_ms(
> +                            QEMU_CLOCK_VIRTUAL,
> +                            vfio_pci_delayed_reset, vdev);
> +                        timer_mod(vdev->pci_aer_reset_blocked_timer,
> +                            qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL)
> +                            + VFIO_RESET_TIMEOUT);
> +                    }
> +                }
>              }
>              return;
>          }
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 9fb0206..be3b883 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -119,6 +119,7 @@ typedef struct VFIOPCIDevice {
>      VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
>      PCIHostDeviceAddress host;
>      EventNotifier err_notifier;
> +    EventNotifier resume_notifier;
>      EventNotifier req_notifier;
>      int (*resetfn)(struct VFIOPCIDevice *);
>      uint32_t vendor_id;
> @@ -144,6 +145,10 @@ typedef struct VFIOPCIDevice {
>      bool no_kvm_msi;
>      bool no_kvm_msix;
>      bool single_depend_dev;
> +    bool pci_aer_has_resume;
> +    DECLARE_BITMAP(pci_aer_error_signaled_bitmap, PCI_FUNC_MAX);
> +    DECLARE_BITMAP(pci_aer_resume_signaled_bitmap, PCI_FUNC_MAX);
> +    QEMUTimer *pci_aer_reset_blocked_timer;
>  } VFIOPCIDevice;
>  
>  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 759b850..01dfd5d 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -433,6 +433,7 @@ enum {
>  	VFIO_PCI_MSIX_IRQ_INDEX,
>  	VFIO_PCI_ERR_IRQ_INDEX,
>  	VFIO_PCI_REQ_IRQ_INDEX,
> +        VFIO_PCI_RESUME_IRQ_INDEX,
>  	VFIO_PCI_NUM_IRQS
>  };
>  

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-05-27 16:06 ` Alex Williamson
@ 2016-06-12  2:38   ` Zhou Jie
  2016-06-20  7:41     ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-12  2:38 UTC (permalink / raw)
  To: Alex Williamson
  Cc: qemu-devel, fan.chen, izumi.taku, caoj.fnst, mst, Chen Fan

Hi, Alex

> It seems like we have a number of questions open in the thread with MST
> from the previous version, particularly whether we should actually drop
> the resume notifier and block the reset in the kernel.  The concern
> being that it's not very well specified what we can and cannot do
> between the error interrupt and the resume interrupt.  We'd probably
> need some other indicate of whether the host has this capability,
> perhaps a flag in vfio_device_info.  Appreciate your opinions there.
> Thanks,
>
> Alex

Have you had a conclusion with MST?

How about the process like following?
1. Detect support for VFIO reset blocking.
    Maybe use vfio_device_info.
2. Immediately notify the VM on error detected.
3. Invoke ioctl(VFIO_DEVICE_PCI_HOT_RESET).
    If kernel is doing reset, then block in ioctl.

    Can I ignore the resume notifier?
    else
    Shall ioctl block until after receiving resume notification?

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-12  2:38   ` Zhou Jie
@ 2016-06-20  7:41     ` Zhou Jie
  2016-06-20 16:32       ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-20  7:41 UTC (permalink / raw)
  To: Alex Williamson
  Cc: fan.chen, mst, qemu-devel, caoj.fnst, Chen Fan, izumi.taku

ping

On 2016/6/12 10:38, Zhou Jie wrote:
> Hi, Alex
>
>> It seems like we have a number of questions open in the thread with MST
>> from the previous version, particularly whether we should actually drop
>> the resume notifier and block the reset in the kernel.  The concern
>> being that it's not very well specified what we can and cannot do
>> between the error interrupt and the resume interrupt.  We'd probably
>> need some other indicate of whether the host has this capability,
>> perhaps a flag in vfio_device_info.  Appreciate your opinions there.
>> Thanks,
>>
>> Alex
>
> Have you had a conclusion with MST?
>
> How about the process like following?
> 1. Detect support for VFIO reset blocking.
>    Maybe use vfio_device_info.
> 2. Immediately notify the VM on error detected.
> 3. Invoke ioctl(VFIO_DEVICE_PCI_HOT_RESET).
>    If kernel is doing reset, then block in ioctl.
>
>    Can I ignore the resume notifier?
>    else
>    Shall ioctl block until after receiving resume notification?
>
> Sincerely
> Zhou Jie
>
>
>
>

-- 
------------------------------------------------
周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2011@cn.fujitsu.com
------------------------------------------------

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-20  7:41     ` Zhou Jie
@ 2016-06-20 16:32       ` Alex Williamson
  2016-06-21  2:16         ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-20 16:32 UTC (permalink / raw)
  To: Zhou Jie; +Cc: fan.chen, mst, qemu-devel, caoj.fnst, Chen Fan, izumi.taku

On Mon, 20 Jun 2016 15:41:03 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> ping
> 
> On 2016/6/12 10:38, Zhou Jie wrote:
> > Hi, Alex
> >  
> >> It seems like we have a number of questions open in the thread with MST
> >> from the previous version, particularly whether we should actually drop
> >> the resume notifier and block the reset in the kernel.  The concern
> >> being that it's not very well specified what we can and cannot do
> >> between the error interrupt and the resume interrupt.  We'd probably
> >> need some other indicate of whether the host has this capability,
> >> perhaps a flag in vfio_device_info.  Appreciate your opinions there.
> >> Thanks,
> >>
> >> Alex  
> >
> > Have you had a conclusion with MST?
> >
> > How about the process like following?
> > 1. Detect support for VFIO reset blocking.
> >    Maybe use vfio_device_info.
> > 2. Immediately notify the VM on error detected.
> > 3. Invoke ioctl(VFIO_DEVICE_PCI_HOT_RESET).
> >    If kernel is doing reset, then block in ioctl.
> >
> >    Can I ignore the resume notifier?
> >    else
> >    Shall ioctl block until after receiving resume notification?

I was really hoping to hear your opinion, or at least some further
discussion of pros and cons rather than simply parroting back my idea.
My current thinking is that a resume notifier to userspace is poorly
defined, it's not clear what the user can and cannot do between an
error notification and the resume notification.  One approach to solve
that might be that the kernel internally handles the resume
notifications.  Maybe that means blocking the ioctl (interruptible
timeout) until the internal resume occurs, or maybe that means
returning -EAGAIN.  Probably implementations of each need to be worked
through to determine which is better.  We don't want to add complexity
to the kernel simply to make things easier for userspace, but we also
don't want a poorly specified interface that is difficult for
userspace to use correctly.  Thanks,

Alex 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-20 16:32       ` Alex Williamson
@ 2016-06-21  2:16         ` Zhou Jie
  2016-06-21  3:13           ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-21  2:16 UTC (permalink / raw)
  To: Alex Williamson
  Cc: fan.chen, mst, qemu-devel, caoj.fnst, Chen Fan, izumi.taku

Hi, Alex

> I was really hoping to hear your opinion, or at least some further
> discussion of pros and cons rather than simply parroting back my idea.
I understand.

> My current thinking is that a resume notifier to userspace is poorly
> defined, it's not clear what the user can and cannot do between an
> error notification and the resume notification.
Yes, do nothing between that time is better.

> One approach to solve
> that might be that the kernel internally handles the resume
> notifications.  Maybe that means blocking the ioctl (interruptible
> timeout) until the internal resume occurs, or maybe that means
> returning -EAGAIN.
I don't think it is a good idea.
The kernel give the error and resume notifications, it's enough.
It's up to user to how to use them.

> Probably implementations of each need to be worked
> through to determine which is better.  We don't want to add complexity
> to the kernel simply to make things easier for userspace, but we also
> don't want a poorly specified interface that is difficult for
> userspace to use correctly.  Thanks,
In qemu, the aer recovery process:
   1. Detect support for resume notification
      If host vfio driver does not support for resume notification,
      directly fail to boot up VM as with aer enabled.
   2. Immediately notify the VM on error detected.
   3. Disable the device.
      Unmap the config space and bar region.
   4. Delay the guest directed bus reset.
   5. Wait for resume notification.
      If we don't get the resume notification from the host after
      some timeout, we would abort the guest directed bus reset
      altogether and unplug of the device to prevent it from further
      interacting with the VM.
   6. After get the resume notification reset bus and enable the device.

I think we only make sure the disabled device
  will not interact with the VM.

Sincerely
Zhou jie

>
> Alex
>
>
> .
>

-- 
------------------------------------------------
周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2011@cn.fujitsu.com
------------------------------------------------

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-21  2:16         ` Zhou Jie
@ 2016-06-21  3:13           ` Alex Williamson
  2016-06-21 12:41             ` Chen Fan
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-21  3:13 UTC (permalink / raw)
  To: Zhou Jie; +Cc: fan.chen, mst, qemu-devel, caoj.fnst, Chen Fan, izumi.taku

On Tue, 21 Jun 2016 10:16:25 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi, Alex
> 
> > I was really hoping to hear your opinion, or at least some further
> > discussion of pros and cons rather than simply parroting back my idea.  
> I understand.
> 
> > My current thinking is that a resume notifier to userspace is poorly
> > defined, it's not clear what the user can and cannot do between an
> > error notification and the resume notification.  
> Yes, do nothing between that time is better.
> 
> > One approach to solve
> > that might be that the kernel internally handles the resume
> > notifications.  Maybe that means blocking the ioctl (interruptible
> > timeout) until the internal resume occurs, or maybe that means
> > returning -EAGAIN.  
> I don't think it is a good idea.
> The kernel give the error and resume notifications, it's enough.
> It's up to user to how to use them.

Well that's exactly why it's poorly defined.  What does a resume
notification signal a user that they're allowed to do?  What can they
not do between error and resume notification.  Clearly you had issues
attempting to perform a reset during this time period since it was
racing with the kernel reset, so is a user allowed to do a hot reset
between error and resume?  Where do we define it?  Do we prevent it if
they try?  Why?  What about the reset ioctl?  How and why is that
different from a hot reset?  (hint, they can be the same)  Do we define
that resets are not allowed between error and resume, but other
operations like read/write or interrupt setup ioctls are allowed? Why?
Clearly we can't do anything that manipulates the device between error
and resume since it might be lost or ineffective, but where do we
define it and do we need to actively enforce those rules?  I'm arguing
that it's poorly defined, so "it's up to the user how to use them"
doesn't not give me any additional confidence in that approach.  We
can't trust the user to be polite, we can't even trust the user not to
be malicious.
 
> > Probably implementations of each need to be worked
> > through to determine which is better.  We don't want to add complexity
> > to the kernel simply to make things easier for userspace, but we also
> > don't want a poorly specified interface that is difficult for
> > userspace to use correctly.  Thanks,  
> In qemu, the aer recovery process:
>    1. Detect support for resume notification
>       If host vfio driver does not support for resume notification,
>       directly fail to boot up VM as with aer enabled.
>    2. Immediately notify the VM on error detected.
>    3. Disable the device.
>       Unmap the config space and bar region.
>    4. Delay the guest directed bus reset.
>    5. Wait for resume notification.
>       If we don't get the resume notification from the host after
>       some timeout, we would abort the guest directed bus reset
>       altogether and unplug of the device to prevent it from further
>       interacting with the VM.
>    6. After get the resume notification reset bus and enable the device.
> 
> I think we only make sure the disabled device
>   will not interact with the VM.

Should interrupt irqfds then also be disabled so they trap into QEMU
and we can prevent that interaction?  Also, QEMU can be polite, but as
above, QEMU is just one user, the API is open to anyone and QEMU might
be exploited to not be so polite.  So if there are points where the
user can interfere with the kernel or exploit the knowledge that the
device is going through a reset, the kernel can't rely on a friendly
user.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-21  3:13           ` Alex Williamson
@ 2016-06-21 12:41             ` Chen Fan
  2016-06-21 14:44               ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Chen Fan @ 2016-06-21 12:41 UTC (permalink / raw)
  To: Alex Williamson, Zhou Jie
  Cc: mst, qemu-devel, caoj.fnst, Chen Fan, izumi.taku

On 2016年06月21日 11:13, Alex Williamson wrote:
> On Tue, 21 Jun 2016 10:16:25 +0800
> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
>
>> Hi, Alex
>>
>>> I was really hoping to hear your opinion, or at least some further
>>> discussion of pros and cons rather than simply parroting back my idea.
>> I understand.
>>
>>> My current thinking is that a resume notifier to userspace is poorly
>>> defined, it's not clear what the user can and cannot do between an
>>> error notification and the resume notification.
>> Yes, do nothing between that time is better.
>>
>>> One approach to solve
>>> that might be that the kernel internally handles the resume
>>> notifications.  Maybe that means blocking the ioctl (interruptible
>>> timeout) until the internal resume occurs, or maybe that means
>>> returning -EAGAIN.
>> I don't think it is a good idea.
>> The kernel give the error and resume notifications, it's enough.
>> It's up to user to how to use them.
> Well that's exactly why it's poorly defined.  What does a resume
> notification signal a user that they're allowed to do?  What can they
> not do between error and resume notification.  Clearly you had issues
> attempting to perform a reset during this time period since it was
> racing with the kernel reset, so is a user allowed to do a hot reset
> between error and resume?  Where do we define it?  Do we prevent it if
> they try?  Why?  What about the reset ioctl?  How and why is that
> different from a hot reset?  (hint, they can be the same)  Do we define
> that resets are not allowed between error and resume, but other
> operations like read/write or interrupt setup ioctls are allowed? Why?
> Clearly we can't do anything that manipulates the device between error
> and resume since it might be lost or ineffective, but where do we
> define it and do we need to actively enforce those rules?  I'm arguing
> that it's poorly defined, so "it's up to the user how to use them"
> doesn't not give me any additional confidence in that approach.  We
> can't trust the user to be polite, we can't even trust the user not to
> be malicious.
Hi Alex,
      on kernel side, I think if we don't trust the user behaviors, we 
should
  disable the access of vfio-pci interface once vfio-pci driver got the 
error_detected,
  we should disable all access to vfio fd regardless whether the vfio-pci
  was assigned to a VM, we also can return a EAGAIN error if user try
  to access it during the reset period until the host reset finished.
      on qemu side, when we got a error_detect, we pass through the
aer error to guest directly, ignore all access to vfio-pci during this 
time,
when qemu need to do a hot reset, we can retry to get the info from
the get info ioctl until we got the info that vfio-pci has been reset 
finished,
then do the hot_reset ioctl if need, the kernel should ensure the ioctl 
become
//// accessible after host reset completed.

Thanks,
Chen


>   
>>> Probably implementations of each need to be worked
>>> through to determine which is better.  We don't want to add complexity
>>> to the kernel simply to make things easier for userspace, but we also
>>> don't want a poorly specified interface that is difficult for
>>> userspace to use correctly.  Thanks,
>> In qemu, the aer recovery process:
>>     1. Detect support for resume notification
>>        If host vfio driver does not support for resume notification,
>>        directly fail to boot up VM as with aer enabled.
>>     2. Immediately notify the VM on error detected.
>>     3. Disable the device.
>>        Unmap the config space and bar region.
>>     4. Delay the guest directed bus reset.
>>     5. Wait for resume notification.
>>        If we don't get the resume notification from the host after
>>        some timeout, we would abort the guest directed bus reset
>>        altogether and unplug of the device to prevent it from further
>>        interacting with the VM.
>>     6. After get the resume notification reset bus and enable the device.
>>
>> I think we only make sure the disabled device
>>    will not interact with the VM.
> Should interrupt irqfds then also be disabled so they trap into QEMU
> and we can prevent that interaction?  Also, QEMU can be polite, but as
> above, QEMU is just one user, the API is open to anyone and QEMU might
> be exploited to not be so polite.  So if there are points where the
> user can interfere with the kernel or exploit the knowledge that the
> device is going through a reset, the kernel can't rely on a friendly
> user.  Thanks,
>
> Alex
>

-- 
Sincerely,
Chen Fan

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-21 12:41             ` Chen Fan
@ 2016-06-21 14:44               ` Alex Williamson
  2016-06-22  3:28                 ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-21 14:44 UTC (permalink / raw)
  To: Chen Fan; +Cc: Zhou Jie, Chen Fan, izumi.taku, caoj.fnst, qemu-devel, mst

On Tue, 21 Jun 2016 20:41:32 +0800
Chen Fan <fan.chen@easystack.cn> wrote:

> On 2016年06月21日 11:13, Alex Williamson wrote:
> > On Tue, 21 Jun 2016 10:16:25 +0800
> > Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
> >  
> >> Hi, Alex
> >>  
> >>> I was really hoping to hear your opinion, or at least some further
> >>> discussion of pros and cons rather than simply parroting back my idea.  
> >> I understand.
> >>  
> >>> My current thinking is that a resume notifier to userspace is poorly
> >>> defined, it's not clear what the user can and cannot do between an
> >>> error notification and the resume notification.  
> >> Yes, do nothing between that time is better.
> >>  
> >>> One approach to solve
> >>> that might be that the kernel internally handles the resume
> >>> notifications.  Maybe that means blocking the ioctl (interruptible
> >>> timeout) until the internal resume occurs, or maybe that means
> >>> returning -EAGAIN.  
> >> I don't think it is a good idea.
> >> The kernel give the error and resume notifications, it's enough.
> >> It's up to user to how to use them.  
> > Well that's exactly why it's poorly defined.  What does a resume
> > notification signal a user that they're allowed to do?  What can they
> > not do between error and resume notification.  Clearly you had issues
> > attempting to perform a reset during this time period since it was
> > racing with the kernel reset, so is a user allowed to do a hot reset
> > between error and resume?  Where do we define it?  Do we prevent it if
> > they try?  Why?  What about the reset ioctl?  How and why is that
> > different from a hot reset?  (hint, they can be the same)  Do we define
> > that resets are not allowed between error and resume, but other
> > operations like read/write or interrupt setup ioctls are allowed? Why?
> > Clearly we can't do anything that manipulates the device between error
> > and resume since it might be lost or ineffective, but where do we
> > define it and do we need to actively enforce those rules?  I'm arguing
> > that it's poorly defined, so "it's up to the user how to use them"
> > doesn't not give me any additional confidence in that approach.  We
> > can't trust the user to be polite, we can't even trust the user not to
> > be malicious.  
> Hi Alex,
>       on kernel side, I think if we don't trust the user behaviors, we 
> should
>   disable the access of vfio-pci interface once vfio-pci driver got the 
> error_detected,
>   we should disable all access to vfio fd regardless whether the vfio-pci
>   was assigned to a VM, we also can return a EAGAIN error if user try
>   to access it during the reset period until the host reset finished.
>       on qemu side, when we got a error_detect, we pass through the
> aer error to guest directly, ignore all access to vfio-pci during this 
> time,
> when qemu need to do a hot reset, we can retry to get the info from
> the get info ioctl until we got the info that vfio-pci has been reset 
> finished,
> then do the hot_reset ioctl if need, the kernel should ensure the ioctl 
> become
> //// accessible after host reset completed.
>

That sounds pretty thorough, the sticky point there is always disabling
the device mmaps w/o a revoke interface.  Do we invalidate the pfn
range and setup a fault handler that blocks on access?  I don't think
we have a whole lot of options, either block or sigbus, but having such
a mechanism might allow us to easily put a device in a "dead" state
where the user can't touch it, which could be useful for other purposes
too.  QEMU would also need to timeout after some number of reset
attempts and assume the device is not coming back.  Plus we'd need a
device flag to indicate this behavior.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-21 14:44               ` Alex Williamson
@ 2016-06-22  3:28                 ` Zhou Jie
  2016-06-22  3:56                   ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-22  3:28 UTC (permalink / raw)
  To: Alex Williamson, Chen Fan
  Cc: Chen Fan, izumi.taku, caoj.fnst, qemu-devel, mst

Hi Alex,

>> Hi Alex,
>>       on kernel side, I think if we don't trust the user behaviors, we
>> should
>>   disable the access of vfio-pci interface once vfio-pci driver got the
>> error_detected,
>>   we should disable all access to vfio fd regardless whether the vfio-pci
>>   was assigned to a VM, we also can return a EAGAIN error if user try
>>   to access it during the reset period until the host reset finished.
>>       on qemu side, when we got a error_detect, we pass through the
>> aer error to guest directly, ignore all access to vfio-pci during this
>> time,
>> when qemu need to do a hot reset, we can retry to get the info from
>> the get info ioctl until we got the info that vfio-pci has been reset
>> finished,
>> then do the hot_reset ioctl if need, the kernel should ensure the ioctl
>> become
>> //// accessible after host reset completed.
>>
>
> That sounds pretty thorough, the sticky point there is always disabling
> the device mmaps w/o a revoke interface.  Do we invalidate the pfn
> range and setup a fault handler that blocks on access?  I don't think
> we have a whole lot of options, either block or sigbus, but having such
> a mechanism might allow us to easily put a device in a "dead" state
> where the user can't touch it, which could be useful for other purposes
> too.  QEMU would also need to timeout after some number of reset
> attempts and assume the device is not coming back.  Plus we'd need a
> device flag to indicate this behavior.  Thanks,
>
> Alex

In vfio I have some questions.
1. How can I disable the access by mmap?
    We can disable all access to vfio fd by returning a EAGAIN error
    if user try to access it during the reset period until the host
    reset finished.
    But about the bar region which is maped by vfio_pci_mmap.
    How can I disable it in vfio driver?
    Even there is a way to do it,
    how about the complexity to recovery the mmap?

In qemu I have following proposals.
1. Setup a fault handler that blocks on access of bar region.
    So the data transmission will be blocked.
2. Disable vfio_pci_write_config, but keep vfio_pci_read_config
    enabled.
    The VM can get the error information by reading configure space.
    But operation of writing the configure space will be ignored.
3. Get VFIO device infomation instend of receiving resume notification.
    When I tested the non-fatal error.
    I found that sometimes the qemu receive resume notification earlier
    than error notification.
    The notification receiving time between different eventfd is not
    in the order of sending time.

Sincerely
Zhoujie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-22  3:28                 ` Zhou Jie
@ 2016-06-22  3:56                   ` Alex Williamson
  2016-06-22  5:45                     ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-22  3:56 UTC (permalink / raw)
  To: Zhou Jie; +Cc: Chen Fan, Chen Fan, izumi.taku, caoj.fnst, qemu-devel, mst

On Wed, 22 Jun 2016 11:28:50 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> >> Hi Alex,
> >>       on kernel side, I think if we don't trust the user behaviors, we
> >> should
> >>   disable the access of vfio-pci interface once vfio-pci driver got the
> >> error_detected,
> >>   we should disable all access to vfio fd regardless whether the vfio-pci
> >>   was assigned to a VM, we also can return a EAGAIN error if user try
> >>   to access it during the reset period until the host reset finished.
> >>       on qemu side, when we got a error_detect, we pass through the
> >> aer error to guest directly, ignore all access to vfio-pci during this
> >> time,
> >> when qemu need to do a hot reset, we can retry to get the info from
> >> the get info ioctl until we got the info that vfio-pci has been reset
> >> finished,
> >> then do the hot_reset ioctl if need, the kernel should ensure the ioctl
> >> become
> >> //// accessible after host reset completed.
> >>  
> >
> > That sounds pretty thorough, the sticky point there is always disabling
> > the device mmaps w/o a revoke interface.  Do we invalidate the pfn
> > range and setup a fault handler that blocks on access?  I don't think
> > we have a whole lot of options, either block or sigbus, but having such
> > a mechanism might allow us to easily put a device in a "dead" state
> > where the user can't touch it, which could be useful for other purposes
> > too.  QEMU would also need to timeout after some number of reset
> > attempts and assume the device is not coming back.  Plus we'd need a
> > device flag to indicate this behavior.  Thanks,
> >
> > Alex  
> 
> In vfio I have some questions.
> 1. How can I disable the access by mmap?
>     We can disable all access to vfio fd by returning a EAGAIN error
>     if user try to access it during the reset period until the host
>     reset finished.
>     But about the bar region which is maped by vfio_pci_mmap.
>     How can I disable it in vfio driver?
>     Even there is a way to do it,
>     how about the complexity to recovery the mmap?

That's exactly the "sticky point" I refer to above, you'd need to
solve that problem.  MST would probably still argue that we don't need
to disable all those interfaces, a userspace driver can already do
things like disable mmio space and then attempt to read from the mmio
space of the device.  So maybe the problem can be simplified to
non-device specific interfaces, like config space access plus ioctls.  I
think we know we shouldn't be doing any of those between error and
resume notification.

> In qemu I have following proposals.
> 1. Setup a fault handler that blocks on access of bar region.
>     So the data transmission will be blocked.
> 2. Disable vfio_pci_write_config, but keep vfio_pci_read_config
>     enabled.
>     The VM can get the error information by reading configure space.
>     But operation of writing the configure space will be ignored.
> 3. Get VFIO device infomation instend of receiving resume notification.
>     When I tested the non-fatal error.
>     I found that sometimes the qemu receive resume notification earlier
>     than error notification.
>     The notification receiving time between different eventfd is not
>     in the order of sending time.

I can imagine if events occur close enough together, QEMU will have
both events queued and which one we get a callback for first might
depend on the order they get tested.  APerhaps another reason not to
have a resume notifier.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-22  3:56                   ` Alex Williamson
@ 2016-06-22  5:45                     ` Zhou Jie
  2016-06-22  7:49                       ` Zhou Jie
  2016-06-22 15:25                       ` Alex Williamson
  0 siblings, 2 replies; 38+ messages in thread
From: Zhou Jie @ 2016-06-22  5:45 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Chen Fan, Chen Fan, izumi.taku, caoj.fnst, qemu-devel, mst

Hi Alex,

>>
>> In vfio I have some questions.
>> 1. How can I disable the access by mmap?
>>     We can disable all access to vfio fd by returning a EAGAIN error
>>     if user try to access it during the reset period until the host
>>     reset finished.
>>     But about the bar region which is maped by vfio_pci_mmap.
>>     How can I disable it in vfio driver?
>>     Even there is a way to do it,
>>     how about the complexity to recovery the mmap?
>
> That's exactly the "sticky point" I refer to above, you'd need to
> solve that problem.  MST would probably still argue that we don't need
> to disable all those interfaces, a userspace driver can already do
> things like disable mmio space and then attempt to read from the mmio
> space of the device.
You said we should not depend on user to protect the device
  be accessed during the reset period.

> So maybe the problem can be simplified to
> non-device specific interfaces, like config space access plus ioctls.
I don't understand what's your mean.

Sincerely
Zhoujie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-22  5:45                     ` Zhou Jie
@ 2016-06-22  7:49                       ` Zhou Jie
  2016-06-22 15:42                         ` Alex Williamson
  2016-06-22 15:25                       ` Alex Williamson
  1 sibling, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-22  7:49 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Chen Fan, mst, qemu-devel, caoj.fnst, Chen Fan, izumi.taku

Hi Alex,

On 2016/6/22 13:45, Zhou Jie wrote:
> Hi Alex,
>
>>>
>>> In vfio I have some questions.
>>> 1. How can I disable the access by mmap?
>>>     We can disable all access to vfio fd by returning a EAGAIN error
>>>     if user try to access it during the reset period until the host
>>>     reset finished.
>>>     But about the bar region which is maped by vfio_pci_mmap.
>>>     How can I disable it in vfio driver?
>>>     Even there is a way to do it,
>>>     how about the complexity to recovery the mmap?
>>
>> That's exactly the "sticky point" I refer to above, you'd need to
>> solve that problem.  MST would probably still argue that we don't need
>> to disable all those interfaces, a userspace driver can already do
>> things like disable mmio space and then attempt to read from the mmio
>> space of the device.
> You said we should not depend on user to protect the device
>  be accessed during the reset period.
>
>> So maybe the problem can be simplified to
>> non-device specific interfaces, like config space access plus ioctls.
> I don't understand what's your mean.

When a fatal aer error occurs the process is following.
For host
    aer driver detect aer error
-> vfio driver send aer error
-> aer driver reset bus
-> qemu report aer error
For guest
-> aer driver detect aer error
-> aer driver reset bus
-> device driver maybe disable device

I am not sure if all the device driver disable device
when a fatal aer error occurs.
Should we depend on the guest device driver to protect the device
be accessed during the reset period?

Sincerely
Zhoujie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-22  5:45                     ` Zhou Jie
  2016-06-22  7:49                       ` Zhou Jie
@ 2016-06-22 15:25                       ` Alex Williamson
  1 sibling, 0 replies; 38+ messages in thread
From: Alex Williamson @ 2016-06-22 15:25 UTC (permalink / raw)
  To: Zhou Jie; +Cc: Chen Fan, Chen Fan, izumi.taku, caoj.fnst, qemu-devel, mst

On Wed, 22 Jun 2016 13:45:10 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> >>
> >> In vfio I have some questions.
> >> 1. How can I disable the access by mmap?
> >>     We can disable all access to vfio fd by returning a EAGAIN error
> >>     if user try to access it during the reset period until the host
> >>     reset finished.
> >>     But about the bar region which is maped by vfio_pci_mmap.
> >>     How can I disable it in vfio driver?
> >>     Even there is a way to do it,
> >>     how about the complexity to recovery the mmap?  
> >
> > That's exactly the "sticky point" I refer to above, you'd need to
> > solve that problem.  MST would probably still argue that we don't need
> > to disable all those interfaces, a userspace driver can already do
> > things like disable mmio space and then attempt to read from the mmio
> > space of the device.  
> You said we should not depend on user to protect the device
>   be accessed during the reset period.
> 
> > So maybe the problem can be simplified to
> > non-device specific interfaces, like config space access plus ioctls.  
> I don't understand what's your mean.

I was trying to distinguish between state we control and state the
guest driver owns.  When the guest driver is reading and writing to
device apertures, that's opaque to us.  However when we're doing things
like manipulating interrupts or issuing a reset or even interacting
with partially virtualized config space, that's device state that we're
directly involved with.  So it seems like we have some ability, perhaps
even responsibility to make those work around the error event.  That's
the distinction I'm trying to make. Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-22  7:49                       ` Zhou Jie
@ 2016-06-22 15:42                         ` Alex Williamson
  2016-06-25  1:24                           ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-22 15:42 UTC (permalink / raw)
  To: Zhou Jie; +Cc: Chen Fan, mst, qemu-devel, caoj.fnst, izumi.taku

On Wed, 22 Jun 2016 15:49:41 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> On 2016/6/22 13:45, Zhou Jie wrote:
> > Hi Alex,
> >  
> >>>
> >>> In vfio I have some questions.
> >>> 1. How can I disable the access by mmap?
> >>>     We can disable all access to vfio fd by returning a EAGAIN error
> >>>     if user try to access it during the reset period until the host
> >>>     reset finished.
> >>>     But about the bar region which is maped by vfio_pci_mmap.
> >>>     How can I disable it in vfio driver?
> >>>     Even there is a way to do it,
> >>>     how about the complexity to recovery the mmap?  
> >>
> >> That's exactly the "sticky point" I refer to above, you'd need to
> >> solve that problem.  MST would probably still argue that we don't need
> >> to disable all those interfaces, a userspace driver can already do
> >> things like disable mmio space and then attempt to read from the mmio
> >> space of the device.  
> > You said we should not depend on user to protect the device
> >  be accessed during the reset period.
> >  
> >> So maybe the problem can be simplified to
> >> non-device specific interfaces, like config space access plus ioctls.  
> > I don't understand what's your mean.  
> 
> When a fatal aer error occurs the process is following.
> For host
>     aer driver detect aer error
> -> vfio driver send aer error
> -> aer driver reset bus
> -> qemu report aer error  
> For guest
> -> aer driver detect aer error
> -> aer driver reset bus
> -> device driver maybe disable device  
> 
> I am not sure if all the device driver disable device
> when a fatal aer error occurs.
> Should we depend on the guest device driver to protect the device
> be accessed during the reset period?

We should never depend on the guest driver to behave in a certain way,
but we need to prioritize what that actually means.  vfio in the kernel
has a responsibility first and foremost to the host kernel.  User owned
devices cannot be allowed to exploit or interfere with the host
regardless of user behavior.  The next priority is correct operation
for the user.  When the host kernel is handling the AER event between
the error and resume notifies, it doesn't have device specific drivers,
it's manipulating the device as a generic PCI device.  That makes me
think that vfio should not allow the user to interact (interfere) with
the device during that process and that such interference can be
limited to standard PCI level interactions.  That means config space,
and things that operate on config space (like interrupt ioctls and
resets).  On the QEMU side, we've sent a notification that an error
occurred, how the user and the guest respond to that is beyond the
concern of vfio in the kernel.  If the user/guest driver continues to
interact with resources on the device, that's fine, but I think vfio in
the kernel does need to prevent the user from interfering with the PCI
state of the device for that brief window when we know the host kernel
is operating on the device.  Otherwise the results are unpredictable
and therefore unsupportable.  Does that make sense?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-22 15:42                         ` Alex Williamson
@ 2016-06-25  1:24                           ` Zhou Jie
  2016-06-27 15:54                             ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-25  1:24 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Chen Fan, mst, qemu-devel, caoj.fnst, izumi.taku

Hi Alex,

> We should never depend on the guest driver to behave in a certain way,
> but we need to prioritize what that actually means.  vfio in the kernel
> has a responsibility first and foremost to the host kernel.  User owned
> devices cannot be allowed to exploit or interfere with the host
> regardless of user behavior.  The next priority is correct operation
> for the user.  When the host kernel is handling the AER event between
> the error and resume notifies, it doesn't have device specific drivers,
> it's manipulating the device as a generic PCI device.  That makes me
> think that vfio should not allow the user to interact (interfere) with
> the device during that process and that such interference can be
> limited to standard PCI level interactions.  That means config space,
> and things that operate on config space (like interrupt ioctls and
> resets).  On the QEMU side, we've sent a notification that an error
> occurred, how the user and the guest respond to that is beyond the
> concern of vfio in the kernel.  If the user/guest driver continues to
> interact with resources on the device, that's fine, but I think vfio in
> the kernel does need to prevent the user from interfering with the PCI
> state of the device for that brief window when we know the host kernel
> is operating on the device.  Otherwise the results are unpredictable
> and therefore unsupportable.  Does that make sense?  Thanks,
I understand.

I want to alter the VFIO driver like following.
During err occurs and resume:
1. Make config space read only.
    Ignore config space writing to prevent the user from
    interfering with the PCI state of the device.
    User can get the error infomation by reading the config space.
2. Disable INTx and MSI
    Write "Command Register" to disable INTx and MSI.
3. Do nothing for bar regions.
    Guest driver may access bar regions.
    It doesn't matter as device is going to be reset.

The following code will be modified.
1. vfio_pci_ioctl
    add flag for aer support
2. vfio_pci_ioctl
    During err occurs and resume:
    if (cmd == VFIO_DEVICE_SET_IRQS) return EAGAIN
    if (cmd == VFIO_DEVICE_RESET) return EAGAIN
    if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) return EAGAIN
    if (cmd == VFIO_DEVICE_PCI_HOT_RESET) return EAGAIN
3. vfio_pci_write
    During err occurs and resume:
    block
4. vfio_pci_aer_err_detected
    Set aer state in "struct vfio_pci_device"
    Write "Command Register" to disable INTx and MSI.
5. vfio_pci_aer_resume
    Clear aer state in "struct vfio_pci_device"
    I don't need to enable INTx and MSI.
    The device will be initalized by guest driver.

Sincerely
Zhoujie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-25  1:24                           ` Zhou Jie
@ 2016-06-27 15:54                             ` Alex Williamson
  2016-06-28  3:26                               ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-27 15:54 UTC (permalink / raw)
  To: Zhou Jie; +Cc: Chen Fan, mst, qemu-devel, caoj.fnst, izumi.taku

On Sat, 25 Jun 2016 09:24:19 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> > We should never depend on the guest driver to behave in a certain way,
> > but we need to prioritize what that actually means.  vfio in the kernel
> > has a responsibility first and foremost to the host kernel.  User owned
> > devices cannot be allowed to exploit or interfere with the host
> > regardless of user behavior.  The next priority is correct operation
> > for the user.  When the host kernel is handling the AER event between
> > the error and resume notifies, it doesn't have device specific drivers,
> > it's manipulating the device as a generic PCI device.  That makes me
> > think that vfio should not allow the user to interact (interfere) with
> > the device during that process and that such interference can be
> > limited to standard PCI level interactions.  That means config space,
> > and things that operate on config space (like interrupt ioctls and
> > resets).  On the QEMU side, we've sent a notification that an error
> > occurred, how the user and the guest respond to that is beyond the
> > concern of vfio in the kernel.  If the user/guest driver continues to
> > interact with resources on the device, that's fine, but I think vfio in
> > the kernel does need to prevent the user from interfering with the PCI
> > state of the device for that brief window when we know the host kernel
> > is operating on the device.  Otherwise the results are unpredictable
> > and therefore unsupportable.  Does that make sense?  Thanks,  
> I understand.
> 
> I want to alter the VFIO driver like following.
> During err occurs and resume:
> 1. Make config space read only.
>     Ignore config space writing to prevent the user from
>     interfering with the PCI state of the device.
>     User can get the error infomation by reading the config space.
> 2. Disable INTx and MSI
>     Write "Command Register" to disable INTx and MSI.
> 3. Do nothing for bar regions.
>     Guest driver may access bar regions.
>     It doesn't matter as device is going to be reset.
> 
> The following code will be modified.
> 1. vfio_pci_ioctl
>     add flag for aer support
> 2. vfio_pci_ioctl
>     During err occurs and resume:
>     if (cmd == VFIO_DEVICE_SET_IRQS) return EAGAIN
>     if (cmd == VFIO_DEVICE_RESET) return EAGAIN
>     if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) return EAGAIN
>     if (cmd == VFIO_DEVICE_PCI_HOT_RESET) return EAGAIN
> 3. vfio_pci_write
>     During err occurs and resume:
>     block
> 4. vfio_pci_aer_err_detected
>     Set aer state in "struct vfio_pci_device"
>     Write "Command Register" to disable INTx and MSI.
> 5. vfio_pci_aer_resume
>     Clear aer state in "struct vfio_pci_device"
>     I don't need to enable INTx and MSI.
>     The device will be initalized by guest driver.

The INTx/MSI part needs further definition for the user.  Are we
actually completely tearing down interrupts with the expectation that
the user will re-enable them or are we just masking them such that the
user needs to unmask?  Also note that not all devices support DisINTx.

Otherwise it seems like a reasonable approach, but I can't guarantee we
won't find new issues along the way.  For instance we'll need to test
how -EAGAIN returns interact with existing QEMU and maybe decided
whether there are cases that are better handled by doing an
interruptible wait.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-27 15:54                             ` Alex Williamson
@ 2016-06-28  3:26                               ` Zhou Jie
  2016-06-28  3:58                                 ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-28  3:26 UTC (permalink / raw)
  To: Alex Williamson; +Cc: Chen Fan, mst, qemu-devel, caoj.fnst, izumi.taku

Hi Alex,

> The INTx/MSI part needs further definition for the user.  Are we
> actually completely tearing down interrupts with the expectation that
> the user will re-enable them or are we just masking them such that the
> user needs to unmask?  Also note that not all devices support DisINTx.

After reset, the "Bus Master Enable" bit of "Command Register"
should be cleared, so MSI/MSI- X interrupt Messages is still disabled.
After reset, the "Interrupt Disable" bit of "Command Register"
should be cleared, so INTx interrupts is enabled.
If the device doesn't support INTx, "Interrupt Disable" bit will
hardware to 0, it is OK here.

After fatal-error occurs, the user should reset the device and
reinitialize the device.
So I disable the interrupt before host reset the device,
and let user to do the reinitialization.

> Otherwise it seems like a reasonable approach, but I can't guarantee we
> won't find new issues along the way.  For instance we'll need to test
> how -EAGAIN returns interact with existing QEMU and maybe decided
> whether there are cases that are better handled by doing an
> interruptible wait.  Thanks,
I will dig into it.

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-28  3:26                               ` Zhou Jie
@ 2016-06-28  3:58                                 ` Alex Williamson
  2016-06-28  5:27                                   ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-28  3:58 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Tue, 28 Jun 2016 11:26:33 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> > The INTx/MSI part needs further definition for the user.  Are we
> > actually completely tearing down interrupts with the expectation that
> > the user will re-enable them or are we just masking them such that the
> > user needs to unmask?  Also note that not all devices support DisINTx.  
> 
> After reset, the "Bus Master Enable" bit of "Command Register"
> should be cleared, so MSI/MSI- X interrupt Messages is still disabled.
> After reset, the "Interrupt Disable" bit of "Command Register"
> should be cleared, so INTx interrupts is enabled.
> If the device doesn't support INTx, "Interrupt Disable" bit will
> hardware to 0, it is OK here.
> 
> After fatal-error occurs, the user should reset the device and
> reinitialize the device.
> So I disable the interrupt before host reset the device,
> and let user to do the reinitialization.

I'm dubious here.  When DisINTx is not supported by the device or it's
marked broken in host quirks, then we can't trust the device to stop
sending INTx.  It's hardwired to zero, meaning that it doesn't work or
it's been found to be broken in other ways.  So COMMAND register
masking is not sufficient for all devices.  Also, any time we start
changing the state of the device from what the user expects, we risk
consistency problems.  We need to consider how the user last saw the
device and whether we can legitimately expect them to handle the device
in a new state.  If we expect the user to re-initialize the device then
would it be more correct to teardown all interrupt signaling such that
the device is effectively in the same state as initial handoff when the
vfio device fd is opened?  How will the user know when the device is
ready to be reset?  Which of the ioctls that you're blocking can they
poll w/o any unwanted side-effects or awkward interactions?  Should
flag bits in the device info ioctl indicate not only support for this
behavior but also the current status?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-28  3:58                                 ` Alex Williamson
@ 2016-06-28  5:27                                   ` Zhou Jie
  2016-06-28 14:40                                     ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-28  5:27 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

On 2016/6/28 11:58, Alex Williamson wrote:
> On Tue, 28 Jun 2016 11:26:33 +0800
> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
>
>> Hi Alex,
>>
>>> The INTx/MSI part needs further definition for the user.  Are we
>>> actually completely tearing down interrupts with the expectation that
>>> the user will re-enable them or are we just masking them such that the
>>> user needs to unmask?  Also note that not all devices support DisINTx.
>>
>> After reset, the "Bus Master Enable" bit of "Command Register"
>> should be cleared, so MSI/MSI- X interrupt Messages is still disabled.
>> After reset, the "Interrupt Disable" bit of "Command Register"
>> should be cleared, so INTx interrupts is enabled.
>> If the device doesn't support INTx, "Interrupt Disable" bit will
>> hardware to 0, it is OK here.
>>
>> After fatal-error occurs, the user should reset the device and
>> reinitialize the device.
>> So I disable the interrupt before host reset the device,
>> and let user to do the reinitialization.
>
> I'm dubious here.  When DisINTx is not supported by the device or it's
> marked broken in host quirks, then we can't trust the device to stop
> sending INTx.  It's hardwired to zero, meaning that it doesn't work or
> it's been found to be broken in other ways.  So COMMAND register
> masking is not sufficient for all devices.
For Endpoints that generate INTx interrupts, this bit is required.
For Endpoints that do not generate IN Tx interrupts this bit is
optional.  If not implemented, this bit must be hardwired to 0b.
For Root Ports, Switch Ports, and Bridges that generate INTx
interrupts on their own behalf, this bit is required.

The above is from "7.5.1.1." of "PCI Express Base Specification 3.1a".
So I think "Interrupt Disable" bit must be supported by the device
which can generate INTx interrupts.

> Also, any time we start
> changing the state of the device from what the user expects, we risk
> consistency problems.  We need to consider how the user last saw the
> device and whether we can legitimately expect them to handle the device
> in a new state.  If we expect the user to re-initialize the device then
> would it be more correct to teardown all interrupt signaling such that
> the device is effectively in the same state as initial handoff when the
> vfio device fd is opened?
Before the user re-initialize the device, host has reseted the device.
The interrupt status will be cleared by hardware.
So the hardware is the same as the state when the
vfio device fd is opened.

> How will the user know when the device is
> ready to be reset?  Which of the ioctls that you're blocking can they
> poll w/o any unwanted side-effects or awkward interactions?  Should
> flag bits in the device info ioctl indicate not only support for this
> behavior but also the current status?  Thanks,
I can block the reset ioctl and config write.
I will not add flag for the device current status,
because I don't depend on user to prevent awkward interactions.

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-28  5:27                                   ` Zhou Jie
@ 2016-06-28 14:40                                     ` Alex Williamson
  2016-06-29  8:54                                       ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-28 14:40 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Tue, 28 Jun 2016 13:27:21 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> On 2016/6/28 11:58, Alex Williamson wrote:
> > On Tue, 28 Jun 2016 11:26:33 +0800
> > Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
> >  
> >> Hi Alex,
> >>  
> >>> The INTx/MSI part needs further definition for the user.  Are we
> >>> actually completely tearing down interrupts with the expectation that
> >>> the user will re-enable them or are we just masking them such that the
> >>> user needs to unmask?  Also note that not all devices support DisINTx.  
> >>
> >> After reset, the "Bus Master Enable" bit of "Command Register"
> >> should be cleared, so MSI/MSI- X interrupt Messages is still disabled.
> >> After reset, the "Interrupt Disable" bit of "Command Register"
> >> should be cleared, so INTx interrupts is enabled.
> >> If the device doesn't support INTx, "Interrupt Disable" bit will
> >> hardware to 0, it is OK here.
> >>
> >> After fatal-error occurs, the user should reset the device and
> >> reinitialize the device.
> >> So I disable the interrupt before host reset the device,
> >> and let user to do the reinitialization.  
> >
> > I'm dubious here.  When DisINTx is not supported by the device or it's
> > marked broken in host quirks, then we can't trust the device to stop
> > sending INTx.  It's hardwired to zero, meaning that it doesn't work or
> > it's been found to be broken in other ways.  So COMMAND register
> > masking is not sufficient for all devices.  
> For Endpoints that generate INTx interrupts, this bit is required.
> For Endpoints that do not generate IN Tx interrupts this bit is
> optional.  If not implemented, this bit must be hardwired to 0b.
> For Root Ports, Switch Ports, and Bridges that generate INTx
> interrupts on their own behalf, this bit is required.
> 
> The above is from "7.5.1.1." of "PCI Express Base Specification 3.1a".
> So I think "Interrupt Disable" bit must be supported by the device
> which can generate INTx interrupts.

And yet we have struct pci_dev.broken_intx_masking and we test for
working DisINTx via pci_intx_mask_supported() rather than simply
looking for a PCIe device.  Some devices are broken and some simply
don't follow the spec, so you're going to need to deal with that or
exclude those devices.
 
> > Also, any time we start
> > changing the state of the device from what the user expects, we risk
> > consistency problems.  We need to consider how the user last saw the
> > device and whether we can legitimately expect them to handle the device
> > in a new state.  If we expect the user to re-initialize the device then
> > would it be more correct to teardown all interrupt signaling such that
> > the device is effectively in the same state as initial handoff when the
> > vfio device fd is opened?  
> Before the user re-initialize the device, host has reseted the device.

How does that happen, aren't we notifying the user at the point the
error occurs, while the device is still in the process or being reset?
My question is how does the user know that the host reset is complete
in order to begin their own re-initialization?

> The interrupt status will be cleared by hardware.
> So the hardware is the same as the state when the
> vfio device fd is opened.

The PCI-core in Linux will save and restore the device state around
reset, how do we know that vfio-pci itself is not racing that reset and
whether PCI-core will restore the state including our interrupt masking
or a state without it?  Do we need to restore the state to the one we
saved when we originally opened the device?  Shouldn't that mean we
teardown the interrupt setup the user had prior to the error event?
 
> > How will the user know when the device is
> > ready to be reset?  Which of the ioctls that you're blocking can they
> > poll w/o any unwanted side-effects or awkward interactions?  Should
> > flag bits in the device info ioctl indicate not only support for this
> > behavior but also the current status?  Thanks,  
> I can block the reset ioctl and config write.
> I will not add flag for the device current status,
> because I don't depend on user to prevent awkward interactions.

Ok, so that's a reason to block rather than return -EAGAIN.  Still we
need some way to indicate to the user whether the device supports this
new interaction rather than the existing behavior.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-28 14:40                                     ` Alex Williamson
@ 2016-06-29  8:54                                       ` Zhou Jie
  2016-06-29 18:22                                         ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-29  8:54 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

> And yet we have struct pci_dev.broken_intx_masking and we test for
> working DisINTx via pci_intx_mask_supported() rather than simply
> looking for a PCIe device.  Some devices are broken and some simply
> don't follow the spec, so you're going to need to deal with that or
> exclude those devices.
For those devices I have no way to disable the INTx.

> How does that happen, aren't we notifying the user at the point the
> error occurs, while the device is still in the process or being reset?
> My question is how does the user know that the host reset is complete
> in order to begin their own re-initialization?
I will add a state in "struct vfio_pci_device".
The state is set when the device can not work such as a aer error
  occured.
And the state is clear when the device can work such as resume
  received.
Return the state when user get info by vfio_pci_ioctl.

>> The interrupt status will be cleared by hardware.
>> So the hardware is the same as the state when the
>> vfio device fd is opened.
>
> The PCI-core in Linux will save and restore the device state around
> reset, how do we know that vfio-pci itself is not racing that reset and
> whether PCI-core will restore the state including our interrupt masking
> or a state without it?  Do we need to restore the state to the one we
> saved when we originally opened the device?  Shouldn't that mean we
> teardown the interrupt setup the user had prior to the error event?
For above you said.
Maybe disable the interrupt is not a good idea.
Think about what will happend in the interrupt handler.
Maybe read/write configure space and region bar.
I will make the configure space read only.
Do nothing for region bar which used by userd.

>>> How will the user know when the device is
>>> ready to be reset?  Which of the ioctls that you're blocking can they
>>> poll w/o any unwanted side-effects or awkward interactions?  Should
>>> flag bits in the device info ioctl indicate not only support for this
>>> behavior but also the current status?  Thanks,
>> I can block the reset ioctl and config write.
>> I will not add flag for the device current status,
>> because I don't depend on user to prevent awkward interactions.
>
> Ok, so that's a reason to block rather than return -EAGAIN.  Still we
> need some way to indicate to the user whether the device supports this
> new interaction rather than the existing behavior.  Thanks,
Because write configure space maybe happened in interrupt handler.
I think block is not a good choice.

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-29  8:54                                       ` Zhou Jie
@ 2016-06-29 18:22                                         ` Alex Williamson
  2016-06-30  1:45                                           ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-06-29 18:22 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Wed, 29 Jun 2016 16:54:05 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> > And yet we have struct pci_dev.broken_intx_masking and we test for
> > working DisINTx via pci_intx_mask_supported() rather than simply
> > looking for a PCIe device.  Some devices are broken and some simply
> > don't follow the spec, so you're going to need to deal with that or
> > exclude those devices.  
> For those devices I have no way to disable the INTx.

disable_irq()?  Clearly vfio-pci already manages these types of devices
and can disable INTx.  This is why I keep suggesting that maybe tearing
the interrupt setup down completely is a more complete and reliable
approach than masking in the command register.  Unless we're going to
exclude such devices from supporting this mode (which I don't condone),
we must deal with them.
 
> > How does that happen, aren't we notifying the user at the point the
> > error occurs, while the device is still in the process or being reset?
> > My question is how does the user know that the host reset is complete
> > in order to begin their own re-initialization?  
> I will add a state in "struct vfio_pci_device".
> The state is set when the device can not work such as a aer error
>   occured.
> And the state is clear when the device can work such as resume
>   received.
> Return the state when user get info by vfio_pci_ioctl.
> 
> >> The interrupt status will be cleared by hardware.
> >> So the hardware is the same as the state when the
> >> vfio device fd is opened.  
> >
> > The PCI-core in Linux will save and restore the device state around
> > reset, how do we know that vfio-pci itself is not racing that reset and
> > whether PCI-core will restore the state including our interrupt masking
> > or a state without it?  Do we need to restore the state to the one we
> > saved when we originally opened the device?  Shouldn't that mean we
> > teardown the interrupt setup the user had prior to the error event?  
> For above you said.
> Maybe disable the interrupt is not a good idea.
> Think about what will happend in the interrupt handler.
> Maybe read/write configure space and region bar.
> I will make the configure space read only.
> Do nothing for region bar which used by userd.

I'm thinking that vfio-pci will be attempting to mask the interrupts
via the PCI command register, which is potentially in a state of flux
due to the host reset and yet we're somehow expecting that our write to
the command register sticks.  We certainly have the ability to a)
discard interrupts received between AER error and resume, and b) if we
want to be consistent with requiring the user to reinitialize the
device, then the user interrupt setup should likely be torn down.
Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-29 18:22                                         ` Alex Williamson
@ 2016-06-30  1:45                                           ` Zhou Jie
  2016-07-03  4:00                                             ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-06-30  1:45 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

On 2016/6/30 2:22, Alex Williamson wrote:
> On Wed, 29 Jun 2016 16:54:05 +0800
> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
>
>> Hi Alex,
>>
>>> And yet we have struct pci_dev.broken_intx_masking and we test for
>>> working DisINTx via pci_intx_mask_supported() rather than simply
>>> looking for a PCIe device.  Some devices are broken and some simply
>>> don't follow the spec, so you're going to need to deal with that or
>>> exclude those devices.
>> For those devices I have no way to disable the INTx.
>
> disable_irq()?  Clearly vfio-pci already manages these types of devices
> and can disable INTx.  This is why I keep suggesting that maybe tearing
> the interrupt setup down completely is a more complete and reliable
> approach than masking in the command register.  Unless we're going to
> exclude such devices from supporting this mode (which I don't condone),
> we must deal with them.
Thank you for tell me that.
Yes, I can use disable_irq to disable the pci device irq.
But should I enable the irq after reset?
I will dig into it.

Sincerely
Zhou Jie

>>> How does that happen, aren't we notifying the user at the point the
>>> error occurs, while the device is still in the process or being reset?
>>> My question is how does the user know that the host reset is complete
>>> in order to begin their own re-initialization?
>> I will add a state in "struct vfio_pci_device".
>> The state is set when the device can not work such as a aer error
>>   occured.
>> And the state is clear when the device can work such as resume
>>   received.
>> Return the state when user get info by vfio_pci_ioctl.
>>
>>>> The interrupt status will be cleared by hardware.
>>>> So the hardware is the same as the state when the
>>>> vfio device fd is opened.
>>>
>>> The PCI-core in Linux will save and restore the device state around
>>> reset, how do we know that vfio-pci itself is not racing that reset and
>>> whether PCI-core will restore the state including our interrupt masking
>>> or a state without it?  Do we need to restore the state to the one we
>>> saved when we originally opened the device?  Shouldn't that mean we
>>> teardown the interrupt setup the user had prior to the error event?
>> For above you said.
>> Maybe disable the interrupt is not a good idea.
>> Think about what will happend in the interrupt handler.
>> Maybe read/write configure space and region bar.
>> I will make the configure space read only.
>> Do nothing for region bar which used by userd.
>
> I'm thinking that vfio-pci will be attempting to mask the interrupts
> via the PCI command register, which is potentially in a state of flux
> due to the host reset and yet we're somehow expecting that our write to
> the command register sticks.  We certainly have the ability to a)
> discard interrupts received between AER error and resume, and b) if we
> want to be consistent with requiring the user to reinitialize the
> device, then the user interrupt setup should likely be torn down.
> Thanks,

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-06-30  1:45                                           ` Zhou Jie
@ 2016-07-03  4:00                                             ` Zhou Jie
  2016-07-05  1:36                                               ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-07-03  4:00 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

On 2016/6/30 9:45, Zhou Jie wrote:
> Hi Alex,
>
> On 2016/6/30 2:22, Alex Williamson wrote:
>> On Wed, 29 Jun 2016 16:54:05 +0800
>> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
>>
>>> Hi Alex,
>>>
>>>> And yet we have struct pci_dev.broken_intx_masking and we test for
>>>> working DisINTx via pci_intx_mask_supported() rather than simply
>>>> looking for a PCIe device.  Some devices are broken and some simply
>>>> don't follow the spec, so you're going to need to deal with that or
>>>> exclude those devices.
>>> For those devices I have no way to disable the INTx.
>>
>> disable_irq()?  Clearly vfio-pci already manages these types of devices
>> and can disable INTx.  This is why I keep suggesting that maybe tearing
>> the interrupt setup down completely is a more complete and reliable
>> approach than masking in the command register.  Unless we're going to
>> exclude such devices from supporting this mode (which I don't condone),
>> we must deal with them.
> Thank you for tell me that.
> Yes, I can use disable_irq to disable the pci device irq.
> But should I enable the irq after reset?
> I will dig into it.

I will alter the VFIO driver like following.
During err occurs and resume:
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

The following code will be modified.
1. vfio_pci_ioctl
    add a flag in vfio_device_info for workable_state support
    return workable_state in "struct vfio_pci_device" when user get info
2. vfio_pci_ioctl
    During err occurs and resume:
    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
    block for workable_state clearing
3. vfio_pci_write
    During err occurs and resume:
    ignore and return 0
4. vfio_pci_aer_err_detected
    Set workable_state to false in "struct vfio_pci_device"
    Disable INTx:
      If Disable INTx is support
        disable by PCI_COMMAND
      else
        disable by disable_irq function
    Disable MSI:
        disable by clearing the "Bus Master Enable" bit of PCI_COMMAND

5. vfio_pci_aer_resume
    Set workable_state to true in "struct vfio_pci_device"
    About INTx:
      According to the value of "vdev->ctx[0].masked"
      to decide whether to enable INTx
    About MSI:
      After reset the "Bus Master Enable" bit is default to 0.
      So user should process this after reset.

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-03  4:00                                             ` Zhou Jie
@ 2016-07-05  1:36                                               ` Zhou Jie
  2016-07-05 17:03                                                 ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-07-05  1:36 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

ping

On 2016/7/3 12:00, Zhou Jie wrote:
> Hi Alex,
>
> On 2016/6/30 9:45, Zhou Jie wrote:
>> Hi Alex,
>>
>> On 2016/6/30 2:22, Alex Williamson wrote:
>>> On Wed, 29 Jun 2016 16:54:05 +0800
>>> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
>>>
>>>> Hi Alex,
>>>>
>>>>> And yet we have struct pci_dev.broken_intx_masking and we test for
>>>>> working DisINTx via pci_intx_mask_supported() rather than simply
>>>>> looking for a PCIe device.  Some devices are broken and some simply
>>>>> don't follow the spec, so you're going to need to deal with that or
>>>>> exclude those devices.
>>>> For those devices I have no way to disable the INTx.
>>>
>>> disable_irq()?  Clearly vfio-pci already manages these types of devices
>>> and can disable INTx.  This is why I keep suggesting that maybe tearing
>>> the interrupt setup down completely is a more complete and reliable
>>> approach than masking in the command register.  Unless we're going to
>>> exclude such devices from supporting this mode (which I don't condone),
>>> we must deal with them.
>> Thank you for tell me that.
>> Yes, I can use disable_irq to disable the pci device irq.
>> But should I enable the irq after reset?
>> I will dig into it.
>
> I will alter the VFIO driver like following.
> During err occurs and resume:
> 1. Make config space read only.
> 2. Disable INTx/MSI Interrupt.
> 3. Do nothing for bar regions.
>
> The following code will be modified.
> 1. vfio_pci_ioctl
>    add a flag in vfio_device_info for workable_state support
>    return workable_state in "struct vfio_pci_device" when user get info
> 2. vfio_pci_ioctl
>    During err occurs and resume:
>    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
>    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
>    block for workable_state clearing
> 3. vfio_pci_write
>    During err occurs and resume:
>    ignore and return 0
> 4. vfio_pci_aer_err_detected
>    Set workable_state to false in "struct vfio_pci_device"
>    Disable INTx:
>      If Disable INTx is support
>        disable by PCI_COMMAND
>      else
>        disable by disable_irq function
>    Disable MSI:
>        disable by clearing the "Bus Master Enable" bit of PCI_COMMAND
>
> 5. vfio_pci_aer_resume
>    Set workable_state to true in "struct vfio_pci_device"
>    About INTx:
>      According to the value of "vdev->ctx[0].masked"
>      to decide whether to enable INTx
>    About MSI:
>      After reset the "Bus Master Enable" bit is default to 0.
>      So user should process this after reset.
>
> Sincerely
> Zhou Jie
>
>
>
>

-- 
------------------------------------------------
周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2011@cn.fujitsu.com
------------------------------------------------

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-05  1:36                                               ` Zhou Jie
@ 2016-07-05 17:03                                                 ` Alex Williamson
  2016-07-06  2:01                                                   ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-07-05 17:03 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Tue, 5 Jul 2016 09:36:27 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> ping

Due to weekend and holiday in my country, there were zero regular
working hours between your emails.
 
> On 2016/7/3 12:00, Zhou Jie wrote:
> > Hi Alex,
> >
> > On 2016/6/30 9:45, Zhou Jie wrote:  
> >> Hi Alex,
> >>
> >> On 2016/6/30 2:22, Alex Williamson wrote:  
> >>> On Wed, 29 Jun 2016 16:54:05 +0800
> >>> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
> >>>  
> >>>> Hi Alex,
> >>>>  
> >>>>> And yet we have struct pci_dev.broken_intx_masking and we test for
> >>>>> working DisINTx via pci_intx_mask_supported() rather than simply
> >>>>> looking for a PCIe device.  Some devices are broken and some simply
> >>>>> don't follow the spec, so you're going to need to deal with that or
> >>>>> exclude those devices.  
> >>>> For those devices I have no way to disable the INTx.  
> >>>
> >>> disable_irq()?  Clearly vfio-pci already manages these types of devices
> >>> and can disable INTx.  This is why I keep suggesting that maybe tearing
> >>> the interrupt setup down completely is a more complete and reliable
> >>> approach than masking in the command register.  Unless we're going to
> >>> exclude such devices from supporting this mode (which I don't condone),
> >>> we must deal with them.  
> >> Thank you for tell me that.
> >> Yes, I can use disable_irq to disable the pci device irq.
> >> But should I enable the irq after reset?
> >> I will dig into it.  
> >
> > I will alter the VFIO driver like following.
> > During err occurs and resume:
> > 1. Make config space read only.
> > 2. Disable INTx/MSI Interrupt.
> > 3. Do nothing for bar regions.
> >
> > The following code will be modified.
> > 1. vfio_pci_ioctl
> >    add a flag in vfio_device_info for workable_state support
> >    return workable_state in "struct vfio_pci_device" when user get info

Seems like two flags are required, one to indicate the presence of this
feature and another to indicate the state.  I'd prefer something like
access_blocked.

> > 2. vfio_pci_ioctl
> >    During err occurs and resume:
> >    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
> >    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
> >    block for workable_state clearing
> > 3. vfio_pci_write
> >    During err occurs and resume:
> >    ignore and return 0

This is contradictory to your comment "Do nothing for bar regions".
ISTR that returning 0 for read/write calls is an easy way to break
users since we've return neither the desired bytes nor an error code.

> > 4. vfio_pci_aer_err_detected
> >    Set workable_state to false in "struct vfio_pci_device"
> >    Disable INTx:
> >      If Disable INTx is support
> >        disable by PCI_COMMAND
> >      else
> >        disable by disable_irq function
> >    Disable MSI:
> >        disable by clearing the "Bus Master Enable" bit of PCI_COMMAND

I've suggested repeatedly to properly teardown these interrupts.  I
disagree with your proposed approach here.  If the device is intended to
be in a state that requires re-initialization then the interrupt setup
should be part of that.

> > 5. vfio_pci_aer_resume
> >    Set workable_state to true in "struct vfio_pci_device"
> >    About INTx:
> >      According to the value of "vdev->ctx[0].masked"
> >      to decide whether to enable INTx
> >    About MSI:
> >      After reset the "Bus Master Enable" bit is default to 0.
> >      So user should process this after reset.

Again, I think this is error prone, teardown the interrupts and define
that the device state, including interrupts, needs to be reinitialized
after error.  Why are you not incorporating this feedback?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-05 17:03                                                 ` Alex Williamson
@ 2016-07-06  2:01                                                   ` Zhou Jie
  2016-07-07 19:04                                                     ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-07-06  2:01 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

> Due to weekend and holiday in my country, there were zero regular
> working hours between your emails.
I wish you had a good time.

>>> The following code will be modified.
>>> 1. vfio_pci_ioctl
>>>    add a flag in vfio_device_info for workable_state support
>>>    return workable_state in "struct vfio_pci_device" when user get info
>
> Seems like two flags are required, one to indicate the presence of this
> feature and another to indicate the state.  I'd prefer something like
> access_blocked.
User can get the state by state flag.
And also maybe blocked by ioctl or write functons.
User has choice to invoke which functions.

>>> 2. vfio_pci_ioctl
>>>    During err occurs and resume:
>>>    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
>>>    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
>>>    block for workable_state clearing
>>> 3. vfio_pci_write
>>>    During err occurs and resume:
>>>    ignore and return 0
>
> This is contradictory to your comment "Do nothing for bar regions".
> ISTR that returning 0 for read/write calls is an easy way to break
> users since we've return neither the desired bytes nor an error code.
No, there is not change for read.
Just return 0 for write.
Return 0 mean that there is no byte has been written.
Consider that the aer has occurred,
it is better to not to write any thing to device.
User can still read/write bar regions by mmap address,
this may generate some date errors,
but it doesn't matter as device is going to be reset.

>>> 4. vfio_pci_aer_err_detected
>>>    Set workable_state to false in "struct vfio_pci_device"
>>>    Disable INTx:
>>>      If Disable INTx is support
>>>        disable by PCI_COMMAND
>>>      else
>>>        disable by disable_irq function
>>>    Disable MSI:
>>>        disable by clearing the "Bus Master Enable" bit of PCI_COMMAND
>
> I've suggested repeatedly to properly teardown these interrupts.  I
> disagree with your proposed approach here.  If the device is intended to
> be in a state that requires re-initialization then the interrupt setup
> should be part of that.
I have traced the INTx functions.
-vfio_pci_intx_unmask_handler
-vfio_pci_intx_mask
-vfio_intx_set_signal

They are invoked by User.
-vfio_pci_write
-vfio_pci_ioctl

During err occurs and resume above functions are blocked.
So, User cann't set the INTx.
I will disable the INTx in vfio_pci_aer_err_detected.
And reset the INTx in vfio_pci_aer_resume
according the original user setting(vdev->ctx[0].masked).

>>> 5. vfio_pci_aer_resume
>>>    Set workable_state to true in "struct vfio_pci_device"
>>>    About INTx:
>>>      According to the value of "vdev->ctx[0].masked"
>>>      to decide whether to enable INTx
>>>    About MSI:
>>>      After reset the "Bus Master Enable" bit is default to 0.
>>>      So user should process this after reset.
>
> Again, I think this is error prone, teardown the interrupts and define
> that the device state, including interrupts, needs to be reinitialized
> after error.  Why are you not incorporating this feedback?  Thanks,
The reinitialization depend on user.
For VFIO driver the process is following.
1. aer occurs
2. disable the following functions of the device
    write(except the bar regions), ioctl and interrupt
3. aer driver reset the device
4. renable the device for user
5. user process the aer event
    Maybe reset the device and reinitialization

What I do is make sure the following points.
1. Host can reset the device between step 2 and 4.
2. The user settings is the same at step 1 and 5.

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-06  2:01                                                   ` Zhou Jie
@ 2016-07-07 19:04                                                     ` Alex Williamson
  2016-07-08  1:38                                                       ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-07-07 19:04 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Wed, 6 Jul 2016 10:01:28 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> > Due to weekend and holiday in my country, there were zero regular
> > working hours between your emails.  
> I wish you had a good time.
> 
> >>> The following code will be modified.
> >>> 1. vfio_pci_ioctl
> >>>    add a flag in vfio_device_info for workable_state support
> >>>    return workable_state in "struct vfio_pci_device" when user get info  
> >
> > Seems like two flags are required, one to indicate the presence of this
> > feature and another to indicate the state.  I'd prefer something like
> > access_blocked.  
> User can get the state by state flag.
> And also maybe blocked by ioctl or write functons.
> User has choice to invoke which functions.

Let's imagine there's one flag bit, there are two possible polarities,
a) the bit is set when access is available, b) the bit is set when
access is blocked.

Let's examine a), if the bit is not set, does that means that access is
not available or does that mean the kernel doesn't support that
feature?  There's no way to know.  Fail.  So we switch to b), an error
occurs, the bit is not set, does that mean access is blocked or does
that mean that the kernel we're using doesn't support the feature.
Fail.  If there's a way to do this with one bit, please explain it to
me.  Relying on a function to block, which may not be a valid
assumption on the kernel we're using also fails.  Userspace must be
able to know, in a deterministic and backwards compatible way, the
features of the kernel and the behavior to expect.
 
> >>> 2. vfio_pci_ioctl
> >>>    During err occurs and resume:
> >>>    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
> >>>    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
> >>>    block for workable_state clearing
> >>> 3. vfio_pci_write
> >>>    During err occurs and resume:
> >>>    ignore and return 0  
> >
> > This is contradictory to your comment "Do nothing for bar regions".
> > ISTR that returning 0 for read/write calls is an easy way to break
> > users since we've return neither the desired bytes nor an error code.  
> No, there is not change for read.
> Just return 0 for write.
> Return 0 mean that there is no byte has been written.
> Consider that the aer has occurred,
> it is better to not to write any thing to device.
> User can still read/write bar regions by mmap address,
> this may generate some date errors,
> but it doesn't matter as device is going to be reset.

My statement still stands, you state "Do nothing for bar regions" and
"return 0 for write".  Those are contradictory and as I indicate,
returning 0 is problematic for userspace.  Additionally, making
vfio_pci_write return zero while still allowing writes through the BAR
mmap is inconsistent.

> >>> 4. vfio_pci_aer_err_detected
> >>>    Set workable_state to false in "struct vfio_pci_device"
> >>>    Disable INTx:
> >>>      If Disable INTx is support
> >>>        disable by PCI_COMMAND
> >>>      else
> >>>        disable by disable_irq function
> >>>    Disable MSI:
> >>>        disable by clearing the "Bus Master Enable" bit of PCI_COMMAND  
> >
> > I've suggested repeatedly to properly teardown these interrupts.  I
> > disagree with your proposed approach here.  If the device is intended to
> > be in a state that requires re-initialization then the interrupt setup
> > should be part of that.  
> I have traced the INTx functions.
> -vfio_pci_intx_unmask_handler
> -vfio_pci_intx_mask
> -vfio_intx_set_signal
> 
> They are invoked by User.
> -vfio_pci_write
> -vfio_pci_ioctl
> 
> During err occurs and resume above functions are blocked.
> So, User cann't set the INTx.
> I will disable the INTx in vfio_pci_aer_err_detected.
> And reset the INTx in vfio_pci_aer_resume
> according the original user setting(vdev->ctx[0].masked).

Again, you're missing the point.  If the device is expected to be
reinitialized after resume, why don't we return the device to an
initial state where interrupts are not configured?  I think this
presents inconsistent behavior to the user.

> >>> 5. vfio_pci_aer_resume
> >>>    Set workable_state to true in "struct vfio_pci_device"
> >>>    About INTx:
> >>>      According to the value of "vdev->ctx[0].masked"
> >>>      to decide whether to enable INTx
> >>>    About MSI:
> >>>      After reset the "Bus Master Enable" bit is default to 0.
> >>>      So user should process this after reset.  
> >
> > Again, I think this is error prone, teardown the interrupts and define
> > that the device state, including interrupts, needs to be reinitialized
> > after error.  Why are you not incorporating this feedback?  Thanks,  
> The reinitialization depend on user.
> For VFIO driver the process is following.
> 1. aer occurs
> 2. disable the following functions of the device
>     write(except the bar regions), ioctl and interrupt
> 3. aer driver reset the device
> 4. renable the device for user
> 5. user process the aer event
>     Maybe reset the device and reinitialization
> 
> What I do is make sure the following points.
> 1. Host can reset the device between step 2 and 4.
> 2. The user settings is the same at step 1 and 5.
> 
> Sincerely
> Zhou Jie
> 
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-07 19:04                                                     ` Alex Williamson
@ 2016-07-08  1:38                                                       ` Zhou Jie
  2016-07-08 17:33                                                         ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-07-08  1:38 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

>>>>> The following code will be modified.
>>>>> 1. vfio_pci_ioctl
>>>>>    add a flag in vfio_device_info for workable_state support
>>>>>    return workable_state in "struct vfio_pci_device" when user get info
>>>
>>> Seems like two flags are required, one to indicate the presence of this
>>> feature and another to indicate the state.  I'd prefer something like
>>> access_blocked.
>> User can get the state by state flag.
>> And also maybe blocked by ioctl or write functons.
>> User has choice to invoke which functions.
>
> Let's imagine there's one flag bit, there are two possible polarities,
> a) the bit is set when access is available, b) the bit is set when
> access is blocked.
>
> Let's examine a), if the bit is not set, does that means that access is
> not available or does that mean the kernel doesn't support that
> feature?  There's no way to know.  Fail.  So we switch to b), an error
> occurs, the bit is not set, does that mean access is blocked or does
> that mean that the kernel we're using doesn't support the feature.
> Fail.  If there's a way to do this with one bit, please explain it to
> me.  Relying on a function to block, which may not be a valid
> assumption on the kernel we're using also fails.  Userspace must be
> able to know, in a deterministic and backwards compatible way, the
> features of the kernel and the behavior to expect.
Sorry, I didn't say clearly.
There is one flag and one variable.
1) workable state support flag
    This flag let user know whether the kenerl support block feature.
2) workable state variable in "struct vfio_pci_device"
    This variable let user know whether the device is blocked.

>>>>> 2. vfio_pci_ioctl
>>>>>    During err occurs and resume:
>>>>>    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
>>>>>    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
>>>>>    block for workable_state clearing
>>>>> 3. vfio_pci_write
>>>>>    During err occurs and resume:
>>>>>    ignore and return 0
>>>
>>> This is contradictory to your comment "Do nothing for bar regions".
>>> ISTR that returning 0 for read/write calls is an easy way to break
>>> users since we've return neither the desired bytes nor an error code.
>> No, there is not change for read.
>> Just return 0 for write.
>> Return 0 mean that there is no byte has been written.
>> Consider that the aer has occurred,
>> it is better to not to write any thing to device.
>> User can still read/write bar regions by mmap address,
>> this may generate some date errors,
>> but it doesn't matter as device is going to be reset.
>
> My statement still stands, you state "Do nothing for bar regions" and
> "return 0 for write".  Those are contradictory and as I indicate,
> returning 0 is problematic for userspace.  Additionally, making
> vfio_pci_write return zero while still allowing writes through the BAR
> mmap is inconsistent.
I understand.
I will block writing to configure space.
And don't change for writing bar regions.

>>>>> 4. vfio_pci_aer_err_detected
>>>>>    Set workable_state to false in "struct vfio_pci_device"
>>>>>    Disable INTx:
>>>>>      If Disable INTx is support
>>>>>        disable by PCI_COMMAND
>>>>>      else
>>>>>        disable by disable_irq function
>>>>>    Disable MSI:
>>>>>        disable by clearing the "Bus Master Enable" bit of PCI_COMMAND
>>>
>>> I've suggested repeatedly to properly teardown these interrupts.  I
>>> disagree with your proposed approach here.  If the device is intended to
>>> be in a state that requires re-initialization then the interrupt setup
>>> should be part of that.
>> I have traced the INTx functions.
>> -vfio_pci_intx_unmask_handler
>> -vfio_pci_intx_mask
>> -vfio_intx_set_signal
>>
>> They are invoked by User.
>> -vfio_pci_write
>> -vfio_pci_ioctl
>>
>> During err occurs and resume above functions are blocked.
>> So, User cann't set the INTx.
>> I will disable the INTx in vfio_pci_aer_err_detected.
>> And reset the INTx in vfio_pci_aer_resume
>> according the original user setting(vdev->ctx[0].masked).
>
> Again, you're missing the point.  If the device is expected to be
> reinitialized after resume, why don't we return the device to an
> initial state where interrupts are not configured?  I think this
> presents inconsistent behavior to the user.
About return the device to an
initial state where interrupts are not configured.
Do you mean free_irq?
User request_irq by vfio_pci_ioctl.
If we free_irq, how can the user know it?
Do you think guest will reinitializ the device after resume,
so it doesn't matter?

Maybe we should not expect what the guest will do.
What I want to do in vfio driver is as following.
1. aer occurs
2. Disable INTx and MSI
3. aer driver reset the device
4. Restore INTx and MSI
5. user process the aer event

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-08  1:38                                                       ` Zhou Jie
@ 2016-07-08 17:33                                                         ` Alex Williamson
  2016-07-10  1:28                                                           ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-07-08 17:33 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Fri, 8 Jul 2016 09:38:50 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> >>>>> The following code will be modified.
> >>>>> 1. vfio_pci_ioctl
> >>>>>    add a flag in vfio_device_info for workable_state support
> >>>>>    return workable_state in "struct vfio_pci_device" when user get info  
> >>>
> >>> Seems like two flags are required, one to indicate the presence of this
> >>> feature and another to indicate the state.  I'd prefer something like
> >>> access_blocked.  
> >> User can get the state by state flag.
> >> And also maybe blocked by ioctl or write functons.
> >> User has choice to invoke which functions.  
> >
> > Let's imagine there's one flag bit, there are two possible polarities,
> > a) the bit is set when access is available, b) the bit is set when
> > access is blocked.
> >
> > Let's examine a), if the bit is not set, does that means that access is
> > not available or does that mean the kernel doesn't support that
> > feature?  There's no way to know.  Fail.  So we switch to b), an error
> > occurs, the bit is not set, does that mean access is blocked or does
> > that mean that the kernel we're using doesn't support the feature.
> > Fail.  If there's a way to do this with one bit, please explain it to
> > me.  Relying on a function to block, which may not be a valid
> > assumption on the kernel we're using also fails.  Userspace must be
> > able to know, in a deterministic and backwards compatible way, the
> > features of the kernel and the behavior to expect.  
> Sorry, I didn't say clearly.
> There is one flag and one variable.
> 1) workable state support flag
>     This flag let user know whether the kenerl support block feature.
> 2) workable state variable in "struct vfio_pci_device"
>     This variable let user know whether the device is blocked.

The variable clearly isn't visible to the user, so the user can know
whether the kernel supports this feature, but not whether the feature
is currently active.  Perhaps there's no way to avoid races completely,
but don't you expect that if we define that certain operations are
blocked after an error notification that a user may want some way to
poll for whether the block is active after a reset rather than simply
calling a blocked interface to probe for it?
 
> >>>>> 2. vfio_pci_ioctl
> >>>>>    During err occurs and resume:
> >>>>>    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
> >>>>>    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
> >>>>>    block for workable_state clearing
> >>>>> 3. vfio_pci_write
> >>>>>    During err occurs and resume:
> >>>>>    ignore and return 0  
> >>>
> >>> This is contradictory to your comment "Do nothing for bar regions".
> >>> ISTR that returning 0 for read/write calls is an easy way to break
> >>> users since we've return neither the desired bytes nor an error code.  
> >> No, there is not change for read.
> >> Just return 0 for write.
> >> Return 0 mean that there is no byte has been written.
> >> Consider that the aer has occurred,
> >> it is better to not to write any thing to device.
> >> User can still read/write bar regions by mmap address,
> >> this may generate some date errors,
> >> but it doesn't matter as device is going to be reset.  
> >
> > My statement still stands, you state "Do nothing for bar regions" and
> > "return 0 for write".  Those are contradictory and as I indicate,
> > returning 0 is problematic for userspace.  Additionally, making
> > vfio_pci_write return zero while still allowing writes through the BAR
> > mmap is inconsistent.  
> I understand.
> I will block writing to configure space.
> And don't change for writing bar regions.
> 
> >>>>> 4. vfio_pci_aer_err_detected
> >>>>>    Set workable_state to false in "struct vfio_pci_device"
> >>>>>    Disable INTx:
> >>>>>      If Disable INTx is support
> >>>>>        disable by PCI_COMMAND
> >>>>>      else
> >>>>>        disable by disable_irq function
> >>>>>    Disable MSI:
> >>>>>        disable by clearing the "Bus Master Enable" bit of PCI_COMMAND  
> >>>
> >>> I've suggested repeatedly to properly teardown these interrupts.  I
> >>> disagree with your proposed approach here.  If the device is intended to
> >>> be in a state that requires re-initialization then the interrupt setup
> >>> should be part of that.  
> >> I have traced the INTx functions.
> >> -vfio_pci_intx_unmask_handler
> >> -vfio_pci_intx_mask
> >> -vfio_intx_set_signal
> >>
> >> They are invoked by User.
> >> -vfio_pci_write
> >> -vfio_pci_ioctl
> >>
> >> During err occurs and resume above functions are blocked.
> >> So, User cann't set the INTx.
> >> I will disable the INTx in vfio_pci_aer_err_detected.
> >> And reset the INTx in vfio_pci_aer_resume
> >> according the original user setting(vdev->ctx[0].masked).  
> >
> > Again, you're missing the point.  If the device is expected to be
> > reinitialized after resume, why don't we return the device to an
> > initial state where interrupts are not configured?  I think this
> > presents inconsistent behavior to the user.  
> About return the device to an
> initial state where interrupts are not configured.
> Do you mean free_irq?
> User request_irq by vfio_pci_ioctl.
> If we free_irq, how can the user know it?
> Do you think guest will reinitializ the device after resume,
> so it doesn't matter?
> 
> Maybe we should not expect what the guest will do.
> What I want to do in vfio driver is as following.
> 1. aer occurs
> 2. Disable INTx and MSI
> 3. aer driver reset the device
> 4. Restore INTx and MSI
> 5. user process the aer event

As we've discussed before, the AER notification needs to be relayed to
the user without delay, otherwise we only increase the gap where the
user might consume bogus data.  It also only seems reasonable to modify
the behavior of the interfaces (ie. blocking) if the user is notified,
which would be through the existing error notifier.  We can never
depend on a specific behavior from the user, we may be dealing with a
malicious user.

We already disable interrupts in vfio_pci_disable() simply by calling
the ioctl function directly.  If we simply disable and re-enable
interrupts as you propose, how does the user deal with edge triggered
interrupts that may have occurred during that period?  Are they lost?
Should we instead leave the interrupts enabled but skip
eventfd_signal() in the interrupts handlers, queuing interrupts for
re-delivery after the device is resumed?  Or does it make more sense to
simply disable the interrupts as done in vfio_pci_disable() and define
that the user needs to re-establish interrupts before continuing after
an error event?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-08 17:33                                                         ` Alex Williamson
@ 2016-07-10  1:28                                                           ` Zhou Jie
  2016-07-11 16:24                                                             ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-07-10  1:28 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

> The variable clearly isn't visible to the user, so the user can know
> whether the kernel supports this feature, but not whether the feature
> is currently active.  Perhaps there's no way to avoid races completely,
> but don't you expect that if we define that certain operations are
> blocked after an error notification that a user may want some way to
> poll for whether the block is active after a reset rather than simply
> calling a blocked interface to probe for it?
Yes, I will use access blocked function, not the variable.

> As we've discussed before, the AER notification needs to be relayed to
> the user without delay, otherwise we only increase the gap where the
> user might consume bogus data.  It also only seems reasonable to modify
> the behavior of the interfaces (ie. blocking) if the user is notified,
> which would be through the existing error notifier.  We can never
> depend on a specific behavior from the user, we may be dealing with a
> malicious user.
>
> We already disable interrupts in vfio_pci_disable() simply by calling
> the ioctl function directly.
Sorry, I want to know where is vfio_pci_disable invoked.
I can't find it in ioctl function.

> If we simply disable and re-enable interrupts as you propose,
> how does the user deal with edge triggered
> interrupts that may have occurred during that period?  Are they lost?
> Should we instead leave the interrupts enabled but skip
> eventfd_signal() in the interrupts handlers, queuing interrupts for
> re-delivery after the device is resumed?
Yes, they will lost.

> Or does it make more sense to
> simply disable the interrupts as done in vfio_pci_disable() and define
> that the user needs to re-establish interrupts before continuing after
> an error event?  Thanks,
If user invoked the vfio_pci_disable by ioctl function.
Yes, user should re-establish interrupts before
continuing after an error event.

Sincerely
Zhoujie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-10  1:28                                                           ` Zhou Jie
@ 2016-07-11 16:24                                                             ` Alex Williamson
  2016-07-12  1:42                                                               ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-07-11 16:24 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Sun, 10 Jul 2016 09:28:41 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> > The variable clearly isn't visible to the user, so the user can know
> > whether the kernel supports this feature, but not whether the feature
> > is currently active.  Perhaps there's no way to avoid races completely,
> > but don't you expect that if we define that certain operations are
> > blocked after an error notification that a user may want some way to
> > poll for whether the block is active after a reset rather than simply
> > calling a blocked interface to probe for it?  
> Yes, I will use access blocked function, not the variable.

I don't understand what this means.
 
> > As we've discussed before, the AER notification needs to be relayed to
> > the user without delay, otherwise we only increase the gap where the
> > user might consume bogus data.  It also only seems reasonable to modify
> > the behavior of the interfaces (ie. blocking) if the user is notified,
> > which would be through the existing error notifier.  We can never
> > depend on a specific behavior from the user, we may be dealing with a
> > malicious user.
> >
> > We already disable interrupts in vfio_pci_disable() simply by calling
> > the ioctl function directly.  
> Sorry, I want to know where is vfio_pci_disable invoked.
> I can't find it in ioctl function.

You have the code, vfio_pci_disable() is invoked when the vfio device
file descriptor is released.  It's not in the ioctl, it calls the ioctl
as the user would to disable all interrupts on the device.

> > If we simply disable and re-enable interrupts as you propose,
> > how does the user deal with edge triggered
> > interrupts that may have occurred during that period?  Are they lost?
> > Should we instead leave the interrupts enabled but skip
> > eventfd_signal() in the interrupts handlers, queuing interrupts for
> > re-delivery after the device is resumed?  
> Yes, they will lost.

Is that acceptable?  This is part of the problem I have with silently
disabling interrupt delivery via the command register across reset.  It
seems more non-deterministic than properly disabling interrupts and
requiring the user to reinitialize them after error.
 
> > Or does it make more sense to
> > simply disable the interrupts as done in vfio_pci_disable() and define
> > that the user needs to re-establish interrupts before continuing after
> > an error event?  Thanks,  
> If user invoked the vfio_pci_disable by ioctl function.

I'm in no way suggesting that a user invoke vfio_pci_disable(), I'm just
trying to point out that vfio_pci_disable() already does a teardown of
interrupts, similar to what seems to be required here.

> Yes, user should re-establish interrupts before
> continuing after an error event.

So if we define that users should re-establish interrupts after an
error event, then what's the point of only doing command register
masking of the interrupts and requiring the user to both tear-down the
interrupts and re-establish them?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-11 16:24                                                             ` Alex Williamson
@ 2016-07-12  1:42                                                               ` Zhou Jie
  2016-07-12 15:45                                                                 ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-07-12  1:42 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

>>> The variable clearly isn't visible to the user, so the user can know
>>> whether the kernel supports this feature, but not whether the feature
>>> is currently active.  Perhaps there's no way to avoid races completely,
>>> but don't you expect that if we define that certain operations are
>>> blocked after an error notification that a user may want some way to
>>> poll for whether the block is active after a reset rather than simply
>>> calling a blocked interface to probe for it?
>> Yes, I will use access blocked function, not the variable.
>
> I don't understand what this means.
I will use workable state support flag
to let user know whether the kenerl support block feature.
And make configure space writing and ioctl function blocked.

> Is that acceptable?  This is part of the problem I have with silently
> disabling interrupt delivery via the command register across reset.  It
> seems more non-deterministic than properly disabling interrupts and
> requiring the user to reinitialize them after error.
What is "properly disabling interrupts"?
User don't know what vfio driver has done.
What's the difference of "disabling interrupt delivery via
the command register" and "doing a teardown of interrupts"?
The interrupts will still lost silently.

>>> Or does it make more sense to
>>> simply disable the interrupts as done in vfio_pci_disable() and define
>>> that the user needs to re-establish interrupts before continuing after
>>> an error event?  Thanks,
>> If user invoked the vfio_pci_disable by ioctl function.
>
> I'm in no way suggesting that a user invoke vfio_pci_disable(), I'm just
> trying to point out that vfio_pci_disable() already does a teardown of
> interrupts, similar to what seems to be required here.
>
>> Yes, user should re-establish interrupts before
>> continuing after an error event.
>
> So if we define that users should re-establish interrupts after an
> error event, then what's the point of only doing command register
> masking of the interrupts and requiring the user to both tear-down the
> interrupts and re-establish them?  Thanks,
Take "Intel(R) 10GbE PCI Express Virtual Function" as an example.
In drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
static const struct pci_error_handlers ixgbevf_err_handler = {
         .error_detected = ixgbevf_io_error_detected,
         .slot_reset = ixgbevf_io_slot_reset,
         .resume = ixgbevf_io_resume,
};
User tear-down the interrupts in ixgbevf_io_error_detected function.
And up the interrupts in ixgbevf_io_resume.

Guest OS driver will do both tear-down the interrupts
and re-establish them.
Because it don't know what host vfio driver has done.

I disable the interrupts to pretend them interfere with device reset.

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-12  1:42                                                               ` Zhou Jie
@ 2016-07-12 15:45                                                                 ` Alex Williamson
  2016-07-13  1:04                                                                   ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-07-12 15:45 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Tue, 12 Jul 2016 09:42:21 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> >>> The variable clearly isn't visible to the user, so the user can know
> >>> whether the kernel supports this feature, but not whether the feature
> >>> is currently active.  Perhaps there's no way to avoid races completely,
> >>> but don't you expect that if we define that certain operations are
> >>> blocked after an error notification that a user may want some way to
> >>> poll for whether the block is active after a reset rather than simply
> >>> calling a blocked interface to probe for it?  
> >> Yes, I will use access blocked function, not the variable.  
> >
> > I don't understand what this means.  
> I will use workable state support flag
> to let user know whether the kenerl support block feature.
> And make configure space writing and ioctl function blocked.

And what of my suggestion that a user may desire to poll the state of
the device?
 
> > Is that acceptable?  This is part of the problem I have with silently
> > disabling interrupt delivery via the command register across reset.  It
> > seems more non-deterministic than properly disabling interrupts and
> > requiring the user to reinitialize them after error.  
> What is "properly disabling interrupts"?

We've discussed this, it's tearing down the eventfds and the irq
handlers using the same mechanism as used in vfio_pci_disable().

> User don't know what vfio driver has done.
> What's the difference of "disabling interrupt delivery via
> the command register" and "doing a teardown of interrupts"?
> The interrupts will still lost silently.

A user does know what the vfio driver has done if you define the
behavior that on an AER error reported event, as signaled to the user
via the error notification interrupt, vfio-pci will teardown device
interrupts to an uninitialized state.  The difference between the
command register approach you suggest and the teardown I suggest is
that the command register is simply masking interrupt deliver while the
teardown approach returns the device to an uninitialized interrupt
state.  Take a look at the device state when a bus reset occurs, what
state is saved and restored and what is left at a default PCI value.
The command register is saved and restored, so any manipulation we do
of it is racing the host kernel AER handling and bus reset.  What about
MSI and MSI-X?  Looks to me like those are left at the PCI default
initialization state, so now after an AER error we have irq handlers
and eventfds configured, while in fact the device has been
de-programmed.  To handle that we're expecting users to teardown the
interrupt state and re-establish it?  Again, why not just teardown the
interrupt state ourselves?  I dont' see the value in simply masking the
command register, especially when it doesn't handle the no-DisINTx case.

> >>> Or does it make more sense to
> >>> simply disable the interrupts as done in vfio_pci_disable() and define
> >>> that the user needs to re-establish interrupts before continuing after
> >>> an error event?  Thanks,  
> >> If user invoked the vfio_pci_disable by ioctl function.  
> >
> > I'm in no way suggesting that a user invoke vfio_pci_disable(), I'm just
> > trying to point out that vfio_pci_disable() already does a teardown of
> > interrupts, similar to what seems to be required here.
> >  
> >> Yes, user should re-establish interrupts before
> >> continuing after an error event.  
> >
> > So if we define that users should re-establish interrupts after an
> > error event, then what's the point of only doing command register
> > masking of the interrupts and requiring the user to both tear-down the
> > interrupts and re-establish them?  Thanks,  
> Take "Intel(R) 10GbE PCI Express Virtual Function" as an example.
> In drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> static const struct pci_error_handlers ixgbevf_err_handler = {
>          .error_detected = ixgbevf_io_error_detected,
>          .slot_reset = ixgbevf_io_slot_reset,
>          .resume = ixgbevf_io_resume,
> };
> User tear-down the interrupts in ixgbevf_io_error_detected function.
> And up the interrupts in ixgbevf_io_resume.
> 
> Guest OS driver will do both tear-down the interrupts
> and re-establish them.
> Because it don't know what host vfio driver has done.
> 
> I disable the interrupts to pretend them interfere with device reset.

We cannot depend on the behavior of any given driver and the fact that
the guest driver may teardown interrupts anyway is not a justification
that vfio shouldn't be doing this to make the device state presented to
the user consistent.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-12 15:45                                                                 ` Alex Williamson
@ 2016-07-13  1:04                                                                   ` Zhou Jie
  2016-07-13  2:54                                                                     ` Alex Williamson
  0 siblings, 1 reply; 38+ messages in thread
From: Zhou Jie @ 2016-07-13  1:04 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

>> I will use workable state support flag
>> to let user know whether the kenerl support block feature.
>> And make configure space writing and ioctl function blocked.
>
> And what of my suggestion that a user may desire to poll the state of
> the device?
I will also add a poll function to vfio_fops.

> A user does know what the vfio driver has done if you define the
> behavior that on an AER error reported event, as signaled to the user
> via the error notification interrupt, vfio-pci will teardown device
> interrupts to an uninitialized state.  The difference between the
> command register approach you suggest and the teardown I suggest is
> that the command register is simply masking interrupt deliver while the
> teardown approach returns the device to an uninitialized interrupt
> state.  Take a look at the device state when a bus reset occurs, what
> state is saved and restored and what is left at a default PCI value.
> The command register is saved and restored, so any manipulation we do
> of it is racing the host kernel AER handling and bus reset.  What about
> MSI and MSI-X?  Looks to me like those are left at the PCI default
> initialization state, so now after an AER error we have irq handlers
> and eventfds configured, while in fact the device has been
> de-programmed.  To handle that we're expecting users to teardown the
> interrupt state and re-establish it?  Again, why not just teardown the
> interrupt state ourselves?  I dont' see the value in simply masking the
> command register, especially when it doesn't handle the no-DisINTx case.
I understand.
Thank you very much to explain this to me.
I will teardown the interrupt state.

> We cannot depend on the behavior of any given driver and the fact that
> the guest driver may teardown interrupts anyway is not a justification
> that vfio shouldn't be doing this to make the device state presented to
> the user consistent.  Thanks,
I understand.

The following code will be modified.
1. vfio_pci_ioctl
    add a flag in vfio_device_info for workable_state support
2. vfio_pci_ioctl
    During err occurs and resume:
    if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
    || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
    block for workable_state clearing
3. vfio_pci_write
    During err occurs and resume:
    block write to configure space
4. vfio_pci_aer_err_detected
    Set workable_state to false in "struct vfio_pci_device"
    teardown the interrupt
5. vfio_pci_aer_resume
    Set workable_state to true in "struct vfio_pci_device"
6. vfio_fops
    Add poll function

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-13  1:04                                                                   ` Zhou Jie
@ 2016-07-13  2:54                                                                     ` Alex Williamson
  2016-07-13  3:33                                                                       ` Zhou Jie
  0 siblings, 1 reply; 38+ messages in thread
From: Alex Williamson @ 2016-07-13  2:54 UTC (permalink / raw)
  To: Zhou Jie; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

On Wed, 13 Jul 2016 09:04:16 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> >> I will use workable state support flag
> >> to let user know whether the kenerl support block feature.
> >> And make configure space writing and ioctl function blocked.  
> >
> > And what of my suggestion that a user may desire to poll the state of
> > the device?  
> I will also add a poll function to vfio_fops.

Can you explain how this will work?  I was only suggesting that one of
the flag bits in vfio_device_info be allocated to report the current
state of blocking and the user could poll by repeatedly calling the
DEVICE_INFO ioctl.  Are you thinking of using POLLOUT/POLLIN?  I'm not
sure if those are a perfect match since it's really only the PCI config
region and a few ioctls where access is blocked, other operations may
proceed normally.
 
> > A user does know what the vfio driver has done if you define the
> > behavior that on an AER error reported event, as signaled to the user
> > via the error notification interrupt, vfio-pci will teardown device
> > interrupts to an uninitialized state.  The difference between the
> > command register approach you suggest and the teardown I suggest is
> > that the command register is simply masking interrupt deliver while the
> > teardown approach returns the device to an uninitialized interrupt
> > state.  Take a look at the device state when a bus reset occurs, what
> > state is saved and restored and what is left at a default PCI value.
> > The command register is saved and restored, so any manipulation we do
> > of it is racing the host kernel AER handling and bus reset.  What about
> > MSI and MSI-X?  Looks to me like those are left at the PCI default
> > initialization state, so now after an AER error we have irq handlers
> > and eventfds configured, while in fact the device has been
> > de-programmed.  To handle that we're expecting users to teardown the
> > interrupt state and re-establish it?  Again, why not just teardown the
> > interrupt state ourselves?  I dont' see the value in simply masking the
> > command register, especially when it doesn't handle the no-DisINTx case.  
> I understand.
> Thank you very much to explain this to me.
> I will teardown the interrupt state.
> 
> > We cannot depend on the behavior of any given driver and the fact that
> > the guest driver may teardown interrupts anyway is not a justification
> > that vfio shouldn't be doing this to make the device state presented to
> > the user consistent.  Thanks,  
> I understand.
> 
> The following code will be modified.
> 1. vfio_pci_ioctl
>     add a flag in vfio_device_info for workable_state support
> 2. vfio_pci_ioctl
>     During err occurs and resume:
>     if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
>     || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
>     block for workable_state clearing
> 3. vfio_pci_write
>     During err occurs and resume:
>     block write to configure space
> 4. vfio_pci_aer_err_detected
>     Set workable_state to false in "struct vfio_pci_device"
>     teardown the interrupt
> 5. vfio_pci_aer_resume
>     Set workable_state to true in "struct vfio_pci_device"
> 6. vfio_fops
>     Add poll function

I would still suggest that the name "workable_state" is quite vague.
Something like aer_error_in_progress is much more specific.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
  2016-07-13  2:54                                                                     ` Alex Williamson
@ 2016-07-13  3:33                                                                       ` Zhou Jie
  0 siblings, 0 replies; 38+ messages in thread
From: Zhou Jie @ 2016-07-13  3:33 UTC (permalink / raw)
  To: Alex Williamson; +Cc: izumi.taku, caoj.fnst, Chen Fan, qemu-devel, mst

Hi Alex,

>>>> I will use workable state support flag
>>>> to let user know whether the kenerl support block feature.
>>>> And make configure space writing and ioctl function blocked.
>>>
>>> And what of my suggestion that a user may desire to poll the state of
>>> the device?
>> I will also add a poll function to vfio_fops.
>
> Can you explain how this will work?  I was only suggesting that one of
> the flag bits in vfio_device_info be allocated to report the current
> state of blocking and the user could poll by repeatedly calling the
> DEVICE_INFO ioctl.  Are you thinking of using POLLOUT/POLLIN?  I'm not
> sure if those are a perfect match since it's really only the PCI config
> region and a few ioctls where access is blocked, other operations may
> proceed normally.
Sorry, I had misunderstood your meaning.
I will use one of the flag bits in vfio_device_info to
  report the current state of blocking.

>> The following code will be modified.
>> 1. vfio_pci_ioctl
>>     add a flag in vfio_device_info for workable_state support
>> 2. vfio_pci_ioctl
>>     During err occurs and resume:
>>     if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
>>     || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
>>     block for workable_state clearing
>> 3. vfio_pci_write
>>     During err occurs and resume:
>>     block write to configure space
>> 4. vfio_pci_aer_err_detected
>>     Set workable_state to false in "struct vfio_pci_device"
>>     teardown the interrupt
>> 5. vfio_pci_aer_resume
>>     Set workable_state to true in "struct vfio_pci_device"
>> 6. vfio_fops
>>     Add poll function
>
> I would still suggest that the name "workable_state" is quite vague.
> Something like aer_error_in_progress is much more specific.  Thanks,
OK, I will alter the name.

Sincerely
Zhou Jie

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2016-07-13  3:33 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-27  2:12 [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume Zhou Jie
2016-05-27 16:06 ` Alex Williamson
2016-06-12  2:38   ` Zhou Jie
2016-06-20  7:41     ` Zhou Jie
2016-06-20 16:32       ` Alex Williamson
2016-06-21  2:16         ` Zhou Jie
2016-06-21  3:13           ` Alex Williamson
2016-06-21 12:41             ` Chen Fan
2016-06-21 14:44               ` Alex Williamson
2016-06-22  3:28                 ` Zhou Jie
2016-06-22  3:56                   ` Alex Williamson
2016-06-22  5:45                     ` Zhou Jie
2016-06-22  7:49                       ` Zhou Jie
2016-06-22 15:42                         ` Alex Williamson
2016-06-25  1:24                           ` Zhou Jie
2016-06-27 15:54                             ` Alex Williamson
2016-06-28  3:26                               ` Zhou Jie
2016-06-28  3:58                                 ` Alex Williamson
2016-06-28  5:27                                   ` Zhou Jie
2016-06-28 14:40                                     ` Alex Williamson
2016-06-29  8:54                                       ` Zhou Jie
2016-06-29 18:22                                         ` Alex Williamson
2016-06-30  1:45                                           ` Zhou Jie
2016-07-03  4:00                                             ` Zhou Jie
2016-07-05  1:36                                               ` Zhou Jie
2016-07-05 17:03                                                 ` Alex Williamson
2016-07-06  2:01                                                   ` Zhou Jie
2016-07-07 19:04                                                     ` Alex Williamson
2016-07-08  1:38                                                       ` Zhou Jie
2016-07-08 17:33                                                         ` Alex Williamson
2016-07-10  1:28                                                           ` Zhou Jie
2016-07-11 16:24                                                             ` Alex Williamson
2016-07-12  1:42                                                               ` Zhou Jie
2016-07-12 15:45                                                                 ` Alex Williamson
2016-07-13  1:04                                                                   ` Zhou Jie
2016-07-13  2:54                                                                     ` Alex Williamson
2016-07-13  3:33                                                                       ` Zhou Jie
2016-06-22 15:25                       ` Alex Williamson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.