* [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
@ 2018-04-10  6:03 Yulei Zhang
  2018-04-16 14:44 ` Kirti Wankhede
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Yulei Zhang @ 2018-04-10  6:03 UTC (permalink / raw)
  To: qemu-devel
  Cc: kevin.tian, joonas.lahtinen, zhenyuw, kwankhede, zhi.a.wang,
	alex.williamson, dgilbert, quintela, Yulei Zhang

Instead of using vm state description, add SaveVMHandlers for VFIO
device to support live migration.

Introduce a new ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the memory
bitmap dirtied by the vfio device during the iterative pre-copy stage,
to shorten the system downtime afterward.

For vfio pci device status migration, during the system downtime it will
save the following states:
1. pci configuration space addr0~addr5
2. pci configuration space msi_addr msi_data
3. pci device status fetched from the device driver

And on the target side vfio_load will restore the same states:
1. re-setup the pci bar configuration
2. re-setup the pci device msi configuration
3. restore the pci device status

Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
---
 hw/vfio/pci.c              | 195 +++++++++++++++++++++++++++++++++++++++++++--
 linux-headers/linux/vfio.h |  14 ++++
 2 files changed, 204 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 13d8c73..ac6a9c7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -33,9 +33,14 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/blocker.h"
+#include "migration/register.h"
+#include "exec/ram_addr.h"
 
 #define MSIX_CAP_LENGTH 12
 
+#define VFIO_SAVE_FLAG_SETUP 0
+#define VFIO_SAVE_FLAG_DEV_STATE 1
+
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 static void vfio_vm_change_state_handler(void *pv, int running, RunState state);
@@ -2639,6 +2644,190 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static uint64_t vfio_dirty_log_sync(VFIOPCIDevice *vdev)
+{
+    RAMBlock *block;
+    struct vfio_device_get_dirty_bitmap *d;
+    uint64_t page = 0;
+    ram_addr_t size;
+    unsigned long nr, bitmap;
+
+    RAMBLOCK_FOREACH(block) {
+        size = block->used_length;
+        nr = size >> TARGET_PAGE_BITS;
+        bitmap = (BITS_TO_LONGS(nr) + 1) * sizeof(unsigned long);
+        d = g_malloc0(sizeof(*d) +  bitmap);
+        d->start_addr = block->offset;
+        d->page_nr = nr;
+        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_DIRTY_BITMAP, d)) {
+            error_report("vfio: Failed to get device dirty bitmap");
+            g_free(d);
+            goto exit;
+        }
+
+        if (d->page_nr) {
+            cpu_physical_memory_set_dirty_lebitmap(
+                                 (unsigned long *)&d->dirty_bitmap,
+                                 d->start_addr, d->page_nr);
+            page += d->page_nr;
+        }
+        g_free(d);
+    }
+
+exit:
+    return page;
+}
+
+static void vfio_save_live_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+                   uint64_t *non_postcopiable_pending,
+                   uint64_t *postcopiable_pending)
+{
+    VFIOPCIDevice *vdev = opaque;
+    uint64_t pending;
+
+    qemu_mutex_lock_iothread();
+    rcu_read_lock();
+    pending = vfio_dirty_log_sync(vdev);
+    rcu_read_unlock();
+    qemu_mutex_unlock_iothread();
+    *non_postcopiable_pending += pending;
+}
+
+static int vfio_load(QEMUFile *f, void *opaque, int version_id)
+{
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *pdev = &vdev->pdev;
+    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
+    uint8_t *buf = NULL;
+    uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
+    bool msi_64bit;
+
+    if (qemu_get_byte(f) == VFIO_SAVE_FLAG_SETUP) {
+        goto exit;
+    }
+
+    /* retore pci bar configuration */
+    ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
+    vfio_pci_write_config(pdev, PCI_COMMAND,
+                          ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        bar_cfg = qemu_get_be32(f);
+        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4);
+    }
+    vfio_pci_write_config(pdev, PCI_COMMAND,
+                          ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
+
+    /* restore msi configuration */
+    ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+    msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
+
+    vfio_pci_write_config(&vdev->pdev,
+                          pdev->msi_cap + PCI_MSI_FLAGS,
+                          ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
+
+    msi_lo = qemu_get_be32(f);
+    vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
+
+    if (msi_64bit) {
+        msi_hi = qemu_get_be32(f);
+        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+                              msi_hi, 4);
+    }
+    msi_data = qemu_get_be32(f);
+    vfio_pci_write_config(pdev,
+              pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+              msi_data, 2);
+
+    vfio_pci_write_config(&vdev->pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+                          ctl | PCI_MSI_FLAGS_ENABLE, 2);
+
+    buf = g_malloc(sz);
+    if (buf == NULL) {
+        error_report("vfio: Failed to allocate memory for migrate");
+        return -1;
+    }
+
+    qemu_get_buffer(f, buf, sz);
+    if (pwrite(vdev->vbasedev.fd, buf, sz,
+               vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
+        error_report("vfio: Failed to write Device State Region");
+        return -1;
+    }
+
+    g_free(buf);
+
+exit:
+    return 0;
+}
+
+static int vfio_save_complete(QEMUFile *f, void *opaque)
+{
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *pdev = &vdev->pdev;
+    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
+    uint8_t *buf = NULL;
+    uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i;
+    bool msi_64bit;
+
+    qemu_put_byte(f, VFIO_SAVE_FLAG_DEV_STATE);
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+        qemu_put_be32(f, bar_cfg);
+    }
+
+    msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+    msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT);
+
+    msi_lo = pci_default_read_config(pdev,
+                                     pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
+    qemu_put_be32(f, msi_lo);
+
+    if (msi_64bit) {
+        msi_hi = pci_default_read_config(pdev,
+                                         pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+                                         4);
+        qemu_put_be32(f, msi_hi);
+    }
+
+    msi_data = pci_default_read_config(pdev,
+               pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+               2);
+    qemu_put_be32(f, msi_data);
+
+    buf = g_malloc(sz);
+    if (buf == NULL) {
+        error_report("vfio: Failed to allocate memory for migrate");
+        goto exit;
+    }
+
+    if (pread(vdev->vbasedev.fd, buf, sz,
+              vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
+        error_report("vfio: Failed to read Device State Region");
+        goto exit;
+    }
+
+    qemu_put_buffer(f, buf, sz);
+
+exit:
+    g_free(buf);
+
+    return 0;
+}
+
+static int vfio_save_setup(QEMUFile *f, void *opaque)
+{
+    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
+    return 0;
+}
+
+static SaveVMHandlers savevm_vfio_handlers = {
+    .save_setup = vfio_save_setup,
+    .save_live_pending = vfio_save_live_pending,
+    .save_live_complete_precopy = vfio_save_complete,
+    .load_state = vfio_load,
+};
+
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
@@ -2898,6 +3087,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
     qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
+    register_savevm_live(NULL, "vfio-pci", 0, 1, &savevm_vfio_handlers, vdev);
 
     return;
 
@@ -3051,10 +3241,6 @@ static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
-static const VMStateDescription vfio_pci_vmstate = {
-    .name = "vfio-pci",
-};
-
 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -3062,7 +3248,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
 
     dc->reset = vfio_pci_reset;
     dc->props = vfio_pci_dev_properties;
-    dc->vmsd = &vfio_pci_vmstate;
     dc->desc = "VFIO-based PCI device assignment";
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     pdc->realize = vfio_realize;
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 8f02f2f..2c911d9 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -512,6 +512,20 @@ struct vfio_pci_hot_reset {
 
 #define VFIO_DEVICE_PCI_HOT_RESET	_IO(VFIO_TYPE, VFIO_BASE + 13)
 
+/**
+ * VFIO_DEVICE_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 14,
+ *				    struct vfio_device_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_get_dirty_bitmap {
+	__u64	       start_addr;
+	__u64	       page_nr;
+	__u8           dirty_bitmap[];
+};
+
+#define VFIO_DEVICE_GET_DIRTY_BITMAP	_IO(VFIO_TYPE, VFIO_BASE + 14)
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.7.4


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-10  6:03 [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration Yulei Zhang
@ 2018-04-16 14:44 ` Kirti Wankhede
  2018-04-17  8:01   ` Zhang, Yulei
  2018-04-17 19:25   ` Eric Blake
  2018-04-16 20:37 ` Alex Williamson
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 12+ messages in thread
From: Kirti Wankhede @ 2018-04-16 14:44 UTC (permalink / raw)
  To: Yulei Zhang, qemu-devel
  Cc: kevin.tian, joonas.lahtinen, zhenyuw, zhi.a.wang,
	alex.williamson, dgilbert, quintela



On 4/10/2018 11:33 AM, Yulei Zhang wrote:
> Instead of using vm state description, add SaveVMHandlers for VFIO
> device to support live migration.
> 
> Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the memory
> bitmap that dirtied by vfio device during the iterative precopy stage
> to shorten the system downtime afterward.
> 
> For vfio pci device status migrate, during the system downtime, it will
> save the following states
> 1. pci configuration space addr0~addr5
> 2. pci configuration space msi_addr msi_data
> 3. pci device status fetch from device driver
> 
> And on the target side the vfio_load will restore the same states
> 1. re-setup the pci bar configuration
> 2. re-setup the pci device msi configuration
> 3. restore the pci device status
> 

Can #1 and #2 be set up by the vendor driver? The vendor driver knows the
capabilities of the device and could do that setup itself, rather than
having the restore code in a common place that then has to handle all the
capability cases here.


> Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
> ---
>  hw/vfio/pci.c              | 195 +++++++++++++++++++++++++++++++++++++++++++--
>  linux-headers/linux/vfio.h |  14 ++++
>  2 files changed, 204 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 13d8c73..ac6a9c7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -33,9 +33,14 @@
>  #include "trace.h"
>  #include "qapi/error.h"
>  #include "migration/blocker.h"
> +#include "migration/register.h"
> +#include "exec/ram_addr.h"
>  
>  #define MSIX_CAP_LENGTH 12
>  
> +#define VFIO_SAVE_FLAG_SETUP 0
> +#define VFIO_SAVE_FLAG_DEV_STATE 1
> +
>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>  static void vfio_vm_change_state_handler(void *pv, int running, RunState state);
> @@ -2639,6 +2644,190 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>      vdev->req_enabled = false;
>  }
>  
> +static uint64_t vfio_dirty_log_sync(VFIOPCIDevice *vdev)
> +{
> +    RAMBlock *block;
> +    struct vfio_device_get_dirty_bitmap *d;
> +    uint64_t page = 0;
> +    ram_addr_t size;
> +    unsigned long nr, bitmap;
> +
> +    RAMBLOCK_FOREACH(block) {
> +        size = block->used_length;
> +        nr = size >> TARGET_PAGE_BITS;
> +        bitmap = (BITS_TO_LONGS(nr) + 1) * sizeof(unsigned long);
> +        d = g_malloc0(sizeof(*d) +  bitmap);
> +        d->start_addr = block->offset;
> +        d->page_nr = nr;
> +        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_DIRTY_BITMAP, d)) {
> +            error_report("vfio: Failed to get device dirty bitmap");
> +            g_free(d);
> +            goto exit;
> +        }
> +
> +        if (d->page_nr) {
> +            cpu_physical_memory_set_dirty_lebitmap(
> +                                 (unsigned long *)&d->dirty_bitmap,
> +                                 d->start_addr, d->page_nr);
> +            page += d->page_nr;
> +        }
> +        g_free(d);
> +    }
> +
> +exit:
> +    return page;
> +}
> +
> +static void vfio_save_live_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> +                   uint64_t *non_postcopiable_pending,
> +                   uint64_t *postcopiable_pending)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    uint64_t pending;
> +
> +    qemu_mutex_lock_iothread();
> +    rcu_read_lock();
> +    pending = vfio_dirty_log_sync(vdev);
> +    rcu_read_unlock();
> +    qemu_mutex_unlock_iothread();
> +    *non_postcopiable_pending += pending;
> +}
> +

This doesn't provide a way to read the device's state during the pre-copy
phase. The device_state region can be used to read device-specific data
from the vendor driver during pre-copy. That could be done by mmapping the
device_state region during init and then using an ioctl to the vendor
driver here to query the size of the data copied into the device_state
region and the size of the data still pending.
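
As a rough sketch of that idea (the VFIO_DEVICE_GET_MIGRATION_INFO ioctl,
its info fields and the mmapped migration_buf pointer are all hypothetical
here, not existing API):

/* hypothetical reply for a "how much did you stage?" query */
struct vfio_device_migration_info {
    __u64 staged_bytes;   /* valid data currently in the device_state region */
    __u64 pending_bytes;  /* data the vendor driver still has to produce */
};

static int vfio_save_live_iterate(QEMUFile *f, void *opaque)
{
    VFIOPCIDevice *vdev = opaque;
    struct vfio_device_migration_info info = {};

    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_MIGRATION_INFO, &info)) {
        return -errno;
    }

    /* device_state region mmap()ed into vdev->migration_buf at init */
    qemu_put_be64(f, info.staged_bytes);
    qemu_put_buffer(f, vdev->migration_buf, info.staged_bytes);

    /* returning > 0 tells the migration core this device has no more data */
    return info.pending_bytes == 0;
}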


> +static int vfio_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *pdev = &vdev->pdev;
> +    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
> +    uint8_t *buf = NULL;
> +    uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
> +    bool msi_64bit;
> +
> +    if (qemu_get_byte(f) == VFIO_SAVE_FLAG_SETUP) {
> +        goto exit;
> +    }
> +
> +    /* retore pci bar configuration */
> +    ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                          ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        bar_cfg = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4);
> +    }
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                          ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
> +
> +    /* restore msi configuration */
> +    ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +    msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
> +
> +    vfio_pci_write_config(&vdev->pdev,
> +                          pdev->msi_cap + PCI_MSI_FLAGS,
> +                          ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
> +
> +    msi_lo = qemu_get_be32(f);
> +    vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
> +
> +    if (msi_64bit) {
> +        msi_hi = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                              msi_hi, 4);
> +    }
> +    msi_data = qemu_get_be32(f);
> +    vfio_pci_write_config(pdev,
> +              pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +              msi_data, 2);
> +
> +    vfio_pci_write_config(&vdev->pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                          ctl | PCI_MSI_FLAGS_ENABLE, 2);
> +
> +    buf = g_malloc(sz);
> +    if (buf == NULL) {
> +        error_report("vfio: Failed to allocate memory for migrate");
> +        return -1;
> +    }
> +
> +    qemu_get_buffer(f, buf, sz);
> +    if (pwrite(vdev->vbasedev.fd, buf, sz,
> +               vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
> +        error_report("vfio: Failed to write Device State Region");
> +        return -1;
> +    }
> +

Again here, if the device_state region is mmapped, QEMU would write the
buffer into that region, and an ioctl can be used to convey that new data
has been written to the region, along with the size of that data.


> +    g_free(buf);
> +
> +exit:
> +    return 0;
> +}
> +
> +static int vfio_save_complete(QEMUFile *f, void *opaque)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *pdev = &vdev->pdev;
> +    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
> +    uint8_t *buf = NULL;
> +    uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i;
> +    bool msi_64bit;
> +
> +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEV_STATE);
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> +        qemu_put_be32(f, bar_cfg);
> +    }
> +
> +    msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +    msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT);
> +
> +    msi_lo = pci_default_read_config(pdev,
> +                                     pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> +    qemu_put_be32(f, msi_lo);
> +
> +    if (msi_64bit) {
> +        msi_hi = pci_default_read_config(pdev,
> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                         4);
> +        qemu_put_be32(f, msi_hi);
> +    }
> +
> +    msi_data = pci_default_read_config(pdev,
> +               pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +               2);
> +    qemu_put_be32(f, msi_data);
> +
> +    buf = g_malloc(sz);
> +    if (buf == NULL) {
> +        error_report("vfio: Failed to allocate memory for migrate");
> +        goto exit;
> +    }
> +
> +    if (pread(vdev->vbasedev.fd, buf, sz,
> +              vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
> +        error_report("vfio: Failed to read Device State Region");
> +        goto exit;
> +    }
> +

Again here, getting the buffer information from the vendor driver and
reading that data directly from the mmapped region would help.

> +    qemu_put_buffer(f, buf, sz);
> +
> +exit:
> +    g_free(buf);
> +
> +    return 0;
> +}
> +
> +static int vfio_save_setup(QEMUFile *f, void *opaque)
> +{
> +    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> +    return 0;
> +}
> +
> +static SaveVMHandlers savevm_vfio_handlers = {
> +    .save_setup = vfio_save_setup,
> +    .save_live_pending = vfio_save_live_pending,
> +    .save_live_complete_precopy = vfio_save_complete,
> +    .load_state = vfio_load,
> +};
> +

Aren't .is_active, .save_live_iterate and .cleanup required?
What if the vendor driver has a large amount of data in the device's
memory that it is aware of, and needs multiple iterations to send that
data to QEMU in order to save the complete state of the device?

Shouldn't there also be an ioctl to convey QEMU's MIGRATION_STATUS_* to
the vendor driver? Knowing the migration status would help the vendor
driver take the necessary action, e.g. MIGRATION_STATUS_SETUP,
MIGRATION_STATUS_CANCELLED, MIGRATION_STATUS_FAILED and
MIGRATION_STATUS_COMPLETED; I think these are important states that the
vendor driver should know about.
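
For comparison, a handler table with those callbacks wired up might look
roughly like the sketch below (the extra vfio_* functions are placeholders;
the field names follow the SaveVMHandlers structure this patch already
uses, e.g. .save_cleanup rather than a bare .cleanup):

static SaveVMHandlers savevm_vfio_handlers = {
    .save_setup                 = vfio_save_setup,
    .is_active                  = vfio_save_is_active,    /* skip if nothing to migrate */
    .save_live_pending          = vfio_save_live_pending,
    .save_live_iterate          = vfio_save_live_iterate, /* stream device data in chunks */
    .save_live_complete_precopy = vfio_save_complete,
    .save_cleanup               = vfio_save_cleanup,      /* vendor driver frees migration resources */
    .load_state                 = vfio_load,
};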

Thanks,
Kirti

>  static void vfio_realize(PCIDevice *pdev, Error **errp)
>  {
>      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> @@ -2898,6 +3087,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
>      qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
> +    register_savevm_live(NULL, "vfio-pci", 0, 1, &savevm_vfio_handlers, vdev);
>  
>      return;
>  
> @@ -3051,10 +3241,6 @@ static Property vfio_pci_dev_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> -static const VMStateDescription vfio_pci_vmstate = {
> -    .name = "vfio-pci",
> -};
> -
>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
> @@ -3062,7 +3248,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  
>      dc->reset = vfio_pci_reset;
>      dc->props = vfio_pci_dev_properties;
> -    dc->vmsd = &vfio_pci_vmstate;
>      dc->desc = "VFIO-based PCI device assignment";
>      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>      pdc->realize = vfio_realize;
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 8f02f2f..2c911d9 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -512,6 +512,20 @@ struct vfio_pci_hot_reset {
>  
>  #define VFIO_DEVICE_PCI_HOT_RESET	_IO(VFIO_TYPE, VFIO_BASE + 13)
>  
> +/**
> + * VFIO_DEVICE_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 14,
> + *				    struct vfio_device_get_dirty_bitmap)
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_get_dirty_bitmap {
> +	__u64	       start_addr;
> +	__u64	       page_nr;
> +	__u8           dirty_bitmap[];
> +};
> +
> +#define VFIO_DEVICE_GET_DIRTY_BITMAP	_IO(VFIO_TYPE, VFIO_BASE + 14)
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**
> 


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-10  6:03 [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration Yulei Zhang
  2018-04-16 14:44 ` Kirti Wankhede
@ 2018-04-16 20:37 ` Alex Williamson
  2018-04-17  8:11   ` Zhang, Yulei
  2018-04-17 19:19 ` Dr. David Alan Gilbert
  2018-05-10 11:51 ` Juan Quintela
  3 siblings, 1 reply; 12+ messages in thread
From: Alex Williamson @ 2018-04-16 20:37 UTC (permalink / raw)
  To: Yulei Zhang
  Cc: qemu-devel, kevin.tian, joonas.lahtinen, zhenyuw, kwankhede,
	zhi.a.wang, dgilbert, quintela

On Tue, 10 Apr 2018 14:03:13 +0800
Yulei Zhang <yulei.zhang@intel.com> wrote:

> Instead of using vm state description, add SaveVMHandlers for VFIO
> device to support live migration.
> 
> Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the memory
> bitmap that dirtied by vfio device during the iterative precopy stage
> to shorten the system downtime afterward.
> 
> For vfio pci device status migrate, during the system downtime, it will
> save the following states
> 1. pci configuration space addr0~addr5
> 2. pci configuration space msi_addr msi_data
> 3. pci device status fetch from device driver
> 
> And on the target side the vfio_load will restore the same states
> 1. re-setup the pci bar configuration
> 2. re-setup the pci device msi configuration
> 3. restore the pci device status

Interrupts are configured via ioctl, but I don't see any code here to
configure the device into the correct interrupt state.  How do we know
the target device is compatible with the source device?  Do we rely on
the vendor driver to implicitly include some kind of device and version
information and fail at the very end of the migration?  It seems like
we need to somehow front-load that sort of device compatibility
checking since a vfio-pci device can be anything (ex. what happens if
a user tries to migrate a GVT-g vGPU to an NVIDIA vGPU?).  Thanks,

Alex


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-16 14:44 ` Kirti Wankhede
@ 2018-04-17  8:01   ` Zhang, Yulei
  2018-04-17 19:41     ` Kirti Wankhede
  2018-04-17 19:25   ` Eric Blake
  1 sibling, 1 reply; 12+ messages in thread
From: Zhang, Yulei @ 2018-04-17  8:01 UTC (permalink / raw)
  To: Kirti Wankhede, qemu-devel
  Cc: Tian, Kevin, joonas.lahtinen, zhenyuw, Wang, Zhi A,
	alex.williamson, dgilbert, quintela



> -----Original Message-----
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Monday, April 16, 2018 10:45 PM
> To: Zhang, Yulei <yulei.zhang@intel.com>; qemu-devel@nongnu.org
> Cc: Tian, Kevin <kevin.tian@intel.com>; joonas.lahtinen@linux.intel.com;
> zhenyuw@linux.intel.com; Wang, Zhi A <zhi.a.wang@intel.com>;
> alex.williamson@redhat.com; dgilbert@redhat.com; quintela@redhat.com
> Subject: Re: [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device
> to support live migration
> 
> 
> 
> On 4/10/2018 11:33 AM, Yulei Zhang wrote:
> > Instead of using vm state description, add SaveVMHandlers for VFIO
> > device to support live migration.
> >
> > Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the
> memory
> > bitmap that dirtied by vfio device during the iterative precopy stage
> > to shorten the system downtime afterward.
> >
> > For vfio pci device status migrate, during the system downtime, it
> > will save the following states 1. pci configuration space addr0~addr5
> > 2. pci configuration space msi_addr msi_data 3. pci device status
> > fetch from device driver
> >
> > And on the target side the vfio_load will restore the same states 1.
> > re-setup the pci bar configuration 2. re-setup the pci device msi
> > configuration 3. restore the pci device status
> >
> 
> Can #1 and #2 be setup by vendor driver? Vendor driver knows capabilities
> of device and vendor driver can setup rather than have the restore code in
> common place and then handle all the capability cases here.
> 
We need vfio to help set up the interrupt path.

> 
> > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
> > ---
> >  hw/vfio/pci.c              | 195
> +++++++++++++++++++++++++++++++++++++++++++--
> >  linux-headers/linux/vfio.h |  14 ++++
> >  2 files changed, 204 insertions(+), 5 deletions(-)
> >
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 13d8c73..ac6a9c7
> > 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -33,9 +33,14 @@
> >  #include "trace.h"
> >  #include "qapi/error.h"
> >  #include "migration/blocker.h"
> > +#include "migration/register.h"
> > +#include "exec/ram_addr.h"
> >
> >  #define MSIX_CAP_LENGTH 12
> >
> > +#define VFIO_SAVE_FLAG_SETUP 0
> > +#define VFIO_SAVE_FLAG_DEV_STATE 1
> > +
> >  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);  static
> > void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);  static
> > void vfio_vm_change_state_handler(void *pv, int running, RunState
> > state); @@ -2639,6 +2644,190 @@ static void
> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> >      vdev->req_enabled = false;
> >  }
> >
> > +static uint64_t vfio_dirty_log_sync(VFIOPCIDevice *vdev) {
> > +    RAMBlock *block;
> > +    struct vfio_device_get_dirty_bitmap *d;
> > +    uint64_t page = 0;
> > +    ram_addr_t size;
> > +    unsigned long nr, bitmap;
> > +
> > +    RAMBLOCK_FOREACH(block) {
> > +        size = block->used_length;
> > +        nr = size >> TARGET_PAGE_BITS;
> > +        bitmap = (BITS_TO_LONGS(nr) + 1) * sizeof(unsigned long);
> > +        d = g_malloc0(sizeof(*d) +  bitmap);
> > +        d->start_addr = block->offset;
> > +        d->page_nr = nr;
> > +        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_DIRTY_BITMAP, d)) {
> > +            error_report("vfio: Failed to get device dirty bitmap");
> > +            g_free(d);
> > +            goto exit;
> > +        }
> > +
> > +        if (d->page_nr) {
> > +            cpu_physical_memory_set_dirty_lebitmap(
> > +                                 (unsigned long *)&d->dirty_bitmap,
> > +                                 d->start_addr, d->page_nr);
> > +            page += d->page_nr;
> > +        }
> > +        g_free(d);
> > +    }
> > +
> > +exit:
> > +    return page;
> > +}
> > +
> > +static void vfio_save_live_pending(QEMUFile *f, void *opaque, uint64_t
> max_size,
> > +                   uint64_t *non_postcopiable_pending,
> > +                   uint64_t *postcopiable_pending) {
> > +    VFIOPCIDevice *vdev = opaque;
> > +    uint64_t pending;
> > +
> > +    qemu_mutex_lock_iothread();
> > +    rcu_read_lock();
> > +    pending = vfio_dirty_log_sync(vdev);
> > +    rcu_read_unlock();
> > +    qemu_mutex_unlock_iothread();
> > +    *non_postcopiable_pending += pending; }
> > +
> 
> This doesn't provide a way to read device's state during pre-copy phase.
> device_state region can be used to read device specific data from vendor
> driver during pre-copy phase. That could be done by mmap device_state
> region during init and then ioctl vendor driver here to query size of data
> copied to device_state region and pending data size.
> 

Maybe the size could be placed in the device state region instead of
adding an ioctl.
> 
> > +static int vfio_load(QEMUFile *f, void *opaque, int version_id) {
> > +    VFIOPCIDevice *vdev = opaque;
> > +    PCIDevice *pdev = &vdev->pdev;
> > +    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
> > +    uint8_t *buf = NULL;
> > +    uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
> > +    bool msi_64bit;
> > +
> > +    if (qemu_get_byte(f) == VFIO_SAVE_FLAG_SETUP) {
> > +        goto exit;
> > +    }
> > +
> > +    /* retore pci bar configuration */
> > +    ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
> > +    vfio_pci_write_config(pdev, PCI_COMMAND,
> > +                          ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)),
> 2);
> > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > +        bar_cfg = qemu_get_be32(f);
> > +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4);
> > +    }
> > +    vfio_pci_write_config(pdev, PCI_COMMAND,
> > +                          ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY,
> > + 2);
> > +
> > +    /* restore msi configuration */
> > +    ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> 2);
> > +    msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
> > +
> > +    vfio_pci_write_config(&vdev->pdev,
> > +                          pdev->msi_cap + PCI_MSI_FLAGS,
> > +                          ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
> > +
> > +    msi_lo = qemu_get_be32(f);
> > +    vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> > + msi_lo, 4);
> > +
> > +    if (msi_64bit) {
> > +        msi_hi = qemu_get_be32(f);
> > +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > +                              msi_hi, 4);
> > +    }
> > +    msi_data = qemu_get_be32(f);
> > +    vfio_pci_write_config(pdev,
> > +              pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 :
> PCI_MSI_DATA_32),
> > +              msi_data, 2);
> > +
> > +    vfio_pci_write_config(&vdev->pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > +                          ctl | PCI_MSI_FLAGS_ENABLE, 2);
> > +
> > +    buf = g_malloc(sz);
> > +    if (buf == NULL) {
> > +        error_report("vfio: Failed to allocate memory for migrate");
> > +        return -1;
> > +    }
> > +
> > +    qemu_get_buffer(f, buf, sz);
> > +    if (pwrite(vdev->vbasedev.fd, buf, sz,
> > +               vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
> > +        error_report("vfio: Failed to write Device State Region");
> > +        return -1;
> > +    }
> > +
> 
> Again here if device_state region in mmaped, QEMU would write buffer to
> that region and an ioclt can be used to convey that new data is written to
> the region along with size of data.
> 
> 
> > +    g_free(buf);
> > +
> > +exit:
> > +    return 0;
> > +}
> > +
> > +static int vfio_save_complete(QEMUFile *f, void *opaque) {
> > +    VFIOPCIDevice *vdev = opaque;
> > +    PCIDevice *pdev = &vdev->pdev;
> > +    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
> > +    uint8_t *buf = NULL;
> > +    uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i;
> > +    bool msi_64bit;
> > +
> > +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEV_STATE);
> > +
> > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > +        bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i *
> 4, 4);
> > +        qemu_put_be32(f, bar_cfg);
> > +    }
> > +
> > +    msi_cfg = pci_default_read_config(pdev, pdev->msi_cap +
> PCI_MSI_FLAGS, 2);
> > +    msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT);
> > +
> > +    msi_lo = pci_default_read_config(pdev,
> > +                                     pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> > +    qemu_put_be32(f, msi_lo);
> > +
> > +    if (msi_64bit) {
> > +        msi_hi = pci_default_read_config(pdev,
> > +                                         pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > +                                         4);
> > +        qemu_put_be32(f, msi_hi);
> > +    }
> > +
> > +    msi_data = pci_default_read_config(pdev,
> > +               pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 :
> PCI_MSI_DATA_32),
> > +               2);
> > +    qemu_put_be32(f, msi_data);
> > +
> > +    buf = g_malloc(sz);
> > +    if (buf == NULL) {
> > +        error_report("vfio: Failed to allocate memory for migrate");
> > +        goto exit;
> > +    }
> > +
> > +    if (pread(vdev->vbasedev.fd, buf, sz,
> > +              vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
> > +        error_report("vfio: Failed to read Device State Region");
> > +        goto exit;
> > +    }
> > +
> 
> Again here getting buffer information from vendor driver and directly
> reading that data from mmaped region would help.
> 
> > +    qemu_put_buffer(f, buf, sz);
> > +
> > +exit:
> > +    g_free(buf);
> > +
> > +    return 0;
> > +}
> > +
> > +static int vfio_save_setup(QEMUFile *f, void *opaque) {
> > +    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> > +    return 0;
> > +}
> > +
> > +static SaveVMHandlers savevm_vfio_handlers = {
> > +    .save_setup = vfio_save_setup,
> > +    .save_live_pending = vfio_save_live_pending,
> > +    .save_live_complete_precopy = vfio_save_complete,
> > +    .load_state = vfio_load,
> > +};
> > +
> 
> Isn't .is_active, .save_live_iterate and .cleanup required?
> What is vendor driver have large amount of data in device's memory which
> vendor driver is aware of? and vendor driver would required multiple
> iterations to send that data to QEMU to save complete state of device?
> 
I suppose the vendor driver will copy the device's memory to the device
region iteratively, and let qemu read from the device region and transfer
the data to the target side in the pre-copy stage, doesn't it?

> Isn't there should be an ioclt to convey QEMU's MIGRATION_STATUS_* to
> vendor driver? Knowing migration status would help vendor driver take
> necessary action, like MIGRATION_STATUS_SETUP,
> MIGRATION_STATUS_CANCELLED, MIGRATION_STATUS_FAILED,
> MIGRATION_STATUS_COMPLETED, I think these are important states that
> vendor driver should know.
> 
Good point.

> Thanks,
> Kirti
> 
> >  static void vfio_realize(PCIDevice *pdev, Error **errp)  {
> >      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); @@
> > -2898,6 +3087,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> >      vfio_register_req_notifier(vdev);
> >      vfio_setup_resetfn_quirk(vdev);
> >      qemu_add_vm_change_state_handler(vfio_vm_change_state_handler,
> > vdev);
> > +    register_savevm_live(NULL, "vfio-pci", 0, 1,
> > + &savevm_vfio_handlers, vdev);
> >
> >      return;
> >
> > @@ -3051,10 +3241,6 @@ static Property vfio_pci_dev_properties[] = {
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >
> > -static const VMStateDescription vfio_pci_vmstate = {
> > -    .name = "vfio-pci",
> > -};
> > -
> >  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> > {
> >      DeviceClass *dc = DEVICE_CLASS(klass); @@ -3062,7 +3248,6 @@
> > static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> >
> >      dc->reset = vfio_pci_reset;
> >      dc->props = vfio_pci_dev_properties;
> > -    dc->vmsd = &vfio_pci_vmstate;
> >      dc->desc = "VFIO-based PCI device assignment";
> >      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> >      pdc->realize = vfio_realize;
> > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> > index 8f02f2f..2c911d9 100644
> > --- a/linux-headers/linux/vfio.h
> > +++ b/linux-headers/linux/vfio.h
> > @@ -512,6 +512,20 @@ struct vfio_pci_hot_reset {
> >
> >  #define VFIO_DEVICE_PCI_HOT_RESET	_IO(VFIO_TYPE, VFIO_BASE + 13)
> >
> > +/**
> > + * VFIO_DEVICE_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 14,
> > + *				    struct vfio_device_get_dirty_bitmap)
> > + *
> > + * Return: 0 on success, -errno on failure.
> > + */
> > +struct vfio_device_get_dirty_bitmap {
> > +	__u64	       start_addr;
> > +	__u64	       page_nr;
> > +	__u8           dirty_bitmap[];
> > +};
> > +
> > +#define VFIO_DEVICE_GET_DIRTY_BITMAP	_IO(VFIO_TYPE,
> VFIO_BASE + 14)
> > +
> >  /* -------- API for Type1 VFIO IOMMU -------- */
> >
> >  /**
> >


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-16 20:37 ` Alex Williamson
@ 2018-04-17  8:11   ` Zhang, Yulei
  2018-04-17 14:34     ` Alex Williamson
  0 siblings, 1 reply; 12+ messages in thread
From: Zhang, Yulei @ 2018-04-17  8:11 UTC (permalink / raw)
  To: Alex Williamson
  Cc: qemu-devel, Tian, Kevin, joonas.lahtinen, zhenyuw, kwankhede,
	Wang, Zhi A, dgilbert, quintela


> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, April 17, 2018 4:38 AM
> To: Zhang, Yulei <yulei.zhang@intel.com>
> Cc: qemu-devel@nongnu.org; Tian, Kevin <kevin.tian@intel.com>;
> joonas.lahtinen@linux.intel.com; zhenyuw@linux.intel.com;
> kwankhede@nvidia.com; Wang, Zhi A <zhi.a.wang@intel.com>;
> dgilbert@redhat.com; quintela@redhat.com
> Subject: Re: [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device
> to support live migration
> 
> On Tue, 10 Apr 2018 14:03:13 +0800
> Yulei Zhang <yulei.zhang@intel.com> wrote:
> 
> > Instead of using vm state description, add SaveVMHandlers for VFIO
> > device to support live migration.
> >
> > Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the
> memory
> > bitmap that dirtied by vfio device during the iterative precopy stage
> > to shorten the system downtime afterward.
> >
> > For vfio pci device status migrate, during the system downtime, it
> > will save the following states 1. pci configuration space addr0~addr5
> > 2. pci configuration space msi_addr msi_data 3. pci device status
> > fetch from device driver
> >
> > And on the target side the vfio_load will restore the same states 1.
> > re-setup the pci bar configuration 2. re-setup the pci device msi
> > configuration 3. restore the pci device status
> 
> Interrupts are configured via ioctl, but I don't see any code here to configure
> the device into the correct interrupt state.  How do we know the target
> device is compatible with the source device?  Do we rely on the vendor
> driver to implicitly include some kind of device and version information and
> fail at the very end of the migration?  It seems like we need to somehow
> front-load that sort of device compatibility checking since a vfio-pci device
> can be anything (ex. what happens if a user tries to migrate a GVT-g vGPU to
> an NVIDIA vGPU?).  Thanks,
> 
> Alex

We emulate the write to the pci configuration space in vfio_load(), which
will help set up the interrupt state.
For compatibility I think currently the vendor driver would put a version
number or device-specific info in the device state region, so during the
pre-copy stage the target side will discover the difference and call off
the migration.
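
A hypothetical layout for such a compatibility block at the start of the
device state region (all field names below are invented for illustration;
the real format would need to be defined in the vfio API):

struct vfio_device_state_header {
	__u32	magic;           /* identifies the region format */
	__u32	version;         /* version of the state layout that follows */
	__u32	vendor_id;       /* PCI vendor ID of the backing device */
	__u32	device_id;       /* PCI device ID / mdev type identifier */
	__u64	vendor_version;  /* vendor-defined compatibility token */
	__u64	data_size;       /* bytes of device state following the header */
};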


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-17  8:11   ` Zhang, Yulei
@ 2018-04-17 14:34     ` Alex Williamson
  2018-04-18 10:57       ` Zhang, Yulei
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Williamson @ 2018-04-17 14:34 UTC (permalink / raw)
  To: Zhang, Yulei
  Cc: qemu-devel, Tian, Kevin, joonas.lahtinen, zhenyuw, kwankhede,
	Wang, Zhi A, dgilbert, quintela

On Tue, 17 Apr 2018 08:11:12 +0000
"Zhang, Yulei" <yulei.zhang@intel.com> wrote:

> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Tuesday, April 17, 2018 4:38 AM
> > To: Zhang, Yulei <yulei.zhang@intel.com>
> > Cc: qemu-devel@nongnu.org; Tian, Kevin <kevin.tian@intel.com>;
> > joonas.lahtinen@linux.intel.com; zhenyuw@linux.intel.com;
> > kwankhede@nvidia.com; Wang, Zhi A <zhi.a.wang@intel.com>;
> > dgilbert@redhat.com; quintela@redhat.com
> > Subject: Re: [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device
> > to support live migration
> > 
> > On Tue, 10 Apr 2018 14:03:13 +0800
> > Yulei Zhang <yulei.zhang@intel.com> wrote:
> >   
> > > Instead of using vm state description, add SaveVMHandlers for VFIO
> > > device to support live migration.
> > >
> > > Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the  
> > memory  
> > > bitmap that dirtied by vfio device during the iterative precopy stage
> > > to shorten the system downtime afterward.
> > >
> > > For vfio pci device status migrate, during the system downtime, it
> > > will save the following states 1. pci configuration space addr0~addr5
> > > 2. pci configuration space msi_addr msi_data 3. pci device status
> > > fetch from device driver
> > >
> > > And on the target side the vfio_load will restore the same states 1.
> > > re-setup the pci bar configuration 2. re-setup the pci device msi
> > > configuration 3. restore the pci device status  
> > 
> > Interrupts are configured via ioctl, but I don't see any code here to configure
> > the device into the correct interrupt state.  How do we know the target
> > device is compatible with the source device?  Do we rely on the vendor
> > driver to implicitly include some kind of device and version information and
> > fail at the very end of the migration?  It seems like we need to somehow
> > front-load that sort of device compatibility checking since a vfio-pci device
> > can be anything (ex. what happens if a user tries to migrate a GVT-g vGPU to
> > an NVIDIA vGPU?).  Thanks,
> > 
> > Alex  
> 
> We emulate the write to the pci configure space in vfio_load() which will help
> setup the interrupt state. 

But you're only doing that for MSI, not MSI-X. We cannot simply say that
we don't have any MSI-X devices right now and add it later, or we'll end
up with incompatible vmstate; we need to plan now for how we'll support
it within the save state stream.
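
One way to leave room for that in the stream now would be to tag the
interrupt section when saving, e.g. (the VFIO_SAVE_IRQ_* flags are made
up here for illustration):

    if (msix_enabled(pdev)) {
        qemu_put_byte(f, VFIO_SAVE_IRQ_MSIX);  /* MSI-X table/PBA state would follow */
    } else if (msi_enabled(pdev)) {
        qemu_put_byte(f, VFIO_SAVE_IRQ_MSI);   /* the existing MSI address/data words follow */
    } else {
        qemu_put_byte(f, VFIO_SAVE_IRQ_NONE);
    }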

> For compatibility I think currently the vendor driver would put version number 
> or device specific info in the device state region, so during the pre-copy stage
> the target side will discover the difference and call off the migration.

Those sorts of things should be built into the device state region; we
shouldn't rely on vendor drivers to make these kinds of considerations.
We should build it into the API, which also allows QEMU to check state
compatibility before attempting a migration.  Thanks,

Alex


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-10  6:03 [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration Yulei Zhang
  2018-04-16 14:44 ` Kirti Wankhede
  2018-04-16 20:37 ` Alex Williamson
@ 2018-04-17 19:19 ` Dr. David Alan Gilbert
  2018-05-10 11:51 ` Juan Quintela
  3 siblings, 0 replies; 12+ messages in thread
From: Dr. David Alan Gilbert @ 2018-04-17 19:19 UTC (permalink / raw)
  To: Yulei Zhang
  Cc: qemu-devel, kevin.tian, joonas.lahtinen, zhenyuw, kwankhede,
	zhi.a.wang, alex.williamson, quintela

* Yulei Zhang (yulei.zhang@intel.com) wrote:
> Instead of using vm state description, add SaveVMHandlers for VFIO
> device to support live migration.
> 
> Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the memory
> bitmap that dirtied by vfio device during the iterative precopy stage
> to shorten the system downtime afterward.
> 
> For vfio pci device status migrate, during the system downtime, it will
> save the following states
> 1. pci configuration space addr0~addr5
> 2. pci configuration space msi_addr msi_data
> 3. pci device status fetch from device driver
> 
> And on the target side the vfio_load will restore the same states
> 1. re-setup the pci bar configuration
> 2. re-setup the pci device msi configuration
> 3. restore the pci device status
> 
> Signed-off-by: Yulei Zhang <yulei.zhang@intel.com>
> ---
>  hw/vfio/pci.c              | 195 +++++++++++++++++++++++++++++++++++++++++++--
>  linux-headers/linux/vfio.h |  14 ++++
>  2 files changed, 204 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 13d8c73..ac6a9c7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -33,9 +33,14 @@
>  #include "trace.h"
>  #include "qapi/error.h"
>  #include "migration/blocker.h"
> +#include "migration/register.h"
> +#include "exec/ram_addr.h"
>  
>  #define MSIX_CAP_LENGTH 12
>  
> +#define VFIO_SAVE_FLAG_SETUP 0
> +#define VFIO_SAVE_FLAG_DEV_STATE 1
> +
>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>  static void vfio_vm_change_state_handler(void *pv, int running, RunState state);
> @@ -2639,6 +2644,190 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>      vdev->req_enabled = false;
>  }
>  
> +static uint64_t vfio_dirty_log_sync(VFIOPCIDevice *vdev)
> +{
> +    RAMBlock *block;
> +    struct vfio_device_get_dirty_bitmap *d;
> +    uint64_t page = 0;
> +    ram_addr_t size;
> +    unsigned long nr, bitmap;
> +
> +    RAMBLOCK_FOREACH(block) {
> +        size = block->used_length;
> +        nr = size >> TARGET_PAGE_BITS;
> +        bitmap = (BITS_TO_LONGS(nr) + 1) * sizeof(unsigned long);
> +        d = g_malloc0(sizeof(*d) +  bitmap);
> +        d->start_addr = block->offset;
> +        d->page_nr = nr;
> +        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_DIRTY_BITMAP, d)) {
> +            error_report("vfio: Failed to get device dirty bitmap");
> +            g_free(d);
> +            goto exit;
> +        }
> +
> +        if (d->page_nr) {
> +            cpu_physical_memory_set_dirty_lebitmap(
> +                                 (unsigned long *)&d->dirty_bitmap,
> +                                 d->start_addr, d->page_nr);
> +            page += d->page_nr;
> +        }
> +        g_free(d);
> +    }
> +
> +exit:
> +    return page;
> +}
> +
> +static void vfio_save_live_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> +                   uint64_t *non_postcopiable_pending,
> +                   uint64_t *postcopiable_pending)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    uint64_t pending;
> +
> +    qemu_mutex_lock_iothread();
> +    rcu_read_lock();
> +    pending = vfio_dirty_log_sync(vdev);
> +    rcu_read_unlock();
> +    qemu_mutex_unlock_iothread();
> +    *non_postcopiable_pending += pending;
> +}
> +
> +static int vfio_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *pdev = &vdev->pdev;
> +    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
> +    uint8_t *buf = NULL;
> +    uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
> +    bool msi_64bit;
> +
> +    if (qemu_get_byte(f) == VFIO_SAVE_FLAG_SETUP) {
> +        goto exit;
> +    }

If you're building something complex like this, you might want to add
some version flags at the start and a canary at the end to detect
corruption.

Also note that the migration could fail at any point, so calling
qemu_file_get_error is good practice before acting on data you've read
with qemu_get_be*, since that data could be bogus if the stream has
already failed.
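
A minimal sketch of both suggestions, using made-up VFIO_SAVE_VERSION and
VFIO_SAVE_CANARY constants (the save side would emit the version word
first and the canary last):

#define VFIO_SAVE_VERSION 0x00010001u   /* made-up value */
#define VFIO_SAVE_CANARY  0xfee1deadu   /* made-up value */

static int vfio_load_check_version(QEMUFile *f)
{
    uint32_t version = qemu_get_be32(f);

    if (qemu_file_get_error(f)) {
        /* the stream already failed, the value above is bogus */
        return qemu_file_get_error(f);
    }
    if (version != VFIO_SAVE_VERSION) {
        error_report("vfio: unsupported device state version 0x%x", version);
        return -EINVAL;
    }
    return 0;
}

/* ...and after the last qemu_get_* in vfio_load(): */
static int vfio_load_check_canary(QEMUFile *f)
{
    if (qemu_get_be32(f) != VFIO_SAVE_CANARY || qemu_file_get_error(f)) {
        error_report("vfio: device state stream corrupted");
        return -EINVAL;
    }
    return 0;
}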

> +    /* retore pci bar configuration */
> +    ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                          ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        bar_cfg = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4);
> +    }
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                          ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
> +
> +    /* restore msi configuration */
> +    ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +    msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
> +
> +    vfio_pci_write_config(&vdev->pdev,
> +                          pdev->msi_cap + PCI_MSI_FLAGS,
> +                          ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
> +
> +    msi_lo = qemu_get_be32(f);
> +    vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
> +
> +    if (msi_64bit) {
> +        msi_hi = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                              msi_hi, 4);
> +    }
> +    msi_data = qemu_get_be32(f);
> +    vfio_pci_write_config(pdev,
> +              pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +              msi_data, 2);
> +
> +    vfio_pci_write_config(&vdev->pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                          ctl | PCI_MSI_FLAGS_ENABLE, 2);
> +
> +    buf = g_malloc(sz);

Please validate 'sz' before using it - just assume that the byte stream
got horribly corrupted and you should really check for it.

> +    if (buf == NULL) {
> +        error_report("vfio: Failed to allocate memory for migrate");
> +        return -1;
> +    }

g_malloc doesn't fail by returning NULL; it asserts;  so use
g_try_malloc.
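
i.e. roughly (return values are illustrative):

    if (sz <= 0) {
        error_report("vfio: invalid device state size %d", sz);
        return -EINVAL;
    }
    buf = g_try_malloc(sz);   /* g_malloc() would abort instead of returning NULL */
    if (buf == NULL) {
        error_report("vfio: failed to allocate %d bytes for migration", sz);
        return -ENOMEM;
    }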

> +    qemu_get_buffer(f, buf, sz);
> +    if (pwrite(vdev->vbasedev.fd, buf, sz,
> +               vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
> +        error_report("vfio: Failed to write Device State Region");

free the buffer here.
> +        return -1;
> +    }
> +
> +    g_free(buf);
> +
> +exit:
> +    return 0;
> +}
> +
> +static int vfio_save_complete(QEMUFile *f, void *opaque)

Echoing Kirti's mail, this doesn't seem to be an iterative save routine;
it is all done at the end, in which case VMState should be used.
If you want it to be iterative then you need to define the other functions.

> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *pdev = &vdev->pdev;
> +    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
> +    uint8_t *buf = NULL;
> +    uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i;
> +    bool msi_64bit;
> +
> +    qemu_put_byte(f, VFIO_SAVE_FLAG_DEV_STATE);
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> +        qemu_put_be32(f, bar_cfg);
> +    }
> +
> +    msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +    msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT);
> +
> +    msi_lo = pci_default_read_config(pdev,
> +                                     pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> +    qemu_put_be32(f, msi_lo);
> +
> +    if (msi_64bit) {
> +        msi_hi = pci_default_read_config(pdev,
> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                         4);
> +        qemu_put_be32(f, msi_hi);
> +    }
> +
> +    msi_data = pci_default_read_config(pdev,
> +               pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +               2);
> +    qemu_put_be32(f, msi_data);
> +
> +    buf = g_malloc(sz);
> +    if (buf == NULL) {
> +        error_report("vfio: Failed to allocate memory for migrate");
> +        goto exit;
> +    }

Same about the allocation above.

> +    if (pread(vdev->vbasedev.fd, buf, sz,
> +              vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
> +        error_report("vfio: Failed to read Device State Region");

Include errno?
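
e.g. (keeping in mind errno is only meaningful if pread() actually failed
rather than returning short):

        error_report("vfio: Failed to read Device State Region: %s",
                     strerror(errno));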

> +        goto exit;
> +    }
> +
> +    qemu_put_buffer(f, buf, sz);
> +
> +exit:
> +    g_free(buf);
> +
> +    return 0;
> +}
> +
> +static int vfio_save_setup(QEMUFile *f, void *opaque)
> +{
> +    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
> +    return 0;
> +}
> +
> +static SaveVMHandlers savevm_vfio_handlers = {
> +    .save_setup = vfio_save_setup,
> +    .save_live_pending = vfio_save_live_pending,
> +    .save_live_complete_precopy = vfio_save_complete,
> +    .load_state = vfio_load,
> +};
> +
>  static void vfio_realize(PCIDevice *pdev, Error **errp)
>  {
>      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> @@ -2898,6 +3087,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
>      qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
> +    register_savevm_live(NULL, "vfio-pci", 0, 1, &savevm_vfio_handlers, vdev);
>  
>      return;
>  
> @@ -3051,10 +3241,6 @@ static Property vfio_pci_dev_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> -static const VMStateDescription vfio_pci_vmstate = {
> -    .name = "vfio-pci",
> -};
> -
>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
> @@ -3062,7 +3248,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  
>      dc->reset = vfio_pci_reset;
>      dc->props = vfio_pci_dev_properties;
> -    dc->vmsd = &vfio_pci_vmstate;
>      dc->desc = "VFIO-based PCI device assignment";
>      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>      pdc->realize = vfio_realize;
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 8f02f2f..2c911d9 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -512,6 +512,20 @@ struct vfio_pci_hot_reset {
>  
>  #define VFIO_DEVICE_PCI_HOT_RESET	_IO(VFIO_TYPE, VFIO_BASE + 13)
>  
> +/**
> + * VFIO_DEVICE_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 14,
> + *				    struct vfio_device_get_dirty_bitmap)
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_get_dirty_bitmap {
> +	__u64	       start_addr;
> +	__u64	       page_nr;
> +	__u8           dirty_bitmap[];
> +};

Header updates should be in a separate patch.

Dave

> +#define VFIO_DEVICE_GET_DIRTY_BITMAP	_IO(VFIO_TYPE, VFIO_BASE + 14)
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**
> -- 
> 2.7.4
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-16 14:44 ` Kirti Wankhede
  2018-04-17  8:01   ` Zhang, Yulei
@ 2018-04-17 19:25   ` Eric Blake
  1 sibling, 0 replies; 12+ messages in thread
From: Eric Blake @ 2018-04-17 19:25 UTC (permalink / raw)
  To: Kirti Wankhede, Yulei Zhang, qemu-devel
  Cc: kevin.tian, quintela, joonas.lahtinen, dgilbert, zhenyuw,
	alex.williamson, zhi.a.wang


On 04/16/2018 09:44 AM, Kirti Wankhede wrote:
> 
> 
> On 4/10/2018 11:33 AM, Yulei Zhang wrote:
>> Instead of using vm state description, add SaveVMHandlers for VFIO
>> device to support live migration.

In the subject line: s/Hanlders/Handlers/

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-17  8:01   ` Zhang, Yulei
@ 2018-04-17 19:41     ` Kirti Wankhede
  0 siblings, 0 replies; 12+ messages in thread
From: Kirti Wankhede @ 2018-04-17 19:41 UTC (permalink / raw)
  To: Zhang, Yulei, qemu-devel
  Cc: Tian, Kevin, joonas.lahtinen, zhenyuw, Wang, Zhi A,
	alex.williamson, dgilbert, quintela



On 4/17/2018 1:31 PM, Zhang, Yulei wrote:

>>> +static SaveVMHandlers savevm_vfio_handlers = {
>>> +    .save_setup = vfio_save_setup,
>>> +    .save_live_pending = vfio_save_live_pending,
>>> +    .save_live_complete_precopy = vfio_save_complete,
>>> +    .load_state = vfio_load,
>>> +};
>>> +
>>
>> Aren't .is_active, .save_live_iterate and .cleanup required?
>> What if the vendor driver has a large amount of data in the device's
>> memory that it is aware of, and needs multiple iterations to send that
>> data to QEMU to save the complete state of the device?
>>
> I suppose the vendor driver will copy the device's memory to the device
> region iteratively, and let QEMU read from the device region and transfer
> the data to the target side during the pre-copy stage, won't it?
> 


As Dave mentioned in another mail in this thread, not all data will be
copied in the pre-copy phase. Some static data would be copied during
pre-copy, but there could be a significant amount of data in the
stop-and-copy phase, where iterations would be required. .is_active and
.save_live_iterate would be needed for those iterations.

.cleanup is required to give the vendor driver an indication that
migration is complete, so it can clean up all the extra allocations it
made for migration.
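
For illustration, a rough sketch of how the handler table could grow once
those callbacks are wired up (vfio_save_iterate, vfio_is_active and
vfio_save_cleanup are hypothetical names, and the cleanup hook may be
spelled .save_cleanup depending on the QEMU version, so check
migration/register.h in your tree):

static SaveVMHandlers savevm_vfio_handlers = {
    .save_setup                 = vfio_save_setup,
    .save_live_pending          = vfio_save_live_pending,
    .save_live_iterate          = vfio_save_iterate,   /* push device memory in chunks */
    .save_live_complete_precopy = vfio_save_complete,
    .is_active                  = vfio_is_active,      /* skip devices with nothing to send */
    .save_cleanup               = vfio_save_cleanup,   /* let the vendor driver free its buffers */
    .load_state                 = vfio_load,
};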

Thanks,
Kirti


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-17 14:34     ` Alex Williamson
@ 2018-04-18 10:57       ` Zhang, Yulei
  2018-04-18 11:18         ` Dr. David Alan Gilbert
  0 siblings, 1 reply; 12+ messages in thread
From: Zhang, Yulei @ 2018-04-18 10:57 UTC (permalink / raw)
  To: Alex Williamson
  Cc: qemu-devel, Tian, Kevin, joonas.lahtinen, zhenyuw, kwankhede,
	Wang, Zhi A, dgilbert, quintela



> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, April 17, 2018 10:35 PM
> To: Zhang, Yulei <yulei.zhang@intel.com>
> Cc: qemu-devel@nongnu.org; Tian, Kevin <kevin.tian@intel.com>;
> joonas.lahtinen@linux.intel.com; zhenyuw@linux.intel.com;
> kwankhede@nvidia.com; Wang, Zhi A <zhi.a.wang@intel.com>;
> dgilbert@redhat.com; quintela@redhat.com
> Subject: Re: [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device
> to support live migration
> 
> On Tue, 17 Apr 2018 08:11:12 +0000
> "Zhang, Yulei" <yulei.zhang@intel.com> wrote:
> 
> > > -----Original Message-----
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Tuesday, April 17, 2018 4:38 AM
> > > To: Zhang, Yulei <yulei.zhang@intel.com>
> > > Cc: qemu-devel@nongnu.org; Tian, Kevin <kevin.tian@intel.com>;
> > > joonas.lahtinen@linux.intel.com; zhenyuw@linux.intel.com;
> > > kwankhede@nvidia.com; Wang, Zhi A <zhi.a.wang@intel.com>;
> > > dgilbert@redhat.com; quintela@redhat.com
> > > Subject: Re: [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO
> device
> > > to support live migration
> > >
> > > On Tue, 10 Apr 2018 14:03:13 +0800
> > > Yulei Zhang <yulei.zhang@intel.com> wrote:
> > >
> > > > Instead of using vm state description, add SaveVMHandlers for VFIO
> > > > device to support live migration.
> > > >
> > > > Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the
> > > memory
> > > > bitmap that dirtied by vfio device during the iterative precopy stage
> > > > to shorten the system downtime afterward.
> > > >
> > > > For vfio pci device status migrate, during the system downtime, it
> > > > will save the following states 1. pci configuration space addr0~addr5
> > > > 2. pci configuration space msi_addr msi_data 3. pci device status
> > > > fetch from device driver
> > > >
> > > > And on the target side the vfio_load will restore the same states 1.
> > > > re-setup the pci bar configuration 2. re-setup the pci device msi
> > > > configuration 3. restore the pci device status
> > >
> > > Interrupts are configured via ioctl, but I don't see any code here to
> configure
> > > the device into the correct interrupt state.  How do we know the target
> > > device is compatible with the source device?  Do we rely on the vendor
> > > driver to implicitly include some kind of device and version information
> and
> > > fail at the very end of the migration?  It seems like we need to somehow
> > > front-load that sort of device compatibility checking since a vfio-pci
> device
> > > can be anything (ex. what happens if a user tries to migrate a GVT-g
> vGPU to
> > > an NVIDIA vGPU?).  Thanks,
> > >
> > > Alex
> >
> > We emulate the write to the PCI configuration space in vfio_load(),
> > which will help set up the interrupt state.
> 
> But you're only doing that for MSI, not MSI-X.  We cannot simply say
> that we don't have MSI-X devices right now and add it later, or we'll
> end up with incompatible vmstate; we need to plan now for how we'll
> support it within the save state stream.
> 
Agree with you.

> > For compatibility, I think currently the vendor driver would put a
> > version number or device-specific info in the device state region, so
> > during the pre-copy stage the target side will discover the difference
> > and call off the migration.
> 
> Those sorts of things should be built into the device state region; we
> shouldn't rely on vendor drivers to make these kinds of considerations.
> We should build it into the API, which also allows QEMU to check state
> compatibility before attempting a migration.  Thanks,
> 
> Alex

I'm not sure how to check compatibility before attempting a migration.
To my understanding, QEMU doesn't know the configuration on the target
side; the target may call off the migration when it finds the incoming
device state isn't compatible, and the source VM will then resume.


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-18 10:57       ` Zhang, Yulei
@ 2018-04-18 11:18         ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 12+ messages in thread
From: Dr. David Alan Gilbert @ 2018-04-18 11:18 UTC (permalink / raw)
  To: Zhang, Yulei
  Cc: Alex Williamson, qemu-devel, Tian, Kevin, joonas.lahtinen,
	zhenyuw, kwankhede, Wang, Zhi A, quintela

* Zhang, Yulei (yulei.zhang@intel.com) wrote:
>
> [...]
>
> I'm not sure how to check compatibility before attempting a migration.
> To my understanding, QEMU doesn't know the configuration on the target
> side; the target may call off the migration when it finds the incoming
> device state isn't compatible, and the source VM will then resume.

There are a few parts to this:
   a) Tools, like libvirt etc need to be able to detect that the two
configurations are actually compatible before they try - so there should
be some way of exposing enough detail about the host/drivers for
something somewhere to be able to say 'yes it should work' before even
starting the migration.

   b) The migration stream should contain enough information to be able
to *cleanly* detect an incompatibility so that the failure is obvious,
so make sure you have enough information in there (preferably in the
'setup' part if it really is iterative).  Then, when loading, check that
data and print a clear reason why it's incompatible (a rough sketch
follows after this list).

   c) We've got to try and keep that compatibility across versions - so
migrations to a newer QEMU, or newer drivers (or depending on the
hardware) to newer hardware should be able to work if possible.  So the
stream might need to have version identifiers in it to help with that.
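
Purely as an illustration of point (b) above (the version constant is a
hypothetical placeholder, not something the patch defines), the setup
section could carry an identity/version header that the destination
verifies before it touches any opaque device state:

#define VFIO_MIG_STREAM_VERSION 1    /* hypothetical */

static int vfio_save_setup(QEMUFile *f, void *opaque)
{
    VFIOPCIDevice *vdev = opaque;
    PCIDevice *pdev = &vdev->pdev;

    /* identity and version go first so the destination can refuse cleanly */
    qemu_put_be16(f, pci_get_word(pdev->config + PCI_VENDOR_ID));
    qemu_put_be16(f, pci_get_word(pdev->config + PCI_DEVICE_ID));
    qemu_put_be32(f, VFIO_MIG_STREAM_VERSION);
    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);
    return 0;
}

On the load side the same three fields would be read back, compared
against the local device, and a clear error_report() issued (failing the
migration) on any mismatch, before any device state is consumed.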

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK


* Re: [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration
  2018-04-10  6:03 [Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHanlders for VFIO device to support live migration Yulei Zhang
                   ` (2 preceding siblings ...)
  2018-04-17 19:19 ` Dr. David Alan Gilbert
@ 2018-05-10 11:51 ` Juan Quintela
  3 siblings, 0 replies; 12+ messages in thread
From: Juan Quintela @ 2018-05-10 11:51 UTC (permalink / raw)
  To: Yulei Zhang
  Cc: qemu-devel, kevin.tian, joonas.lahtinen, zhenyuw, kwankhede,
	zhi.a.wang, alex.williamson, dgilbert

Yulei Zhang <yulei.zhang@intel.com> wrote:
> Instead of using vm state description, add SaveVMHandlers for VFIO
> device to support live migration.
>
> Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the memory
> bitmap that dirtied by vfio device during the iterative precopy stage
> to shorten the system downtime afterward.
>
> For vfio pci device status migrate, during the system downtime, it will
> save the following states
> 1. pci configuration space addr0~addr5
> 2. pci configuration space msi_addr msi_data
> 3. pci device status fetch from device driver
>
> And on the target side the vfio_load will restore the same states
> 1. re-setup the pci bar configuration
> 2. re-setup the pci device msi configuration
> 3. restore the pci device status

[...]

> +static uint64_t vfio_dirty_log_sync(VFIOPCIDevice *vdev)
> +{
> +    RAMBlock *block;
> +    struct vfio_device_get_dirty_bitmap *d;
> +    uint64_t page = 0;
> +    ram_addr_t size;
> +    unsigned long nr, bitmap;
> +
> +    RAMBLOCK_FOREACH(block) {

BTW, you are asking to sync all blocks of memory?  Does vfio need it?
Things like ACPI tables, or similar weird blocks, look strange, no?

> +        size = block->used_length;
> +        nr = size >> TARGET_PAGE_BITS;
> +        bitmap = (BITS_TO_LONGS(nr) + 1) * sizeof(unsigned long);
> +        d = g_malloc0(sizeof(*d) +  bitmap);
> +        d->start_addr = block->offset;
> +        d->page_nr = nr;
> +        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_DIRTY_BITMAP, d)) {
> +            error_report("vfio: Failed to get device dirty bitmap");
> +            g_free(d);
> +            goto exit;
> +        }
> +
> +        if (d->page_nr) {
> +            cpu_physical_memory_set_dirty_lebitmap(
> +                                 (unsigned long *)&d->dirty_bitmap,
> +                                 d->start_addr, d->page_nr);
> +            page += d->page_nr;

Are you sure that this is the number of dirty pages?  It looks to me like
the total number of pages in the region, no?
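
For what it's worth, a sketch of counting only the dirtied pages
(assuming the ioctl fills dirty_bitmap with one bit per page; ctpopl()
comes from qemu/host-utils.h):

    unsigned long *bmap = (unsigned long *)&d->dirty_bitmap;
    uint64_t dirty = 0;
    unsigned long i;

    for (i = 0; i < BITS_TO_LONGS(d->page_nr); i++) {
        dirty += ctpopl(bmap[i]);    /* count the set bits in each long */
    }
    page += dirty;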

> +        }
> +        g_free(d);
> +    }
> +
> +exit:
> +    return page;
> +}

You walk the whole of RAM for each bitmap sync.  Could it be possible
that the driver knows _what_ memory regions it can have dirty pages in?


> +static void vfio_save_live_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> +                   uint64_t *non_postcopiable_pending,
> +                   uint64_t *postcopiable_pending)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    uint64_t pending;
> +
> +    qemu_mutex_lock_iothread();
> +    rcu_read_lock();
> +    pending = vfio_dirty_log_sync(vdev);
> +    rcu_read_unlock();
> +    qemu_mutex_unlock_iothread();
> +    *non_postcopiable_pending += pending;
> +}

As said in other emails, this is not how it is done for iterative
migration; see how we do it for RAM (I have simplified it a bit):

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync(rs);
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    *res_precopy_only += remaining_size;
}

I.e. we only do the sync stage if we don't have enough dirty pages on
the bitmap.

BTW, while we are here, and independently of this, perhaps we should
change the "sync" to take functions for each RAMBlock instead of for
the whole of RAM.

> +static int vfio_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *pdev = &vdev->pdev;
> +    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
> +    uint8_t *buf = NULL;
> +    uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
> +    bool msi_64bit;
> +
> +    if (qemu_get_byte(f) == VFIO_SAVE_FLAG_SETUP) {
> +        goto exit;
> +    }

As said before, I would suggest that you add several fields:
- VERSION: so you can make incompatible changes
- FLAGS: for compatible changes
- SIZE of the region?  The rest of QEMU doesn't have it, which I consider
  an error.  Having it makes it much easier to analyze the stream and
  search for errors (a rough sketch of such framing follows below).
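
A rough sketch of that framing (a purely illustrative layout, not what
the patch currently writes; the destination can then sanity-check or skip
the chunk even before interpreting it):

    /* save side */
    qemu_put_byte(f, VFIO_SAVE_FLAG_DEV_STATE);
    qemu_put_be32(f, 1);           /* VERSION: bump on incompatible changes */
    qemu_put_be32(f, 0);           /* FLAGS: reserved for compatible changes */
    qemu_put_be64(f, sz);          /* SIZE of the opaque device state below */
    qemu_put_buffer(f, buf, sz);

    /* load side */
    uint32_t version = qemu_get_be32(f);

    qemu_get_be32(f);              /* FLAGS: no compatible extensions yet */
    if (version != 1 || qemu_get_be64(f) != (uint64_t)sz) {
        error_report("vfio: incompatible or malformed device state chunk");
        return -EINVAL;
    }
    qemu_get_buffer(f, buf, sz);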

> +    /* retore pci bar configuration */
> +    ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                          ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);

Please forgive my PCI ignorance, but do we really want to store the PCI
configuration every time that we receive an iteration stage?

> +    for (i = 0; i < PCI_ROM_SLOT; i++) {

Is this PCI_ROM_SLOT fixed by some spec or should we transfer it?

> +        bar_cfg = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4);
> +    }
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                          ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
> +
> +    /* restore msi configuration */
> +    ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +    msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);

Old experience shows this is a bad idea.  See all the ifdefs for 32/64 bit
in target/i386/machine.c.

I really suggest storing a bool with the msi_64bit value in the stream,
sending dummy values for 32-bit mode, and doing nothing with them on the
destination.  The simpler your stream is, the fewer bugs you end up with.
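
A sketch of such a fixed-size record on the save side (assuming the save
path computes msi_lo/msi_hi/msi_data the same way vfio_load() reads them
back):

    qemu_put_byte(f, msi_64bit);               /* flavour flag, always present */
    qemu_put_be32(f, msi_lo);
    qemu_put_be32(f, msi_64bit ? msi_hi : 0);  /* dummy high word for 32-bit MSI */
    qemu_put_be32(f, msi_data);

The destination then always reads four fields and simply skips programming
PCI_MSI_ADDRESS_HI when the local device is not 64-bit capable, so the
stream layout never depends on the MSI flavour.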


> +
> +    vfio_pci_write_config(&vdev->pdev,
> +                          pdev->msi_cap + PCI_MSI_FLAGS,
> +                          ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
> +
> +    msi_lo = qemu_get_be32(f);
> +    vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
> +
> +    if (msi_64bit) {
> +        msi_hi = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                              msi_hi, 4);
> +    }
> +    msi_data = qemu_get_be32(f);
> +    vfio_pci_write_config(pdev,
> +              pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +              msi_data, 2);
> +
> +    vfio_pci_write_config(&vdev->pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                          ctl | PCI_MSI_FLAGS_ENABLE, 2);
> +
> +    buf = g_malloc(sz);
> +    if (buf == NULL) {
> +        error_report("vfio: Failed to allocate memory for migrate");
> +        return -1;
> +    }
> +
> +    qemu_get_buffer(f, buf, sz);
> +    if (pwrite(vdev->vbasedev.fd, buf, sz,
> +               vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
> +        error_report("vfio: Failed to write Device State Region");
> +        return -1;
> +    }
> +
> +    g_free(buf);
> +
> +exit:
> +    return 0;
> +}

BTW, this looks easy to do with vmstate, no need to use save_live for this.


> +static int vfio_save_setup(QEMUFile *f, void *opaque)
> +{
> +    qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP);

I think it would be a good idea to do the malloc() here.  Failing the
migration because of a memory error in the completion stage is considered
rude O:-)
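
For instance (a sketch only; the migration_buf field is hypothetical and
not part of this patch), the buffer could be allocated once at setup and
just reused at completion time:

    /* in vfio_save_setup(), before writing VFIO_SAVE_FLAG_SETUP */
    int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;

    vdev->migration_buf = g_try_malloc(sz);    /* fail early, not at stop-and-copy */
    if (!vdev->migration_buf) {
        error_report("vfio: no memory for the migration buffer");
        return -ENOMEM;
    }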

> @@ -3051,10 +3241,6 @@ static Property vfio_pci_dev_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> -static const VMStateDescription vfio_pci_vmstate = {
> -    .name = "vfio-pci",
> -};
> -
>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
> @@ -3062,7 +3248,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  
>      dc->reset = vfio_pci_reset;
>      dc->props = vfio_pci_dev_properties;
> -    dc->vmsd = &vfio_pci_vmstate;

Seeing what you are doing, I would put your configuration data here on the
vmsd, and leave only the dirty pages/log handling to register_savevm.

Or you can look at the virtio log handling for inspiration.  If you are
just using RAM, you don't need to transfer anything; you just need to
make sure that you are called when the memory dirty bit is updated so
you can add to your log.
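
A very rough shape of that split (everything below is hypothetical: the
mig_* shadow fields and the post_load hook that replays them into the
device through vfio_pci_write_config() are not part of this patch), with
register_savevm_live left only for the dirty-log work:

static const VMStateDescription vfio_pci_vmstate = {
    .name = "vfio-pci",
    .version_id = 1,
    .minimum_version_id = 1,
    .post_load = vfio_pci_post_load,      /* replay BARs/MSI into the device */
    .fields = (VMStateField[]) {
        VMSTATE_UINT32_ARRAY(mig_bar_cfg, VFIOPCIDevice, PCI_ROM_SLOT),
        VMSTATE_UINT32(mig_msi_addr_lo, VFIOPCIDevice),
        VMSTATE_UINT32(mig_msi_addr_hi, VFIOPCIDevice),
        VMSTATE_UINT32(mig_msi_data, VFIOPCIDevice),
        VMSTATE_END_OF_LIST()
    }
};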

Later, Juan.
