From mboxrd@z Thu Jan 1 00:00:00 1970 From: Cornelia Huck Subject: Re: [PATCH 2/5] vfio/migration: support device of device config capability Date: Tue, 19 Feb 2019 15:37:24 +0100 Message-ID: <20190219153724.12460160.cohuck@redhat.com> References: <1550566254-3545-1-git-send-email-yan.y.zhao@intel.com> <1550566347-3648-1-git-send-email-yan.y.zhao@intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Cc: cjia@nvidia.com, kvm@vger.kernel.org, aik@ozlabs.ru, Zhengxiao.zx@Alibaba-inc.com, shuangtai.tst@alibaba-inc.com, qemu-devel@nongnu.org, kwankhede@nvidia.com, eauger@redhat.com, yi.l.liu@intel.com, eskultet@redhat.com, ziye.yang@intel.com, mlevitsk@redhat.com, pasic@linux.ibm.com, arei.gonglei@huawei.com, felipe@nutanix.com, Ken.Xue@amd.com, kevin.tian@intel.com, dgilbert@redhat.com, alex.williamson@redhat.com, intel-gvt-dev@lists.freedesktop.org, changpeng.liu@intel.com, zhi.a.wang@intel.com, jonathan.davies@nutanix.com To: Yan Zhao Return-path: In-Reply-To: <1550566347-3648-1-git-send-email-yan.y.zhao@intel.com> List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+gceq-qemu-devel2=m.gmane.org@nongnu.org Sender: "Qemu-devel" List-Id: kvm.vger.kernel.org On Tue, 19 Feb 2019 16:52:27 +0800 Yan Zhao wrote: > Device config is the default data that every device should have. so > device config capability is by default on, no need to set. > > - Currently two type of resources are saved/loaded for device of device > config capability: > General PCI config data, and Device config data. > They are copies as a whole when precopy is stopped. > > Migration setup flow: > - Setup device state regions, check its device state version and capabilities. > Mmap Device Config Region and Dirty Bitmap Region, if available. > - If device state regions are failed to get setup, a migration blocker is > registered instead. > - Added SaveVMHandlers to register device state save/load handlers. > - Register VM state change handler to set device's running/stop states. > - On migration startup on source machine, set device's state to > VFIO_DEVICE_STATE_LOGGING > > Signed-off-by: Yan Zhao > Signed-off-by: Yulei Zhang > --- > hw/vfio/Makefile.objs | 2 +- > hw/vfio/migration.c | 633 ++++++++++++++++++++++++++++++++++++++++++ > hw/vfio/pci.c | 1 - > hw/vfio/pci.h | 25 +- > include/hw/vfio/vfio-common.h | 1 + > 5 files changed, 659 insertions(+), 3 deletions(-) > create mode 100644 hw/vfio/migration.c > > diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs > index 8b3f664..f32ff19 100644 > --- a/hw/vfio/Makefile.objs > +++ b/hw/vfio/Makefile.objs > @@ -1,6 +1,6 @@ > ifeq ($(CONFIG_LINUX), y) > obj-$(CONFIG_SOFTMMU) += common.o > -obj-$(CONFIG_PCI) += pci.o pci-quirks.o display.o > +obj-$(CONFIG_PCI) += pci.o pci-quirks.o display.o migration.o I think you want to split the migration code: The type-independent code, and the pci-specific code. > obj-$(CONFIG_VFIO_CCW) += ccw.o > obj-$(CONFIG_SOFTMMU) += platform.o > obj-$(CONFIG_VFIO_XGMAC) += calxeda-xgmac.o > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > new file mode 100644 > index 0000000..16d6395 > --- /dev/null > +++ b/hw/vfio/migration.c > @@ -0,0 +1,633 @@ > +#include "qemu/osdep.h" > + > +#include "hw/vfio/vfio-common.h" > +#include "migration/blocker.h" > +#include "migration/register.h" > +#include "qapi/error.h" > +#include "pci.h" > +#include "sysemu/kvm.h" > +#include "exec/ram_addr.h" > + > +#define VFIO_SAVE_FLAG_SETUP 0 > +#define VFIO_SAVE_FLAG_PCI 1 > +#define VFIO_SAVE_FLAG_DEVCONFIG 2 > +#define VFIO_SAVE_FLAG_DEVMEMORY 4 > +#define VFIO_SAVE_FLAG_CONTINUE 8 > + > +static int vfio_device_state_region_setup(VFIOPCIDevice *vdev, > + VFIORegion *region, uint32_t subtype, const char *name) This function looks like it should be more generic and e.g. take a VFIODevice instead of a VFIOPCIDevice as argument. > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + struct vfio_region_info *info; > + int ret; > + > + ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_DEVICE_STATE, > + subtype, &info); > + if (ret) { > + error_report("Failed to get info of region %s", name); > + return ret; > + } > + > + if (vfio_region_setup(OBJECT(vdev), vbasedev, > + region, info->index, name)) { > + error_report("Failed to setup migrtion region %s", name); > + return ret; > + } > + > + if (vfio_region_mmap(region)) { > + error_report("Failed to mmap migrtion region %s", name); > + } > + > + return 0; > +} > + > +bool vfio_device_data_cap_system_memory(VFIOPCIDevice *vdev) > +{ > + return !!(vdev->migration->data_caps & VFIO_DEVICE_DATA_CAP_SYSTEM_MEMORY); > +} > + > +bool vfio_device_data_cap_device_memory(VFIOPCIDevice *vdev) > +{ > + return !!(vdev->migration->data_caps & VFIO_DEVICE_DATA_CAP_DEVICE_MEMORY); > +} These two as well. The migration structure should probably hang off the VFIODevice instead. > + > +static bool vfio_device_state_region_mmaped(VFIORegion *region) > +{ > + bool mmaped = true; > + if (region->nr_mmaps != 1 || region->mmaps[0].offset || > + (region->size != region->mmaps[0].size) || > + (region->mmaps[0].mmap == NULL)) { > + mmaped = false; > + } > + > + return mmaped; > +} s/mmaped/mmapped/ ? > + > +static int vfio_get_device_config_size(VFIOPCIDevice *vdev) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; This looks like it should not depend on pci, either. > + uint64_t len; > + int sz; > + > + sz = sizeof(len); > + if (pread(vbasedev->fd, &len, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.size)) > + != sz) { > + error_report("vfio: Failed to get length of device config"); > + return -1; > + } > + if (len > region_config->size) { > + error_report("vfio: Error device config length"); > + return -1; > + } > + vdev->migration->devconfig_size = len; > + > + return 0; > +} > + > +static int vfio_set_device_config_size(VFIOPCIDevice *vdev, uint64_t size) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; Ditto. Also for the functions below. > + int sz; > + > + if (size > region_config->size) { > + return -1; > + } > + > + sz = sizeof(size); > + if (pwrite(vbasedev->fd, &size, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.size)) > + != sz) { > + error_report("vfio: Failed to set length of device config"); > + return -1; > + } > + vdev->migration->devconfig_size = size; > + return 0; > +} > + > +static int vfio_save_data_device_config(VFIOPCIDevice *vdev, QEMUFile *f) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; > + void *dest; > + uint32_t sz; > + uint8_t *buf = NULL; > + uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER; > + uint64_t len = vdev->migration->devconfig_size; > + > + qemu_put_be64(f, len); Why big endian? (Generally, do we need any endianness considerations?) > + > + sz = sizeof(action); > + if (pwrite(vbasedev->fd, &action, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.action)) > + != sz) { > + error_report("vfio: action failure for device config get buffer"); > + return -1; > + } Might make sense to wrap this into a set_action() helper that takes a SET_BUFFER/GET_BUFFER argument. > + > + if (!vfio_device_state_region_mmaped(region_config)) { > + buf = g_malloc(len); > + if (buf == NULL) { > + error_report("vfio: Failed to allocate memory for migrate"); > + return -1; > + } > + if (pread(vbasedev->fd, buf, len, region_config->fd_offset) != len) { > + error_report("vfio: Failed read device config buffer"); > + return -1; > + } > + qemu_put_buffer(f, buf, len); > + g_free(buf); > + } else { > + dest = region_config->mmaps[0].mmap; > + qemu_put_buffer(f, dest, len); > + } > + return 0; > +} > + > +static int vfio_load_data_device_config(VFIOPCIDevice *vdev, > + QEMUFile *f, uint64_t len) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; > + void *dest; > + uint32_t sz; > + uint8_t *buf = NULL; > + uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER; > + > + vfio_set_device_config_size(vdev, len); > + > + if (!vfio_device_state_region_mmaped(region_config)) { > + buf = g_malloc(len); > + if (buf == NULL) { > + error_report("vfio: Failed to allocate memory for migrate"); > + return -1; > + } > + qemu_get_buffer(f, buf, len); > + if (pwrite(vbasedev->fd, buf, len, > + region_config->fd_offset) != len) { > + error_report("vfio: Failed to write devie config buffer"); > + return -1; > + } > + g_free(buf); > + } else { > + dest = region_config->mmaps[0].mmap; > + qemu_get_buffer(f, dest, len); > + } > + > + sz = sizeof(action); > + if (pwrite(vbasedev->fd, &action, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.action)) > + != sz) { > + error_report("vfio: action failure for device config set buffer"); > + return -1; > + } > + > + return 0; > +} > + > +static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev, > + uint64_t start_addr, uint64_t page_nr) > +{ > + > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_bitmap = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_BITMAP]; > + unsigned long bitmap_size = > + BITS_TO_LONGS(page_nr) * sizeof(unsigned long); > + uint32_t sz; > + > + struct { > + __u64 start_addr; > + __u64 page_nr; > + } system_memory; > + system_memory.start_addr = start_addr; > + system_memory.page_nr = page_nr; > + sz = sizeof(system_memory); > + if (pwrite(vbasedev->fd, &system_memory, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, system_memory)) > + != sz) { > + error_report("vfio: Failed to set system memory range for dirty pages"); > + return -1; > + } > + > + if (!vfio_device_state_region_mmaped(region_bitmap)) { > + void *bitmap = g_malloc0(bitmap_size); > + > + if (pread(vbasedev->fd, bitmap, bitmap_size, > + region_bitmap->fd_offset) != bitmap_size) { > + error_report("vfio: Failed to read dirty bitmap data"); > + return -1; > + } > + > + cpu_physical_memory_set_dirty_lebitmap(bitmap, start_addr, page_nr); > + > + g_free(bitmap); > + } else { > + cpu_physical_memory_set_dirty_lebitmap( > + region_bitmap->mmaps[0].mmap, > + start_addr, page_nr); > + } > + return 0; > +} > + > +int vfio_set_dirty_page_bitmap(VFIOPCIDevice *vdev, > + uint64_t start_addr, uint64_t page_nr) > +{ > + VFIORegion *region_bitmap = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_BITMAP]; > + unsigned long chunk_size = region_bitmap->size; > + uint64_t chunk_pg_nr = (chunk_size / sizeof(unsigned long)) * > + BITS_PER_LONG; > + > + uint64_t cnt_left; > + int rc = 0; > + > + cnt_left = page_nr; > + > + while (cnt_left >= chunk_pg_nr) { > + rc = vfio_set_dirty_page_bitmap_chunk(vdev, start_addr, chunk_pg_nr); > + if (rc) { > + goto exit; > + } > + cnt_left -= chunk_pg_nr; > + start_addr += start_addr; > + } > + rc = vfio_set_dirty_page_bitmap_chunk(vdev, start_addr, cnt_left); > + > +exit: > + return rc; > +} > + > +static int vfio_set_device_state(VFIOPCIDevice *vdev, > + uint32_t dev_state) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + uint32_t sz = sizeof(dev_state); > + > + if (!vdev->migration) { > + return -1; > + } > + > + if (pwrite(vbasedev->fd, &dev_state, sz, > + region->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_state)) > + != sz) { > + error_report("vfio: Failed to set device state %d", dev_state); Can the kernel reject this if a state transition is not allowed (or are all transitions allowed?) > + return -1; > + } > + vdev->migration->device_state = dev_state; > + return 0; > +} > + > +static int vfio_get_device_data_caps(VFIOPCIDevice *vdev) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + > + uint32_t caps; > + uint32_t size = sizeof(caps); > + > + if (pread(vbasedev->fd, &caps, size, > + region->fd_offset + > + offsetof(struct vfio_device_state_ctl, caps)) > + != size) { > + error_report("%s Failed to read data caps of device states", > + vbasedev->name); > + return -1; > + } > + vdev->migration->data_caps = caps; > + return 0; > +} > + > + > +static int vfio_check_devstate_version(VFIOPCIDevice *vdev) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + > + uint32_t version; > + uint32_t size = sizeof(version); > + > + if (pread(vbasedev->fd, &version, size, > + region->fd_offset + > + offsetof(struct vfio_device_state_ctl, version)) > + != size) { > + error_report("%s Failed to read version of device state interfaces", > + vbasedev->name); > + return -1; > + } > + > + if (version != VFIO_DEVICE_STATE_INTERFACE_VERSION) { > + error_report("%s migration version mismatch, right version is %d", > + vbasedev->name, VFIO_DEVICE_STATE_INTERFACE_VERSION); So, we require an exact match... or should we allow to extend the interface in an backwards-compatible way, in which case we'd require (QEMU interface version) <= (kernel interface version)? > + return -1; > + } > + > + return 0; > +} > + > +static void vfio_vm_change_state_handler(void *pv, int running, RunState state) > +{ > + VFIOPCIDevice *vdev = pv; > + uint32_t dev_state = vdev->migration->device_state; > + > + if (!running) { > + dev_state |= VFIO_DEVICE_STATE_STOP; > + } else { > + dev_state &= ~VFIO_DEVICE_STATE_STOP; > + } > + > + vfio_set_device_state(vdev, dev_state); > +} > + > +static void vfio_save_live_pending(QEMUFile *f, void *opaque, > + uint64_t max_size, > + uint64_t *res_precopy_only, > + uint64_t *res_compatible, > + uint64_t *res_post_copy_only) > +{ > + VFIOPCIDevice *vdev = opaque; > + > + if (!vfio_device_data_cap_device_memory(vdev)) { > + return; > + } > + > + return; > +} > + > +static int vfio_save_iterate(QEMUFile *f, void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + > + if (!vfio_device_data_cap_device_memory(vdev)) { > + return 0; > + } > + > + return 0; > +} These look a bit weird... > + > +static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f) > +{ > + PCIDevice *pdev = &vdev->pdev; > + uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i; > + bool msi_64bit; > + > + /* retore pci bar configuration */ > + ctl = pci_default_read_config(pdev, PCI_COMMAND, 2); > + vfio_pci_write_config(pdev, PCI_COMMAND, > + ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2); > + for (i = 0; i < PCI_ROM_SLOT; i++) { > + bar_cfg = qemu_get_be32(f); > + vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4); > + } > + vfio_pci_write_config(pdev, PCI_COMMAND, > + ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2); > + > + /* restore msi configuration */ > + ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); > + msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT); > + > + vfio_pci_write_config(&vdev->pdev, > + pdev->msi_cap + PCI_MSI_FLAGS, > + ctl & (!PCI_MSI_FLAGS_ENABLE), 2); > + > + msi_lo = qemu_get_be32(f); > + vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4); > + > + if (msi_64bit) { > + msi_hi = qemu_get_be32(f); > + vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, > + msi_hi, 4); > + } > + msi_data = qemu_get_be32(f); > + vfio_pci_write_config(pdev, > + pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), > + msi_data, 2); > + > + vfio_pci_write_config(&vdev->pdev, pdev->msi_cap + PCI_MSI_FLAGS, > + ctl | PCI_MSI_FLAGS_ENABLE, 2); > + Ok, this function is indeed pci-specific and probably should be moved to the vfio-pci code (other types could hook themselves up in the same place, then). > +} > + > +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) > +{ > + VFIOPCIDevice *vdev = opaque; > + int flag; > + uint64_t len; > + int ret = 0; > + > + if (version_id != VFIO_DEVICE_STATE_INTERFACE_VERSION) { > + return -EINVAL; > + } > + > + do { > + flag = qemu_get_byte(f); > + > + switch (flag & ~VFIO_SAVE_FLAG_CONTINUE) { > + case VFIO_SAVE_FLAG_SETUP: > + break; > + case VFIO_SAVE_FLAG_PCI: > + vfio_pci_load_config(vdev, f); > + break; > + case VFIO_SAVE_FLAG_DEVCONFIG: > + len = qemu_get_be64(f); > + vfio_load_data_device_config(vdev, f, len); > + break; > + default: > + ret = -EINVAL; > + } > + } while (flag & VFIO_SAVE_FLAG_CONTINUE); > + > + return ret; > +} > + > +static void vfio_pci_save_config(VFIOPCIDevice *vdev, QEMUFile *f) > +{ > + PCIDevice *pdev = &vdev->pdev; > + uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i; > + bool msi_64bit; > + > + for (i = 0; i < PCI_ROM_SLOT; i++) { > + bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4); > + qemu_put_be32(f, bar_cfg); > + } > + > + msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); > + msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT); > + > + msi_lo = pci_default_read_config(pdev, > + pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4); > + qemu_put_be32(f, msi_lo); > + > + if (msi_64bit) { > + msi_hi = pci_default_read_config(pdev, > + pdev->msi_cap + PCI_MSI_ADDRESS_HI, > + 4); > + qemu_put_be32(f, msi_hi); > + } > + > + msi_data = pci_default_read_config(pdev, > + pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), > + 2); > + qemu_put_be32(f, msi_data); > + > +} > + > +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + int rc = 0; > + > + qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE); > + vfio_pci_save_config(vdev, f); > + > + qemu_put_byte(f, VFIO_SAVE_FLAG_DEVCONFIG); > + rc += vfio_get_device_config_size(vdev); > + rc += vfio_save_data_device_config(vdev, f); > + > + return rc; > +} > + > +static int vfio_save_setup(QEMUFile *f, void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP); > + > + vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING | > + VFIO_DEVICE_STATE_LOGGING); > + return 0; > +} > + > +static int vfio_load_setup(QEMUFile *f, void *opaque) > +{ > + return 0; > +} > + > +static void vfio_save_cleanup(void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + uint32_t dev_state = vdev->migration->device_state; > + > + dev_state &= ~VFIO_DEVICE_STATE_LOGGING; > + > + vfio_set_device_state(vdev, dev_state); > +} These look like they should be type-independent, again. > + > +static SaveVMHandlers savevm_vfio_handlers = { > + .save_setup = vfio_save_setup, > + .save_live_pending = vfio_save_live_pending, > + .save_live_iterate = vfio_save_iterate, > + .save_live_complete_precopy = vfio_save_complete_precopy, > + .save_cleanup = vfio_save_cleanup, > + .load_setup = vfio_load_setup, > + .load_state = vfio_load_state, > +}; > + > +int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp) > +{ > + int ret; > + Error *local_err = NULL; > + vdev->migration = g_new0(VFIOMigration, 1); > + > + if (vfio_device_state_region_setup(vdev, > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL], > + VFIO_REGION_SUBTYPE_DEVICE_STATE_CTL, > + "device-state-ctl")) { > + goto error; > + } > + > + if (vfio_check_devstate_version(vdev)) { > + goto error; > + } > + > + if (vfio_get_device_data_caps(vdev)) { > + goto error; > + } > + > + if (vfio_device_state_region_setup(vdev, > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG], > + VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_CONFIG, > + "device-state-data-device-config")) { > + goto error; > + } > + > + if (vfio_device_data_cap_device_memory(vdev)) { > + error_report("No suppport of data cap device memory Yet"); > + goto error; > + } > + > + if (vfio_device_data_cap_system_memory(vdev) && > + vfio_device_state_region_setup(vdev, > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_BITMAP], > + VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_DIRTYBITMAP, > + "device-state-data-dirtybitmap")) { > + goto error; > + } > + > + vdev->migration->device_state = VFIO_DEVICE_STATE_RUNNING; > + > + register_savevm_live(NULL, TYPE_VFIO_PCI, -1, > + VFIO_DEVICE_STATE_INTERFACE_VERSION, > + &savevm_vfio_handlers, > + vdev); > + > + vdev->migration->vm_state = > + qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev); > + > + return 0; > +error: > + error_setg(&vdev->migration_blocker, > + "VFIO device doesn't support migration"); > + ret = migrate_add_blocker(vdev->migration_blocker, &local_err); > + if (local_err) { > + error_propagate(errp, local_err); > + error_free(vdev->migration_blocker); > + } > + > + g_free(vdev->migration); > + vdev->migration = NULL; > + > + return ret; > +} > + > +void vfio_migration_finalize(VFIOPCIDevice *vdev) > +{ > + if (vdev->migration) { > + int i; > + qemu_del_vm_change_state_handler(vdev->migration->vm_state); > + unregister_savevm(NULL, TYPE_VFIO_PCI, vdev); > + for (i = 0; i < VFIO_DEVSTATE_REGION_NUM; i++) { > + vfio_region_finalize(&vdev->migration->region[i]); > + } > + g_free(vdev->migration); > + vdev->migration = NULL; > + } else if (vdev->migration_blocker) { > + migrate_del_blocker(vdev->migration_blocker); > + error_free(vdev->migration_blocker); > + } > +} > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index c0cb1ec..b8e006b 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -37,7 +37,6 @@ > > #define MSIX_CAP_LENGTH 12 > > -#define TYPE_VFIO_PCI "vfio-pci" Why do you need to move this? That looks like a sign that the layering needs work. > #define PCI_VFIO(obj) OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > > static void vfio_disable_interrupts(VFIOPCIDevice *vdev); > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index b1ae4c0..4b7b1bb 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -19,6 +19,7 @@ > #include "qemu/event_notifier.h" > #include "qemu/queue.h" > #include "qemu/timer.h" > +#include "sysemu/sysemu.h" > > #define PCI_ANY_ID (~0) > > @@ -56,6 +57,21 @@ typedef struct VFIOBAR { > QLIST_HEAD(, VFIOQuirk) quirks; > } VFIOBAR; > > +enum { > + VFIO_DEVSTATE_REGION_CTL = 0, > + VFIO_DEVSTATE_REGION_DATA_CONFIG, > + VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY, > + VFIO_DEVSTATE_REGION_DATA_BITMAP, > + VFIO_DEVSTATE_REGION_NUM, > +}; > +typedef struct VFIOMigration { > + VFIORegion region[VFIO_DEVSTATE_REGION_NUM]; > + uint32_t data_caps; > + uint32_t device_state; > + uint64_t devconfig_size; > + VMChangeStateEntry *vm_state; > +} VFIOMigration; > + > typedef struct VFIOVGARegion { > MemoryRegion mem; > off_t offset; > @@ -132,6 +148,8 @@ typedef struct VFIOPCIDevice { > VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ > void *igd_opregion; > + VFIOMigration *migration; As said, it would probably be better to hang this off VFIODevice. > + Error *migration_blocker; > PCIHostDeviceAddress host; > EventNotifier err_notifier; > EventNotifier req_notifier; > @@ -198,5 +216,10 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, > void vfio_display_reset(VFIOPCIDevice *vdev); > int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); > void vfio_display_finalize(VFIOPCIDevice *vdev); > - > +bool vfio_device_data_cap_system_memory(VFIOPCIDevice *vdev); > +bool vfio_device_data_cap_device_memory(VFIOPCIDevice *vdev); > +int vfio_set_dirty_page_bitmap(VFIOPCIDevice *vdev, > + uint64_t start_addr, uint64_t page_nr); > +int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp); > +void vfio_migration_finalize(VFIOPCIDevice *vdev); And the interfaces should be in vfio-common. > #endif /* HW_VFIO_VFIO_PCI_H */ > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 1b434d0..ed43613 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -32,6 +32,7 @@ > #endif > > #define VFIO_MSG_PREFIX "vfio %s: " > +#define TYPE_VFIO_PCI "vfio-pci" > > enum { > VFIO_DEVICE_TYPE_PCI = 0, From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([209.51.188.92]:50634) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1gw6X0-0002OM-Jj for qemu-devel@nongnu.org; Tue, 19 Feb 2019 09:38:01 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1gw6Wv-0005mx-Gn for qemu-devel@nongnu.org; Tue, 19 Feb 2019 09:37:56 -0500 Received: from mx1.redhat.com ([209.132.183.28]:35710) by eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32) (Exim 4.71) (envelope-from ) id 1gw6Wm-0005ho-Am for qemu-devel@nongnu.org; Tue, 19 Feb 2019 09:37:48 -0500 Date: Tue, 19 Feb 2019 15:37:24 +0100 From: Cornelia Huck Message-ID: <20190219153724.12460160.cohuck@redhat.com> In-Reply-To: <1550566347-3648-1-git-send-email-yan.y.zhao@intel.com> References: <1550566254-3545-1-git-send-email-yan.y.zhao@intel.com> <1550566347-3648-1-git-send-email-yan.y.zhao@intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH 2/5] vfio/migration: support device of device config capability List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Yan Zhao Cc: alex.williamson@redhat.com, qemu-devel@nongnu.org, intel-gvt-dev@lists.freedesktop.org, Zhengxiao.zx@Alibaba-inc.com, yi.l.liu@intel.com, eskultet@redhat.com, ziye.yang@intel.com, shuangtai.tst@alibaba-inc.com, dgilbert@redhat.com, zhi.a.wang@intel.com, mlevitsk@redhat.com, pasic@linux.ibm.com, aik@ozlabs.ru, eauger@redhat.com, felipe@nutanix.com, jonathan.davies@nutanix.com, changpeng.liu@intel.com, Ken.Xue@amd.com, kwankhede@nvidia.com, kevin.tian@intel.com, cjia@nvidia.com, arei.gonglei@huawei.com, kvm@vger.kernel.org On Tue, 19 Feb 2019 16:52:27 +0800 Yan Zhao wrote: > Device config is the default data that every device should have. so > device config capability is by default on, no need to set. > > - Currently two type of resources are saved/loaded for device of device > config capability: > General PCI config data, and Device config data. > They are copies as a whole when precopy is stopped. > > Migration setup flow: > - Setup device state regions, check its device state version and capabilities. > Mmap Device Config Region and Dirty Bitmap Region, if available. > - If device state regions are failed to get setup, a migration blocker is > registered instead. > - Added SaveVMHandlers to register device state save/load handlers. > - Register VM state change handler to set device's running/stop states. > - On migration startup on source machine, set device's state to > VFIO_DEVICE_STATE_LOGGING > > Signed-off-by: Yan Zhao > Signed-off-by: Yulei Zhang > --- > hw/vfio/Makefile.objs | 2 +- > hw/vfio/migration.c | 633 ++++++++++++++++++++++++++++++++++++++++++ > hw/vfio/pci.c | 1 - > hw/vfio/pci.h | 25 +- > include/hw/vfio/vfio-common.h | 1 + > 5 files changed, 659 insertions(+), 3 deletions(-) > create mode 100644 hw/vfio/migration.c > > diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs > index 8b3f664..f32ff19 100644 > --- a/hw/vfio/Makefile.objs > +++ b/hw/vfio/Makefile.objs > @@ -1,6 +1,6 @@ > ifeq ($(CONFIG_LINUX), y) > obj-$(CONFIG_SOFTMMU) += common.o > -obj-$(CONFIG_PCI) += pci.o pci-quirks.o display.o > +obj-$(CONFIG_PCI) += pci.o pci-quirks.o display.o migration.o I think you want to split the migration code: The type-independent code, and the pci-specific code. > obj-$(CONFIG_VFIO_CCW) += ccw.o > obj-$(CONFIG_SOFTMMU) += platform.o > obj-$(CONFIG_VFIO_XGMAC) += calxeda-xgmac.o > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > new file mode 100644 > index 0000000..16d6395 > --- /dev/null > +++ b/hw/vfio/migration.c > @@ -0,0 +1,633 @@ > +#include "qemu/osdep.h" > + > +#include "hw/vfio/vfio-common.h" > +#include "migration/blocker.h" > +#include "migration/register.h" > +#include "qapi/error.h" > +#include "pci.h" > +#include "sysemu/kvm.h" > +#include "exec/ram_addr.h" > + > +#define VFIO_SAVE_FLAG_SETUP 0 > +#define VFIO_SAVE_FLAG_PCI 1 > +#define VFIO_SAVE_FLAG_DEVCONFIG 2 > +#define VFIO_SAVE_FLAG_DEVMEMORY 4 > +#define VFIO_SAVE_FLAG_CONTINUE 8 > + > +static int vfio_device_state_region_setup(VFIOPCIDevice *vdev, > + VFIORegion *region, uint32_t subtype, const char *name) This function looks like it should be more generic and e.g. take a VFIODevice instead of a VFIOPCIDevice as argument. > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + struct vfio_region_info *info; > + int ret; > + > + ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_DEVICE_STATE, > + subtype, &info); > + if (ret) { > + error_report("Failed to get info of region %s", name); > + return ret; > + } > + > + if (vfio_region_setup(OBJECT(vdev), vbasedev, > + region, info->index, name)) { > + error_report("Failed to setup migrtion region %s", name); > + return ret; > + } > + > + if (vfio_region_mmap(region)) { > + error_report("Failed to mmap migrtion region %s", name); > + } > + > + return 0; > +} > + > +bool vfio_device_data_cap_system_memory(VFIOPCIDevice *vdev) > +{ > + return !!(vdev->migration->data_caps & VFIO_DEVICE_DATA_CAP_SYSTEM_MEMORY); > +} > + > +bool vfio_device_data_cap_device_memory(VFIOPCIDevice *vdev) > +{ > + return !!(vdev->migration->data_caps & VFIO_DEVICE_DATA_CAP_DEVICE_MEMORY); > +} These two as well. The migration structure should probably hang off the VFIODevice instead. > + > +static bool vfio_device_state_region_mmaped(VFIORegion *region) > +{ > + bool mmaped = true; > + if (region->nr_mmaps != 1 || region->mmaps[0].offset || > + (region->size != region->mmaps[0].size) || > + (region->mmaps[0].mmap == NULL)) { > + mmaped = false; > + } > + > + return mmaped; > +} s/mmaped/mmapped/ ? > + > +static int vfio_get_device_config_size(VFIOPCIDevice *vdev) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; This looks like it should not depend on pci, either. > + uint64_t len; > + int sz; > + > + sz = sizeof(len); > + if (pread(vbasedev->fd, &len, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.size)) > + != sz) { > + error_report("vfio: Failed to get length of device config"); > + return -1; > + } > + if (len > region_config->size) { > + error_report("vfio: Error device config length"); > + return -1; > + } > + vdev->migration->devconfig_size = len; > + > + return 0; > +} > + > +static int vfio_set_device_config_size(VFIOPCIDevice *vdev, uint64_t size) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; Ditto. Also for the functions below. > + int sz; > + > + if (size > region_config->size) { > + return -1; > + } > + > + sz = sizeof(size); > + if (pwrite(vbasedev->fd, &size, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.size)) > + != sz) { > + error_report("vfio: Failed to set length of device config"); > + return -1; > + } > + vdev->migration->devconfig_size = size; > + return 0; > +} > + > +static int vfio_save_data_device_config(VFIOPCIDevice *vdev, QEMUFile *f) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; > + void *dest; > + uint32_t sz; > + uint8_t *buf = NULL; > + uint32_t action = VFIO_DEVICE_DATA_ACTION_GET_BUFFER; > + uint64_t len = vdev->migration->devconfig_size; > + > + qemu_put_be64(f, len); Why big endian? (Generally, do we need any endianness considerations?) > + > + sz = sizeof(action); > + if (pwrite(vbasedev->fd, &action, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.action)) > + != sz) { > + error_report("vfio: action failure for device config get buffer"); > + return -1; > + } Might make sense to wrap this into a set_action() helper that takes a SET_BUFFER/GET_BUFFER argument. > + > + if (!vfio_device_state_region_mmaped(region_config)) { > + buf = g_malloc(len); > + if (buf == NULL) { > + error_report("vfio: Failed to allocate memory for migrate"); > + return -1; > + } > + if (pread(vbasedev->fd, buf, len, region_config->fd_offset) != len) { > + error_report("vfio: Failed read device config buffer"); > + return -1; > + } > + qemu_put_buffer(f, buf, len); > + g_free(buf); > + } else { > + dest = region_config->mmaps[0].mmap; > + qemu_put_buffer(f, dest, len); > + } > + return 0; > +} > + > +static int vfio_load_data_device_config(VFIOPCIDevice *vdev, > + QEMUFile *f, uint64_t len) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_config = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG]; > + void *dest; > + uint32_t sz; > + uint8_t *buf = NULL; > + uint32_t action = VFIO_DEVICE_DATA_ACTION_SET_BUFFER; > + > + vfio_set_device_config_size(vdev, len); > + > + if (!vfio_device_state_region_mmaped(region_config)) { > + buf = g_malloc(len); > + if (buf == NULL) { > + error_report("vfio: Failed to allocate memory for migrate"); > + return -1; > + } > + qemu_get_buffer(f, buf, len); > + if (pwrite(vbasedev->fd, buf, len, > + region_config->fd_offset) != len) { > + error_report("vfio: Failed to write devie config buffer"); > + return -1; > + } > + g_free(buf); > + } else { > + dest = region_config->mmaps[0].mmap; > + qemu_get_buffer(f, dest, len); > + } > + > + sz = sizeof(action); > + if (pwrite(vbasedev->fd, &action, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_config.action)) > + != sz) { > + error_report("vfio: action failure for device config set buffer"); > + return -1; > + } > + > + return 0; > +} > + > +static int vfio_set_dirty_page_bitmap_chunk(VFIOPCIDevice *vdev, > + uint64_t start_addr, uint64_t page_nr) > +{ > + > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region_ctl = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + VFIORegion *region_bitmap = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_BITMAP]; > + unsigned long bitmap_size = > + BITS_TO_LONGS(page_nr) * sizeof(unsigned long); > + uint32_t sz; > + > + struct { > + __u64 start_addr; > + __u64 page_nr; > + } system_memory; > + system_memory.start_addr = start_addr; > + system_memory.page_nr = page_nr; > + sz = sizeof(system_memory); > + if (pwrite(vbasedev->fd, &system_memory, sz, > + region_ctl->fd_offset + > + offsetof(struct vfio_device_state_ctl, system_memory)) > + != sz) { > + error_report("vfio: Failed to set system memory range for dirty pages"); > + return -1; > + } > + > + if (!vfio_device_state_region_mmaped(region_bitmap)) { > + void *bitmap = g_malloc0(bitmap_size); > + > + if (pread(vbasedev->fd, bitmap, bitmap_size, > + region_bitmap->fd_offset) != bitmap_size) { > + error_report("vfio: Failed to read dirty bitmap data"); > + return -1; > + } > + > + cpu_physical_memory_set_dirty_lebitmap(bitmap, start_addr, page_nr); > + > + g_free(bitmap); > + } else { > + cpu_physical_memory_set_dirty_lebitmap( > + region_bitmap->mmaps[0].mmap, > + start_addr, page_nr); > + } > + return 0; > +} > + > +int vfio_set_dirty_page_bitmap(VFIOPCIDevice *vdev, > + uint64_t start_addr, uint64_t page_nr) > +{ > + VFIORegion *region_bitmap = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_BITMAP]; > + unsigned long chunk_size = region_bitmap->size; > + uint64_t chunk_pg_nr = (chunk_size / sizeof(unsigned long)) * > + BITS_PER_LONG; > + > + uint64_t cnt_left; > + int rc = 0; > + > + cnt_left = page_nr; > + > + while (cnt_left >= chunk_pg_nr) { > + rc = vfio_set_dirty_page_bitmap_chunk(vdev, start_addr, chunk_pg_nr); > + if (rc) { > + goto exit; > + } > + cnt_left -= chunk_pg_nr; > + start_addr += start_addr; > + } > + rc = vfio_set_dirty_page_bitmap_chunk(vdev, start_addr, cnt_left); > + > +exit: > + return rc; > +} > + > +static int vfio_set_device_state(VFIOPCIDevice *vdev, > + uint32_t dev_state) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + uint32_t sz = sizeof(dev_state); > + > + if (!vdev->migration) { > + return -1; > + } > + > + if (pwrite(vbasedev->fd, &dev_state, sz, > + region->fd_offset + > + offsetof(struct vfio_device_state_ctl, device_state)) > + != sz) { > + error_report("vfio: Failed to set device state %d", dev_state); Can the kernel reject this if a state transition is not allowed (or are all transitions allowed?) > + return -1; > + } > + vdev->migration->device_state = dev_state; > + return 0; > +} > + > +static int vfio_get_device_data_caps(VFIOPCIDevice *vdev) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + > + uint32_t caps; > + uint32_t size = sizeof(caps); > + > + if (pread(vbasedev->fd, &caps, size, > + region->fd_offset + > + offsetof(struct vfio_device_state_ctl, caps)) > + != size) { > + error_report("%s Failed to read data caps of device states", > + vbasedev->name); > + return -1; > + } > + vdev->migration->data_caps = caps; > + return 0; > +} > + > + > +static int vfio_check_devstate_version(VFIOPCIDevice *vdev) > +{ > + VFIODevice *vbasedev = &vdev->vbasedev; > + VFIORegion *region = > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL]; > + > + uint32_t version; > + uint32_t size = sizeof(version); > + > + if (pread(vbasedev->fd, &version, size, > + region->fd_offset + > + offsetof(struct vfio_device_state_ctl, version)) > + != size) { > + error_report("%s Failed to read version of device state interfaces", > + vbasedev->name); > + return -1; > + } > + > + if (version != VFIO_DEVICE_STATE_INTERFACE_VERSION) { > + error_report("%s migration version mismatch, right version is %d", > + vbasedev->name, VFIO_DEVICE_STATE_INTERFACE_VERSION); So, we require an exact match... or should we allow to extend the interface in an backwards-compatible way, in which case we'd require (QEMU interface version) <= (kernel interface version)? > + return -1; > + } > + > + return 0; > +} > + > +static void vfio_vm_change_state_handler(void *pv, int running, RunState state) > +{ > + VFIOPCIDevice *vdev = pv; > + uint32_t dev_state = vdev->migration->device_state; > + > + if (!running) { > + dev_state |= VFIO_DEVICE_STATE_STOP; > + } else { > + dev_state &= ~VFIO_DEVICE_STATE_STOP; > + } > + > + vfio_set_device_state(vdev, dev_state); > +} > + > +static void vfio_save_live_pending(QEMUFile *f, void *opaque, > + uint64_t max_size, > + uint64_t *res_precopy_only, > + uint64_t *res_compatible, > + uint64_t *res_post_copy_only) > +{ > + VFIOPCIDevice *vdev = opaque; > + > + if (!vfio_device_data_cap_device_memory(vdev)) { > + return; > + } > + > + return; > +} > + > +static int vfio_save_iterate(QEMUFile *f, void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + > + if (!vfio_device_data_cap_device_memory(vdev)) { > + return 0; > + } > + > + return 0; > +} These look a bit weird... > + > +static void vfio_pci_load_config(VFIOPCIDevice *vdev, QEMUFile *f) > +{ > + PCIDevice *pdev = &vdev->pdev; > + uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i; > + bool msi_64bit; > + > + /* retore pci bar configuration */ > + ctl = pci_default_read_config(pdev, PCI_COMMAND, 2); > + vfio_pci_write_config(pdev, PCI_COMMAND, > + ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2); > + for (i = 0; i < PCI_ROM_SLOT; i++) { > + bar_cfg = qemu_get_be32(f); > + vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4); > + } > + vfio_pci_write_config(pdev, PCI_COMMAND, > + ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2); > + > + /* restore msi configuration */ > + ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); > + msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT); > + > + vfio_pci_write_config(&vdev->pdev, > + pdev->msi_cap + PCI_MSI_FLAGS, > + ctl & (!PCI_MSI_FLAGS_ENABLE), 2); > + > + msi_lo = qemu_get_be32(f); > + vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4); > + > + if (msi_64bit) { > + msi_hi = qemu_get_be32(f); > + vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, > + msi_hi, 4); > + } > + msi_data = qemu_get_be32(f); > + vfio_pci_write_config(pdev, > + pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), > + msi_data, 2); > + > + vfio_pci_write_config(&vdev->pdev, pdev->msi_cap + PCI_MSI_FLAGS, > + ctl | PCI_MSI_FLAGS_ENABLE, 2); > + Ok, this function is indeed pci-specific and probably should be moved to the vfio-pci code (other types could hook themselves up in the same place, then). > +} > + > +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) > +{ > + VFIOPCIDevice *vdev = opaque; > + int flag; > + uint64_t len; > + int ret = 0; > + > + if (version_id != VFIO_DEVICE_STATE_INTERFACE_VERSION) { > + return -EINVAL; > + } > + > + do { > + flag = qemu_get_byte(f); > + > + switch (flag & ~VFIO_SAVE_FLAG_CONTINUE) { > + case VFIO_SAVE_FLAG_SETUP: > + break; > + case VFIO_SAVE_FLAG_PCI: > + vfio_pci_load_config(vdev, f); > + break; > + case VFIO_SAVE_FLAG_DEVCONFIG: > + len = qemu_get_be64(f); > + vfio_load_data_device_config(vdev, f, len); > + break; > + default: > + ret = -EINVAL; > + } > + } while (flag & VFIO_SAVE_FLAG_CONTINUE); > + > + return ret; > +} > + > +static void vfio_pci_save_config(VFIOPCIDevice *vdev, QEMUFile *f) > +{ > + PCIDevice *pdev = &vdev->pdev; > + uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i; > + bool msi_64bit; > + > + for (i = 0; i < PCI_ROM_SLOT; i++) { > + bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4); > + qemu_put_be32(f, bar_cfg); > + } > + > + msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); > + msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT); > + > + msi_lo = pci_default_read_config(pdev, > + pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4); > + qemu_put_be32(f, msi_lo); > + > + if (msi_64bit) { > + msi_hi = pci_default_read_config(pdev, > + pdev->msi_cap + PCI_MSI_ADDRESS_HI, > + 4); > + qemu_put_be32(f, msi_hi); > + } > + > + msi_data = pci_default_read_config(pdev, > + pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), > + 2); > + qemu_put_be32(f, msi_data); > + > +} > + > +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + int rc = 0; > + > + qemu_put_byte(f, VFIO_SAVE_FLAG_PCI | VFIO_SAVE_FLAG_CONTINUE); > + vfio_pci_save_config(vdev, f); > + > + qemu_put_byte(f, VFIO_SAVE_FLAG_DEVCONFIG); > + rc += vfio_get_device_config_size(vdev); > + rc += vfio_save_data_device_config(vdev, f); > + > + return rc; > +} > + > +static int vfio_save_setup(QEMUFile *f, void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + qemu_put_byte(f, VFIO_SAVE_FLAG_SETUP); > + > + vfio_set_device_state(vdev, VFIO_DEVICE_STATE_RUNNING | > + VFIO_DEVICE_STATE_LOGGING); > + return 0; > +} > + > +static int vfio_load_setup(QEMUFile *f, void *opaque) > +{ > + return 0; > +} > + > +static void vfio_save_cleanup(void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + uint32_t dev_state = vdev->migration->device_state; > + > + dev_state &= ~VFIO_DEVICE_STATE_LOGGING; > + > + vfio_set_device_state(vdev, dev_state); > +} These look like they should be type-independent, again. > + > +static SaveVMHandlers savevm_vfio_handlers = { > + .save_setup = vfio_save_setup, > + .save_live_pending = vfio_save_live_pending, > + .save_live_iterate = vfio_save_iterate, > + .save_live_complete_precopy = vfio_save_complete_precopy, > + .save_cleanup = vfio_save_cleanup, > + .load_setup = vfio_load_setup, > + .load_state = vfio_load_state, > +}; > + > +int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp) > +{ > + int ret; > + Error *local_err = NULL; > + vdev->migration = g_new0(VFIOMigration, 1); > + > + if (vfio_device_state_region_setup(vdev, > + &vdev->migration->region[VFIO_DEVSTATE_REGION_CTL], > + VFIO_REGION_SUBTYPE_DEVICE_STATE_CTL, > + "device-state-ctl")) { > + goto error; > + } > + > + if (vfio_check_devstate_version(vdev)) { > + goto error; > + } > + > + if (vfio_get_device_data_caps(vdev)) { > + goto error; > + } > + > + if (vfio_device_state_region_setup(vdev, > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_CONFIG], > + VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_CONFIG, > + "device-state-data-device-config")) { > + goto error; > + } > + > + if (vfio_device_data_cap_device_memory(vdev)) { > + error_report("No suppport of data cap device memory Yet"); > + goto error; > + } > + > + if (vfio_device_data_cap_system_memory(vdev) && > + vfio_device_state_region_setup(vdev, > + &vdev->migration->region[VFIO_DEVSTATE_REGION_DATA_BITMAP], > + VFIO_REGION_SUBTYPE_DEVICE_STATE_DATA_DIRTYBITMAP, > + "device-state-data-dirtybitmap")) { > + goto error; > + } > + > + vdev->migration->device_state = VFIO_DEVICE_STATE_RUNNING; > + > + register_savevm_live(NULL, TYPE_VFIO_PCI, -1, > + VFIO_DEVICE_STATE_INTERFACE_VERSION, > + &savevm_vfio_handlers, > + vdev); > + > + vdev->migration->vm_state = > + qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev); > + > + return 0; > +error: > + error_setg(&vdev->migration_blocker, > + "VFIO device doesn't support migration"); > + ret = migrate_add_blocker(vdev->migration_blocker, &local_err); > + if (local_err) { > + error_propagate(errp, local_err); > + error_free(vdev->migration_blocker); > + } > + > + g_free(vdev->migration); > + vdev->migration = NULL; > + > + return ret; > +} > + > +void vfio_migration_finalize(VFIOPCIDevice *vdev) > +{ > + if (vdev->migration) { > + int i; > + qemu_del_vm_change_state_handler(vdev->migration->vm_state); > + unregister_savevm(NULL, TYPE_VFIO_PCI, vdev); > + for (i = 0; i < VFIO_DEVSTATE_REGION_NUM; i++) { > + vfio_region_finalize(&vdev->migration->region[i]); > + } > + g_free(vdev->migration); > + vdev->migration = NULL; > + } else if (vdev->migration_blocker) { > + migrate_del_blocker(vdev->migration_blocker); > + error_free(vdev->migration_blocker); > + } > +} > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index c0cb1ec..b8e006b 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -37,7 +37,6 @@ > > #define MSIX_CAP_LENGTH 12 > > -#define TYPE_VFIO_PCI "vfio-pci" Why do you need to move this? That looks like a sign that the layering needs work. > #define PCI_VFIO(obj) OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > > static void vfio_disable_interrupts(VFIOPCIDevice *vdev); > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index b1ae4c0..4b7b1bb 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -19,6 +19,7 @@ > #include "qemu/event_notifier.h" > #include "qemu/queue.h" > #include "qemu/timer.h" > +#include "sysemu/sysemu.h" > > #define PCI_ANY_ID (~0) > > @@ -56,6 +57,21 @@ typedef struct VFIOBAR { > QLIST_HEAD(, VFIOQuirk) quirks; > } VFIOBAR; > > +enum { > + VFIO_DEVSTATE_REGION_CTL = 0, > + VFIO_DEVSTATE_REGION_DATA_CONFIG, > + VFIO_DEVSTATE_REGION_DATA_DEVICE_MEMORY, > + VFIO_DEVSTATE_REGION_DATA_BITMAP, > + VFIO_DEVSTATE_REGION_NUM, > +}; > +typedef struct VFIOMigration { > + VFIORegion region[VFIO_DEVSTATE_REGION_NUM]; > + uint32_t data_caps; > + uint32_t device_state; > + uint64_t devconfig_size; > + VMChangeStateEntry *vm_state; > +} VFIOMigration; > + > typedef struct VFIOVGARegion { > MemoryRegion mem; > off_t offset; > @@ -132,6 +148,8 @@ typedef struct VFIOPCIDevice { > VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ > void *igd_opregion; > + VFIOMigration *migration; As said, it would probably be better to hang this off VFIODevice. > + Error *migration_blocker; > PCIHostDeviceAddress host; > EventNotifier err_notifier; > EventNotifier req_notifier; > @@ -198,5 +216,10 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, > void vfio_display_reset(VFIOPCIDevice *vdev); > int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); > void vfio_display_finalize(VFIOPCIDevice *vdev); > - > +bool vfio_device_data_cap_system_memory(VFIOPCIDevice *vdev); > +bool vfio_device_data_cap_device_memory(VFIOPCIDevice *vdev); > +int vfio_set_dirty_page_bitmap(VFIOPCIDevice *vdev, > + uint64_t start_addr, uint64_t page_nr); > +int vfio_migration_init(VFIOPCIDevice *vdev, Error **errp); > +void vfio_migration_finalize(VFIOPCIDevice *vdev); And the interfaces should be in vfio-common. > #endif /* HW_VFIO_VFIO_PCI_H */ > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 1b434d0..ed43613 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -32,6 +32,7 @@ > #endif > > #define VFIO_MSG_PREFIX "vfio %s: " > +#define TYPE_VFIO_PCI "vfio-pci" > > enum { > VFIO_DEVICE_TYPE_PCI = 0,