qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Jagannathan Raman <jag.raman@oracle.com>
To: qemu-devel@nongnu.org
Cc: elena.ufimtseva@oracle.com, john.g.johnson@oracle.com,
	thuth@redhat.com, jag.raman@oracle.com,
	swapnil.ingle@nutanix.com, john.levon@nutanix.com,
	philmd@redhat.com, alex.williamson@redhat.com,
	marcandre.lureau@gmail.com, stefanha@redhat.com,
	thanos.makatos@nutanix.com, pbonzini@redhat.com,
	alex.bennee@linaro.org
Subject: [PATCH v3 11/12] vfio-user: register handlers to facilitate migration
Date: Mon, 11 Oct 2021 01:31:16 -0400	[thread overview]
Message-ID: <9f85493af346c32d34cca3622e8293053b5c7440.1633929457.git.jag.raman@oracle.com> (raw)
In-Reply-To: <cover.1633929457.git.jag.raman@oracle.com>

Store and load the device's state during migration. use libvfio-user's
handlers for this purpose

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 migration/savevm.h        |   2 +
 hw/remote/vfio-user-obj.c | 339 ++++++++++++++++++++++++++++++++++++++
 migration/savevm.c        |  73 ++++++++
 3 files changed, 414 insertions(+)

diff --git a/migration/savevm.h b/migration/savevm.h
index 6461342cb4..8007064ff2 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -67,5 +67,7 @@ int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
         bool in_postcopy, bool inactivate_disks);
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev);
+int qemu_remote_loadvm(QEMUFile *f);
 
 #endif
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 63c468d6f3..757355ecaf 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -49,6 +49,10 @@
 #include "hw/qdev-core.h"
 #include "hw/pci/pci.h"
 #include "hw/remote/iohub.h"
+#include "migration/qemu-file.h"
+#include "migration/savevm.h"
+#include "migration/global_state.h"
+#include "block/block.h"
 
 #define TYPE_VFU_OBJECT "vfio-user-server"
 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -77,6 +81,35 @@ struct VfuObject {
     PCIDevice *pci_dev;
 
     int vfu_poll_fd;
+
+    /*
+     * vfu_mig_buf holds the migration data. In the remote server, this
+     * buffer replaces the role of an IO channel which links the source
+     * and the destination.
+     *
+     * Whenever the client QEMU process initiates migration, the remote
+     * server gets notified via libvfio-user callbacks. The remote server
+     * sets up a QEMUFile object using this buffer as backend. The remote
+     * server passes this object to its migration subsystem, which slurps
+     * the VMSD of the device ('devid' above) referenced by this object
+     * and stores the VMSD in this buffer.
+     *
+     * The client subsequetly asks the remote server for any data that
+     * needs to be moved over to the destination via libvfio-user
+     * library's vfu_migration_callbacks_t callbacks. The remote hands
+     * over this buffer as data at this time.
+     *
+     * A reverse of this process happens at the destination.
+     */
+    uint8_t *vfu_mig_buf;
+
+    uint64_t vfu_mig_buf_size;
+
+    uint64_t vfu_mig_buf_pending;
+
+    QEMUFile *vfu_mig_file;
+
+    vfu_migr_state_t vfu_state;
 };
 
 static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
@@ -110,6 +143,272 @@ static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
     trace_vfu_prop("device", str);
 }
 
+/**
+ * Migration helper functions
+ *
+ * vfu_mig_buf_read & vfu_mig_buf_write are used by QEMU's migration
+ * subsystem - qemu_remote_loadvm & qemu_remote_savevm. loadvm/savevm
+ * call these functions via QEMUFileOps to load/save the VMSD of a
+ * device into vfu_mig_buf
+ *
+ */
+static ssize_t vfu_mig_buf_read(void *opaque, uint8_t *buf, int64_t pos,
+                                size_t size, Error **errp)
+{
+    VfuObject *o = opaque;
+
+    if (pos > o->vfu_mig_buf_size) {
+        size = 0;
+    } else if ((pos + size) > o->vfu_mig_buf_size) {
+        size = o->vfu_mig_buf_size - pos;
+    }
+
+    memcpy(buf, (o->vfu_mig_buf + pos), size);
+
+    return size;
+}
+
+static ssize_t vfu_mig_buf_write(void *opaque, struct iovec *iov, int iovcnt,
+                                 int64_t pos, Error **errp)
+{
+    VfuObject *o = opaque;
+    uint64_t end = pos + iov_size(iov, iovcnt);
+    int i;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+    }
+
+    for (i = 0; i < iovcnt; i++) {
+        memcpy((o->vfu_mig_buf + o->vfu_mig_buf_size), iov[i].iov_base,
+               iov[i].iov_len);
+        o->vfu_mig_buf_size += iov[i].iov_len;
+        o->vfu_mig_buf_pending += iov[i].iov_len;
+    }
+
+    return iov_size(iov, iovcnt);
+}
+
+static int vfu_mig_buf_shutdown(void *opaque, bool rd, bool wr, Error **errp)
+{
+    VfuObject *o = opaque;
+
+    o->vfu_mig_buf_size = 0;
+
+    g_free(o->vfu_mig_buf);
+
+    o->vfu_mig_buf = NULL;
+
+    o->vfu_mig_buf_pending = 0;
+
+    return 0;
+}
+
+static const QEMUFileOps vfu_mig_fops_save = {
+    .writev_buffer  = vfu_mig_buf_write,
+    .shut_down      = vfu_mig_buf_shutdown,
+};
+
+static const QEMUFileOps vfu_mig_fops_load = {
+    .get_buffer     = vfu_mig_buf_read,
+    .shut_down      = vfu_mig_buf_shutdown,
+};
+
+/**
+ * handlers for vfu_migration_callbacks_t
+ *
+ * The libvfio-user library accesses these handlers to drive the migration
+ * at the remote end, and also to transport the data stored in vfu_mig_buf
+ *
+ */
+static void vfu_mig_state_stop_and_copy(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    int ret;
+
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_save, false);
+    }
+
+    ret = qemu_remote_savevm(o->vfu_mig_file, DEVICE(o->pci_dev));
+    if (ret) {
+        qemu_file_shutdown(o->vfu_mig_file);
+        o->vfu_mig_file = NULL;
+        return;
+    }
+
+    qemu_fflush(o->vfu_mig_file);
+}
+
+static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
+    static int migrated_devs;
+    Error *local_err = NULL;
+    int ret;
+
+    /**
+     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
+     * VMSD data from source is not available at RESUME state.
+     * Working on a fix for this.
+     */
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
+    }
+
+    ret = qemu_remote_loadvm(o->vfu_mig_file);
+    if (ret) {
+        error_setg(&error_abort, "vfu: failed to restore device state");
+        return;
+    }
+
+    qemu_file_shutdown(o->vfu_mig_file);
+    o->vfu_mig_file = NULL;
+
+    /* VFU_MIGR_STATE_RUNNING begins here */
+    if (++migrated_devs == k->nr_devs) {
+        bdrv_invalidate_cache_all(&local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return;
+        }
+
+        vm_start();
+    }
+}
+
+static void vfu_mig_state_stop(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
+    static int migrated_devs;
+
+    /**
+     * note: calling bdrv_inactivate_all() is not the best approach.
+     *
+     *  Ideally, we would identify the block devices (if any) indirectly
+     *  linked (such as via a scs-hd device) to each of the migrated devices,
+     *  and inactivate them individually. This is essential while operating
+     *  the server in a storage daemon mode, with devices from different VMs.
+     *
+     *  However, we currently don't have this capability. As such, we need to
+     *  inactivate all devices at the same time when migration is completed.
+     */
+    if (++migrated_devs == k->nr_devs) {
+        bdrv_inactivate_all();
+        vm_stop(RUN_STATE_PAUSED);
+    }
+}
+
+static int vfu_mig_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (o->vfu_state == state) {
+        return 0;
+    }
+
+    switch (state) {
+    case VFU_MIGR_STATE_RESUME:
+        break;
+    case VFU_MIGR_STATE_STOP_AND_COPY:
+        vfu_mig_state_stop_and_copy(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_STOP:
+        vfu_mig_state_stop(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_PRE_COPY:
+        break;
+    case VFU_MIGR_STATE_RUNNING:
+        if (!runstate_is_running()) {
+            vfu_mig_state_running(vfu_ctx);
+        }
+        break;
+    default:
+        warn_report("vfu: Unknown migration state %d", state);
+    }
+
+    o->vfu_state = state;
+
+    return 0;
+}
+
+static uint64_t vfu_mig_get_pending_bytes(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    return o->vfu_mig_buf_pending;
+}
+
+static int vfu_mig_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset,
+                                uint64_t *size)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (offset) {
+        *offset = 0;
+    }
+
+    if (size) {
+        *size = o->vfu_mig_buf_size;
+    }
+
+    return 0;
+}
+
+static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
+                                 uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    if (offset > o->vfu_mig_buf_size) {
+        return -1;
+    }
+
+    if ((offset + size) > o->vfu_mig_buf_size) {
+        warn_report("vfu: buffer overflow - check pending_bytes");
+        size = o->vfu_mig_buf_size - offset;
+    }
+
+    memcpy(buf, (o->vfu_mig_buf + offset), size);
+
+    o->vfu_mig_buf_pending -= size;
+
+    return size;
+}
+
+static ssize_t vfu_mig_write_data(vfu_ctx_t *vfu_ctx, void *data,
+                                  uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint64_t end = offset + size;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+        o->vfu_mig_buf_size = end;
+    }
+
+    memcpy((o->vfu_mig_buf + offset), data, size);
+
+    return size;
+}
+
+static int vfu_mig_data_written(vfu_ctx_t *vfu_ctx, uint64_t count)
+{
+    return 0;
+}
+
+static const vfu_migration_callbacks_t vfu_mig_cbs = {
+    .version = VFU_MIGR_CALLBACKS_VERS,
+    .transition = &vfu_mig_transition,
+    .get_pending_bytes = &vfu_mig_get_pending_bytes,
+    .prepare_data = &vfu_mig_prepare_data,
+    .read_data = &vfu_mig_read_data,
+    .data_written = &vfu_mig_data_written,
+    .write_data = &vfu_mig_write_data,
+};
+
 static void vfu_object_ctx_run(void *opaque)
 {
     VfuObject *o = opaque;
@@ -359,6 +658,7 @@ static void vfu_object_machine_done(Notifier *notifier, void *data)
     VfuObject *o = container_of(notifier, VfuObject, machine_done);
     DeviceState *dev = NULL;
     vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
+    size_t migr_area_size;
     int ret;
 
     o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
@@ -422,6 +722,35 @@ static void vfu_object_machine_done(Notifier *notifier, void *data)
         return;
     }
 
+    /*
+     * TODO: The 0x20000 number used below is a temporary. We are working on
+     *     a cleaner fix for this.
+     *
+     *     The libvfio-user library assumes that the remote knows the size of
+     *     the data to be migrated at boot time, but that is not the case with
+     *     VMSDs, as it can contain a variable-size buffer. 0x20000 is used
+     *     as a sufficiently large buffer to demonstrate migration, but that
+     *     cannot be used as a solution.
+     *
+     */
+    ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
+                           0x20000, NULL,
+                           VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
+    if (ret < 0) {
+        error_setg(&error_abort, "vfu: Failed to register migration BAR %s- %s",
+                   o->device, strerror(errno));
+        return;
+    }
+
+    migr_area_size = vfu_get_migr_register_area_size();
+    ret = vfu_setup_device_migration_callbacks(o->vfu_ctx, &vfu_mig_cbs,
+                                               migr_area_size);
+    if (ret < 0) {
+        error_setg(&error_abort, "vfu: Failed to setup migration %s- %s",
+                   o->device, strerror(errno));
+        return;
+    }
+
     ret = vfu_realize_ctx(o->vfu_ctx);
     if (ret < 0) {
         error_setg(&error_abort, "vfu: Failed to realize device %s- %s",
@@ -464,6 +793,16 @@ static void vfu_object_init(Object *obj)
     qemu_add_machine_init_done_notifier(&o->machine_done);
 
     o->vfu_poll_fd = -1;
+
+    o->vfu_mig_file = NULL;
+
+    o->vfu_mig_buf = NULL;
+
+    o->vfu_mig_buf_size = 0;
+
+    o->vfu_mig_buf_pending = 0;
+
+    o->vfu_state = VFU_MIGR_STATE_STOP;
 }
 
 static void vfu_object_finalize(Object *obj)
diff --git a/migration/savevm.c b/migration/savevm.c
index 7b7b64bd13..341fde73f8 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1604,6 +1604,49 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
     return ret;
 }
 
+static SaveStateEntry *find_se_from_dev(DeviceState *dev)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (se->opaque == dev) {
+            return se;
+        }
+    }
+
+    return NULL;
+}
+
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev)
+{
+    SaveStateEntry *se;
+    int ret = 0;
+
+    se = find_se_from_dev(dev);
+    if (!se) {
+        return -ENODEV;
+    }
+
+    if (!se->vmsd || !vmstate_save_needed(se->vmsd, se->opaque)) {
+        return ret;
+    }
+
+    save_section_header(f, se, QEMU_VM_SECTION_FULL);
+
+    ret = vmstate_save(f, se, NULL);
+    if (ret) {
+        qemu_file_set_error(f, ret);
+        return ret;
+    }
+
+    save_section_footer(f, se);
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+    qemu_fflush(f);
+
+    return 0;
+}
+
 void qemu_savevm_live_state(QEMUFile *f)
 {
     /* save QEMU_VM_SECTION_END section */
@@ -2444,6 +2487,36 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
     return 0;
 }
 
+int qemu_remote_loadvm(QEMUFile *f)
+{
+    uint8_t section_type;
+    int ret = 0;
+
+    while (true) {
+        section_type = qemu_get_byte(f);
+
+        ret = qemu_file_get_error(f);
+        if (ret) {
+            break;
+        }
+
+        switch (section_type) {
+        case QEMU_VM_SECTION_FULL:
+            ret = qemu_loadvm_section_start_full(f, NULL);
+            if (ret < 0) {
+                break;
+            }
+            break;
+        case QEMU_VM_EOF:
+            return ret;
+        default:
+            return -EINVAL;
+        }
+    }
+
+    return ret;
+}
+
 static int
 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
 {
-- 
2.20.1



  parent reply	other threads:[~2021-10-11  5:52 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-10-11  5:31 [PATCH v3 00/12] vfio-user server in QEMU Jagannathan Raman
2021-10-11  5:31 ` [PATCH v3 01/12] configure, meson: override C compiler for cmake Jagannathan Raman
2021-10-12 10:44   ` Paolo Bonzini
2021-10-11  5:31 ` [PATCH v3 02/12] vfio-user: build library Jagannathan Raman
2021-10-27 15:17   ` Stefan Hajnoczi
2021-10-29 14:17     ` Jag Raman
2021-11-01  9:56       ` Stefan Hajnoczi
2021-10-11  5:31 ` [PATCH v3 03/12] vfio-user: define vfio-user-server object Jagannathan Raman
2021-10-27 15:40   ` Stefan Hajnoczi
2021-10-29 14:42     ` Jag Raman
2021-11-01 10:34       ` Stefan Hajnoczi
2021-11-04 12:13         ` Markus Armbruster
2021-11-04 14:39           ` Kevin Wolf
2021-11-05 10:08             ` Markus Armbruster
2021-11-05 13:19               ` Kevin Wolf
2021-11-05 13:54                 ` Peter Krempa
2021-11-06  6:34                 ` Markus Armbruster
2021-11-08 12:05                   ` Kevin Wolf
2021-11-08 12:54                     ` Peter Krempa
2021-11-04 16:48           ` Stefan Hajnoczi
2021-10-11  5:31 ` [PATCH v3 04/12] vfio-user: instantiate vfio-user context Jagannathan Raman
2021-10-27 15:59   ` Stefan Hajnoczi
2021-10-29 14:59     ` Jag Raman
2021-11-01 10:35       ` Stefan Hajnoczi
2021-10-11  5:31 ` [PATCH v3 05/12] vfio-user: find and init PCI device Jagannathan Raman
2021-10-27 16:05   ` Stefan Hajnoczi
2021-10-29 15:58     ` Jag Raman
2021-11-01 10:38       ` Stefan Hajnoczi
2021-10-11  5:31 ` [PATCH v3 06/12] vfio-user: run vfio-user context Jagannathan Raman
2021-10-27 16:21   ` Stefan Hajnoczi
2021-10-28 21:55     ` John Levon
2021-10-11  5:31 ` [PATCH v3 07/12] vfio-user: handle PCI config space accesses Jagannathan Raman
2021-10-27 16:35   ` Stefan Hajnoczi
2021-10-11  5:31 ` [PATCH v3 08/12] vfio-user: handle DMA mappings Jagannathan Raman
2021-10-11  5:31 ` [PATCH v3 09/12] vfio-user: handle PCI BAR accesses Jagannathan Raman
2021-10-27 16:38   ` Stefan Hajnoczi
2021-10-11  5:31 ` [PATCH v3 10/12] vfio-user: handle device interrupts Jagannathan Raman
2021-10-11  5:31 ` Jagannathan Raman [this message]
2021-10-27 18:30   ` [PATCH v3 11/12] vfio-user: register handlers to facilitate migration Stefan Hajnoczi
2021-12-15 15:49     ` Jag Raman
2021-10-11  5:31 ` [PATCH v3 12/12] vfio-user: acceptance test Jagannathan Raman
2021-10-11 22:26   ` Philippe Mathieu-Daudé
2021-10-27 16:42   ` Stefan Hajnoczi
2021-10-27 18:33 ` [PATCH v3 00/12] vfio-user server in QEMU Stefan Hajnoczi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9f85493af346c32d34cca3622e8293053b5c7440.1633929457.git.jag.raman@oracle.com \
    --to=jag.raman@oracle.com \
    --cc=alex.bennee@linaro.org \
    --cc=alex.williamson@redhat.com \
    --cc=elena.ufimtseva@oracle.com \
    --cc=john.g.johnson@oracle.com \
    --cc=john.levon@nutanix.com \
    --cc=marcandre.lureau@gmail.com \
    --cc=pbonzini@redhat.com \
    --cc=philmd@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    --cc=swapnil.ingle@nutanix.com \
    --cc=thanos.makatos@nutanix.com \
    --cc=thuth@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).