All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [patch 0/7] live block copy (v4)
@ 2011-06-06 16:55 Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 1/7] add migration_active function Marcelo Tosatti
                   ` (6 more replies)
  0 siblings, 7 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Jes.Sorensen, jdenemar, dlaor, avi

v4:
- add block-copy documentation
- block-copy style fixes
- fix blkmirror_cancel method
- blkmirror style fixes
- add escaping to blkmirror pathnames
- add blkmirror documentation

v3:
- replace commit file with mirrored writes
- address comments from round 2

v2:
- use reference counting to be safe against device hotplug / bdrv_truncate
- add comment about usage of timer

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 1/7] add migration_active function
  2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
@ 2011-06-06 16:55 ` Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 2/7] Add blkmirror block driver Marcelo Tosatti
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Marcelo Tosatti, Jes.Sorensen, dlaor, avi, jdenemar

[-- Attachment #1: 1-3-add-migration-active.patch --]
[-- Type: text/plain, Size: 1266 bytes --]

To query whether migration is active.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/migration.c
===================================================================
--- qemu-block-copy.orig/migration.c
+++ qemu-block-copy/migration.c
@@ -85,8 +85,8 @@ int do_migrate(Monitor *mon, const QDict
     int inc = qdict_get_try_bool(qdict, "inc", 0);
     const char *uri = qdict_get_str(qdict, "uri");
 
-    if (current_migration &&
-        current_migration->get_status(current_migration) == MIG_STATE_ACTIVE) {
+
+    if (migration_active()) {
         monitor_printf(mon, "migration already in progress\n");
         return -1;
     }
@@ -480,3 +480,13 @@ int get_migration_state(void)
         return MIG_STATE_ERROR;
     }
 }
+
+/*
+ * Tell whether a migration is running right now: true only when a
+ * migration object exists and its get_status() hook reports
+ * MIG_STATE_ACTIVE.
+ */
+bool migration_active(void)
+{
+    return current_migration &&
+           current_migration->get_status(current_migration) ==
+               MIG_STATE_ACTIVE;
+}
Index: qemu-block-copy/migration.h
===================================================================
--- qemu-block-copy.orig/migration.h
+++ qemu-block-copy/migration.h
@@ -148,4 +148,6 @@ int ram_load(QEMUFile *f, void *opaque, 
 
 extern int incoming_expected;
 
+bool migration_active(void);
+
 #endif

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 2/7] Add blkmirror block driver
  2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 1/7] add migration_active function Marcelo Tosatti
@ 2011-06-06 16:55 ` Marcelo Tosatti
  2011-06-06 21:52   ` malc
  2011-06-07 10:25   ` Stefan Hajnoczi
  2011-06-06 16:55 ` [Qemu-devel] [patch 3/7] Add error messages for live block copy Marcelo Tosatti
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Marcelo Tosatti, Jes.Sorensen, dlaor, avi, jdenemar

[-- Attachment #1: blkmirror --]
[-- Type: text/plain, Size: 8601 bytes --]

Mirrored writes are used by live block copy.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/block/blkmirror.c
===================================================================
--- /dev/null
+++ qemu-block-copy/block/blkmirror.c
@@ -0,0 +1,277 @@
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+/* Per-device driver state: the two images that form the mirror.  Reads and
+ * length queries use bs[0]; writes, flushes and discards go to both. */
+typedef struct {
+    BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+/* Bookkeeping for one of the two child requests of a mirrored operation. */
+typedef struct SingleAIOCB {
+    BlockDriverAIOCB *aiocb;    /* handle returned by the child request */
+    int finished;               /* set to 1 by blkmirror_aio_cb() */
+    DupAIOCB *parent;           /* mirrored request this child belongs to */
+} SingleAIOCB;
+
+/* A mirrored request: one caller-visible AIOCB backed by two children. */
+struct DupAIOCB {
+    BlockDriverAIOCB common;    /* AIOCB handed back to the caller */
+    int count;                  /* children that have not completed yet */
+
+    /* Not referenced in this file; completion goes through common.cb. */
+    BlockDriverCompletionFunc *cb;
+    SingleAIOCB aios[2];        /* one entry per mirrored image */
+    int ret;                    /* 0, or the latest negative child result */
+};
+
+/*
+ * Open a blkmirror device.
+ *
+ * Valid blkmirror filenames look like
+ * blkmirror:path/to/image1:path/to/image2
+ *
+ * In the first path a backslash escapes the character that follows it, so
+ * "\:" yields a literal colon instead of ending the path.  The second path
+ * is handed to bdrv_open() exactly as written.
+ *
+ * Both images are opened with the caller's flags and stored in m->bs[0]
+ * and m->bs[1].
+ *
+ * Returns 0 on success, -EINVAL if the prefix or the ':' separator is
+ * missing, -ENOMEM if an allocation fails, or the negative value returned
+ * by bdrv_open().  On failure every BlockDriverState created here is
+ * deleted again.
+ */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BdrvMirrorState *m = bs->opaque;
+    size_t len, i, n;
+    int ret, escape;
+    char *raw;
+
+    /* Parse the blkmirror: prefix */
+    if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
+        return -EINVAL;
+    }
+    filename += strlen("blkmirror:");
+    len = strlen(filename);
+
+    /* Parse the raw image filename.  The +1 keeps room for the terminator
+     * even when unescaping drops no character. */
+    raw = malloc(len + 1);
+    if (raw == NULL) {
+        return -ENOMEM;
+    }
+    escape = 0;
+    for (i = n = 0; i < len; i++) {
+        if (!escape && filename[i] == ':') {
+            break;
+        }
+        if (!escape && filename[i] == '\\') {
+            escape = 1;
+        } else {
+            escape = 0;
+        }
+
+        if (!escape) {
+            raw[n++] = filename[i];
+        }
+    }
+    raw[n] = '\0';
+
+    /* No separator found: there is no second image name to open, and
+     * stepping over the separator would run past the end of the string. */
+    if (i == len) {
+        free(raw);
+        return -EINVAL;
+    }
+
+    m->bs[0] = bdrv_new("");
+    if (m->bs[0] == NULL) {
+        free(raw);
+        return -ENOMEM;
+    }
+    ret = bdrv_open(m->bs[0], raw, flags, NULL);
+    free(raw);
+    if (ret < 0) {
+        goto fail_bs0;
+    }
+    filename += i + 1;
+
+    m->bs[1] = bdrv_new("");
+    if (m->bs[1] == NULL) {
+        ret = -ENOMEM;
+        goto fail_bs0;
+    }
+    ret = bdrv_open(m->bs[1], filename, flags, NULL);
+    if (ret < 0) {
+        goto fail_bs1;
+    }
+
+    return 0;
+
+fail_bs1:
+    bdrv_delete(m->bs[1]);
+    m->bs[1] = NULL;
+fail_bs0:
+    bdrv_delete(m->bs[0]);
+    m->bs[0] = NULL;
+    return ret;
+}
+
+/* Delete both mirrored images and clear the pointers to them. */
+static void blkmirror_close(BlockDriverState *bs)
+{
+    BdrvMirrorState *m = bs->opaque;
+    int idx = 0;
+
+    while (idx < 2) {
+        bdrv_delete(m->bs[idx]);
+        m->bs[idx] = NULL;
+        idx++;
+    }
+}
+
+/*
+ * Flush both mirrored images.
+ *
+ * The second image is flushed even when the first flush fails.  Returns 0
+ * when both succeed, otherwise the first negative result encountered.
+ */
+static int blkmirror_flush(BlockDriverState *bs)
+{
+    BdrvMirrorState *m = bs->opaque;
+    int ret0, ret1;
+
+    ret0 = bdrv_flush(m->bs[0]);
+    ret1 = bdrv_flush(m->bs[1]);
+
+    if (ret0 < 0) {
+        return ret0;
+    }
+    if (ret1 < 0) {
+        return ret1;
+    }
+    return 0;
+}
+
+/* The mirror reports the length of its first image (bs[0]). */
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+    BlockDriverState *first = ((BdrvMirrorState *)bs->opaque)->bs[0];
+
+    return bdrv_getlength(first);
+}
+
+/* Reads are not mirrored: they are forwarded to the first image only, and
+ * the caller's callback and opaque pointer are passed straight through. */
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             QEMUIOVector *qiov,
+                                             int nb_sectors,
+                                             BlockDriverCompletionFunc *cb,
+                                             void *opaque)
+{
+    BdrvMirrorState *m = bs->opaque;
+    BlockDriverState *first = m->bs[0];
+
+    return bdrv_aio_readv(first, sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+/*
+ * Cancel a mirrored request: cancel every child that has not finished yet,
+ * then release the parent AIOCB.
+ *
+ * NOTE(review): if bdrv_aio_cancel() completes a child by running its
+ * callback, blkmirror_aio_cb() may already release the parent when the
+ * last child finishes -- confirm this cannot lead to a second release here.
+ */
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+    SingleAIOCB *child;
+
+    for (child = acb->aios; child < acb->aios + 2; child++) {
+        if (child->finished) {
+            continue;
+        }
+        bdrv_aio_cancel(child->aiocb);
+    }
+    qemu_aio_release(acb);
+}
+
+/* AIOCB pool for mirrored requests: dup_aio_get() allocates DupAIOCBs from
+ * it, and cancellation is routed to dup_aio_cancel(). */
+static AIOPool dup_aio_pool = {
+    .aiocb_size         = sizeof(DupAIOCB),
+    .cancel             = dup_aio_cancel,
+};
+
+/*
+ * Completion callback shared by both child requests of a mirrored request.
+ *
+ * Marks the child finished and remembers a negative result.  Once the last
+ * child has completed, the caller's callback is invoked with the recorded
+ * result and the parent AIOCB is released.
+ */
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+    SingleAIOCB *child = opaque;
+    DupAIOCB *parent = child->parent;
+
+    child->finished = 1;
+    parent->count--;
+    assert(parent->count >= 0);
+
+    if (ret < 0) {
+        parent->ret = ret;
+    }
+
+    /* Still waiting for the other child. */
+    if (parent->count != 0) {
+        return;
+    }
+
+    parent->common.cb(parent->common.opaque, parent->ret);
+    qemu_aio_release(parent);
+}
+
+/*
+ * Allocate a mirrored request from dup_aio_pool and initialise it for two
+ * outstanding children.  Returns NULL when the pool returns no AIOCB.
+ */
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+                             BlockDriverCompletionFunc *cb,
+                             void *opaque)
+{
+    DupAIOCB *dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+    SingleAIOCB *child;
+
+    if (dcb) {
+        dcb->count = 2;
+        for (child = dcb->aios; child < dcb->aios + 2; child++) {
+            child->parent = dcb;
+            child->finished = 0;
+        }
+        dcb->ret = 0;
+    }
+
+    return dcb;
+}
+
+/*
+ * Submit the same write to both mirrored images.
+ *
+ * The caller's callback runs once, from blkmirror_aio_cb(), after both
+ * child writes have completed.
+ *
+ * Returns the parent AIOCB, or NULL if the parent could not be allocated
+ * or a child write could not be submitted.  In the latter case the
+ * children submitted so far are cancelled and the parent is released.
+ */
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+                                              int64_t sector_num,
+                                              QEMUIOVector *qiov,
+                                              int nb_sectors,
+                                              BlockDriverCompletionFunc *cb,
+                                              void *opaque)
+{
+    BdrvMirrorState *m = bs->opaque;
+    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+    int i;
+
+    if (!dcb) {
+        return NULL;
+    }
+
+    for (i = 0; i < 2; i++) {
+        dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+                                             nb_sectors, &blkmirror_aio_cb,
+                                             &dcb->aios[i]);
+        if (!dcb->aios[i].aiocb) {
+            int a;
+
+            /* Roll back the children submitted before the failing one;
+             * entry i itself is NULL and must not be cancelled. */
+            for (a = 0; a < i; a++) {
+                if (!dcb->aios[a].finished) {
+                    bdrv_aio_cancel(dcb->aios[a].aiocb);
+                }
+            }
+            qemu_aio_release(dcb);
+            return NULL;
+        }
+    }
+
+    return &dcb->common;
+}
+
+/*
+ * Submit an asynchronous flush to both mirrored images.
+ *
+ * The caller's callback runs once, from blkmirror_aio_cb(), after both
+ * child flushes have completed.
+ *
+ * Returns the parent AIOCB, or NULL if the parent could not be allocated
+ * or a child flush could not be submitted.  In the latter case the
+ * children submitted so far are cancelled and the parent is released.
+ */
+static BlockDriverAIOCB *blkmirror_aio_flush(BlockDriverState *bs,
+                                             BlockDriverCompletionFunc *cb,
+                                             void *opaque)
+{
+    BdrvMirrorState *m = bs->opaque;
+    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+    int i;
+
+    if (!dcb) {
+        return NULL;
+    }
+
+    for (i = 0; i < 2; i++) {
+        dcb->aios[i].aiocb = bdrv_aio_flush(m->bs[i], &blkmirror_aio_cb,
+                                            &dcb->aios[i]);
+        if (!dcb->aios[i].aiocb) {
+            int a;
+
+            /* Roll back the children submitted before the failing one;
+             * entry i itself is NULL and must not be cancelled. */
+            for (a = 0; a < i; a++) {
+                if (!dcb->aios[a].finished) {
+                    bdrv_aio_cancel(dcb->aios[a].aiocb);
+                }
+            }
+            qemu_aio_release(dcb);
+            return NULL;
+        }
+    }
+
+    return &dcb->common;
+}
+
+/*
+ * Discard the sector range on both images, first image first.  A negative
+ * result from the first image is returned without touching the second;
+ * otherwise the second image's result is returned.
+ */
+static int blkmirror_discard(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors)
+{
+    BdrvMirrorState *m = bs->opaque;
+    int idx, ret = 0;
+
+    for (idx = 0; idx < 2; idx++) {
+        ret = bdrv_discard(m->bs[idx], sector_num, nb_sectors);
+        if (ret < 0) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
+
+/* Driver table for "blkmirror": wires the block-layer entry points to the
+ * functions defined in this file. */
+static BlockDriver bdrv_blkmirror = {
+    .format_name        = "blkmirror",
+    .protocol_name      = "blkmirror",
+    .instance_size      = sizeof(BdrvMirrorState),
+
+    /* Length is taken from the first image only. */
+    .bdrv_getlength     = blkmirror_getlength,
+
+    .bdrv_file_open     = blkmirror_open,
+    .bdrv_close         = blkmirror_close,
+    .bdrv_flush         = blkmirror_flush,
+    .bdrv_discard       = blkmirror_discard,
+
+    /* Reads go to the first image; writes and flushes go to both. */
+    .bdrv_aio_readv     = blkmirror_aio_readv,
+    .bdrv_aio_writev    = blkmirror_aio_writev,
+    .bdrv_aio_flush     = blkmirror_aio_flush,
+};
+
+/* Register the blkmirror driver table with the block layer. */
+static void bdrv_blkmirror_init(void)
+{
+    bdrv_register(&bdrv_blkmirror);
+}
+
+/* NOTE(review): presumably arranges for the function above to run during
+ * block-layer initialisation -- block_init() is defined elsewhere. */
+block_init(bdrv_blkmirror_init);
Index: qemu-block-copy/Makefile.objs
===================================================================
--- qemu-block-copy.orig/Makefile.objs
+++ qemu-block-copy/Makefile.objs
@@ -22,7 +22,7 @@ block-nested-y += raw.o cow.o qcow.o vdi
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
Index: qemu-block-copy/docs/blkmirror.txt
===================================================================
--- /dev/null
+++ qemu-block-copy/docs/blkmirror.txt
@@ -0,0 +1,15 @@
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It is used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+A '\' (backslash) placed before a colon escapes it, so that the colon is
+not treated as the separator between the two image names.
+
+

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 3/7] Add error messages for live block copy
  2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 1/7] add migration_active function Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 2/7] Add blkmirror block driver Marcelo Tosatti
@ 2011-06-06 16:55 ` Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 4/7] Add blkdebug points " Marcelo Tosatti
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Marcelo Tosatti, Jes.Sorensen, dlaor, avi, jdenemar

[-- Attachment #1: blockcopy-qerror --]
[-- Type: text/plain, Size: 1222 bytes --]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/qerror.c
===================================================================
--- qemu-block-copy.orig/qerror.c
+++ qemu-block-copy/qerror.c
@@ -209,6 +209,14 @@ static const QErrorStringTable qerror_ta
         .error_fmt = QERR_VNC_SERVER_FAILED,
         .desc      = "Could not start VNC server on %(target)",
     },
+    {
+        .error_fmt = QERR_IN_PROGRESS,
+        .desc      = "Operation %(operation) in progress",
+    },
+    {
+        .error_fmt = QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS,
+        .desc      = "Length of destination image differs from source image",
+    },
     {}
 };
 
Index: qemu-block-copy/qerror.h
===================================================================
--- qemu-block-copy.orig/qerror.h
+++ qemu-block-copy/qerror.h
@@ -174,4 +174,10 @@ QError *qobject_to_qerror(const QObject 
 #define QERR_FEATURE_DISABLED \
     "{ 'class': 'FeatureDisabled', 'data': { 'name': %s } }"
 
+/* An operation is already in progress; the single %s argument names it. */
+#define QERR_IN_PROGRESS \
+    "{ 'class': 'InProgress', 'data': { 'operation': %s } }"
+
+/* Block copy: destination and source image lengths differ; no arguments. */
+#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \
+    "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }"
+
 #endif /* QERROR_H */

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 4/7] Add blkdebug points for live block copy
  2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
                   ` (2 preceding siblings ...)
  2011-06-06 16:55 ` [Qemu-devel] [patch 3/7] Add error messages for live block copy Marcelo Tosatti
@ 2011-06-06 16:55 ` Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 5/7] Add vmstop code " Marcelo Tosatti
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Marcelo Tosatti, Jes.Sorensen, dlaor, avi, jdenemar

[-- Attachment #1: blkdebug-add-points --]
[-- Type: text/plain, Size: 1651 bytes --]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/block.h
===================================================================
--- qemu-block-copy.orig/block.h
+++ qemu-block-copy/block.h
@@ -295,6 +295,14 @@ typedef enum {
     BLKDBG_CLUSTER_ALLOC_BYTES,
     BLKDBG_CLUSTER_FREE,
 
+    BLKDBG_BLKCOPY_STAGE_BULK,
+    BLKDBG_BLKCOPY_STAGE_BULK_FINISHED,
+    BLKDBG_BLKCOPY_STAGE_DIRTY,
+    BLKDBG_BLKCOPY_STAGE_MIRROR_WRITES,
+    BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED,
+    BLKDBG_BLKCOPY_SWITCH_START,
+    BLKDBG_BLKCOPY_AIO_WRITE,
+
     BLKDBG_EVENT_MAX,
 } BlkDebugEvent;
 
Index: qemu-block-copy/block/blkdebug.c
===================================================================
--- qemu-block-copy.orig/block/blkdebug.c
+++ qemu-block-copy/block/blkdebug.c
@@ -178,6 +178,15 @@ static const char *event_names[BLKDBG_EV
     [BLKDBG_CLUSTER_ALLOC]                  = "cluster_alloc",
     [BLKDBG_CLUSTER_ALLOC_BYTES]            = "cluster_alloc_bytes",
     [BLKDBG_CLUSTER_FREE]                   = "cluster_free",
+
+
+    [BLKDBG_BLKCOPY_STAGE_BULK]             = "blkcopy_stage_bulk",
+    [BLKDBG_BLKCOPY_STAGE_BULK_FINISHED]    = "blkcopy_stage_bulk_finished",
+    [BLKDBG_BLKCOPY_STAGE_DIRTY]            = "blkcopy_stage_dirty",
+    [BLKDBG_BLKCOPY_STAGE_MIRROR_WRITES]    = "blkcopy_stage_mirror_writes",
+    [BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED]  = "blkcopy_stage_switch_finished",
+    [BLKDBG_BLKCOPY_SWITCH_START]           = "blkcopy_switch_start",
+    [BLKDBG_BLKCOPY_AIO_WRITE]              = "blkcopy_aio_write",
 };
 
 static int get_event_by_name(const char *name, BlkDebugEvent *event)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 5/7] Add vmstop code for live block copy
  2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
                   ` (3 preceding siblings ...)
  2011-06-06 16:55 ` [Qemu-devel] [patch 4/7] Add blkdebug points " Marcelo Tosatti
@ 2011-06-06 16:55 ` Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 6/7] QEMU " Marcelo Tosatti
  2011-06-06 16:55 ` [Qemu-devel] [patch 7/7] do not allow migration if block copy in progress Marcelo Tosatti
  6 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Marcelo Tosatti, Jes.Sorensen, dlaor, avi, jdenemar

[-- Attachment #1: vmstop-blockcopy --]
[-- Type: text/plain, Size: 439 bytes --]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/sysemu.h
===================================================================
--- qemu-block-copy.orig/sysemu.h
+++ qemu-block-copy/sysemu.h
@@ -33,6 +33,7 @@ void qemu_del_vm_change_state_handler(VM
 #define VMSTOP_SAVEVM    6
 #define VMSTOP_LOADVM    7
 #define VMSTOP_MIGRATE   8
+#define VMSTOP_BLOCKCOPY 9
 
 void vm_start(void);
 void vm_stop(int reason);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 6/7] QEMU live block copy
  2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
                   ` (4 preceding siblings ...)
  2011-06-06 16:55 ` [Qemu-devel] [patch 5/7] Add vmstop code " Marcelo Tosatti
@ 2011-06-06 16:55 ` Marcelo Tosatti
  2011-06-06 17:03   ` [Qemu-devel] [patch 6/7] QEMU live block copy (update) Marcelo Tosatti
  2011-06-07 12:15   ` [Qemu-devel] [patch 6/7] QEMU live block copy Stefan Hajnoczi
  2011-06-06 16:55 ` [Qemu-devel] [patch 7/7] do not allow migration if block copy in progress Marcelo Tosatti
  6 siblings, 2 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Marcelo Tosatti, Jes.Sorensen, dlaor, avi, jdenemar

[-- Attachment #1: 2-3-live-block-copy.patch --]
[-- Type: text/plain, Size: 31691 bytes --]

Support live image copy + switch. That is, copy an image backing
a guest hard disk to a destination image (destination image must
be created separately), and switch to this copy.

Command syntax:

block_copy device filename [-i] -- live block copy device to image
             -i for incremental copy (base image shared between src and destination)

Please refer to qmp-commands diff for more details.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/block-copy.c
===================================================================
--- /dev/null
+++ qemu-block-copy/block-copy.c
@@ -0,0 +1,764 @@
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "blockdev.h"
+#include "qemu-queue.h"
+#include "qemu-timer.h"
+#include "monitor.h"
+#include "block-copy.h"
+#include "migration.h"
+#include "sysemu.h"
+#include "qjson.h"
+#include <assert.h>
+#include "hw/hw.h"
+
+/* Bytes in one dirty-tracking chunk; every copy buffer has this size. */
+#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
+/* Sector count passed to each bdrv_is_allocated() probe in the bulk stage. */
+#define MAX_IS_ALLOCATED_SEARCH 65536
+
+/*
+ * Stages:
+ *
+ * STAGE_BULK: bulk reads/writes in progress
+ * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
+ * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
+ * STAGE_MIRROR_WRITES: copy finished, writes mirrored to both images.
+ * STAGE_SWITCH_FINISHED: switched to new image.
+ */
+
+/* Declaration order is significant: the code compares stages with <, >
+ * and >=, so the enumerators must stay in the order a copy moves through
+ * them. */
+enum BdrvCopyStage {
+    STAGE_BULK,
+    STAGE_BULK_FINISHED,
+    STAGE_DIRTY,
+    STAGE_MIRROR_WRITES,
+    STAGE_SWITCH_FINISHED,
+};
+
+/* Buffer sizes for image filenames and for the device name. */
+#define NAME_LEN 1024
+#define DEV_LEN 256
+
+/* State of one live block copy operation. */
+typedef struct BdrvCopyState {
+    BlockDriverState *src;      /* image being copied from */
+    BlockDriverState *dst;      /* image being copied to */
+    bool shared_base;           /* bulk stage skips unallocated sectors */
+
+    int64_t curr_sector;        /* scan position of the bulk/dirty stages */
+    int64_t completed_sectors;  /* updated by blk_issue_reads_bulk() */
+    int64_t nr_sectors;         /* upper bound of every sector scan */
+
+    enum BdrvCopyStage stage;
+    int inflight_reads;         /* reads issued whose callback has not run */
+    int error;                  /* non-zero once a copy request has failed */
+    int failed;                 /* set to 1 when the copy is given up */
+    int cancelled;              /* aio_timer() asserts this is 0 */
+    QLIST_HEAD(, BdrvCopyBlock) io_list;    /* in-flight reads and writes */
+    /* One bit per dirty chunk that has copy I/O in flight; allocated when
+     * the copy enters STAGE_DIRTY. */
+    unsigned long *aio_bitmap;
+    QEMUTimer *aio_timer;       /* drives aio_timer() */
+    QLIST_ENTRY(BdrvCopyState) list;        /* linkage in block_copy_list */
+
+    /* Completed writes and the sum of their read-issue-to-write-completion
+     * times (qemu_get_clock_ns values); used to estimate switch downtime. */
+    int64_t blocks;
+    int64_t total_time;
+
+    char device_name[DEV_LEN];
+    char src_filename[NAME_LEN];
+    char dst_filename[NAME_LEN];
+} BdrvCopyState;
+
+/* One in-flight copy request (a read from the source or the write of the
+ * same data to the destination). */
+typedef struct BdrvCopyBlock {
+    BdrvCopyState *state;       /* copy operation this request belongs to */
+    uint8_t *buf;               /* data buffer; handed from read to write */
+    int64_t sector;             /* first sector of the request */
+    int64_t nr_sectors;         /* length of the request in sectors */
+    struct iovec iov;           /* single element describing buf */
+    QEMUIOVector qiov;          /* wraps iov for the bdrv_aio_* calls */
+    BlockDriverAIOCB *aiocb;    /* handle of the submitted request */
+    int64_t time;               /* qemu_get_clock_ns() when the read began */
+    QLIST_ENTRY(BdrvCopyBlock) list;    /* linkage in state->io_list */
+} BdrvCopyBlock;
+
+/* Every BdrvCopyState is linked here until blkcopy_free() removes it. */
+static QLIST_HEAD(, BdrvCopyState) block_copy_list =
+    QLIST_HEAD_INITIALIZER(block_copy_list);
+
+/*
+ * Allocate the zero-filled in-flight bitmap for the source image: one bit
+ * per dirty chunk.
+ *
+ * The bitmap is read and written as an array of unsigned long, so the byte
+ * count is rounded up to a whole number of words; otherwise the last word
+ * touched could extend past the allocation.
+ */
+static void alloc_aio_bitmap(BdrvCopyState *s)
+{
+    BlockDriverState *bs = s->src;
+    const int64_t word_size = sizeof(unsigned long);
+    int64_t bitmap_size;
+
+    /* bytes needed for one bit per chunk, rounded up */
+    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
+            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
+
+    /* round up to a multiple of the word size used for indexing */
+    bitmap_size = (bitmap_size + word_size - 1) / word_size * word_size;
+
+    s->aio_bitmap = qemu_mallocz(bitmap_size);
+}
+
+/*
+ * Tell whether the chunk containing @sector has copy I/O in flight.
+ * Returns false when no bitmap has been allocated or when the sector lies
+ * at or beyond the end of the source image.
+ */
+static bool aio_inflight(BdrvCopyState *s, int64_t sector)
+{
+    const size_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    if (!s->aio_bitmap ||
+        (sector << BDRV_SECTOR_BITS) >= bdrv_getlength(s->src)) {
+        return 0;
+    }
+
+    return !!(s->aio_bitmap[chunk / bits_per_word] &
+              (1UL << (chunk % bits_per_word)));
+}
+
+/*
+ * Set (@set non-zero) or clear (@set zero) the in-flight bit of every chunk
+ * touched by the sector range [sector_num, sector_num + nb_sectors).
+ */
+static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
+                             int nb_sectors, int set)
+{
+    const size_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t chunk = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    int64_t last = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    while (chunk <= last) {
+        unsigned long *word = &s->aio_bitmap[chunk / bits_per_word];
+        unsigned long mask = 1UL << (chunk % bits_per_word);
+
+        if (set) {
+            *word |= mask;
+        } else {
+            *word &= ~mask;
+        }
+        chunk++;
+    }
+}
+
+/*
+ * Move the copy to @stage and raise the matching blkdebug event on the
+ * destination image's file.  A stage without a case below only updates
+ * s->stage.
+ */
+static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
+{
+    s->stage = stage;
+
+    switch (stage) {
+    case STAGE_BULK:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
+        break;
+    case STAGE_BULK_FINISHED:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
+        break;
+    case STAGE_DIRTY:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
+        break;
+    case STAGE_MIRROR_WRITES:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_MIRROR_WRITES);
+        break;
+    case STAGE_SWITCH_FINISHED:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
+        break;
+    default:
+        break;
+    }
+}
+
+/* Record the failure code of a copy request and make the state-machine
+ * timer fire right away so that aio_timer() can react to it. */
+static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
+{
+    int64_t now;
+
+    s->error = ret;
+    now = qemu_get_clock_ms(rt_clock);
+    qemu_mod_timer(s->aio_timer, now);
+}
+
+/* Account one more completed block and the time it took; the totals feed
+ * the average computed in blkcopy_can_switch(). */
+static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
+{
+    s->total_time = s->total_time + time;
+    s->blocks = s->blocks + 1;
+}
+
+/*
+ * Completion callback of a destination write.
+ *
+ * The request is always unlinked from the I/O list and freed together with
+ * its buffer.  On success the transfer time is accounted, the timer is
+ * kicked while in STAGE_BULK_FINISHED, and in later stages the chunk's
+ * in-flight bit is cleared.  On failure the error is recorded through
+ * blk_copy_handle_cb_error().
+ */
+static void blk_copy_write_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *blk = opaque;
+    BdrvCopyState *s = blk->state;
+
+    QLIST_REMOVE(blk, list);
+
+    if (ret >= 0) {
+        add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
+
+        /* schedule switch to STAGE_DIRTY on last bulk write completion */
+        if (s->stage == STAGE_BULK_FINISHED) {
+            qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+        }
+
+        if (s->stage > STAGE_BULK_FINISHED) {
+            set_aio_inflight(s, blk->sector, blk->nr_sectors, 0);
+        }
+    }
+
+    qemu_free(blk->buf);
+    qemu_free(blk);
+
+    if (ret < 0) {
+        blk_copy_handle_cb_error(s, ret);
+    }
+}
+
+/*
+ * Write the data of a completed read to the destination image.
+ *
+ * A new request takes over the read's buffer, position and start time and
+ * is linked into the I/O list.  If the write cannot be submitted, s->error
+ * is set to 1 and both the new request and the buffer are freed.
+ */
+static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
+{
+    BdrvCopyBlock *wr = qemu_mallocz(sizeof(*wr));
+
+    wr->state = s;
+    wr->sector = read_blk->sector;
+    wr->nr_sectors = read_blk->nr_sectors;
+    wr->time = read_blk->time;
+    wr->buf = read_blk->buf;
+    QLIST_INSERT_HEAD(&s->io_list, wr, list);
+
+    wr->iov.iov_base = read_blk->buf;
+    wr->iov.iov_len = read_blk->iov.iov_len;
+    qemu_iovec_init_external(&wr->qiov, &wr->iov, 1);
+
+    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
+    wr->aiocb = bdrv_aio_writev(s->dst, wr->sector, &wr->qiov,
+                                wr->iov.iov_len / BDRV_SECTOR_SIZE,
+                                blk_copy_write_cb, wr);
+    if (wr->aiocb) {
+        return;
+    }
+
+    /* submission failed: undo the bookkeeping done above */
+    s->error = 1;
+    QLIST_REMOVE(wr, list);
+    qemu_free(read_blk->buf);
+    qemu_free(wr);
+}
+
+/*
+ * Completion callback of a source read.
+ *
+ * On failure the request and its buffer are freed and the error is
+ * recorded.  On success the buffer is passed on to a destination write,
+ * the read request itself is freed, and the timer is kicked so that more
+ * I/O can be issued.
+ */
+static void blk_copy_read_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *blk = opaque;
+    BdrvCopyState *s = blk->state;
+
+    s->inflight_reads--;
+
+    if (ret >= 0) {
+        blk_copy_issue_write(s, blk);
+        QLIST_REMOVE(blk, list);
+        qemu_free(blk);
+        qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+    } else {
+        QLIST_REMOVE(blk, list);
+        qemu_free(blk->buf);
+        qemu_free(blk);
+        blk_copy_handle_cb_error(s, ret);
+    }
+}
+
+/*
+ * Start an asynchronous read of @nr_sectors sectors at @sector from the
+ * source image into a freshly allocated BLOCK_SIZE buffer.
+ *
+ * The request is linked into the I/O list and counted in inflight_reads.
+ * If the read cannot be submitted, s->error is set to 1 and all of that is
+ * undone.
+ */
+static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
+                                int nr_sectors)
+{
+    BdrvCopyBlock *rd = qemu_mallocz(sizeof(*rd));
+
+    rd->buf = qemu_mallocz(BLOCK_SIZE);
+    rd->state = s;
+    rd->sector = sector;
+    rd->nr_sectors = nr_sectors;
+    QLIST_INSERT_HEAD(&s->io_list, rd, list);
+
+    rd->iov.iov_base = rd->buf;
+    rd->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&rd->qiov, &rd->iov, 1);
+
+    s->inflight_reads++;
+    rd->time = qemu_get_clock_ns(rt_clock);
+    rd->aiocb = bdrv_aio_readv(s->src, sector, &rd->qiov, nr_sectors,
+                               blk_copy_read_cb, rd);
+    if (rd->aiocb) {
+        return;
+    }
+
+    /* submission failed: undo the bookkeeping done above */
+    s->error = 1;
+    s->inflight_reads--;
+    QLIST_REMOVE(rd, list);
+    qemu_free(rd->buf);
+    qemu_free(rd);
+}
+
+/*
+ * Decide whether the final switch may start.
+ *
+ * It may when nothing is dirty or no block has been timed yet; otherwise
+ * when the remaining dirty count multiplied by the average time per block
+ * does not exceed migrate_max_downtime().
+ */
+static bool blkcopy_can_switch(BdrvCopyState *s)
+{
+    int64_t remaining_dirty = bdrv_get_dirty_count(s->src);
+    int64_t avg_transfer_time;
+
+    if (remaining_dirty == 0 || s->blocks == 0) {
+        return true;
+    }
+
+    avg_transfer_time = s->total_time / s->blocks;
+    return (remaining_dirty * avg_transfer_time) <= migrate_max_downtime();
+}
+
+/*
+ * Dirty stage: scan forward from s->curr_sector, one chunk at a time, for
+ * a chunk that is dirty and has no copy I/O in flight.  The first one found
+ * is read, its dirty bit is reset and its in-flight bit is set; at most one
+ * read is issued per call.  When the scan reaches the end of the device
+ * the position wraps back to 0.  Always returns 0.
+ */
+static int blk_issue_reads_dirty(BdrvCopyState *s)
+{
+    int64_t sector = s->curr_sector;
+
+    while (sector < s->nr_sectors) {
+        if (bdrv_get_dirty(s->src, sector) && !aio_inflight(s, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            blk_copy_issue_read(s, sector, nr_sectors);
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+            set_aio_inflight(s, sector, nr_sectors, 1);
+            break;
+        }
+
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+        s->curr_sector = sector;
+    }
+
+    if (sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+    }
+    return 0;
+}
+
+/*
+ * Bulk stage: issue the read for the next chunk of the source image.
+ *
+ * With a shared base image, sectors that bdrv_is_allocated() reports as
+ * unallocated are skipped first.  The read starts at the chunk boundary at
+ * or below the resulting position and is clamped so that it never extends
+ * past the end of the device.
+ *
+ * Returns 1 when the end of the device has been reached (s->curr_sector is
+ * reset to 0 and nothing is issued), 0 after issuing a read.
+ */
+static int blk_issue_reads_bulk(BdrvCopyState *s)
+{
+    int nr_sectors;
+    int64_t curr_sector = s->curr_sector;
+
+    if (s->shared_base) {
+        while (curr_sector < s->nr_sectors &&
+                !bdrv_is_allocated(s->src, curr_sector,
+                                   MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
+                curr_sector += nr_sectors;
+        }
+    }
+
+    if (curr_sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+        return 1;
+    }
+
+    curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
+    /* the last chunk may be shorter than a full dirty chunk */
+    nr_sectors = MIN(s->nr_sectors - curr_sector,
+                     BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+    /* read from the position reached after skipping, and continue the
+     * next call right behind it */
+    blk_copy_issue_read(s, curr_sector, nr_sectors);
+    s->curr_sector = curr_sector + nr_sectors;
+    s->completed_sectors = curr_sector;
+    return 0;
+}
+
+/*
+ * Synchronously copy every chunk that is still dirty from the source to
+ * the destination, resetting each dirty bit after a successful copy.  The
+ * scan stops early once the dirty count reaches zero.  A failed read or
+ * write aborts the scan and sets s->error to 1.
+ */
+static void blkcopy_finish(BdrvCopyState *s)
+{
+    uint8_t *buf = qemu_malloc(BLOCK_SIZE);
+    int64_t sector = 0;
+    bool io_failed = false;
+
+    /* FIXME: speed up loop, get_next_dirty_block? */
+    while (sector < s->nr_sectors) {
+        if (bdrv_get_dirty(s->src, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            memset(buf, 0, BLOCK_SIZE);
+            if (bdrv_read(s->src, sector, buf, nr_sectors) < 0 ||
+                bdrv_write(s->dst, sector, buf, nr_sectors) < 0) {
+                io_failed = true;
+                break;
+            }
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+        }
+
+        if (bdrv_get_dirty_count(s->src) == 0) {
+            break;
+        }
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+    }
+
+    qemu_free(buf);
+    if (io_failed) {
+        s->error = 1;
+    }
+}
+
+/*
+ * Tear down a copy operation that has no I/O outstanding (both conditions
+ * are asserted): stop dirty tracking on the source, drop the drive
+ * reference and the in-use mark, free the in-flight bitmap and stop the
+ * timer.  The BdrvCopyState itself is not freed here; see blkcopy_free().
+ */
+static void blkcopy_cleanup(BdrvCopyState *s)
+{
+    assert(s->inflight_reads == 0);
+    assert(QLIST_EMPTY(&s->io_list));
+    bdrv_set_dirty_tracking(s->src, 0);
+    /* NOTE(review): presumably balances a reference and an in-use mark
+     * taken when the copy was started -- that code is not visible here. */
+    drive_put_ref(drive_get_by_blockdev(s->src));
+    bdrv_set_in_use(s->src, 0);
+    /* the bitmap is only allocated on entering STAGE_DIRTY */
+    if (s->stage >= STAGE_DIRTY) {
+        qemu_free(s->aio_bitmap);
+    }
+    qemu_del_timer(s->aio_timer);
+}
+
+/* Unlink the copy state from block_copy_list and free it.  Resources held
+ * by the state are not released here; blkcopy_cleanup() does that. */
+static void blkcopy_free(BdrvCopyState *s)
+{
+    QLIST_REMOVE(s, list);
+    qemu_free(s);
+}
+
+static void handle_error(BdrvCopyState *s)
+{
+    if (!QLIST_EMPTY(&s->io_list)) {
+        return;
+    }
+    s->failed = 1;
+    blkcopy_cleanup(s);
+}
+
+/*
+ * Switch the guest from the source image to mirrored writes.
+ *
+ * With the VM stopped and all AIO drained, the remaining dirty chunks are
+ * copied synchronously.  Both images are then closed and the source
+ * BlockDriverState is reopened as "blkmirror:<dst>:<src>".  On success the
+ * copy enters STAGE_MIRROR_WRITES and the timer is stopped.
+ *
+ * If an error was recorded before or during the final copy, the copy is
+ * abandoned through handle_error().  If the mirror cannot be opened, the
+ * source image is reopened under its original name and flags (aborting if
+ * even that fails) and the copy is cleaned up.  The VM is restarted on
+ * every path that returns.
+ */
+static void blkcopy_switch(BdrvCopyState *s)
+{
+    char src_filename[NAME_LEN];
+    /* two names plus the "blkmirror:" prefix and the ':' separator */
+    char mirror_name[NAME_LEN * 2 + sizeof("blkmirror::")];
+    int open_flags, ret;
+
+    /* snprintf always terminates the copy, unlike strncpy */
+    snprintf(src_filename, sizeof(src_filename), "%s", s->src->filename);
+    open_flags = s->src->open_flags;
+
+    assert(s->stage == STAGE_DIRTY);
+
+    vm_stop(VMSTOP_BLOCKCOPY);
+    /* flush any guest writes, dirty bitmap uptodate after this.
+     * copy AIO also finished.
+     */
+    qemu_aio_flush();
+    assert(QLIST_EMPTY(&s->io_list));
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    blkcopy_finish(s);
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    assert(bdrv_get_dirty_count(s->src) == 0);
+    /* turn dirty bitmap off */
+    bdrv_set_dirty_tracking(s->src, 0);
+    /* switch to double writes */
+    bdrv_flush_all();
+
+    /* Build the mirror name while both images are still open, so it does
+     * not depend on what bdrv_close() leaves in the filename fields. */
+    snprintf(mirror_name, sizeof(mirror_name),
+             "blkmirror:%s:%s", s->dst->filename, s->src->filename);
+
+    bdrv_close(s->src);
+    bdrv_close(s->dst);
+
+    /* reopen with the flags saved before the close */
+    ret = bdrv_open(s->src, mirror_name, open_flags, NULL);
+    if (ret < 0) {
+        error_report("%s: cannot open blkmirror device, err %d",
+                      mirror_name, ret);
+        s->failed = 1;
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_MIRROR_WRITES);
+    qemu_del_timer(s->aio_timer);
+
+vm_start:
+    vm_start();
+    return;
+
+err:
+    if (bdrv_open(s->src, src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image\n", __func__,
+                     s->src_filename);
+        abort();
+    }
+    blkcopy_cleanup(s);
+    goto vm_start;
+}
+
+#define BLKCOPY_INFLIGHT 2
+
+/*
+ * To simplify the implementation, the IO completion callbacks neither
+ * handle stage control nor submit IO for further blocks. A timer is used
+ * for that purpose.
+ */
+
+/*
+ * Timer callback that drives the copy: reports a pending error, issues bulk
+ * reads, moves from STAGE_BULK_FINISHED to STAGE_DIRTY once io_list has
+ * drained, issues dirty-chunk reads and starts the switch when
+ * blkcopy_can_switch() allows it.  At most BLKCOPY_INFLIGHT reads are kept
+ * outstanding.
+ */
+static void aio_timer(void *opaque)
+{
+    BdrvCopyState *s = opaque;
+
+    assert(s->cancelled == 0);
+    assert(s->stage < STAGE_MIRROR_WRITES);
+
+    if (s->error) {
+        handle_error(s);
+        return;
+    }
+
+    /* bulk stage: submit reads until the limit or the end of the image */
+    while (s->stage == STAGE_BULK && s->inflight_reads < BLKCOPY_INFLIGHT) {
+        if (blk_issue_reads_bulk(s)) {
+            blkcopy_set_stage(s, STAGE_BULK_FINISHED);
+        }
+    }
+
+    /* every bulk request has completed: start tracking dirty chunks */
+    if (s->stage == STAGE_BULK_FINISHED && QLIST_EMPTY(&s->io_list)) {
+        blkcopy_set_stage(s, STAGE_DIRTY);
+        alloc_aio_bitmap(s);
+    }
+
+    /* NOTE(review): this loop only exits on the read limit or the switch;
+     * confirm it cannot spin when no read can be issued. */
+    while (s->stage == STAGE_DIRTY && s->inflight_reads < BLKCOPY_INFLIGHT) {
+        blk_issue_reads_dirty(s);
+        if (blkcopy_can_switch(s)) {
+            BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_SWITCH_START);
+            blkcopy_switch(s);
+            return;
+        }
+    }
+}
+
+
+/*
+ * Monitor handler for block_copy_switch: finish a copy that is in
+ * STAGE_MIRROR_WRITES by reopening the device on the destination image only.
+ *
+ * Returns 0 on success.  Returns -1 if the device has no block copy, if the
+ * copy has not reached the mirrored stage yet, or if the destination cannot
+ * be opened; in the last case the device falls back to the original source
+ * image (abort() if that fails as well) and the copy is marked failed.
+ */
+int do_bdrv_copy_switch(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    BdrvCopyState *s = NULL;
+    int open_flags;
+
+    /* QLIST_FOREACH leaves s == NULL when no entry matched */
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (!strcmp(s->device_name, device)) {
+            if (s->stage != STAGE_MIRROR_WRITES) {
+                qerror_report(QERR_IN_PROGRESS, "block copy");
+                return -1;
+            }
+            break;
+        }
+    }
+
+    if (!s) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    open_flags = s->src->open_flags;
+
+    /* switch from mirrored writes to destination only */
+    bdrv_flush_all();
+    bdrv_close(s->src);
+    if (bdrv_open(s->src, s->dst->filename, s->src->open_flags, NULL) < 0) {
+        s->failed = 1;
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_SWITCH_FINISHED);
+    blkcopy_cleanup(s);
+    return 0;
+
+err:
+    if (bdrv_open(s->src, s->src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image\n", __func__,
+                     s->src_filename);
+        abort();
+    }
+    /* A failed copy is later freed without further cleanup, so release the
+     * drive reference and in-use flag here, as blkcopy_switch() does on its
+     * own fallback path. */
+    blkcopy_cleanup(s);
+    return -1;
+}
+
+/*
+ * Start a live block copy of @device to the existing image @filename.
+ *
+ * @shared_base: stored in the copy state; the bulk stage skips blocks that
+ *               are not allocated in the source when it is set.
+ *
+ * A finished or failed copy state left over for the same device is freed
+ * first.  Returns 0 once the copy has been queued, -1 (with a QError
+ * reported) if the device does not exist, the destination cannot be opened,
+ * a copy is already in progress on the device, or the image sizes differ.
+ */
+static int bdrv_copy_setup(const char *device, const char *filename,
+                           bool shared_base)
+{
+    int64_t sectors;
+    BdrvCopyState *blkcopy, *safe;
+    BlockDriverState *src, *dst;
+
+    src = bdrv_find(device);
+    if (!src) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    dst = bdrv_new("");
+    if (bdrv_open(dst, filename, src->open_flags, NULL) < 0) {
+        bdrv_delete(dst);
+        qerror_report(QERR_OPEN_FILE_FAILED, filename);
+        return -1;
+    }
+
+    QLIST_FOREACH_SAFE(blkcopy, &block_copy_list, list, safe) {
+        if (!strcmp(blkcopy->device_name, src->device_name)) {
+            if (blkcopy->stage == STAGE_SWITCH_FINISHED || blkcopy->failed) {
+                blkcopy_free(blkcopy);
+            } else {
+                /* dst is not owned by any copy state yet: release it */
+                bdrv_delete(dst);
+                qerror_report(QERR_IN_PROGRESS, "block copy");
+                return -1;
+            }
+        }
+    }
+
+    sectors = bdrv_getlength(src) >> BDRV_SECTOR_BITS;
+    if (sectors != bdrv_getlength(dst) >> BDRV_SECTOR_BITS) {
+        bdrv_delete(dst);
+        qerror_report(QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS);
+        return -1;
+    }
+
+    blkcopy = qemu_mallocz(sizeof(BdrvCopyState));
+    blkcopy->src = src;
+    blkcopy->dst = dst;
+    blkcopy->curr_sector = 0;
+    blkcopy->nr_sectors = sectors;
+    blkcopy_set_stage(blkcopy, STAGE_BULK);
+    blkcopy->aio_timer = qemu_new_timer_ms(rt_clock, aio_timer, blkcopy);
+    blkcopy->shared_base = shared_base;
+    /* the state is zero-filled, so copying size - 1 bytes keeps the name
+     * buffers NUL-terminated */
+    strncpy(blkcopy->device_name, blkcopy->src->device_name,
+            sizeof(blkcopy->device_name) - 1);
+    strncpy(blkcopy->src_filename, blkcopy->src->filename,
+            sizeof(blkcopy->src_filename) - 1);
+    strncpy(blkcopy->dst_filename, filename,
+            sizeof(blkcopy->dst_filename) - 1);
+
+    drive_get_ref(drive_get_by_blockdev(src));
+    bdrv_set_in_use(src, 1);
+    bdrv_set_dirty_tracking(src, 1);
+    /* run the first aio_timer() pass as soon as possible */
+    qemu_mod_timer(blkcopy->aio_timer, qemu_get_clock_ms(rt_clock));
+
+    QLIST_INSERT_HEAD(&block_copy_list, blkcopy, list);
+    return 0;
+}
+
+/*
+ * Monitor handler for block_copy.  Reads "device", "filename" and the
+ * optional "incremental" flag and starts the copy, unless a migration is
+ * active, in which case an in-progress error is reported and -1 returned.
+ */
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device, *filename;
+    bool shared_base;
+
+    device = qdict_get_str(qdict, "device");
+    filename = qdict_get_str(qdict, "filename");
+    shared_base = qdict_get_try_bool(qdict, "incremental", 0);
+
+    if (!migration_active()) {
+        return bdrv_copy_setup(device, filename, shared_base);
+    }
+
+    qerror_report(QERR_IN_PROGRESS, "migration");
+    return -1;
+}
+
+/*
+ * Monitor handler for block_copy_cancel.  Looks up the copy state of the
+ * named device, waits for its outstanding copy I/O, then cleans up and frees
+ * the state.  A missing, failed or already switched copy is reported as
+ * device-not-found and -1 is returned; otherwise returns 0.
+ */
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    BdrvCopyState *s;
+
+    /* s ends up NULL when the list holds no entry for this device */
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (strcmp(s->device_name, device) == 0) {
+            break;
+        }
+    }
+
+    if (s == NULL || s->failed || s->stage == STAGE_SWITCH_FINISHED) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    s->cancelled = 1;
+    /* keep flushing until no copy request is left on the list */
+    for (;;) {
+        qemu_aio_flush();
+        if (QLIST_EMPTY(&s->io_list)) {
+            break;
+        }
+    }
+    blkcopy_cleanup(s);
+    blkcopy_free(s);
+
+    return 0;
+}
+
+/*
+ * qlist_iter() callback: print one block copy entry on the monitor passed
+ * in @opaque as "<device>: status=<status> [percentage=<n> %]".
+ */
+static void blockcopy_print_dict(QObject *obj, void *opaque)
+{
+    QDict *c_dict;
+    Monitor *mon = opaque;
+
+    c_dict = qobject_to_qdict(obj);
+
+    monitor_printf(mon, "%s: status=%s ",
+                        qdict_get_str(c_dict, "device"),
+                        qdict_get_str(c_dict, "status"));
+
+    if (qdict_haskey(c_dict, "info")) {
+        QDict *qdict = qobject_to_qdict(qdict_get(c_dict, "info"));
+
+        /* the value is stored as a 64-bit integer (see the PRId64 used when
+         * it is created); "%ld" is wrong where long is 32 bits */
+        monitor_printf(mon, "percentage=%" PRId64 " %%",
+                       qdict_get_int(qdict, "percentage"));
+    }
+
+    monitor_printf(mon, "\n");
+}
+
+/* Print every entry of the block copy status list @data on the monitor. */
+void do_info_blockcopy_print(Monitor *mon, const QObject *data)
+{
+    QList *copies = qobject_to_qlist(data);
+
+    qlist_iter(copies, blockcopy_print_dict, mon);
+}
+
+/*
+ * Build the block copy status list: one dict per known copy with "device"
+ * and "status" ("failed", "active", "mirrored" or "completed").  Active
+ * copies also get an "info" dict holding the bulk-stage "percentage".
+ * The list is returned through @ret_data.
+ */
+void do_info_blockcopy(Monitor *mon, QObject **ret_data)
+{
+    QList *c_list;
+    BdrvCopyState *s;
+
+    c_list = qlist_new();
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        QObject *c_obj;
+        static const char *status[] = { "failed", "active", "mirrored",
+                                        "completed" };
+        int i;
+
+        if (s->failed) {
+            i = 0;
+        } else if (s->stage < STAGE_MIRROR_WRITES) {
+            i = 1;
+        } else if (s->stage < STAGE_SWITCH_FINISHED) {
+            i = 2;
+        } else {
+            i = 3;
+        }
+
+        c_obj = qobject_from_jsonf("{ 'device': %s, 'status': %s }",
+                                    s->device_name, status[i]);
+
+        if (i == 1) {
+            QDict *dict = qobject_to_qdict(c_obj);
+            QObject *obj;
+            int64_t percentage = 100;
+
+            /* an empty image has nothing left to copy; do not divide by 0 */
+            if (s->nr_sectors > 0) {
+                percentage = s->completed_sectors * 100 / s->nr_sectors;
+            }
+
+            /* FIXME: add dirty stage progress? */
+            obj = qobject_from_jsonf("{ 'percentage': %" PRId64 "}",
+                                     percentage);
+            qdict_put_obj(dict, "info", obj);
+        }
+        qlist_append_obj(c_list, c_obj);
+    }
+
+    *ret_data = QOBJECT(c_list);
+}
+
+/*
+ * Return true if any block copy that has not failed is still short of
+ * STAGE_SWITCH_FINISHED.
+ */
+bool block_copy_active(void)
+{
+    BdrvCopyState *s;
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (!s->failed && s->stage < STAGE_SWITCH_FINISHED) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
Index: qemu-block-copy/block-copy.h
===================================================================
--- /dev/null
+++ qemu-block-copy/block-copy.h
@@ -0,0 +1,26 @@
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_switch(Monitor *mon, const QDict *qdict, QObject **ret_data);
+
+void do_info_blockcopy_print(Monitor *mon, const QObject *data);
+void do_info_blockcopy(Monitor *mon, QObject **ret_data);
+
+bool block_copy_active(void);
+
+#endif /* BLOCK_COPY_H */
+
Index: qemu-block-copy/hmp-commands.hx
===================================================================
--- qemu-block-copy.orig/hmp-commands.hx
+++ qemu-block-copy/hmp-commands.hx
@@ -806,6 +806,63 @@ Set maximum speed to @var{value} (in byt
 ETEXI
 
     {
+        .name       = "block_copy",
+        .args_type  = "device:s,filename:s,incremental:-i",
+        .params     = "device filename [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between original and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+STEXI
+@item block_copy @var{device} @var{filename} [-i]
+@findex block_copy
+Live copy block device @var{device} to image @var{filename}.
+        -i for incremental copy (base image is shared)
+
+Destination image @var{filename} must be created with qemu-img prior
+to execution of this command, with image size equal to the original
+image size.
+
+Incremental copy allows the destination image @var{filename} to share
+a common base image with the original image. This option skips copying
+blocks which are not allocated in the original image.
+ETEXI
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+STEXI
+@item block_copy_cancel @var{device}
+@findex block_copy_cancel
+Cancel live block copy on @var{device}.
+ETEXI
+
+    {
+        .name       = "block_copy_switch",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "finish live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_switch,
+    },
+
+STEXI
+@item block_copy_switch @var{device}
+@findex block_copy_switch
+Finish live block copy on @var{device} by switching
+to destination image.
+ETEXI
+
+    {
         .name       = "migrate_set_downtime",
         .args_type  = "value:T",
         .params     = "value",
@@ -1352,6 +1409,8 @@ show device tree
 show qdev device model list
 @item info roms
 show roms
+@item info block-copy
+show block copy status
 @end table
 ETEXI
 
Index: qemu-block-copy/monitor.c
===================================================================
--- qemu-block-copy.orig/monitor.c
+++ qemu-block-copy/monitor.c
@@ -45,6 +45,7 @@
 #include "balloon.h"
 #include "qemu-timer.h"
 #include "migration.h"
+#include "block-copy.h"
 #include "kvm.h"
 #include "acl.h"
 #include "qint.h"
@@ -3101,6 +3102,14 @@ static const mon_cmd_t info_cmds[] = {
     },
 #endif
     {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
+    {
         .name       = NULL,
     },
 };
@@ -3242,6 +3251,14 @@ static const mon_cmd_t qmp_query_cmds[] 
         .mhandler.info_async = do_info_balloon,
         .flags      = MONITOR_CMD_ASYNC,
     },
+    {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
     { /* NULL */ },
 };
 
Index: qemu-block-copy/qmp-commands.hx
===================================================================
--- qemu-block-copy.orig/qmp-commands.hx
+++ qemu-block-copy/qmp-commands.hx
@@ -581,6 +581,98 @@ Example:
 EQMP
 
     {
+        .name       = "block_copy",
+        /* "incremental" must match the key read by do_bdrv_copy() */
+        .args_type  = "device:s,filename:s,incremental:-i",
+        .params     = "device filename [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between src and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+SQMP
+block_copy
+----------
+
+Live block copy.
+
+Arguments:
+
+- "device": device name (json-string)
+- "filename": target image filename (json-string)
+- "incremental": incremental disk copy (json-bool, optional)
+
+Example:
+
+-> { "execute": "block_copy",
+     "arguments": { "device": "ide0-hd1",
+                    "filename": "/mnt/new-disk.img"
+                  } }
+
+<- { "return": {} }
+
+Notes:
+
+(1) The 'query-block-copy' command should be used to check block copy progress
+    and final result (this information is provided by the 'status' member)
+(2) Boolean argument "incremental" defaults to false
+
+EQMP
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+SQMP
+block_copy_cancel
+--------------
+
+Cancel live block copy.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+    {
+        .name       = "block_copy_switch",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "finish live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_switch,
+    },
+
+SQMP
+block_copy_switch
+--------------
+
+Finish live block copy, switching device to destination image.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_switch", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "netdev_add",
         .args_type  = "netdev:O",
         .params     = "[user|tap|socket],id=str[,prop=value][,...]",
@@ -1744,6 +1836,50 @@ Examples:
 EQMP
 
 SQMP
+query-block-copy
+-------------
+
+Live block copy status.
+
+Information about each block copy instance is stored in a json-object, and the
+returned value is a json-array of all instances.
+
+Each json-object contains the following:
+
+- "device": device name (json-string)
+- "status": block copy status (json-string)
+    - Possible values: "active", "failed", "mirrored", "completed", meaning:
+                    - failed: block copy failed.
+                    - active: block copy active, copying to destination image.
+                    - mirrored: block copy active, finished copying to destination
+                      image, writes are mirrored.
+                    - completed: block copy completed.
+
+- "info": A json-object with the statistics information, if status is "active":
+    - "percentage": percentage completed (json-int)
+
+Example:
+
+Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed:
+
+-> { "execute": "query-block-copy" }
+<- {
+      "return":[
+        {"device":"ide1-hd0",
+            "status":"active",
+            "info":{
+               "percentage":23
+            }
+        },
+        {"device":"ide1-hd1",
+         "status":"failed"
+        }
+      ]
+   }
+
+EQMP
+
+SQMP
 query-balloon
 -------------
 
Index: qemu-block-copy/Makefile.objs
===================================================================
--- qemu-block-copy.orig/Makefile.objs
+++ qemu-block-copy/Makefile.objs
@@ -98,7 +98,7 @@ common-obj-y += buffered_file.o migratio
 common-obj-y += qemu-char.o savevm.o #aio.o
 common-obj-y += msmouse.o ps2.o
 common-obj-y += qdev.o qdev-properties.o
-common-obj-y += block-migration.o iohandler.o
+common-obj-y += block-migration.o iohandler.o block-copy.o
 common-obj-y += pflib.o
 common-obj-y += bitmap.o bitops.o
 
Index: qemu-block-copy/docs/block_copy.txt
===================================================================
--- /dev/null
+++ qemu-block-copy/docs/block_copy.txt
@@ -0,0 +1,39 @@
+Live block copy
+===============
+
+Block copy allows an image to be copied to a destination image, while
+the guest is operating on the source image. The command is:
+
+block_copy {device} {filename} [-i]
+
+Destination image {filename} must be created with qemu-img prior to
+execution of this command, with image size equal to the original image
+size.
+
+Incremental copy allows the destination image to share a common base
+image with the original image. This option skips copying blocks which
+are not allocated in the original image.
+
+
+Command flow
+============
+
+Once the copy initiated with the block_copy command has finished,
+writes by the guest to the source image will be mirrored to the
+destination image.
+
+This state is indicated by the "mirrored" status of a device, visible in
+query-block-copy command output.
+
+Once in the mirrored state, the management application can issue the
+block_copy_switch command to complete the block copy operation.
+
+A pseudo-algorithm follows:
+
+     block_copy(dev, file);
+     do {
+         query_block_copy(block_copy_state);
+         sleep(s)
+     } while (!block_copy_state.dev.mirrored);
+     block_copy_switch(dev);
+

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 7/7] do not allow migration if block copy in progress
  2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
                   ` (5 preceding siblings ...)
  2011-06-06 16:55 ` [Qemu-devel] [patch 6/7] QEMU " Marcelo Tosatti
@ 2011-06-06 16:55 ` Marcelo Tosatti
  6 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 16:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Marcelo Tosatti, Jes.Sorensen, dlaor, avi, jdenemar

[-- Attachment #1: 3-3-do-not-allow-migration-if-block-copy-in-progress.patch --]
[-- Type: text/plain, Size: 911 bytes --]

Disable live migration while a live block copy is in progress, due to:

- Conflict with block migration.
- Block copy does not support migration ATM.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/migration.c
===================================================================
--- qemu-block-copy.orig/migration.c
+++ qemu-block-copy/migration.c
@@ -19,6 +19,7 @@
 #include "block.h"
 #include "qemu_socket.h"
 #include "block-migration.h"
+#include "block-copy.h"
 #include "qemu-objects.h"
 
 //#define DEBUG_MIGRATION
@@ -95,6 +96,11 @@ int do_migrate(Monitor *mon, const QDict
         return -1;
     }
 
+    if (block_copy_active()) {
+        monitor_printf(mon, "block copy in progress\n");
+        return -1;
+    }
+
     if (strstart(uri, "tcp:", &p)) {
         s = tcp_start_outgoing_migration(mon, p, max_throttle, detach,
                                          blk, inc);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 6/7] QEMU live block copy (update)
  2011-06-06 16:55 ` [Qemu-devel] [patch 6/7] QEMU " Marcelo Tosatti
@ 2011-06-06 17:03   ` Marcelo Tosatti
  2011-06-07 10:15     ` Jiri Denemark
  2011-06-07 12:15   ` [Qemu-devel] [patch 6/7] QEMU live block copy Stefan Hajnoczi
  1 sibling, 1 reply; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-06 17:03 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Jes.Sorensen, jdenemar, dlaor, avi


Support live image copy + switch. That is, copy an image backing
a guest hard disk to a destination image (destination image must
be created separately), and switch to this copy.

Command syntax:

block_copy device filename [-i] -- live block copy device to image
             -i for incremental copy (base image shared between src and destination)

Please refer to qmp-commands diff for more details.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/block-copy.c
===================================================================
--- /dev/null
+++ qemu-block-copy/block-copy.c
@@ -0,0 +1,764 @@
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "blockdev.h"
+#include "qemu-queue.h"
+#include "qemu-timer.h"
+#include "monitor.h"
+#include "block-copy.h"
+#include "migration.h"
+#include "sysemu.h"
+#include "qjson.h"
+#include <assert.h>
+#include "hw/hw.h"
+
+#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
+#define MAX_IS_ALLOCATED_SEARCH 65536
+
+/*
+ * Stages:
+ *
+ * STAGE_BULK: bulk reads/writes in progress
+ * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
+ * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
+ * STAGE_MIRROR_WRITES: copy finished, writes mirrored to both images.
+ * STAGE_SWITCH_FINISHED: switched to new image.
+ */
+
+/* Declaration order matters: stages are compared with < and >= elsewhere. */
+enum BdrvCopyStage {
+    STAGE_BULK,
+    STAGE_BULK_FINISHED,
+    STAGE_DIRTY,
+    STAGE_MIRROR_WRITES,
+    STAGE_SWITCH_FINISHED,
+};
+
+#define NAME_LEN 1024
+#define DEV_LEN 256
+
+/* State of one block copy operation. */
+typedef struct BdrvCopyState {
+    BlockDriverState *src;      /* image being copied */
+    BlockDriverState *dst;      /* image being written */
+    bool shared_base;           /* bulk stage skips unallocated blocks */
+
+    int64_t curr_sector;        /* position of the bulk/dirty scan */
+    int64_t completed_sectors;  /* bulk progress */
+    int64_t nr_sectors;         /* sector count used as scan limit */
+
+    enum BdrvCopyStage stage;
+    int inflight_reads;         /* reads issued and not yet completed */
+    int error;                  /* non-zero once a copy request failed */
+    int failed;
+    int cancelled;
+    /* copy requests (reads and writes) that have not completed yet */
+    QLIST_HEAD(, BdrvCopyBlock) io_list;
+    /* one bit per dirty chunk, set while copy I/O for it is in flight */
+    unsigned long *aio_bitmap;
+    QEMUTimer *aio_timer;       /* rescheduled by the completion callbacks */
+    QLIST_ENTRY(BdrvCopyState) list;
+
+    /* completed writes and the sum of their transfer times, in ns; used to
+     * compute the average transfer time per block */
+    int64_t blocks;
+    int64_t total_time;
+
+    char device_name[DEV_LEN];
+    char src_filename[NAME_LEN];
+    char dst_filename[NAME_LEN];
+} BdrvCopyState;
+
+/* One asynchronous copy request: a read from src or a write to dst. */
+typedef struct BdrvCopyBlock {
+    BdrvCopyState *state;       /* copy this request belongs to */
+    uint8_t *buf;               /* data buffer, handed from read to write */
+    int64_t sector;             /* first sector of the request */
+    int64_t nr_sectors;
+    struct iovec iov;           /* single-element vector describing buf */
+    QEMUIOVector qiov;
+    BlockDriverAIOCB *aiocb;
+    int64_t time;               /* rt_clock ns when the read was issued */
+    QLIST_ENTRY(BdrvCopyBlock) list;
+} BdrvCopyBlock;
+
+static QLIST_HEAD(, BdrvCopyState) block_copy_list =
+    QLIST_HEAD_INITIALIZER(block_copy_list);
+
+/*
+ * Allocate the zero-filled in-flight bitmap for the source image: one bit
+ * per dirty chunk (BDRV_SECTORS_PER_DIRTY_CHUNK sectors).
+ */
+static void alloc_aio_bitmap(BdrvCopyState *s)
+{
+    BlockDriverState *bs = s->src;
+    const int64_t word = sizeof(unsigned long);
+    int64_t bitmap_size;
+
+    /* bytes needed to hold one bit per chunk */
+    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
+            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
+
+    /* The bitmap is read and written in unsigned long units, so round the
+     * allocation up to whole words to keep the last access in bounds. */
+    bitmap_size = (bitmap_size + word - 1) / word * word;
+
+    s->aio_bitmap = qemu_mallocz(bitmap_size);
+}
+
+/*
+ * Return whether the chunk containing @sector is marked in the in-flight
+ * bitmap.  Always false if no bitmap is allocated or @sector is at or past
+ * the end of the source image.
+ */
+static bool aio_inflight(BdrvCopyState *s, int64_t sector)
+{
+    const size_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    if (!s->aio_bitmap ||
+        (sector << BDRV_SECTOR_BITS) >= bdrv_getlength(s->src)) {
+        return 0;
+    }
+
+    return !!(s->aio_bitmap[chunk / bits_per_word] &
+        (1UL << (chunk % bits_per_word)));
+}
+
+/*
+ * Set (@set != 0) or clear (@set == 0) the in-flight bit of every chunk
+ * touched by the sector range [@sector_num, @sector_num + @nb_sectors).
+ */
+static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
+                             int nb_sectors, int set)
+{
+    int64_t chunk = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    int64_t last = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    while (chunk <= last) {
+        unsigned long idx = chunk / (sizeof(unsigned long) * 8);
+        unsigned long bit = chunk % (sizeof(unsigned long) * 8);
+        unsigned long mask = 1UL << bit;
+
+        if (set) {
+            s->aio_bitmap[idx] |= mask;
+        } else {
+            s->aio_bitmap[idx] &= ~mask;
+        }
+        chunk++;
+    }
+}
+
+/*
+ * Record the new stage and raise the matching blkdebug event on the
+ * destination's file.
+ */
+static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
+{
+    s->stage = stage;
+
+    if (stage == STAGE_BULK) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
+    } else if (stage == STAGE_BULK_FINISHED) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
+    } else if (stage == STAGE_DIRTY) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
+    } else if (stage == STAGE_MIRROR_WRITES) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_MIRROR_WRITES);
+    } else if (stage == STAGE_SWITCH_FINISHED) {
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
+    }
+}
+
+/*
+ * Record a failed copy request and arm the timer for "now"; the timer
+ * callback is what acts on the error.
+ */
+static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
+{
+    int64_t now;
+
+    s->error = ret;
+    now = qemu_get_clock_ms(rt_clock);
+    qemu_mod_timer(s->aio_timer, now);
+}
+
+/* Account one more transferred block and the time it took. */
+static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
+{
+    s->total_time += time;
+    s->blocks++;
+}
+
+/*
+ * Completion callback for a write to the destination.  The request is
+ * always unlinked and freed; on failure the error is recorded, on success
+ * the transfer time is accounted and, depending on the stage, the timer is
+ * rearmed or the in-flight bits of the written range are cleared.
+ */
+static void blk_copy_write_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *blk = opaque;
+    BdrvCopyState *s = blk->state;
+
+    QLIST_REMOVE(blk, list);
+
+    if (ret < 0) {
+        qemu_free(blk->buf);
+        qemu_free(blk);
+        blk_copy_handle_cb_error(s, ret);
+        return;
+    }
+
+    add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
+
+    if (s->stage == STAGE_BULK_FINISHED) {
+        /* schedule switch to STAGE_DIRTY on last bulk write completion */
+        qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+    } else if (s->stage > STAGE_BULK_FINISHED) {
+        set_aio_inflight(s, blk->sector, blk->nr_sectors, 0);
+    }
+
+    qemu_free(blk->buf);
+    qemu_free(blk);
+}
+
+/*
+ * Queue an asynchronous write to the destination for the data just read
+ * into @read_blk.  The new request takes over the read buffer.  If the
+ * write cannot be submitted, the error flag is set and both the buffer and
+ * the new request are freed.
+ */
+static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
+{
+    BdrvCopyBlock *wr = qemu_mallocz(sizeof(BdrvCopyBlock));
+
+    wr->state = s;
+    wr->sector = read_blk->sector;
+    wr->nr_sectors = read_blk->nr_sectors;
+    wr->time = read_blk->time;
+    wr->buf = read_blk->buf;
+    QLIST_INSERT_HEAD(&s->io_list, wr, list);
+
+    wr->iov.iov_base = read_blk->buf;
+    wr->iov.iov_len = read_blk->iov.iov_len;
+    qemu_iovec_init_external(&wr->qiov, &wr->iov, 1);
+
+    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
+    wr->aiocb = bdrv_aio_writev(s->dst, wr->sector, &wr->qiov,
+                                wr->iov.iov_len / BDRV_SECTOR_SIZE,
+                                blk_copy_write_cb, wr);
+    if (wr->aiocb) {
+        return;
+    }
+
+    /* submission failed: undo the bookkeeping done above */
+    s->error = 1;
+    QLIST_REMOVE(wr, list);
+    qemu_free(read_blk->buf);
+    qemu_free(wr);
+}
+
+/*
+ * Completion callback for a read from the source.  On success the data is
+ * handed to a write request and the timer is rearmed so that further reads
+ * get issued; on failure the request is dropped and the error recorded.
+ */
+static void blk_copy_read_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *blk = opaque;
+    BdrvCopyState *s = blk->state;
+
+    s->inflight_reads--;
+
+    if (ret >= 0) {
+        blk_copy_issue_write(s, blk);
+        QLIST_REMOVE(blk, list);
+        qemu_free(blk);
+        qemu_mod_timer(s->aio_timer, qemu_get_clock_ms(rt_clock));
+        return;
+    }
+
+    QLIST_REMOVE(blk, list);
+    qemu_free(blk->buf);
+    qemu_free(blk);
+    blk_copy_handle_cb_error(s, ret);
+}
+
+/*
+ * Queue an asynchronous read of @nr_sectors sectors starting at @sector
+ * from the source image into a freshly allocated BLOCK_SIZE buffer.  If the
+ * read cannot be submitted, the error flag is set and the request is freed.
+ */
+static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
+                                int nr_sectors)
+{
+    BdrvCopyBlock *rd = qemu_mallocz(sizeof(BdrvCopyBlock));
+
+    rd->buf = qemu_mallocz(BLOCK_SIZE);
+    rd->state = s;
+    rd->sector = sector;
+    rd->nr_sectors = nr_sectors;
+    QLIST_INSERT_HEAD(&s->io_list, rd, list);
+
+    rd->iov.iov_base = rd->buf;
+    rd->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&rd->qiov, &rd->iov, 1);
+
+    s->inflight_reads++;
+    /* start of the transfer time measured when the write completes */
+    rd->time = qemu_get_clock_ns(rt_clock);
+    rd->aiocb = bdrv_aio_readv(s->src, sector, &rd->qiov, nr_sectors,
+                               blk_copy_read_cb, rd);
+    if (rd->aiocb) {
+        return;
+    }
+
+    /* submission failed: undo the bookkeeping done above */
+    s->error = 1;
+    s->inflight_reads--;
+    QLIST_REMOVE(rd, list);
+    qemu_free(rd->buf);
+    qemu_free(rd);
+}
+
+/*
+ * Decide whether the final, guest-stopped copy can start: true when nothing
+ * is dirty, when no block has been timed yet, or when the remaining dirty
+ * count times the average transfer time fits in migrate_max_downtime().
+ */
+static bool blkcopy_can_switch(BdrvCopyState *s)
+{
+    int64_t dirty = bdrv_get_dirty_count(s->src);
+
+    if (dirty != 0 && s->blocks != 0) {
+        int64_t avg = s->total_time / s->blocks;
+
+        if ((dirty * avg) <= migrate_max_downtime()) {
+            return true;
+        }
+        return false;
+    }
+    return true;
+}
+
+/*
+ * Scan forward from curr_sector, one chunk at a time, for a dirty chunk
+ * with no copy I/O in flight.  The first one found is read, its dirty state
+ * reset and its in-flight bit set.  The scan position wraps to 0 when the
+ * end of the image is reached.  Always returns 0.
+ */
+static int blk_issue_reads_dirty(BdrvCopyState *s)
+{
+    int64_t sector = s->curr_sector;
+
+    while (sector < s->nr_sectors) {
+        if (bdrv_get_dirty(s->src, sector) && !aio_inflight(s, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            blk_copy_issue_read(s, sector, nr_sectors);
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+            set_aio_inflight(s, sector, nr_sectors, 1);
+            break;
+        }
+
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+        s->curr_sector = sector;
+    }
+
+    if (sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+    }
+    return 0;
+}
+
+/*
+ * Issue the next bulk read.  With shared_base set, sectors that are not
+ * allocated in the source are skipped first.
+ *
+ * Returns 1 when the end of the image has been reached (the scan position is
+ * reset to 0 and no read is issued), 0 after issuing one read.
+ */
+static int blk_issue_reads_bulk(BdrvCopyState *s)
+{
+    int nr_sectors;
+    int64_t curr_sector = s->curr_sector;
+
+    if (s->shared_base) {
+        while (curr_sector < s->nr_sectors &&
+                !bdrv_is_allocated(s->src, curr_sector,
+                                   MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
+                curr_sector += nr_sectors;
+        }
+    }
+
+    if (curr_sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+        return 1;
+    }
+
+    /* copy whole chunks, but never read past the end of the image */
+    curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
+    nr_sectors = MIN(s->nr_sectors - curr_sector,
+                     BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+    /* read at the position found above, not at the stale s->curr_sector,
+     * so that skipped unallocated ranges really are skipped */
+    blk_copy_issue_read(s, curr_sector, nr_sectors);
+    s->curr_sector = curr_sector + nr_sectors;
+    s->completed_sectors = curr_sector;
+    return 0;
+}
+
+/*
+ * Synchronously copy every chunk still marked dirty from source to
+ * destination, resetting its dirty state.  Stops early once the dirty count
+ * reaches zero.  Sets the error flag if a read or write fails.
+ */
+static void blkcopy_finish(BdrvCopyState *s)
+{
+    uint8_t *buf = qemu_malloc(BLOCK_SIZE);
+    int64_t sector = 0;
+
+    /* FIXME: speed up loop, get_next_dirty_block? */
+    while (sector < s->nr_sectors) {
+        if (bdrv_get_dirty(s->src, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            memset(buf, 0, BLOCK_SIZE);
+            if (bdrv_read(s->src, sector, buf, nr_sectors) < 0 ||
+                bdrv_write(s->dst, sector, buf, nr_sectors) < 0) {
+                s->error = 1;
+                break;
+            }
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+        }
+
+        if (bdrv_get_dirty_count(s->src) == 0) {
+            break;
+        }
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+    }
+
+    qemu_free(buf);
+}
+
+/*
+ * Release what a block copy holds on the source device: dirty tracking,
+ * the drive reference, the in-use flag, the AIO bitmap and the timer.
+ * bdrv_copy_setup() is where the reference, the in-use flag and dirty
+ * tracking are taken.
+ *
+ * Must only be called with no reads in flight and an empty io_list.
+ * Does not unlink or free the state itself; see blkcopy_free().
+ */
+static void blkcopy_cleanup(BdrvCopyState *s)
+{
+    assert(s->inflight_reads == 0);
+    assert(QLIST_EMPTY(&s->io_list));
+    bdrv_set_dirty_tracking(s->src, 0);
+    drive_put_ref(drive_get_by_blockdev(s->src));
+    bdrv_set_in_use(s->src, 0);
+    /* aio_timer() allocates the bitmap when it enters STAGE_DIRTY */
+    if (s->stage >= STAGE_DIRTY) {
+        qemu_free(s->aio_bitmap);
+    }
+    qemu_del_timer(s->aio_timer);
+}
+
+/*
+ * Unlink the copy state from the list it is on and free it.  Resources
+ * held on the source device are not touched here; blkcopy_cleanup() does
+ * that.
+ */
+static void blkcopy_free(BdrvCopyState *s)
+{
+    QLIST_REMOVE(s, list);
+    qemu_free(s);
+}
+
+/*
+ * Mark the copy as failed and release its resources, but only once no
+ * request is left on io_list; while requests remain this does nothing.
+ */
+static void handle_error(BdrvCopyState *s)
+{
+    if (QLIST_EMPTY(&s->io_list)) {
+        s->failed = 1;
+        blkcopy_cleanup(s);
+    }
+}
+
+/*
+ * Move a copy from the dirty stage to mirrored writes.
+ *
+ * The VM is stopped and pending AIO flushed, the chunks still marked dirty
+ * are copied synchronously, dirty tracking is turned off, and both images
+ * are closed so that the source BlockDriverState can be reopened as
+ * "blkmirror:<dst>:<src>".  The VM is restarted on every path.
+ *
+ * If an error was recorded, handle_error() is called and the switch is
+ * abandoned.  If the blkmirror device cannot be opened, the copy is marked
+ * failed and the source image is reopened; abort() is called if even that
+ * fails.
+ */
+static void blkcopy_switch(BdrvCopyState *s)
+{
+    char src_filename[NAME_LEN], mirror_name[NAME_LEN*2];
+    int open_flags, ret;
+
+    /*
+     * Saved for the reopen calls below.  snprintf() always terminates the
+     * buffer, unlike strncpy() into uninitialised stack memory.
+     */
+    snprintf(src_filename, sizeof(src_filename), "%s", s->src->filename);
+    open_flags = s->src->open_flags;
+
+    assert(s->stage == STAGE_DIRTY);
+
+    vm_stop(VMSTOP_BLOCKCOPY);
+    /* flush any guest writes, dirty bitmap uptodate after this.
+     * copy AIO also finished.
+     */
+    qemu_aio_flush();
+    assert(QLIST_EMPTY(&s->io_list));
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    blkcopy_finish(s);
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    assert(bdrv_get_dirty_count(s->src) == 0);
+    /* turn dirty bitmap off */
+    bdrv_set_dirty_tracking(s->src, 0);
+    /* switch to double writes */
+    bdrv_flush_all();
+    bdrv_close(s->src);
+    bdrv_close(s->dst);
+
+    /* use the values saved above rather than re-reading the closed source */
+    snprintf(mirror_name, sizeof(mirror_name),
+             "blkmirror:%s:%s", s->dst->filename, src_filename);
+
+    ret = bdrv_open(s->src, mirror_name, open_flags, NULL);
+    if (ret < 0) {
+        error_report("%s: cannot open blkmirror device, err %d",
+                      mirror_name, ret);
+        s->failed = 1;
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_MIRROR_WRITES);
+    qemu_del_timer(s->aio_timer);
+
+vm_start:
+    vm_start();
+    return;
+
+err:
+    if (bdrv_open(s->src, src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image", __func__,
+                     s->src_filename);
+        abort();
+    }
+    blkcopy_cleanup(s);
+    goto vm_start;
+}
+
+#define BLKCOPY_INFLIGHT 2
+
+/*
+ * To simplify the implementation, the IO completion callbacks do not
+ * handle stage control or submit IO for further blocks. A timer is used
+ * for such purpose.
+ */
+
+/*
+ * Timer callback that drives the copy state machine; opaque is the
+ * BdrvCopyState.
+ *
+ * It must not run for a cancelled copy or one that has already reached
+ * STAGE_MIRROR_WRITES.  A recorded error is passed to handle_error() and
+ * nothing else is done.  Otherwise reads are issued for the current stage
+ * until BLKCOPY_INFLIGHT reads are in flight, and the stage is advanced
+ * when its work is done.
+ *
+ * NOTE(review): the timer is not re-armed here; presumably that is done
+ * elsewhere in the file (e.g. on I/O completion) - confirm.
+ */
+static void aio_timer(void *opaque)
+{
+    BdrvCopyState *s = opaque;
+
+    assert(s->cancelled == 0);
+    assert(s->stage < STAGE_MIRROR_WRITES);
+
+    if (s->error) {
+        handle_error(s);
+        return;
+    }
+
+    /* bulk stage: keep issuing reads until the device end is reached */
+    while (s->stage == STAGE_BULK) {
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        if (blk_issue_reads_bulk(s)) {
+            blkcopy_set_stage(s, STAGE_BULK_FINISHED);
+        }
+    }
+
+    /* wait for the bulk requests to drain before entering the dirty stage */
+    if (s->stage == STAGE_BULK_FINISHED) {
+        if (QLIST_EMPTY(&s->io_list)) {
+            blkcopy_set_stage(s, STAGE_DIRTY);
+            alloc_aio_bitmap(s);
+        }
+    }
+
+    /* dirty stage: copy dirty chunks until blkcopy_can_switch() allows
+     * the switch to mirrored writes */
+    while (s->stage == STAGE_DIRTY) {
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        blk_issue_reads_dirty(s);
+        if (blkcopy_can_switch(s)) {
+            BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_SWITCH_START);
+            blkcopy_switch(s);
+            return;
+        }
+    }
+}
+
+
+/*
+ * Monitor handler for block_copy_switch: finish a block copy that is in
+ * the mirrored-writes stage by reopening the device on the destination
+ * image only.
+ *
+ * Returns 0 on success.  Returns -1 with an error reported if the device
+ * has no block copy, the copy is not yet in STAGE_MIRROR_WRITES, or the
+ * destination image cannot be opened.  In the last case the copy is marked
+ * failed, its resources are released and the device falls back to the
+ * source image; abort() is called if the fallback fails too.
+ */
+int do_bdrv_copy_switch(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    BdrvCopyState *s = NULL;
+    int open_flags;
+
+    /* s is NULL after the loop when no entry matches */
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (!strcmp(s->device_name, device)) {
+            if (s->stage != STAGE_MIRROR_WRITES) {
+                qerror_report(QERR_IN_PROGRESS, "block copy");
+                return -1;
+            }
+            break;
+        }
+    }
+
+    if (!s) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    /* saved before bdrv_close(); used for both reopen attempts */
+    open_flags = s->src->open_flags;
+
+    /* switch from mirrored writes to destination only */
+    bdrv_flush_all();
+    bdrv_close(s->src);
+    if (bdrv_open(s->src, s->dst->filename, open_flags, NULL) < 0) {
+        s->failed = 1;
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_SWITCH_FINISHED);
+    blkcopy_cleanup(s);
+    return 0;
+
+err:
+    if (bdrv_open(s->src, s->src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image", __func__,
+                     s->src_filename);
+        abort();
+    }
+    /* tell the monitor why the command failed */
+    qerror_report(QERR_OPEN_FILE_FAILED, s->dst->filename);
+    /*
+     * A failed copy is later freed by bdrv_copy_setup() without cleanup,
+     * so drop the drive reference and in-use flag here, as the failure
+     * path of blkcopy_switch() does.
+     */
+    blkcopy_cleanup(s);
+    return -1;
+}
+
+/*
+ * Start a live block copy of @device to the existing image @filename.
+ *
+ * The destination is opened with the source's open flags and must have the
+ * same size in sectors as the source.  A finished or failed copy state for
+ * the same device is freed first; a copy still in progress makes the call
+ * fail.  On success the source gets a drive reference, is marked in use,
+ * has dirty tracking enabled, and the copy timer is armed to run at once.
+ *
+ * @shared_base: skip blocks not allocated in the source (incremental copy)
+ *
+ * Returns 0 on success, -1 with an error reported otherwise.
+ */
+static int bdrv_copy_setup(const char *device, const char *filename,
+                           bool shared_base)
+{
+    int64_t sectors;
+    BdrvCopyState *blkcopy, *safe;
+    BlockDriverState *src, *dst;
+
+    src = bdrv_find(device);
+    if (!src) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    dst = bdrv_new("");
+    if (bdrv_open(dst, filename, src->open_flags, NULL) < 0) {
+        bdrv_delete(dst);
+        qerror_report(QERR_OPEN_FILE_FAILED, filename);
+        return -1;
+    }
+
+    QLIST_FOREACH_SAFE(blkcopy, &block_copy_list, list, safe) {
+        if (!strcmp(blkcopy->device_name, src->device_name)) {
+            if (blkcopy->stage == STAGE_SWITCH_FINISHED || blkcopy->failed) {
+                blkcopy_free(blkcopy);
+            } else {
+                qerror_report(QERR_IN_PROGRESS, "block copy");
+                goto err_delete_dst;
+            }
+        }
+    }
+
+    sectors = bdrv_getlength(src) >> BDRV_SECTOR_BITS;
+    if (sectors != bdrv_getlength(dst) >> BDRV_SECTOR_BITS) {
+        qerror_report(QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS);
+        goto err_delete_dst;
+    }
+
+    blkcopy = qemu_mallocz(sizeof(BdrvCopyState));
+    blkcopy->src = src;
+    blkcopy->dst = dst;
+    blkcopy->curr_sector = 0;
+    blkcopy->nr_sectors = sectors;
+    blkcopy_set_stage(blkcopy, STAGE_BULK);
+    blkcopy->aio_timer = qemu_new_timer_ms(rt_clock, aio_timer, blkcopy);
+    blkcopy->shared_base = shared_base;
+    /* the state was zero-filled, so copying size - 1 bytes leaves the
+     * strings terminated */
+    strncpy(blkcopy->device_name, blkcopy->src->device_name,
+            sizeof(blkcopy->device_name) - 1);
+    strncpy(blkcopy->src_filename, blkcopy->src->filename,
+            sizeof(blkcopy->src_filename) - 1);
+    strncpy(blkcopy->dst_filename, filename,
+            sizeof(blkcopy->dst_filename) - 1);
+
+    drive_get_ref(drive_get_by_blockdev(src));
+    bdrv_set_in_use(src, 1);
+    bdrv_set_dirty_tracking(src, 1);
+    qemu_mod_timer(blkcopy->aio_timer, qemu_get_clock_ms(rt_clock));
+
+    QLIST_INSERT_HEAD(&block_copy_list, blkcopy, list);
+    return 0;
+
+err_delete_dst:
+    /* nothing else refers to the destination yet; do not leak it */
+    bdrv_delete(dst);
+    return -1;
+}
+
+/*
+ * Monitor handler for block_copy.
+ *
+ * Reads the "device" and "filename" arguments and the optional boolean
+ * "incremental" (default false) from qdict.  Refuses to start while a
+ * migration is active; otherwise returns the result of bdrv_copy_setup().
+ */
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    const char *filename = qdict_get_str(qdict, "filename");
+    bool shared_base = qdict_get_try_bool(qdict, "incremental", 0);
+
+    if (migration_active()) {
+        qerror_report(QERR_IN_PROGRESS, "migration");
+        return -1;
+    }
+
+    return bdrv_copy_setup(device, filename, shared_base);
+}
+
+/*
+ * Monitor handler for block_copy_cancel.
+ *
+ * Looks up the copy state for "device".  A device without one, or whose
+ * copy has already completed or failed, is reported as not found.
+ * Otherwise the copy is flagged cancelled, AIO is flushed until its
+ * io_list is empty, and the state is cleaned up and freed.
+ *
+ * Returns 0 on success, -1 with an error reported otherwise.
+ */
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    BdrvCopyState *s;
+
+    /* the loop leaves s NULL when no entry matches */
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (strcmp(s->device_name, device) == 0) {
+            break;
+        }
+    }
+
+    if (s == NULL || s->failed || s->stage == STAGE_SWITCH_FINISHED) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    s->cancelled = 1;
+    for (;;) {
+        qemu_aio_flush();
+        if (QLIST_EMPTY(&s->io_list)) {
+            break;
+        }
+    }
+
+    blkcopy_cleanup(s);
+    blkcopy_free(s);
+    return 0;
+}
+
+/*
+ * qlist_iter() callback: print one block copy entry on the monitor passed
+ * in opaque.  obj is a dictionary with "device" and "status" strings and,
+ * optionally, an "info" dictionary holding an integer "percentage".
+ */
+static void blockcopy_print_dict(QObject *obj, void *opaque)
+{
+    QDict *c_dict;
+    Monitor *mon = opaque;
+
+    c_dict = qobject_to_qdict(obj);
+
+    monitor_printf(mon, "%s: status=%s ",
+                        qdict_get_str(c_dict, "device"),
+                        qdict_get_str(c_dict, "status"));
+
+    if (qdict_haskey(c_dict, "info")) {
+        QDict *qdict = qobject_to_qdict(qdict_get(c_dict, "info"));
+
+        /* qdict_get_int() returns int64_t; "%ld" is wrong where long is
+         * 32 bits wide */
+        monitor_printf(mon, "percentage=%" PRId64 " %%",
+                       qdict_get_int(qdict, "percentage"));
+    }
+
+    monitor_printf(mon, "\n");
+}
+
+/*
+ * Print handler for "info block-copy": data is converted to a list and
+ * each element is printed with blockcopy_print_dict().
+ */
+void do_info_blockcopy_print(Monitor *mon, const QObject *data)
+{
+    qlist_iter(qobject_to_qlist(data), blockcopy_print_dict, mon);
+}
+
+/*
+ * Build the block copy status list for the monitor.
+ *
+ * *ret_data receives a list with one dictionary per known block copy,
+ * holding "device" and "status" ("failed", "active", "mirrored" or
+ * "completed").  Entries with status "active" also carry an "info"
+ * dictionary with the bulk-stage "percentage".
+ */
+void do_info_blockcopy(Monitor *mon, QObject **ret_data)
+{
+    QList *c_list;
+    BdrvCopyState *s;
+
+    c_list = qlist_new();
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        QObject *c_obj;
+        static const char *status[] = { "failed", "active", "mirrored",
+                                        "completed" };
+        int i;
+
+        if (s->failed) {
+            i = 0;
+        } else if (s->stage < STAGE_MIRROR_WRITES) {
+            i = 1;
+        } else if (s->stage < STAGE_SWITCH_FINISHED) {
+            i = 2;
+        } else {
+            i = 3;
+        }
+
+        c_obj = qobject_from_jsonf("{ 'device': %s, 'status': %s }",
+                                    s->device_name, status[i]);
+
+        if (i == 1) {
+            QDict *dict = qobject_to_qdict(c_obj);
+            QObject *obj;
+            /* a zero-length device has nothing to copy; report it as done
+             * instead of dividing by zero */
+            int64_t percentage = 100;
+
+            if (s->nr_sectors > 0) {
+                percentage = s->completed_sectors * 100 / s->nr_sectors;
+            }
+
+            /* FIXME: add dirty stage progress? */
+            obj = qobject_from_jsonf("{ 'percentage': %" PRId64 "}",
+                                     percentage);
+            qdict_put_obj(dict, "info", obj);
+        }
+        qlist_append_obj(c_list, c_obj);
+    }
+
+    *ret_data = QOBJECT(c_list);
+}
+
+/*
+ * Report whether any block copy is still under way, i.e. has neither
+ * failed nor reached STAGE_SWITCH_FINISHED.
+ */
+bool block_copy_active(void)
+{
+    BdrvCopyState *copy;
+
+    QLIST_FOREACH(copy, &block_copy_list, list) {
+        if (!copy->failed && copy->stage < STAGE_SWITCH_FINISHED) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
Index: qemu-block-copy/block-copy.h
===================================================================
--- /dev/null
+++ qemu-block-copy/block-copy.h
@@ -0,0 +1,26 @@
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_switch(Monitor *mon, const QDict *qdict, QObject **ret_data);
+
+void do_info_blockcopy_print(Monitor *mon, const QObject *data);
+void do_info_blockcopy(Monitor *mon, QObject **ret_data);
+
+bool block_copy_active(void);
+
+#endif /* BLOCK_COPY_H */
+
Index: qemu-block-copy/hmp-commands.hx
===================================================================
--- qemu-block-copy.orig/hmp-commands.hx
+++ qemu-block-copy/hmp-commands.hx
@@ -806,6 +806,63 @@ Set maximum speed to @var{value} (in byt
 ETEXI
 
     {
+        .name       = "block_copy",
+        .args_type  = "device:s,filename:s,incremental:-i",
+        .params     = "device filename [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between original and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+STEXI
+@item block_copy @var{device} @var{filename} [-i]
+@findex block_copy
+Live copy block device @var{device} to image @var{filename}.
+        -i for incremental copy (base image is shared)
+
+Destination image @var{filename} must be created with qemu-img prior
+to execution of this command, with image size equal to the original
+image size.
+
+Incremental copy allows the destination image @var{filename} to share
+a common base image with the original image. This option skips copying
+blocks which are not allocated in the original image.
+ETEXI
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+STEXI
+@item block_copy_cancel @var{device}
+@findex block_copy_cancel
+Cancel live block copy on @var{device}.
+ETEXI
+
+    {
+        .name       = "block_copy_switch",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "finish live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_switch,
+    },
+
+STEXI
+@item block_copy_switch @var{device}
+@findex block_copy_switch
+Finish live block copy on @var{device} by switching
+to destination image.
+ETEXI
+
+    {
         .name       = "migrate_set_downtime",
         .args_type  = "value:T",
         .params     = "value",
@@ -1352,6 +1409,8 @@ show device tree
 show qdev device model list
 @item info roms
 show roms
+@item info block-copy
+show block copy status
 @end table
 ETEXI
 
Index: qemu-block-copy/monitor.c
===================================================================
--- qemu-block-copy.orig/monitor.c
+++ qemu-block-copy/monitor.c
@@ -45,6 +45,7 @@
 #include "balloon.h"
 #include "qemu-timer.h"
 #include "migration.h"
+#include "block-copy.h"
 #include "kvm.h"
 #include "acl.h"
 #include "qint.h"
@@ -3101,6 +3102,14 @@ static const mon_cmd_t info_cmds[] = {
     },
 #endif
     {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
+    {
         .name       = NULL,
     },
 };
@@ -3242,6 +3251,14 @@ static const mon_cmd_t qmp_query_cmds[] 
         .mhandler.info_async = do_info_balloon,
         .flags      = MONITOR_CMD_ASYNC,
     },
+    {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
     { /* NULL */ },
 };
 
Index: qemu-block-copy/qmp-commands.hx
===================================================================
--- qemu-block-copy.orig/qmp-commands.hx
+++ qemu-block-copy/qmp-commands.hx
@@ -581,6 +581,98 @@ Example:
 EQMP
 
     {
+        .name       = "block_copy",
+        .args_type  = "device:s,filename:s,incremental:-i",
+        .params     = "device filename [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between src and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+SQMP
+block_copy
+----------
+
+Live block copy.
+
+Arguments:
+
+- "device": device name (json-string)
+- "filename": target image filename (json-string)
+- "incremental": incremental disk copy (json-bool, optional)
+
+Example:
+
+-> { "execute": "block_copy",
+     "arguments": { "device": "ide0-hd1",
+                    "filename": "/mnt/new-disk.img"
+                  } }
+
+<- { "return": {} }
+
+Notes:
+
+(1) The 'query-block-copy' command should be used to check block copy progress
+    and final result (this information is provided by the 'status' member)
+(2) Boolean argument "incremental" defaults to false
+
+EQMP
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+SQMP
+block_copy_cancel
+--------------
+
+Cancel live block copy.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+    {
+        .name       = "block_copy_switch",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "finish live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_switch,
+    },
+
+SQMP
+block_copy_switch
+--------------
+
+Finish live block copy, switching device to destination image.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_switch", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "netdev_add",
         .args_type  = "netdev:O",
         .params     = "[user|tap|socket],id=str[,prop=value][,...]",
@@ -1744,6 +1836,50 @@ Examples:
 EQMP
 
 SQMP
+query-block-copy
+-------------
+
+Live block copy status.
+
+Each block copy instance information is stored in a json-object and the returned
+value is a json-array of all instances.
+
+Each json-object contains the following:
+
+- "device": device name (json-string)
+- "status": block copy status (json-string)
+    - Possible values: "active", "failed", "mirrored", "completed", meaning:
+                    - failed: block copy failed.
+                    - active: block copy active, copying to destination image.
+                    - mirrored: block copy active, finished copying to destination
+                      image, writes are mirrored.
+                    - completed: block copy completed.
+
+- "info": A json-object with the statistics information, if status is "active":
+    - "percentage": percentage completed (json-int)
+
+Example:
+
+Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed:
+
+-> { "execute": "query-block-copy" }
+<- {
+      "return":[
+        {"device":"ide1-hd0",
+            "status":"active",
+            "info":{
+               "percentage":23
+            }
+        },
+        {"device":"ide1-hd1",
+         "status":"failed"
+        }
+      ]
+   }
+
+EQMP
+
+SQMP
 query-balloon
 -------------
 
Index: qemu-block-copy/Makefile.objs
===================================================================
--- qemu-block-copy.orig/Makefile.objs
+++ qemu-block-copy/Makefile.objs
@@ -98,7 +98,7 @@ common-obj-y += buffered_file.o migratio
 common-obj-y += qemu-char.o savevm.o #aio.o
 common-obj-y += msmouse.o ps2.o
 common-obj-y += qdev.o qdev-properties.o
-common-obj-y += block-migration.o iohandler.o
+common-obj-y += block-migration.o iohandler.o block-copy.o
 common-obj-y += pflib.o
 common-obj-y += bitmap.o bitops.o
 
Index: qemu-block-copy/docs/block_copy.txt
===================================================================
--- /dev/null
+++ qemu-block-copy/docs/block_copy.txt
@@ -0,0 +1,39 @@
+Live block copy
+===============
+
+Block copy allows an image to be copied to a destination image, while
+the guest is operating on the source image. The command is:
+
+block_copy {device} {filename} [-i]
+
+Destination image {filename} must be created with qemu-img prior to
+execution of this command, with image size equal to the original image
+size.
+
+Incremental copy allows the destination image to share a common base
+image with the original image. This option skips copying blocks which
+are not allocated in the original image.
+
+
+Command flow
+============
+
+Once the copy initiated with the block_copy command has finished,
+writes by the guest to the source image will be mirrored to the
+destination image.
+
+This state is indicated by the "mirrored" status of a device, visible in
+query-block-copy command output.
+
+Once in the mirrored state, the management application can issue the
+block_copy_switch command to complete the block copy operation.
+
+A pseudo-algorithm follows:
+
+     block_copy(dev, file);
+     do {
+         query_block_copy(block_copy_state);
+         sleep(s);
+     } while (!block_copy_state.dev.mirrored);
+     block_copy_switch(dev);
+

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 2/7] Add blkmirror block driver
  2011-06-06 16:55 ` [Qemu-devel] [patch 2/7] Add blkmirror block driver Marcelo Tosatti
@ 2011-06-06 21:52   ` malc
  2011-06-07 10:25   ` Stefan Hajnoczi
  1 sibling, 0 replies; 22+ messages in thread
From: malc @ 2011-06-06 21:52 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kwolf, Jes.Sorensen, dlaor, qemu-devel, avi, jdenemar

On Mon, 6 Jun 2011, Marcelo Tosatti wrote:

> Mirrored writes are used by live block copy.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Index: qemu-block-copy/block/blkmirror.c
> ===================================================================
> --- /dev/null
> +++ qemu-block-copy/block/blkmirror.c
> @@ -0,0 +1,277 @@
> +/*
> + * Block driver for mirrored writes.
> + *
> + * Copyright (C) 2011 Red Hat, Inc.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include <stdarg.h>
> +#include "block_int.h"
> +
> +typedef struct {
> +    BlockDriverState *bs[2];
> +} BdrvMirrorState;
> +
> +typedef struct DupAIOCB DupAIOCB;
> +
> +typedef struct SingleAIOCB {
> +    BlockDriverAIOCB *aiocb;
> +    int finished;
> +    DupAIOCB *parent;
> +} SingleAIOCB;
> +
> +struct DupAIOCB {
> +    BlockDriverAIOCB common;
> +    int count;
> +
> +    BlockDriverCompletionFunc *cb;
> +    SingleAIOCB aios[2];
> +    int ret;
> +};
> +
> +/* Valid blkmirror filenames look like
> + * blkmirror:path/to/image1:path/to/image2 */
> +static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int ret, escape, i, n;
> +    char *raw;
> +
> +    /* Parse the blkmirror: prefix */
> +    if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
> +        return -EINVAL;
> +    }
> +    filename += strlen("blkmirror:");
> +
> +    /* Parse the raw image filename */
> +    raw = malloc(strlen(filename));
> +    escape = 0;
> +    for (i = n = 0; i < strlen(filename); i++) {
> +        if (!escape && filename[i] == ':') {
> +            break;
> +        }
> +        if (!escape && filename[i] == '\\') {
> +            escape = 1;
> +        } else {
> +            escape = 0;
> +        }
> +
> +        if (!escape) {
> +            raw[n++] = filename[i];
> +        }
> +    }
> +    raw[n] = '\0';

This potentially writes past the memory allocated for raw.

> +
> +    m->bs[0] = bdrv_new("");
> +    if (m->bs[0] == NULL) {
> +        return -ENOMEM;
> +    }
> +    ret = bdrv_open(m->bs[0], raw, flags, NULL);
> +    free(raw);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    filename += i + 1;
> +
> +    m->bs[1] = bdrv_new("");
> +    if (m->bs[1] == NULL) {
> +        return -ENOMEM;
> +    }
> +    ret = bdrv_open(m->bs[1], filename, flags, NULL);
> +    if (ret < 0) {
> +        bdrv_delete(m->bs[0]);
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static void blkmirror_close(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        bdrv_delete(m->bs[i]);
> +        m->bs[i] = NULL;
> +    }
> +}
> +
> +static int blkmirror_flush(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +
> +    bdrv_flush(m->bs[0]);
> +    bdrv_flush(m->bs[1]);
> +
> +    return 0;
> +}
> +
> +static int64_t blkmirror_getlength(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +
> +    return bdrv_getlength(m->bs[0]);
> +}
> +
> +static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
> +                                             int64_t sector_num,
> +                                             QEMUIOVector *qiov,
> +                                             int nb_sectors,
> +                                             BlockDriverCompletionFunc *cb,
> +                                             void *opaque)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
> +}
> +
> +static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
> +    int i;
> +
> +    for (i = 0 ; i < 2; i++) {
> +        if (!acb->aios[i].finished) {
> +            bdrv_aio_cancel(acb->aios[i].aiocb);
> +        }
> +    }
> +    qemu_aio_release(acb);
> +}
> +
> +static AIOPool dup_aio_pool = {
> +    .aiocb_size         = sizeof(DupAIOCB),
> +    .cancel             = dup_aio_cancel,
> +};
> +
> +static void blkmirror_aio_cb(void *opaque, int ret)
> +{
> +    SingleAIOCB *scb = opaque;
> +    DupAIOCB *dcb = scb->parent;
> +
> +    scb->finished = 1;
> +    dcb->count--;
> +    assert(dcb->count >= 0);
> +    if (ret < 0) {
> +        dcb->ret = ret;
> +    }
> +    if (dcb->count == 0) {
> +        dcb->common.cb(dcb->common.opaque, dcb->ret);
> +        qemu_aio_release(dcb);
> +    }
> +}
> +
> +static DupAIOCB *dup_aio_get(BlockDriverState *bs,
> +                             BlockDriverCompletionFunc *cb,
> +                             void *opaque)
> +{
> +    DupAIOCB *dcb;
> +    int i;
> +
> +    dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
> +    if (!dcb) {
> +        return NULL;
> +    }
> +    dcb->count = 2;
> +    for (i = 0; i < 2; i++) {
> +        dcb->aios[i].parent = dcb;
> +        dcb->aios[i].finished = 0;
> +    }
> +    dcb->ret = 0;
> +
> +    return dcb;
> +}
> +
> +static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
> +                                              int64_t sector_num,
> +                                              QEMUIOVector *qiov,
> +                                              int nb_sectors,
> +                                              BlockDriverCompletionFunc *cb,
> +                                              void *opaque)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
> +                                             nb_sectors, &blkmirror_aio_cb,
> +                                             &dcb->aios[i]);
> +        if (!dcb->aios[i].aiocb) {
> +            int a;
> +
> +            for (a = 0; a < i; a++) {
> +                bdrv_aio_cancel(dcb->aios[i].aiocb);
> +            }
> +            qemu_aio_release(dcb);
> +            return NULL;
> +        }
> +    }
> +
> +    return &dcb->common;
> +}
> +
> +static BlockDriverAIOCB *blkmirror_aio_flush(BlockDriverState *bs,
> +                                             BlockDriverCompletionFunc *cb,
> +                                             void *opaque)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        dcb->aios[i].aiocb = bdrv_aio_flush(m->bs[i], &blkmirror_aio_cb,
> +                                            &dcb->aios[i]);
> +        if (!dcb->aios[i].aiocb) {
> +            int a;
> +
> +            for (a = 0; a < i; a++) {
> +                bdrv_aio_cancel(dcb->aios[i].aiocb);
> +            }
> +            qemu_aio_release(dcb);
> +            return NULL;
> +        }
> +    }
> +
> +    return &dcb->common;
> +}
> +
> +static int blkmirror_discard(BlockDriverState *bs, int64_t sector_num,
> +                             int nb_sectors)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int ret;
> +
> +    ret = bdrv_discard(m->bs[0], sector_num, nb_sectors);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_discard(m->bs[1], sector_num, nb_sectors);
> +}
> +
> +
> +static BlockDriver bdrv_blkmirror = {
> +    .format_name        = "blkmirror",
> +    .protocol_name      = "blkmirror",
> +    .instance_size      = sizeof(BdrvMirrorState),
> +
> +    .bdrv_getlength     = blkmirror_getlength,
> +
> +    .bdrv_file_open     = blkmirror_open,
> +    .bdrv_close         = blkmirror_close,
> +    .bdrv_flush         = blkmirror_flush,
> +    .bdrv_discard       = blkmirror_discard,
> +
> +    .bdrv_aio_readv     = blkmirror_aio_readv,
> +    .bdrv_aio_writev    = blkmirror_aio_writev,
> +    .bdrv_aio_flush     = blkmirror_aio_flush,
> +};
> +
> +static void bdrv_blkmirror_init(void)
> +{
> +    bdrv_register(&bdrv_blkmirror);
> +}
> +
> +block_init(bdrv_blkmirror_init);
> Index: qemu-block-copy/Makefile.objs
> ===================================================================
> --- qemu-block-copy.orig/Makefile.objs
> +++ qemu-block-copy/Makefile.objs
> @@ -22,7 +22,7 @@ block-nested-y += raw.o cow.o qcow.o vdi
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>  block-nested-$(CONFIG_CURL) += curl.o
> Index: qemu-block-copy/docs/blkmirror.txt
> ===================================================================
> --- /dev/null
> +++ qemu-block-copy/docs/blkmirror.txt
> @@ -0,0 +1,15 @@
> +Block mirror driver
> +-------------------
> +
> +This driver will mirror writes to two distinct images.
> +Its used internally by live block copy.
> +
> +Format
> +------
> +
> +blkmirror:/image1.img:/image2.img
> +
> +'\' (backslash) can be used to escape colon processing
> +as a separator.
> +
> +
> 
> 
> 

-- 
mailto:av1474@comtv.ru

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy (update)
  2011-06-06 17:03   ` [Qemu-devel] [patch 6/7] QEMU live block copy (update) Marcelo Tosatti
@ 2011-06-07 10:15     ` Jiri Denemark
  2011-06-15 15:49       ` Marcelo Tosatti
  2011-06-15 15:51       ` Marcelo Tosatti
  0 siblings, 2 replies; 22+ messages in thread
From: Jiri Denemark @ 2011-06-07 10:15 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kwolf, Jes.Sorensen, dlaor, qemu-devel, avi

On Mon, Jun 06, 2011 at 14:03:44 -0300, Marcelo Tosatti wrote:
...
>  SQMP
> +query-block-copy
> +-------------
> +
> +Live block copy status.
> +
> +Each block copy instance information is stored in a json-object and the returned
> +value is a json-array of all instances.
> +
> +Each json-object contains the following:
> +
> +- "device": device name (json-string)
> +- "status": block copy status (json-string)
> +    - Possible values: "active", "failed", "mirrored", "completed", meaning:
> +                    - failed: block copy failed.
> +                    - active: block copy active, copying to destination image.
> +                    - mirrored: block copy active, finished copying to destination
> +                      image, writes are mirrored.
> +                    - completed: block copy completed.
> +
> +- "info": A json-object with the statistics information, if status is "active":
> +    - "percentage": percentage completed (json-int)
> +
> +Example:
> +
> +Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed:
> +
> +-> { "execute": "query-block-copy" }
> +<- {
> +      "return":[
> +        {"device":"ide1-hd0",
> +            "status":"active",
> +            "info":{
> +               "percentage":23,
> +            }
> +        },

What about using the same form of progress reporting as used by query-migrate?
That is, instead of

    "info":{
        "percentage":23
    }

the following would be reported:

    "disk":{
        "transferred":123,
        "remaining":123,
        "total":246
    }

One can trivially compute percentage from that but it's impossible to get this
kind of data when only percentage is reported. And total can even change in
time if needed (just like it changes during migration).

> +        {"device":"ide1-hd1",
> +         "status":"failed"
> +        }

Is there any way we can get the exact error which made it fail, such as EPERM
or ENOSPC?

> +      ]
> +   }
> +
> +EQMP

Jirka

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 2/7] Add blkmirror block driver
  2011-06-06 16:55 ` [Qemu-devel] [patch 2/7] Add blkmirror block driver Marcelo Tosatti
  2011-06-06 21:52   ` malc
@ 2011-06-07 10:25   ` Stefan Hajnoczi
  1 sibling, 0 replies; 22+ messages in thread
From: Stefan Hajnoczi @ 2011-06-07 10:25 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kwolf, Jes.Sorensen, dlaor, qemu-devel, avi, jdenemar

On Mon, Jun 6, 2011 at 5:55 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> +/* Valid blkmirror filenames look like
> + * blkmirror:path/to/image1:path/to/image2 */
> +static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int ret, escape, i, n;
> +    char *raw;
> +
> +    /* Parse the blkmirror: prefix */
> +    if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
> +        return -EINVAL;
> +    }
> +    filename += strlen("blkmirror:");
> +
> +    /* Parse the raw image filename */
> +    raw = malloc(strlen(filename));

Please use qemu_malloc()/qemu_strdup()/qemu_free() instead of the
system library versions.

I'm guilty of this in blkverify :(.

> +    escape = 0;
> +    for (i = n = 0; i < strlen(filename); i++) {
> +        if (!escape && filename[i] == ':') {
> +            break;
> +        }
> +        if (!escape && filename[i] == '\\') {
> +            escape = 1;
> +        } else {
> +            escape = 0;
> +        }
> +
> +        if (!escape) {
> +            raw[n++] = filename[i];
> +        }
> +    }
> +    raw[n] = '\0';
> +
> +    m->bs[0] = bdrv_new("");
> +    if (m->bs[0] == NULL) {
> +        return -ENOMEM;

raw is leaked.

> +    }
> +    ret = bdrv_open(m->bs[0], raw, flags, NULL);

This isn't necessarily a "raw" file.  filename0 and filename1 would be
clearer IMO.

> +    free(raw);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    filename += i + 1;

Please document that escaping only takes effect in filename0.  After
the second ':' you may no longer escape.

For sanity perhaps the whole string should be unescaped.

> +
> +    m->bs[1] = bdrv_new("");
> +    if (m->bs[1] == NULL) {

bs[0] is leaked.

> +        return -ENOMEM;
> +    }
> +    ret = bdrv_open(m->bs[1], filename, flags, NULL);
> +    if (ret < 0) {
> +        bdrv_delete(m->bs[0]);
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static void blkmirror_close(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        bdrv_delete(m->bs[i]);
> +        m->bs[i] = NULL;
> +    }
> +}
> +
> +static int blkmirror_flush(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +
> +    bdrv_flush(m->bs[0]);
> +    bdrv_flush(m->bs[1]);

Return values should be checked.

Stefan

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy
  2011-06-06 16:55 ` [Qemu-devel] [patch 6/7] QEMU " Marcelo Tosatti
  2011-06-06 17:03   ` [Qemu-devel] [patch 6/7] QEMU live block copy (update) Marcelo Tosatti
@ 2011-06-07 12:15   ` Stefan Hajnoczi
  2011-06-08 15:10     ` Jagane Sundar
  2011-06-15 16:59     ` Marcelo Tosatti
  1 sibling, 2 replies; 22+ messages in thread
From: Stefan Hajnoczi @ 2011-06-07 12:15 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kwolf, Jes.Sorensen, dlaor, qemu-devel, avi, jdenemar

On Mon, Jun 6, 2011 at 5:55 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:

I haven't reviewed this whole patch yet, but comments below.

This patch, like image streaming, may hit deadlocks due to synchronous
I/O emulation.  I discovered this problem when working on image
streaming and it should be solved by getting rid of the asynchronous
context concept.  The problem is that synchronous I/O emulation will push a
new context, preventing existing requests from completing until the
current context is popped again.  If the image format has dependencies
between requests (e.g. QED allocating writes are serialized), then
this leads to deadlock because the new request cannot complete until
the old one does, but the old one needs to wait for the context to be
popped.  I think you are not affected by the QED allocating write case
since the source image is only read, not written, by live block copy.
But you might encounter this problem in other places.

> +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

Should this be called DIRTY_CHUNK_SIZE?

> +static void alloc_aio_bitmap(BdrvCopyState *s)
> +{
> +    BlockDriverState *bs = s->src;
> +    int64_t bitmap_size;
> +
> +    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
> +            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> +
> +    s->aio_bitmap = qemu_mallocz(bitmap_size);
> +}

There is bitmap.h, which has a bunch of common bitmap code that could be reused.

> +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
> +                                int nr_sectors)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->buf = qemu_mallocz(BLOCK_SIZE);

qemu_blockalign() should be used to meet block device alignment requirements.

> +    blk->state = s;
> +    blk->sector = sector;
> +    blk->nr_sectors = nr_sectors;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = blk->buf;
> +    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
> +
> +    s->inflight_reads++;
> +    blk->time = qemu_get_clock_ns(rt_clock);
> +    blk->aiocb = bdrv_aio_readv(s->src, sector, &blk->qiov, nr_sectors,
> +                                blk_copy_read_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    s->inflight_reads--;
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk->buf);

qemu_vfree() after you switch to qemu_blockalign() above.

> +    qemu_free(blk);
> +}

> +static void blkcopy_cleanup(BdrvCopyState *s)
> +{

Need to free dst, at least on error?

> +    assert(s->inflight_reads == 0);
> +    assert(QLIST_EMPTY(&s->io_list));
> +    bdrv_set_dirty_tracking(s->src, 0);
> +    drive_put_ref(drive_get_by_blockdev(s->src));
> +    bdrv_set_in_use(s->src, 0);
> +    if (s->stage >= STAGE_DIRTY) {
> +        qemu_free(s->aio_bitmap);
> +    }
> +    qemu_del_timer(s->aio_timer);

Missing qemu_free_timer().

> +}

> +static int bdrv_copy_setup(const char *device, const char *filename,
> +                           bool shared_base)
> +{
> +    int64_t sectors;
> +    BdrvCopyState *blkcopy, *safe;
> +    BlockDriverState *src, *dst;
> +
> +    src = bdrv_find(device);
> +    if (src) {
> +        qerror_report(QERR_DEVICE_NOT_FOUND, device);
> +        return -1;
> +    }
> +
> +    dst = bdrv_new("");
> +    if (bdrv_open(dst, filename, src->open_flags, NULL) < 0) {
> +        bdrv_delete(dst);
> +        qerror_report(QERR_OPEN_FILE_FAILED, filename);
> +        return -1;
> +    }
> +
> +    QLIST_FOREACH_SAFE(blkcopy, &block_copy_list, list, safe) {
> +        if (!strcmp(blkcopy->device_name, src->device_name)) {
> +            if (blkcopy->stage == STAGE_SWITCH_FINISHED || blkcopy->failed) {
> +                blkcopy_free(blkcopy);
> +            } else {
> +                qerror_report(QERR_IN_PROGRESS, "block copy");

dst is leaked.

> +                return -1;
> +            }
> +        }
> +    }
> +
> +    sectors = bdrv_getlength(src) >> BDRV_SECTOR_BITS;
> +    if (sectors != bdrv_getlength(dst) >> BDRV_SECTOR_BITS) {
> +        qerror_report(QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS);
> +        return -1;

dst is leaked.

Stefan

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy
  2011-06-07 12:15   ` [Qemu-devel] [patch 6/7] QEMU live block copy Stefan Hajnoczi
@ 2011-06-08 15:10     ` Jagane Sundar
  2011-06-08 16:18       ` Stefan Hajnoczi
  2011-06-15 16:59     ` Marcelo Tosatti
  1 sibling, 1 reply; 22+ messages in thread
From: Jagane Sundar @ 2011-06-08 15:10 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: kwolf, dlaor, Jes.Sorensen, Marcelo Tosatti, qemu-devel, avi, jdenemar

On 6/7/2011 5:15 AM, Stefan Hajnoczi wrote:
> On Mon, Jun 6, 2011 at 5:55 PM, Marcelo Tosatti<mtosatti@redhat.com>  wrote:
>
> I haven't reviewed this whole patch yet, but comments below.
>
> This patch, like image streaming, may hit deadlocks due to synchronous
> I/O emulation.  I discovered this problem when working on image
> streaming and it should be solved by getting rid of the asynchronous
> context concept.  The problem is that async I/O emulation will push a
> new context, preventing existing requests to complete until the
> current context is popped again.  If the image format has dependencies
> between requests (e.g. QED allocating writes are serialized), then
> this leads to deadlock because the new request cannot complete until
> the old one does, but the old one needs to wait for the context to be
> popped.  I think you are not affected by the QED allocating write case
> since the source image is only read, not written, by live block copy.
> But you might encounter this problem in other places.
>
Hello Stefan,

Can you expand on this some more? I have similar concerns for Livebackup.

At the beginning of your paragraph,  did you mean 'asynchronous I/O 
emulation' instead of 'synchronous I/O emulation'?

Also, I don't understand the 'stack' construct that you refer to. When 
you say 'push a new context', are you talking about what happens when a 
new thread picks up a new async I/O req from the VM, and then proceeds 
to execute the I/O req? What is this stack that you refer to?

Any design documents, code snippets that I can look at, other pointers welcome.

Thanks,
Jagane

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy
  2011-06-08 15:10     ` Jagane Sundar
@ 2011-06-08 16:18       ` Stefan Hajnoczi
  2011-06-09 15:42         ` Jagane Sundar
  0 siblings, 1 reply; 22+ messages in thread
From: Stefan Hajnoczi @ 2011-06-08 16:18 UTC (permalink / raw)
  To: Jagane Sundar
  Cc: kwolf, dlaor, Jes.Sorensen, Marcelo Tosatti, qemu-devel, avi, jdenemar

On Wed, Jun 8, 2011 at 4:10 PM, Jagane Sundar <jagane@sundar.org> wrote:
> On 6/7/2011 5:15 AM, Stefan Hajnoczi wrote:
>>
>> On Mon, Jun 6, 2011 at 5:55 PM, Marcelo Tosatti<mtosatti@redhat.com>
>>  wrote:
>>
>> I haven't reviewed this whole patch yet, but comments below.
>>
>> This patch, like image streaming, may hit deadlocks due to synchronous
>> I/O emulation.  I discovered this problem when working on image
>> streaming and it should be solved by getting rid of the asynchronous
>> context concept.  The problem is that async I/O emulation will push a
>> new context, preventing existing requests to complete until the
>> current context is popped again.  If the image format has dependencies
>> between requests (e.g. QED allocating writes are serialized), then
>> this leads to deadlock because the new request cannot complete until
>> the old one does, but the old one needs to wait for the context to be
>> popped.  I think you are not affected by the QED allocating write case
>> since the source image is only read, not written, by live block copy.
>> But you might encounter this problem in other places.
>>
> Hello Stefan,
>
> Can you expand on this some more? I have similar concerns for Livebackup.
>
> At the beginning of your paragraph,  did you mean 'asynchronous I/O
> emulation' instead of 'synchronous I/O emulation'?
>
> Also, I don't understand the 'stack' construct that you refer to. When you
> say 'push a new context', are you talking about what happens when a new
> thread picks up a new async I/O req from the VM, and then proceeds to
> execute the I/O req? What is this stack that you refer to?
>
> Any design documents, code snippets that I can look, other pointers welcome.

See async.c.

There is synchronous I/O emulation in block.c for BlockDrivers that
don't support .bdrv_read()/.bdrv_write() but only
.bdrv_aio_readv()/.bdrv_aio_writev().  The way it works is that it
pushes a new I/O context and then issues async I/O.  Then it runs a
special event loop waiting for that I/O to complete.  After the I/O
completes it pops the context again.

The point of the context is that completions only get called for the
current context.  Therefore callers of the synchronous I/O functions
don't need to worry that the state of the world might change during
their "synchronous" operation - only their own I/O operation can
complete.  If a pending async I/O completes during synchronous I/O
emulation, its callback is not invoked until the bottom half (BH) is
called after the async context is popped.  This guarantees that the
synchronous operation and its caller have completed before I/O
completion callbacks are invoked for pending async I/O.

The problem is that QED allocating writes are serialized and cannot
make progress if a pending request is unable to complete.  Preventing
the pending request from completing deadlocks QEMU.  The same thing
could happen in other places.

I'm bringing this up in case anyone hits such a deadlock.  I think we
can solve this in the future by eliminating the async context concept.
 Kevin and I have discussed how that can be done but it's not possible
or trivial to do yet.

Stefan

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy
  2011-06-08 16:18       ` Stefan Hajnoczi
@ 2011-06-09 15:42         ` Jagane Sundar
  0 siblings, 0 replies; 22+ messages in thread
From: Jagane Sundar @ 2011-06-09 15:42 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: kwolf, dlaor, Jes.Sorensen, Marcelo Tosatti, qemu-devel, avi, jdenemar


>> Hello Stefan,
>>
>> Can you expand on this some more? I have similar concerns for Livebackup.
>>
>> At the beginning of your paragraph,  did you mean 'asynchronous I/O
>> emulation' instead of 'synchronous I/O emulation'?
>>
>> Also, I don't understand the 'stack' construct that you refer to. When you
>> say 'push a new context', are you talking about what happens when a new
>> thread picks up a new async I/O req from the VM, and then proceeds to
>> execute the I/O req? What is this stack that you refer to?
>>
>> Any design documents, code snippets that I can look, other pointers welcome.
> See async.c.
>
Thanks - I will do so.
> There is synchronous I/O emulation in block.c for BlockDrivers that
> don't support .bdrv_read()/.bdrv_write() but only
> .bdrv_aio_readv()/.bdrv_aio_writev().  The way it works is that it
> pushes a new I/O context and then issues async I/O.  Then it runs a
> special event loop waiting for that I/O to complete.  After the I/O
> completes it pops the context again.
>
OK. This is the opposite of what I was thinking of. I was considering
the code that emulates Async I/O using multiple threads.

It sounds like the goal of this async.c mechanism is more than
serializing all synchronous I/O requests, right?
> The point of the context is that completions only get called for the
> current context.  Therefore callers of the synchronous I/O functions
> don't need to worry that the state of the world might change during
> their "synchronous" operation - only their own I/O operation can
> complete.  If a pending async I/O completes during synchronous I/O
> emulation, its callback is not invoked until the bottom half (BH) is
> called after the async context is popped.  This guarantees that the
> synchronous operation and its caller have completed before I/O
> completion callbacks are invoked for pending async I/O.
>
OK. This might not be a problem for Livebackup, after all.
Livebackup transparently interposes the async I/O offered by the driver.
Hence the async.c mechanism is layered neatly above the
async_block_driver+livebackup layer and should work correctly.

I will take a closer look at this, and check for sanity...


Thanks,
Jagane

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy (update)
  2011-06-07 10:15     ` Jiri Denemark
@ 2011-06-15 15:49       ` Marcelo Tosatti
  2011-06-15 15:51       ` Marcelo Tosatti
  1 sibling, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-15 15:49 UTC (permalink / raw)
  To: Jiri Denemark; +Cc: kwolf, Jes.Sorensen, dlaor, qemu-devel, avi

On Tue, Jun 07, 2011 at 12:15:55PM +0200, Jiri Denemark wrote:
> > +            }
> > +        },
> 
> What about using the same form of progress reporting as used by query-migrate?
> That is, instead of

Done.

> One can trivially compute percentage from that but it's impossible to get this
> kind of data when only percentage is reported. And total can even change in
> time if needed (just like it changes during migration).
> 
> > +        {"device":"ide1-hd1",
> > +         "status":"failed"
> > +        }
> 
> Is there any way we can get the exact error which made it fail, such as EPERM
> or ENOSPC?

The errors that can generate "failed" here are internal to QEMU, so I
don't think exporting them to the management application makes sense. These
are errors such as EIO from the file system.

Error processing should be done in the block_copy command, that is where
checks of the destination image are performed.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy (update)
  2011-06-07 10:15     ` Jiri Denemark
  2011-06-15 15:49       ` Marcelo Tosatti
@ 2011-06-15 15:51       ` Marcelo Tosatti
  1 sibling, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-15 15:51 UTC (permalink / raw)
  To: Jiri Denemark; +Cc: kwolf, Jes.Sorensen, dlaor, qemu-devel, avi

On Tue, Jun 07, 2011 at 12:15:55PM +0200, Jiri Denemark wrote:
> On Mon, Jun 06, 2011 at 14:03:44 -0300, Marcelo Tosatti wrote:
> ...
> > +      "return":[
> > +        {"device":"ide1-hd0",
> > +            "status":"active",
> > +            "info":{
> > +               "percentage":23,
> > +            }
> > +        },
> 
> What about using the same form of progress reporting as used by query-migrate?
> That is, instead of
> 
>     "info":{
>         "percentage":23
>     }
> 
> the following would be reported:
> 
>     "disk":{
>         "transferred":123,
>         "remaining":123,
>         "total":246
>     }
> 
> One can trivially compute percentage from that but it's impossible to get this
> kind of data when only percentage is reported. And total can even change in
> time if needed (just like it changes during migration).

Done.

> > +        {"device":"ide1-hd1",
> > +         "status":"failed"
> > +        }
> 
> Is there any way we can get the exact error which made it fail, such as EPERM
> or ENOSPC?

The errors that can generate "failed" here are internal to QEMU, so I
don't think exporting them to the management application makes sense. These
are errors such as EIO from the file system.

Error processing should be done in the block_copy command, that is where
checks of the destination image are performed.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 6/7] QEMU live block copy
  2011-06-07 12:15   ` [Qemu-devel] [patch 6/7] QEMU live block copy Stefan Hajnoczi
  2011-06-08 15:10     ` Jagane Sundar
@ 2011-06-15 16:59     ` Marcelo Tosatti
  1 sibling, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-06-15 16:59 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: kwolf, Jes.Sorensen, dlaor, qemu-devel, avi, jdenemar

On Tue, Jun 07, 2011 at 01:15:02PM +0100, Stefan Hajnoczi wrote:
> On Mon, Jun 6, 2011 at 5:55 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> I haven't reviewed this whole patch yet, but comments below.
> 
> This patch, like image streaming, may hit deadlocks due to synchronous
> I/O emulation.  I discovered this problem when working on image
> streaming and it should be solved by getting rid of the asynchronous
> context concept.  The problem is that async I/O emulation will push a
> new context, preventing existing requests to complete until the
> current context is popped again.  If the image format has dependencies
> between requests (e.g. QED allocating writes are serialized), then
> this leads to deadlock because the new request cannot complete until
> the old one does, but the old one needs to wait for the context to be
> popped.  I think you are not affected by the QED allocating write case
> since the source image is only read, not written, by live block copy.
> But you might encounter this problem in other places.

I see. This should be fixed in the context push/pop logic (or something
equivalent), as you mention.

Fixed other comments, thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 1/7] add migration_active function
  2011-05-23 21:31 ` [Qemu-devel] [patch 1/7] add migration_active function Marcelo Tosatti
  2011-05-27 17:43   ` Kevin Wolf
@ 2011-05-30  0:53   ` Jes Sorensen
  1 sibling, 0 replies; 22+ messages in thread
From: Jes Sorensen @ 2011-05-30  0:53 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kwolf, dlaor, qemu-devel, avi

On 05/24/11 06:31, Marcelo Tosatti wrote:
> To query whether migration is active.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

ACK

Jes

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [patch 1/7] add migration_active function
  2011-05-23 21:31 ` [Qemu-devel] [patch 1/7] add migration_active function Marcelo Tosatti
@ 2011-05-27 17:43   ` Kevin Wolf
  2011-05-30  0:53   ` Jes Sorensen
  1 sibling, 0 replies; 22+ messages in thread
From: Kevin Wolf @ 2011-05-27 17:43 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Jes.Sorensen, dlaor, qemu-devel, avi

Am 23.05.2011 23:31, schrieb Marcelo Tosatti:
> To query whether migration is active.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Index: qemu-block-copy/migration.c
> ===================================================================
> --- qemu-block-copy.orig/migration.c
> +++ qemu-block-copy/migration.c
> @@ -480,3 +480,13 @@ int get_migration_state(void)
>          return MIG_STATE_ERROR;
>      }
>  }
> +
> +bool migration_active(void)
> +{
> +    if (current_migration &&
> +        current_migration->get_status(current_migration) == MIG_STATE_ACTIVE) {
> +        return true;
> +    }
> +
> +    return false;
> +}

The very same check already exists open-coded in do_migrate(). Maybe we
should convert it now that a function is available for it?

Kevin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [patch 1/7] add migration_active function
  2011-05-23 21:31 [Qemu-devel] [patch 0/7] live block copy (v3) Marcelo Tosatti
@ 2011-05-23 21:31 ` Marcelo Tosatti
  2011-05-27 17:43   ` Kevin Wolf
  2011-05-30  0:53   ` Jes Sorensen
  0 siblings, 2 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2011-05-23 21:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, Jes.Sorensen, dlaor, avi, Marcelo Tosatti

[-- Attachment #1: 1-3-add-migration-active.patch --]
[-- Type: text/plain, Size: 865 bytes --]

To query whether migration is active.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu-block-copy/migration.c
===================================================================
--- qemu-block-copy.orig/migration.c
+++ qemu-block-copy/migration.c
@@ -480,3 +480,13 @@ int get_migration_state(void)
         return MIG_STATE_ERROR;
     }
 }
+
+bool migration_active(void)
+{
+    if (current_migration &&
+        current_migration->get_status(current_migration) == MIG_STATE_ACTIVE) {
+        return true;
+    }
+
+    return false;
+}
Index: qemu-block-copy/migration.h
===================================================================
--- qemu-block-copy.orig/migration.h
+++ qemu-block-copy/migration.h
@@ -148,4 +148,6 @@ int ram_load(QEMUFile *f, void *opaque, 
 
 extern int incoming_expected;
 
+bool migration_active(void);
+
 #endif

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2011-06-19 16:37 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-06-06 16:55 [Qemu-devel] [patch 0/7] live block copy (v4) Marcelo Tosatti
2011-06-06 16:55 ` [Qemu-devel] [patch 1/7] add migration_active function Marcelo Tosatti
2011-06-06 16:55 ` [Qemu-devel] [patch 2/7] Add blkmirror block driver Marcelo Tosatti
2011-06-06 21:52   ` malc
2011-06-07 10:25   ` Stefan Hajnoczi
2011-06-06 16:55 ` [Qemu-devel] [patch 3/7] Add error messages for live block copy Marcelo Tosatti
2011-06-06 16:55 ` [Qemu-devel] [patch 4/7] Add blkdebug points " Marcelo Tosatti
2011-06-06 16:55 ` [Qemu-devel] [patch 5/7] Add vmstop code " Marcelo Tosatti
2011-06-06 16:55 ` [Qemu-devel] [patch 6/7] QEMU " Marcelo Tosatti
2011-06-06 17:03   ` [Qemu-devel] [patch 6/7] QEMU live block copy (update) Marcelo Tosatti
2011-06-07 10:15     ` Jiri Denemark
2011-06-15 15:49       ` Marcelo Tosatti
2011-06-15 15:51       ` Marcelo Tosatti
2011-06-07 12:15   ` [Qemu-devel] [patch 6/7] QEMU live block copy Stefan Hajnoczi
2011-06-08 15:10     ` Jagane Sundar
2011-06-08 16:18       ` Stefan Hajnoczi
2011-06-09 15:42         ` Jagane Sundar
2011-06-15 16:59     ` Marcelo Tosatti
2011-06-06 16:55 ` [Qemu-devel] [patch 7/7] do not allow migration if block copy in progress Marcelo Tosatti
  -- strict thread matches above, loose matches on Subject: below --
2011-05-23 21:31 [Qemu-devel] [patch 0/7] live block copy (v3) Marcelo Tosatti
2011-05-23 21:31 ` [Qemu-devel] [patch 1/7] add migration_active function Marcelo Tosatti
2011-05-27 17:43   ` Kevin Wolf
2011-05-30  0:53   ` Jes Sorensen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.