All of lore.kernel.org
 help / color / mirror / Atom feed
From: Wen Congyang <wency@cn.fujitsu.com>
To: qemu devel <qemu-devel@nongnu.org>, Fam Zheng <famz@redhat.com>,
	Max Reitz <mreitz@redhat.com>,
	Paolo Bonzini <pbonzini@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>, qemu block <qemu-block@nongnu.org>,
	Lai Jiangshan <laijs@cn.fujitsu.com>,
	Jiang Yunhong <yunhong.jiang@intel.com>,
	Dong Eddie <eddie.dong@intel.com>,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>,
	Gonglei <arei.gonglei@huawei.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Yang Hongyang <yanghy@cn.fujitsu.com>,
	zhanghailiang <zhang.zhanghailiang@huawei.com>
Subject: [Qemu-devel] [PATCH COLO-Block v6 16/16] Implement new driver for block replication
Date: Thu, 18 Jun 2015 16:49:21 +0800	[thread overview]
Message-ID: <1434617361-17778-17-git-send-email-wency@cn.fujitsu.com> (raw)
In-Reply-To: <1434617361-17778-1-git-send-email-wency@cn.fujitsu.com>

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
---
 block/Makefile.objs |   1 +
 block/replication.c | 441 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 442 insertions(+)
 create mode 100644 block/replication.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index f068666..84952b1 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -23,6 +23,7 @@ block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
 block-obj-y += write-threshold.o
 block-obj-y += backup.o
+block-obj-y += replication.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
diff --git a/block/replication.c b/block/replication.c
new file mode 100644
index 0000000..da3b512
--- /dev/null
+++ b/block/replication.c
@@ -0,0 +1,441 @@
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "block/nbd.h"
+
+typedef struct BDRVReplicationState {
+    ReplicationMode mode;
+    int replication_state;
+    char *export_name;
+    NBDExport *exp;
+    BlockDriverState *active_disk;
+    BlockDriverState *hidden_disk;
+    BlockDriverState *secondary_disk; /* nbd target */
+    int error;
+} BDRVReplicationState;
+
+enum {
+    BLOCK_REPLICATION_NONE,     /* block replication is not started */
+    BLOCK_REPLICATION_RUNNING,  /* block replication is running */
+    BLOCK_REPLICATION_DONE,     /* block replication is done(failover) */
+};
+
+#define COMMIT_CLUSTER_BITS 16
+#define COMMIT_CLUSTER_SIZE (1 << COMMIT_CLUSTER_BITS)
+#define COMMIT_SECTORS_PER_CLUSTER (COMMIT_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+static void replication_stop(BlockDriverState *bs, bool failover, Error **errp);
+
+#define NBD_OPT_EXPORT          "export"
+#define REPLICATION_MODE        "mode"
+static QemuOptsList replication_runtime_opts = {
+    .name = "replication",
+    .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
+    .desc = {
+        {
+            .name = REPLICATION_MODE,
+            .type = QEMU_OPT_STRING,
+        },
+        {
+            .name = NBD_OPT_EXPORT,
+            .type = QEMU_OPT_STRING,
+            .help = "The NBD server name",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int replication_open(BlockDriverState *bs, QDict *options,
+                            int flags, Error **errp)
+{
+    int ret;
+    BDRVReplicationState *s = bs->opaque;;
+    Error *local_err = NULL;
+    QemuOpts *opts = NULL;
+    const char *mode;
+
+    ret = -EINVAL;
+    opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        goto fail;
+    }
+
+    mode = qemu_opt_get(opts, REPLICATION_MODE);
+    if (!mode) {
+        error_setg(&local_err, "Missing the option mode");
+        goto fail;
+    }
+
+    if (!strcmp(mode, "primary")) {
+        s->mode = REPLICATION_MODE_PRIMARY;
+    } else if (!strcmp(mode, "secondary")) {
+        s->mode = REPLICATION_MODE_SECONDARY;
+    } else {
+        error_setg(&local_err,
+                   "The option mode's value should be primary or secondary");
+        goto fail;
+    }
+
+    if (s->mode == REPLICATION_MODE_SECONDARY) {
+        s->export_name = g_strdup(qemu_opt_get(opts, NBD_OPT_EXPORT));
+        if (!s->export_name) {
+            error_setg(&local_err, "Missing the option export");
+            goto fail;
+        }
+    }
+
+    return 0;
+
+fail:
+    qemu_opts_del(opts);
+    /* propagate error */
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
+    return ret;
+}
+
+static void replication_close(BlockDriverState *bs)
+{
+    BDRVReplicationState *s = bs->opaque;
+
+    if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
+        replication_stop(bs, false, NULL);
+    }
+
+    g_free(s->export_name);
+}
+
+static int64_t replication_getlength(BlockDriverState *bs)
+{
+    return bdrv_getlength(bs->file);
+}
+
+static int replication_get_io_status(BDRVReplicationState *s)
+{
+    switch (s->replication_state) {
+    case BLOCK_REPLICATION_NONE:
+        return -EIO;
+    case BLOCK_REPLICATION_RUNNING:
+        return 0;
+    case BLOCK_REPLICATION_DONE:
+        return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
+    default:
+        abort();
+    }
+}
+
+static int replication_return_value(BDRVReplicationState *s, int ret)
+{
+    if (s->mode == REPLICATION_MODE_SECONDARY) {
+        return ret;
+    }
+
+    if (ret < 0) {
+        s->error = ret;
+        ret = 0;
+    }
+
+    return ret;
+}
+
+static coroutine_fn int replication_co_readv(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             int remaining_sectors,
+                                             QEMUIOVector *qiov)
+{
+    BDRVReplicationState *s = bs->opaque;
+    int ret;
+
+    if (s->mode == REPLICATION_MODE_PRIMARY) {
+        /* We only use it to forward primary write requests */
+        return -EIO;
+    }
+
+    ret = replication_get_io_status(s);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /*
+     * After failover, because we don't commit active disk/hidden disk
+     * to secondary disk(nbd target), so we should read from active disk
+     * directly.
+     */
+    ret = bdrv_co_readv(bs->file, sector_num, remaining_sectors, qiov);
+    return replication_return_value(s, ret);
+}
+
+static coroutine_fn int replication_co_writev(BlockDriverState *bs,
+                                              int64_t sector_num,
+                                              int remaining_sectors,
+                                              QEMUIOVector *qiov)
+{
+    BDRVReplicationState *s = bs->opaque;
+    QEMUIOVector hd_qiov;
+    uint64_t bytes_done = 0;
+    BlockDriverState *top = bs->file;
+    BlockDriverState *base = s->secondary_disk;
+    BlockDriverState *target;
+    int ret, n;
+
+    ret = replication_get_io_status(s);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (ret == 0) {
+        ret = bdrv_co_writev(bs->file, sector_num, remaining_sectors, qiov);
+        return replication_return_value(s, ret);
+    }
+
+    /*
+     * Only write to active disk if the sectors have
+     * already been allocated in active disk/hidden disk.
+     */
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    while (remaining_sectors > 0) {
+        ret = bdrv_is_allocated_above(top, base, sector_num,
+                                      remaining_sectors, &n);
+        if (ret < 0) {
+            return ret;
+        }
+
+        qemu_iovec_reset(&hd_qiov);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, n * 512);
+
+        target = ret ? top: base;
+        ret = bdrv_co_writev(target, sector_num, n, &hd_qiov);
+        if (ret < 0) {
+            return ret;
+        }
+
+        remaining_sectors -= n;
+        sector_num += n;
+        bytes_done += n * BDRV_SECTOR_SIZE;
+    }
+
+    return 0;
+}
+
+static coroutine_fn int replication_co_discard(BlockDriverState *bs,
+                                               int64_t sector_num,
+                                               int nb_sectors)
+{
+    BDRVReplicationState *s = bs->opaque;
+    int ret;
+
+    ret = replication_get_io_status(s);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (ret == 1) {
+        /* It is secondary qemu and we are after failover */
+        ret = bdrv_co_discard(s->secondary_disk, sector_num, nb_sectors);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    ret = bdrv_co_discard(bs->file, sector_num, nb_sectors);
+    return replication_return_value(s, ret);
+}
+
+static bool replication_recurse_is_first_non_filter(BlockDriverState *bs,
+                                                    BlockDriverState *candidate)
+{
+    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
+}
+
+static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
+{
+    Error *local_err = NULL;
+    int ret;
+
+    if (!s->secondary_disk->job) {
+        error_setg(errp, "Backup job is cancelled unexpectedly");
+        return;
+    }
+
+    block_job_do_checkpoint(s->secondary_disk->job, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    ret = s->active_disk->drv->bdrv_make_empty(s->active_disk);
+    if (ret < 0) {
+        error_setg(errp, "Cannot make active disk empty");
+        return;
+    }
+
+    ret = s->hidden_disk->drv->bdrv_make_empty(s->hidden_disk);
+    if (ret < 0) {
+        error_setg(errp, "Cannot make hidden disk empty");
+        return;
+    }
+}
+
+static void replication_start(BlockDriverState *bs, ReplicationMode mode,
+                              Error **errp)
+{
+    BDRVReplicationState *s = bs->opaque;
+    int64_t active_length, hidden_length, nbd_length;
+    Error *local_err = NULL;
+
+    if (s->replication_state != BLOCK_REPLICATION_NONE) {
+        error_setg(errp, "Block replication is running or done");
+        return;
+    }
+
+    if (s->mode != mode) {
+        error_setg(errp, "Invalid parameter 'mode'");
+        return;
+    }
+
+    switch (s->mode) {
+    case REPLICATION_MODE_PRIMARY:
+        bdrv_connect(bs->file, errp);
+        break;
+    case REPLICATION_MODE_SECONDARY:
+        if (!bs->file->backing_reference) {
+            error_setg(errp, "Active disk doesn't use backing_reference");
+        }
+
+        s->active_disk = bs->file;
+        s->hidden_disk = s->active_disk->backing_hd;
+        s->secondary_disk = s->hidden_disk->backing_hd;
+
+        if (!s->secondary_disk->job ||
+            s->secondary_disk->job->driver->job_type != BLOCK_JOB_TYPE_BACKUP) {
+            error_setg(errp, "Backup job is cancelled unexpectedly");
+            return;
+        }
+
+        /* verify the length */
+        active_length = bdrv_getlength(s->active_disk);
+        hidden_length = bdrv_getlength(s->hidden_disk);
+        nbd_length = bdrv_getlength(s->secondary_disk);
+        if (active_length < 0 || hidden_length < 0 || nbd_length < 0 ||
+            active_length != hidden_length || hidden_length != nbd_length) {
+            error_setg(errp, "active disk, hidden disk, nbd target's length"
+                       " are not the same");
+            return;
+        }
+
+        if (!s->active_disk->drv->bdrv_make_empty ||
+            !s->hidden_disk->drv->bdrv_make_empty) {
+            error_setg(errp,
+                       "active disk or hidden disk doesn't support make_empty");
+            return;
+        }
+        break;
+    default:
+        abort();
+    }
+
+    s->replication_state = BLOCK_REPLICATION_RUNNING;
+
+    if (s->mode == REPLICATION_MODE_SECONDARY) {
+        secondary_do_checkpoint(s, errp);
+
+        /* start NBD server */
+        s->exp = nbd_export_new(s->secondary_disk->blk, 0, -1, 0, NULL,
+                                &local_err);
+        if (!s->exp) {
+            s->replication_state = BLOCK_REPLICATION_NONE;
+            error_propagate(errp, local_err);
+            return;
+        }
+        nbd_export_set_name(s->exp, s->export_name);
+    }
+}
+
+static void replication_do_checkpoint(BlockDriverState *bs, Error **errp)
+{
+    BDRVReplicationState *s = bs->opaque;
+
+    if (s->replication_state != BLOCK_REPLICATION_RUNNING) {
+        error_setg(errp, "Block replication is not running");
+        return;
+    }
+
+    if (s->error) {
+        error_setg(errp, "I/O error occurs");
+        return;
+    }
+
+    if (s->mode == REPLICATION_MODE_SECONDARY) {
+        secondary_do_checkpoint(s, errp);
+    }
+}
+
+static void replication_stop(BlockDriverState *bs, bool failover, Error **errp)
+{
+    BDRVReplicationState *s = bs->opaque;
+
+    if (s->replication_state != BLOCK_REPLICATION_RUNNING) {
+        error_setg(errp, "Block replication is not running");
+        return;
+    }
+
+    s->replication_state = BLOCK_REPLICATION_DONE;
+
+    switch (s->mode) {
+    case REPLICATION_MODE_PRIMARY:
+        bdrv_disconnect(bs->file);
+        break;
+    case REPLICATION_MODE_SECONDARY:
+        /* stop NBD server */
+        nbd_export_close(s->exp);
+        nbd_export_put(s->exp);
+
+        if (!failover) {
+            secondary_do_checkpoint(s, errp);
+            return;
+        }
+
+        if (!s->secondary_disk->job ||
+            s->secondary_disk->job->driver->job_type != BLOCK_JOB_TYPE_BACKUP) {
+            error_setg(errp, "Backup job is cancelled unexpectedly");
+            return;
+        }
+
+        block_job_cancel(s->secondary_disk->job);
+        break;
+    default:
+        abort();
+    }
+}
+
+BlockDriver bdrv_replication = {
+    .format_name                = "replication",
+    .protocol_name              = "replication",
+    .instance_size              = sizeof(BDRVReplicationState),
+
+    .bdrv_open                  = replication_open,
+    .bdrv_close                 = replication_close,
+
+    .bdrv_getlength             = replication_getlength,
+    .bdrv_co_readv              = replication_co_readv,
+    .bdrv_co_writev             = replication_co_writev,
+    .bdrv_co_discard            = replication_co_discard,
+
+    .is_filter                  = true,
+    .bdrv_recurse_is_first_non_filter = replication_recurse_is_first_non_filter,
+
+    .bdrv_start_replication     = replication_start,
+    .bdrv_do_checkpoint         = replication_do_checkpoint,
+    .bdrv_stop_replication      = replication_stop,
+
+    .has_variable_length        = true,
+};
+
+static void bdrv_replication_init(void)
+{
+    bdrv_register(&bdrv_replication);
+}
+
+block_init(bdrv_replication_init);
-- 
2.4.3

      parent reply	other threads:[~2015-06-18  8:46 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-18  8:49 [Qemu-devel] [PATCH COLO-Block v6 00/16] Block replication for continuous checkpoints Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 01/16] docs: block replication's description Wen Congyang
2015-06-18 10:34   ` Stefan Hajnoczi
2015-06-18 10:51     ` Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 02/16] allow writing to the backing file Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 03/16] Allow creating backup jobs when opening BDS Wen Congyang
2015-06-18 10:40   ` [Qemu-devel] [Qemu-block] " Stefan Hajnoczi
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 04/16] block: Parse "backing_reference" option to reference existing BDS Wen Congyang
2015-06-18 10:50   ` Stefan Hajnoczi
2015-06-18 11:40     ` Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 05/16] Backup: clear all bitmap when doing block checkpoint Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 06/16] Don't allow a disk use backing reference target Wen Congyang
2015-06-18 12:47   ` Stefan Hajnoczi
2015-06-18 15:17     ` Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 07/16] Add new block driver interface to connect/disconnect the remote target Wen Congyang
2015-06-18 12:55   ` [Qemu-devel] [Qemu-block] " Stefan Hajnoczi
2015-06-18 14:36     ` Wen Congyang
2015-06-18 16:06       ` Stefan Hajnoczi
2015-06-19  0:54         ` Wen Congyang
2015-06-19 10:49           ` Stefan Hajnoczi
2015-06-20  3:31             ` Wen Congyang
2015-06-22 12:39               ` Stefan Hajnoczi
2015-06-22 13:56                 ` Wen Congyang
2015-06-23 13:42                   ` Stefan Hajnoczi
2015-06-23 13:54                     ` Wen Congyang
2015-06-24 14:07               ` Dr. David Alan Gilbert
2015-06-25  1:01                 ` Wen Congyang
2015-06-26 19:03                   ` Dr. David Alan Gilbert
2015-06-29  1:05                     ` Wen Congyang
2015-06-30 19:01                       ` Dr. David Alan Gilbert
2015-07-01  1:01                         ` Wen Congyang
2015-07-01  8:11                           ` Dr. David Alan Gilbert
2015-07-01  8:18                             ` Wen Congyang
2015-07-01 18:42                               ` Dr. David Alan Gilbert
2015-07-02  0:55                                 ` Wen Congyang
2015-07-02  7:54                                   ` Dr. David Alan Gilbert
2015-07-02  8:05                                     ` Wen Congyang
2015-06-24  1:11             ` Wen Congyang
2015-06-23  6:44         ` Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 08/16] NBD client: implement block driver interfaces to connect/disconnect NBD server Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 09/16] Introduce a new -drive option to control whether to connect to remote target Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 10/16] NBD client: connect to nbd server later Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 11/16] Add new block driver interfaces to control block replication Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 12/16] skip nbd_target when starting " Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 13/16] quorum: implement block driver interfaces for " Wen Congyang
2015-06-18 13:06   ` Stefan Hajnoczi
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 14/16] introduce a new API qemu_opts_absorb_qdict_by_index() Wen Congyang
2015-06-18  8:49 ` [Qemu-devel] [PATCH COLO-Block v6 15/16] quorum: allow ignoring child errors Wen Congyang
2015-06-18 13:06   ` Stefan Hajnoczi
2015-06-18  8:49 ` Wen Congyang [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1434617361-17778-17-git-send-email-wency@cn.fujitsu.com \
    --to=wency@cn.fujitsu.com \
    --cc=arei.gonglei@huawei.com \
    --cc=dgilbert@redhat.com \
    --cc=eddie.dong@intel.com \
    --cc=famz@redhat.com \
    --cc=kwolf@redhat.com \
    --cc=laijs@cn.fujitsu.com \
    --cc=mreitz@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    --cc=yanghy@cn.fujitsu.com \
    --cc=yunhong.jiang@intel.com \
    --cc=zhang.zhanghailiang@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.