[PATCH v12 15/26] libxc/restore: support COLO restore

From: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
To: xen devel <xen-devel@lists.xen.org>,
	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Ian Campbell <ian.campbell@citrix.com>,
	Ian Jackson <ian.jackson@eu.citrix.com>,
	Wei Liu <wei.liu2@citrix.com>
Cc: Lars Kurth <lars.kurth@citrix.com>,
	Changlong Xie <xiecl.fnst@cn.fujitsu.com>,
	Wen Congyang <wency@cn.fujitsu.com>,
	Li Zhijian <lizhijian@cn.fujitsu.com>,
	Gui Jianfeng <guijianfeng@cn.fujitsu.com>,
	Jiang Yunhong <yunhong.jiang@intel.com>,
	Dong Eddie <eddie.dong@intel.com>,
	Anthony Perard <anthony.perard@citrix.com>,
	Shriram Rajagopalan <rshriram@cs.ubc.ca>,
	Yang Hongyang <hongyang.yang@easystack.cn>
Subject: [PATCH v12 15/26] libxc/restore: support COLO restore
Date: Wed, 23 Mar 2016 16:06:29 +0800	[thread overview]
Message-ID: <1458720400-4699-16-git-send-email-xiecl.fnst@cn.fujitsu.com> (raw)
In-Reply-To: <1458720400-4699-1-git-send-email-xiecl.fnst@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

a. call callbacks resume/checkpoint/suspend while secondary vm
   status is consistent with primary
b. send dirty pfn list to primary when checkpoint under colo
c. send store gfn and console gfn to xl before resuming secondary vm

Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
CC: Andrew Cooper <andrew.cooper3@citrix.com>
---
 tools/libxc/include/xenguest.h     |   8 ++
 tools/libxc/xc_sr_common.h         |   8 +-
 tools/libxc/xc_sr_restore.c        | 181 +++++++++++++++++++++++++++++++++++++
 tools/libxl/libxl_colo_restore.c   |   5 -
 tools/libxl/libxl_create.c         |   3 +
 tools/libxl/libxl_save_msgs_gen.pl |   4 +-
 6 files changed, 200 insertions(+), 9 deletions(-)

diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index 8ea5a3c..40902ee 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -133,6 +133,14 @@ struct restore_callbacks {
      */
     int (*wait_checkpoint)(void* data);
 
+    /*
+     * callback to send store gfn and console gfn to xl
+     * if we want to resume vm before xc_domain_save()
+     * exits.
+     */
+    void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
+                            void *data);
+
     /* to be provided as the last argument to each callback function */
     void* data;
 };
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index c990664..cf32ab8 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -214,6 +214,10 @@ struct xc_sr_context
             struct xc_sr_restore_ops ops;
             struct restore_callbacks *callbacks;
 
+            int send_back_fd;
+            unsigned long p2m_size;
+            xc_hypercall_buffer_t dirty_bitmap_hbuf;
+
             /* From Image Header. */
             uint32_t format_version;
 
@@ -222,13 +226,13 @@ struct xc_sr_context
             uint32_t guest_page_size;
 
             /* Plain VM, or checkpoints over time. */
-            bool checkpointed;
+            int checkpointed;
 
             /* Currently buffering records between a checkpoint */
             bool buffer_all_records;
 
 /*
- * With Remus, we buffer the records sent by the primary at checkpoint,
+ * With Remus/COLO, we buffer the records sent by the primary at checkpoint,
  * in case the primary will fail, we can recover from the last
  * checkpoint state.
  * This should be enough for most of the cases because primary only send
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
index 3e4ca7f..728edbc 100644
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -411,6 +411,92 @@ static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
     return rc;
 }
 
+/*
+ * Send checkpoint dirty pfn list to primary.
+ */
+static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc = -1;
+    unsigned count, written;
+    uint64_t i, *pfns = NULL;
+    struct iovec *iov = NULL;
+    xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size };
+    struct xc_sr_record rec =
+    {
+        .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST,
+    };
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
+
+    if ( xc_shadow_control(
+             xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+             HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size,
+             NULL, 0, &stats) != ctx->restore.p2m_size )
+    {
+        PERROR("Failed to retrieve logdirty bitmap");
+        goto err;
+    }
+
+    for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ )
+    {
+        if ( test_bit(i, dirty_bitmap) )
+            count++;
+    }
+
+
+    pfns = malloc(count * sizeof(*pfns));
+    if ( !pfns )
+    {
+        ERROR("Unable to allocate %zu bytes of memory for dirty pfn list",
+              count * sizeof(*pfns));
+        goto err;
+    }
+
+    for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i )
+    {
+        if ( !test_bit(i, dirty_bitmap) )
+            continue;
+
+        if ( written > count )
+        {
+            ERROR("Dirty pfn list exceed");
+            goto err;
+        }
+
+        pfns[written++] = i;
+    }
+
+    /* iovec[] for writev(). */
+    iov = malloc(3 * sizeof(*iov));
+    if ( !iov )
+    {
+        ERROR("Unable to allocate memory for sending dirty bitmap");
+        goto err;
+    }
+
+    rec.length = count * sizeof(*pfns);
+
+    iov[0].iov_base = &rec.type;
+    iov[0].iov_len = sizeof(rec.type);
+
+    iov[1].iov_base = &rec.length;
+    iov[1].iov_len = sizeof(rec.length);
+
+    iov[2].iov_base = pfns;
+    iov[2].iov_len = count * sizeof(*pfns);
+
+    if ( writev_exact(ctx->restore.send_back_fd, iov, 3) )
+    {
+        PERROR("Failed to write dirty bitmap to stream");
+        goto err;
+    }
+
+    rc = 0;
+ err:
+    return rc;
+}
+
 static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
 static int handle_checkpoint(struct xc_sr_context *ctx)
 {
@@ -460,6 +546,53 @@ static int handle_checkpoint(struct xc_sr_context *ctx)
     else
         ctx->restore.buffer_all_records = true;
 
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+#define HANDLE_CALLBACK_RETURN_VALUE(ret)                   \
+    do {                                                    \
+        if ( ret == 1 )                                     \
+            rc = 0; /* Success */                           \
+        else                                                \
+        {                                                   \
+            if ( ret == 2 )                                 \
+                rc = BROKEN_CHANNEL;                        \
+            else                                            \
+                rc = -1; /* Some unspecified error */       \
+            goto err;                                       \
+        }                                                   \
+    } while (0)
+
+        /* COLO */
+
+        /* We need to resume guest */
+        rc = ctx->restore.ops.stream_complete(ctx);
+        if ( rc )
+            goto err;
+
+        ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn,
+                                                ctx->restore.console_gfn,
+                                                ctx->restore.callbacks->data);
+
+        /* Resume secondary vm */
+        ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+        /* Wait for a new checkpoint */
+        ret = ctx->restore.callbacks->wait_checkpoint(
+                                                ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+        /* suspend secondary vm */
+        ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+#undef HANDLE_CALLBACK_RETURN_VALUE
+
+        rc = send_checkpoint_dirty_pfn_list(ctx);
+        if ( rc )
+            goto err;
+    }
+
  err:
     return rc;
 }
@@ -529,6 +662,21 @@ static int setup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
     int rc;
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
+
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+        dirty_bitmap = xc_hypercall_buffer_alloc_pages(xch, dirty_bitmap,
+                                NRPAGES(bitmap_size(ctx->restore.p2m_size)));
+
+        if ( !dirty_bitmap )
+        {
+            ERROR("Unable to allocate memory for dirty bitmap");
+            rc = -1;
+            goto err;
+        }
+    }
 
     rc = ctx->restore.ops.setup(ctx);
     if ( rc )
@@ -562,10 +710,15 @@ static void cleanup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
     unsigned i;
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
 
     for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
         free(ctx->restore.buffered_records[i].data);
 
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+        xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
+                                   NRPAGES(bitmap_size(ctx->restore.p2m_size)));
     free(ctx->restore.buffered_records);
     free(ctx->restore.populated_pfns);
     if ( ctx->restore.ops.cleanup(ctx) )
@@ -631,6 +784,15 @@ static int restore(struct xc_sr_context *ctx)
     } while ( rec.type != REC_TYPE_END );
 
  remus_failover:
+
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+        /* With COLO, we have already called stream_complete */
+        rc = 0;
+        IPRINTF("COLO Failover");
+        goto done;
+    }
+
     /*
      * With Remus, if we reach here, there must be some error on primary,
      * failover from the last checkpoint state.
@@ -667,6 +829,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
                       xc_migration_stream_t stream_type,
                       struct restore_callbacks *callbacks, int send_back_fd)
 {
+    xen_pfn_t nr_pfns;
     struct xc_sr_context ctx =
         {
             .xch = xch,
@@ -680,11 +843,21 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     ctx.restore.xenstore_domid = store_domid;
     ctx.restore.checkpointed = stream_type;
     ctx.restore.callbacks = callbacks;
+    ctx.restore.send_back_fd = send_back_fd;
 
     /* Sanity checks for callbacks. */
     if ( stream_type )
         assert(callbacks->checkpoint);
 
+    if ( ctx.restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+        /* this is COLO restore */
+        assert(callbacks->suspend &&
+               callbacks->postcopy &&
+               callbacks->wait_checkpoint &&
+               callbacks->restore_results);
+    }
+
     DPRINTF("fd %d, dom %u, hvm %u, pae %u, superpages %d"
             ", stream_type %d", io_fd, dom, hvm, pae,
             superpages, stream_type);
@@ -706,6 +879,14 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     if ( read_headers(&ctx) )
         return -1;
 
+    if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
+    {
+        PERROR("Unable to obtain the guest p2m size");
+        return -1;
+    }
+
+    ctx.restore.p2m_size = nr_pfns;
+
     if ( ctx.dominfo.hvm )
     {
         ctx.restore.ops = restore_ops_x86_hvm;
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
index a8f74a7..04b02d8 100644
--- a/tools/libxl/libxl_colo_restore.c
+++ b/tools/libxl/libxl_colo_restore.c
@@ -126,11 +126,6 @@ static void colo_resume_vm(libxl__egc *egc,
         return;
     }
 
-    /*
-     * TODO: get store gfn and console gfn
-     *  We should call the callback restore_results in
-     *  xc_domain_restore() before resuming the guest.
-     */
     libxl__xc_domain_restore_done(egc, dcs, 0, 0, 0);
 
     return;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index a8b3bb1..1bf8a92 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1013,6 +1013,8 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     const int checkpointed_stream = dcs->restore_params.checkpointed_stream;
     libxl__colo_restore_state *const crs = &dcs->crs;
     libxl_domain_build_info *const info = &d_config->b_info;
+    libxl__srm_restore_autogen_callbacks *const callbacks =
+        &dcs->srs.shs.callbacks.restore.a;
 
     if (rc) {
         domcreate_rebuild_done(egc, dcs, rc);
@@ -1040,6 +1042,7 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     }
 
     /* Restore */
+    callbacks->restore_results = libxl__srm_callout_callback_restore_results;
 
     /* COLO only supports HVM now because it does not work very
      * well with pv drivers:
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 6016706..c2243f2 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -29,8 +29,8 @@ our @msgs = (
     [  6, 'srcxA',  "wait_checkpoint", [] ],
     [  7, 'scxA',   "switch_qemu_logdirty",  [qw(int domid
                                               unsigned enable)] ],
-    [  8, 'r',      "restore_results",       ['unsigned long', 'store_mfn',
-                                              'unsigned long', 'console_mfn'] ],
+    [  8, 'rcx',    "restore_results",       ['unsigned long', 'store_gfn',
+                                              'unsigned long', 'console_gfn'] ],
     [  9, 'srW',    "complete",              [qw(int retval
                                                  int errnoval)] ],
 );
-- 
1.9.3




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel