All of lore.kernel.org
 help / color / mirror / Atom feed
From: Md Haris Iqbal <haris.phnx@gmail.com>
To: qemu-devel@nongnu.org
Cc: dgilbert@redhat.com, Md Haris Iqbal <haris.phnx@gmail.com>
Subject: [Qemu-devel] [PATCH 1/6] Migration: Reconnect network in case of network failure during pc migration (source)
Date: Mon, 22 Aug 2016 02:28:47 +0530	[thread overview]
Message-ID: <1471813132-13836-2-git-send-email-haris.phnx@gmail.com> (raw)
In-Reply-To: <1471813132-13836-1-git-send-email-haris.phnx@gmail.com>

Signed-off-by: Md Haris Iqbal <haris.phnx@gmail.com>
---
 hmp-commands.hx               |  20 +++---
 hmp.c                         |   4 +-
 include/migration/migration.h |   4 ++
 migration/migration.c         | 153 +++++++++++++++++++++++++++++++++++++-----
 qapi-schema.json              |  16 ++++-
 qmp-commands.hx               |   3 +-
 vl.c                          |   4 ++
 7 files changed, 174 insertions(+), 30 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 848efee..8f765fd 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -894,23 +894,25 @@ ETEXI
 
     {
         .name       = "migrate",
-        .args_type  = "detach:-d,blk:-b,inc:-i,uri:s",
-        .params     = "[-d] [-b] [-i] uri",
+        .args_type  = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s",
+        .params     = "[-d] [-r] [-b] [-i] uri",
         .help       = "migrate to URI (using -d to not wait for completion)"
-		      "\n\t\t\t -b for migration without shared storage with"
-		      " full copy of disk\n\t\t\t -i for migration without "
-		      "shared storage with incremental copy of disk "
-		      "(base image shared between src and destination)",
+                     "\n\t\t\t -r to recover from a broken migration\n\t\t\t"
+                     " -b for migration without shared storage with"
+                     " full copy of disk\n\t\t\t -i for migration without "
+                     "shared storage with incremental copy of disk "
+                     "(base image shared between src and destination)",
         .mhandler.cmd = hmp_migrate,
     },
 
 
 STEXI
-@item migrate [-d] [-b] [-i] @var{uri}
+@item migrate [-d] [-r] [-b] [-i] @var{uri}
 @findex migrate
 Migrate to @var{uri} (using -d to not wait for completion).
-	-b for migration with full copy of disk
-	-i for migration with incremental copy of disk (base image is shared)
+       -r to recover from a broken migration
+       -b for migration with full copy of disk
+       -i for migration with incremental copy of disk (base image is shared)
 ETEXI
 
     {
diff --git a/hmp.c b/hmp.c
index cc2056e..02ed457 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1563,12 +1563,14 @@ static void hmp_migrate_status_cb(void *opaque)
 void hmp_migrate(Monitor *mon, const QDict *qdict)
 {
     bool detach = qdict_get_try_bool(qdict, "detach", false);
+    bool recover = qdict_get_try_bool(qdict, "recover", false);
     bool blk = qdict_get_try_bool(qdict, "blk", false);
     bool inc = qdict_get_try_bool(qdict, "inc", false);
     const char *uri = qdict_get_str(qdict, "uri");
     Error *err = NULL;
 
-    qmp_migrate(uri, !!blk, blk, !!inc, inc, false, false, &err);
+    qmp_migrate(uri, !!recover, recover, !!blk, blk, !!inc, inc, false, false,
+                &err);
     if (err) {
         error_report_err(err);
         return;
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 3c96623..bcaf55d 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -142,6 +142,7 @@ struct MigrationState
     int state;
     /* Old style params from 'migrate' command */
     MigrationParams params;
+    bool in_recovery;
 
     /* State related to return path */
     struct {
@@ -351,6 +352,9 @@ void flush_page_queue(MigrationState *ms);
 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                          ram_addr_t start, ram_addr_t len);
 
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
+int qemu_migrate_postcopy_incoming_recovery(QEMUFile **f,MigrationIncomingState* mis);
+
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
 PostcopyState postcopy_state_set(PostcopyState new_state);
diff --git a/migration/migration.c b/migration/migration.c
index 955d5ee..6ed2e82 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -709,6 +709,33 @@ MigrationInfo *qmp_query_migrate(Error **errp)
     case MIGRATION_STATUS_CANCELLED:
         info->has_status = true;
         break;
+    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
+        info->has_status = true;
+        info->has_total_time = true;
+        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+        info->has_ram = true;
+        info->ram = g_malloc0(sizeof(*info->ram));
+        info->ram->transferred = ram_bytes_transferred();
+        info->ram->remaining = ram_bytes_remaining();
+        info->ram->total = ram_bytes_total();
+        info->ram->duplicate = dup_mig_pages_transferred();
+        info->ram->skipped = skipped_mig_pages_transferred();
+        info->ram->normal = norm_mig_pages_transferred();
+        info->ram->normal_bytes = norm_mig_bytes_transferred();
+        info->ram->dirty_pages_rate = s->dirty_pages_rate;
+        info->ram->mbps = s->mbps;
+        info->ram->dirty_sync_count = s->dirty_sync_count;
+
+        if (blk_mig_active()) {
+            info->has_disk = true;
+            info->disk = g_malloc0(sizeof(*info->disk));
+            info->disk->transferred = blk_mig_bytes_transferred();
+            info->disk->remaining = blk_mig_bytes_remaining();
+            info->disk->total = blk_mig_bytes_total();
+        }
+
+        get_xbzrle_cache_stats(info);
     }
     info->status = s->state;
 
@@ -993,6 +1020,7 @@ MigrationState *migrate_init(const MigrationParams *params)
     s->xfer_limit = 0;
     s->cleanup_bh = 0;
     s->to_dst_file = NULL;
+    s->in_recovery = false;
     s->state = MIGRATION_STATUS_NONE;
     s->params = *params;
     s->rp_state.from_dst_file = NULL;
@@ -1069,13 +1097,14 @@ bool migration_is_blocked(Error **errp)
     return false;
 }
 
-void qmp_migrate(const char *uri, bool has_blk, bool blk,
-                 bool has_inc, bool inc, bool has_detach, bool detach,
+void qmp_migrate(const char *uri, bool in_recover, bool recover, bool has_blk,
+                 bool blk, bool has_inc, bool inc, bool has_detach, bool detach,
                  Error **errp)
 {
     Error *local_err = NULL;
     MigrationState *s = migrate_get_current();
     MigrationParams params;
+    bool recovery = in_recover && recover;
     const char *p;
 
     params.blk = has_blk && blk;
@@ -1095,7 +1124,39 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
         return;
     }
 
-    s = migrate_init(&params);
+    if (recovery ^ atomic_mb_read(&s->in_recovery)) {
+        if (recovery) {
+            /* No VM is waiting for recovery and
+             * recovery option was set
+             */
+
+            error_setg(errp, "No VM to recover");
+            return;
+        } else {
+            /* A VM is waiting for recovery and
+             * no recovery option is set
+             */
+
+            error_setg(errp, "A migration is in recovery state");
+            return;
+        }
+    } else {
+        if (!recovery) {
+            /* No VM is waiting for recovery and
+             * no recovery option is set
+             */
+            s = migrate_init(&params);
+        } else {
+            /* A VM is waiting for recovery and
+             * recovery option was set
+             */
+            s->to_dst_file = NULL;
+            if (s->rp_state.from_dst_file) {
+                /* shut down the rp socket, causing the rp thread to shut down */
+                qemu_file_shutdown(s->rp_state.from_dst_file);
+            }
+        }
+    }
 
     if (strstart(uri, "tcp:", &p)) {
         tcp_start_outgoing_migration(s, p, &local_err);
@@ -1336,6 +1397,8 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
  */
 static void *source_return_path_thread(void *opaque)
 {
+    fprintf(stderr, "Return path started on source\n");
+
     MigrationState *ms = opaque;
     QEMUFile *rp = ms->rp_state.from_dst_file;
     uint16_t header_len, header_type;
@@ -1439,8 +1502,8 @@ static void *source_return_path_thread(void *opaque)
 
     trace_source_return_path_thread_end();
 out:
-    ms->rp_state.from_dst_file = NULL;
     qemu_fclose(rp);
+    fprintf(stderr, "Return path failed on source\n");
     return NULL;
 }
 
@@ -1714,6 +1777,7 @@ static void *migration_thread(void *opaque)
     bool entered_postcopy = false;
     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
+    int ret;
 
     rcu_register_thread();
 
@@ -1781,7 +1845,26 @@ static void *migration_thread(void *opaque)
             }
         }
 
-        if (qemu_file_get_error(s->to_dst_file)) {
+        if ((ret = qemu_file_get_error(s->to_dst_file))) {
+            /*  This check is based on how the error is set during the network
+             *  recv(). When recv() returns 0 (i.e. the peer has closed the
+             *  connection), the error is set to -EIO. For all other network
+             *  errors, it is set according to the return value received.
+             */
+            if (ret == -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+                /* Network Failure during postcopy */
+
+                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
+                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
+                ret = qemu_migrate_postcopy_outgoing_recovery(s);
+                if(ret == 0) {
+                    current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+                    runstate_set(RUN_STATE_FINISH_MIGRATE);
+                    qemu_file_clear_error(s->to_dst_file);
+                    continue;
+                }
+
+            }
             migrate_set_state(&s->state, current_active_state,
                               MIGRATION_STATUS_FAILED);
             trace_migration_thread_file_err();
@@ -1852,17 +1935,6 @@ static void *migration_thread(void *opaque)
 
 void migrate_fd_connect(MigrationState *s)
 {
-    /* This is a best 1st approximation. ns to ms */
-    s->expected_downtime = max_downtime/1000000;
-    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
-
-    qemu_file_set_blocking(s->to_dst_file, true);
-    qemu_file_set_rate_limit(s->to_dst_file,
-                             s->bandwidth_limit / XFER_LIMIT_RATIO);
-
-    /* Notify before starting migration thread */
-    notifier_list_notify(&migration_state_notifiers, s);
-
     /*
      * Open the return path; currently for postcopy but other things might
      * also want it.
@@ -1877,12 +1949,61 @@ void migrate_fd_connect(MigrationState *s)
         }
     }
 
+    qemu_file_set_blocking(s->to_dst_file, true);
+    qemu_file_set_rate_limit(s->to_dst_file,
+                             s->bandwidth_limit / XFER_LIMIT_RATIO);
+
+    if (atomic_mb_read(&s->in_recovery)) {
+        qemu_mutex_lock(&migration_recovery_mutex);
+        atomic_mb_set(&s->in_recovery, false);
+        qemu_cond_signal(&migration_recovery_cond);
+        qemu_mutex_unlock(&migration_recovery_mutex);
+
+        fprintf(stderr, "recovered\n");
+        return;
+    }
+
+    /* This is a best 1st approximation. ns to ms */
+    s->expected_downtime = max_downtime/1000000;
+    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
+
+
+    /* Notify before starting migration thread */
+    notifier_list_notify(&migration_state_notifiers, s);
+
     migrate_compress_threads_create();
     qemu_thread_create(&s->thread, "migration", migration_thread, s,
                        QEMU_THREAD_JOINABLE);
     s->migration_thread_running = true;
 }
 
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
+{
+    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
+
+    atomic_mb_set(&ms->in_recovery, true);
+    /* Code for network recovery to be added here */
+    qemu_mutex_lock(&migration_recovery_mutex);
+    while(atomic_mb_read(&ms->in_recovery) == true) {
+        fprintf(stderr, "Under recovery, not letting it fail %p\n", ms->to_dst_file);
+        qemu_cond_wait(&migration_recovery_cond, &migration_recovery_mutex);
+    }
+    qemu_mutex_unlock(&migration_recovery_mutex);
+
+    if(ms->to_dst_file != NULL) {
+        /* Recovery successful */
+        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_RECOVERY,
+                                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+        qemu_savevm_send_open_return_path(ms->to_dst_file);
+        return 0;
+    }
+
+    return -1;
+
+}
+
 PostcopyState  postcopy_state_get(void)
 {
     return atomic_mb_read(&incoming_postcopy_state);
diff --git a/qapi-schema.json b/qapi-schema.json
index 5658723..a658462 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -154,12 +154,15 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @postmigrate-recovery: guest is paused for recovery after a network failure
+# (since 2.7)
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
             'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
             'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-            'guest-panicked' ] }
+            'guest-panicked', 'postmigrate-recovery' ] }
 
 ##
 # @StatusInfo:
@@ -438,12 +441,15 @@
 #
 # @failed: some error occurred during migration process.
 #
+# @postcopy-recovery: in recovery mode, after a network failure. (since 2.7)
+#
 # Since: 2.3
 #
 ##
 { 'enum': 'MigrationStatus',
   'data': [ 'none', 'setup', 'cancelling', 'cancelled',
-            'active', 'postcopy-active', 'completed', 'failed' ] }
+            'active', 'postcopy-active', 'completed', 'failed',
+            'postcopy-recovery' ] }
 
 ##
 # @MigrationInfo
@@ -2119,6 +2125,8 @@
 #
 # @uri: the Uniform Resource Identifier of the destination VM
 #
+# @recover: #optional recover from a broken migration (since 2.7)
+#
 # @blk: #optional do block migration (full disk copy)
 #
 # @inc: #optional incremental disk copy migration
@@ -2131,7 +2139,7 @@
 # Since: 0.14.0
 ##
 { 'command': 'migrate',
-  'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } }
+  'data': {'uri': 'str', '*recover': 'bool', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } }
 
 ##
 # @migrate-incoming
@@ -2142,6 +2150,8 @@
 # @uri: The Uniform Resource Identifier identifying the source or
 #       address to listen on
 #
+# @recover: #optional recover from a broken migration (since 2.7)
+#
 # Returns: nothing on success
 #
 # Since: 2.3
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 6866264..dd727bf 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -639,7 +639,7 @@ EQMP
 
     {
         .name       = "migrate",
-        .args_type  = "detach:-d,blk:-b,inc:-i,uri:s",
+        .args_type  = "detach:-d,recover:-r,blk:-b,inc:-i,uri:s",
         .mhandler.cmd_new = qmp_marshal_migrate,
     },
 
@@ -651,6 +651,7 @@ Migrate to URI.
 
 Arguments:
 
+- "recover": recover migration (json-bool, optional)
 - "blk": block migration, full disk copy (json-bool, optional)
 - "inc": incremental disk copy (json-bool, optional)
 - "uri": Destination URI (json-string)
diff --git a/vl.c b/vl.c
index b3c80d5..f702886 100644
--- a/vl.c
+++ b/vl.c
@@ -597,6 +597,10 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
+
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
 
     { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
-- 
2.7.4

  reply	other threads:[~2016-08-21 20:59 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-08-21 20:58 [Qemu-devel] [PATCH 0/6] Recovery from network failure during Postcopy Migration Md Haris Iqbal
2016-08-21 20:58 ` Md Haris Iqbal [this message]
2016-08-21 20:58 ` [Qemu-devel] [PATCH 2/6] migration : General additions for migration recovery Md Haris Iqbal
2016-08-21 20:58 ` [Qemu-devel] [PATCH 3/6] Migration: Reconnect network in case of network failure during pc migration (destination) Md Haris Iqbal
2016-08-21 20:58 ` [Qemu-devel] [PATCH 4/6] Migration: New bitmap for postcopy migration failure Md Haris Iqbal
2016-08-21 20:58 ` [Qemu-devel] [PATCH 5/6] Migration: Recovering pages lost due to n/w failure during pc migration (source) Md Haris Iqbal
2016-08-21 20:58 ` [Qemu-devel] [PATCH 6/6] Migration: Recovering pages lost due to n/w failure during pc migration (destination) Md Haris Iqbal
2016-08-21 21:10 ` [Qemu-devel] [PATCH 0/6] Recovery from network failure during Postcopy Migration no-reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1471813132-13836-2-git-send-email-haris.phnx@gmail.com \
    --to=haris.phnx@gmail.com \
    --cc=dgilbert@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.