From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:48060) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Yb0Qy-0002kB-Oy for qemu-devel@nongnu.org; Thu, 26 Mar 2015 01:34:28 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1Yb0Qt-0007dY-Hd for qemu-devel@nongnu.org; Thu, 26 Mar 2015 01:34:24 -0400 Received: from szxga03-in.huawei.com ([119.145.14.66]:47828) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Yb0Qk-0007ae-6M for qemu-devel@nongnu.org; Thu, 26 Mar 2015 01:34:19 -0400 From: zhanghailiang Date: Thu, 26 Mar 2015 13:29:21 +0800 Message-ID: <1427347774-8960-16-git-send-email-zhang.zhanghailiang@huawei.com> In-Reply-To: <1427347774-8960-1-git-send-email-zhang.zhanghailiang@huawei.com> References: <1427347774-8960-1-git-send-email-zhang.zhanghailiang@huawei.com> MIME-Version: 1.0 Content-Type: text/plain Subject: [Qemu-devel] [RFC PATCH v4 15/28] COLO failover: Implement COLO master/slave failover work List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: lizhijian@cn.fujitsu.com, quintela@redhat.com, yunhong.jiang@intel.com, eddie.dong@intel.com, peter.huangpeng@huawei.com, dgilbert@redhat.com, zhanghailiang , arei.gonglei@huawei.com, amit.shah@redhat.com, Lai Jiangshan , david@gibson.dropbear.id.au If failover is requested, the PVM or SVM performs some cleanup work, exits COLO mode, and resumes normal execution. 
Signed-off-by: zhanghailiang Signed-off-by: Li Zhijian Signed-off-by: Lai Jiangshan --- include/migration/migration-colo.h | 14 ++++ include/migration/migration-failover.h | 2 + migration/colo-comm.c | 10 +++ migration/colo-failover.c | 12 +++- migration/colo.c | 122 ++++++++++++++++++++++++++++++++- stubs/migration-colo.c | 5 ++ 6 files changed, 163 insertions(+), 2 deletions(-) diff --git a/include/migration/migration-colo.h b/include/migration/migration-colo.h index 593431a..7e8fe46 100644 --- a/include/migration/migration-colo.h +++ b/include/migration/migration-colo.h @@ -22,6 +22,13 @@ bool colo_supported(void); void colo_info_mig_init(void); +/* Checkpoint control, called in migration/checkpoint thread */ +enum { + COLO_UNPROTECTED_MODE = 0, + COLO_PRIMARY_MODE, + COLO_SECONDARY_MODE, +}; + struct colo_incoming { QEMUFile *file; QemuThread thread; @@ -36,8 +43,15 @@ bool loadvm_enable_colo(void); void loadvm_exit_colo(void); void *colo_process_incoming_checkpoints(void *opaque); bool loadvm_in_colo_state(void); + +int get_colo_mode(void); + /* ram cache */ void create_and_init_ram_cache(void); void colo_flush_ram_cache(void); void release_ram_cache(void); + +/* failover */ +void colo_do_failover(MigrationState *s); + #endif diff --git a/include/migration/migration-failover.h b/include/migration/migration-failover.h index a8767fc..5e59b1d 100644 --- a/include/migration/migration-failover.h +++ b/include/migration/migration-failover.h @@ -16,5 +16,7 @@ #include "qemu-common.h" void failover_request_set(void); +void failover_request_clear(void); +bool failover_request_is_set(void); #endif diff --git a/migration/colo-comm.c b/migration/colo-comm.c index 1d844e1..c3dd617 100644 --- a/migration/colo-comm.c +++ b/migration/colo-comm.c @@ -31,6 +31,16 @@ static void colo_info_save(QEMUFile *f, void *opaque) } /* restore */ +int get_colo_mode(void) +{ + if (migrate_in_colo_state()) { + return COLO_PRIMARY_MODE; + } else if (loadvm_in_colo_state()) { + return 
COLO_SECONDARY_MODE; + } else { + return COLO_UNPROTECTED_MODE; + } +} static int colo_info_load(QEMUFile *f, void *opaque, int version_id) { int value = qemu_get_byte(f); diff --git a/migration/colo-failover.c b/migration/colo-failover.c index af78054..850b05c 100644 --- a/migration/colo-failover.c +++ b/migration/colo-failover.c @@ -22,7 +22,7 @@ static void colo_failover_bh(void *opaque) { qemu_bh_delete(failover_bh); failover_bh = NULL; - /*TODO: Do failover work */ + colo_do_failover(NULL); } void failover_request_set(void) @@ -32,6 +32,16 @@ void failover_request_set(void) qemu_bh_schedule(failover_bh); } +void failover_request_clear(void) +{ + failover_request = false; +} + +bool failover_request_is_set(void) +{ + return failover_request; +} + void qmp_colo_lost_heartbeat(Error **errp) { failover_request_set(); diff --git a/migration/colo.c b/migration/colo.c index ce41afb..6240178 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -76,6 +76,68 @@ bool migrate_in_colo_state(void) return (s->state == MIGRATION_STATUS_COLO); } +static bool colo_runstate_is_stopped(void) +{ + return runstate_check(RUN_STATE_COLO) || !runstate_is_running(); +} + +/* + * there are two way to entry this function + * 1. From colo checkpoint incoming thread, in this case + * we should protect it by iothread lock + * 2. From user command, because hmp/qmp command + * was happened in main loop, iothread lock will cause a + * dead lock. 
+ */ +static void slave_do_failover(void) +{ + DPRINTF("do_failover!\n"); + + colo = NULL; + + if (!autostart) { + error_report("\"-S\" qemu option will be ignored in colo slave side"); + /* recover runstate to normal migration finish state */ + autostart = true; + } + + /* On slave side, jump to incoming co */ + if (migration_incoming_co) { + qemu_coroutine_enter(migration_incoming_co, NULL); + } +} + +static void master_do_failover(void) +{ + MigrationState *s = migrate_get_current(); + + if (!colo_runstate_is_stopped()) { + vm_stop_force_state(RUN_STATE_COLO); + } + + if (s->state != MIGRATION_STATUS_FAILED) { + migrate_set_state(s, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); + } + + vm_start(); +} + +static bool failover_completed; +void colo_do_failover(MigrationState *s) +{ + /* Make sure vm stopped while failover */ + if (!colo_runstate_is_stopped()) { + vm_stop_force_state(RUN_STATE_COLO); + } + + if (get_colo_mode() == COLO_SECONDARY_MODE) { + slave_do_failover(); + } else { + master_do_failover(); + } + failover_completed = true; +} + /* colo checkpoint control helper */ static int colo_ctl_put(QEMUFile *f, uint64_t request) { @@ -147,11 +209,23 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control) goto out; } + if (failover_request_is_set()) { + ret = -1; + goto out; + } /* suspend and save vm state to colo buffer */ qemu_mutex_lock_iothread(); vm_stop_force_state(RUN_STATE_COLO); qemu_mutex_unlock_iothread(); DPRINTF("vm is stoped\n"); + /* + * failover request bh could be called after + * vm_stop_force_state so we check failover_request_is_set() again. 
+ */ + if (failover_request_is_set()) { + ret = -1; + goto out; + } /* Disable block migration */ s->params.blk = 0; @@ -247,7 +321,18 @@ static void *colo_thread(void *opaque) } out: - migrate_set_state(s, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); + error_report("colo: some error happens in colo_thread"); + qemu_mutex_lock_iothread(); + if (!failover_request_is_set()) { + error_report("master takeover from checkpoint channel"); + failover_request_set(); + } + qemu_mutex_unlock_iothread(); + + while (!failover_completed) { + ; + } + failover_request_clear(); qsb_free(colo_buffer); colo_buffer = NULL; @@ -288,6 +373,11 @@ void colo_init_checkpointer(MigrationState *s) qemu_bh_schedule(colo_bh); } +bool loadvm_in_colo_state(void) +{ + return colo != NULL; +} + /* * return: * 0: start a checkpoint @@ -359,6 +449,10 @@ void *colo_process_incoming_checkpoints(void *opaque) continue; } } + if (failover_request_is_set()) { + error_report("failover request from heartbeat channel"); + goto out; + } /* suspend guest */ qemu_mutex_lock_iothread(); @@ -427,6 +521,32 @@ void *colo_process_incoming_checkpoints(void *opaque) } out: + error_report("Detect some error or get a failover request"); + /* determine whether we need to failover */ + if (!failover_request_is_set()) { + /* + * TODO: Here, maybe we should raise a qmp event to the user, + * It can help user to know what happens, and help deciding whether to + * do failover. 
+ */ + usleep(2000 * 1000); + } + /* check flag again*/ + if (!failover_request_is_set()) { + /* + * We assume that master is still alive according to heartbeat, + * just kill slave + */ + error_report("SVM is going to exit!"); + exit(1); + } else { + /* if we went here, means master may dead, we are doing failover */ + while (!failover_completed) { + ; + } + failover_request_clear(); + } + colo = NULL; if (fb) { diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c index 82fe14c..75b9940 100644 --- a/stubs/migration-colo.c +++ b/stubs/migration-colo.c @@ -32,6 +32,11 @@ void *colo_process_incoming_checkpoints(void *opaque) return NULL; } +bool loadvm_in_colo_state(void) +{ + return false; +} + void qmp_colo_lost_heartbeat(Error **errp) { error_setg(errp, "COLO is not supported, please rerun configure" -- 1.7.12.4