* [Qemu-devel] [PATCH v3 0/5] Reduce down time during migration without shared storage
@ 2010-01-26  8:31 Liran Schour
  2010-01-26  8:31 ` [Qemu-devel] [PATCH v3 1/5] Remove unused code Liran Schour
  0 siblings, 1 reply; 7+ messages in thread
From: Liran Schour @ 2010-01-26  8:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

This series of patches reduces the down time of the guest during migration
without shared storage. It does so by starting to transfer dirty blocks
during the iterative phase. In the current code, transfer of dirty blocks
begins only during the full phase, while the guest is suspended, so the
guest stays suspended for a time linear in the amount of data written to
disk during the migration.
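
To make the idea concrete, here is a toy, self-contained C model (not QEMU
code; the chunk bookkeeping is invented for illustration) of why draining
dirty chunks while the guest runs shrinks the final suspended pass:

/*
 * Toy model of the migration phases.  Chunks dirtied while the "guest"
 * runs are re-sent during the live iterations, so the suspended (full)
 * phase only has to copy the residue left after the last iteration.
 */
#include <stdbool.h>
#include <stdio.h>

#define CHUNKS 8

static bool dirty[CHUNKS];

/* Pretend the running guest re-dirties one chunk per live iteration. */
static void guest_writes(int iter)
{
    dirty[iter % CHUNKS] = true;
}

/* "Transfer" every chunk that is currently dirty and count them. */
static int flush_dirty(void)
{
    int sent = 0;

    for (int i = 0; i < CHUNKS; i++) {
        if (dirty[i]) {
            dirty[i] = false;
            sent++;
        }
    }
    return sent;
}

int main(void)
{
    for (int i = 0; i < CHUNKS; i++) {
        dirty[i] = true;                    /* bulk phase: all chunks dirty */
    }

    for (int iter = 0; iter < 4; iter++) {  /* iterative phase, guest runs */
        printf("live iteration %d sent %d chunks\n", iter, flush_dirty());
        guest_writes(iter);
    }

    printf("suspended phase sent %d chunks\n", flush_dirty());
    return 0;
}

Without the series the suspended phase would have to send every chunk
dirtied since migration started; with it, only what the last live
iteration left behind.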

Changes from v2: - avoid code duplication: use the qemu_get_clock_ns() patch
                   from Paolo Bonzini <pbonzini@redhat.com>
                 - fix coding style issues
                 - remove unused constants
Changes from v1: - infer storage performance from get_clock()
                 - remove the dirty max iterations limit; the user is
                   responsible for migration convergence
                 - remove trailing whitespace
                 - minor cleanups

 block-migration.c |  235 +++++++++++++++++++++++++++++++++++++---------------
 block.c           |   16 +++-
 block.h           |    1 +
 block_int.h       |    1 +
 qemu-timer.h      |    1 +
 vl.c              |   21 +++++-
 6 files changed, 203 insertions(+), 72 deletions(-)

Signed-off-by: Liran Schour <lirans@il.ibm.com>


* [Qemu-devel] [PATCH v3 1/5] Remove unused code
  2010-01-26  8:31 [Qemu-devel] [PATCH v3 0/5] Reduce down time during migration without shared storage Liran Schour
@ 2010-01-26  8:31 ` Liran Schour
  2010-01-26  8:31   ` [Qemu-devel] [PATCH v3 2/5] add qemu_get_clock_ns Liran Schour
  2010-02-09 23:27   ` [Qemu-devel] [PATCH v3 1/5] Remove unused code Anthony Liguori
  0 siblings, 2 replies; 7+ messages in thread
From: Liran Schour @ 2010-01-26  8:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

blk_mig_save_bulked_block is never called with the sync flag set. Remove the
sync flag. Calculate bulk completion inside blk_mig_save_bulked_block.
Remove unused constants.

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |   61 +++++++++++++++-------------------------------------
 1 files changed, 18 insertions(+), 43 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 258a88a..93d86d9 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -26,9 +26,6 @@
 #define BLK_MIG_FLAG_PROGRESS           0x04
 
 #define MAX_IS_ALLOCATED_SEARCH 65536
-#define MAX_BLOCKS_READ 10000
-#define BLOCKS_READ_CHANGE 100
-#define INITIAL_BLOCKS_READ 100
 
 //#define DEBUG_BLK_MIGRATION
 
@@ -72,6 +69,7 @@ typedef struct BlkMigState {
     int transferred;
     int64_t total_sector_sum;
     int prev_progress;
+    int bulk_completed;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
@@ -138,7 +136,7 @@ static void blk_mig_read_cb(void *opaque, int ret)
 }
 
 static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
-                                BlkMigDevState *bmds, int is_async)
+                                BlkMigDevState *bmds)
 {
     int64_t total_sectors = bmds->total_sectors;
     int64_t cur_sector = bmds->cur_sector;
@@ -175,27 +173,16 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
     blk->bmds = bmds;
     blk->sector = cur_sector;
 
-    if (is_async) {
-        blk->iov.iov_base = blk->buf;
-        blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
-        qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
-
-        blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
-                                    nr_sectors, blk_mig_read_cb, blk);
-        if (!blk->aiocb) {
-            goto error;
-        }
-        block_mig_state.submitted++;
-    } else {
-        if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) {
-            goto error;
-        }
-        blk_send(f, blk);
+    blk->iov.iov_base = blk->buf;
+    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 
-        qemu_free(blk->buf);
-        qemu_free(blk);
+    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
+                                nr_sectors, blk_mig_read_cb, blk);
+    if (!blk->aiocb) {
+        goto error;
     }
-
+    block_mig_state.submitted++;
     bdrv_reset_dirty(bs, cur_sector, nr_sectors);
     bmds->cur_sector = cur_sector + nr_sectors;
 
@@ -229,6 +216,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
     block_mig_state.transferred = 0;
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
+    block_mig_state.bulk_completed = 0;
 
     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
         if (bs->type == BDRV_TYPE_HD) {
@@ -260,7 +248,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
     }
 }
 
-static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
+static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
 {
     int64_t completed_sector_sum = 0;
     BlkMigDevState *bmds;
@@ -269,7 +257,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
         if (bmds->bulk_completed == 0) {
-            if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) {
+            if (mig_save_device_bulk(mon, f, bmds) == 1) {
                 /* completed bulk section for this device */
                 bmds->bulk_completed = 1;
             }
@@ -362,19 +350,7 @@ static void flush_blks(QEMUFile* f)
 
 static int is_stage2_completed(void)
 {
-    BlkMigDevState *bmds;
-
-    if (block_mig_state.submitted > 0) {
-        return 0;
-    }
-
-    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        if (bmds->bulk_completed == 0) {
-            return 0;
-        }
-    }
-
-    return 1;
+    return (block_mig_state.submitted == 0 && block_mig_state.bulk_completed);
 }
 
 static void blk_mig_cleanup(Monitor *mon)
@@ -432,8 +408,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     while ((block_mig_state.submitted +
             block_mig_state.read_done) * BLOCK_SIZE <
            qemu_file_get_rate_limit(f)) {
-        if (blk_mig_save_bulked_block(mon, f, 1) == 0) {
-            /* no more bulk blocks for now */
+        if (blk_mig_save_bulked_block(mon, f) == 0) {
+            /* finished saving bulk on all devices */
+            block_mig_state.bulk_completed = 1;
             break;
         }
     }
@@ -446,9 +423,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     }
 
     if (stage == 3) {
-        while (blk_mig_save_bulked_block(mon, f, 0) != 0) {
-            /* empty */
-        }
+        /* we know for sure that save bulk is completed */
 
         blk_mig_save_dirty_blocks(mon, f);
         blk_mig_cleanup(mon);
-- 
1.6.0.4


* [Qemu-devel] [PATCH v3 2/5] add qemu_get_clock_ns
  2010-01-26  8:31 ` [Qemu-devel] [PATCH v3 1/5] Remove unused code Liran Schour
@ 2010-01-26  8:31   ` Liran Schour
  2010-01-26  8:31     ` [Qemu-devel] [PATCH v3 3/5] Transfer dirty blocks during iterative phase Liran Schour
  2010-02-09 23:27   ` [Qemu-devel] [PATCH v3 1/5] Remove unused code Anthony Liguori
  1 sibling, 1 reply; 7+ messages in thread
From: Liran Schour @ 2010-01-26  8:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini

From: Paolo Bonzini <pbonzini@redhat.com>

Some places use get_clock directly because they want to access the
rt_clock with nanosecond precision.  Add a function to do exactly that
instead of using internal interfaces.
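
As a small illustration of the intended call pattern (a sketch that assumes
a QEMU tree with this patch applied, not a standalone program):

/* Time an interval in nanoseconds on the realtime clock, as the
 * ram_save_live() hunk below does after this patch. */
int64_t start, elapsed_ns;

start = qemu_get_clock_ns(rt_clock);
/* ... rate-limited transfer work ... */
elapsed_ns = qemu_get_clock_ns(rt_clock) - start;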

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 qemu-timer.h |    1 +
 vl.c         |   21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/qemu-timer.h b/qemu-timer.h
index e7eaa04..c17b4e6 100644
--- a/qemu-timer.h
+++ b/qemu-timer.h
@@ -25,6 +25,7 @@ extern QEMUClock *vm_clock;
 extern QEMUClock *host_clock;
 
 int64_t qemu_get_clock(QEMUClock *clock);
+int64_t qemu_get_clock_ns(QEMUClock *clock);
 
 QEMUTimer *qemu_new_timer(QEMUClock *clock, QEMUTimerCB *cb, void *opaque);
 void qemu_free_timer(QEMUTimer *ts);
diff --git a/vl.c b/vl.c
index e881e45..c5cd462 100644
--- a/vl.c
+++ b/vl.c
@@ -1131,6 +1131,23 @@ int64_t qemu_get_clock(QEMUClock *clock)
     }
 }
 
+int64_t qemu_get_clock_ns(QEMUClock *clock)
+{
+    switch(clock->type) {
+    case QEMU_CLOCK_REALTIME:
+        return get_clock();
+    default:
+    case QEMU_CLOCK_VIRTUAL:
+        if (use_icount) {
+            return cpu_get_icount();
+        } else {
+            return cpu_get_clock();
+        }
+    case QEMU_CLOCK_HOST:
+        return get_clock_realtime();
+    }
+}
+
 static void init_clocks(void)
 {
     init_get_clock();
@@ -3063,7 +3080,7 @@ static int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     }
 
     bytes_transferred_last = bytes_transferred;
-    bwidth = get_clock();
+    bwidth = qemu_get_clock_ns(rt_clock);
 
     while (!qemu_file_rate_limit(f)) {
         int ret;
@@ -3074,7 +3091,7 @@ static int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
             break;
     }
 
-    bwidth = get_clock() - bwidth;
+    bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
     bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
 
     /* if we haven't transferred anything this round, force expected_time to a
-- 
1.6.0.4


* [Qemu-devel] [PATCH v3 3/5] Transfer dirty blocks during iterative phase
  2010-01-26  8:31   ` [Qemu-devel] [PATCH v3 2/5] add qemu_get_clock_ns Liran Schour
@ 2010-01-26  8:31     ` Liran Schour
  2010-01-26  8:31       ` [Qemu-devel] [PATCH v3 4/5] Count dirty blocks and expose an API to get dirty count Liran Schour
  0 siblings, 1 reply; 7+ messages in thread
From: Liran Schour @ 2010-01-26  8:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

Start transferring dirty blocks during the iterative stage. This reduces
the time the guest is suspended.
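
For orientation, a condensed sketch of the new stage-2 loop (simplified from
the diff below; "queued_bytes()" is shorthand for the submitted/read_done
accounting, not a real helper, and error handling is omitted):

blk_mig_reset_dirty_cursor();

while (queued_bytes() < qemu_file_get_rate_limit(f)) {
    if (block_mig_state.bulk_completed == 0) {
        /* first finish the bulk phase */
        if (blk_mig_save_bulked_block(mon, f) == 0) {
            block_mig_state.bulk_completed = 1;
        }
    } else if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
        break;  /* no more dirty blocks this round */
    }
}
flush_blks(f);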

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |  135 +++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 99 insertions(+), 36 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 93d86d9..d8755d1 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -42,6 +42,7 @@ typedef struct BlkMigDevState {
     int bulk_completed;
     int shared_base;
     int64_t cur_sector;
+    int64_t cur_dirty;
     int64_t completed_sectors;
     int64_t total_sectors;
     int64_t dirty;
@@ -70,6 +71,7 @@ typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
+    int dirty_iterations;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
@@ -183,6 +185,7 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
         goto error;
     }
     block_mig_state.submitted++;
+
     bdrv_reset_dirty(bs, cur_sector, nr_sectors);
     bmds->cur_sector = cur_sector + nr_sectors;
 
@@ -281,39 +284,88 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
     return ret;
 }
 
-#define MAX_NUM_BLOCKS 4
-
-static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f)
+static void blk_mig_reset_dirty_cursor(void)
 {
     BlkMigDevState *bmds;
-    BlkMigBlock blk;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        bmds->cur_dirty = 0;
+    }
+}
+
+static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
+                                 BlkMigDevState *bmds, int is_async)
+{
+    BlkMigBlock *blk;
+    int64_t total_sectors = bmds->total_sectors;
     int64_t sector;
+    int nr_sectors;
 
-    blk.buf = qemu_malloc(BLOCK_SIZE);
+    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
+        if (bdrv_get_dirty(bmds->bs, sector)) {
 
-    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        for (sector = 0; sector < bmds->cur_sector;) {
-            if (bdrv_get_dirty(bmds->bs, sector)) {
-                if (bdrv_read(bmds->bs, sector, blk.buf,
-                              BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
-                    monitor_printf(mon, "Error reading sector %" PRId64 "\n",
-                                   sector);
-                    qemu_file_set_error(f);
-                    qemu_free(blk.buf);
-                    return;
+            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
+                nr_sectors = total_sectors - sector;
+            } else {
+                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
+            }
+            blk = qemu_malloc(sizeof(BlkMigBlock));
+            blk->buf = qemu_malloc(BLOCK_SIZE);
+            blk->bmds = bmds;
+            blk->sector = sector;
+
+            if(is_async) {
+                blk->iov.iov_base = blk->buf;
+                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
+                                            nr_sectors, blk_mig_read_cb, blk);
+                if (!blk->aiocb) {
+                    goto error;
+                }
+                block_mig_state.submitted++;
+            } else {
+                if (bdrv_read(bmds->bs, sector, blk->buf,
+                              nr_sectors) < 0) {
+                    goto error;
                 }
-                blk.bmds = bmds;
-                blk.sector = sector;
-                blk_send(f, &blk);
+                blk_send(f, blk);
 
-                bdrv_reset_dirty(bmds->bs, sector,
-                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+                qemu_free(blk->buf);
+                qemu_free(blk);
             }
-            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
+            break;
         }
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+        bmds->cur_dirty = sector;
     }
 
-    qemu_free(blk.buf);
+    return (bmds->cur_dirty >= bmds->total_sectors);
+
+ error:
+    monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
+    qemu_file_set_error(f);
+    qemu_free(blk->buf);
+    qemu_free(blk);
+    return 0;
+}
+
+static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
+{
+    BlkMigDevState *bmds;
+    int ret = 0;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        if(mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
+            ret = 1;
+            break;
+        }
+    }
+
+    return ret;
 }
 
 static void flush_blks(QEMUFile* f)
@@ -404,28 +456,39 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
         return 0;
     }
 
-    /* control the rate of transfer */
-    while ((block_mig_state.submitted +
-            block_mig_state.read_done) * BLOCK_SIZE <
-           qemu_file_get_rate_limit(f)) {
-        if (blk_mig_save_bulked_block(mon, f) == 0) {
-            /* finished saving bulk on all devices */
-            block_mig_state.bulk_completed = 1;
-            break;
+    blk_mig_reset_dirty_cursor();
+
+    if(stage == 2) {
+        /* control the rate of transfer */
+        while ((block_mig_state.submitted +
+                block_mig_state.read_done) * BLOCK_SIZE <
+               qemu_file_get_rate_limit(f)) {
+            if (block_mig_state.bulk_completed == 0) {
+                /* first finish the bulk phase */
+                if (blk_mig_save_bulked_block(mon, f) == 0) {
+                    /* finish saving bulk on all devices */
+                    block_mig_state.bulk_completed = 1;
+                }
+            } else {
+                if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
+                    /* no more dirty blocks */
+                    break;
+                }
+            }
         }
-    }
 
-    flush_blks(f);
+        flush_blks(f);
 
-    if (qemu_file_has_error(f)) {
-        blk_mig_cleanup(mon);
-        return 0;
+        if (qemu_file_has_error(f)) {
+            blk_mig_cleanup(mon);
+            return 0;
+        }
     }
 
     if (stage == 3) {
         /* we know for sure that save bulk is completed */
 
-        blk_mig_save_dirty_blocks(mon, f);
+        while(blk_mig_save_dirty_block(mon, f, 0) != 0);
         blk_mig_cleanup(mon);
 
         /* report completion */
-- 
1.6.0.4


* [Qemu-devel] [PATCH v3 4/5] Count dirty blocks and expose an API to get dirty count
  2010-01-26  8:31     ` [Qemu-devel] [PATCH v3 3/5] Transfer dirty blocks during iterative phase Liran Schour
@ 2010-01-26  8:31       ` Liran Schour
  2010-01-26  8:31         ` [Qemu-devel] [PATCH v3 5/5] Try not to exceed max downtime on stage3 Liran Schour
  0 siblings, 1 reply; 7+ messages in thread
From: Liran Schour @ 2010-01-26  8:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

This maintains a dirty counter for each device and allows the layer above
to query it.
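
A sketch of the kind of caller this enables (it mirrors get_remaining_dirty()
in patch 5/5 and assumes the block-migration device list from this series):

static int64_t remaining_dirty_bytes(void)
{
    BlkMigDevState *bmds;
    int64_t chunks = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        chunks += bdrv_get_dirty_count(bmds->bs);  /* new API from this patch */
    }

    /* the counter is kept in dirty-bitmap chunks, not sectors */
    return chunks * BLOCK_SIZE;
}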

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block.c     |   16 ++++++++++++++--
 block.h     |    1 +
 block_int.h |    1 +
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 30ae2b1..a6381ad 100644
--- a/block.c
+++ b/block.c
@@ -653,9 +653,15 @@ static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
         bit = start % (sizeof(unsigned long) * 8);
         val = bs->dirty_bitmap[idx];
         if (dirty) {
-            val |= 1 << bit;
+            if (!(val & (1 << bit))) {
+                bs->dirty_count++;
+                val |= 1 << bit;
+            }
         } else {
-            val &= ~(1 << bit);
+            if (val & (1 << bit)) {
+                bs->dirty_count--;
+                val &= ~(1 << bit);
+            }
         }
         bs->dirty_bitmap[idx] = val;
     }
@@ -2116,6 +2122,7 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
 {
     int64_t bitmap_size;
 
+    bs->dirty_count = 0;
     if (enable) {
         if (!bs->dirty_bitmap) {
             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
@@ -2150,3 +2157,8 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
 {
     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
 }
+
+int64_t bdrv_get_dirty_count(BlockDriverState *bs)
+{
+    return bs->dirty_count;
+}
diff --git a/block.h b/block.h
index fa51ddf..1012303 100644
--- a/block.h
+++ b/block.h
@@ -201,4 +201,5 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable);
 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                       int nr_sectors);
+int64_t bdrv_get_dirty_count(BlockDriverState *bs);
 #endif
diff --git a/block_int.h b/block_int.h
index 9a3b2e0..8d5d9bc 100644
--- a/block_int.h
+++ b/block_int.h
@@ -172,6 +172,7 @@ struct BlockDriverState {
     int type;
     char device_name[32];
     unsigned long *dirty_bitmap;
+    int64_t dirty_count;
     BlockDriverState *next;
     void *private;
 };
-- 
1.6.0.4


* [Qemu-devel] [PATCH v3 5/5] Try not to exceed max downtime on stage3
  2010-01-26  8:31       ` [Qemu-devel] [PATCH v3 4/5] Count dirty blocks and expose an API to get dirty count Liran Schour
@ 2010-01-26  8:31         ` Liran Schour
  0 siblings, 0 replies; 7+ messages in thread
From: Liran Schour @ 2010-01-26  8:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

Move to stage 3 only when the remaining work can be done within the max
downtime. Use qemu_get_clock_ns() for measuring read performance.
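
As a rough, standalone illustration of the exit test (all numbers are made
up and the dirty-chunk size is assumed to be 1 MB for the example; the patch
keeps read time in nanoseconds, so remaining/bandwidth is directly
comparable to the downtime budget):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    long double chunk_bytes   = 1 << 20;  /* assumed dirty-chunk size       */
    long double reads         = 4000;     /* completed AIO reads so far     */
    long double total_time_ns = 2e9;      /* time spent in those reads      */
    int64_t     dirty_chunks  = 300;      /* as from bdrv_get_dirty_count() */
    long double max_downtime  = 30e6;     /* 30 ms budget, in ns            */

    long double bwidth    = (reads * chunk_bytes) / total_time_ns; /* bytes/ns */
    long double remaining = dirty_chunks * chunk_bytes;

    printf("estimated flush time %.0Lf ms, budget %.0Lf ms -> %s\n",
           remaining / bwidth / 1e6, max_downtime / 1e6,
           remaining / bwidth <= max_downtime ? "move to stage 3"
                                              : "stay in stage 2");
    return 0;
}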

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |   79 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index d8755d1..0e63596 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -15,8 +15,10 @@
 #include "block_int.h"
 #include "hw/hw.h"
 #include "qemu-queue.h"
+#include "qemu-timer.h"
 #include "monitor.h"
 #include "block-migration.h"
+#include "migration.h"
 #include <assert.h>
 
 #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
@@ -57,6 +59,7 @@ typedef struct BlkMigBlock {
     QEMUIOVector qiov;
     BlockDriverAIOCB *aiocb;
     int ret;
+    int64_t time;
     QSIMPLEQ_ENTRY(BlkMigBlock) entry;
 } BlkMigBlock;
 
@@ -71,7 +74,8 @@ typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
-    int dirty_iterations;
+    long double total_time;
+    int reads;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
@@ -124,12 +128,28 @@ uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }
 
+static inline void add_avg_read_time(int64_t time)
+{
+    block_mig_state.reads++;
+    block_mig_state.total_time += time;
+}
+
+static inline long double compute_read_bwidth(void)
+{
+    assert(block_mig_state.total_time != 0);
+    return  (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
+}
+
 static void blk_mig_read_cb(void *opaque, int ret)
 {
     BlkMigBlock *blk = opaque;
 
     blk->ret = ret;
 
+    blk->time = qemu_get_clock_ns(rt_clock) - blk->time;
+
+    add_avg_read_time(blk->time);
+
     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
 
     block_mig_state.submitted--;
@@ -179,6 +199,8 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
     blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
     qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 
+    blk->time = qemu_get_clock_ns(rt_clock);
+
     blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                 nr_sectors, blk_mig_read_cb, blk);
     if (!blk->aiocb) {
@@ -220,6 +242,8 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
+    block_mig_state.total_time = 0;
+    block_mig_state.reads = 0;
 
     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
         if (bs->type == BDRV_TYPE_HD) {
@@ -314,11 +338,13 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
             blk->bmds = bmds;
             blk->sector = sector;
 
-            if(is_async) {
+            if (is_async) {
                 blk->iov.iov_base = blk->buf;
                 blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 
+		blk->time = qemu_get_clock_ns(rt_clock);
+
                 blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                             nr_sectors, blk_mig_read_cb, blk);
                 if (!blk->aiocb) {
@@ -345,7 +371,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
 
     return (bmds->cur_dirty >= bmds->total_sectors);
 
- error:
+error:
     monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
     qemu_file_set_error(f);
     qemu_free(blk->buf);
@@ -359,7 +385,7 @@ static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
     int ret = 0;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        if(mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
+        if (mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
             ret = 1;
             break;
         }
@@ -400,9 +426,42 @@ static void flush_blks(QEMUFile* f)
             block_mig_state.transferred);
 }
 
+static int64_t get_remaining_dirty(void)
+{
+    BlkMigDevState *bmds;
+    int64_t dirty = 0;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        dirty += bdrv_get_dirty_count(bmds->bs);
+    }
+
+    return dirty * BLOCK_SIZE;
+}
+
 static int is_stage2_completed(void)
 {
-    return (block_mig_state.submitted == 0 && block_mig_state.bulk_completed);
+    int64_t remaining_dirty;
+    long double bwidth;
+
+    if (block_mig_state.bulk_completed == 1) {
+
+        remaining_dirty = get_remaining_dirty();
+	if (remaining_dirty == 0) {
+	    return 1;
+	}
+
+	bwidth = compute_read_bwidth();
+
+	if ((remaining_dirty / bwidth) <=
+            migrate_max_downtime()) {
+            /* finish stage2 because we think that we can finish remaining work
+               below max_downtime */
+
+            return 1;
+        }
+    }
+
+    return 0;
 }
 
 static void blk_mig_cleanup(Monitor *mon)
@@ -458,7 +517,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
 
     blk_mig_reset_dirty_cursor();
 
-    if(stage == 2) {
+    if (stage == 2) {
         /* control the rate of transfer */
         while ((block_mig_state.submitted +
                 block_mig_state.read_done) * BLOCK_SIZE <
@@ -466,7 +525,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
             if (block_mig_state.bulk_completed == 0) {
                 /* first finish the bulk phase */
                 if (blk_mig_save_bulked_block(mon, f) == 0) {
-                    /* finish saving bulk on all devices */
+                    /* finished saving bulk on all devices */
                     block_mig_state.bulk_completed = 1;
                 }
             } else {
@@ -486,9 +545,11 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     }
 
     if (stage == 3) {
-        /* we know for sure that save bulk is completed */
+        /* we know for sure that save bulk is completed and
+           all async read completed */
+        assert(block_mig_state.submitted == 0);
 
-        while(blk_mig_save_dirty_block(mon, f, 0) != 0);
+        while (blk_mig_save_dirty_block(mon, f, 0) != 0);
         blk_mig_cleanup(mon);
 
         /* report completion */
-- 
1.6.0.4


* Re: [Qemu-devel] [PATCH v3 1/5] Remove unused code
  2010-01-26  8:31 ` [Qemu-devel] [PATCH v3 1/5] Remove unused code Liran Schour
  2010-01-26  8:31   ` [Qemu-devel] [PATCH v3 2/5] add qemu_get_clock_ns Liran Schour
@ 2010-02-09 23:27   ` Anthony Liguori
  1 sibling, 0 replies; 7+ messages in thread
From: Anthony Liguori @ 2010-02-09 23:27 UTC (permalink / raw)
  To: Liran Schour; +Cc: qemu-devel

On 01/26/2010 02:31 AM, Liran Schour wrote:
> blk_mig_save_bulked_block is never called with the sync flag set. Remove the
> sync flag. Calculate bulk completion inside blk_mig_save_bulked_block.
> Remove unused constants.
>
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
>    

Applied all. Thanks.

Regards,

Anthony Liguori


