* [PATCH v8 3/4] block: add block timer and throttling algorithm
@ 2011-09-07 12:41 ` Zhi Yong Wu
From: Zhi Yong Wu @ 2011-09-07 12:41 UTC
  To: qemu-devel
  Cc: kwolf, aliguori, stefanha, kvm, mtosatti, Zhi Yong Wu, zwu.kernel, ryanh

Note:
     1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
     2.) When the "dd" command is issued in the guest, if its option bs is set to a large value such as "bs=1024K", the resulting speed will be slightly bigger than the limits.

For these problems, if you have any nice thoughts, please let us know. :)
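(A rough arithmetic sketch of case 1, assuming BLOCK_IO_SLICE_TIME is
0.1 s, consistent with the nanosecond conversion factors used in the
code below: the per-slice byte budget is bps_limit * slice_time =
511 * 0.1 ≈ 51 bytes, while the smallest request is one 512-byte
sector; since the dispatched-byte counters are reset whenever a slice
expires, such a request can never fit within a slice and is requeued
forever.)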

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 block.c |  246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 block.h |    1 -
 2 files changed, 236 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index cd75183..8a82273 100644
--- a/block.c
+++ b/block.c
@@ -30,6 +30,9 @@
 #include "qemu-objects.h"
 #include "qemu-coroutine.h"
 
+#include "qemu-timer.h"
+#include "block/blk-queue.h"
+
 #ifdef CONFIG_BSD
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          QEMUIOVector *iov);
 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -745,6 +755,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -783,6 +798,18 @@ void bdrv_close(BlockDriverState *bs)
         if (bs->change_cb)
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
+
+    /* throttling disk I/O limits */
+    if (bs->block_queue) {
+        qemu_del_block_queue(bs->block_queue);
+        bs->block_queue = NULL;
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
 }
 
 void bdrv_close_all(void)
@@ -2341,16 +2368,40 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+    int64_t wait_time = -1;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
 
-    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
+                           sector_num, qiov, nb_sectors, cb, opaque);
+            if (wait_time != -1) {
+                qemu_mod_timer(bs->block_timer,
+                               wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            return ret;
+        }
+    }
+
+    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
+    if (ret) {
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+        }
+    }
+
+    return ret;
 }
 
 typedef struct BlockCompleteData {
@@ -2396,15 +2447,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
     BlockCompleteData *blk_cb_data;
+    int64_t wait_time = -1;
 
     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bs->read_only)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bs->read_only
+        || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
 
     if (bs->dirty_bitmap) {
         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2413,13 +2463,32 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         opaque = blk_cb_data;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
+                                  sector_num, qiov, nb_sectors, cb, opaque);
+            if (wait_time != -1) {
+                qemu_mod_timer(bs->block_timer,
+                               wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            return ret;
+        }
+    }
+
     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
-
     if (ret) {
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
+
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
+                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
+        }
     }
 
     return ret;
@@ -2684,6 +2753,163 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->pool->cancel(acb);
 }
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait) {
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_disp, bytes_res;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    bytes_limit = bps_limit * slice_time;
+    bytes_disp  = bs->io_disps.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_disp += bs->io_disps.bytes[!is_write];
+    }
+
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_disp + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait) {
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_disp;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    ios_limit  = iops_limit * slice_time;
+    ios_disp   = bs->io_disps.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_disp += bs->io_disps.ios[!is_write];
+    }
+
+    if (ios_disp + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_disp + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, int64_t *wait) {
+    int64_t  now, max_wait;
+    uint64_t bps_wait = 0, iops_wait = 0;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    now = qemu_get_clock_ns(vm_clock);
+    if ((bs->slice_start < now)
+        && (bs->slice_end > now)) {
+        bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+    } else {
+        bs->slice_start = now;
+        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
+
+        bs->io_disps.bytes[is_write]  = 0;
+        bs->io_disps.bytes[!is_write] = 0;
+
+        bs->io_disps.ios[is_write]    = 0;
+        bs->io_disps.ios[!is_write]   = 0;
+    }
+
+    /* If a limit was exceeded, immediately queue this request */
+    if (qemu_block_queue_has_pending(bs->block_queue)) {
+        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
+            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+            if (wait) {
+                *wait = -1;
+            }
+
+            return true;
+        }
+    }
+
+    elapsed_time  = now - bs->slice_start;
+    elapsed_time  /= (NANOSECONDS_PER_SECOND);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                      elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        now = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end < now + max_wait) {
+            bs->slice_end = now + max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
index a3e69db..10d2828 100644
--- a/block.h
+++ b/block.h
@@ -107,7 +107,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
 void bdrv_register(BlockDriver *bdrv);
 
-
 typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
-- 
1.7.6

* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-20 12:34           ` Marcelo Tosatti
  2011-09-21  3:14             ` Zhi Yong Wu
  2011-09-21  7:03             ` Zhi Yong Wu
@ 2011-09-26  8:15             ` Zhi Yong Wu
From: Zhi Yong Wu @ 2011-09-26  8:15 UTC
  To: Marcelo Tosatti
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Tue, Sep 20, 2011 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Mon, Sep 19, 2011 at 05:55:41PM +0800, Zhi Yong Wu wrote:
>> On Wed, Sep 14, 2011 at 6:50 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
>> >> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
>> >> >> Note:
>> >> >>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
>> >> >
>> >> > You can increase the length of the slice, if the request is larger than
>> >> > slice_time * bps_limit.
>> >> Yeah, but it is a challenge to figure out how to increase it. Do you have a nice idea?
>> >
>> > If the queue is empty, and the request being processed does not fit the
>> > queue, increase the slice so that the request fits.
>> Sorry for the late reply. Actually, do you think that this scenario is
>> meaningful for the user?
>> Even if we handle this, if the user limits the bps below 512
>> bytes/second, the VM still cannot run every task.
>> Can you let us know why we need to make such an effort?
>
> It would be good to handle requests larger than the slice.
>
> It is not strictly necessary, but in case it's not handled, a minimum
> should be in place, to reflect the maximum known request size. Being able to
> specify something which crashes is not acceptable.
Hi, Marcelo,

Any comments? I have posted the implementation based on your suggestions.

>
>



-- 
Regards,

Zhi Yong Wu


* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-23 16:19   ` Kevin Wolf
@ 2011-09-26  7:24     ` Zhi Yong Wu
From: Zhi Yong Wu @ 2011-09-26  7:24 UTC
  To: Kevin Wolf
  Cc: aliguori, stefanha, kvm, mtosatti, qemu-devel, pair, ryanh, Zhi Yong Wu

On Sat, Sep 24, 2011 at 12:19 AM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 08.09.2011 12:11, schrieb Zhi Yong Wu:
>> Note:
>>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
>>      2.) When the "dd" command is issued in the guest, if its option bs is set to a large value such as "bs=1024K", the resulting speed will be slightly bigger than the limits.
>>
>> For these problems, if you have any nice thoughts, please let us know. :)
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> ---
>>  block.c |  259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>  block.h |    1 -
>>  2 files changed, 248 insertions(+), 12 deletions(-)
>
> One general comment: What about synchronous and/or coroutine I/O
> operations? Do you think they are just not important enough to consider
> here or were they forgotten?
For sync ops, we assume that they will be converted into async mode at
some point in the future, right?
For coroutine I/O, it is introduced in the image driver layer, behind
bdrv_aio_readv/writev. I think that we need not consider it, right?

>
> Also, do I understand correctly that you're always submitting the whole
Right, when the block timer fires, it will flush the whole request queue.
> queue at once? Does this effectively enforce the limit all the time or
> will it lead to some peaks and then no requests at all for a while until
In fact, it only tries to submit the enqueued requests one by one. If
one fails to pass the limit check, that request is enqueued again.
> the average is right again?
Yeah, it is possible. Do you have a better idea?
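To make the current dispatch behaviour concrete, here is a sketch in C
of the flush loop described above; BlockIORequest, queue_pop() and
queue_push_front() are hypothetical stand-ins for the blk-queue.c
internals, not actual code from this series:

/* Sketch of the timer-driven flush: requests are retried one by one,
 * and a request that still exceeds the limits is requeued until the
 * timer fires again. */
static void block_queue_flush_sketch(BlockDriverState *bs, BlockQueue *queue)
{
    BlockIORequest *req;                       /* hypothetical struct */

    while ((req = queue_pop(queue)) != NULL) { /* hypothetical helper */
        int64_t wait_time = -1;

        if (bdrv_exceed_io_limits(bs, req->nb_sectors,
                                  req->is_write, &wait_time)) {
            /* Still over budget: requeue and rearm the timer. */
            queue_push_front(queue, req);      /* hypothetical helper */
            if (wait_time != -1) {
                qemu_mod_timer(bs->block_timer,
                               wait_time + qemu_get_clock_ns(vm_clock));
            }
            break;
        }

        /* Within budget: hand the request back to the driver. */
        req->handler(bs, req->sector_num, req->qiov,
                     req->nb_sectors, req->cb, req->opaque);
    }
}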
>
> Maybe some documentation on how it all works from a high level
> perspective would be helpful.
>
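As a condensed high-level summary (distilled from the patch itself,
not a literal excerpt):
     1.) Each BlockDriverState keeps a time slice [slice_start, slice_end)
of BLOCK_IO_SLICE_TIME ns; when a slice expires, the per-slice byte and
iop counters in bs->io_disps are reset to zero.
     2.) If the block queue already holds pending requests, any new request
is queued unconditionally (*wait = -1) so that ordering is preserved.
     3.) Otherwise the budget for the current slice is bytes_limit =
bps_limit * slice_time (and ios_limit = iops_limit * slice_time); a
request that fits is dispatched and charged to the counters.
     4.) A request that does not fit is queued and the block timer is armed
with wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
for example, with bps_limit=1000000 and a 0.1 s slice, a 61440-byte
request arriving when 90000 bytes are already dispatched at
elapsed_time 0.05 s waits (90000 + 61440) / 1000000 - 0.05 ≈ 0.10 s.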
>> diff --git a/block.c b/block.c
>> index cd75183..c08fde8 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -30,6 +30,9 @@
>>  #include "qemu-objects.h"
>>  #include "qemu-coroutine.h"
>>
>> +#include "qemu-timer.h"
>> +#include "block/blk-queue.h"
>> +
>>  #ifdef CONFIG_BSD
>>  #include <sys/types.h>
>>  #include <sys/stat.h>
>> @@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
>>                                           QEMUIOVector *iov);
>>  static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, int64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -745,6 +755,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_enable(bs);
>> +    }
>> +
>>      return 0;
>>
>>  unlink_and_fail:
>> @@ -783,6 +798,18 @@ void bdrv_close(BlockDriverState *bs)
>>          if (bs->change_cb)
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>> +
>> +    /* throttling disk I/O limits */
>> +    if (bs->block_queue) {
>> +        qemu_del_block_queue(bs->block_queue);
>> +        bs->block_queue = NULL;
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +        bs->block_timer = NULL;
>> +    }
>
> Why not io_limits_disable() instead of copying the code here?
Good point, thanks.
>
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -2341,16 +2368,48 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>                                   BlockDriverCompletionFunc *cb, void *opaque)
>>  {
>>      BlockDriver *drv = bs->drv;
>> -
>> +    BlockDriverAIOCB *ret;
>> +    int64_t wait_time = -1;
>> +printf("sector_num=%ld, nb_sectors=%d\n", sector_num, nb_sectors);
>
> Debugging leftover (more of them follow, won't comment on each one)
Removed.
>
>>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>>          return NULL;
>> +    }
>
> This part is unrelated.
I have changed it back to the original.
>
>> +
>> +    /* throttling disk read I/O */
>> +    if (bs->io_limits_enabled) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
>> +                           sector_num, qiov, nb_sectors, cb, opaque);
>> +            printf("wait_time=%ld\n", wait_time);
>> +            if (wait_time != -1) {
>> +                printf("reset block timer\n");
>> +                qemu_mod_timer(bs->block_timer,
>> +                               wait_time + qemu_get_clock_ns(vm_clock));
>> +            }
>> +
>> +            if (ret) {
>> +                printf("ori ret is not null\n");
>> +            } else {
>> +                printf("ori ret is null\n");
>> +            }
>> +
>> +            return ret;
>> +        }
>> +    }
>>
>> -    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>> +    ret =  drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>>                                 cb, opaque);
>> +    if (ret) {
>> +        if (bs->io_limits_enabled) {
>> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
>> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
>> +        }
>
> I wonder if you can't reuse bs->nr_bytes/nr_ops instead of introducing a
> second counting mechanism. Would have the advantage that numbers are
No, our counting variables will be reset to zero when the current slice
time (0.1 s) is used up.
> actually consistent (your metric counts slightly differently than the
> existing info blockstats one).
Yeah, I noticed this, and I don't think there's anything wrong with it. What about you?
>
>> +    }
>> +
>> +    return ret;
>>  }
>>
>>  typedef struct BlockCompleteData {
>> @@ -2396,15 +2455,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>      BlockDriver *drv = bs->drv;
>>      BlockDriverAIOCB *ret;
>>      BlockCompleteData *blk_cb_data;
>> +    int64_t wait_time = -1;
>>
>>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bs->read_only)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bs->read_only
>> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
>>          return NULL;
>> +    }
>
> Again, unrelated changes.
I have changed it back to the original.
>
>>
>>      if (bs->dirty_bitmap) {
>>          blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
>> @@ -2413,13 +2471,32 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>          opaque = blk_cb_data;
>>      }
>>
>> +    /* throttling disk write I/O */
>> +    if (bs->io_limits_enabled) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
>> +                                  sector_num, qiov, nb_sectors, cb, opaque);
>> +            if (wait_time != -1) {
>> +                qemu_mod_timer(bs->block_timer,
>> +                               wait_time + qemu_get_clock_ns(vm_clock));
>> +            }
>> +
>> +            return ret;
>> +        }
>> +    }
>
> This looks very similar to the code in bdrv_aio_readv. Can it be moved
> into a common function?
Good advice, done. Thanks.

>
> Kevin
>



-- 
Regards,

Zhi Yong Wu


* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-08 10:11 ` [PATCH v8 3/4] block: add block timer and throttling algorithm Zhi Yong Wu
  2011-09-09 14:44   ` Marcelo Tosatti
@ 2011-09-23 16:19   ` Kevin Wolf
  2011-09-26  7:24     ` Zhi Yong Wu
From: Kevin Wolf @ 2011-09-23 16:19 UTC
  To: Zhi Yong Wu
  Cc: qemu-devel, kvm, stefanha, mtosatti, aliguori, ryanh, zwu.kernel, pair

Am 08.09.2011 12:11, schrieb Zhi Yong Wu:
> Note:
>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
>      2.) When the "dd" command is issued in the guest, if its option bs is set to a large value such as "bs=1024K", the resulting speed will be slightly bigger than the limits.
> 
> For these problems, if you have any nice thoughts, please let us know. :)
> 
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> ---
>  block.c |  259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  block.h |    1 -
>  2 files changed, 248 insertions(+), 12 deletions(-)

One general comment: What about synchronous and/or coroutine I/O
operations? Do you think they are just not important enough to consider
here or were they forgotten?

Also, do I understand correctly that you're always submitting the whole
queue at once? Does this effectively enforce the limit all the time or
will it lead to some peaks and then no requests at all for a while until
the average is right again?

Maybe some documentation on how it all works from a high level
perspective would be helpful.

> diff --git a/block.c b/block.c
> index cd75183..c08fde8 100644
> --- a/block.c
> +++ b/block.c
> @@ -30,6 +30,9 @@
>  #include "qemu-objects.h"
>  #include "qemu-coroutine.h"
>  
> +#include "qemu-timer.h"
> +#include "block/blk-queue.h"
> +
>  #ifdef CONFIG_BSD
>  #include <sys/types.h>
>  #include <sys/stat.h>
> @@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
>                                           QEMUIOVector *iov);
>  static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
>  
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +        double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, int64_t *wait);
> +
>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>  
> @@ -745,6 +755,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>      }
>  
> +    /* throttling disk I/O limits */
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_enable(bs);
> +    }
> +
>      return 0;
>  
>  unlink_and_fail:
> @@ -783,6 +798,18 @@ void bdrv_close(BlockDriverState *bs)
>          if (bs->change_cb)
>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>      }
> +
> +    /* throttling disk I/O limits */
> +    if (bs->block_queue) {
> +        qemu_del_block_queue(bs->block_queue);
> +        bs->block_queue = NULL;
> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +        bs->block_timer = NULL;
> +    }

Why not io_limits_disable() instead of copying the code here?

>  }
>  
>  void bdrv_close_all(void)
> @@ -2341,16 +2368,48 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>                                   BlockDriverCompletionFunc *cb, void *opaque)
>  {
>      BlockDriver *drv = bs->drv;
> -
> +    BlockDriverAIOCB *ret;
> +    int64_t wait_time = -1;
> +printf("sector_num=%ld, nb_sectors=%d\n", sector_num, nb_sectors);

Debugging leftover (more of them follow, won't comment on each one)

>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>  
> -    if (!drv)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>          return NULL;
> +    }

This part is unrelated.

> +
> +    /* throttling disk read I/O */
> +    if (bs->io_limits_enabled) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
> +                           sector_num, qiov, nb_sectors, cb, opaque);
> +            printf("wait_time=%ld\n", wait_time);
> +            if (wait_time != -1) {
> +                printf("reset block timer\n");
> +                qemu_mod_timer(bs->block_timer,
> +                               wait_time + qemu_get_clock_ns(vm_clock));
> +            }
> +
> +            if (ret) {
> +                printf("ori ret is not null\n");
> +            } else {
> +                printf("ori ret is null\n");
> +            }
> +
> +            return ret;
> +        }
> +    }
>  
> -    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
> +    ret =  drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>                                 cb, opaque);
> +    if (ret) {
> +        if (bs->io_limits_enabled) {
> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
> +        }

I wonder if you can't reuse bs->nr_bytes/nr_ops instead of introducing a
second counting mechanism. Would have the advantage that numbers are
actually consistent (your metric counts slightly differently than the
existing info blockstats one).

> +    }
> +
> +    return ret;
>  }
>  
>  typedef struct BlockCompleteData {
> @@ -2396,15 +2455,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>      BlockDriver *drv = bs->drv;
>      BlockDriverAIOCB *ret;
>      BlockCompleteData *blk_cb_data;
> +    int64_t wait_time = -1;
>  
>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>  
> -    if (!drv)
> -        return NULL;
> -    if (bs->read_only)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bs->read_only
> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
>          return NULL;
> +    }

Again, unrelated changes.

>  
>      if (bs->dirty_bitmap) {
>          blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
> @@ -2413,13 +2471,32 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>          opaque = blk_cb_data;
>      }
>  
> +    /* throttling disk write I/O */
> +    if (bs->io_limits_enabled) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
> +                                  sector_num, qiov, nb_sectors, cb, opaque);
> +            if (wait_time != -1) {
> +                qemu_mod_timer(bs->block_timer,
> +                               wait_time + qemu_get_clock_ns(vm_clock));
> +            }
> +
> +            return ret;
> +        }
> +    }

This looks very similar to the code in bdrv_aio_readv. Can it be moved
into a common function?

Kevin


* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-20 12:34           ` Marcelo Tosatti
  2011-09-21  3:14             ` Zhi Yong Wu
@ 2011-09-21  7:03             ` Zhi Yong Wu
  2011-09-26  8:15             ` Zhi Yong Wu
From: Zhi Yong Wu @ 2011-09-21  7:03 UTC
  To: Marcelo Tosatti
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Tue, Sep 20, 2011 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Mon, Sep 19, 2011 at 05:55:41PM +0800, Zhi Yong Wu wrote:
>> On Wed, Sep 14, 2011 at 6:50 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
>> >> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
>> >> >> Note:
>> >> >>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
>> >> >
>> >> > You can increase the length of the slice, if the request is larger than
>> >> > slice_time * bps_limit.
>> >> Yeah, but it is a challenge to figure out how to increase it. Do you have a nice idea?
>> >
>> > If the queue is empty, and the request being processed does not fit the
>> > queue, increase the slice so that the request fits.
>> Sorry for the late reply. Actually, do you think that this scenario is
>> meaningful for the user?
>> Even if we handle this, if the user limits the bps below 512
>> bytes/second, the VM still cannot run every task.
>> Can you let us know why we need to make such an effort?
>
> It would be good to handle requests larger than the slice.
Below are the code changes for your approach. I used simple tracing and did a
dd test on the guest, then found that only the first r/w request is handled
and subsequent requests are enqueued. After several minutes, the guest prints
the info below on its terminal:
INFO: task kdmflush:326 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.

I'm not sure if it is correct. Do you have a better way to verify it?

>
> It is not strictly necessary, but in case it's not handled, a minimum
> should be in place, to reflect the maximum known request size. Being able to
> specify something which crashes is not acceptable.
>
>

diff --git a/block.c b/block.c
index af19784..f88c22a 100644
--- a/block.c
+++ b/block.c
@@ -132,9 +132,10 @@ void bdrv_io_limits_disable(BlockDriverState *bs)
         bs->block_timer     = NULL;
     }

-    bs->slice_start = 0;
-
-    bs->slice_end   = 0;
+    bs->slice_time    = 0;
+    bs->slice_start   = 0;
+    bs->slice_end     = 0;
+    bs->first_time_rw = false;
 }

 static void bdrv_block_timer(void *opaque)
@@ -151,9 +152,10 @@ void bdrv_io_limits_enable(BlockDriverState *bs)
     bs->block_queue = qemu_new_block_queue();
     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);

+    bs->slice_time  = BLOCK_IO_SLICE_TIME;
     bs->slice_start = qemu_get_clock_ns(vm_clock);
-
-    bs->slice_end   = bs->slice_start + BLOCK_IO_SLICE_TIME;
+    bs->slice_end   = bs->slice_start + bs->slice_time;
+    bs->first_time_rw = true;
 }

 bool bdrv_io_limits_enabled(BlockDriverState *bs)
@@ -2846,11 +2848,23 @@ static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
     /* Calc approx time to dispatch */
     wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;

-    if (wait) {
-        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
-    }
+    if (!bs->first_time_rw
+        || !qemu_block_queue_is_empty(bs->block_queue)) {
+        if (wait) {
+            *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        }

-    return true;
+        return true;
+    } else {
+        bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        bs->slice_end += bs->slice_time - BLOCK_IO_SLICE_TIME;
+        if (wait) {
+            *wait = 0;
+        }
+
+        bs->first_time_rw = false;
+        return false;
+    }
 }

 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
@@ -2895,11 +2909,23 @@ static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
         wait_time = 0;
     }

-    if (wait) {
-        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
-    }
+    if (!bs->first_time_rw
+        || !qemu_block_queue_is_empty(bs->block_queue)) {
+        if (wait) {
+            *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        }

-    return true;
+        return true;
+    } else {
+        bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        bs->slice_end += bs->slice_time - BLOCK_IO_SLICE_TIME;
+        if (wait) {
+            *wait = 0;
+        }
+
+        bs->first_time_rw = false;
+        return false;
+    }
 }

 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
@@ -2912,10 +2938,10 @@ static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
     now = qemu_get_clock_ns(vm_clock);
     if ((bs->slice_start < now)
         && (bs->slice_end > now)) {
-        bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+        bs->slice_end = now + bs->slice_time;
     } else {
         bs->slice_start = now;
-        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
+        bs->slice_end   = now + bs->slice_time;

         bs->io_disps.bytes[is_write]  = 0;
         bs->io_disps.bytes[!is_write] = 0;
diff --git a/block/blk-queue.c b/block/blk-queue.c
index adef497..04e52ad 100644
--- a/block/blk-queue.c
+++ b/block/blk-queue.c
@@ -199,3 +199,8 @@ bool qemu_block_queue_has_pending(BlockQueue *queue)
 {
     return !queue->flushing && !QTAILQ_EMPTY(&queue->requests);
 }
+
+bool qemu_block_queue_is_empty(BlockQueue *queue)
+{
+    return QTAILQ_EMPTY(&queue->requests);
+}
diff --git a/block/blk-queue.h b/block/blk-queue.h
index c1529f7..d3b379b 100644
--- a/block/blk-queue.h
+++ b/block/blk-queue.h
@@ -56,4 +56,6 @@ void qemu_block_queue_flush(BlockQueue *queue);

 bool qemu_block_queue_has_pending(BlockQueue *queue);

+bool qemu_block_queue_is_empty(BlockQueue *queue);
+
 #endif /* QEMU_BLOCK_QUEUE_H */
diff --git a/block_int.h b/block_int.h
index 93c0d56..5eb007d 100644
--- a/block_int.h
+++ b/block_int.h
@@ -199,6 +199,7 @@ struct BlockDriverState {
     void *sync_aiocb;

     /* the time for latest disk I/O */
+    int64_t slice_time;
     int64_t slice_start;
     int64_t slice_end;
     BlockIOLimit io_limits;
@@ -206,6 +207,7 @@ struct BlockDriverState {
     BlockQueue   *block_queue;
     QEMUTimer    *block_timer;
     bool         io_limits_enabled;
+    bool         first_time_rw;

     /* I/O stats (display with "info blockstats"). */
     uint64_t nr_bytes[BDRV_MAX_IOTYPE];
diff --git a/blockdev.c b/blockdev.c
index 63bd2b5..67d5a50 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -782,6 +782,8 @@ int do_block_set_io_throttle(Monitor *mon,
     bs->io_limits.iops[BLOCK_IO_LIMIT_READ]  = iops_rd;
     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] = iops_wr;

+    bs->slice_time = BLOCK_IO_SLICE_TIME;
+
     if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
         bdrv_io_limits_enable(bs);
     } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {





-- 
Regards,

Zhi Yong Wu


* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-21  3:14             ` Zhi Yong Wu
@ 2011-09-21  5:54               ` Zhi Yong Wu
From: Zhi Yong Wu @ 2011-09-21  5:54 UTC
  To: Marcelo Tosatti
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Wed, Sep 21, 2011 at 11:14 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Tue, Sep 20, 2011 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Mon, Sep 19, 2011 at 05:55:41PM +0800, Zhi Yong Wu wrote:
>>> On Wed, Sep 14, 2011 at 6:50 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>> > On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
>>> >> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>> >> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
>>> >> >> Note:
>>> >> >>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
>>> >> >
>>> >> > You can increase the length of the slice, if the request is larger than
>>> >> > slice_time * bps_limit.
>>> >> Yeah, but it is a challenge to figure out how to increase it. Do you have a nice idea?
>>> >
>>> > If the queue is empty, and the request being processed does not fit the
>>> > queue, increase the slice so that the request fits.
>>> Sorry for the late reply. Actually, do you think that this scenario is
>>> meaningful for the user?
>>> Even if we handle this, if the user limits the bps below 512
>>> bytes/second, the VM still cannot run every task.
>>> Can you let us know why we need to make such an effort?
>>
>> It would be good to handle requests larger than the slice.
> OK. Let me spend some time trying your approach.
>>
>> It is not strictly necessary, but in case it's not handled, a minimum
>> should be in place, to reflect the maximum known request size. Being able to
> In fact, slice_time has been made dynamic now, and is adjusted within some range.
Sorry, I made a mistake. Currently it is fixed.
>> specify something which crashes is not acceptable.
> Do you mean that a warning should be displayed if the specified
> limit is smaller than the minimum capability?
>
>>
>>
>
>
>
> --
> Regards,
>
> Zhi Yong Wu
>



-- 
Regards,

Zhi Yong Wu


* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-20 12:34           ` Marcelo Tosatti
@ 2011-09-21  3:14             ` Zhi Yong Wu
  2011-09-21  5:54               ` Zhi Yong Wu
  2011-09-21  7:03             ` Zhi Yong Wu
  2011-09-26  8:15             ` Zhi Yong Wu
From: Zhi Yong Wu @ 2011-09-21  3:14 UTC
  To: Marcelo Tosatti
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Tue, Sep 20, 2011 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Mon, Sep 19, 2011 at 05:55:41PM +0800, Zhi Yong Wu wrote:
>> On Wed, Sep 14, 2011 at 6:50 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
>> >> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
>> >> >> Note:
>> >> >>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
>> >> >
>> >> > You can increase the length of the slice, if the request is larger than
>> >> > slice_time * bps_limit.
>> >> Yeah, but it is a challenge to figure out how to increase it. Do you have a nice idea?
>> >
>> > If the queue is empty, and the request being processed does not fit the
>> > queue, increase the slice so that the request fits.
>> Sorry for the late reply. Actually, do you think that this scenario is
>> meaningful for the user?
>> Even if we handle this, if the user limits the bps below 512
>> bytes/second, the VM still cannot run every task.
>> Can you let us know why we need to make such an effort?
>
> It would be good to handle requests larger than the slice.
OK. Let me spend some time trying your approach.
>
> It is not strictly necessary, but in case it's not handled, a minimum
> should be in place, to reflect the maximum known request size. Being able to
In fact, slice_time has been made dynamic now, and is adjusted within some range.
> specify something which crashes is not acceptable.
Do you mean that a warning should be displayed if the specified
limit is smaller than the minimum capability?

>
>



-- 
Regards,

Zhi Yong Wu


* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-19  9:55         ` Zhi Yong Wu
@ 2011-09-20 12:34           ` Marcelo Tosatti
  2011-09-21  3:14             ` Zhi Yong Wu
                               ` (2 more replies)
From: Marcelo Tosatti @ 2011-09-20 12:34 UTC
  To: Zhi Yong Wu
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Mon, Sep 19, 2011 at 05:55:41PM +0800, Zhi Yong Wu wrote:
> On Wed, Sep 14, 2011 at 6:50 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
> >> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
> >> >> Note:
> >> >>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
> >> >
> >> > You can increase the length of the slice, if the request is larger than
> >> > slice_time * bps_limit.
> >> Yeah, but it is a challenge to figure out how to increase it. Do you have a nice idea?
> >
> > If the queue is empty, and the request being processed does not fit the
> > queue, increase the slice so that the request fits.
> Sorry for the late reply. Actually, do you think that this scenario is
> meaningful for the user?
> Even if we handle this, if the user limits the bps below 512
> bytes/second, the VM still cannot run every task.
> Can you let us know why we need to make such an effort?

It would be good to handle requests larger than the slice.

It is not strictly necessary, but in case it's not handled, a minimum
should be in place, to reflect the maximum known request size. Being able to
specify something which crashes is not acceptable.
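As a sketch of the kind of minimum being asked for, applied where the
limits are configured; BDRV_MIN_BPS_LIMIT and the helper below are
illustrative assumptions, not code from this series:

/* Hypothetical guard: refuse a bps limit so small that no request of
 * a known maximum size could ever be dispatched within a slice. */
#define BDRV_MIN_BPS_LIMIT (BDRV_SECTOR_SIZE * 10)  /* assumed threshold */

static bool bdrv_io_limits_too_small(BlockIOLimit *limits)
{
    return (limits->bps[BLOCK_IO_LIMIT_TOTAL] &&
            limits->bps[BLOCK_IO_LIMIT_TOTAL] < BDRV_MIN_BPS_LIMIT) ||
           (limits->bps[BLOCK_IO_LIMIT_READ] &&
            limits->bps[BLOCK_IO_LIMIT_READ] < BDRV_MIN_BPS_LIMIT) ||
           (limits->bps[BLOCK_IO_LIMIT_WRITE] &&
            limits->bps[BLOCK_IO_LIMIT_WRITE] < BDRV_MIN_BPS_LIMIT);
}

do_block_set_io_throttle() could then reject such a configuration up
front instead of letting the guest stall.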



* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-14 10:50       ` Marcelo Tosatti
@ 2011-09-19  9:55         ` Zhi Yong Wu
  2011-09-20 12:34           ` Marcelo Tosatti
From: Zhi Yong Wu @ 2011-09-19  9:55 UTC
  To: Marcelo Tosatti
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Wed, Sep 14, 2011 at 6:50 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
>> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
>> >> Note:
>> >>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, the VM will hang up. We are considering how to handle this scenario.
>> >
>> > You can increase the length of the slice, if the request is larger than
>> > slice_time * bps_limit.
>> Yeah, but it is a challenge to figure out how to increase it. Do you have a nice idea?
>
> If the queue is empty, and the request being processed does not fit the
> queue, increase the slice so that the request fits.
Sorry for the late reply. Actually, do you think that this scenario is
meaningful for the user?
Even if we handle this, if the user limits the bps below 512
bytes/second, the VM still cannot run every task.
Can you let us know why we need to make such an effort?

>
> That is, make BLOCK_IO_SLICE_TIME dynamic and adjust it as described
> above (if the bps or io limits change, reset it to the default
> BLOCK_IO_SLICE_TIME).
>
>> >>      2.) When the "dd" command is issued in the guest, if its option bs is set to a large value such as "bs=1024K", the resulting speed will be slightly bigger than the limits.
>> >
>> > Why?
>> This issue does not actually exist. I will remove it.
>> When drive bps=1000000, I did some tests on the guest VM.
>> 1.) bs=1024K
>> 18+0 records in
>> 18+0 records out
>> 18874368 bytes (19 MB) copied, 26.6268 s, 709 kB/s
>> 2.) bs=2048K
>> 18+0 records in
>> 18+0 records out
>> 37748736 bytes (38 MB) copied, 46.5336 s, 811 kB/s
>>
>> >
>> > There are lots of debugging leftovers in the patch.
>> Sorry, I forgot to remove them.
>> >
>> >
>
>



-- 
Regards,

Zhi Yong Wu


* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-13  3:09     ` Zhi Yong Wu
@ 2011-09-14 10:50       ` Marcelo Tosatti
  2011-09-19  9:55         ` Zhi Yong Wu
  0 siblings, 1 reply; 14+ messages in thread
From: Marcelo Tosatti @ 2011-09-14 10:50 UTC (permalink / raw)
  To: Zhi Yong Wu
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
> >> Note:
> >>      1.) When the bps/iops limits are set to a small value such as 511 bytes/s, the VM will hang. We are considering how to handle this scenario.
> >
> > You can increase the length of the slice, if the request is larger than
> > slice_time * bps_limit.
> Yeah, but how to increase it is a challenge. Do you have any good ideas?

If the queue is empty, and the request being processed does not fit in
the current slice, increase the slice so that the request fits.

That is, make BLOCK_IO_SLICE_TIME dynamic and adjust it as described
above (if the bps or io limits change, reset it to the default
BLOCK_IO_SLICE_TIME).
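
A minimal sketch of that adjustment, assuming a hypothetical per-device
slice_time field (the field and the helper below are illustrative; the
posted patch still uses a fixed BLOCK_IO_SLICE_TIME):

    /* Grow the slice so that request_bytes <= slice_time * bps_limit,
     * rounding up; callers reset slice_time to the default whenever
     * the bps/iops limits are reconfigured. */
    static void bdrv_fit_request_in_slice(BlockDriverState *bs,
                                          uint64_t request_bytes,
                                          uint64_t bps_limit)
    {
        uint64_t needed_ns = (request_bytes * NANOSECONDS_PER_SECOND
                              + bps_limit - 1) / bps_limit;

        if (needed_ns > bs->slice_time) {
            bs->slice_time = needed_ns;  /* the request now fits in one slice */
            bs->slice_end  = bs->slice_start + bs->slice_time;
        }
    }

With something like this in place, a 511 bytes/s limit would produce a
very long slice rather than a hang.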

> >>      2.) When the "dd" command is issued in the guest, if its bs option is set to a large value such as "bs=1024K", the resulting speed will be slightly higher than the limit.
> >
> > Why?
> This issue does not actually exist; I will remove the note.
> With drive bps=1000000, I did some tests in the guest VM.
> 1.) bs=1024K
> 18+0 records in
> 18+0 records out
> 18874368 bytes (19 MB) copied, 26.6268 s, 709 kB/s
> 2.) bs=2048K
> 18+0 records in
> 18+0 records out
> 37748736 bytes (38 MB) copied, 46.5336 s, 811 kB/s
> 
> >
> > There is lots of debugging leftovers in the patch.
> Sorry, I forgot to remove them.
> >
> >


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-09 14:44   ` Marcelo Tosatti
@ 2011-09-13  3:09     ` Zhi Yong Wu
  2011-09-14 10:50       ` Marcelo Tosatti
  0 siblings, 1 reply; 14+ messages in thread
From: Zhi Yong Wu @ 2011-09-13  3:09 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Zhi Yong Wu, qemu-devel, kvm, stefanha, aliguori, ryanh, kwolf, pair

On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
>> Note:
>>      1.) When the bps/iops limits are set to a small value such as 511 bytes/s, the VM will hang. We are considering how to handle this scenario.
>
> You can increase the length of the slice, if the request is larger than
> slice_time * bps_limit.
Yeah, but how to increase it is a challenge. Do you have any good ideas?

>
>>      2.) When the "dd" command is issued in the guest, if its bs option is set to a large value such as "bs=1024K", the resulting speed will be slightly higher than the limit.
>
> Why?
This issue does not actually exist; I will remove the note.
With drive bps=1000000, I did some tests in the guest VM.
1.) bs=1024K
18+0 records in
18+0 records out
18874368 bytes (19 MB) copied, 26.6268 s, 709 kB/s
2.) bs=2048K
18+0 records in
18+0 records out
37748736 bytes (38 MB) copied, 46.5336 s, 811 kB/s

>
> There is lots of debugging leftovers in the patch.
Sorry, I forgot to remove them.
>
>



-- 
Regards,

Zhi Yong Wu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-08 10:11 ` [PATCH v8 3/4] block: add block timer and throttling algorithm Zhi Yong Wu
@ 2011-09-09 14:44   ` Marcelo Tosatti
  2011-09-13  3:09     ` Zhi Yong Wu
  2011-09-23 16:19   ` Kevin Wolf
  1 sibling, 1 reply; 14+ messages in thread
From: Marcelo Tosatti @ 2011-09-09 14:44 UTC (permalink / raw)
  To: Zhi Yong Wu
  Cc: qemu-devel, kvm, stefanha, aliguori, ryanh, zwu.kernel, kwolf, pair

On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
> Note:
>      1.) When the bps/iops limits are set to a small value such as 511 bytes/s, the VM will hang. We are considering how to handle this scenario.

You can increase the length of the slice, if the request is larger than
slice_time * bps_limit.

>      2.) When the "dd" command is issued in the guest, if its bs option is set to a large value such as "bs=1024K", the resulting speed will be slightly higher than the limit.

Why?

There is lots of debugging leftovers in the patch.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v8 3/4] block: add block timer and throttling algorithm
  2011-09-08 10:11 [PATCH v8 0/4] The intro of QEMU block I/O throttling Zhi Yong Wu
@ 2011-09-08 10:11 ` Zhi Yong Wu
  2011-09-09 14:44   ` Marcelo Tosatti
  2011-09-23 16:19   ` Kevin Wolf
  0 siblings, 2 replies; 14+ messages in thread
From: Zhi Yong Wu @ 2011-09-08 10:11 UTC (permalink / raw)
  To: qemu-devel
  Cc: kvm, stefanha, mtosatti, aliguori, ryanh, zwu.kernel, kwolf,
	pair, Zhi Yong Wu

Note:
     1.) When the bps/iops limits are set to a small value such as 511 bytes/s, the VM will hang. We are considering how to handle this scenario.
     2.) When the "dd" command is issued in the guest, if its bs option is set to a large value such as "bs=1024K", the resulting speed will be slightly higher than the limit.

If you have any good ideas about these problems, please let us know. :)

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 block.c |  259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 block.h |    1 -
 2 files changed, 248 insertions(+), 12 deletions(-)

diff --git a/block.c b/block.c
index cd75183..c08fde8 100644
--- a/block.c
+++ b/block.c
@@ -30,6 +30,9 @@
 #include "qemu-objects.h"
 #include "qemu-coroutine.h"
 
+#include "qemu-timer.h"
+#include "block/blk-queue.h"
+
 #ifdef CONFIG_BSD
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          QEMUIOVector *iov);
 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -745,6 +755,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -783,6 +798,18 @@ void bdrv_close(BlockDriverState *bs)
         if (bs->change_cb)
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
+
+    /* throttling disk I/O limits */
+    if (bs->block_queue) {
+        qemu_del_block_queue(bs->block_queue);
+        bs->block_queue = NULL;
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
 }
 
 void bdrv_close_all(void)
@@ -2341,16 +2368,48 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
-
+    BlockDriverAIOCB *ret;
+    int64_t wait_time = -1;
+printf("sector_num=%ld, nb_sectors=%d\n", sector_num, nb_sectors);
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
+
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
+                           sector_num, qiov, nb_sectors, cb, opaque);
+            printf("wait_time=%ld\n", wait_time);
+            if (wait_time != -1) {
+                printf("reset block timer\n");
+                qemu_mod_timer(bs->block_timer,
+                               wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            if (ret) {
+                printf("ori ret is not null\n");
+            } else {
+                printf("ori ret is null\n");
+            }
+
+            return ret;
+        }
+    }
 
-    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+    ret =  drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
+    if (ret) {
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+        }
+    }
+
+    return ret;
 }
 
 typedef struct BlockCompleteData {
@@ -2396,15 +2455,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
     BlockCompleteData *blk_cb_data;
+    int64_t wait_time = -1;
 
     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bs->read_only)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bs->read_only
+        || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
 
     if (bs->dirty_bitmap) {
         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2413,13 +2471,32 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         opaque = blk_cb_data;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
+                                  sector_num, qiov, nb_sectors, cb, opaque);
+            if (wait_time != -1) {
+                qemu_mod_timer(bs->block_timer,
+                               wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            return ret;
+        }
+    }
+
     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
-
     if (ret) {
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
+
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
+                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
+        }
     }
 
     return ret;
@@ -2684,6 +2761,166 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->pool->cancel(acb);
 }
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait) {
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_disp, bytes_res;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    bytes_limit = bps_limit * slice_time;
+    bytes_disp  = bs->io_disps.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_disp += bs->io_disps.bytes[!is_write];
+    }
+
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_disp + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    printf("1 wait=%ld\n", *wait);
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait) {
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_disp;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    ios_limit  = iops_limit * slice_time;
+    ios_disp   = bs->io_disps.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_disp += bs->io_disps.ios[!is_write];
+    }
+
+    if (ios_disp + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_disp + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, int64_t *wait) {
+    int64_t  now, max_wait;
+    uint64_t bps_wait = 0, iops_wait = 0;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    now = qemu_get_clock_ns(vm_clock);
+    if ((bs->slice_start < now)
+        && (bs->slice_end > now)) {
+        bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+    } else {
+        bs->slice_start = now;
+        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
+
+        bs->io_disps.bytes[is_write]  = 0;
+        bs->io_disps.bytes[!is_write] = 0;
+
+        bs->io_disps.ios[is_write]    = 0;
+        bs->io_disps.ios[!is_write]   = 0;
+    }
+
+    /* If a limit was exceeded, immediately queue this request */
+    if (qemu_block_queue_has_pending(bs->block_queue)) {
+        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
+            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+            if (wait) {
+                *wait = -1;
+            }
+
+            return true;
+        }
+    }
+
+    elapsed_time  = now - bs->slice_start;
+    elapsed_time  /= (NANOSECONDS_PER_SECOND);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                      elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        now = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end < now + max_wait) {
+            bs->slice_end = now + max_wait;
+        }
+
+        printf("end wait=%ld\n", *wait);
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
index a3e69db..10d2828 100644
--- a/block.h
+++ b/block.h
@@ -107,7 +107,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
 void bdrv_register(BlockDriver *bdrv);
 
-
 typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2011-09-26  8:15 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-09-07 12:41 [PATCH v8 3/4] block: add block timer and throttling algorithm Zhi Yong Wu
2011-09-07 12:41 ` [Qemu-devel] " Zhi Yong Wu
2011-09-08 10:11 [PATCH v8 0/4] The intro of QEMU block I/O throttling Zhi Yong Wu
2011-09-08 10:11 ` [PATCH v8 3/4] block: add block timer and throttling algorithm Zhi Yong Wu
2011-09-09 14:44   ` Marcelo Tosatti
2011-09-13  3:09     ` Zhi Yong Wu
2011-09-14 10:50       ` Marcelo Tosatti
2011-09-19  9:55         ` Zhi Yong Wu
2011-09-20 12:34           ` Marcelo Tosatti
2011-09-21  3:14             ` Zhi Yong Wu
2011-09-21  5:54               ` Zhi Yong Wu
2011-09-21  7:03             ` Zhi Yong Wu
2011-09-26  8:15             ` Zhi Yong Wu
2011-09-23 16:19   ` Kevin Wolf
2011-09-26  7:24     ` Zhi Yong Wu
