From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> To: qemu-devel@nongnu.org Cc: kvm@vger.kernel.org, aliguori@us.ibm.com, stefanha@linux.vnet.ibm.com, raharper@us.ibm.com, kwolf@redhat.com, mtosatti@redhat.com, zwu.kernel@gmail.com, luowenj@cn.ibm.com, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> Subject: [PATCH v4 3/3] The support for queue timer and throttling algorithm Date: Mon, 1 Aug 2011 14:25:55 +0800 [thread overview] Message-ID: <1312179955-23536-4-git-send-email-wuzhy@linux.vnet.ibm.com> (raw) In-Reply-To: <1312179955-23536-1-git-send-email-wuzhy@linux.vnet.ibm.com> Note: 1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario. 2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits. For these problems, if you have nice thought, pls let us know.:) Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> --- block.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- block.h | 1 - block_int.h | 29 ++++++ 3 files changed, 323 insertions(+), 9 deletions(-) diff --git a/block.c b/block.c index 24a25d5..42763a3 100644 --- a/block.c +++ b/block.c @@ -29,6 +29,9 @@ #include "module.h" #include "qemu-objects.h" +#include "qemu-timer.h" +#include "block/blk-queue.h" + #ifdef CONFIG_BSD #include <sys/types.h> #include <sys/stat.h> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, uint64_t *wait); + static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename) } #endif +static int bdrv_io_limits_enable(BlockIOLimit *io_limits) +{ + if ((io_limits->bps[0] == 0) + && (io_limits->bps[1] == 0) + && (io_limits->bps[2] == 0) + && (io_limits->iops[0] == 0) + && (io_limits->iops[1] == 0) + && (io_limits->iops[2] == 0)) { + return 0; + } + + return 1; +} + /* check if the path starts with "<protocol>:" */ static int path_has_protocol(const char *path) { @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size, } } +static void bdrv_block_timer(void *opaque) +{ + BlockDriverState *bs = opaque; + BlockQueue *queue = bs->block_queue; + + while (!QTAILQ_EMPTY(&queue->requests)) { + BlockIORequest *request = NULL; + int ret = 0; + + request = QTAILQ_FIRST(&queue->requests); + QTAILQ_REMOVE(&queue->requests, request, entry); + + ret = qemu_block_queue_handler(request); + if (ret == 0) { + QTAILQ_INSERT_HEAD(&queue->requests, request, entry); + break; + } + + qemu_free(request); + } +} + void bdrv_register(BlockDriver *bdrv) { if (!bdrv->bdrv_aio_readv) { @@ -642,6 +688,19 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, bs->change_cb(bs->change_opaque, CHANGE_MEDIA); } + /* throttling disk I/O limits */ + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; + bs->block_queue = qemu_new_block_queue(); + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); + + bs->slice_start[0] = qemu_get_clock_ns(vm_clock); + bs->slice_start[1] = qemu_get_clock_ns(vm_clock); + + bs->slice_end[0] = qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME; + bs->slice_end[1] = qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME; + } + return 0; unlink_and_fail: @@ -680,6 +739,16 @@ void bdrv_close(BlockDriverState *bs) if (bs->change_cb) bs->change_cb(bs->change_opaque, CHANGE_MEDIA); } + + /* throttling disk I/O limits */ + if (bs->block_queue) { + qemu_del_block_queue(bs->block_queue); + } + + if (bs->block_timer) { + qemu_del_timer(bs->block_timer); + qemu_free_timer(bs->block_timer); + } } void bdrv_close_all(void) @@ -1312,6 +1381,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, *psecs = bs->secs; } +/* throttling disk io limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits) +{ + memset(&bs->io_limits, 0, sizeof(BlockIOLimit)); + bs->io_limits = *io_limits; +} + /* Recognize floppy formats */ typedef struct FDFormat { FDriveType drive; @@ -2111,6 +2188,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) return buf; } +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait) { + uint64_t bps_limit = 0; + double bytes_limit, bytes_disp, bytes_res; + double slice_time, wait_time; + + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.bps[is_write]) { + bps_limit = bs->io_limits.bps[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end[is_write] - bs->slice_start[is_write]; + slice_time /= (BLOCK_IO_SLICE_TIME * 10.0); + bytes_limit = bps_limit * slice_time; + bytes_disp = bs->io_disps.bytes[is_write]; + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bytes_disp += bs->io_disps.bytes[!is_write]; + } + + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + + if (bytes_disp + bytes_res <= bytes_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time; + + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait) { + uint64_t iops_limit = 0; + double ios_limit, ios_disp; + double slice_time, wait_time; + + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.iops[is_write]) { + iops_limit = bs->io_limits.iops[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end[is_write] - bs->slice_start[is_write]; + slice_time /= (BLOCK_IO_SLICE_TIME * 10.0); + ios_limit = iops_limit * slice_time; + ios_disp = bs->io_disps.ios[is_write]; + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + ios_disp += bs->io_disps.ios[!is_write]; + } + + if (ios_disp + 1 <= ios_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (ios_disp + 1) / iops_limit; + if (wait_time > elapsed_time) { + wait_time = wait_time - elapsed_time; + } else { + wait_time = 0; + } + + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, uint64_t *wait) { + int64_t real_time, real_slice; + uint64_t bps_wait = 0, iops_wait = 0, max_wait; + double elapsed_time; + int bps_ret, iops_ret; + + real_time = qemu_get_clock_ns(vm_clock); + real_slice = bs->slice_end[is_write] - bs->slice_start[is_write]; + if ((bs->slice_start[is_write] < real_time) + && (bs->slice_end[is_write] > real_time)) { + bs->slice_end[is_write] = real_time + BLOCK_IO_SLICE_TIME; + } else { + bs->slice_start[is_write] = real_time; + bs->slice_end[is_write] = real_time + BLOCK_IO_SLICE_TIME; + + bs->io_disps.bytes[is_write] = 0; + bs->io_disps.bytes[!is_write] = 0; + + bs->io_disps.ios[is_write] = 0; + bs->io_disps.ios[!is_write] = 0; + } + + /* If a limit was exceeded, immediately queue this request */ + if ((bs->req_from_queue == false) + && !QTAILQ_EMPTY(&bs->block_queue->requests)) { + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] + || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write] + || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + if (wait) { + *wait = 0; + } + + return true; + } + } + + elapsed_time = real_time - bs->slice_start[is_write]; + elapsed_time /= (BLOCK_IO_SLICE_TIME * 10.0); + + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, + is_write, elapsed_time, &bps_wait); + iops_ret = bdrv_exceed_iops_limits(bs, is_write, + elapsed_time, &iops_wait); + if (bps_ret || iops_ret) { + max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; + if (wait) { + *wait = max_wait; + } + + real_time = qemu_get_clock_ns(vm_clock); + if (bs->slice_end[is_write] < real_time + max_wait) { + bs->slice_end[is_write] = real_time + max_wait; + } + + return true; + } + + if (wait) { + *wait = 0; + } + + return false; +} /**************************************************************/ /* async I/Os */ @@ -2121,13 +2357,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, { BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; + uint64_t wait_time = 0; trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - if (!drv) - return NULL; - if (bdrv_check_request(bs, sector_num, nb_sectors)) + if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) { + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; + } return NULL; + } + + /* throttling disk read I/O */ + if (bdrv_io_limits_enable(&bs->io_limits)) { + if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) { + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv, + sector_num, qiov, nb_sectors, cb, opaque); + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + bs->req_from_queue = false; + return ret; + } + } ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, cb, opaque); @@ -2136,6 +2387,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, /* Update stats even though technically transfer has not happened. */ bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE; bs->rd_ops ++; + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] += + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++; + } + } + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; } return ret; @@ -2184,15 +2445,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; BlockCompleteData *blk_cb_data; + uint64_t wait_time = 0; trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - if (!drv) - return NULL; - if (bs->read_only) - return NULL; - if (bdrv_check_request(bs, sector_num, nb_sectors)) + if (!drv || bs->read_only + || bdrv_check_request(bs, sector_num, nb_sectors)) { + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; + } + return NULL; + } if (bs->dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, @@ -2201,6 +2465,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, opaque = blk_cb_data; } + /* throttling disk write I/O */ + if (bdrv_io_limits_enable(&bs->io_limits)) { + if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) { + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev, + sector_num, qiov, nb_sectors, cb, opaque); + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + bs->req_from_queue = false; + return ret; + } + } + ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, cb, opaque); @@ -2211,6 +2487,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { bs->wr_highest_sector = sector_num + nb_sectors - 1; } + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] += + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++; + } + } + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; } return ret; diff --git a/block.h b/block.h index 859d1d9..f0dac62 100644 --- a/block.h +++ b/block.h @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, const char *backing_fmt); void bdrv_register(BlockDriver *bdrv); - typedef struct BdrvCheckResult { int corruptions; int leaks; diff --git a/block_int.h b/block_int.h index 1e265d2..1ca826b 100644 --- a/block_int.h +++ b/block_int.h @@ -27,10 +27,17 @@ #include "block.h" #include "qemu-option.h" #include "qemu-queue.h" +#include "block/blk-queue.h" #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 +#define BLOCK_IO_LIMIT_READ 0 +#define BLOCK_IO_LIMIT_WRITE 1 +#define BLOCK_IO_LIMIT_TOTAL 2 + +#define BLOCK_IO_SLICE_TIME 100000000 + #define BLOCK_OPT_SIZE "size" #define BLOCK_OPT_ENCRYPT "encryption" #define BLOCK_OPT_COMPAT6 "compat6" @@ -46,6 +53,16 @@ typedef struct AIOPool { BlockDriverAIOCB *free_aiocb; } AIOPool; +typedef struct BlockIOLimit { + uint64_t bps[3]; + uint64_t iops[3]; +} BlockIOLimit; + +typedef struct BlockIODisp { + uint64_t bytes[2]; + uint64_t ios[2]; +} BlockIODisp; + struct BlockDriver { const char *format_name; int instance_size; @@ -175,6 +192,15 @@ struct BlockDriverState { void *sync_aiocb; + /* the time for latest disk I/O */ + int64_t slice_start[2]; + int64_t slice_end[2]; + BlockIOLimit io_limits; + BlockIODisp io_disps; + BlockQueue *block_queue; + QEMUTimer *block_timer; + bool req_from_queue; + /* I/O stats (display with "info blockstats"). */ uint64_t rd_bytes; uint64_t wr_bytes; @@ -222,6 +248,9 @@ void qemu_aio_release(void *p); void *qemu_blockalign(BlockDriverState *bs, size_t size); +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits); + #ifdef _WIN32 int is_windows_drive(const char *filename); #endif -- 1.7.2.3
WARNING: multiple messages have this Message-ID (diff)
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> To: qemu-devel@nongnu.org Cc: kwolf@redhat.com, aliguori@us.ibm.com, stefanha@linux.vnet.ibm.com, kvm@vger.kernel.org, mtosatti@redhat.com, zwu.kernel@gmail.com, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>, luowenj@cn.ibm.com, raharper@us.ibm.com Subject: [Qemu-devel] [PATCH v4 3/3] The support for queue timer and throttling algorithm Date: Mon, 1 Aug 2011 14:25:55 +0800 [thread overview] Message-ID: <1312179955-23536-4-git-send-email-wuzhy@linux.vnet.ibm.com> (raw) In-Reply-To: <1312179955-23536-1-git-send-email-wuzhy@linux.vnet.ibm.com> Note: 1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario. 2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits. For these problems, if you have nice thought, pls let us know.:) Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> --- block.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- block.h | 1 - block_int.h | 29 ++++++ 3 files changed, 323 insertions(+), 9 deletions(-) diff --git a/block.c b/block.c index 24a25d5..42763a3 100644 --- a/block.c +++ b/block.c @@ -29,6 +29,9 @@ #include "module.h" #include "qemu-objects.h" +#include "qemu-timer.h" +#include "block/blk-queue.h" + #ifdef CONFIG_BSD #include <sys/types.h> #include <sys/stat.h> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, uint64_t *wait); + static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename) } #endif +static int bdrv_io_limits_enable(BlockIOLimit *io_limits) +{ + if ((io_limits->bps[0] == 0) + && (io_limits->bps[1] == 0) + && (io_limits->bps[2] == 0) + && (io_limits->iops[0] == 0) + && (io_limits->iops[1] == 0) + && (io_limits->iops[2] == 0)) { + return 0; + } + + return 1; +} + /* check if the path starts with "<protocol>:" */ static int path_has_protocol(const char *path) { @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size, } } +static void bdrv_block_timer(void *opaque) +{ + BlockDriverState *bs = opaque; + BlockQueue *queue = bs->block_queue; + + while (!QTAILQ_EMPTY(&queue->requests)) { + BlockIORequest *request = NULL; + int ret = 0; + + request = QTAILQ_FIRST(&queue->requests); + QTAILQ_REMOVE(&queue->requests, request, entry); + + ret = qemu_block_queue_handler(request); + if (ret == 0) { + QTAILQ_INSERT_HEAD(&queue->requests, request, entry); + break; + } + + qemu_free(request); + } +} + void bdrv_register(BlockDriver *bdrv) { if (!bdrv->bdrv_aio_readv) { @@ -642,6 +688,19 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, bs->change_cb(bs->change_opaque, CHANGE_MEDIA); } + /* throttling disk I/O limits */ + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; + bs->block_queue = qemu_new_block_queue(); + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); + + bs->slice_start[0] = qemu_get_clock_ns(vm_clock); + bs->slice_start[1] = qemu_get_clock_ns(vm_clock); + + bs->slice_end[0] = qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME; + bs->slice_end[1] = qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME; + } + return 0; unlink_and_fail: @@ -680,6 +739,16 @@ void bdrv_close(BlockDriverState *bs) if (bs->change_cb) bs->change_cb(bs->change_opaque, CHANGE_MEDIA); } + + /* throttling disk I/O limits */ + if (bs->block_queue) { + qemu_del_block_queue(bs->block_queue); + } + + if (bs->block_timer) { + qemu_del_timer(bs->block_timer); + qemu_free_timer(bs->block_timer); + } } void bdrv_close_all(void) @@ -1312,6 +1381,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, *psecs = bs->secs; } +/* throttling disk io limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits) +{ + memset(&bs->io_limits, 0, sizeof(BlockIOLimit)); + bs->io_limits = *io_limits; +} + /* Recognize floppy formats */ typedef struct FDFormat { FDriveType drive; @@ -2111,6 +2188,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) return buf; } +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait) { + uint64_t bps_limit = 0; + double bytes_limit, bytes_disp, bytes_res; + double slice_time, wait_time; + + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.bps[is_write]) { + bps_limit = bs->io_limits.bps[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end[is_write] - bs->slice_start[is_write]; + slice_time /= (BLOCK_IO_SLICE_TIME * 10.0); + bytes_limit = bps_limit * slice_time; + bytes_disp = bs->io_disps.bytes[is_write]; + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bytes_disp += bs->io_disps.bytes[!is_write]; + } + + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + + if (bytes_disp + bytes_res <= bytes_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time; + + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait) { + uint64_t iops_limit = 0; + double ios_limit, ios_disp; + double slice_time, wait_time; + + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.iops[is_write]) { + iops_limit = bs->io_limits.iops[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end[is_write] - bs->slice_start[is_write]; + slice_time /= (BLOCK_IO_SLICE_TIME * 10.0); + ios_limit = iops_limit * slice_time; + ios_disp = bs->io_disps.ios[is_write]; + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + ios_disp += bs->io_disps.ios[!is_write]; + } + + if (ios_disp + 1 <= ios_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (ios_disp + 1) / iops_limit; + if (wait_time > elapsed_time) { + wait_time = wait_time - elapsed_time; + } else { + wait_time = 0; + } + + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, uint64_t *wait) { + int64_t real_time, real_slice; + uint64_t bps_wait = 0, iops_wait = 0, max_wait; + double elapsed_time; + int bps_ret, iops_ret; + + real_time = qemu_get_clock_ns(vm_clock); + real_slice = bs->slice_end[is_write] - bs->slice_start[is_write]; + if ((bs->slice_start[is_write] < real_time) + && (bs->slice_end[is_write] > real_time)) { + bs->slice_end[is_write] = real_time + BLOCK_IO_SLICE_TIME; + } else { + bs->slice_start[is_write] = real_time; + bs->slice_end[is_write] = real_time + BLOCK_IO_SLICE_TIME; + + bs->io_disps.bytes[is_write] = 0; + bs->io_disps.bytes[!is_write] = 0; + + bs->io_disps.ios[is_write] = 0; + bs->io_disps.ios[!is_write] = 0; + } + + /* If a limit was exceeded, immediately queue this request */ + if ((bs->req_from_queue == false) + && !QTAILQ_EMPTY(&bs->block_queue->requests)) { + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] + || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write] + || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + if (wait) { + *wait = 0; + } + + return true; + } + } + + elapsed_time = real_time - bs->slice_start[is_write]; + elapsed_time /= (BLOCK_IO_SLICE_TIME * 10.0); + + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, + is_write, elapsed_time, &bps_wait); + iops_ret = bdrv_exceed_iops_limits(bs, is_write, + elapsed_time, &iops_wait); + if (bps_ret || iops_ret) { + max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; + if (wait) { + *wait = max_wait; + } + + real_time = qemu_get_clock_ns(vm_clock); + if (bs->slice_end[is_write] < real_time + max_wait) { + bs->slice_end[is_write] = real_time + max_wait; + } + + return true; + } + + if (wait) { + *wait = 0; + } + + return false; +} /**************************************************************/ /* async I/Os */ @@ -2121,13 +2357,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, { BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; + uint64_t wait_time = 0; trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - if (!drv) - return NULL; - if (bdrv_check_request(bs, sector_num, nb_sectors)) + if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) { + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; + } return NULL; + } + + /* throttling disk read I/O */ + if (bdrv_io_limits_enable(&bs->io_limits)) { + if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) { + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv, + sector_num, qiov, nb_sectors, cb, opaque); + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + bs->req_from_queue = false; + return ret; + } + } ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, cb, opaque); @@ -2136,6 +2387,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, /* Update stats even though technically transfer has not happened. */ bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE; bs->rd_ops ++; + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] += + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++; + } + } + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; } return ret; @@ -2184,15 +2445,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; BlockCompleteData *blk_cb_data; + uint64_t wait_time = 0; trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - if (!drv) - return NULL; - if (bs->read_only) - return NULL; - if (bdrv_check_request(bs, sector_num, nb_sectors)) + if (!drv || bs->read_only + || bdrv_check_request(bs, sector_num, nb_sectors)) { + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; + } + return NULL; + } if (bs->dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, @@ -2201,6 +2465,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, opaque = blk_cb_data; } + /* throttling disk write I/O */ + if (bdrv_io_limits_enable(&bs->io_limits)) { + if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) { + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev, + sector_num, qiov, nb_sectors, cb, opaque); + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + bs->req_from_queue = false; + return ret; + } + } + ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, cb, opaque); @@ -2211,6 +2487,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { bs->wr_highest_sector = sector_num + nb_sectors - 1; } + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] += + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++; + } + } + + if (bdrv_io_limits_enable(&bs->io_limits)) { + bs->req_from_queue = false; } return ret; diff --git a/block.h b/block.h index 859d1d9..f0dac62 100644 --- a/block.h +++ b/block.h @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, const char *backing_fmt); void bdrv_register(BlockDriver *bdrv); - typedef struct BdrvCheckResult { int corruptions; int leaks; diff --git a/block_int.h b/block_int.h index 1e265d2..1ca826b 100644 --- a/block_int.h +++ b/block_int.h @@ -27,10 +27,17 @@ #include "block.h" #include "qemu-option.h" #include "qemu-queue.h" +#include "block/blk-queue.h" #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 +#define BLOCK_IO_LIMIT_READ 0 +#define BLOCK_IO_LIMIT_WRITE 1 +#define BLOCK_IO_LIMIT_TOTAL 2 + +#define BLOCK_IO_SLICE_TIME 100000000 + #define BLOCK_OPT_SIZE "size" #define BLOCK_OPT_ENCRYPT "encryption" #define BLOCK_OPT_COMPAT6 "compat6" @@ -46,6 +53,16 @@ typedef struct AIOPool { BlockDriverAIOCB *free_aiocb; } AIOPool; +typedef struct BlockIOLimit { + uint64_t bps[3]; + uint64_t iops[3]; +} BlockIOLimit; + +typedef struct BlockIODisp { + uint64_t bytes[2]; + uint64_t ios[2]; +} BlockIODisp; + struct BlockDriver { const char *format_name; int instance_size; @@ -175,6 +192,15 @@ struct BlockDriverState { void *sync_aiocb; + /* the time for latest disk I/O */ + int64_t slice_start[2]; + int64_t slice_end[2]; + BlockIOLimit io_limits; + BlockIODisp io_disps; + BlockQueue *block_queue; + QEMUTimer *block_timer; + bool req_from_queue; + /* I/O stats (display with "info blockstats"). */ uint64_t rd_bytes; uint64_t wr_bytes; @@ -222,6 +248,9 @@ void qemu_aio_release(void *p); void *qemu_blockalign(BlockDriverState *bs, size_t size); +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits); + #ifdef _WIN32 int is_windows_drive(const char *filename); #endif -- 1.7.2.3
next prev parent reply other threads:[~2011-08-01 6:29 UTC|newest] Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top 2011-08-01 6:25 [PATCH v4 0/3] The intro for QEMU disk I/O limits Zhi Yong Wu 2011-08-01 6:25 ` [Qemu-devel] " Zhi Yong Wu 2011-08-01 6:25 ` [PATCH v4 1/3] The cmd support for QEMU block I/O throttling Zhi Yong Wu 2011-08-01 6:25 ` [Qemu-devel] " Zhi Yong Wu 2011-08-01 6:25 ` [PATCH v4 2/3] The support for block queue Zhi Yong Wu 2011-08-01 6:25 ` [Qemu-devel] " Zhi Yong Wu 2011-08-01 20:21 ` Ryan Harper 2011-08-01 20:21 ` Ryan Harper 2011-08-05 2:57 ` Zhi Yong Wu 2011-08-05 2:57 ` Zhi Yong Wu 2011-08-01 6:25 ` Zhi Yong Wu [this message] 2011-08-01 6:25 ` [Qemu-devel] [PATCH v4 3/3] The support for queue timer and throttling algorithm Zhi Yong Wu 2011-08-01 20:39 ` Ryan Harper 2011-08-01 20:39 ` Ryan Harper 2011-08-05 2:48 ` Zhi Yong Wu 2011-08-05 2:48 ` Zhi Yong Wu 2011-08-01 20:06 ` [PATCH v4 0/3] The intro for QEMU disk I/O limits Ryan Harper 2011-08-01 20:06 ` [Qemu-devel] " Ryan Harper 2011-08-05 2:20 ` Zhi Yong Wu 2011-08-05 2:20 ` Zhi Yong Wu
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=1312179955-23536-4-git-send-email-wuzhy@linux.vnet.ibm.com \ --to=wuzhy@linux.vnet.ibm.com \ --cc=aliguori@us.ibm.com \ --cc=kvm@vger.kernel.org \ --cc=kwolf@redhat.com \ --cc=luowenj@cn.ibm.com \ --cc=mtosatti@redhat.com \ --cc=qemu-devel@nongnu.org \ --cc=raharper@us.ibm.com \ --cc=stefanha@linux.vnet.ibm.com \ --cc=zwu.kernel@gmail.com \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: linkBe sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.