From mboxrd@z Thu Jan 1 00:00:00 1970 From: Marcelo Tosatti Subject: Re: [PATCH v2 1/1] The codes V2 for QEMU disk I/O limits. Date: Tue, 26 Jul 2011 16:26:18 -0300 Message-ID: <20110726192618.GA8126@amt.cnet> References: <1311670746-20498-1-git-send-email-wuzhy@linux.vnet.ibm.com> <1311670746-20498-2-git-send-email-wuzhy@linux.vnet.ibm.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org, aliguori@us.ibm.com, stefanha@linux.vnet.ibm.com, ryanh@us.ibm.com, kwolf@redhat.com, vgoyal@redhat.com, zwu.kernel@gmail.com, luowenj@cn.ibm.com To: Zhi Yong Wu Return-path: Received: from mx1.redhat.com ([209.132.183.28]:26381 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753322Ab1GZT1z (ORCPT ); Tue, 26 Jul 2011 15:27:55 -0400 Content-Disposition: inline In-Reply-To: <1311670746-20498-2-git-send-email-wuzhy@linux.vnet.ibm.com> Sender: kvm-owner@vger.kernel.org List-ID: On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote: > Welcome to give me your comments, thanks. > > Signed-off-by: Zhi Yong Wu > --- > Makefile.objs | 2 +- > block.c | 288 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > block.h | 1 - > block/blk-queue.c | 116 +++++++++++++++++++++ > block/blk-queue.h | 70 +++++++++++++ > block_int.h | 28 +++++ > blockdev.c | 21 ++++ > qemu-config.c | 24 +++++ > qemu-option.c | 17 +++ > qemu-option.h | 1 + > qemu-options.hx | 1 + > 11 files changed, 559 insertions(+), 10 deletions(-) > create mode 100644 block/blk-queue.c > create mode 100644 block/blk-queue.h > > diff --git a/Makefile.objs b/Makefile.objs > index 9f99ed4..06f2033 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv > block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o > block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o > block-nested-y += qed-check.o > -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o > +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o > block-nested-$(CONFIG_WIN32) += raw-win32.o > block-nested-$(CONFIG_POSIX) += raw-posix.o > block-nested-$(CONFIG_CURL) += curl.o > diff --git a/block.c b/block.c > index 24a25d5..e54e59c 100644 > --- a/block.c > +++ b/block.c > @@ -29,6 +29,9 @@ > #include "module.h" > #include "qemu-objects.h" > > +#include "qemu-timer.h" > +#include "block/blk-queue.h" > + > #ifdef CONFIG_BSD > #include > #include > @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, > static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, > const uint8_t *buf, int nb_sectors); > > +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, double elapsed_time, uint64_t *wait); > +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > + double elapsed_time, uint64_t *wait); > +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, uint64_t *wait); > + > static QTAILQ_HEAD(, BlockDriverState) bdrv_states = > QTAILQ_HEAD_INITIALIZER(bdrv_states); > > @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename) > } > #endif > > +static int bdrv_io_limits_enable(BlockIOLimit *io_limits) > +{ > + if ((io_limits->bps[0] == 0) > + && (io_limits->bps[1] == 0) > + && (io_limits->bps[2] == 0) > + && (io_limits->iops[0] == 0) > + && (io_limits->iops[1] == 0) > + && (io_limits->iops[2] == 0)) { > + return 0; > + } > + > + return 1; > +} > + > /* check if the path starts with ":" */ > static int path_has_protocol(const char *path) > { > @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size, > } > } > > +static void bdrv_block_timer(void *opaque) > +{ > + BlockDriverState *bs = opaque; > + BlockQueue *queue = bs->block_queue; > + > + while (!QTAILQ_EMPTY(&queue->requests)) { > + BlockIORequest *request; > + int ret; > + > + request = QTAILQ_FIRST(&queue->requests); > + QTAILQ_REMOVE(&queue->requests, request, entry); > + > + ret = qemu_block_queue_handler(request); > + if (ret == 0) { > + QTAILQ_INSERT_HEAD(&queue->requests, request, entry); > + break; > + } > + > + qemu_free(request); > + } > +} > + > void bdrv_register(BlockDriver *bdrv) > { > if (!bdrv->bdrv_aio_readv) { > @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, > bs->change_cb(bs->change_opaque, CHANGE_MEDIA); > } > > + /* throttling disk I/O limits */ > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->block_queue = qemu_new_block_queue(); > + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); > + > + bs->slice_start[0] = qemu_get_clock_ns(vm_clock); > + bs->slice_start[1] = qemu_get_clock_ns(vm_clock); > + } > + It should be possible to tune the limits on the flight, please introduce QMP commands for that. > return 0; > > unlink_and_fail: > @@ -680,6 +735,16 @@ void bdrv_close(BlockDriverState *bs) > if (bs->change_cb) > bs->change_cb(bs->change_opaque, CHANGE_MEDIA); > } > + > + /* throttling disk I/O limits */ > + if (bs->block_queue) { > + qemu_del_block_queue(bs->block_queue); > + } > + > + if (bs->block_timer) { > + qemu_del_timer(bs->block_timer); > + qemu_free_timer(bs->block_timer); > + } > } > > void bdrv_close_all(void) > @@ -1312,6 +1377,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, > *psecs = bs->secs; > } > > +/* throttling disk io limits */ > +void bdrv_set_io_limits(BlockDriverState *bs, > + BlockIOLimit *io_limits) > +{ > + memset(&bs->io_limits, 0, sizeof(BlockIOLimit)); > + bs->io_limits = *io_limits; > +} > + > /* Recognize floppy formats */ > typedef struct FDFormat { > FDriveType drive; > @@ -2111,6 +2184,155 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) > return buf; > } > > +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, double elapsed_time, uint64_t *wait) { > + uint64_t bps_limit = 0; > + double bytes_limit, bytes_disp, bytes_res; > + double slice_time = 0.1, wait_time; > + > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; > + } else if (bs->io_limits.bps[is_write]) { > + bps_limit = bs->io_limits.bps[is_write]; > + } else { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + bytes_limit = bps_limit * slice_time; > + bytes_disp = bs->io_disps.bytes[is_write]; > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + bytes_disp += bs->io_disps.bytes[!is_write]; > + } > + > + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; Virtio can submit requests of 512 sectors or more... does not play well with 1MB/sec limit. > + if (bytes_disp + bytes_res <= bytes_limit) { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + /* Calc approx time to dispatch */ > + wait_time = (bytes_disp + bytes_res - bytes_limit) / bps_limit; > + if (!wait_time) { > + wait_time = 1; > + } > + > + wait_time = wait_time + (slice_time - elapsed_time); > + if (wait) { > + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1; > + } The guest can keep submitting requests where "wait_time = 1" above, and the timer will be rearmed continuously in the future. Can't you simply arm the timer to the next slice start? _Some_ data must be transfered by then, anyway (and nothing can be transfered earlier than that). Same for iops calculation below. > + > + return true; > +} > + > +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > + double elapsed_time, uint64_t *wait) { > + uint64_t iops_limit = 0; > + double ios_limit, ios_disp; > + double slice_time = 0.1, wait_time; > + > + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; > + } else if (bs->io_limits.iops[is_write]) { > + iops_limit = bs->io_limits.iops[is_write]; > + } else { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + ios_limit = iops_limit * slice_time; > + ios_disp = bs->io_disps.ios[is_write]; > + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + ios_disp += bs->io_disps.ios[!is_write]; > + } > + > + if (ios_disp + 1 <= ios_limit) { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + /* Calc approx time to dispatch */ > + wait_time = (ios_disp + 1) / iops_limit; > + if (wait_time > elapsed_time) { > + wait_time = wait_time - elapsed_time; > + } else { > + wait_time = 0; > + } > + > + if (wait) { > + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1; > + } > + > + return true; > +} > + > +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, uint64_t *wait) { > + int64_t real_time; > + uint64_t bps_wait = 0, iops_wait = 0, max_wait; > + double elapsed_time; > + int bps_ret, iops_ret; > + > + real_time = qemu_get_clock_ns(vm_clock); > + if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) { > + bs->slice_start[is_write] = real_time; > + > + bs->io_disps.bytes[is_write] = 0; > + bs->io_disps.bytes[!is_write] = 0; > + > + bs->io_disps.ios[is_write] = 0; > + bs->io_disps.ios[!is_write] = 0; > + } > + > + /* If a limit was exceeded, immediately queue this request */ > + if ((bs->req_from_queue == false) > + && !QTAILQ_EMPTY(&bs->block_queue->requests)) { > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] > + || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write] > + || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + if (wait) { > + *wait = 0; > + } > + > + return true; > + } > + } > + > + elapsed_time = real_time - bs->slice_start[is_write]; > + elapsed_time /= (BLOCK_IO_SLICE_TIME * 10.0); > + > + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, > + is_write, elapsed_time, &bps_wait); > + iops_ret = bdrv_exceed_iops_limits(bs, is_write, > + elapsed_time, &iops_wait); > + if (bps_ret || iops_ret) { > + max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; > + if (wait) { > + *wait = max_wait; > + } > + > + return true; > + } > + > + if (wait) { > + *wait = 0; > + } > + > + return false; > +} > > /**************************************************************/ > /* async I/Os */ > @@ -2121,13 +2343,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, > { > BlockDriver *drv = bs->drv; > BlockDriverAIOCB *ret; > + uint64_t wait_time = 0; > > trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); > > - if (!drv) > - return NULL; > - if (bdrv_check_request(bs, sector_num, nb_sectors)) > + if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) { > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > + } > return NULL; > + } > + > + /* throttling disk read I/O */ > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) { > + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv, > + sector_num, qiov, nb_sectors, cb, opaque); > + qemu_mod_timer(bs->block_timer, > + wait_time + qemu_get_clock_ns(vm_clock)); > + bs->req_from_queue = false; > + return ret; > + } > + } > > ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, > cb, opaque); > @@ -2136,6 +2373,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, > /* Update stats even though technically transfer has not happened. */ > bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > bs->rd_ops ++; > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] += > + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > + bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++; > + } > + } > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > } > > return ret; > @@ -2184,15 +2431,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, > BlockDriver *drv = bs->drv; > BlockDriverAIOCB *ret; > BlockCompleteData *blk_cb_data; > + uint64_t wait_time = 0; > > trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); > > - if (!drv) > - return NULL; > - if (bs->read_only) > - return NULL; > - if (bdrv_check_request(bs, sector_num, nb_sectors)) > + if (!drv || bs->read_only > + || bdrv_check_request(bs, sector_num, nb_sectors)) { > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > + } > + > return NULL; > + } > > if (bs->dirty_bitmap) { > blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, > @@ -2201,6 +2451,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, > opaque = blk_cb_data; > } > > + /* throttling disk write I/O */ > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) { > + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev, > + sector_num, qiov, nb_sectors, cb, opaque); > + qemu_mod_timer(bs->block_timer, > + wait_time + qemu_get_clock_ns(vm_clock)); > + bs->req_from_queue = false; > + return ret; > + } > + } > + > ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, > cb, opaque); > > @@ -2211,6 +2473,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, > if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { > bs->wr_highest_sector = sector_num + nb_sectors - 1; > } > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] += > + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > + bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++; > + } > + } > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > } > > return ret; > diff --git a/block.h b/block.h > index 859d1d9..f0dac62 100644 > --- a/block.h > +++ b/block.h > @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs, > const char *backing_file, const char *backing_fmt); > void bdrv_register(BlockDriver *bdrv); > > - > typedef struct BdrvCheckResult { > int corruptions; > int leaks; > diff --git a/block/blk-queue.c b/block/blk-queue.c > new file mode 100644 > index 0000000..09fcfe9 > --- /dev/null > +++ b/block/blk-queue.c > @@ -0,0 +1,116 @@ > +/* > + * QEMU System Emulator queue definition for block layer > + * > + * Copyright (c) 2011 Zhi Yong Wu > + * > + * Permission is hereby granted, free of charge, to any person obtaining a copy > + * of this software and associated documentation files (the "Software"), to deal > + * in the Software without restriction, including without limitation the rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > +#include "block_int.h" > +#include "qemu-queue.h" > +#include "block/blk-queue.h" > + > +/* The APIs for block request queue on qemu block layer. > + */ > + > +static void qemu_block_queue_cancel(BlockDriverAIOCB *acb) > +{ > + qemu_aio_release(acb); > +} > + > +static AIOPool block_queue_pool = { > + .aiocb_size = sizeof(struct BlockDriverAIOCB), > + .cancel = qemu_block_queue_cancel, > +}; > + > +static void qemu_block_queue_callback(void *opaque, int ret) > +{ > + BlockDriverAIOCB *acb = opaque; > + > + qemu_aio_release(acb); > +} > + > +BlockQueue *qemu_new_block_queue(void) > +{ > + BlockQueue *queue; > + > + queue = qemu_mallocz(sizeof(BlockQueue)); > + > + QTAILQ_INIT(&queue->requests); > + > + return queue; > +} > + > +void qemu_del_block_queue(BlockQueue *queue) > +{ > + BlockIORequest *request, *next; > + > + QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) { > + QTAILQ_REMOVE(&queue->requests, request, entry); > + qemu_free(request); > + } > + > + qemu_free(queue); > +} > + > +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue, > + BlockDriverState *bs, > + BlockRequestHandler *handler, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque) > +{ > + BlockIORequest *request; > + BlockDriverAIOCB *acb; > + > + request = qemu_malloc(sizeof(BlockIORequest)); > + request->bs = bs; > + request->handler = handler; > + request->sector_num = sector_num; > + request->qiov = qiov; > + request->nb_sectors = nb_sectors; > + request->cb = cb; > + request->opaque = opaque; > + > + QTAILQ_INSERT_TAIL(&queue->requests, request, entry); > + > + acb = qemu_aio_get(&block_queue_pool, bs, > + qemu_block_queue_callback, opaque); > + > + return acb; > +} > + > +int qemu_block_queue_handler(BlockIORequest *request) > +{ > + int ret; > + BlockDriverAIOCB *res; > + > + /* indicate this req is from block queue */ > + request->bs->req_from_queue = true; > + > + res = request->handler(request->bs, request->sector_num, > + request->qiov, request->nb_sectors, > + request->cb, request->opaque); > + > + ret = (res == NULL) ? 0 : 1; > + > + return ret; > +} > diff --git a/block/blk-queue.h b/block/blk-queue.h > new file mode 100644 > index 0000000..47f8a36 > --- /dev/null > +++ b/block/blk-queue.h > @@ -0,0 +1,70 @@ > +/* > + * QEMU System Emulator queue declaration for block layer > + * > + * Copyright (c) 2011 Zhi Yong Wu > + * > + * Permission is hereby granted, free of charge, to any person obtaining a copy > + * of this software and associated documentation files (the "Software"), to deal > + * in the Software without restriction, including without limitation the rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > +#ifndef QEMU_BLOCK_QUEUE_H > +#define QEMU_BLOCK_QUEUE_H > + > +#include "block.h" > +#include "qemu-queue.h" > +#include "qemu-common.h" > + > +typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, > + int nb_sectors, BlockDriverCompletionFunc *cb, > + void *opaque); > + > +struct BlockIORequest { > + QTAILQ_ENTRY(BlockIORequest) entry; > + BlockDriverState *bs; > + BlockRequestHandler *handler; > + int64_t sector_num; > + QEMUIOVector *qiov; > + int nb_sectors; > + BlockDriverCompletionFunc *cb; > + void *opaque; > +}; > + > +typedef struct BlockIORequest BlockIORequest; > + > +struct BlockQueue { > + QTAILQ_HEAD(requests, BlockIORequest) requests; > +}; > + > +typedef struct BlockQueue BlockQueue; > + > +BlockQueue *qemu_new_block_queue(void); > + > +void qemu_del_block_queue(BlockQueue *queue); > + > +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue, > + BlockDriverState *bs, > + BlockRequestHandler *handler, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque); > + > +int qemu_block_queue_handler(BlockIORequest *request); > +#endif /* QEMU_BLOCK_QUEUE_H */ > diff --git a/block_int.h b/block_int.h > index 1e265d2..1587171 100644 > --- a/block_int.h > +++ b/block_int.h > @@ -27,10 +27,17 @@ > #include "block.h" > #include "qemu-option.h" > #include "qemu-queue.h" > +#include "block/blk-queue.h" > > #define BLOCK_FLAG_ENCRYPT 1 > #define BLOCK_FLAG_COMPAT6 4 > > +#define BLOCK_IO_LIMIT_READ 0 > +#define BLOCK_IO_LIMIT_WRITE 1 > +#define BLOCK_IO_LIMIT_TOTAL 2 > + > +#define BLOCK_IO_SLICE_TIME 100000000 > + > #define BLOCK_OPT_SIZE "size" > #define BLOCK_OPT_ENCRYPT "encryption" > #define BLOCK_OPT_COMPAT6 "compat6" > @@ -46,6 +53,16 @@ typedef struct AIOPool { > BlockDriverAIOCB *free_aiocb; > } AIOPool; > > +typedef struct BlockIOLimit { > + uint64_t bps[3]; > + uint64_t iops[3]; > +} BlockIOLimit; > + > +typedef struct BlockIODisp { > + uint64_t bytes[2]; > + uint64_t ios[2]; > +} BlockIODisp; > + > struct BlockDriver { > const char *format_name; > int instance_size; > @@ -175,6 +192,14 @@ struct BlockDriverState { > > void *sync_aiocb; > > + /* the time for latest disk I/O */ > + int64_t slice_start[2]; > + BlockIOLimit io_limits; > + BlockIODisp io_disps; > + BlockQueue *block_queue; > + QEMUTimer *block_timer; > + bool req_from_queue; > + > /* I/O stats (display with "info blockstats"). */ > uint64_t rd_bytes; > uint64_t wr_bytes; > @@ -222,6 +247,9 @@ void qemu_aio_release(void *p); > > void *qemu_blockalign(BlockDriverState *bs, size_t size); > > +void bdrv_set_io_limits(BlockDriverState *bs, > + BlockIOLimit *io_limits); > + > #ifdef _WIN32 > int is_windows_drive(const char *filename); > #endif > diff --git a/blockdev.c b/blockdev.c > index c263663..45602f4 100644 > --- a/blockdev.c > +++ b/blockdev.c > @@ -238,6 +238,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) > int on_read_error, on_write_error; > const char *devaddr; > DriveInfo *dinfo; > + BlockIOLimit io_limits; > + bool iol_flag = false; > + const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"}; > int is_extboot = 0; > int snapshot = 0; > int ret; > @@ -372,6 +375,19 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) > return NULL; > } > > + /* disk io limits */ > + iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts); > + if (iol_flag) { > + memset(&io_limits, 0, sizeof(BlockIOLimit)); > + > + io_limits.bps[2] = qemu_opt_get_number(opts, "bps", 0); > + io_limits.bps[0] = qemu_opt_get_number(opts, "bps_rd", 0); > + io_limits.bps[1] = qemu_opt_get_number(opts, "bps_wr", 0); > + io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0); > + io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0); > + io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0); > + } > + > on_write_error = BLOCK_ERR_STOP_ENOSPC; > if ((buf = qemu_opt_get(opts, "werror")) != NULL) { > if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) { > @@ -483,6 +499,11 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) > > bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error); > > + /* throttling disk io limits */ > + if (iol_flag) { > + bdrv_set_io_limits(dinfo->bdrv, &io_limits); > + } > + > switch(type) { > case IF_IDE: > case IF_SCSI: > diff --git a/qemu-config.c b/qemu-config.c > index efa892c..9232bbb 100644 > --- a/qemu-config.c > +++ b/qemu-config.c > @@ -82,6 +82,30 @@ static QemuOptsList qemu_drive_opts = { > .name = "boot", > .type = QEMU_OPT_BOOL, > .help = "make this a boot drive", > + },{ > + .name = "iops", > + .type = QEMU_OPT_NUMBER, > + .help = "limit total I/O operations per second", > + },{ > + .name = "iops_rd", > + .type = QEMU_OPT_NUMBER, > + .help = "limit read operations per second", > + },{ > + .name = "iops_wr", > + .type = QEMU_OPT_NUMBER, > + .help = "limit write operations per second", > + },{ > + .name = "bps", > + .type = QEMU_OPT_NUMBER, > + .help = "limit total bytes per second", > + },{ > + .name = "bps_rd", > + .type = QEMU_OPT_NUMBER, > + .help = "limit read bytes per second", > + },{ > + .name = "bps_wr", > + .type = QEMU_OPT_NUMBER, > + .help = "limit write bytes per second", > }, > { /* end of list */ } > }, > diff --git a/qemu-option.c b/qemu-option.c > index 65db542..9fe234d 100644 > --- a/qemu-option.c > +++ b/qemu-option.c > @@ -562,6 +562,23 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval) > return opt->value.uint; > } > > +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts) > +{ > + int i; > + uint64_t opt_val = 0; > + bool iol_flag = false; > + > + for (i = 0; iol_opts[i]; i++) { > + opt_val = qemu_opt_get_number(opts, iol_opts[i], 0); > + if (opt_val != 0) { > + iol_flag = true; > + break; > + } > + } > + > + return iol_flag; > +} > + > uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval) > { > QemuOpt *opt = qemu_opt_find(opts, name); > diff --git a/qemu-option.h b/qemu-option.h > index b515813..fc909f9 100644 > --- a/qemu-option.h > +++ b/qemu-option.h > @@ -107,6 +107,7 @@ struct QemuOptsList { > const char *qemu_opt_get(QemuOpts *opts, const char *name); > int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval); > uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval); > +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts); > uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval); > int qemu_opt_set(QemuOpts *opts, const char *name, const char *value); > typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque); > diff --git a/qemu-options.hx b/qemu-options.hx > index cb3347e..ae219f5 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -121,6 +121,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, > " [,cache=writethrough|writeback|none|unsafe][,format=f]\n" > " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" > " [,readonly=on|off][,boot=on|off]\n" > + " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n" > " use 'file' as a drive image\n", QEMU_ARCH_ALL) > STEXI > @item -drive @var{option}[,@var{option}[,@var{option}[,...]]] > -- > 1.7.2.3 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([140.186.70.92]:48142) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1QlnIL-0003Zz-DP for qemu-devel@nongnu.org; Tue, 26 Jul 2011 15:27:59 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1QlnII-0000l7-M5 for qemu-devel@nongnu.org; Tue, 26 Jul 2011 15:27:57 -0400 Received: from mx1.redhat.com ([209.132.183.28]:56595) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1QlnIH-0000l0-UP for qemu-devel@nongnu.org; Tue, 26 Jul 2011 15:27:54 -0400 Resent-Message-ID: <20110726192729.GB8126@amt.cnet> Resent-To: Zhi Yong Wu , qemu-devel@nongnu.org, kvm@vger.kernel.org, aliguori@us.ibm.com, stefanha@linux.vnet.ibm.com, ryanh@us.ibm.com, kwolf@redhat.com, vgoyal@redhat.com, zwu.kernel@gmail.com, luowenj@cn.ibm.com Date: Tue, 26 Jul 2011 16:26:18 -0300 From: Marcelo Tosatti Message-ID: <20110726192618.GA8126@amt.cnet> References: <1311670746-20498-1-git-send-email-wuzhy@linux.vnet.ibm.com> <1311670746-20498-2-git-send-email-wuzhy@linux.vnet.ibm.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <1311670746-20498-2-git-send-email-wuzhy@linux.vnet.ibm.com> Subject: Re: [Qemu-devel] [PATCH v2 1/1] The codes V2 for QEMU disk I/O limits. List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Zhi Yong Wu Cc: kwolf@redhat.com, aliguori@us.ibm.com, stefanha@linux.vnet.ibm.com, kvm@vger.kernel.org, qemu-devel@nongnu.org, zwu.kernel@gmail.com, ryanh@us.ibm.com, luowenj@cn.ibm.com, vgoyal@redhat.com On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote: > Welcome to give me your comments, thanks. > > Signed-off-by: Zhi Yong Wu > --- > Makefile.objs | 2 +- > block.c | 288 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > block.h | 1 - > block/blk-queue.c | 116 +++++++++++++++++++++ > block/blk-queue.h | 70 +++++++++++++ > block_int.h | 28 +++++ > blockdev.c | 21 ++++ > qemu-config.c | 24 +++++ > qemu-option.c | 17 +++ > qemu-option.h | 1 + > qemu-options.hx | 1 + > 11 files changed, 559 insertions(+), 10 deletions(-) > create mode 100644 block/blk-queue.c > create mode 100644 block/blk-queue.h > > diff --git a/Makefile.objs b/Makefile.objs > index 9f99ed4..06f2033 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv > block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o > block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o > block-nested-y += qed-check.o > -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o > +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o > block-nested-$(CONFIG_WIN32) += raw-win32.o > block-nested-$(CONFIG_POSIX) += raw-posix.o > block-nested-$(CONFIG_CURL) += curl.o > diff --git a/block.c b/block.c > index 24a25d5..e54e59c 100644 > --- a/block.c > +++ b/block.c > @@ -29,6 +29,9 @@ > #include "module.h" > #include "qemu-objects.h" > > +#include "qemu-timer.h" > +#include "block/blk-queue.h" > + > #ifdef CONFIG_BSD > #include > #include > @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, > static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, > const uint8_t *buf, int nb_sectors); > > +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, double elapsed_time, uint64_t *wait); > +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > + double elapsed_time, uint64_t *wait); > +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, uint64_t *wait); > + > static QTAILQ_HEAD(, BlockDriverState) bdrv_states = > QTAILQ_HEAD_INITIALIZER(bdrv_states); > > @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename) > } > #endif > > +static int bdrv_io_limits_enable(BlockIOLimit *io_limits) > +{ > + if ((io_limits->bps[0] == 0) > + && (io_limits->bps[1] == 0) > + && (io_limits->bps[2] == 0) > + && (io_limits->iops[0] == 0) > + && (io_limits->iops[1] == 0) > + && (io_limits->iops[2] == 0)) { > + return 0; > + } > + > + return 1; > +} > + > /* check if the path starts with ":" */ > static int path_has_protocol(const char *path) > { > @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size, > } > } > > +static void bdrv_block_timer(void *opaque) > +{ > + BlockDriverState *bs = opaque; > + BlockQueue *queue = bs->block_queue; > + > + while (!QTAILQ_EMPTY(&queue->requests)) { > + BlockIORequest *request; > + int ret; > + > + request = QTAILQ_FIRST(&queue->requests); > + QTAILQ_REMOVE(&queue->requests, request, entry); > + > + ret = qemu_block_queue_handler(request); > + if (ret == 0) { > + QTAILQ_INSERT_HEAD(&queue->requests, request, entry); > + break; > + } > + > + qemu_free(request); > + } > +} > + > void bdrv_register(BlockDriver *bdrv) > { > if (!bdrv->bdrv_aio_readv) { > @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, > bs->change_cb(bs->change_opaque, CHANGE_MEDIA); > } > > + /* throttling disk I/O limits */ > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->block_queue = qemu_new_block_queue(); > + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); > + > + bs->slice_start[0] = qemu_get_clock_ns(vm_clock); > + bs->slice_start[1] = qemu_get_clock_ns(vm_clock); > + } > + It should be possible to tune the limits on the flight, please introduce QMP commands for that. > return 0; > > unlink_and_fail: > @@ -680,6 +735,16 @@ void bdrv_close(BlockDriverState *bs) > if (bs->change_cb) > bs->change_cb(bs->change_opaque, CHANGE_MEDIA); > } > + > + /* throttling disk I/O limits */ > + if (bs->block_queue) { > + qemu_del_block_queue(bs->block_queue); > + } > + > + if (bs->block_timer) { > + qemu_del_timer(bs->block_timer); > + qemu_free_timer(bs->block_timer); > + } > } > > void bdrv_close_all(void) > @@ -1312,6 +1377,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, > *psecs = bs->secs; > } > > +/* throttling disk io limits */ > +void bdrv_set_io_limits(BlockDriverState *bs, > + BlockIOLimit *io_limits) > +{ > + memset(&bs->io_limits, 0, sizeof(BlockIOLimit)); > + bs->io_limits = *io_limits; > +} > + > /* Recognize floppy formats */ > typedef struct FDFormat { > FDriveType drive; > @@ -2111,6 +2184,155 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) > return buf; > } > > +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, double elapsed_time, uint64_t *wait) { > + uint64_t bps_limit = 0; > + double bytes_limit, bytes_disp, bytes_res; > + double slice_time = 0.1, wait_time; > + > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; > + } else if (bs->io_limits.bps[is_write]) { > + bps_limit = bs->io_limits.bps[is_write]; > + } else { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + bytes_limit = bps_limit * slice_time; > + bytes_disp = bs->io_disps.bytes[is_write]; > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + bytes_disp += bs->io_disps.bytes[!is_write]; > + } > + > + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; Virtio can submit requests of 512 sectors or more... does not play well with 1MB/sec limit. > + if (bytes_disp + bytes_res <= bytes_limit) { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + /* Calc approx time to dispatch */ > + wait_time = (bytes_disp + bytes_res - bytes_limit) / bps_limit; > + if (!wait_time) { > + wait_time = 1; > + } > + > + wait_time = wait_time + (slice_time - elapsed_time); > + if (wait) { > + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1; > + } The guest can keep submitting requests where "wait_time = 1" above, and the timer will be rearmed continuously in the future. Can't you simply arm the timer to the next slice start? _Some_ data must be transfered by then, anyway (and nothing can be transfered earlier than that). Same for iops calculation below. > + > + return true; > +} > + > +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > + double elapsed_time, uint64_t *wait) { > + uint64_t iops_limit = 0; > + double ios_limit, ios_disp; > + double slice_time = 0.1, wait_time; > + > + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; > + } else if (bs->io_limits.iops[is_write]) { > + iops_limit = bs->io_limits.iops[is_write]; > + } else { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + ios_limit = iops_limit * slice_time; > + ios_disp = bs->io_disps.ios[is_write]; > + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + ios_disp += bs->io_disps.ios[!is_write]; > + } > + > + if (ios_disp + 1 <= ios_limit) { > + if (wait) { > + *wait = 0; > + } > + > + return false; > + } > + > + /* Calc approx time to dispatch */ > + wait_time = (ios_disp + 1) / iops_limit; > + if (wait_time > elapsed_time) { > + wait_time = wait_time - elapsed_time; > + } else { > + wait_time = 0; > + } > + > + if (wait) { > + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1; > + } > + > + return true; > +} > + > +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > + bool is_write, uint64_t *wait) { > + int64_t real_time; > + uint64_t bps_wait = 0, iops_wait = 0, max_wait; > + double elapsed_time; > + int bps_ret, iops_ret; > + > + real_time = qemu_get_clock_ns(vm_clock); > + if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) { > + bs->slice_start[is_write] = real_time; > + > + bs->io_disps.bytes[is_write] = 0; > + bs->io_disps.bytes[!is_write] = 0; > + > + bs->io_disps.ios[is_write] = 0; > + bs->io_disps.ios[!is_write] = 0; > + } > + > + /* If a limit was exceeded, immediately queue this request */ > + if ((bs->req_from_queue == false) > + && !QTAILQ_EMPTY(&bs->block_queue->requests)) { > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] > + || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write] > + || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + if (wait) { > + *wait = 0; > + } > + > + return true; > + } > + } > + > + elapsed_time = real_time - bs->slice_start[is_write]; > + elapsed_time /= (BLOCK_IO_SLICE_TIME * 10.0); > + > + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, > + is_write, elapsed_time, &bps_wait); > + iops_ret = bdrv_exceed_iops_limits(bs, is_write, > + elapsed_time, &iops_wait); > + if (bps_ret || iops_ret) { > + max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; > + if (wait) { > + *wait = max_wait; > + } > + > + return true; > + } > + > + if (wait) { > + *wait = 0; > + } > + > + return false; > +} > > /**************************************************************/ > /* async I/Os */ > @@ -2121,13 +2343,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, > { > BlockDriver *drv = bs->drv; > BlockDriverAIOCB *ret; > + uint64_t wait_time = 0; > > trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); > > - if (!drv) > - return NULL; > - if (bdrv_check_request(bs, sector_num, nb_sectors)) > + if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) { > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > + } > return NULL; > + } > + > + /* throttling disk read I/O */ > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) { > + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv, > + sector_num, qiov, nb_sectors, cb, opaque); > + qemu_mod_timer(bs->block_timer, > + wait_time + qemu_get_clock_ns(vm_clock)); > + bs->req_from_queue = false; > + return ret; > + } > + } > > ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, > cb, opaque); > @@ -2136,6 +2373,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, > /* Update stats even though technically transfer has not happened. */ > bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > bs->rd_ops ++; > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] += > + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > + bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++; > + } > + } > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > } > > return ret; > @@ -2184,15 +2431,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, > BlockDriver *drv = bs->drv; > BlockDriverAIOCB *ret; > BlockCompleteData *blk_cb_data; > + uint64_t wait_time = 0; > > trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); > > - if (!drv) > - return NULL; > - if (bs->read_only) > - return NULL; > - if (bdrv_check_request(bs, sector_num, nb_sectors)) > + if (!drv || bs->read_only > + || bdrv_check_request(bs, sector_num, nb_sectors)) { > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > + } > + > return NULL; > + } > > if (bs->dirty_bitmap) { > blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, > @@ -2201,6 +2451,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, > opaque = blk_cb_data; > } > > + /* throttling disk write I/O */ > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) { > + ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev, > + sector_num, qiov, nb_sectors, cb, opaque); > + qemu_mod_timer(bs->block_timer, > + wait_time + qemu_get_clock_ns(vm_clock)); > + bs->req_from_queue = false; > + return ret; > + } > + } > + > ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, > cb, opaque); > > @@ -2211,6 +2473,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, > if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { > bs->wr_highest_sector = sector_num + nb_sectors - 1; > } > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] += > + (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > + bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++; > + } > + } > + > + if (bdrv_io_limits_enable(&bs->io_limits)) { > + bs->req_from_queue = false; > } > > return ret; > diff --git a/block.h b/block.h > index 859d1d9..f0dac62 100644 > --- a/block.h > +++ b/block.h > @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs, > const char *backing_file, const char *backing_fmt); > void bdrv_register(BlockDriver *bdrv); > > - > typedef struct BdrvCheckResult { > int corruptions; > int leaks; > diff --git a/block/blk-queue.c b/block/blk-queue.c > new file mode 100644 > index 0000000..09fcfe9 > --- /dev/null > +++ b/block/blk-queue.c > @@ -0,0 +1,116 @@ > +/* > + * QEMU System Emulator queue definition for block layer > + * > + * Copyright (c) 2011 Zhi Yong Wu > + * > + * Permission is hereby granted, free of charge, to any person obtaining a copy > + * of this software and associated documentation files (the "Software"), to deal > + * in the Software without restriction, including without limitation the rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > +#include "block_int.h" > +#include "qemu-queue.h" > +#include "block/blk-queue.h" > + > +/* The APIs for block request queue on qemu block layer. > + */ > + > +static void qemu_block_queue_cancel(BlockDriverAIOCB *acb) > +{ > + qemu_aio_release(acb); > +} > + > +static AIOPool block_queue_pool = { > + .aiocb_size = sizeof(struct BlockDriverAIOCB), > + .cancel = qemu_block_queue_cancel, > +}; > + > +static void qemu_block_queue_callback(void *opaque, int ret) > +{ > + BlockDriverAIOCB *acb = opaque; > + > + qemu_aio_release(acb); > +} > + > +BlockQueue *qemu_new_block_queue(void) > +{ > + BlockQueue *queue; > + > + queue = qemu_mallocz(sizeof(BlockQueue)); > + > + QTAILQ_INIT(&queue->requests); > + > + return queue; > +} > + > +void qemu_del_block_queue(BlockQueue *queue) > +{ > + BlockIORequest *request, *next; > + > + QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) { > + QTAILQ_REMOVE(&queue->requests, request, entry); > + qemu_free(request); > + } > + > + qemu_free(queue); > +} > + > +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue, > + BlockDriverState *bs, > + BlockRequestHandler *handler, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque) > +{ > + BlockIORequest *request; > + BlockDriverAIOCB *acb; > + > + request = qemu_malloc(sizeof(BlockIORequest)); > + request->bs = bs; > + request->handler = handler; > + request->sector_num = sector_num; > + request->qiov = qiov; > + request->nb_sectors = nb_sectors; > + request->cb = cb; > + request->opaque = opaque; > + > + QTAILQ_INSERT_TAIL(&queue->requests, request, entry); > + > + acb = qemu_aio_get(&block_queue_pool, bs, > + qemu_block_queue_callback, opaque); > + > + return acb; > +} > + > +int qemu_block_queue_handler(BlockIORequest *request) > +{ > + int ret; > + BlockDriverAIOCB *res; > + > + /* indicate this req is from block queue */ > + request->bs->req_from_queue = true; > + > + res = request->handler(request->bs, request->sector_num, > + request->qiov, request->nb_sectors, > + request->cb, request->opaque); > + > + ret = (res == NULL) ? 0 : 1; > + > + return ret; > +} > diff --git a/block/blk-queue.h b/block/blk-queue.h > new file mode 100644 > index 0000000..47f8a36 > --- /dev/null > +++ b/block/blk-queue.h > @@ -0,0 +1,70 @@ > +/* > + * QEMU System Emulator queue declaration for block layer > + * > + * Copyright (c) 2011 Zhi Yong Wu > + * > + * Permission is hereby granted, free of charge, to any person obtaining a copy > + * of this software and associated documentation files (the "Software"), to deal > + * in the Software without restriction, including without limitation the rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > +#ifndef QEMU_BLOCK_QUEUE_H > +#define QEMU_BLOCK_QUEUE_H > + > +#include "block.h" > +#include "qemu-queue.h" > +#include "qemu-common.h" > + > +typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, > + int nb_sectors, BlockDriverCompletionFunc *cb, > + void *opaque); > + > +struct BlockIORequest { > + QTAILQ_ENTRY(BlockIORequest) entry; > + BlockDriverState *bs; > + BlockRequestHandler *handler; > + int64_t sector_num; > + QEMUIOVector *qiov; > + int nb_sectors; > + BlockDriverCompletionFunc *cb; > + void *opaque; > +}; > + > +typedef struct BlockIORequest BlockIORequest; > + > +struct BlockQueue { > + QTAILQ_HEAD(requests, BlockIORequest) requests; > +}; > + > +typedef struct BlockQueue BlockQueue; > + > +BlockQueue *qemu_new_block_queue(void); > + > +void qemu_del_block_queue(BlockQueue *queue); > + > +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue, > + BlockDriverState *bs, > + BlockRequestHandler *handler, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque); > + > +int qemu_block_queue_handler(BlockIORequest *request); > +#endif /* QEMU_BLOCK_QUEUE_H */ > diff --git a/block_int.h b/block_int.h > index 1e265d2..1587171 100644 > --- a/block_int.h > +++ b/block_int.h > @@ -27,10 +27,17 @@ > #include "block.h" > #include "qemu-option.h" > #include "qemu-queue.h" > +#include "block/blk-queue.h" > > #define BLOCK_FLAG_ENCRYPT 1 > #define BLOCK_FLAG_COMPAT6 4 > > +#define BLOCK_IO_LIMIT_READ 0 > +#define BLOCK_IO_LIMIT_WRITE 1 > +#define BLOCK_IO_LIMIT_TOTAL 2 > + > +#define BLOCK_IO_SLICE_TIME 100000000 > + > #define BLOCK_OPT_SIZE "size" > #define BLOCK_OPT_ENCRYPT "encryption" > #define BLOCK_OPT_COMPAT6 "compat6" > @@ -46,6 +53,16 @@ typedef struct AIOPool { > BlockDriverAIOCB *free_aiocb; > } AIOPool; > > +typedef struct BlockIOLimit { > + uint64_t bps[3]; > + uint64_t iops[3]; > +} BlockIOLimit; > + > +typedef struct BlockIODisp { > + uint64_t bytes[2]; > + uint64_t ios[2]; > +} BlockIODisp; > + > struct BlockDriver { > const char *format_name; > int instance_size; > @@ -175,6 +192,14 @@ struct BlockDriverState { > > void *sync_aiocb; > > + /* the time for latest disk I/O */ > + int64_t slice_start[2]; > + BlockIOLimit io_limits; > + BlockIODisp io_disps; > + BlockQueue *block_queue; > + QEMUTimer *block_timer; > + bool req_from_queue; > + > /* I/O stats (display with "info blockstats"). */ > uint64_t rd_bytes; > uint64_t wr_bytes; > @@ -222,6 +247,9 @@ void qemu_aio_release(void *p); > > void *qemu_blockalign(BlockDriverState *bs, size_t size); > > +void bdrv_set_io_limits(BlockDriverState *bs, > + BlockIOLimit *io_limits); > + > #ifdef _WIN32 > int is_windows_drive(const char *filename); > #endif > diff --git a/blockdev.c b/blockdev.c > index c263663..45602f4 100644 > --- a/blockdev.c > +++ b/blockdev.c > @@ -238,6 +238,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) > int on_read_error, on_write_error; > const char *devaddr; > DriveInfo *dinfo; > + BlockIOLimit io_limits; > + bool iol_flag = false; > + const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"}; > int is_extboot = 0; > int snapshot = 0; > int ret; > @@ -372,6 +375,19 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) > return NULL; > } > > + /* disk io limits */ > + iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts); > + if (iol_flag) { > + memset(&io_limits, 0, sizeof(BlockIOLimit)); > + > + io_limits.bps[2] = qemu_opt_get_number(opts, "bps", 0); > + io_limits.bps[0] = qemu_opt_get_number(opts, "bps_rd", 0); > + io_limits.bps[1] = qemu_opt_get_number(opts, "bps_wr", 0); > + io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0); > + io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0); > + io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0); > + } > + > on_write_error = BLOCK_ERR_STOP_ENOSPC; > if ((buf = qemu_opt_get(opts, "werror")) != NULL) { > if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) { > @@ -483,6 +499,11 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) > > bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error); > > + /* throttling disk io limits */ > + if (iol_flag) { > + bdrv_set_io_limits(dinfo->bdrv, &io_limits); > + } > + > switch(type) { > case IF_IDE: > case IF_SCSI: > diff --git a/qemu-config.c b/qemu-config.c > index efa892c..9232bbb 100644 > --- a/qemu-config.c > +++ b/qemu-config.c > @@ -82,6 +82,30 @@ static QemuOptsList qemu_drive_opts = { > .name = "boot", > .type = QEMU_OPT_BOOL, > .help = "make this a boot drive", > + },{ > + .name = "iops", > + .type = QEMU_OPT_NUMBER, > + .help = "limit total I/O operations per second", > + },{ > + .name = "iops_rd", > + .type = QEMU_OPT_NUMBER, > + .help = "limit read operations per second", > + },{ > + .name = "iops_wr", > + .type = QEMU_OPT_NUMBER, > + .help = "limit write operations per second", > + },{ > + .name = "bps", > + .type = QEMU_OPT_NUMBER, > + .help = "limit total bytes per second", > + },{ > + .name = "bps_rd", > + .type = QEMU_OPT_NUMBER, > + .help = "limit read bytes per second", > + },{ > + .name = "bps_wr", > + .type = QEMU_OPT_NUMBER, > + .help = "limit write bytes per second", > }, > { /* end of list */ } > }, > diff --git a/qemu-option.c b/qemu-option.c > index 65db542..9fe234d 100644 > --- a/qemu-option.c > +++ b/qemu-option.c > @@ -562,6 +562,23 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval) > return opt->value.uint; > } > > +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts) > +{ > + int i; > + uint64_t opt_val = 0; > + bool iol_flag = false; > + > + for (i = 0; iol_opts[i]; i++) { > + opt_val = qemu_opt_get_number(opts, iol_opts[i], 0); > + if (opt_val != 0) { > + iol_flag = true; > + break; > + } > + } > + > + return iol_flag; > +} > + > uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval) > { > QemuOpt *opt = qemu_opt_find(opts, name); > diff --git a/qemu-option.h b/qemu-option.h > index b515813..fc909f9 100644 > --- a/qemu-option.h > +++ b/qemu-option.h > @@ -107,6 +107,7 @@ struct QemuOptsList { > const char *qemu_opt_get(QemuOpts *opts, const char *name); > int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval); > uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval); > +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts); > uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval); > int qemu_opt_set(QemuOpts *opts, const char *name, const char *value); > typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque); > diff --git a/qemu-options.hx b/qemu-options.hx > index cb3347e..ae219f5 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -121,6 +121,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, > " [,cache=writethrough|writeback|none|unsafe][,format=f]\n" > " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" > " [,readonly=on|off][,boot=on|off]\n" > + " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n" > " use 'file' as a drive image\n", QEMU_ARCH_ALL) > STEXI > @item -drive @var{option}[,@var{option}[,@var{option}[,...]]] > -- > 1.7.2.3 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html