From: Stefan Hajnoczi <stefanha@gmail.com>
To: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Cc: kwolf@redhat.com, aliguori@us.ibm.com,
	stefanha@linux.vnet.ibm.com, kvm@vger.kernel.org,
	mtosatti@redhat.com, qemu-devel@nongnu.org, pair@us.ibm.com,
	zwu.kernel@gmail.com, ryanh@us.ibm.com
Subject: Re: [PATCH v6 3/4] block: add block timer and block throttling algorithm
Date: Thu, 1 Sep 2011 14:36:41 +0100
Message-ID: <CAJSP0QU8GqWsA6jaYAfs0=hvabQayfRPSzRNctGP6GEGgptHaw@mail.gmail.com>
In-Reply-To: <1314877456-19521-4-git-send-email-wuzhy@linux.vnet.ibm.com>

On Thu, Sep 1, 2011 at 12:44 PM, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> wrote:
> Note:
>     1.) When the bps/iops limits are set to a small value such as 511 bytes/s, the VM will hang. We are considering how to handle this scenario.
>     2.) When a "dd" command is issued in the guest with a large block size such as "bs=1024K", the resulting speed will be slightly higher than the limit.
>
> For these problems, if you have any good ideas, please let us know. :)
>
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> ---
>  block.c     |  290 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  block.h     |    5 +
>  block_int.h |    9 ++
>  3 files changed, 296 insertions(+), 8 deletions(-)
>
> diff --git a/block.c b/block.c
> index 17ee3df..680f1e7 100644
> --- a/block.c
> +++ b/block.c
> @@ -30,6 +30,9 @@
>  #include "qemu-objects.h"
>  #include "qemu-coroutine.h"
>
> +#include "qemu-timer.h"
> +#include "block/blk-queue.h"
> +
>  #ifdef CONFIG_BSD
>  #include <sys/types.h>
>  #include <sys/stat.h>
> @@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
>                                          QEMUIOVector *iov);
>  static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
>
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +        double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, uint64_t *wait);
> +
>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>     QTAILQ_HEAD_INITIALIZER(bdrv_states);
>
> @@ -104,6 +114,64 @@ int is_windows_drive(const char *filename)
>  }
>  #endif
>
> +/* throttling disk I/O limits */
> +void bdrv_io_limits_disable(BlockDriverState *bs)
> +{
> +    bs->io_limits_enabled = false;
> +
> +    if (bs->block_queue) {
> +        qemu_block_queue_flush(bs->block_queue);
> +        qemu_del_block_queue(bs->block_queue);
> +        bs->block_queue = NULL;
> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +        bs->block_timer = NULL;
> +    }
> +
> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = 0;
> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = 0;
> +
> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    = 0;
> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   = 0;
> +}
> +
> +static void bdrv_block_timer(void *opaque)
> +{
> +    BlockDriverState *bs = opaque;
> +    BlockQueue *queue = bs->block_queue;
> +
> +    qemu_block_queue_flush(queue);
> +}
> +
> +void bdrv_io_limits_enable(BlockDriverState *bs)
> +{
> +    bs->block_queue    = qemu_new_block_queue();
> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
> +
> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] =
> +           bs->slice_start[BLOCK_IO_LIMIT_READ];
> +
> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
> +           bs->slice_start[BLOCK_IO_LIMIT_READ] + BLOCK_IO_SLICE_TIME;
> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
> +           bs->slice_end[BLOCK_IO_LIMIT_READ];
> +}
> +
> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
> +{
> +    BlockIOLimit *io_limits = &bs->io_limits;
> +    return io_limits->bps[BLOCK_IO_LIMIT_READ]
> +         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
> +         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
> +         || io_limits->iops[BLOCK_IO_LIMIT_READ]
> +         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
> +         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
> +}
> +
>  /* check if the path starts with "<protocol>:" */
>  static int path_has_protocol(const char *path)
>  {
> @@ -694,6 +762,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>     }
>
> +    /* throttling disk I/O limits */
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_enable(bs);
> +    }
> +
>     return 0;
>
>  unlink_and_fail:
> @@ -732,6 +805,18 @@ void bdrv_close(BlockDriverState *bs)
>         if (bs->change_cb)
>             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>     }
> +
> +    /* throttling disk I/O limits */
> +    if (bs->block_queue) {
> +        qemu_del_block_queue(bs->block_queue);
> +        bs->block_queue = NULL;
> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +        bs->block_timer = NULL;
> +    }
>  }
>
>  void bdrv_close_all(void)
> @@ -2290,13 +2375,29 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>                                  BlockDriverCompletionFunc *cb, void *opaque)
>  {
>     BlockDriver *drv = bs->drv;
> +    uint64_t wait_time = 0;
> +    BlockDriverAIOCB *ret;
>
>     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>
> -    if (!drv)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>         return NULL;
> +    }
> +
> +    /* throttling disk read I/O */
> +    if (bs->io_limits_enabled) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
> +                           sector_num, qiov, nb_sectors, cb, opaque);
> +            qemu_mod_timer(bs->block_timer,
> +                           wait_time + qemu_get_clock_ns(vm_clock));
> +            return ret;
> +        }
> +
> +        bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
> +                           (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +        bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
> +    }
>
>     return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>                                cb, opaque);
> @@ -2345,15 +2446,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>     BlockDriver *drv = bs->drv;
>     BlockDriverAIOCB *ret;
>     BlockCompleteData *blk_cb_data;
> +    uint64_t wait_time = 0;
>
>     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>
> -    if (!drv)
> -        return NULL;
> -    if (bs->read_only)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bs->read_only
> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
>         return NULL;
> +    }
>
>     if (bs->dirty_bitmap) {
>         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
> @@ -2362,6 +2462,17 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>         opaque = blk_cb_data;
>     }
>
> +    /* throttling disk write I/O */
> +    if (bs->io_limits_enabled) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
> +                                  sector_num, qiov, nb_sectors, cb, opaque);
> +            qemu_mod_timer(bs->block_timer,
> +                                  wait_time + qemu_get_clock_ns(vm_clock));
> +            return ret;
> +        }
> +    }
> +
>     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
>                                cb, opaque);
>
> @@ -2369,6 +2480,12 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>             bs->wr_highest_sector = sector_num + nb_sectors - 1;
>         }
> +
> +        if (bs->io_limits_enabled) {
> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
> +                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
> +        }
>     }
>
>     return ret;
> @@ -2633,6 +2750,163 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>     acb->pool->cancel(acb);
>  }
>
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +                 bool is_write, double elapsed_time, uint64_t *wait) {
> +    uint64_t bps_limit = 0;
> +    double   bytes_limit, bytes_disp, bytes_res;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.bps[is_write]) {
> +        bps_limit = bs->io_limits.bps[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
> +    slice_time /= (NANOSECONDS_PER_SECOND);
> +    bytes_limit = bps_limit * slice_time;
> +    bytes_disp  = bs->io_disps.bytes[is_write];
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bytes_disp += bs->io_disps.bytes[!is_write];
> +    }
> +
> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +
> +    if (bytes_disp + bytes_res <= bytes_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
> +
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }
> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +                             double elapsed_time, uint64_t *wait) {
> +    uint64_t iops_limit = 0;
> +    double   ios_limit, ios_disp;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.iops[is_write]) {
> +        iops_limit = bs->io_limits.iops[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
> +    slice_time /= (NANOSECONDS_PER_SECOND);
> +    ios_limit  = iops_limit * slice_time;
> +    ios_disp   = bs->io_disps.ios[is_write];
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        ios_disp += bs->io_disps.ios[!is_write];
> +    }
> +
> +    if (ios_disp + 1 <= ios_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (ios_disp + 1) / iops_limit;
> +    if (wait_time > elapsed_time) {
> +        wait_time = wait_time - elapsed_time;
> +    } else {
> +        wait_time = 0;
> +    }
> +
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }
> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +                           bool is_write, uint64_t *wait) {
> +    int64_t  now;
> +    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
> +    double   elapsed_time;
> +    int      bps_ret, iops_ret;
> +
> +    now = qemu_get_clock_ns(vm_clock);
> +    if ((bs->slice_start[is_write] < now)
> +        && (bs->slice_end[is_write] > now)) {
> +        bs->slice_end[is_write]   = now + BLOCK_IO_SLICE_TIME;
> +    } else {
> +        bs->slice_start[is_write]     = now;
> +        bs->slice_end[is_write]       = now + BLOCK_IO_SLICE_TIME;
> +
> +        bs->io_disps.bytes[is_write]  = 0;
> +        bs->io_disps.bytes[!is_write] = 0;
> +
> +        bs->io_disps.ios[is_write]    = 0;
> +        bs->io_disps.ios[!is_write]   = 0;

Does it make sense to keep separate slice_start/slice_end values for
read and write, since we reset the dispatched statistics to zero for
both?  Perhaps we should use a scalar slice_start/slice_end instead of
two separate values for read/write.
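
Roughly what I have in mind, as an untested sketch; the types, field
names and the slice length below are made up for illustration and are
not the real BlockDriverState fields:

    #include <stdint.h>
    #include <string.h>

    /* Placeholder value; the real constant is defined in the patch. */
    #define BLOCK_IO_SLICE_TIME 100000000LL

    /* Dispatched bytes/ios per direction, mirroring the patch's io_disps. */
    typedef struct {
        uint64_t bytes[2];
        uint64_t ios[2];
    } IODisp;

    typedef struct {
        int64_t slice_start;   /* scalar, shared by reads and writes */
        int64_t slice_end;
        IODisp  io_disps;
    } ThrottleState;

    static void throttle_update_slice(ThrottleState *ts, int64_t now)
    {
        if (ts->slice_start < now && now < ts->slice_end) {
            /* Still inside the current slice: just extend its end. */
            ts->slice_end = now + BLOCK_IO_SLICE_TIME;
        } else {
            /* New slice: reset the statistics for both directions. */
            ts->slice_start = now;
            ts->slice_end   = now + BLOCK_IO_SLICE_TIME;
            memset(&ts->io_disps, 0, sizeof(ts->io_disps));
        }
    }

The caller would then not need the is_write index at all for the slice
bookkeeping.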

> +    }
> +
> +    /* If a limit was exceeded, immediately queue this request */
> +    if (qemu_block_queue_has_pending(bs->block_queue)) {
> +        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
> +            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
> +            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +            if (wait) {
> +                *wait = 0;

This causes the queue to be flushed each time the guest enqueues an
I/O while there are queued requests.  Perhaps this is (part of) the
CPU overhead that Ryan's benchmarking discovered.

If we try to preserve request ordering, then I don't think there is a
reason to modify the timer once it has been set.
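
For example, something along these lines in bdrv_aio_readv (untested;
it assumes qemu_timer_pending() is usable in this context):

    /* Arm the timer only if it is not already pending.  Queued requests
     * keep their order and are dispatched by the existing timer callback. */
    if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
        ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
                                       sector_num, qiov, nb_sectors, cb, opaque);
        if (!qemu_timer_pending(bs->block_timer)) {
            qemu_mod_timer(bs->block_timer,
                           wait_time + qemu_get_clock_ns(vm_clock));
        }
        return ret;
    }

That way the timer armed for the head of the queue decides when the
queue is flushed, instead of every new request re-arming it.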

Stefan


