All of lore.kernel.org
 help / color / mirror / Atom feed
From: Max Reitz <mreitz@redhat.com>
To: Pavel Butsykin <pbutsykin@virtuozzo.com>,
	qemu-block@nongnu.org, qemu-devel@nongnu.org
Cc: kwolf@redhat.com, eblake@redhat.com, armbru@redhat.com, den@openvz.org
Subject: Re: [Qemu-devel] [PATCH v2 3/4] qcow2: add shrink image support
Date: Thu, 22 Jun 2017 00:55:18 +0200	[thread overview]
Message-ID: <aed64e0e-61ed-12c9-8676-ad55795b6e62@redhat.com> (raw)
In-Reply-To: <20170613121639.17853-4-pbutsykin@virtuozzo.com>

[-- Attachment #1: Type: text/plain, Size: 10721 bytes --]

On 2017-06-13 14:16, Pavel Butsykin wrote:
> This patch add shrinking of the image file for qcow2. As a result, this allows
> us to reduce the virtual image size and free up space on the disk without
> copying the image. Image can be fragmented and shrink is done by punching holes
> in the image file.
> 
> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
> ---
>  block/qcow2-cluster.c  | 42 ++++++++++++++++++++++++++++++++
>  block/qcow2-refcount.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.c          | 40 +++++++++++++++++++++++--------
>  block/qcow2.h          |  2 ++
>  qapi/block-core.json   |  3 ++-
>  5 files changed, 141 insertions(+), 11 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index d779ea19cf..a84b7e607e 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -32,6 +32,48 @@
>  #include "qemu/bswap.h"
>  #include "trace.h"
>  
> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size)

It's not really a max_size but always an exact size. You don't want it
to be any smaller than this.

> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int new_l1_size, i, ret;
> +
> +    if (max_size >= s->l1_size) {
> +        return 0;
> +    }
> +
> +    new_l1_size = max_size;
> +
> +#ifdef DEBUG_ALLOC2
> +    fprintf(stderr, "shrink l1_table from %d to %" PRId64 "\n",
> +            s->l1_size, new_l1_size);

new_l1_size is of type int, not int64_t.

> +#endif
> +
> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
> +                                       sizeof(uint64_t) * new_l1_size,
> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_flush(bs->file->bs);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
> +            continue;
> +        }
> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
> +                            s->l2_size * sizeof(uint64_t),

I'm more of a fan of s->cluster_size instead of s->l2_size *
sizeof(uint64_t) but it's not like it matters...

> +                            QCOW2_DISCARD_ALWAYS);
> +        s->l1_table[i] = 0;

I'd probably clear the overhanging s->l1_table entries before
bdrv_flush() (before you shouldn't really use them after
bdrv_pwrite_zeroes() has returned, even if bdrv_flush() has failed), but
it's not absolutely necessary. As long as they still have a refcount of
at least one, writing to them will just be useless but not destroy any data.

> +    }
> +    return 0;
> +}
> +
>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>                          bool exact_size)
>  {
> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> index 576ab551d6..e98306acd8 100644
> --- a/block/qcow2-refcount.c
> +++ b/block/qcow2-refcount.c
> @@ -29,6 +29,7 @@
>  #include "block/qcow2.h"
>  #include "qemu/range.h"
>  #include "qemu/bswap.h"
> +#include "qemu/cutils.h"
>  
>  static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
>  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
> @@ -2936,3 +2937,67 @@ done:
>      qemu_vfree(new_refblock);
>      return ret;
>  }
> +
> +int qcow2_shrink_reftable(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t *reftable_tmp =
> +        g_try_malloc(sizeof(uint64_t) * s->refcount_table_size);
> +    int i, ret;
> +
> +    if (s->refcount_table_size && reftable_tmp == NULL) {
> +        return -ENOMEM;
> +    }
> +
> +    for (i = 0; i < s->refcount_table_size; i++) {
> +        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
> +        void *refblock;
> +        bool unused_block;
> +
> +        if (refblock_offs == 0) {
> +            reftable_tmp[i] = 0;
> +            continue;
> +        }
> +        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
> +                              &refblock);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +
> +        /* the refblock has own reference */
> +        if (i == refblock_offs >> (s->refcount_block_bits + s->cluster_bits)) {
> +            uint64_t blk_index = (refblock_offs >> s->cluster_bits) &
> +                                 (s->refcount_block_size - 1);
> +            uint64_t refcount = s->get_refcount(refblock, blk_index);
> +
> +            s->set_refcount(refblock, blk_index, 0);
> +
> +            unused_block = buffer_is_zero(refblock, s->refcount_block_size);

s/refcount_block_size/cluster_size/

> +
> +            s->set_refcount(refblock, blk_index, refcount);
> +        } else {
> +            unused_block = buffer_is_zero(refblock, s->refcount_block_size);

Same here.

> +        }
> +        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
> +
> +        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
> +                           sizeof(uint64_t) * s->refcount_table_size);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    for (i = 0; i < s->refcount_table_size; i++) {
> +        if (s->refcount_table[i] && !reftable_tmp[i]) {
> +            qcow2_free_clusters(bs, s->refcount_table[i] & REFT_OFFSET_MASK,
> +                                s->cluster_size, QCOW2_DISCARD_ALWAYS);

This doesn't feel like a very good idea. The bdrv_pwrite_sync() before
has brought the on-disk refcount structures into a different state than
what we have cached.

OTOH, the bdrv_pwrite_sync() has accessed only the reftable and this
should only access refblocks. So I cannot think of any way this might
actually do something bad. But I guess it'll be better for to revisit
this when it's not in the middle of the night (so on Friday).

> +            s->refcount_table[i] = 0;
> +        }
> +    }
> +
> +out:
> +    g_free(reftable_tmp);
> +    return ret;
> +}
> diff --git a/block/qcow2.c b/block/qcow2.c
> index b3ba5daa93..0ad46d2776 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2545,6 +2545,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
>  {
>      BDRVQcow2State *s = bs->opaque;
>      int64_t new_l1_size;
> +    uint64_t total_size;
>      int ret;
>  
>      if (offset & 511) {
> @@ -2558,17 +2559,36 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
>          return -ENOTSUP;
>      }
>  
> -    /* shrinking is currently not supported */
> -    if (offset < bs->total_sectors * 512) {
> -        error_setg(errp, "qcow2 doesn't support shrinking images yet");
> -        return -ENOTSUP;
> -    }
> -
>      new_l1_size = size_to_l1(s, offset);
> -    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
> -    if (ret < 0) {
> -        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
> -        return ret;
> +    total_size = bs->total_sectors << BDRV_SECTOR_BITS;
> +
> +    if (offset < total_size) {
> +        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
> +                                    total_size - ROUND_UP(offset,
> +                                                          s->cluster_size),
> +                                    QCOW2_DISCARD_ALWAYS, true);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to discard reduced clasters");

s/clasters/clusters/

And maybe "truncated", "stripped", or "cropped" instead of "reduced"?

> +            return ret;
> +        }
> +
> +        ret = qcow2_shrink_l1_table(bs, new_l1_size);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to reduce the L1 table");

s/reduce/shrink/ (or "truncate"; or "reduce the L1 table size")

Also, to be fair, you're actually reducing the number of L2 tables, not
the size of the L1 table. (But that's a nit pick)

> +            return ret;
> +        }
> +
> +        ret = qcow2_shrink_reftable(bs);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to shrink the refcount table");

And this is not really shrinking the reftable but instead discarding
some refblocks (potentially). (This is a nit pick, too)

Max

> +            return ret;
> +        }
> +    } else {
> +        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
> +            return ret;
> +        }
>      }
>  
>      /* write updated header.size */
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 07faa6dc78..600463bf8e 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -531,10 +531,12 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
>  int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
>                                  BlockDriverAmendStatusCB *status_cb,
>                                  void *cb_opaque, Error **errp);
> +int qcow2_shrink_reftable(BlockDriverState *bs);
>  
>  /* qcow2-cluster.c functions */
>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>                          bool exact_size);
> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
>  int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
>  int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
>  int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index f85c2235c7..bcbffa3339 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2372,7 +2372,8 @@
>              'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
>              'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_head',
>              'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
> -            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
> +            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
> +            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
>  
>  ##
>  # @BlkdebugInjectErrorOptions:
> 



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 498 bytes --]

  reply	other threads:[~2017-06-21 22:55 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-13 12:16 [Qemu-devel] [PATCH v2 0/4] Add shrink image for qcow2 Pavel Butsykin
2017-06-13 12:16 ` [Qemu-devel] [PATCH v2 1/4] qemu-img: add --shrink flag for resize Pavel Butsykin
2017-06-21 22:17   ` Max Reitz
2017-06-22 13:54     ` Pavel Butsykin
2017-06-22 14:49       ` Kevin Wolf
2017-06-22 16:26         ` Pavel Butsykin
2017-06-13 12:16 ` [Qemu-devel] [PATCH v2 2/4] qcow2: add qcow2_cache_discard Pavel Butsykin
2017-06-21 22:29   ` Max Reitz
2017-06-22 13:55     ` Pavel Butsykin
2017-06-13 12:16 ` [Qemu-devel] [PATCH v2 3/4] qcow2: add shrink image support Pavel Butsykin
2017-06-21 22:55   ` Max Reitz [this message]
2017-06-22 13:57     ` Pavel Butsykin
2017-06-23 15:46       ` Max Reitz
2017-06-26 15:23         ` Pavel Butsykin
2017-06-26 17:47           ` Max Reitz
2017-06-27 15:06             ` Pavel Butsykin
2017-06-28 13:59               ` Max Reitz
2017-06-28 15:31                 ` Pavel Butsykin
2017-06-28 23:36                   ` Max Reitz
2017-06-13 12:16 ` [Qemu-devel] [PATCH v2 4/4] qemu-iotests: add shrinking image test Pavel Butsykin
2017-06-23 15:47   ` Max Reitz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aed64e0e-61ed-12c9-8676-ad55795b6e62@redhat.com \
    --to=mreitz@redhat.com \
    --cc=armbru@redhat.com \
    --cc=den@openvz.org \
    --cc=eblake@redhat.com \
    --cc=kwolf@redhat.com \
    --cc=pbutsykin@virtuozzo.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.