From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:37249) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1dNoX7-0006J6-V1 for qemu-devel@nongnu.org; Wed, 21 Jun 2017 18:55:35 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1dNoX6-0005Ii-8i for qemu-devel@nongnu.org; Wed, 21 Jun 2017 18:55:34 -0400 References: <20170613121639.17853-1-pbutsykin@virtuozzo.com> <20170613121639.17853-4-pbutsykin@virtuozzo.com> From: Max Reitz Message-ID: Date: Thu, 22 Jun 2017 00:55:18 +0200 MIME-Version: 1.0 In-Reply-To: <20170613121639.17853-4-pbutsykin@virtuozzo.com> Content-Type: multipart/signed; micalg=pgp-sha256; protocol="application/pgp-signature"; boundary="aRn8wCFvSW9XjSfkvvBg4f6N5TRFieadu" Subject: Re: [Qemu-devel] [PATCH v2 3/4] qcow2: add shrink image support List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Pavel Butsykin , qemu-block@nongnu.org, qemu-devel@nongnu.org Cc: kwolf@redhat.com, eblake@redhat.com, armbru@redhat.com, den@openvz.org This is an OpenPGP/MIME signed message (RFC 4880 and 3156) --aRn8wCFvSW9XjSfkvvBg4f6N5TRFieadu From: Max Reitz To: Pavel Butsykin , qemu-block@nongnu.org, qemu-devel@nongnu.org Cc: kwolf@redhat.com, eblake@redhat.com, armbru@redhat.com, den@openvz.org Message-ID: Subject: Re: [PATCH v2 3/4] qcow2: add shrink image support References: <20170613121639.17853-1-pbutsykin@virtuozzo.com> <20170613121639.17853-4-pbutsykin@virtuozzo.com> In-Reply-To: <20170613121639.17853-4-pbutsykin@virtuozzo.com> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: quoted-printable On 2017-06-13 14:16, Pavel Butsykin wrote: > This patch add shrinking of the image file for qcow2. As a result, this= allows > us to reduce the virtual image size and free up space on the disk witho= ut > copying the image. Image can be fragmented and shrink is done by punchi= ng holes > in the image file. 
>=20 > Signed-off-by: Pavel Butsykin > --- > block/qcow2-cluster.c | 42 ++++++++++++++++++++++++++++++++ > block/qcow2-refcount.c | 65 ++++++++++++++++++++++++++++++++++++++++++= ++++++++ > block/qcow2.c | 40 +++++++++++++++++++++++-------- > block/qcow2.h | 2 ++ > qapi/block-core.json | 3 ++- > 5 files changed, 141 insertions(+), 11 deletions(-) >=20 > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c > index d779ea19cf..a84b7e607e 100644 > --- a/block/qcow2-cluster.c > +++ b/block/qcow2-cluster.c > @@ -32,6 +32,48 @@ > #include "qemu/bswap.h" > #include "trace.h" > =20 > +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size) It's not really a max_size but always an exact size. You don't want it to be any smaller than this. > +{ > + BDRVQcow2State *s =3D bs->opaque; > + int new_l1_size, i, ret; > + > + if (max_size >=3D s->l1_size) { > + return 0; > + } > + > + new_l1_size =3D max_size; > + > +#ifdef DEBUG_ALLOC2 > + fprintf(stderr, "shrink l1_table from %d to %" PRId64 "\n", > + s->l1_size, new_l1_size); new_l1_size is of type int, not int64_t. > +#endif > + > + BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE); > + ret =3D bdrv_pwrite_zeroes(bs->file, s->l1_table_offset + > + sizeof(uint64_t) * new_l1_size,= > + (s->l1_size - new_l1_size) * sizeof(uint6= 4_t), 0); > + if (ret < 0) { > + return ret; > + } > + > + ret =3D bdrv_flush(bs->file->bs); > + if (ret < 0) { > + return ret; > + } > + > + BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS); > + for (i =3D s->l1_size - 1; i > new_l1_size - 1; i--) { > + if ((s->l1_table[i] & L1E_OFFSET_MASK) =3D=3D 0) { > + continue; > + } > + qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK, > + s->l2_size * sizeof(uint64_t), I'm more of a fan of s->cluster_size instead of s->l2_size * sizeof(uint64_t) but it's not like it matters... 
> + QCOW2_DISCARD_ALWAYS); > + s->l1_table[i] =3D 0; I'd probably clear the overhanging s->l1_table entries before bdrv_flush() (because you shouldn't really use them after bdrv_pwrite_zeroes() has returned, even if bdrv_flush() has failed), but it's not absolutely necessary. As long as they still have a refcount of at least one, writing to them will just be useless but not destroy any data. > + } > + return 0; > +} > + > int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, > bool exact_size) > { > diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c > index 576ab551d6..e98306acd8 100644 > --- a/block/qcow2-refcount.c > +++ b/block/qcow2-refcount.c > @@ -29,6 +29,7 @@ > #include "block/qcow2.h" > #include "qemu/range.h" > #include "qemu/bswap.h" > +#include "qemu/cutils.h" > =20 > static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t siz= e); > static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *b= s, > @@ -2936,3 +2937,67 @@ done: > qemu_vfree(new_refblock); > return ret; > } > + > +int qcow2_shrink_reftable(BlockDriverState *bs) > +{ > + BDRVQcow2State *s =3D bs->opaque; > + uint64_t *reftable_tmp =3D > + g_try_malloc(sizeof(uint64_t) * s->refcount_table_size); > + int i, ret; > + > + if (s->refcount_table_size && reftable_tmp =3D=3D NULL) { > + return -ENOMEM; > + } > + > + for (i =3D 0; i < s->refcount_table_size; i++) { > + int64_t refblock_offs =3D s->refcount_table[i] & REFT_OFFSET_M= ASK; > + void *refblock; > + bool unused_block; > + > + if (refblock_offs =3D=3D 0) { > + reftable_tmp[i] =3D 0; > + continue; > + } > + ret =3D qcow2_cache_get(bs, s->refcount_block_cache, refblock_= offs, > + &refblock); > + if (ret < 0) { > + goto out; > + } > + > + /* the refblock has own reference */ > + if (i =3D=3D refblock_offs >> (s->refcount_block_bits + s->clu= ster_bits)) { > + uint64_t blk_index =3D (refblock_offs >> s->cluster_bits) = & > + (s->refcount_block_size - 1); > + uint64_t refcount =3D 
s->get_refcount(refblock, blk_index)= ; > + > + s->set_refcount(refblock, blk_index, 0); > + > + unused_block =3D buffer_is_zero(refblock, s->refcount_bloc= k_size); s/refcount_block_size/cluster_size/ > + > + s->set_refcount(refblock, blk_index, refcount); > + } else { > + unused_block =3D buffer_is_zero(refblock, s->refcount_bloc= k_size); Same here. > + } > + qcow2_cache_put(bs, s->refcount_block_cache, &refblock); > + > + reftable_tmp[i] =3D unused_block ? 0 : cpu_to_be64(s->refcount= _table[i]); > + } > + > + ret =3D bdrv_pwrite_sync(bs->file, s->refcount_table_offset, refta= ble_tmp, > + sizeof(uint64_t) * s->refcount_table_size);= > + if (ret < 0) { > + goto out; > + } > + > + for (i =3D 0; i < s->refcount_table_size; i++) { > + if (s->refcount_table[i] && !reftable_tmp[i]) { > + qcow2_free_clusters(bs, s->refcount_table[i] & REFT_OFFSET= _MASK, > + s->cluster_size, QCOW2_DISCARD_ALWAYS)= ; This doesn't feel like a very good idea. The bdrv_pwrite_sync() before has brought the on-disk refcount structures into a different state than what we have cached. OTOH, the bdrv_pwrite_sync() has accessed only the reftable and this should only access refblocks. So I cannot think of any way this might actually do something bad. But I guess it'll be better for me to revisit this when it's not in the middle of the night (so on Friday). 
> + s->refcount_table[i] =3D 0; > + } > + } > + > +out: > + g_free(reftable_tmp); > + return ret; > +} > diff --git a/block/qcow2.c b/block/qcow2.c > index b3ba5daa93..0ad46d2776 100644 > --- a/block/qcow2.c > +++ b/block/qcow2.c > @@ -2545,6 +2545,7 @@ static int qcow2_truncate(BlockDriverState *bs, i= nt64_t offset, Error **errp) > { > BDRVQcow2State *s =3D bs->opaque; > int64_t new_l1_size; > + uint64_t total_size; > int ret; > =20 > if (offset & 511) { > @@ -2558,17 +2559,36 @@ static int qcow2_truncate(BlockDriverState *bs,= int64_t offset, Error **errp) > return -ENOTSUP; > } > =20 > - /* shrinking is currently not supported */ > - if (offset < bs->total_sectors * 512) { > - error_setg(errp, "qcow2 doesn't support shrinking images yet")= ; > - return -ENOTSUP; > - } > - > new_l1_size =3D size_to_l1(s, offset); > - ret =3D qcow2_grow_l1_table(bs, new_l1_size, true); > - if (ret < 0) { > - error_setg_errno(errp, -ret, "Failed to grow the L1 table"); > - return ret; > + total_size =3D bs->total_sectors << BDRV_SECTOR_BITS; > + > + if (offset < total_size) { > + ret =3D qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_= size), > + total_size - ROUND_UP(offset, > + s->cluster_s= ize), > + QCOW2_DISCARD_ALWAYS, true); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to discard reduced cl= asters"); s/clasters/clusters/ And maybe "truncated", "stripped", or "cropped" instead of "reduced"? > + return ret; > + } > + > + ret =3D qcow2_shrink_l1_table(bs, new_l1_size); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to reduce the L1 tabl= e"); s/reduce/shrink/ (or "truncate"; or "reduce the L1 table size") Also, to be fair, you're actually reducing the number of L2 tables, not the size of the L1 table. 
(But that's a nit pick) > + return ret; > + } > + > + ret =3D qcow2_shrink_reftable(bs); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to shrink the refcoun= t table"); And this is not really shrinking the reftable but instead discarding some refblocks (potentially). (This is a nit pick, too) Max > + return ret; > + } > + } else { > + ret =3D qcow2_grow_l1_table(bs, new_l1_size, true); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to grow the L1 table"= ); > + return ret; > + } > } > =20 > /* write updated header.size */ > diff --git a/block/qcow2.h b/block/qcow2.h > index 07faa6dc78..600463bf8e 100644 > --- a/block/qcow2.h > +++ b/block/qcow2.h > @@ -531,10 +531,12 @@ int qcow2_pre_write_overlap_check(BlockDriverStat= e *bs, int ign, int64_t offset, > int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_ord= er, > BlockDriverAmendStatusCB *status_cb, > void *cb_opaque, Error **errp); > +int qcow2_shrink_reftable(BlockDriverState *bs); > =20 > /* qcow2-cluster.c functions */ > int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, > bool exact_size); > +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size); > int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); > int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_of= fset); > int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, > diff --git a/qapi/block-core.json b/qapi/block-core.json > index f85c2235c7..bcbffa3339 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -2372,7 +2372,8 @@ > 'cluster_alloc_bytes', 'cluster_free', 'flush_to_os', > 'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_he= ad', > 'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev', > - 'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] } > + 'pwritev_zero', 'pwritev_done', 'empty_image_prepare', > + 'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] } > =20 > ## > # @BlkdebugInjectErrorOptions: >=20 
--aRn8wCFvSW9XjSfkvvBg4f6N5TRFieadu Content-Type: application/pgp-signature; name="signature.asc" Content-Description: OpenPGP digital signature Content-Disposition: attachment; filename="signature.asc" -----BEGIN PGP SIGNATURE----- Version: GnuPG v2 iQEvBAEBCAAZBQJZSvlXEhxtcmVpdHpAcmVkaGF0LmNvbQAKCRD0B9sAYdXPQMqK B/9QXXpx9gjq7siAdLpQtb/cWC9RhCsQS1srFi0rcqVKoqPn2fAlQgk0lBwQpvGw 0pNXI7+aHMoL6gAjSq18fREC3NeykI19nrsvLkzbZubXRnhHgNpNGDgSzeQQD/jh 3gilPvNAdMEUYMFItdf2hveu6LZW9hbqp+MWYMf34Cy3LnCqvKsKLfaTrpAFRON0 SAuWpCffNtt+ElXFCfqOJvHvzKOiwHei83J/7kWhIBVefwr6o42dfwUWrh4YhJhu dQc8+jIt7hk7COTj2picI0bK0fceG3QTekt8ixmzQNdI9q7I15if+/2TujAZOTMo RmG+JsgisPiJwa0kgPV8BVcg =5a/k -----END PGP SIGNATURE----- --aRn8wCFvSW9XjSfkvvBg4f6N5TRFieadu--