All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kevin Wolf <kwolf@redhat.com>
To: qemu-block@nongnu.org
Cc: kwolf@redhat.com, peter.maydell@linaro.org, qemu-devel@nongnu.org
Subject: [Qemu-devel] [PULL 22/24] qcow2: add shrink image support
Date: Tue, 26 Sep 2017 16:21:31 +0200	[thread overview]
Message-ID: <20170926142133.2498-23-kwolf@redhat.com> (raw)
In-Reply-To: <20170926142133.2498-1-kwolf@redhat.com>

From: Pavel Butsykin <pbutsykin@virtuozzo.com>

This patch add shrinking of the image file for qcow2. As a result, this allows
us to reduce the virtual image size and free up space on the disk without
copying the image. Image can be fragmented and shrink is done by punching holes
in the image file.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20170918124230.8152-4-pbutsykin@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qapi/block-core.json   |   8 +++-
 block/qcow2.h          |  14 ++++++
 block/qcow2-cluster.c  |  50 +++++++++++++++++++++
 block/qcow2-refcount.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.c          |  43 ++++++++++++++----
 5 files changed, 225 insertions(+), 10 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index c69a395804..750bb0c77c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2533,6 +2533,11 @@
 #
 # Trigger events supported by blkdebug.
 #
+# @l1_shrink_write_table:      write zeros to the l1 table to shrink image.
+#                              (since 2.11)
+#
+# @l1_shrink_free_l2_clusters: discard the l2 tables. (since 2.11)
+#
 # Since: 2.9
 ##
 { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
@@ -2549,7 +2554,8 @@
             'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
             'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_head',
             'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
-            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
+            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
+            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
 
 ##
 # @BlkdebugInjectErrorOptions:
diff --git a/block/qcow2.h b/block/qcow2.h
index 52c374e9ed..5a289a81e2 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -521,6 +521,18 @@ static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
     return r1 > r2 ? r1 - r2 : r2 - r1;
 }
 
+static inline
+uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
+{
+    return offset >> (s->refcount_block_bits + s->cluster_bits);
+}
+
+static inline uint64_t get_refblock_offset(BDRVQcow2State *s, uint64_t offset)
+{
+    uint32_t index = offset_to_reftable_index(s, offset);
+    return s->refcount_table[index] & REFT_OFFSET_MASK;
+}
+
 /* qcow2.c functions */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                   int64_t sector_num, int nb_sectors);
@@ -584,10 +596,12 @@ int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                 BlockDriverAmendStatusCB *status_cb,
                                 void *cb_opaque, Error **errp);
+int qcow2_shrink_reftable(BlockDriverState *bs);
 
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 0d4824993c..d2518d1893 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -32,6 +32,56 @@
 #include "qemu/bswap.h"
 #include "trace.h"
 
+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int new_l1_size, i, ret;
+
+    if (exact_size >= s->l1_size) {
+        return 0;
+    }
+
+    new_l1_size = exact_size;
+
+#ifdef DEBUG_ALLOC2
+    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
+#endif
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
+    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
+                                       new_l1_size * sizeof(uint64_t),
+                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = bdrv_flush(bs->file->bs);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
+    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
+        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
+            continue;
+        }
+        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
+                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
+        s->l1_table[i] = 0;
+    }
+    return 0;
+
+fail:
+    /*
+     * If the write in the l1_table failed the image may contain a partially
+     * overwritten l1_table. In this case it would be better to clear the
+     * l1_table in memory to avoid possible image corruption.
+     */
+    memset(s->l1_table + new_l1_size, 0,
+           (s->l1_size - new_l1_size) * sizeof(uint64_t));
+    return ret;
+}
+
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size)
 {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 8c17c0e3aa..88d5a3f1ad 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@
 #include "block/qcow2.h"
 #include "qemu/range.h"
 #include "qemu/bswap.h"
+#include "qemu/cutils.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
@@ -3061,3 +3062,122 @@ done:
     qemu_vfree(new_refblock);
     return ret;
 }
+
+static int qcow2_discard_refcount_block(BlockDriverState *bs,
+                                        uint64_t discard_block_offs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t refblock_offs = get_refblock_offset(s, discard_block_offs);
+    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
+    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
+    void *refblock;
+    int ret;
+
+    assert(discard_block_offs != 0);
+
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
+                          &refblock);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (s->get_refcount(refblock, block_index) != 1) {
+        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
+                                " refblock offset %#" PRIx64
+                                ", reftable index %u"
+                                ", block offset %#" PRIx64
+                                ", refcount %#" PRIx64,
+                                refblock_offs,
+                                offset_to_reftable_index(s, discard_block_offs),
+                                discard_block_offs,
+                                s->get_refcount(refblock, block_index));
+        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+        return -EINVAL;
+    }
+    s->set_refcount(refblock, block_index, 0);
+
+    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, refblock);
+
+    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+
+    if (cluster_index < s->free_cluster_index) {
+        s->free_cluster_index = cluster_index;
+    }
+
+    refblock = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
+                                           discard_block_offs);
+    if (refblock) {
+        /* discard refblock from the cache if refblock is cached */
+        qcow2_cache_discard(bs, s->refcount_block_cache, refblock);
+    }
+    update_refcount_discard(bs, discard_block_offs, s->cluster_size);
+
+    return 0;
+}
+
+int qcow2_shrink_reftable(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t *reftable_tmp =
+        g_malloc(s->refcount_table_size * sizeof(uint64_t));
+    int i, ret;
+
+    for (i = 0; i < s->refcount_table_size; i++) {
+        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
+        void *refblock;
+        bool unused_block;
+
+        if (refblock_offs == 0) {
+            reftable_tmp[i] = 0;
+            continue;
+        }
+        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
+                              &refblock);
+        if (ret < 0) {
+            goto out;
+        }
+
+        /* the refblock has own reference */
+        if (i == offset_to_reftable_index(s, refblock_offs)) {
+            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
+                                   (s->refcount_block_size - 1);
+            uint64_t refcount = s->get_refcount(refblock, block_index);
+
+            s->set_refcount(refblock, block_index, 0);
+
+            unused_block = buffer_is_zero(refblock, s->cluster_size);
+
+            s->set_refcount(refblock, block_index, refcount);
+        } else {
+            unused_block = buffer_is_zero(refblock, s->cluster_size);
+        }
+        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+
+        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
+                           s->refcount_table_size * sizeof(uint64_t));
+    /*
+     * If the write in the reftable failed the image may contain a partially
+     * overwritten reftable. In this case it would be better to clear the
+     * reftable in memory to avoid possible image corruption.
+     */
+    for (i = 0; i < s->refcount_table_size; i++) {
+        if (s->refcount_table[i] && !reftable_tmp[i]) {
+            if (ret == 0) {
+                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
+                                                       REFT_OFFSET_MASK);
+            }
+            s->refcount_table[i] = 0;
+        }
+    }
+
+    if (!s->cache_discards) {
+        qcow2_process_discards(bs, ret);
+    }
+
+out:
+    g_free(reftable_tmp);
+    return ret;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index d33fb3ecdd..970006fc1d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3104,18 +3104,43 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
     }
 
     old_length = bs->total_sectors * 512;
+    new_l1_size = size_to_l1(s, offset);
 
-    /* shrinking is currently not supported */
     if (offset < old_length) {
-        error_setg(errp, "qcow2 doesn't support shrinking images yet");
-        return -ENOTSUP;
-    }
+        if (prealloc != PREALLOC_MODE_OFF) {
+            error_setg(errp,
+                       "Preallocation can't be used for shrinking an image");
+            return -EINVAL;
+        }
 
-    new_l1_size = size_to_l1(s, offset);
-    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
-        return ret;
+        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
+                                    old_length - ROUND_UP(offset,
+                                                          s->cluster_size),
+                                    QCOW2_DISCARD_ALWAYS, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
+            return ret;
+        }
+
+        ret = qcow2_shrink_l1_table(bs, new_l1_size);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret,
+                             "Failed to reduce the number of L2 tables");
+            return ret;
+        }
+
+        ret = qcow2_shrink_reftable(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret,
+                             "Failed to discard unused refblocks");
+            return ret;
+        }
+    } else {
+        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
+            return ret;
+        }
     }
 
     switch (prealloc) {
-- 
2.13.5

  parent reply	other threads:[~2017-09-26 14:22 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-26 14:21 [Qemu-devel] [PULL 00/24] Block layer patches Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 01/24] qemu-iotests: Add missing -machine accel=qtest Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 02/24] qemu-img: Clarify about relative backing file options Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 03/24] file-posix: Clear out first sector in hdev_create Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 04/24] docs: add qemu-block-drivers(7) man page Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 05/24] iotests: use -ccw on s390x for 040, 139, and 182 Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 06/24] iotests: use -ccw on s390x for 051 Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 07/24] iotests: use virtio aliases for 067 Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 08/24] iotests: Print full path of bad output if mismatch Kevin Wolf
2017-09-26 14:56   ` Eric Blake
2017-09-26 15:25     ` Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 09/24] throttle: Assert that bkt->max is valid in throttle_compute_wait() Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 10/24] block/throttle-groups.c: allocate RestartData on the heap Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 11/24] block: Clean up some bad code in the vvfat driver Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 12/24] qemu-io: Drop write permissions before read-only reopen Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 13/24] block: Add reopen_queue to bdrv_child_perm() Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 14/24] block: Add reopen queue to bdrv_check_perm() Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 15/24] block: Base permissions on rw state after reopen Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 16/24] block: reopen: Queue children after their parents Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 17/24] block: Fix permissions after bdrv_reopen() Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 18/24] qemu-iotests: Test change-backing-file command Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 19/24] iotests: fix 181: enable postcopy-ram capability on target Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 20/24] qemu-img: add --shrink flag for resize Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 21/24] qcow2: add qcow2_cache_discard Kevin Wolf
2017-09-26 14:21 ` Kevin Wolf [this message]
2017-09-26 14:21 ` [Qemu-devel] [PULL 23/24] qemu-iotests: add shrinking image test Kevin Wolf
2017-09-26 14:21 ` [Qemu-devel] [PULL 24/24] block/qcow2-bitmap: fix use of uninitialized pointer Kevin Wolf
2017-09-27 17:20 ` [Qemu-devel] [PULL 00/24] Block layer patches Peter Maydell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170926142133.2498-23-kwolf@redhat.com \
    --to=kwolf@redhat.com \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.