All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
@ 2018-05-02 13:34 Ivan Ren
  2018-05-02 14:02 ` Eric Blake
  2018-05-02 14:37 ` Max Reitz
  0 siblings, 2 replies; 12+ messages in thread
From: Ivan Ren @ 2018-05-02 13:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-block, kwolf, mreitz

qemu-img info with a block device which has a qcow2 format always
return 0 for disk size, and this can not reflect the qcow2 size
and the used space of the block device. This patch return the
allocated size of qcow2 as the disk size.

Signed-off-by: Ivan Ren <ivanren@tencent.com>
---
 block/qcow2-bitmap.c |  69 +++++++++++++++++
 block/qcow2.c        | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.h        |  42 ++++++++++
 3 files changed, 323 insertions(+)

diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
index 6e93ec4..2957a7a 100644
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@@ -636,6 +636,75 @@ fail:
     return NULL;
 }
 
+/*
+ * Get the highest allocated cluster index of bitmap
+ */
+int qcow2_get_bitmap_allocated_clusters(BlockDriverState *bs,
+                                        int64_t *p_highest_cluster_index,
+                                        int64_t nb_clusters)
+{
+    int ret;
+    BDRVQcow2State *s = bs->opaque;
+    Qcow2BitmapList *bm_list = NULL;
+    Qcow2Bitmap *bm;
+
+    if (s->nb_bitmaps == 0) {
+        return 0;
+    }
+
+    ret = update_highest_cluster_index(s, s->bitmap_directory_offset,
+                                       s->bitmap_directory_size,
+                                       p_highest_cluster_index, nb_clusters);
+    if (ret == 1) {
+        goto out;
+    }
+
+    bm_list = bitmap_list_load(bs, s->bitmap_directory_offset,
+                               s->bitmap_directory_size, NULL);
+    if (bm_list == NULL) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    QSIMPLEQ_FOREACH(bm, bm_list, entry) {
+        uint64_t *bitmap_table = NULL;
+        int i;
+
+        ret = update_highest_cluster_index(s, bm->table.offset,
+                                        bm->table.size * sizeof(uint64_t),
+                                        p_highest_cluster_index, nb_clusters);
+        if (ret == 1) {
+            goto out;
+        }
+
+        ret = bitmap_table_load(bs, &bm->table, &bitmap_table);
+        if (ret < 0) {
+            goto out;
+        }
+
+        for (i = 0; i < bm->table.size; ++i) {
+            uint64_t entry = bitmap_table[i];
+            uint64_t offset = entry & BME_TABLE_ENTRY_OFFSET_MASK;
+
+            /* ignore entry legality */
+            if (offset == 0) {
+                continue;
+            }
+
+            ret = update_highest_cluster_index(s, offset, s->cluster_size,
+                                         p_highest_cluster_index, nb_clusters);
+            if (ret == 1) {
+                g_free(bitmap_table);
+                goto out;
+            }
+        }
+        g_free(bitmap_table);
+    }
+out:
+    bitmap_list_free(bm_list);
+    return ret;
+}
+
 int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                                   void **refcount_table,
                                   int64_t *refcount_table_size)
diff --git a/block/qcow2.c b/block/qcow2.c
index ef68772..fa251af 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -4493,6 +4493,217 @@ static QemuOptsList qcow2_create_opts = {
     }
 };
 
+static int qcow2_get_l2_allocated_size(BlockDriverState *bs,
+                                     uint64_t l2_table_offset,
+                                     int64_t *p_highest_cluster_index,
+                                     int64_t nb_clusters)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t *l2_table = NULL, l2_entry, cluster_offset;
+    int l2_size, ret, i, nb_csectors;
+
+    /* malloc L2 table */
+    l2_size = s->l2_size * sizeof(uint64_t);
+    if (l2_size > 0) {
+        l2_table = g_malloc(l2_size);
+        if (l2_table == NULL) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        /* load l2 table from image file */
+        ret = bdrv_pread(bs->file, l2_table_offset, l2_table, l2_size);
+        if (ret < 0 || ret != l2_size) {
+            goto out;
+        }
+
+        ret = update_highest_cluster_index(s, l2_table_offset, l2_size,
+                p_highest_cluster_index, nb_clusters);
+        if (ret == 1) {
+            goto out;
+        }
+    }
+
+    for (i = 0; i < s->l2_size; i++) {
+        l2_entry = be64_to_cpu(l2_table[i]);
+        switch (qcow2_get_cluster_type(l2_entry)) {
+        case QCOW2_CLUSTER_COMPRESSED:
+            nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1;
+            l2_entry &= s->cluster_offset_mask;
+            ret = update_highest_cluster_index(s, l2_entry & ~511,
+                                        nb_csectors * 512,
+                                        p_highest_cluster_index, nb_clusters);
+            if (ret == 1) {
+                goto out;
+            }
+            break;
+        case QCOW2_CLUSTER_ZERO_ALLOC:
+        case QCOW2_CLUSTER_NORMAL:
+            cluster_offset = l2_entry & L2E_OFFSET_MASK;
+
+            ret = update_highest_cluster_index(s, cluster_offset,
+                                        s->cluster_size,
+                                        p_highest_cluster_index, nb_clusters);
+            if (ret == 1) {
+                goto out;
+            }
+
+            break;
+        case QCOW2_CLUSTER_ZERO_PLAIN:
+        case QCOW2_CLUSTER_UNALLOCATED:
+            break;
+        default:
+            abort(); /* some error happen */
+        }
+    }
+    ret = 0;
+out:
+    g_free(l2_table);
+    return ret;
+}
+
+static int qcow2_get_l1_allocated_size(BlockDriverState *bs,
+                                     uint64_t l1_table_offset,
+                                     int l1_size,
+                                     int64_t *p_highest_cluster_index,
+                                     int64_t nb_clusters)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t *l1_table = NULL, l2_offset;
+    int i, ret, l1_size2;
+
+    /* malloca l1 table memory */
+    l1_size2 = l1_size * sizeof(uint64_t);
+    if (l1_size2 > 0) {
+        l1_table = g_malloc(l1_size2);
+        if (!l1_table) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+        if (ret < 0 || ret != l1_size2) {
+            goto out;
+        }
+
+        ret = update_highest_cluster_index(s, l1_table_offset, l1_size2,
+                                        p_highest_cluster_index, nb_clusters);
+        if (ret == 1) /* has reached the end */
+            goto out;
+    }
+
+
+    for (i = 0; i < l1_size; i++) {
+        be64_to_cpus(&l1_table[i]);
+        l2_offset = l1_table[i];
+        if (l2_offset) {
+            l2_offset &= L1E_OFFSET_MASK;
+            ret = qcow2_get_l2_allocated_size(bs, l2_offset,
+                                p_highest_cluster_index,  nb_clusters);
+            if (ret <  0 || ret == 1) {
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+out:
+    g_free(l1_table);
+    return ret;
+
+}
+
+/* Get block device allocated space for qcow2, aligned with cluster_size */
+static int64_t qcow2_get_block_allocated_size(BlockDriverState *bs)
+{
+    int64_t file_size, file_max_clusters;
+    uint64_t offset, allocated_size;
+    int64_t highest_cluster_index = 0;
+    int i, ret;
+    BDRVQcow2State *s = bs->opaque;
+    QCowSnapshot *sn;
+
+    file_size = bdrv_getlength(bs->file->bs);
+    file_max_clusters = size_to_clusters(s, file_size);
+
+    /* current image map */
+    ret = qcow2_get_l1_allocated_size(bs, s->l1_table_offset,
+                                          s->l1_size, &highest_cluster_index,
+                                          file_max_clusters);
+    if (ret < 0 || ret == 1) {
+        goto out;
+    }
+
+    /* snapshot */
+    for (i = 0; i < s->nb_snapshots; i++) {
+        sn = s->snapshots + i;
+        ret = qcow2_get_l1_allocated_size(bs, sn->l1_table_offset,
+                                            sn->l1_size, &highest_cluster_index,
+                                            file_max_clusters);
+        if (ret < 0 || ret == 1) {
+            goto out;
+        }
+    }
+    ret = update_highest_cluster_index(s, s->snapshots_offset,
+                  s->snapshots_size, &highest_cluster_index, file_max_clusters);
+    if (ret == 1) {
+        goto out;
+    }
+
+    /* refcount table */
+    ret = update_highest_cluster_index(s, s->refcount_table_offset,
+                                 s->refcount_table_size * sizeof(uint64_t),
+                                 &highest_cluster_index, file_max_clusters);
+    if (ret == 1) {
+        goto out;
+    }
+    for (i = 0; i < s->refcount_table_size; i++) {
+        offset = s->refcount_table[i];
+        ret = update_highest_cluster_index(s, offset, s->cluster_size,
+                                     &highest_cluster_index, file_max_clusters);
+        if (ret == 1) {
+            goto out;
+        }
+    }
+
+    /* encryption */
+    if (s->crypto_header.length) {
+        ret = update_highest_cluster_index(s, s->crypto_header.offset,
+                                    s->crypto_header.length,
+                                    &highest_cluster_index, file_max_clusters);
+        if (ret == 1) {
+            goto out;
+        }
+    }
+
+    /* bitmap */
+    ret = qcow2_get_bitmap_allocated_clusters(bs, &highest_cluster_index,
+                                        file_max_clusters);
+out:
+    if (ret < 0) {
+        allocated_size = 0; /* if any error happen, return zero */
+    } else {
+        allocated_size = MIN((highest_cluster_index + 1) * s->cluster_size,
+                              file_size);
+    }
+
+    return allocated_size;
+}
+
+static int64_t qcow2_get_allocated_file_size(BlockDriverState *bs)
+{
+    struct stat st;
+    if (stat(bs->filename, &st) < 0 || !S_ISBLK(st.st_mode)) {
+        goto get_file_size;
+    }
+
+    return qcow2_get_block_allocated_size(bs);
+
+get_file_size:
+    if (bs->file) {
+        return bdrv_get_allocated_file_size(bs->file->bs);
+    }
+    return -ENOTSUP;
+}
+
 BlockDriver bdrv_qcow2 = {
     .format_name        = "qcow2",
     .instance_size      = sizeof(BDRVQcow2State),
@@ -4516,6 +4727,7 @@ BlockDriver bdrv_qcow2 = {
     .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
     .bdrv_co_pdiscard       = qcow2_co_pdiscard,
     .bdrv_truncate          = qcow2_truncate,
+    .bdrv_get_allocated_file_size = qcow2_get_allocated_file_size,
     .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
     .bdrv_make_empty        = qcow2_make_empty,
 
diff --git a/block/qcow2.h b/block/qcow2.h
index adf5c39..9ed3771 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -466,6 +466,45 @@ static inline int64_t size_to_l1(BDRVQcow2State *s, int64_t size)
     return (size + (1ULL << shift) - 1) >> shift;
 }
 
+static inline int64_t offset_to_cluster_index(BDRVQcow2State *s,
+                                              uint64_t offset)
+{
+    return offset >> s->cluster_bits;
+}
+
+/*
+ * Return value may be 0 or 1, 1 means the highest cluster
+ * index has reached the max device size, so There is no need
+ * to continue scanning. @*p_highest_cluster_index has the
+ * new highest cluster index
+ */
+static inline int update_highest_cluster_index(BDRVQcow2State *s,
+                                        int64_t offset,
+                                        int64_t size,
+                                        int64_t *p_highest_cluster_index,
+                                        int64_t nb_clusters)
+{
+    int64_t cluster_index;
+    cluster_index = offset_to_cluster_index(s, offset + size - 1);
+
+    /* If necessary update the new highest index.
+     * Cluster Index exceed the device will be ignored.
+     */
+    if (cluster_index > *p_highest_cluster_index
+                        && cluster_index < nb_clusters) {
+        *p_highest_cluster_index = cluster_index;
+    } else {
+        return 0;
+    }
+
+
+    if (*p_highest_cluster_index == (nb_clusters - 1)) {
+        return 1; /* have reached max */
+    } else {
+        return 0;
+    }
+}
+
 static inline int offset_to_l1_index(BDRVQcow2State *s, uint64_t offset)
 {
     return offset >> (s->l2_bits + s->cluster_bits);
@@ -668,6 +707,9 @@ void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset);
 void qcow2_cache_discard(Qcow2Cache *c, void *table);
 
 /* qcow2-bitmap.c functions */
+int qcow2_get_bitmap_allocated_clusters(BlockDriverState *bs,
+                                        int64_t *p_highest_cluster_index,
+                                        int64_t nb_clusters);
 int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                                   void **refcount_table,
                                   int64_t *refcount_table_size);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 13:34 [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format Ivan Ren
@ 2018-05-02 14:02 ` Eric Blake
  2018-05-03 13:06   ` 叶残风
  2018-05-02 14:37 ` Max Reitz
  1 sibling, 1 reply; 12+ messages in thread
From: Eric Blake @ 2018-05-02 14:02 UTC (permalink / raw)
  To: Ivan Ren, qemu-devel; +Cc: kwolf, qemu-block, mreitz

On 05/02/2018 08:34 AM, Ivan Ren wrote:
> qemu-img info with a block device which has a qcow2 format always
> return 0 for disk size, and this can not reflect the qcow2 size
> and the used space of the block device. This patch return the
> allocated size of qcow2 as the disk size.

How does this differ from what qemu can already give you at runtime via 
the wr_highest_offset property in BlockDeviceStats, and related to the 
write_threshold ('block-set-write-threshold command, 
BLOCK_WRITE_THRESHOLD event)?  Is there any code we can reuse, rather 
than writing something from scratch?

> 
> Signed-off-by: Ivan Ren <ivanren@tencent.com>
> ---
>   block/qcow2-bitmap.c |  69 +++++++++++++++++
>   block/qcow2.c        | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++
>   block/qcow2.h        |  42 ++++++++++
>   3 files changed, 323 insertions(+)
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 13:34 [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format Ivan Ren
  2018-05-02 14:02 ` Eric Blake
@ 2018-05-02 14:37 ` Max Reitz
  2018-05-02 15:01   ` Eric Blake
  2018-05-03 13:08   ` 叶残风
  1 sibling, 2 replies; 12+ messages in thread
From: Max Reitz @ 2018-05-02 14:37 UTC (permalink / raw)
  To: Ivan Ren, qemu-devel; +Cc: qemu-block, kwolf

[-- Attachment #1: Type: text/plain, Size: 3077 bytes --]

Hi,

[replying to this version because the previous mail doesn't seem to have
made it to the mailing lists for whatever reason]

On 2018-05-02 15:34, Ivan Ren wrote:
> qemu-img info with a block device which has a qcow2 format always
> return 0 for disk size, and this can not reflect the qcow2 size
> and the used space of the block device. This patch return the
> allocated size of qcow2 as the disk size.

I'm not quite sure whether you really need this information for block
devices (I tend to agree with Eric that wr_highest_cluster is the more
important information there), but I can imagine it just being nice to have.

So the basic idea makes sense to me, but I think the implementation can
be simplified and the reporting in qemu-img should be done differently.

> 
> Signed-off-by: Ivan Ren <ivanren@tencent.com>
> ---
>  block/qcow2-bitmap.c |  69 +++++++++++++++++
>  block/qcow2.c        | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.h        |  42 ++++++++++
>  3 files changed, 323 insertions(+)

The whole implementation reminds me a lot of qcow2's check function,
which basically just recalculates the refcounts.  So I'm wondering
whether you could just count how many clusters with non-0 refcount there
are and thus simplify the implementation dramatically.

[...]

> +static int64_t qcow2_get_allocated_file_size(BlockDriverState *bs)
> +{
> +    struct stat st;
> +    if (stat(bs->filename, &st) < 0 || !S_ISBLK(st.st_mode)) {
> +        goto get_file_size;
> +    }

This definitely doesn't work because nobody guarantees that bs->filename
is something that stat() can work with.  I'm aware that you need to do
the S_ISBLK() check somewhere, but the qcow2 driver is not the right place.

I don't really have a good way around this, though.  These things come
to mind:

(1) We could let file-posix report an error for S_ISBLK because the
information is known to be usually useless -- but I think that is not
quite the right thing to do because maybe some block devices do report
useful information there, I don't know.

(2) Or we introduce a new field in qemu-img info (and thus in ImageInfo,
too, I suppose?).  qemu-img info (or rather bdrv_query_image_info())
could detect whether the format layer supports
bdrv_get_allocated_file_size, and if so, it emits that information
separately from the allocated size of bs->file->bs.  But that would
break vmdk...

(3) As a hack, qcow2_get_allocated_file_size() could first always call
bdrv_get_allocated_file_size(bs->file->bs), and if that returns 0 (which
is absolutely impossible for qcow2 files because they have an image
header that takes up some space), we fall back to
qcow2_get_block_allocated_size().  While I consider it a hack, I can't
come up with a scenario where it wouldn't work.

Max

> +
> +    return qcow2_get_block_allocated_size(bs);
> +
> +get_file_size:
> +    if (bs->file) {
> +        return bdrv_get_allocated_file_size(bs->file->bs);
> +    }
> +    return -ENOTSUP;
> +}
> +


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 14:37 ` Max Reitz
@ 2018-05-02 15:01   ` Eric Blake
  2018-05-02 15:13     ` Max Reitz
  2018-05-03 13:08   ` 叶残风
  1 sibling, 1 reply; 12+ messages in thread
From: Eric Blake @ 2018-05-02 15:01 UTC (permalink / raw)
  To: Max Reitz, Ivan Ren, qemu-devel; +Cc: kwolf, qemu-block

On 05/02/2018 09:37 AM, Max Reitz wrote:
> On 2018-05-02 15:34, Ivan Ren wrote:
>> qemu-img info with a block device which has a qcow2 format always
>> return 0 for disk size, and this can not reflect the qcow2 size
>> and the used space of the block device. This patch return the
>> allocated size of qcow2 as the disk size.
> 
> I'm not quite sure whether you really need this information for block
> devices (I tend to agree with Eric that wr_highest_cluster is the more
> important information there), but I can imagine it just being nice to have.

Hmm, so in an extreme case, if you create an internal snapshot, then the 
guest makes edits, then you remove the internal snapshot, we have a 
wr_highest_offset that has advanced (because the guest changes had to 
allocate new clusters due to COW of refcount=2 clusters); but the 
deleted snapshot now means we have a lot of unused clusters earlier in 
the image (deleting the snapshot took refcount=2 clusters back to 1, and 
any COW'd clusters edited after the internal snapshot means the snapshot 
version is now back to refcount=0, whether or not we also try to punch a 
hole in the protocol layer for those freed clusters).  Thus, reporting 
the highest written cluster is a larger number than the number of 
clusters that are actually in use, and both numbers might be useful to 
know (how big do I have to size my block device, and how utilized is my 
block device), especially if we add code for online compaction or 
defragmentation of a qcow2 image so that we can move higher offsets into 
holes left earlier in the image.

If you don't use internal snapshots, the only way to get holes of 
unallocated clusters earlier in the image is if the guest uses TRIM 
operations, and I'm not sure if that's easier or harder to trigger, nor 
which approach (internal snapshots vs. guest TRIM operations) is likely 
to leave more holes of unallocated clusters.

> 
> The whole implementation reminds me a lot of qcow2's check function,
> which basically just recalculates the refcounts.  So I'm wondering
> whether you could just count how many clusters with non-0 refcount there
> are and thus simplify the implementation dramatically.

We also recently added 'qemu-img measure', which DOES report how many 
clusters are in use.  Is any of that reusable here?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 15:01   ` Eric Blake
@ 2018-05-02 15:13     ` Max Reitz
  2018-05-02 15:19       ` Eric Blake
  0 siblings, 1 reply; 12+ messages in thread
From: Max Reitz @ 2018-05-02 15:13 UTC (permalink / raw)
  To: Eric Blake, Ivan Ren, qemu-devel; +Cc: kwolf, qemu-block

[-- Attachment #1: Type: text/plain, Size: 3385 bytes --]

On 2018-05-02 17:01, Eric Blake wrote:
> On 05/02/2018 09:37 AM, Max Reitz wrote:
>> On 2018-05-02 15:34, Ivan Ren wrote:
>>> qemu-img info with a block device which has a qcow2 format always
>>> return 0 for disk size, and this can not reflect the qcow2 size
>>> and the used space of the block device. This patch return the
>>> allocated size of qcow2 as the disk size.
>>
>> I'm not quite sure whether you really need this information for block
>> devices (I tend to agree with Eric that wr_highest_cluster is the more
>> important information there), but I can imagine it just being nice to
>> have.
> 
> Hmm, so in an extreme case, if you create an internal snapshot, then the
> guest makes edits, then you remove the internal snapshot, we have a
> wr_highest_offset that has advanced (because the guest changes had to
> allocate new clusters due to COW of refcount=2 clusters); but the
> deleted snapshot now means we have a lot of unused clusters earlier in
> the image (deleting the snapshot took refcount=2 clusters back to 1, and
> any COW'd clusters edited after the internal snapshot means the snapshot
> version is now back to refcount=0, whether or not we also try to punch a
> hole in the protocol layer for those freed clusters).  Thus, reporting
> the highest written cluster is a larger number than the number of
> clusters that are actually in use, and both numbers might be useful to
> know (how big do I have to size my block device, and how utilized is my
> block device), especially if we add code for online compaction or
> defragmentation of a qcow2 image so that we can move higher offsets into
> holes left earlier in the image.

In any case, since a block device is linear you really need to know the
highest offset that is in use.  Sure, if you know how much space there
is wasted in holes in the middle of the file, that is nice to know, but
it doesn't tell you when you need to grow your block device.

(It does tell you when to consider qemu-img convert or a mirror job to
defragment the image, though...)

> If you don't use internal snapshots, the only way to get holes of
> unallocated clusters earlier in the image is if the guest uses TRIM
> operations, and I'm not sure if that's easier or harder to trigger, nor
> which approach (internal snapshots vs. guest TRIM operations) is likely
> to leave more holes of unallocated clusters.

And I think we (at least used to) have quirks in qcow2's allocation
algorithm that meant it could leave some clusters unallocated in the
middle of the image (I think that was in case you allocate more than a
single cluster (e.g. for an L1 table), but then you also need to
allocate a new refblock, so you allocate that first and then you only
start to allocate clusters after that refblock, even though there might
still be space in front of it).

>> The whole implementation reminds me a lot of qcow2's check function,
>> which basically just recalculates the refcounts.  So I'm wondering
>> whether you could just count how many clusters with non-0 refcount there
>> are and thus simplify the implementation dramatically.
> 
> We also recently added 'qemu-img measure', which DOES report how many
> clusters are in use.  Is any of that reusable here?

It only tells you that information for a hypothetical new image, though,
doesn't it?

Max


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 15:13     ` Max Reitz
@ 2018-05-02 15:19       ` Eric Blake
  2018-05-02 15:33         ` Max Reitz
  0 siblings, 1 reply; 12+ messages in thread
From: Eric Blake @ 2018-05-02 15:19 UTC (permalink / raw)
  To: Max Reitz, Ivan Ren, qemu-devel; +Cc: kwolf, qemu-block

On 05/02/2018 10:13 AM, Max Reitz wrote:

>> We also recently added 'qemu-img measure', which DOES report how many
>> clusters are in use.  Is any of that reusable here?
> 
> It only tells you that information for a hypothetical new image, though,
> doesn't it?

It has two uses: with just a size, estimate the overhead needed to 
create a file with the given format and exposing the given size to the 
guest (both sparse and fully allocated sizes are estimated); and with a 
pre-existing image, compute the exact overhead needed for 'qemu-img 
convert' to create a new image with the given format and the same 
guest-visible contents.  The latter use, where it uses an existing guest 
image as the starting point to measure, indeed only tells you what a 
hypothetical new image will occupy (the used cluster count, not the 
wr_highest_offset count) - but isn't that what you want?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 15:19       ` Eric Blake
@ 2018-05-02 15:33         ` Max Reitz
  0 siblings, 0 replies; 12+ messages in thread
From: Max Reitz @ 2018-05-02 15:33 UTC (permalink / raw)
  To: Eric Blake, Ivan Ren, qemu-devel; +Cc: kwolf, qemu-block

[-- Attachment #1: Type: text/plain, Size: 1561 bytes --]

On 2018-05-02 17:19, Eric Blake wrote:
> On 05/02/2018 10:13 AM, Max Reitz wrote:
> 
>>> We also recently added 'qemu-img measure', which DOES report how many
>>> clusters are in use.  Is any of that reusable here?
>>
>> It only tells you that information for a hypothetical new image, though,
>> doesn't it?
> 
> It has two uses: with just a size, estimate the overhead needed to
> create a file with the given format and exposing the given size to the
> guest (both sparse and fully allocated sizes are estimated); and with a
> pre-existing image, compute the exact overhead needed for 'qemu-img
> convert' to create a new image with the given format and the same
> guest-visible contents.  The latter use, where it uses an existing guest
> image as the starting point to measure, indeed only tells you what a
> hypothetical new image will occupy (the used cluster count, not the
> wr_highest_offset count) - but isn't that what you want?

As far as I can tell, it only uses allocation status of the active layer
to determine how much of the hypothetical new image would be allocated
(the man page says "as if converting an existing image file using
qemu-img convert").  But this patch tries to determine exactly how much
is allocated in the current image for everything, not just how much you
need to have allocated to represent the active layer.

That may include snapshots, but it also includes bitmaps, or allocated
zero clusters (which qemu-img measure discounts), or things like empty
L2 tables.  So, everything.

Max


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 14:02 ` Eric Blake
@ 2018-05-03 13:06   ` 叶残风
  2018-05-04 14:35     ` Max Reitz
  0 siblings, 1 reply; 12+ messages in thread
From: 叶残风 @ 2018-05-03 13:06 UTC (permalink / raw)
  To: eblake; +Cc: qemu-devel, kwolf, qemu-block, mreitz

Thanks for your review, Eric.
Yes, the wr_highest_offset can tell the end offset at runtime, and
write_threshold similar to it. But in my situation, I need to know
the allocated end without a vm running.

Eric Blake <eblake@redhat.com> 于2018年5月2日周三 下午10:02写道:

> On 05/02/2018 08:34 AM, Ivan Ren wrote:
> > qemu-img info with a block device which has a qcow2 format always
> > return 0 for disk size, and this can not reflect the qcow2 size
> > and the used space of the block device. This patch return the
> > allocated size of qcow2 as the disk size.
>
> How does this differ from what qemu can already give you at runtime via
> the wr_highest_offset property in BlockDeviceStats, and related to the
> write_threshold ('block-set-write-threshold command,
> BLOCK_WRITE_THRESHOLD event)?  Is there any code we can reuse, rather
> than writing something from scratch?
>
> >
> > Signed-off-by: Ivan Ren <ivanren@tencent.com>
> > ---
> >   block/qcow2-bitmap.c |  69 +++++++++++++++++
> >   block/qcow2.c        | 212
> +++++++++++++++++++++++++++++++++++++++++++++++++++
> >   block/qcow2.h        |  42 ++++++++++
> >   3 files changed, 323 insertions(+)
> >
>
> --
> Eric Blake, Principal Software Engineer
> Red Hat, Inc.           +1-919-301-3266
> Virtualization:  qemu.org | libvirt.org
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-02 14:37 ` Max Reitz
  2018-05-02 15:01   ` Eric Blake
@ 2018-05-03 13:08   ` 叶残风
  2018-05-04 14:27     ` Max Reitz
  1 sibling, 1 reply; 12+ messages in thread
From: 叶残风 @ 2018-05-03 13:08 UTC (permalink / raw)
  To: mreitz; +Cc: qemu-devel, qemu-block, kwolf

> The whole implementation reminds me a lot of qcow2's check function,
> which basically just recalculates the refcounts.  So I'm wondering
> whether you could just count how many clusters with non-0 refcount there
> are and thus simplify the implementation dramatically.

Thanks for your review, Max.

Yes, just get the highest non-0 refcount cluster index can simply get the
end offset. But in some situations(such as some errors happen), a cluster
is indexed in index table, but the refcount may be 0 error, just like the
qcow2 check inconsistency. So I traverse the whole index part, include L1,
L2, snapshot and so on.
I try to reuse the qcow2 check code, but the check routine limit the
avaliable
end to SIZE_MAX which work well in file situation, however the block device
has
a fix end. And the check routine print a lot check info which I don't need.



>> +static int64_t qcow2_get_allocated_file_size(BlockDriverState *bs)
>> +{
>> +    struct stat st;
>> +    if (stat(bs->filename, &st) < 0 || !S_ISBLK(st.st_mode)) {
>> +        goto get_file_size;
>> +    }
>
> This definitely doesn't work because nobody guarantees that bs->filename
> is something that stat() can work with.  I'm aware that you need to do
> the S_ISBLK() check somewhere, but the qcow2 driver is not the right
place.
>
> I don't really have a good way around this, though.  These things come
> to mind:
> ...

Yea, thank you for your suggestion. I think a hack to
qcow2_get_allocated_file_size
will work well.
> (3) As a hack, qcow2_get_allocated_file_size() could first always call
> bdrv_get_allocated_file_size(bs->file->bs), and if that returns 0 (which
> is absolutely impossible for qcow2 files because they have an image
> header that takes up some space), we fall back to
> qcow2_get_block_allocated_size().  While I consider it a hack, I can't
> come up with a scenario where it wouldn't work.


Max Reitz <mreitz@redhat.com> 于2018年5月2日周三 下午10:37写道:

> Hi,
>
> [replying to this version because the previous mail doesn't seem to have
> made it to the mailing lists for whatever reason]
>
> On 2018-05-02 15:34, Ivan Ren wrote:
> > qemu-img info with a block device which has a qcow2 format always
> > return 0 for disk size, and this can not reflect the qcow2 size
> > and the used space of the block device. This patch return the
> > allocated size of qcow2 as the disk size.
>
> I'm not quite sure whether you really need this information for block
> devices (I tend to agree with Eric that wr_highest_cluster is the more
> important information there), but I can imagine it just being nice to have.
>
> So the basic idea makes sense to me, but I think the implementation can
> be simplified and the reporting in qemu-img should be done differently.
>
> >
> > Signed-off-by: Ivan Ren <ivanren@tencent.com>
> > ---
> >  block/qcow2-bitmap.c |  69 +++++++++++++++++
> >  block/qcow2.c        | 212
> +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  block/qcow2.h        |  42 ++++++++++
> >  3 files changed, 323 insertions(+)
>
> The whole implementation reminds me a lot of qcow2's check function,
> which basically just recalculates the refcounts.  So I'm wondering
> whether you could just count how many clusters with non-0 refcount there
> are and thus simplify the implementation dramatically.
>
> [...]
>
> > +static int64_t qcow2_get_allocated_file_size(BlockDriverState *bs)
> > +{
> > +    struct stat st;
> > +    if (stat(bs->filename, &st) < 0 || !S_ISBLK(st.st_mode)) {
> > +        goto get_file_size;
> > +    }
>
> This definitely doesn't work because nobody guarantees that bs->filename
> is something that stat() can work with.  I'm aware that you need to do
> the S_ISBLK() check somewhere, but the qcow2 driver is not the right place.
>
> I don't really have a good way around this, though.  These things come
> to mind:
>
> (1) We could let file-posix report an error for S_ISBLK because the
> information is known to be usually useless -- but I think that is not
> quite the right thing to do because maybe some block devices do report
> useful information there, I don't know.
>
> (2) Or we introduce a new field in qemu-img info (and thus in ImageInfo,
> too, I suppose?).  qemu-img info (or rather bdrv_query_image_info())
> could detect whether the format layer supports
> bdrv_get_allocated_file_size, and if so, it emits that information
> separately from the allocated size of bs->file->bs.  But that would
> break vmdk...
>
> (3) As a hack, qcow2_get_allocated_file_size() could first always call
> bdrv_get_allocated_file_size(bs->file->bs), and if that returns 0 (which
> is absolutely impossible for qcow2 files because they have an image
> header that takes up some space), we fall back to
> qcow2_get_block_allocated_size().  While I consider it a hack, I can't
> come up with a scenario where it wouldn't work.
>
> Max
>
> > +
> > +    return qcow2_get_block_allocated_size(bs);
> > +
> > +get_file_size:
> > +    if (bs->file) {
> > +        return bdrv_get_allocated_file_size(bs->file->bs);
> > +    }
> > +    return -ENOTSUP;
> > +}
> > +
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-03 13:08   ` 叶残风
@ 2018-05-04 14:27     ` Max Reitz
  0 siblings, 0 replies; 12+ messages in thread
From: Max Reitz @ 2018-05-04 14:27 UTC (permalink / raw)
  To: 叶残风; +Cc: qemu-devel, qemu-block, kwolf

[-- Attachment #1: Type: text/plain, Size: 6453 bytes --]

On 2018-05-03 15:08, 叶残风 wrote:
>> The whole implementation reminds me a lot of qcow2's check function,
>> which basically just recalculates the refcounts.  So I'm wondering
>> whether you could just count how many clusters with non-0 refcount there
>> are and thus simplify the implementation dramatically.
> 
> Thanks for your review, Max.
> 
> Yes, just get the highest non-0 refcount cluster index can simply get the
> end offset. But in some situations(such as some errors happen), a cluster
> is indexed in index table, but the refcount may be 0 error, just like the
> qcow2 check inconsistency. So I traverse the whole index part, include L1,
> L2, snapshot and so on.

Yeah, well, then you use qemu-img check to repair it.  The refcount
structure is there to know which clusters are allocated, so we should
use it when we want to know that and not assume it's broken.  If the
user thinks it may be broken, they are free to run qemu-img check to check.

Note that qemu makes sure that no refcount ever is lower than it must
be.  Refcounts are always updated in an order such that if qemu crashes
during some operations the refcount is equal to or higher than what it
needs to be.  Anything else is a bug in qemu (or the result of
lazy-refcounts, but then the image is marked dirty and automatically
repaired when opened R/W the next time).

Max

> I try to reuse the qcow2 check code, but the check routine limit the
> avaliable
> end to SIZE_MAX which work well in file situation, however the block
> device has
> a fix end. And the check routine print a lot check info which I don't need.
> 
> 
> 
>>> +static int64_t qcow2_get_allocated_file_size(BlockDriverState *bs)
>>> +{
>>> +    struct stat st;
>>> +    if (stat(bs->filename, &st) < 0 || !S_ISBLK(st.st_mode)) {
>>> +        goto get_file_size;
>>> +    }
>>
>> This definitely doesn't work because nobody guarantees that bs->filename
>> is something that stat() can work with.  I'm aware that you need to do
>> the S_ISBLK() check somewhere, but the qcow2 driver is not the right
> place.
>>
>> I don't really have a good way around this, though.  These things come
>> to mind:
>> ...
> 
> Yea, thank you for your suggestion. I think a hack to
> qcow2_get_allocated_file_size
> will work well.
>> (3) As a hack, qcow2_get_allocated_file_size() could first always call
>> bdrv_get_allocated_file_size(bs->file->bs), and if that returns 0 (which
>> is absolutely impossible for qcow2 files because they have an image
>> header that takes up some space), we fall back to
>> qcow2_get_block_allocated_size().  While I consider it a hack, I can't
>> come up with a scenario where it wouldn't work.
> 
> 
> Max Reitz <mreitz@redhat.com <mailto:mreitz@redhat.com>> 于2018年5月2日
> 周三 下午10:37写道:
> 
>     Hi,
> 
>     [replying to this version because the previous mail doesn't seem to have
>     made it to the mailing lists for whatever reason]
> 
>     On 2018-05-02 15:34, Ivan Ren wrote:
>     > qemu-img info with a block device which has a qcow2 format always
>     > return 0 for disk size, and this can not reflect the qcow2 size
>     > and the used space of the block device. This patch return the
>     > allocated size of qcow2 as the disk size.
> 
>     I'm not quite sure whether you really need this information for block
>     devices (I tend to agree with Eric that wr_highest_cluster is the more
>     important information there), but I can imagine it just being nice
>     to have.
> 
>     So the basic idea makes sense to me, but I think the implementation can
>     be simplified and the reporting in qemu-img should be done differently.
> 
>     >
>     > Signed-off-by: Ivan Ren <ivanren@tencent.com
>     <mailto:ivanren@tencent.com>>
>     > ---
>     >  block/qcow2-bitmap.c |  69 +++++++++++++++++
>     >  block/qcow2.c        | 212
>     +++++++++++++++++++++++++++++++++++++++++++++++++++
>     >  block/qcow2.h        |  42 ++++++++++
>     >  3 files changed, 323 insertions(+)
> 
>     The whole implementation reminds me a lot of qcow2's check function,
>     which basically just recalculates the refcounts.  So I'm wondering
>     whether you could just count how many clusters with non-0 refcount there
>     are and thus simplify the implementation dramatically.
> 
>     [...]
> 
>     > +static int64_t qcow2_get_allocated_file_size(BlockDriverState *bs)
>     > +{
>     > +    struct stat st;
>     > +    if (stat(bs->filename, &st) < 0 || !S_ISBLK(st.st_mode)) {
>     > +        goto get_file_size;
>     > +    }
> 
>     This definitely doesn't work because nobody guarantees that bs->filename
>     is something that stat() can work with.  I'm aware that you need to do
>     the S_ISBLK() check somewhere, but the qcow2 driver is not the right
>     place.
> 
>     I don't really have a good way around this, though.  These things come
>     to mind:
> 
>     (1) We could let file-posix report an error for S_ISBLK because the
>     information is known to be usually useless -- but I think that is not
>     quite the right thing to do because maybe some block devices do report
>     useful information there, I don't know.
> 
>     (2) Or we introduce a new field in qemu-img info (and thus in ImageInfo,
>     too, I suppose?).  qemu-img info (or rather bdrv_query_image_info())
>     could detect whether the format layer supports
>     bdrv_get_allocated_file_size, and if so, it emits that information
>     separately from the allocated size of bs->file->bs.  But that would
>     break vmdk...
> 
>     (3) As a hack, qcow2_get_allocated_file_size() could first always call
>     bdrv_get_allocated_file_size(bs->file->bs), and if that returns 0 (which
>     is absolutely impossible for qcow2 files because they have an image
>     header that takes up some space), we fall back to
>     qcow2_get_block_allocated_size().  While I consider it a hack, I can't
>     come up with a scenario where it wouldn't work.
> 
>     Max
> 
>     > +
>     > +    return qcow2_get_block_allocated_size(bs);
>     > +
>     > +get_file_size:
>     > +    if (bs->file) {
>     > +        return bdrv_get_allocated_file_size(bs->file->bs);
>     > +    }
>     > +    return -ENOTSUP;
>     > +}
>     > +
> 



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-03 13:06   ` 叶残风
@ 2018-05-04 14:35     ` Max Reitz
  2018-05-04 15:57       ` Ivan Ren
  0 siblings, 1 reply; 12+ messages in thread
From: Max Reitz @ 2018-05-04 14:35 UTC (permalink / raw)
  To: 叶残风, eblake; +Cc: qemu-devel, kwolf, qemu-block

[-- Attachment #1: Type: text/plain, Size: 877 bytes --]

On 2018-05-03 15:06, 叶残风 wrote:
> 
> Thanks for your review, Eric.
> Yes, the wr_highest_offset can tell the end offset at runtime, and 
> write_threshold similar to it. But in my situation, I need to know 
> the allocated end without a vm running.

Oh, I've just seen that your patch doesn't even get the allocated size,
it actually determines the highest allocated offset.  Well, those are
two different things.  You should not return this value via
qcow2_get_allocated_file_size().  I'm not sure where it should go
(probably wr_highest_offset?) and how it should be presented in
qemu-img, though.

Also note that bdrv_get_allocated_file_size() is called by QMP's
query-named-block-nodes and query-block, both of which are online
synchronous functions.  It should not take a long time, specifically it
should not scan the whole qcow2 file.

Max


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format
  2018-05-04 14:35     ` Max Reitz
@ 2018-05-04 15:57       ` Ivan Ren
  0 siblings, 0 replies; 12+ messages in thread
From: Ivan Ren @ 2018-05-04 15:57 UTC (permalink / raw)
  To: mreitz; +Cc: eblake, qemu-devel, kwolf, qemu-block

> Yeah, well, then you use qemu-img check to repair it.  The refcount
> structure is there to know which clusters are allocated, so we should
> use it when we want to know that and not assume it's broken.  If the
> user thinks it may be broken, they are free to run qemu-img check to
check.

Yea, it is right. It makes things easy.


> Oh, I've just seen that your patch doesn't even get the allocated size,
> it actually determines the highest allocated offset.  Well, those are
> two different things.  You should not return this value via
> qcow2_get_allocated_file_size().  I'm not sure where it should go
> (probably wr_highest_offset?) and how it should be presented in
> qemu-img, though.
>
> Also note that bdrv_get_allocated_file_size() is called by QMP's
> query-named-block-nodes and query-block, both of which are online
> synchronous functions.  It should not take a long time, specifically it
> should not scan the whole qcow2 file.

Yes, My original idea was to get the highest allocated offset on the block
device and regard it as currently max allocated size(ignore the hole lower
than it).
Through the discuss, I think it is better to count how many clusters with
non-0 refcount as the real allocated size.
I'll try to get a new version patch to implement this.

On Fri, May 4, 2018 at 10:35 PM Max Reitz <mreitz@redhat.com> wrote:

> On 2018-05-03 15:06, 叶残风 wrote:
> >
> > Thanks for your review, Eric.
> > Yes, the wr_highest_offset can tell the end offset at runtime, and
> > write_threshold similar to it. But in my situation, I need to know
> > the allocated end without a vm running.
>
> Oh, I've just seen that your patch doesn't even get the allocated size,
> it actually determines the highest allocated offset.  Well, those are
> two different things.  You should not return this value via
> qcow2_get_allocated_file_size().  I'm not sure where it should go
> (probably wr_highest_offset?) and how it should be presented in
> qemu-img, though.
>
> Also note that bdrv_get_allocated_file_size() is called by QMP's
> query-named-block-nodes and query-block, both of which are online
> synchronous functions.  It should not take a long time, specifically it
> should not scan the whole qcow2 file.
>
> Max
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2018-05-04 15:58 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-02 13:34 [Qemu-devel] [PATCH] qemu-img: return allocated size for block device with qcow2 format Ivan Ren
2018-05-02 14:02 ` Eric Blake
2018-05-03 13:06   ` 叶残风
2018-05-04 14:35     ` Max Reitz
2018-05-04 15:57       ` Ivan Ren
2018-05-02 14:37 ` Max Reitz
2018-05-02 15:01   ` Eric Blake
2018-05-02 15:13     ` Max Reitz
2018-05-02 15:19       ` Eric Blake
2018-05-02 15:33         ` Max Reitz
2018-05-03 13:08   ` 叶残风
2018-05-04 14:27     ` Max Reitz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.