All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
@ 2014-07-07 14:54 Milos Vyletel
  2014-07-29 13:37 ` Stefan Hajnoczi
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-07-07 14:54 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Fam Zheng, Stefan Hajnoczi, Milos Vyletel

VMDK's streamOptimized format is different from the regular sparse format.
L1(GD) and L2(GT) tables are not predefined but rather generated and
written during image creation mainly because there is no way to tell
how much space data will occupy once they are compressed. Also the
location of header, L1 and L2 tables differs.

- L2 tables (grain tables) are written after all grains they point to
- L1 tables are written after all grains and L2 tables
- footer at the end is used instead of header in first sector

This patch improves streamOptimized support and adds the possibility to
create true streamOptimized images using qemu-img. Some of the changes
are from the VMDK specs, some of them from hexdump-ing images from VMware
and VirtualBox.

I have compared these images to the ones generated by VMware and VirtualBox,
and they are identical with the exception of the DescriptorFile, which has
some differences but none that would change behavior (the CID and some
additional DDB entries differ). A streamOptimized image generated from a
raw image was successfully imported (as OVA) into VMware ESXi and Oracle
OVM.

Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
---
 block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 281 insertions(+), 82 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 27a78da..f482225 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -81,6 +81,21 @@ typedef struct {
     uint16_t compressAlgorithm;
 } QEMU_PACKED VMDK4Header;
 
+typedef struct {
+    uint64_t val;
+    uint32_t size;
+    uint32_t type;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
+} QEMU_PACKED VMDK4MetaMarker;
+
+typedef struct {
+    VMDK4MetaMarker footer_marker;
+    uint32_t magic;
+    VMDK4Header header;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
+    VMDK4MetaMarker eos_marker;
+} QEMU_PACKED VMDK4Footer;
+
 #define L2_CACHE_SIZE 16
 
 typedef struct VmdkExtent {
@@ -89,24 +104,29 @@ typedef struct VmdkExtent {
     bool compressed;
     bool has_marker;
     bool has_zero_grain;
+    bool has_footer;
     int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
+    uint32_t l1_index;
     uint32_t *l1_table;
     uint32_t *l1_backup_table;
     unsigned int l1_size;
     uint32_t l1_entry_sectors;
 
     unsigned int l2_size;
+    uint32_t *l2_table;
     uint32_t *l2_cache;
     uint32_t l2_cache_offsets[L2_CACHE_SIZE];
     uint32_t l2_cache_counts[L2_CACHE_SIZE];
 
     int64_t cluster_sectors;
     char *type;
+
+    VMDK4Footer footer;
 } VmdkExtent;
 
 typedef struct BDRVVmdkState {
@@ -555,14 +575,51 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
     return buf;
 }
 
+static int vmdk_read_footer(BlockDriverState *bs,
+                            VMDK4Footer *footer)
+{
+    int ret;
+
+    /*
+    * footer starts 3 sectors from end
+    * - footer marker
+    * - footer
+    * - end-of-stream marker
+    */
+    ret = bdrv_pread(bs->file,
+        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
+        footer, sizeof(*footer));
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Some sanity checks for the footer */
+    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
+        le32_to_cpu(footer->footer_marker.size) != 0  ||
+        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
+        le64_to_cpu(footer->eos_marker.val) != 0  ||
+        le32_to_cpu(footer->eos_marker.size) != 0  ||
+        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM)
+    {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = VMDK_OK;
+ out:
+    return ret;
+}
+
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
                            int flags, Error **errp)
 {
     int ret;
+    bool has_footer = false;
     uint32_t magic;
     uint32_t l1_size, l1_entry_sectors;
     VMDK4Header header;
+    VMDK4Footer footer;
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
     int64_t l1_backup_offset = 0;
@@ -593,48 +650,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
 
     if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
         /*
-         * The footer takes precedence over the header, so read it in. The
-         * footer starts at offset -1024 from the end: One sector for the
-         * footer, and another one for the end-of-stream marker.
+         * The footer takes precedence over the header, so read it in.
          */
-        struct {
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED footer_marker;
-
-            uint32_t magic;
-            VMDK4Header header;
-            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
-
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED eos_marker;
-        } QEMU_PACKED footer;
-
-        ret = bdrv_pread(file,
-            bs->file->total_sectors * 512 - 1536,
-            &footer, sizeof(footer));
+        ret = vmdk_read_footer(bs, &footer);
         if (ret < 0) {
             return ret;
         }
-
-        /* Some sanity checks for the footer */
-        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
-            le32_to_cpu(footer.footer_marker.size) != 0  ||
-            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
-            le64_to_cpu(footer.eos_marker.val) != 0  ||
-            le32_to_cpu(footer.eos_marker.size) != 0  ||
-            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
-        {
-            return -EINVAL;
-        }
-
+        has_footer = true;
         header = footer.header;
     }
 
@@ -645,11 +667,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                   bs->device_name, "vmdk", buf);
         return -ENOTSUP;
-    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
+    } else if (le32_to_cpu(header.version) == 3 &&
+               (flags & BDRV_O_RDWR) &&
+               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
         /* VMware KB 2064959 explains that version 3 added support for
          * persistent changed block tracking (CBT), and backup software can
          * read it as version=1 if it doesn't care about the changed area
-         * information. So we are safe to enable read only. */
+         * information. So we are safe to enable read only.
+         * Note that this does not apply to streamOptimized images which
+         * are written only once and are used as transport format */
         error_setg(errp, "VMDK version 3 must be read only");
         return -EINVAL;
     }
@@ -689,11 +715,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (ret < 0) {
         return ret;
     }
+    if (has_footer) {
+        extent->has_footer = has_footer;
+        extent->footer = footer;
+    }
+
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
     if (extent->compressed) {
         g_free(s->create_type);
         s->create_type = g_strdup("streamOptimized");
+
+        if (flags & BDRV_O_RDWR)
+            bdrv_truncate(file,
+                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
     }
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
     extent->version = le32_to_cpu(header.version);
@@ -1000,6 +1035,11 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     uint32_t offset;
     QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
     offset = cpu_to_le32(m_data->offset);
+
+    /* do not update on streamOptimized */
+    if (extent->compressed)
+        return VMDK_OK;
+
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
@@ -1026,6 +1066,97 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     return VMDK_OK;
 }
 
+static int vmdk_write_footer(BlockDriverState *bs,
+                           VMDK4Footer *footer,
+                           VmdkExtent *extent)
+{
+    int i, ret, gd_buf_size;
+    uint32_t *gd_buf = NULL;
+    uint32_t grains, gd_sectors, gt_size, gt_count;
+    uint64_t offset;
+    VMDK4Header header;
+    VMDK4MetaMarker gd_marker;
+
+    header = footer->header;
+    offset = le64_to_cpu(header.gd_offset);
+
+    grains = DIV_ROUND_UP(header.capacity, header.granularity);
+    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
+                           BDRV_SECTOR_SIZE);
+    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
+    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
+
+    /* write grain directory marker */
+    memset(&gd_marker, 0, sizeof(gd_marker));
+    gd_marker.val = cpu_to_le64(gd_sectors);
+    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
+
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, &gd_marker, sizeof(gd_marker));
+    if (ret < 0)
+        goto exit;
+    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
+
+    /* write grain directory */
+    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+    gd_buf = g_malloc0(gd_buf_size);
+    if (extent) {
+        /* copy over L1 table if we have it */
+        for (i = 0; i < gt_count; i++) {
+            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
+        }
+    }
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
+    if (ret < 0)
+        goto exit;
+
+    /* save real gd_offset */
+    footer->header.gd_offset = cpu_to_le64(offset);
+    offset += gd_sectors;
+
+    /* write footer */
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
+    if (ret < 0)
+        goto exit;
+
+    ret = 0;
+ exit:
+    return ret;
+}
+
+static int vmdk_write_grain_table(VmdkExtent *extent)
+{
+    int ret;
+    VMDK4MetaMarker gtm;
+    uint64_t offset;
+
+    offset = le64_to_cpu(extent->footer.header.gd_offset) << 9;
+
+    memset(&gtm, 0, sizeof(gtm));
+    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
+    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
+    if (bdrv_pwrite(extent->file, offset, &gtm, sizeof(gtm)) < 0) {
+        ret = -EIO;
+        goto out;
+    }
+
+    offset += sizeof(gtm);
+    extent->l1_table[extent->l1_index] = offset >> 9;
+    if (bdrv_pwrite(extent->file, offset,
+                    extent->l2_table, extent->l2_size * sizeof(uint32_t)
+                   ) != extent->l2_size * sizeof(uint32_t)) {
+        ret = -EIO;
+        goto out;
+    }
+
+    offset += extent->l2_size * sizeof(uint32_t);
+    extent->l1_table_offset = offset;
+    extent->footer.header.gd_offset = cpu_to_le64(offset >> 9);
+
+    ret = 0;
+ out:
+    return ret;
+}
+
 static int get_cluster_offset(BlockDriverState *bs,
                                     VmdkExtent *extent,
                                     VmdkMetaData *m_data,
@@ -1034,8 +1165,8 @@ static int get_cluster_offset(BlockDriverState *bs,
                                     uint64_t *cluster_offset)
 {
     unsigned int l1_index, l2_offset, l2_index;
-    int min_index, i, j;
-    uint32_t min_count, *l2_table;
+    int min_index, i, j, ret;
+    uint32_t min_count;
     bool zeroed = false;
 
     if (m_data) {
@@ -1048,11 +1179,25 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
     l1_index = (offset >> 9) / extent->l1_entry_sectors;
-    if (l1_index >= extent->l1_size) {
+    if (extent->compressed && l1_index &&
+            extent->l1_index != l1_index) {
+        ret = vmdk_write_grain_table(extent);
+        if (ret < 0)
+            return ret;
+    }
+
+    extent->l1_index = l1_index;
+    if (extent->l1_index >= extent->l1_size) {
         return VMDK_ERROR;
     }
-    l2_offset = extent->l1_table[l1_index];
+ retry:
+    l2_offset = extent->l1_table[extent->l1_index];
+
     if (!l2_offset) {
+        if (extent->compressed) {
+            extent->l1_table[extent->l1_index] = bdrv_getlength(extent->file);
+            goto retry;
+        }
         return VMDK_UNALLOC;
     }
     for (i = 0; i < L2_CACHE_SIZE; i++) {
@@ -1063,7 +1208,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                     extent->l2_cache_counts[j] >>= 1;
                 }
             }
-            l2_table = extent->l2_cache + (i * extent->l2_size);
+            extent->l2_table = extent->l2_cache + (i * extent->l2_size);
             goto found;
         }
     }
@@ -1076,11 +1221,11 @@ static int get_cluster_offset(BlockDriverState *bs,
             min_index = i;
         }
     }
-    l2_table = extent->l2_cache + (min_index * extent->l2_size);
+    extent->l2_table = extent->l2_cache + (min_index * extent->l2_size);
     if (bdrv_pread(
                 extent->file,
                 (int64_t)l2_offset * 512,
-                l2_table,
+                extent->l2_table,
                 extent->l2_size * sizeof(uint32_t)
             ) != extent->l2_size * sizeof(uint32_t)) {
         return VMDK_ERROR;
@@ -1090,15 +1235,15 @@ static int get_cluster_offset(BlockDriverState *bs,
     extent->l2_cache_counts[min_index] = 1;
  found:
     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
-    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
+    *cluster_offset = le32_to_cpu(extent->l2_table[l2_index]);
 
     if (m_data) {
         m_data->valid = 1;
-        m_data->l1_index = l1_index;
+        m_data->l1_index = extent->l1_index;
         m_data->l2_index = l2_index;
         m_data->offset = *cluster_offset;
         m_data->l2_offset = l2_offset;
-        m_data->l2_cache_entry = &l2_table[l2_index];
+        m_data->l2_cache_entry = &extent->l2_table[l2_index];
     }
     if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
         zeroed = true;
@@ -1119,7 +1264,7 @@ static int get_cluster_offset(BlockDriverState *bs,
         }
 
         *cluster_offset >>= 9;
-        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
+        extent->l2_table[l2_index] = cpu_to_le32(*cluster_offset);
 
         /* First of all we write grain itself, to avoid race condition
          * that may to corrupt the image.
@@ -1210,6 +1355,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
     uLongf buf_len;
     const uint8_t *write_buf = buf;
     int write_len = nb_sectors * 512;
+    uint64_t gd_offset;
 
     if (extent->compressed) {
         if (!extent->has_marker) {
@@ -1227,6 +1373,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
         data->size = buf_len;
         write_buf = (uint8_t *)data;
         write_len = buf_len + sizeof(VmdkGrainMarker);
+
     }
     ret = bdrv_pwrite(extent->file,
                         cluster_offset + offset_in_cluster,
@@ -1236,6 +1383,13 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
         ret = ret < 0 ? ret : -EIO;
         goto out;
     }
+    if (extent->compressed) {
+        /* update GD offset after each write */
+        gd_offset = bdrv_getlength(extent->file);
+        extent->l1_table_offset = gd_offset;
+        gd_offset /= BDRV_SECTOR_SIZE;
+        extent->footer.header.gd_offset = cpu_to_le64(gd_offset);
+    }
     ret = 0;
  out:
     g_free(data);
@@ -1534,10 +1688,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     int ret, i;
     BlockDriverState *bs = NULL;
     VMDK4Header header;
+    VMDK4Footer footer;
     Error *local_err = NULL;
     uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
     uint32_t *gd_buf = NULL;
     int gd_buf_size;
+    uint64_t grain_offset, rgd_offset, gd_offset;
 
     ret = bdrv_create_file(filename, opts, &local_err);
     if (ret < 0) {
@@ -1562,28 +1718,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
-    header.version = zeroed_grain ? 2 : 1;
-    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
-                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+    memset(&footer, 0, sizeof(footer));
+
+    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
+    header.flags = VMDK4_FLAG_NL_DETECT
+                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
+                               : VMDK4_FLAG_RGD)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
     header.capacity = filesize / BDRV_SECTOR_SIZE;
     header.granularity = 128;
     header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
 
-    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
+    grains = DIV_ROUND_UP(header.capacity, header.granularity);
     gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                            BDRV_SECTOR_SIZE);
     gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
     gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
 
     header.desc_offset = 1;
-    header.desc_size = 20;
-    header.rgd_offset = header.desc_offset + header.desc_size;
-    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
-    header.grain_offset =
+    header.desc_size = (compress ? 2 : 20);
+    rgd_offset = header.desc_offset + header.desc_size;
+    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
+    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
+    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
+    grain_offset =
         ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                  header.granularity);
+    /* streamOptimized reserves first 128 sectors */
+    header.grain_offset = (compress ? header.granularity : grain_offset);
+    /* streamOptimized's grain directory is at the end */
+    gd_offset = header.grain_offset + 1;
+
     /* swap endianness for all header fields */
     header.version = cpu_to_le32(header.version);
     header.flags = cpu_to_le32(header.flags);
@@ -1620,30 +1786,54 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         goto exit;
     }
 
-    /* write grain directory */
-    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
-    gd_buf = g_malloc0(gd_buf_size);
-    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
-    }
+    if (compress) {
+        /* footer marker */
+        footer.footer_marker.val = cpu_to_le64(1);
+        footer.footer_marker.type = cpu_to_le32(MARKER_FOOTER);
 
-    /* write backup grain directory */
-    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
+        /* header */
+        footer.magic = cpu_to_be32(VMDK4_MAGIC);
+        footer.header = header;
+
+        /* grain directory offset */
+        footer.header.gd_offset = cpu_to_le64(gd_offset);
+
+        /* EOS marker */
+        footer.eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
+
+        ret = vmdk_write_footer(bs, &footer, NULL);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
+    } else {
+        /* write redundant grain directory (if applicable) */
+        if (le64_to_cpu(header.rgd_offset)) {
+            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+            gd_buf = g_malloc0(gd_buf_size);
+            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
+                 i < gt_count; i++, tmp += gt_size) {
+                gd_buf[i] = cpu_to_le32(tmp);
+            }
+            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
+                              gd_buf, gd_buf_size);
+            if (ret < 0) {
+                error_set(errp, QERR_IO_ERROR);
+                goto exit;
+            }
+        }
+
+        /* write grain directory */
+        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
+             i < gt_count; i++, tmp += gt_size) {
+            gd_buf[i] = cpu_to_le32(tmp);
+        }
+        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                          gd_buf, gd_buf_size);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
     }
 
     ret = 0;
@@ -1914,6 +2104,15 @@ exit:
 static void vmdk_close(BlockDriverState *bs)
 {
     BDRVVmdkState *s = bs->opaque;
+    VmdkExtent *extent = &s->extents[0];
+
+    if (extent->compressed) {
+        while (extent < &s->extents[s->num_extents]) {
+            vmdk_write_grain_table(extent);
+            vmdk_write_footer(extent->file, &extent->footer, extent);
+            extent++;
+        }
+    }
 
     vmdk_free_extents(bs);
     g_free(s->create_type);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-07 14:54 [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support Milos Vyletel
@ 2014-07-29 13:37 ` Stefan Hajnoczi
  2014-07-29 13:46   ` Milos Vyletel
  2014-07-30  7:51 ` Fam Zheng
  2014-08-04 17:10 ` [Qemu-devel] [PATCH v2] " Milos Vyletel
  2 siblings, 1 reply; 21+ messages in thread
From: Stefan Hajnoczi @ 2014-07-29 13:37 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, Fam Zheng, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 23953 bytes --]

On Mon, Jul 07, 2014 at 10:54:27AM -0400, Milos Vyletel wrote:
> VMDK's streamOptimized format is different that regular sparse format.
> L1(GD) and L2(GT) tables are not predefined but rather generated and
> written during image creation mainly because there is no way to tell
> how much space data will occupy once they are compressed. Also the
> location of header, L1 and L2 tables differs.
> 
> - L2 tables (grain tables) are written after all grains they point to
> - L1 tables are written after all grains and L2 tables
> - footer at the end is used instead of header in first sector
> 
> This patch improves streamOptimized support and adds possibility to
> create true streamOptimized images using qemu-img. Some of the changes
> are from VMDK specs, some of them from hexdump-ing images from VMWare
> and VirtualBox.
> 
> I have compared these images to the ones generated by VMWare and vbox
> and they are identical with the exception of DescriptorFile that has
> some differences but none that would change behavior(CID and some
> additional DDB entries differ) and streamOptimized image generated from
> raw image was succesfully imported (as OVA) into VMWare ESXi and Oracle
> OVM.
> 
> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
> ---
>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>  1 files changed, 281 insertions(+), 82 deletions(-)

What does this patch do beyond what QEMU already supports today?

Is there a particular application that rejected QEMU's streamOptimized
images?  Is this a bug fix?

Please use scripts/checkpatch.pl to check coding style.

Fam: Please review

> diff --git a/block/vmdk.c b/block/vmdk.c
> index 27a78da..f482225 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -81,6 +81,21 @@ typedef struct {
>      uint16_t compressAlgorithm;
>  } QEMU_PACKED VMDK4Header;
>  
> +typedef struct {
> +    uint64_t val;
> +    uint32_t size;
> +    uint32_t type;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
> +} QEMU_PACKED VMDK4MetaMarker;
> +
> +typedef struct {
> +    VMDK4MetaMarker footer_marker;
> +    uint32_t magic;
> +    VMDK4Header header;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
> +    VMDK4MetaMarker eos_marker;
> +} QEMU_PACKED VMDK4Footer;
> +
>  #define L2_CACHE_SIZE 16
>  
>  typedef struct VmdkExtent {
> @@ -89,24 +104,29 @@ typedef struct VmdkExtent {
>      bool compressed;
>      bool has_marker;
>      bool has_zero_grain;
> +    bool has_footer;
>      int version;
>      int64_t sectors;
>      int64_t end_sector;
>      int64_t flat_start_offset;
>      int64_t l1_table_offset;
>      int64_t l1_backup_table_offset;
> +    uint32_t l1_index;
>      uint32_t *l1_table;
>      uint32_t *l1_backup_table;
>      unsigned int l1_size;
>      uint32_t l1_entry_sectors;
>  
>      unsigned int l2_size;
> +    uint32_t *l2_table;
>      uint32_t *l2_cache;
>      uint32_t l2_cache_offsets[L2_CACHE_SIZE];
>      uint32_t l2_cache_counts[L2_CACHE_SIZE];
>  
>      int64_t cluster_sectors;
>      char *type;
> +
> +    VMDK4Footer footer;
>  } VmdkExtent;
>  
>  typedef struct BDRVVmdkState {
> @@ -555,14 +575,51 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
>      return buf;
>  }
>  
> +static int vmdk_read_footer(BlockDriverState *bs,
> +                            VMDK4Footer *footer)
> +{
> +    int ret;
> +
> +    /*
> +    * footer starts 3 sectors from end
> +    * - footer marker
> +    * - footer
> +    * - end-of-stream marker
> +    */
> +    ret = bdrv_pread(bs->file,
> +        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
> +        footer, sizeof(*footer));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* Some sanity checks for the footer */
> +    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
> +        le32_to_cpu(footer->footer_marker.size) != 0  ||
> +        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
> +        le64_to_cpu(footer->eos_marker.val) != 0  ||
> +        le32_to_cpu(footer->eos_marker.size) != 0  ||
> +        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM)
> +    {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    ret = VMDK_OK;
> + out:
> +    return ret;
> +}
> +
>  static int vmdk_open_vmdk4(BlockDriverState *bs,
>                             BlockDriverState *file,
>                             int flags, Error **errp)
>  {
>      int ret;
> +    bool has_footer = false;
>      uint32_t magic;
>      uint32_t l1_size, l1_entry_sectors;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      VmdkExtent *extent;
>      BDRVVmdkState *s = bs->opaque;
>      int64_t l1_backup_offset = 0;
> @@ -593,48 +650,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>  
>      if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
>          /*
> -         * The footer takes precedence over the header, so read it in. The
> -         * footer starts at offset -1024 from the end: One sector for the
> -         * footer, and another one for the end-of-stream marker.
> +         * The footer takes precedence over the header, so read it in.
>           */
> -        struct {
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED footer_marker;
> -
> -            uint32_t magic;
> -            VMDK4Header header;
> -            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
> -
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED eos_marker;
> -        } QEMU_PACKED footer;
> -
> -        ret = bdrv_pread(file,
> -            bs->file->total_sectors * 512 - 1536,
> -            &footer, sizeof(footer));
> +        ret = vmdk_read_footer(bs, &footer);
>          if (ret < 0) {
>              return ret;
>          }
> -
> -        /* Some sanity checks for the footer */
> -        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
> -            le32_to_cpu(footer.footer_marker.size) != 0  ||
> -            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
> -            le64_to_cpu(footer.eos_marker.val) != 0  ||
> -            le32_to_cpu(footer.eos_marker.size) != 0  ||
> -            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
> -        {
> -            return -EINVAL;
> -        }
> -
> +        has_footer = true;
>          header = footer.header;
>      }
>  
> @@ -645,11 +667,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>          error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>                    bs->device_name, "vmdk", buf);
>          return -ENOTSUP;
> -    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
> +    } else if (le32_to_cpu(header.version) == 3 &&
> +               (flags & BDRV_O_RDWR) &&
> +               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
>          /* VMware KB 2064959 explains that version 3 added support for
>           * persistent changed block tracking (CBT), and backup software can
>           * read it as version=1 if it doesn't care about the changed area
> -         * information. So we are safe to enable read only. */
> +         * information. So we are safe to enable read only.
> +         * Note that this does not apply to streamOptimized images which
> +         * are written only once and are used as transport format */
>          error_setg(errp, "VMDK version 3 must be read only");
>          return -EINVAL;
>      }
> @@ -689,11 +715,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>      if (ret < 0) {
>          return ret;
>      }
> +    if (has_footer) {
> +        extent->has_footer = has_footer;
> +        extent->footer = footer;
> +    }
> +
>      extent->compressed =
>          le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
>      if (extent->compressed) {
>          g_free(s->create_type);
>          s->create_type = g_strdup("streamOptimized");
> +
> +        if (flags & BDRV_O_RDWR)
> +            bdrv_truncate(file,
> +                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
>      }
>      extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
>      extent->version = le32_to_cpu(header.version);
> @@ -1000,6 +1035,11 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      uint32_t offset;
>      QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
>      offset = cpu_to_le32(m_data->offset);
> +
> +    /* do not update on streamOptimized */
> +    if (extent->compressed)
> +        return VMDK_OK;
> +
>      /* update L2 table */
>      if (bdrv_pwrite_sync(
>                  extent->file,
> @@ -1026,6 +1066,97 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      return VMDK_OK;
>  }
>  
> +static int vmdk_write_footer(BlockDriverState *bs,
> +                           VMDK4Footer *footer,
> +                           VmdkExtent *extent)
> +{
> +    int i, ret, gd_buf_size;
> +    uint32_t *gd_buf = NULL;
> +    uint32_t grains, gd_sectors, gt_size, gt_count;
> +    uint64_t offset;
> +    VMDK4Header header;
> +    VMDK4MetaMarker gd_marker;
> +
> +    header = footer->header;
> +    offset = le64_to_cpu(header.gd_offset);
> +
> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
> +    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
> +                           BDRV_SECTOR_SIZE);
> +    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
> +
> +    /* write grain directory marker */
> +    memset(&gd_marker, 0, sizeof(gd_marker));
> +    gd_marker.val = cpu_to_le64(gd_sectors);
> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
> +
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, &gd_marker, sizeof(gd_marker));
> +    if (ret < 0)
> +        goto exit;
> +    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
> +
> +    /* write grain directory */
> +    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> +    gd_buf = g_malloc0(gd_buf_size);
> +    if (extent) {
> +        /* copy over L1 table if we have it */
> +        for (i = 0; i < gt_count; i++) {
> +            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
> +        }
> +    }
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
> +    if (ret < 0)
> +        goto exit;
> +
> +    /* save real gd_offset */
> +    footer->header.gd_offset = cpu_to_le64(offset);
> +    offset += gd_sectors;
> +
> +    /* write footer */
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
> +    if (ret < 0)
> +        goto exit;
> +
> +    ret = 0;
> + exit:
> +    return ret;
> +}
> +
> +static int vmdk_write_grain_table(VmdkExtent *extent)
> +{
> +    int ret;
> +    VMDK4MetaMarker gtm;
> +    uint64_t offset;
> +
> +    offset = le64_to_cpu(extent->footer.header.gd_offset) << 9;
> +
> +    memset(&gtm, 0, sizeof(gtm));
> +    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
> +    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
> +    if (bdrv_pwrite(extent->file, offset, &gtm, sizeof(gtm)) < 0) {
> +        ret = -EIO;
> +        goto out;
> +    }
> +
> +    offset += sizeof(gtm);
> +    extent->l1_table[extent->l1_index] = offset >> 9;
> +    if (bdrv_pwrite(extent->file, offset,
> +                    extent->l2_table, extent->l2_size * sizeof(uint32_t)
> +                   ) != extent->l2_size * sizeof(uint32_t)) {
> +        ret = -EIO;
> +        goto out;
> +    }
> +
> +    offset += extent->l2_size * sizeof(uint32_t);
> +    extent->l1_table_offset = offset;
> +    extent->footer.header.gd_offset = cpu_to_le64(offset >> 9);
> +
> +    ret = 0;
> + out:
> +    return ret;
> +}
> +
>  static int get_cluster_offset(BlockDriverState *bs,
>                                      VmdkExtent *extent,
>                                      VmdkMetaData *m_data,
> @@ -1034,8 +1165,8 @@ static int get_cluster_offset(BlockDriverState *bs,
>                                      uint64_t *cluster_offset)
>  {
>      unsigned int l1_index, l2_offset, l2_index;
> -    int min_index, i, j;
> -    uint32_t min_count, *l2_table;
> +    int min_index, i, j, ret;
> +    uint32_t min_count;
>      bool zeroed = false;
>  
>      if (m_data) {
> @@ -1048,11 +1179,25 @@ static int get_cluster_offset(BlockDriverState *bs,
>  
>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
> -    if (l1_index >= extent->l1_size) {
> +    if (extent->compressed && l1_index &&
> +            extent->l1_index != l1_index) {
> +        ret = vmdk_write_grain_table(extent);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    extent->l1_index = l1_index;
> +    if (extent->l1_index >= extent->l1_size) {
>          return VMDK_ERROR;
>      }
> -    l2_offset = extent->l1_table[l1_index];
> + retry:
> +    l2_offset = extent->l1_table[extent->l1_index];
> +
>      if (!l2_offset) {
> +        if (extent->compressed) {
> +            extent->l1_table[extent->l1_index] = bdrv_getlength(extent->file);
> +            goto retry;
> +        }
>          return VMDK_UNALLOC;
>      }
>      for (i = 0; i < L2_CACHE_SIZE; i++) {
> @@ -1063,7 +1208,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>                      extent->l2_cache_counts[j] >>= 1;
>                  }
>              }
> -            l2_table = extent->l2_cache + (i * extent->l2_size);
> +            extent->l2_table = extent->l2_cache + (i * extent->l2_size);
>              goto found;
>          }
>      }
> @@ -1076,11 +1221,11 @@ static int get_cluster_offset(BlockDriverState *bs,
>              min_index = i;
>          }
>      }
> -    l2_table = extent->l2_cache + (min_index * extent->l2_size);
> +    extent->l2_table = extent->l2_cache + (min_index * extent->l2_size);
>      if (bdrv_pread(
>                  extent->file,
>                  (int64_t)l2_offset * 512,
> -                l2_table,
> +                extent->l2_table,
>                  extent->l2_size * sizeof(uint32_t)
>              ) != extent->l2_size * sizeof(uint32_t)) {
>          return VMDK_ERROR;
> @@ -1090,15 +1235,15 @@ static int get_cluster_offset(BlockDriverState *bs,
>      extent->l2_cache_counts[min_index] = 1;
>   found:
>      l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
> -    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
> +    *cluster_offset = le32_to_cpu(extent->l2_table[l2_index]);
>  
>      if (m_data) {
>          m_data->valid = 1;
> -        m_data->l1_index = l1_index;
> +        m_data->l1_index = extent->l1_index;
>          m_data->l2_index = l2_index;
>          m_data->offset = *cluster_offset;
>          m_data->l2_offset = l2_offset;
> -        m_data->l2_cache_entry = &l2_table[l2_index];
> +        m_data->l2_cache_entry = &extent->l2_table[l2_index];
>      }
>      if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
>          zeroed = true;
> @@ -1119,7 +1264,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>          }
>  
>          *cluster_offset >>= 9;
> -        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
> +        extent->l2_table[l2_index] = cpu_to_le32(*cluster_offset);
>  
>          /* First of all we write grain itself, to avoid race condition
>           * that may to corrupt the image.
> @@ -1210,6 +1355,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>      uLongf buf_len;
>      const uint8_t *write_buf = buf;
>      int write_len = nb_sectors * 512;
> +    uint64_t gd_offset;
>  
>      if (extent->compressed) {
>          if (!extent->has_marker) {
> @@ -1227,6 +1373,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>          data->size = buf_len;
>          write_buf = (uint8_t *)data;
>          write_len = buf_len + sizeof(VmdkGrainMarker);
> +
>      }
>      ret = bdrv_pwrite(extent->file,
>                          cluster_offset + offset_in_cluster,
> @@ -1236,6 +1383,13 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>          ret = ret < 0 ? ret : -EIO;
>          goto out;
>      }
> +    if (extent->compressed) {
> +        /* update GD offset after each write */
> +        gd_offset = bdrv_getlength(extent->file);
> +        extent->l1_table_offset = gd_offset;
> +        gd_offset /= BDRV_SECTOR_SIZE;
> +        extent->footer.header.gd_offset = cpu_to_le64(gd_offset);
> +    }
>      ret = 0;
>   out:
>      g_free(data);
> @@ -1534,10 +1688,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      int ret, i;
>      BlockDriverState *bs = NULL;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      Error *local_err = NULL;
>      uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
>      uint32_t *gd_buf = NULL;
>      int gd_buf_size;
> +    uint64_t grain_offset, rgd_offset, gd_offset;
>  
>      ret = bdrv_create_file(filename, opts, &local_err);
>      if (ret < 0) {
> @@ -1562,28 +1718,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      }
>      magic = cpu_to_be32(VMDK4_MAGIC);
>      memset(&header, 0, sizeof(header));
> -    header.version = zeroed_grain ? 2 : 1;
> -    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
> -                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
> +    memset(&footer, 0, sizeof(footer));
> +
> +    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
> +    header.flags = VMDK4_FLAG_NL_DETECT
> +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
> +                               : VMDK4_FLAG_RGD)
>                     | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
>      header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
>      header.capacity = filesize / BDRV_SECTOR_SIZE;
>      header.granularity = 128;
>      header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
>  
> -    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>      gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>                             BDRV_SECTOR_SIZE);
>      gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>      gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>  
>      header.desc_offset = 1;
> -    header.desc_size = 20;
> -    header.rgd_offset = header.desc_offset + header.desc_size;
> -    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
> -    header.grain_offset =
> +    header.desc_size = (compress ? 2 : 20);
> +    rgd_offset = header.desc_offset + header.desc_size;
> +    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
> +    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
> +    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
> +    grain_offset =
>          ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
>                   header.granularity);
> +    /* streamOptimized reserves first 128 sectors */
> +    header.grain_offset = (compress ? header.granularity : grain_offset);
> +    /* streamOptimzed's grain directory is at the end */
> +    gd_offset = header.grain_offset + 1;
> +
>      /* swap endianness for all header fields */
>      header.version = cpu_to_le32(header.version);
>      header.flags = cpu_to_le32(header.flags);
> @@ -1620,30 +1786,54 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>          goto exit;
>      }
>  
> -    /* write grain directory */
> -    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> -    gd_buf = g_malloc0(gd_buf_size);
> -    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> -    }
> +    if (compress) {
> +        /* footer marker */
> +        footer.footer_marker.val = cpu_to_le64(1);
> +        footer.footer_marker.type = cpu_to_le32(MARKER_FOOTER);
>  
> -    /* write backup grain directory */
> -    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> +        /* header */
> +        footer.magic = cpu_to_be32(VMDK4_MAGIC);
> +        footer.header = header;
> +
> +        /* grain directory offset */
> +        footer.header.gd_offset = cpu_to_le64(gd_offset);
> +
> +        /* EOS marker */
> +        footer.eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
> +
> +        ret = vmdk_write_footer(bs, &footer, NULL);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
> +    } else {
> +        /* write redundant grain directory (if applicable) */
> +        if (le64_to_cpu(header.rgd_offset)) {
> +            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> +            gd_buf = g_malloc0(gd_buf_size);
> +            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> +                 i < gt_count; i++, tmp += gt_size) {
> +                gd_buf[i] = cpu_to_le32(tmp);
> +            }
> +            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
> +                              gd_buf, gd_buf_size);
> +            if (ret < 0) {
> +                error_set(errp, QERR_IO_ERROR);
> +                goto exit;
> +            }
> +        }
> +
> +        /* write grain directory */
> +        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> +             i < gt_count; i++, tmp += gt_size) {
> +            gd_buf[i] = cpu_to_le32(tmp);
> +        }
> +        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> +                          gd_buf, gd_buf_size);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
>      }
>  
>      ret = 0;
> @@ -1914,6 +2104,15 @@ exit:
>  static void vmdk_close(BlockDriverState *bs)
>  {
>      BDRVVmdkState *s = bs->opaque;
> +    VmdkExtent *extent = &s->extents[0];
> +
> +    if (extent->compressed) {
> +        while (extent < &s->extents[s->num_extents]) {
> +            vmdk_write_grain_table(extent);
> +            vmdk_write_footer(extent->file, &extent->footer, extent);
> +            extent++;
> +        }
> +    }
>  
>      vmdk_free_extents(bs);
>      g_free(s->create_type);
> -- 
> 1.7.1
> 

[-- Attachment #2: Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-29 13:37 ` Stefan Hajnoczi
@ 2014-07-29 13:46   ` Milos Vyletel
  2014-07-29 14:37     ` Stefan Hajnoczi
  0 siblings, 1 reply; 21+ messages in thread
From: Milos Vyletel @ 2014-07-29 13:46 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, Fam Zheng, qemu-devel

On Tue, Jul 29, 2014 at 9:37 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> On Mon, Jul 07, 2014 at 10:54:27AM -0400, Milos Vyletel wrote:
>> VMDK's streamOptimized format is different than regular sparse format.
>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>> written during image creation mainly because there is no way to tell
>> how much space data will occupy once they are compressed. Also the
>> location of header, L1 and L2 tables differs.
>>
>> - L2 tables (grain tables) are written after all grains they point to
>> - L1 tables are written after all grains and L2 tables
>> - footer at the end is used instead of header in first sector
>>
>> This patch improves streamOptimized support and adds possibility to
>> create true streamOptimized images using qemu-img. Some of the changes
>> are from VMDK specs, some of them from hexdump-ing images from VMWare
>> and VirtualBox.
>>
>> I have compared these images to the ones generated by VMWare and vbox
>> and they are identical with the exception of DescriptorFile that has
>> some differences but none that would change behavior (CID and some
>> additional DDB entries differ) and streamOptimized image generated from
>> raw image was successfully imported (as OVA) into VMWare ESXi and Oracle
>> OVM.
>>
>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>> ---
>>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>>  1 files changed, 281 insertions(+), 82 deletions(-)
>
> What does this patch do beyond what QEMU already supports today?
>
> Is there a particular application that rejected QEMU's streamOptimized
> images?  Is this a bug fix?
>
> Please use scripts/checkpatch.pl to check coding style.
>
> Fam: Please review
>

Images created/converted using QEMU were rejected by VMWare ESXi,
vCloud, VirtualBox and Oracle OVM. I did not try anything else.

Generally speaking, the streamOptimized format is not followed by QEMU.
Instead, the regular VMDK format + compression is used. We can say this is
a bug fix, since the streamOptimized format is already included but does
not work well.

I'll check the style and fix the code. I'll postpone resubmitting until
further review is done, since I'm sure there will be things that
need to be fixed.

Milos

>> diff --git a/block/vmdk.c b/block/vmdk.c
>> index 27a78da..f482225 100644
>> --- a/block/vmdk.c
>> +++ b/block/vmdk.c
>> @@ -81,6 +81,21 @@ typedef struct {
>>      uint16_t compressAlgorithm;
>>  } QEMU_PACKED VMDK4Header;
>>
>> +typedef struct {
>> +    uint64_t val;
>> +    uint32_t size;
>> +    uint32_t type;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
>> +} QEMU_PACKED VMDK4MetaMarker;
>> +
>> +typedef struct {
>> +    VMDK4MetaMarker footer_marker;
>> +    uint32_t magic;
>> +    VMDK4Header header;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
>> +    VMDK4MetaMarker eos_marker;
>> +} QEMU_PACKED VMDK4Footer;
>> +
>>  #define L2_CACHE_SIZE 16
>>
>>  typedef struct VmdkExtent {
>> @@ -89,24 +104,29 @@ typedef struct VmdkExtent {
>>      bool compressed;
>>      bool has_marker;
>>      bool has_zero_grain;
>> +    bool has_footer;
>>      int version;
>>      int64_t sectors;
>>      int64_t end_sector;
>>      int64_t flat_start_offset;
>>      int64_t l1_table_offset;
>>      int64_t l1_backup_table_offset;
>> +    uint32_t l1_index;
>>      uint32_t *l1_table;
>>      uint32_t *l1_backup_table;
>>      unsigned int l1_size;
>>      uint32_t l1_entry_sectors;
>>
>>      unsigned int l2_size;
>> +    uint32_t *l2_table;
>>      uint32_t *l2_cache;
>>      uint32_t l2_cache_offsets[L2_CACHE_SIZE];
>>      uint32_t l2_cache_counts[L2_CACHE_SIZE];
>>
>>      int64_t cluster_sectors;
>>      char *type;
>> +
>> +    VMDK4Footer footer;
>>  } VmdkExtent;
>>
>>  typedef struct BDRVVmdkState {
>> @@ -555,14 +575,51 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
>>      return buf;
>>  }
>>
>> +static int vmdk_read_footer(BlockDriverState *bs,
>> +                            VMDK4Footer *footer)
>> +{
>> +    int ret;
>> +
>> +    /*
>> +    * footer starts 3 sectors from end
>> +    * - footer marker
>> +    * - footer
>> +    * - end-of-stream marker
>> +    */
>> +    ret = bdrv_pread(bs->file,
>> +        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
>> +        footer, sizeof(*footer));
>> +    if (ret < 0) {
>> +        goto out;
>> +    }
>> +
>> +    /* Some sanity checks for the footer */
>> +    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
>> +        le32_to_cpu(footer->footer_marker.size) != 0  ||
>> +        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
>> +        le64_to_cpu(footer->eos_marker.val) != 0  ||
>> +        le32_to_cpu(footer->eos_marker.size) != 0  ||
>> +        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM)
>> +    {
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    ret = VMDK_OK;
>> + out:
>> +    return ret;
>> +}
>> +
>>  static int vmdk_open_vmdk4(BlockDriverState *bs,
>>                             BlockDriverState *file,
>>                             int flags, Error **errp)
>>  {
>>      int ret;
>> +    bool has_footer = false;
>>      uint32_t magic;
>>      uint32_t l1_size, l1_entry_sectors;
>>      VMDK4Header header;
>> +    VMDK4Footer footer;
>>      VmdkExtent *extent;
>>      BDRVVmdkState *s = bs->opaque;
>>      int64_t l1_backup_offset = 0;
>> @@ -593,48 +650,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>
>>      if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
>>          /*
>> -         * The footer takes precedence over the header, so read it in. The
>> -         * footer starts at offset -1024 from the end: One sector for the
>> -         * footer, and another one for the end-of-stream marker.
>> +         * The footer takes precedence over the header, so read it in.
>>           */
>> -        struct {
>> -            struct {
>> -                uint64_t val;
>> -                uint32_t size;
>> -                uint32_t type;
>> -                uint8_t pad[512 - 16];
>> -            } QEMU_PACKED footer_marker;
>> -
>> -            uint32_t magic;
>> -            VMDK4Header header;
>> -            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
>> -
>> -            struct {
>> -                uint64_t val;
>> -                uint32_t size;
>> -                uint32_t type;
>> -                uint8_t pad[512 - 16];
>> -            } QEMU_PACKED eos_marker;
>> -        } QEMU_PACKED footer;
>> -
>> -        ret = bdrv_pread(file,
>> -            bs->file->total_sectors * 512 - 1536,
>> -            &footer, sizeof(footer));
>> +        ret = vmdk_read_footer(bs, &footer);
>>          if (ret < 0) {
>>              return ret;
>>          }
>> -
>> -        /* Some sanity checks for the footer */
>> -        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
>> -            le32_to_cpu(footer.footer_marker.size) != 0  ||
>> -            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
>> -            le64_to_cpu(footer.eos_marker.val) != 0  ||
>> -            le32_to_cpu(footer.eos_marker.size) != 0  ||
>> -            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
>> -        {
>> -            return -EINVAL;
>> -        }
>> -
>> +        has_footer = true;
>>          header = footer.header;
>>      }
>>
>> @@ -645,11 +667,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>          error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>>                    bs->device_name, "vmdk", buf);
>>          return -ENOTSUP;
>> -    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
>> +    } else if (le32_to_cpu(header.version) == 3 &&
>> +               (flags & BDRV_O_RDWR) &&
>> +               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
>>          /* VMware KB 2064959 explains that version 3 added support for
>>           * persistent changed block tracking (CBT), and backup software can
>>           * read it as version=1 if it doesn't care about the changed area
>> -         * information. So we are safe to enable read only. */
>> +         * information. So we are safe to enable read only.
>> +         * Note that this does not apply to streamOptimized images which
>> +         * are written only once and are used as transport format */
>>          error_setg(errp, "VMDK version 3 must be read only");
>>          return -EINVAL;
>>      }
>> @@ -689,11 +715,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>      if (ret < 0) {
>>          return ret;
>>      }
>> +    if (has_footer) {
>> +        extent->has_footer = has_footer;
>> +        extent->footer = footer;
>> +    }
>> +
>>      extent->compressed =
>>          le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
>>      if (extent->compressed) {
>>          g_free(s->create_type);
>>          s->create_type = g_strdup("streamOptimized");
>> +
>> +        if (flags & BDRV_O_RDWR)
>> +            bdrv_truncate(file,
>> +                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
>>      }
>>      extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
>>      extent->version = le32_to_cpu(header.version);
>> @@ -1000,6 +1035,11 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      uint32_t offset;
>>      QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
>>      offset = cpu_to_le32(m_data->offset);
>> +
>> +    /* do not update on streamOptimized */
>> +    if (extent->compressed)
>> +        return VMDK_OK;
>> +
>>      /* update L2 table */
>>      if (bdrv_pwrite_sync(
>>                  extent->file,
>> @@ -1026,6 +1066,97 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      return VMDK_OK;
>>  }
>>
>> +static int vmdk_write_footer(BlockDriverState *bs,
>> +                           VMDK4Footer *footer,
>> +                           VmdkExtent *extent)
>> +{
>> +    int i, ret, gd_buf_size;
>> +    uint32_t *gd_buf = NULL;
>> +    uint32_t grains, gd_sectors, gt_size, gt_count;
>> +    uint64_t offset;
>> +    VMDK4Header header;
>> +    VMDK4MetaMarker gd_marker;
>> +
>> +    header = footer->header;
>> +    offset = le64_to_cpu(header.gd_offset);
>> +
>> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>> +    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>> +                           BDRV_SECTOR_SIZE);
>> +    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>> +
>> +    /* write grain directory marker */
>> +    memset(&gd_marker, 0, sizeof(gd_marker));
>> +    gd_marker.val = cpu_to_le64(gd_sectors);
>> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
>> +
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, &gd_marker, sizeof(gd_marker));
>> +    if (ret < 0)
>> +        goto exit;
>> +    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
>> +
>> +    /* write grain directory */
>> +    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> +    gd_buf = g_malloc0(gd_buf_size);
>> +    if (extent) {
>> +        /* copy over L1 table if we have it */
>> +        for (i = 0; i < gt_count; i++) {
>> +            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
>> +        }
>> +    }
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
>> +    if (ret < 0)
>> +        goto exit;
>> +
>> +    /* save real gd_offset */
>> +    footer->header.gd_offset = cpu_to_le64(offset);
>> +    offset += gd_sectors;
>> +
>> +    /* write footer */
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
>> +    if (ret < 0)
>> +        goto exit;
>> +
>> +    ret = 0;
>> + exit:
>> +    return ret;
>> +}
>> +
>> +static int vmdk_write_grain_table(VmdkExtent *extent)
>> +{
>> +    int ret;
>> +    VMDK4MetaMarker gtm;
>> +    uint64_t offset;
>> +
>> +    offset = le64_to_cpu(extent->footer.header.gd_offset) << 9;
>> +
>> +    memset(&gtm, 0, sizeof(gtm));
>> +    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
>> +    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
>> +    if (bdrv_pwrite(extent->file, offset, &gtm, sizeof(gtm)) < 0) {
>> +        ret = -EIO;
>> +        goto out;
>> +    }
>> +
>> +    offset += sizeof(gtm);
>> +    extent->l1_table[extent->l1_index] = offset >> 9;
>> +    if (bdrv_pwrite(extent->file, offset,
>> +                    extent->l2_table, extent->l2_size * sizeof(uint32_t)
>> +                   ) != extent->l2_size * sizeof(uint32_t)) {
>> +        ret = -EIO;
>> +        goto out;
>> +    }
>> +
>> +    offset += extent->l2_size * sizeof(uint32_t);
>> +    extent->l1_table_offset = offset;
>> +    extent->footer.header.gd_offset = cpu_to_le64(offset >> 9);
>> +
>> +    ret = 0;
>> + out:
>> +    return ret;
>> +}
>> +
>>  static int get_cluster_offset(BlockDriverState *bs,
>>                                      VmdkExtent *extent,
>>                                      VmdkMetaData *m_data,
>> @@ -1034,8 +1165,8 @@ static int get_cluster_offset(BlockDriverState *bs,
>>                                      uint64_t *cluster_offset)
>>  {
>>      unsigned int l1_index, l2_offset, l2_index;
>> -    int min_index, i, j;
>> -    uint32_t min_count, *l2_table;
>> +    int min_index, i, j, ret;
>> +    uint32_t min_count;
>>      bool zeroed = false;
>>
>>      if (m_data) {
>> @@ -1048,11 +1179,25 @@ static int get_cluster_offset(BlockDriverState *bs,
>>
>>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
>> -    if (l1_index >= extent->l1_size) {
>> +    if (extent->compressed && l1_index &&
>> +            extent->l1_index != l1_index) {
>> +        ret = vmdk_write_grain_table(extent);
>> +        if (ret < 0)
>> +            return ret;
>> +    }
>> +
>> +    extent->l1_index = l1_index;
>> +    if (extent->l1_index >= extent->l1_size) {
>>          return VMDK_ERROR;
>>      }
>> -    l2_offset = extent->l1_table[l1_index];
>> + retry:
>> +    l2_offset = extent->l1_table[extent->l1_index];
>> +
>>      if (!l2_offset) {
>> +        if (extent->compressed) {
>> +            extent->l1_table[extent->l1_index] = bdrv_getlength(extent->file);
>> +            goto retry;
>> +        }
>>          return VMDK_UNALLOC;
>>      }
>>      for (i = 0; i < L2_CACHE_SIZE; i++) {
>> @@ -1063,7 +1208,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>>                      extent->l2_cache_counts[j] >>= 1;
>>                  }
>>              }
>> -            l2_table = extent->l2_cache + (i * extent->l2_size);
>> +            extent->l2_table = extent->l2_cache + (i * extent->l2_size);
>>              goto found;
>>          }
>>      }
>> @@ -1076,11 +1221,11 @@ static int get_cluster_offset(BlockDriverState *bs,
>>              min_index = i;
>>          }
>>      }
>> -    l2_table = extent->l2_cache + (min_index * extent->l2_size);
>> +    extent->l2_table = extent->l2_cache + (min_index * extent->l2_size);
>>      if (bdrv_pread(
>>                  extent->file,
>>                  (int64_t)l2_offset * 512,
>> -                l2_table,
>> +                extent->l2_table,
>>                  extent->l2_size * sizeof(uint32_t)
>>              ) != extent->l2_size * sizeof(uint32_t)) {
>>          return VMDK_ERROR;
>> @@ -1090,15 +1235,15 @@ static int get_cluster_offset(BlockDriverState *bs,
>>      extent->l2_cache_counts[min_index] = 1;
>>   found:
>>      l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
>> -    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
>> +    *cluster_offset = le32_to_cpu(extent->l2_table[l2_index]);
>>
>>      if (m_data) {
>>          m_data->valid = 1;
>> -        m_data->l1_index = l1_index;
>> +        m_data->l1_index = extent->l1_index;
>>          m_data->l2_index = l2_index;
>>          m_data->offset = *cluster_offset;
>>          m_data->l2_offset = l2_offset;
>> -        m_data->l2_cache_entry = &l2_table[l2_index];
>> +        m_data->l2_cache_entry = &extent->l2_table[l2_index];
>>      }
>>      if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
>>          zeroed = true;
>> @@ -1119,7 +1264,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>>          }
>>
>>          *cluster_offset >>= 9;
>> -        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
>> +        extent->l2_table[l2_index] = cpu_to_le32(*cluster_offset);
>>
>>          /* First of all we write grain itself, to avoid race condition
>>           * that may to corrupt the image.
>> @@ -1210,6 +1355,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>>      uLongf buf_len;
>>      const uint8_t *write_buf = buf;
>>      int write_len = nb_sectors * 512;
>> +    uint64_t gd_offset;
>>
>>      if (extent->compressed) {
>>          if (!extent->has_marker) {
>> @@ -1227,6 +1373,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>>          data->size = buf_len;
>>          write_buf = (uint8_t *)data;
>>          write_len = buf_len + sizeof(VmdkGrainMarker);
>> +
>>      }
>>      ret = bdrv_pwrite(extent->file,
>>                          cluster_offset + offset_in_cluster,
>> @@ -1236,6 +1383,13 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>>          ret = ret < 0 ? ret : -EIO;
>>          goto out;
>>      }
>> +    if (extent->compressed) {
>> +        /* update GD offset after each write */
>> +        gd_offset = bdrv_getlength(extent->file);
>> +        extent->l1_table_offset = gd_offset;
>> +        gd_offset /= BDRV_SECTOR_SIZE;
>> +        extent->footer.header.gd_offset = cpu_to_le64(gd_offset);
>> +    }
>>      ret = 0;
>>   out:
>>      g_free(data);
>> @@ -1534,10 +1688,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>      int ret, i;
>>      BlockDriverState *bs = NULL;
>>      VMDK4Header header;
>> +    VMDK4Footer footer;
>>      Error *local_err = NULL;
>>      uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
>>      uint32_t *gd_buf = NULL;
>>      int gd_buf_size;
>> +    uint64_t grain_offset, rgd_offset, gd_offset;
>>
>>      ret = bdrv_create_file(filename, opts, &local_err);
>>      if (ret < 0) {
>> @@ -1562,28 +1718,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>      }
>>      magic = cpu_to_be32(VMDK4_MAGIC);
>>      memset(&header, 0, sizeof(header));
>> -    header.version = zeroed_grain ? 2 : 1;
>> -    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
>> -                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
>> +    memset(&footer, 0, sizeof(footer));
>> +
>> +    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
>> +    header.flags = VMDK4_FLAG_NL_DETECT
>> +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
>> +                               : VMDK4_FLAG_RGD)
>>                     | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
>>      header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
>>      header.capacity = filesize / BDRV_SECTOR_SIZE;
>>      header.granularity = 128;
>>      header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
>>
>> -    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
>> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>>      gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>>                             BDRV_SECTOR_SIZE);
>>      gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>>      gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>>
>>      header.desc_offset = 1;
>> -    header.desc_size = 20;
>> -    header.rgd_offset = header.desc_offset + header.desc_size;
>> -    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
>> -    header.grain_offset =
>> +    header.desc_size = (compress ? 2 : 20);
>> +    rgd_offset = header.desc_offset + header.desc_size;
>> +    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
>> +    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
>> +    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
>> +    grain_offset =
>>          ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
>>                   header.granularity);
>> +    /* streamOptimized reserves first 128 sectors */
>> +    header.grain_offset = (compress ? header.granularity : grain_offset);
>> +    /* streamOptimzed's grain directory is at the end */
>> +    gd_offset = header.grain_offset + 1;
>> +
>>      /* swap endianness for all header fields */
>>      header.version = cpu_to_le32(header.version);
>>      header.flags = cpu_to_le32(header.flags);
>> @@ -1620,30 +1786,54 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>          goto exit;
>>      }
>>
>> -    /* write grain directory */
>> -    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> -    gd_buf = g_malloc0(gd_buf_size);
>> -    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
>> -         i < gt_count; i++, tmp += gt_size) {
>> -        gd_buf[i] = cpu_to_le32(tmp);
>> -    }
>> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
>> -                      gd_buf, gd_buf_size);
>> -    if (ret < 0) {
>> -        error_set(errp, QERR_IO_ERROR);
>> -        goto exit;
>> -    }
>> +    if (compress) {
>> +        /* footer marker */
>> +        footer.footer_marker.val = cpu_to_le64(1);
>> +        footer.footer_marker.type = cpu_to_le32(MARKER_FOOTER);
>>
>> -    /* write backup grain directory */
>> -    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
>> -         i < gt_count; i++, tmp += gt_size) {
>> -        gd_buf[i] = cpu_to_le32(tmp);
>> -    }
>> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
>> -                      gd_buf, gd_buf_size);
>> -    if (ret < 0) {
>> -        error_set(errp, QERR_IO_ERROR);
>> -        goto exit;
>> +        /* header */
>> +        footer.magic = cpu_to_be32(VMDK4_MAGIC);
>> +        footer.header = header;
>> +
>> +        /* grain directory offset */
>> +        footer.header.gd_offset = cpu_to_le64(gd_offset);
>> +
>> +        /* EOS marker */
>> +        footer.eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
>> +
>> +        ret = vmdk_write_footer(bs, &footer, NULL);
>> +        if (ret < 0) {
>> +            error_set(errp, QERR_IO_ERROR);
>> +            goto exit;
>> +        }
>> +    } else {
>> +        /* write redundant grain directory (if applicable) */
>> +        if (le64_to_cpu(header.rgd_offset)) {
>> +            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> +            gd_buf = g_malloc0(gd_buf_size);
>> +            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
>> +                 i < gt_count; i++, tmp += gt_size) {
>> +                gd_buf[i] = cpu_to_le32(tmp);
>> +            }
>> +            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
>> +                              gd_buf, gd_buf_size);
>> +            if (ret < 0) {
>> +                error_set(errp, QERR_IO_ERROR);
>> +                goto exit;
>> +            }
>> +        }
>> +
>> +        /* write grain directory */
>> +        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
>> +             i < gt_count; i++, tmp += gt_size) {
>> +            gd_buf[i] = cpu_to_le32(tmp);
>> +        }
>> +        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
>> +                          gd_buf, gd_buf_size);
>> +        if (ret < 0) {
>> +            error_set(errp, QERR_IO_ERROR);
>> +            goto exit;
>> +        }
>>      }
>>
>>      ret = 0;
>> @@ -1914,6 +2104,15 @@ exit:
>>  static void vmdk_close(BlockDriverState *bs)
>>  {
>>      BDRVVmdkState *s = bs->opaque;
>> +    VmdkExtent *extent = &s->extents[0];
>> +
>> +    if (extent->compressed) {
>> +        while (extent < &s->extents[s->num_extents]) {
>> +            vmdk_write_grain_table(extent);
>> +            vmdk_write_footer(extent->file, &extent->footer, extent);
>> +            extent++;
>> +        }
>> +    }
>>
>>      vmdk_free_extents(bs);
>>      g_free(s->create_type);
>> --
>> 1.7.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-29 13:46   ` Milos Vyletel
@ 2014-07-29 14:37     ` Stefan Hajnoczi
  2014-07-29 14:49       ` Milos Vyletel
  0 siblings, 1 reply; 21+ messages in thread
From: Stefan Hajnoczi @ 2014-07-29 14:37 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, Fam Zheng, qemu-devel, Stefan Hajnoczi

On Tue, Jul 29, 2014 at 2:46 PM, Milos Vyletel <milos.vyletel@gmail.com> wrote:
> On Tue, Jul 29, 2014 at 9:37 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>> On Mon, Jul 07, 2014 at 10:54:27AM -0400, Milos Vyletel wrote:
>>> VMDK's streamOptimized format is different that regular sparse format.
>>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>>> written during image creation mainly because there is no way to tell
>>> how much space data will occupy once they are compressed. Also the
>>> location of header, L1 and L2 tables differs.
>>>
>>> - L2 tables (grain tables) are written after all grains they point to
>>> - L1 tables are written after all grains and L2 tables
>>> - footer at the end is used instead of header in first sector
>>>
>>> This patch improves streamOptimized support and adds possibility to
>>> create true streamOptimized images using qemu-img. Some of the changes
>>> are from VMDK specs, some of them from hexdump-ing images from VMWare
>>> and VirtualBox.
>>>
>>> I have compared these images to the ones generated by VMWare and vbox
>>> and they are identical with the exception of DescriptorFile that has
>>> some differences but none that would change behavior(CID and some
>>> additional DDB entries differ) and streamOptimized image generated from
>>> raw image was succesfully imported (as OVA) into VMWare ESXi and Oracle
>>> OVM.
>>>
>>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>>> ---
>>>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>>>  1 files changed, 281 insertions(+), 82 deletions(-)
>>
>> What does this patch do beyond what QEMU already supports today?
>>
>> Is there a particular application that rejected QEMU's streamOptimized
>> images?  Is this a bug fix?
>>
>> Please use scripts/checkpatch.pl to check coding style.
>>
>> Fam: Please review
>>
>
> Images created/converted using QEMU were rejected by VMWare ESXi,
> vCloud, VirtualBox and Oracle OVM. I did not try anything else.
>
> Generally speaking streamOptimized format is not followed by QEMU.
> Instead regular VMDK format + encryption is used. We can say this is
> bug fix since streamOptimzed format is already included but does not
> work well.
>
> I'll check the style and fix the code. I'll postpone resubmit until
> further review is done since I'm sure there will be things that will
> need to be fixed.

What was the command-line you used?

qemu-img convert -f qcow2 -O vmdk -o subformat=streamOptimized
input.qcow2 output.vmdk

Stefan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-29 14:37     ` Stefan Hajnoczi
@ 2014-07-29 14:49       ` Milos Vyletel
  2014-07-30  8:09         ` Stefan Hajnoczi
  0 siblings, 1 reply; 21+ messages in thread
From: Milos Vyletel @ 2014-07-29 14:49 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, Fam Zheng, qemu-devel, Stefan Hajnoczi

On Tue, Jul 29, 2014 at 10:37 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Tue, Jul 29, 2014 at 2:46 PM, Milos Vyletel <milos.vyletel@gmail.com> wrote:
>> On Tue, Jul 29, 2014 at 9:37 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> On Mon, Jul 07, 2014 at 10:54:27AM -0400, Milos Vyletel wrote:
>>>> VMDK's streamOptimized format is different that regular sparse format.
>>>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>>>> written during image creation mainly because there is no way to tell
>>>> how much space data will occupy once they are compressed. Also the
>>>> location of header, L1 and L2 tables differs.
>>>>
>>>> - L2 tables (grain tables) are written after all grains they point to
>>>> - L1 tables are written after all grains and L2 tables
>>>> - footer at the end is used instead of header in first sector
>>>>
>>>> This patch improves streamOptimized support and adds possibility to
>>>> create true streamOptimized images using qemu-img. Some of the changes
>>>> are from VMDK specs, some of them from hexdump-ing images from VMWare
>>>> and VirtualBox.
>>>>
>>>> I have compared these images to the ones generated by VMWare and vbox
>>>> and they are identical with the exception of DescriptorFile that has
>>>> some differences but none that would change behavior(CID and some
>>>> additional DDB entries differ) and streamOptimized image generated from
>>>> raw image was succesfully imported (as OVA) into VMWare ESXi and Oracle
>>>> OVM.
>>>>
>>>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>>>> ---
>>>>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>>>>  1 files changed, 281 insertions(+), 82 deletions(-)
>>>
>>> What does this patch do beyond what QEMU already supports today?
>>>
>>> Is there a particular application that rejected QEMU's streamOptimized
>>> images?  Is this a bug fix?
>>>
>>> Please use scripts/checkpatch.pl to check coding style.
>>>
>>> Fam: Please review
>>>
>>
>> Images created/converted using QEMU were rejected by VMWare ESXi,
>> vCloud, VirtualBox and Oracle OVM. I did not try anything else.
>>
>> Generally speaking streamOptimized format is not followed by QEMU.
>> Instead regular VMDK format + encryption is used. We can say this is
>> bug fix since streamOptimzed format is already included but does not
>> work well.
>>
>> I'll check the style and fix the code. I'll postpone resubmit until
>> further review is done since I'm sure there will be things that will
>> need to be fixed.
>
> What was the command-line you used?
>
> qemu-img convert -f qcow2 -O vmdk -o subformat=streamOptimized
> input.qcow2 output.vmdk
>
> Stefan

I've been using the test image from
http://wiki.qemu.org/download/linux-0.2.img.bz2 and converting it like
this:

qemu-img convert -O vmdk -o
adapter_type=lsilogic,compat6,subformat=streamOptimized linux-0.2.img
linux-0.2.vmdk

Milos

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-07 14:54 [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support Milos Vyletel
  2014-07-29 13:37 ` Stefan Hajnoczi
@ 2014-07-30  7:51 ` Fam Zheng
  2014-07-31 18:24   ` Milos Vyletel
  2014-08-04 17:10 ` [Qemu-devel] [PATCH v2] " Milos Vyletel
  2 siblings, 1 reply; 21+ messages in thread
From: Fam Zheng @ 2014-07-30  7:51 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Mon, 07/07 10:54, Milos Vyletel wrote:
> VMDK's streamOptimized format is different that regular sparse format.

s/that/from/

> L1(GD) and L2(GT) tables are not predefined but rather generated and
> written during image creation mainly because there is no way to tell
> how much space data will occupy once they are compressed. Also the
> location of header, L1 and L2 tables differs.

s/differs/differ/

> 
> - L2 tables (grain tables) are written after all grains they point to
> - L1 tables are written after all grains and L2 tables
> - footer at the end is used instead of header in first sector
> 
> This patch improves streamOptimized support and adds possibility to
> create true streamOptimized images using qemu-img. Some of the changes
> are from VMDK specs, some of them from hexdump-ing images from VMWare
> and VirtualBox.
> 
> I have compared these images to the ones generated by VMWare and vbox
> and they are identical with the exception of DescriptorFile that has
> some differences but none that would change behavior(CID and some
> additional DDB entries differ) and streamOptimized image generated from
> raw image was succesfully imported (as OVA) into VMWare ESXi and Oracle
> OVM.
> 
> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
> ---
>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>  1 files changed, 281 insertions(+), 82 deletions(-)
> 
> diff --git a/block/vmdk.c b/block/vmdk.c
> index 27a78da..f482225 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -81,6 +81,21 @@ typedef struct {
>      uint16_t compressAlgorithm;
>  } QEMU_PACKED VMDK4Header;
>  
> +typedef struct {
> +    uint64_t val;
> +    uint32_t size;
> +    uint32_t type;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
> +} QEMU_PACKED VMDK4MetaMarker;
> +
> +typedef struct {
> +    VMDK4MetaMarker footer_marker;
> +    uint32_t magic;
> +    VMDK4Header header;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
> +    VMDK4MetaMarker eos_marker;
> +} QEMU_PACKED VMDK4Footer;
> +
>  #define L2_CACHE_SIZE 16
>  
>  typedef struct VmdkExtent {
> @@ -89,24 +104,29 @@ typedef struct VmdkExtent {
>      bool compressed;
>      bool has_marker;
>      bool has_zero_grain;
> +    bool has_footer;
>      int version;
>      int64_t sectors;
>      int64_t end_sector;
>      int64_t flat_start_offset;
>      int64_t l1_table_offset;
>      int64_t l1_backup_table_offset;
> +    uint32_t l1_index;

Could you track the allocation status of each grain table with its l1_table
entry value? For those with value 0, we allocate a grain table in the file and
update its l1 entry. That way the fields l1_index and l2_table are not
necessary here.

>      uint32_t *l1_table;
>      uint32_t *l1_backup_table;
>      unsigned int l1_size;
>      uint32_t l1_entry_sectors;
>  
>      unsigned int l2_size;
> +    uint32_t *l2_table;
>      uint32_t *l2_cache;
>      uint32_t l2_cache_offsets[L2_CACHE_SIZE];
>      uint32_t l2_cache_counts[L2_CACHE_SIZE];
>  
>      int64_t cluster_sectors;
>      char *type;
> +
> +    VMDK4Footer footer;
>  } VmdkExtent;

<snip>

>  
>  typedef struct BDRVVmdkState {
> @@ -1026,6 +1066,97 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      return VMDK_OK;
>  }
>  
> +static int vmdk_write_footer(BlockDriverState *bs,
> +                           VMDK4Footer *footer,
> +                           VmdkExtent *extent)

Bad alignment.

> +{
> +    int i, ret, gd_buf_size;
> +    uint32_t *gd_buf = NULL;
> +    uint32_t grains, gd_sectors, gt_size, gt_count;
> +    uint64_t offset;
> +    VMDK4Header header;
> +    VMDK4MetaMarker gd_marker;
> +
> +    header = footer->header;
> +    offset = le64_to_cpu(header.gd_offset);
> +
> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
> +    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
> +                           BDRV_SECTOR_SIZE);
> +    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
> +
> +    /* write grain directory marker */
> +    memset(&gd_marker, 0, sizeof(gd_marker));
> +    gd_marker.val = cpu_to_le64(gd_sectors);
> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
> +
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, &gd_marker, sizeof(gd_marker));
> +    if (ret < 0)
> +        goto exit;

Always add braces around the if body. Again, scripts/checkpatch.pl can help
check for style issues.

> +    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
> +
> +    /* write grain directory */
> +    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> +    gd_buf = g_malloc0(gd_buf_size);

gd_buf is never freed.

> +    if (extent) {
> +        /* copy over L1 table if we have it */
> +        for (i = 0; i < gt_count; i++) {
> +            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
> +        }
> +    }
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
> +    if (ret < 0)
> +        goto exit;
> +
> +    /* save real gd_offset */
> +    footer->header.gd_offset = cpu_to_le64(offset);
> +    offset += gd_sectors;
> +
> +    /* write footer */
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
> +    if (ret < 0)
> +        goto exit;
> +
> +    ret = 0;
> + exit:
> +    return ret;
> +}

<snip>

>  static int get_cluster_offset(BlockDriverState *bs,
>                                      VmdkExtent *extent,
>                                      VmdkMetaData *m_data,
> @@ -1034,8 +1165,8 @@ static int get_cluster_offset(BlockDriverState *bs,
>                                      uint64_t *cluster_offset)
>  {
>      unsigned int l1_index, l2_offset, l2_index;
> -    int min_index, i, j;
> -    uint32_t min_count, *l2_table;
> +    int min_index, i, j, ret;
> +    uint32_t min_count;
>      bool zeroed = false;
>  
>      if (m_data) {
> @@ -1048,11 +1179,25 @@ static int get_cluster_offset(BlockDriverState *bs,
>  
>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
> -    if (l1_index >= extent->l1_size) {
> +    if (extent->compressed && l1_index &&
> +            extent->l1_index != l1_index) {
> +        ret = vmdk_write_grain_table(extent);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    extent->l1_index = l1_index;
> +    if (extent->l1_index >= extent->l1_size) {
>          return VMDK_ERROR;
>      }
> -    l2_offset = extent->l1_table[l1_index];
> + retry:
> +    l2_offset = extent->l1_table[extent->l1_index];
> +
>      if (!l2_offset) {
> +        if (extent->compressed) {
> +            extent->l1_table[extent->l1_index] = bdrv_getlength(extent->file);

This control flow doesn't make sense. We just write a new grain table at the
end of the file and update the l1_table entry.

> +            goto retry;
> +        }
>          return VMDK_UNALLOC;
>      }
>      for (i = 0; i < L2_CACHE_SIZE; i++) {

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-29 14:49       ` Milos Vyletel
@ 2014-07-30  8:09         ` Stefan Hajnoczi
  2014-07-31 18:22           ` Milos Vyletel
  0 siblings, 1 reply; 21+ messages in thread
From: Stefan Hajnoczi @ 2014-07-30  8:09 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, Fam Zheng, qemu-devel, Stefan Hajnoczi

On Tue, Jul 29, 2014 at 3:49 PM, Milos Vyletel <milos.vyletel@gmail.com> wrote:
> On Tue, Jul 29, 2014 at 10:37 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Tue, Jul 29, 2014 at 2:46 PM, Milos Vyletel <milos.vyletel@gmail.com> wrote:
>>> On Tue, Jul 29, 2014 at 9:37 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>> On Mon, Jul 07, 2014 at 10:54:27AM -0400, Milos Vyletel wrote:
>>>>> VMDK's streamOptimized format is different that regular sparse format.
>>>>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>>>>> written during image creation mainly because there is no way to tell
>>>>> how much space data will occupy once they are compressed. Also the
>>>>> location of header, L1 and L2 tables differs.
>>>>>
>>>>> - L2 tables (grain tables) are written after all grains they point to
>>>>> - L1 tables are written after all grains and L2 tables
>>>>> - footer at the end is used instead of header in first sector
>>>>>
>>>>> This patch improves streamOptimized support and adds possibility to
>>>>> create true streamOptimized images using qemu-img. Some of the changes
>>>>> are from VMDK specs, some of them from hexdump-ing images from VMWare
>>>>> and VirtualBox.
>>>>>
>>>>> I have compared these images to the ones generated by VMWare and vbox
>>>>> and they are identical with the exception of DescriptorFile that has
>>>>> some differences but none that would change behavior(CID and some
>>>>> additional DDB entries differ) and streamOptimized image generated from
>>>>> raw image was succesfully imported (as OVA) into VMWare ESXi and Oracle
>>>>> OVM.
>>>>>
>>>>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>>>>> ---
>>>>>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>>>>>  1 files changed, 281 insertions(+), 82 deletions(-)
>>>>
>>>> What does this patch do beyond what QEMU already supports today?
>>>>
>>>> Is there a particular application that rejected QEMU's streamOptimized
>>>> images?  Is this a bug fix?
>>>>
>>>> Please use scripts/checkpatch.pl to check coding style.
>>>>
>>>> Fam: Please review
>>>>
>>>
>>> Images created/converted using QEMU were rejected by VMWare ESXi,
>>> vCloud, VirtualBox and Oracle OVM. I did not try anything else.
>>>
>>> Generally speaking streamOptimized format is not followed by QEMU.
>>> Instead regular VMDK format + encryption is used. We can say this is
>>> bug fix since streamOptimzed format is already included but does not
>>> work well.
>>>
>>> I'll check the style and fix the code. I'll postpone resubmit until
>>> further review is done since I'm sure there will be things that will
>>> need to be fixed.
>>
>> What was the command-line you used?
>>
>> qemu-img convert -f qcow2 -O vmdk -o subformat=streamOptimized
>> input.qcow2 output.vmdk
>>
>> Stefan
>
> I've been using test image from
> http://wiki.qemu.org/download/linux-0.2.img.bz2 and converting it like
> this:
>
> qemu-img convert -O vmdk -o
> adapter_type=lsilogic,compat6,subformat=streamOptimized linux-0.2.img
> linux-0.2.vmdk

Okay, great.  I wanted to check that you used subformat=streamOptimized.

If possible, please be more specific in the commit description about
what caused ESXi and other software to reject the image and what
exactly this patch changes.

Stefan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-30  8:09         ` Stefan Hajnoczi
@ 2014-07-31 18:22           ` Milos Vyletel
  0 siblings, 0 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-07-31 18:22 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, Fam Zheng, qemu-devel, Stefan Hajnoczi

I will update the description in v2 with more info.

On Wed, Jul 30, 2014 at 4:09 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Tue, Jul 29, 2014 at 3:49 PM, Milos Vyletel <milos.vyletel@gmail.com> wrote:
>> On Tue, Jul 29, 2014 at 10:37 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>> On Tue, Jul 29, 2014 at 2:46 PM, Milos Vyletel <milos.vyletel@gmail.com> wrote:
>>>> On Tue, Jul 29, 2014 at 9:37 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>> On Mon, Jul 07, 2014 at 10:54:27AM -0400, Milos Vyletel wrote:
>>>>>> VMDK's streamOptimized format is different that regular sparse format.
>>>>>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>>>>>> written during image creation mainly because there is no way to tell
>>>>>> how much space data will occupy once they are compressed. Also the
>>>>>> location of header, L1 and L2 tables differs.
>>>>>>
>>>>>> - L2 tables (grain tables) are written after all grains they point to
>>>>>> - L1 tables are written after all grains and L2 tables
>>>>>> - footer at the end is used instead of header in first sector
>>>>>>
>>>>>> This patch improves streamOptimized support and adds possibility to
>>>>>> create true streamOptimized images using qemu-img. Some of the changes
>>>>>> are from VMDK specs, some of them from hexdump-ing images from VMWare
>>>>>> and VirtualBox.
>>>>>>
>>>>>> I have compared these images to the ones generated by VMWare and vbox
>>>>>> and they are identical with the exception of DescriptorFile that has
>>>>>> some differences but none that would change behavior(CID and some
>>>>>> additional DDB entries differ) and streamOptimized image generated from
>>>>>> raw image was succesfully imported (as OVA) into VMWare ESXi and Oracle
>>>>>> OVM.
>>>>>>
>>>>>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>>>>>> ---
>>>>>>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>>>>>>  1 files changed, 281 insertions(+), 82 deletions(-)
>>>>>
>>>>> What does this patch do beyond what QEMU already supports today?
>>>>>
>>>>> Is there a particular application that rejected QEMU's streamOptimized
>>>>> images?  Is this a bug fix?
>>>>>
>>>>> Please use scripts/checkpatch.pl to check coding style.
>>>>>
>>>>> Fam: Please review
>>>>>
>>>>
>>>> Images created/converted using QEMU were rejected by VMWare ESXi,
>>>> vCloud, VirtualBox and Oracle OVM. I did not try anything else.
>>>>
>>>> Generally speaking streamOptimized format is not followed by QEMU.
>>>> Instead regular VMDK format + encryption is used. We can say this is
>>>> bug fix since streamOptimzed format is already included but does not
>>>> work well.
>>>>
>>>> I'll check the style and fix the code. I'll postpone resubmit until
>>>> further review is done since I'm sure there will be things that will
>>>> need to be fixed.
>>>
>>> What was the command-line you used?
>>>
>>> qemu-img convert -f qcow2 -O vmdk -o subformat=streamOptimized
>>> input.qcow2 output.vmdk
>>>
>>> Stefan
>>
>> I've been using test image from
>> http://wiki.qemu.org/download/linux-0.2.img.bz2 and converting it like
>> this:
>>
>> qemu-img convert -O vmdk -o
>> adapter_type=lsilogic,compat6,subformat=streamOptimized linux-0.2.img
>> linux-0.2.vmdk
>
> Okay, great.  I wanted to check that you used subformat=streamOptimized.
>
> If possible, please be more specific in the commit description about
> what caused ESXi and other software to reject the image and what
> exactly this patch changes.
>
> Stefan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support
  2014-07-30  7:51 ` Fam Zheng
@ 2014-07-31 18:24   ` Milos Vyletel
  0 siblings, 0 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-07-31 18:24 UTC (permalink / raw)
  To: Fam Zheng; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Wed, Jul 30, 2014 at 3:51 AM, Fam Zheng <famz@redhat.com> wrote:
> On Mon, 07/07 10:54, Milos Vyletel wrote:
>> VMDK's streamOptimized format is different that regular sparse format.
>
> s/that/from/
>
>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>> written during image creation mainly because there is no way to tell
>> how much space data will occupy once they are compressed. Also the
>> location of header, L1 and L2 tables differs.
>
> s/differs/differ/
>
>>
>> - L2 tables (grain tables) are written after all grains they point to
>> - L1 tables are written after all grains and L2 tables
>> - footer at the end is used instead of header in first sector
>>
>> This patch improves streamOptimized support and adds possibility to
>> create true streamOptimized images using qemu-img. Some of the changes
>> are from VMDK specs, some of them from hexdump-ing images from VMWare
>> and VirtualBox.
>>
>> I have compared these images to the ones generated by VMWare and vbox
>> and they are identical with the exception of DescriptorFile that has
>> some differences but none that would change behavior(CID and some
>> additional DDB entries differ) and streamOptimized image generated from
>> raw image was successfully imported (as OVA) into VMWare ESXi and Oracle
>> OVM.
>>
>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>> ---
>>  block/vmdk.c |  363 +++++++++++++++++++++++++++++++++++++++++++++-------------
>>  1 files changed, 281 insertions(+), 82 deletions(-)
>>
>> diff --git a/block/vmdk.c b/block/vmdk.c
>> index 27a78da..f482225 100644
>> --- a/block/vmdk.c
>> +++ b/block/vmdk.c
>> @@ -81,6 +81,21 @@ typedef struct {
>>      uint16_t compressAlgorithm;
>>  } QEMU_PACKED VMDK4Header;
>>
>> +typedef struct {
>> +    uint64_t val;
>> +    uint32_t size;
>> +    uint32_t type;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
>> +} QEMU_PACKED VMDK4MetaMarker;
>> +
>> +typedef struct {
>> +    VMDK4MetaMarker footer_marker;
>> +    uint32_t magic;
>> +    VMDK4Header header;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
>> +    VMDK4MetaMarker eos_marker;
>> +} QEMU_PACKED VMDK4Footer;
>> +
>>  #define L2_CACHE_SIZE 16
>>
>>  typedef struct VmdkExtent {
>> @@ -89,24 +104,29 @@ typedef struct VmdkExtent {
>>      bool compressed;
>>      bool has_marker;
>>      bool has_zero_grain;
>> +    bool has_footer;
>>      int version;
>>      int64_t sectors;
>>      int64_t end_sector;
>>      int64_t flat_start_offset;
>>      int64_t l1_table_offset;
>>      int64_t l1_backup_table_offset;
>> +    uint32_t l1_index;
>
> Could you track the allocation status of grain table with l1_table entry value?
> For those with value 0, we allocate grain table in file, and update its l1
> entry. That way the fields l1_index and l2_table are not necessary here.

I possibly could and I'm trying that now. I can use VmdkMetaData
structure that already has these values and leave VmdkExtent as it is.

>
>>      uint32_t *l1_table;
>>      uint32_t *l1_backup_table;
>>      unsigned int l1_size;
>>      uint32_t l1_entry_sectors;
>>
>>      unsigned int l2_size;
>> +    uint32_t *l2_table;
>>      uint32_t *l2_cache;
>>      uint32_t l2_cache_offsets[L2_CACHE_SIZE];
>>      uint32_t l2_cache_counts[L2_CACHE_SIZE];
>>
>>      int64_t cluster_sectors;
>>      char *type;
>> +
>> +    VMDK4Footer footer;
>>  } VmdkExtent;
>
> <snip>
>
>>
>>  typedef struct BDRVVmdkState {
>> @@ -1026,6 +1066,97 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      return VMDK_OK;
>>  }
>>
>> +static int vmdk_write_footer(BlockDriverState *bs,
>> +                           VMDK4Footer *footer,
>> +                           VmdkExtent *extent)
>
> Bad alignment.
>
>> +{
>> +    int i, ret, gd_buf_size;
>> +    uint32_t *gd_buf = NULL;
>> +    uint32_t grains, gd_sectors, gt_size, gt_count;
>> +    uint64_t offset;
>> +    VMDK4Header header;
>> +    VMDK4MetaMarker gd_marker;
>> +
>> +    header = footer->header;
>> +    offset = le64_to_cpu(header.gd_offset);
>> +
>> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>> +    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>> +                           BDRV_SECTOR_SIZE);
>> +    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>> +
>> +    /* write grain directory marker */
>> +    memset(&gd_marker, 0, sizeof(gd_marker));
>> +    gd_marker.val = cpu_to_le64(gd_sectors);
>> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
>> +
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, &gd_marker, sizeof(gd_marker));
>> +    if (ret < 0)
>> +        goto exit;
>
> Always add braces around if body. Again, scripts/checkpatch.pl can help
> check style issue.
>
>> +    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
>> +
>> +    /* write grain directory */
>> +    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> +    gd_buf = g_malloc0(gd_buf_size);
>
> gd_buf is never freed.
>
>> +    if (extent) {
>> +        /* copy over L1 table if we have it */
>> +        for (i = 0; i < gt_count; i++) {
>> +            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
>> +        }
>> +    }
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
>> +    if (ret < 0)
>> +        goto exit;
>> +
>> +    /* save real gd_offset */
>> +    footer->header.gd_offset = cpu_to_le64(offset);
>> +    offset += gd_sectors;
>> +
>> +    /* write footer */
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
>> +    if (ret < 0)
>> +        goto exit;
>> +
>> +    ret = 0;
>> + exit:
>> +    return ret;
>> +}
>
> <snip>
>
>>  static int get_cluster_offset(BlockDriverState *bs,
>>                                      VmdkExtent *extent,
>>                                      VmdkMetaData *m_data,
>> @@ -1034,8 +1165,8 @@ static int get_cluster_offset(BlockDriverState *bs,
>>                                      uint64_t *cluster_offset)
>>  {
>>      unsigned int l1_index, l2_offset, l2_index;
>> -    int min_index, i, j;
>> -    uint32_t min_count, *l2_table;
>> +    int min_index, i, j, ret;
>> +    uint32_t min_count;
>>      bool zeroed = false;
>>
>>      if (m_data) {
>> @@ -1048,11 +1179,25 @@ static int get_cluster_offset(BlockDriverState *bs,
>>
>>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
>> -    if (l1_index >= extent->l1_size) {
>> +    if (extent->compressed && l1_index &&
>> +            extent->l1_index != l1_index) {
>> +        ret = vmdk_write_grain_table(extent);
>> +        if (ret < 0)
>> +            return ret;
>> +    }
>> +
>> +    extent->l1_index = l1_index;
>> +    if (extent->l1_index >= extent->l1_size) {
>>          return VMDK_ERROR;
>>      }
>> -    l2_offset = extent->l1_table[l1_index];
>> + retry:
>> +    l2_offset = extent->l1_table[extent->l1_index];
>> +
>>      if (!l2_offset) {
>> +        if (extent->compressed) {
>> +            extent->l1_table[extent->l1_index] = bdrv_getlength(extent->file);
>
> This control flow doesn't make sense. We just write a new grain table at the end
> of file and update the l1_table entry.
>
>> +            goto retry;
>> +        }
>>          return VMDK_UNALLOC;
>>      }
>>      for (i = 0; i < L2_CACHE_SIZE; i++) {

Thanks for the comments. I'm working on them and I'll hopefully have v2
ready tomorrow.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
  2014-07-07 14:54 [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support Milos Vyletel
  2014-07-29 13:37 ` Stefan Hajnoczi
  2014-07-30  7:51 ` Fam Zheng
@ 2014-08-04 17:10 ` Milos Vyletel
  2014-08-05  5:27   ` Fam Zheng
                     ` (2 more replies)
  2 siblings, 3 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-08-04 17:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Fam Zheng, Stefan Hajnoczi, Milos Vyletel

VMDK's streamOptimized format is different from regular sparse format.
L1(GD) and L2(GT) tables are not predefined but rather generated and
written during image creation mainly because there is no way to tell
how much space data will occupy once they are compressed. Also the
location of header, L1 and L2 tables differ.

- L2 tables (grain tables) are written after all grains they point to
- L1 tables are written after all grains and L2 tables
- footer at the end is used instead of header in first sector

Images generated by qemu-img could not be imported (as part of an OVA archive)
into either VMWare or OVM because of errors.

- VMWare during OVA import:
Not a supported disk format (sparse VMDK too old)

- OVM's vbox-img during conversion:
vbox-img: error: Error while copying the image: VERR_EOF

This patch fixes streamOptimized support in qemu which was not fully
compatible with VMDK specifications as defined in the latest available version
at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.

Qemu generated images are identical to the ones generated by VMWare and
OVM (vbox-img) with the exception of DescriptorFile but that is expected
(CID and some additional DDB entries differ). They were also successfully
imported to VMWare vCloud, ESXi and Oracle OVM.

Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
---
v2 changes:
- updated commit message description with errors received
- style/grammar fixes (clean checkpatch pass)
- removed l2_table pointer from VmdkExtent struct
- fixed memory leak in vmdk_write_footer()

 block/vmdk.c |  365 ++++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 290 insertions(+), 75 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 0517bba..ecaca8f 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -81,6 +81,21 @@ typedef struct {
     uint16_t compressAlgorithm;
 } QEMU_PACKED VMDK4Header;
 
+typedef struct {
+    uint64_t val;
+    uint32_t size;
+    uint32_t type;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
+} QEMU_PACKED VMDK4MetaMarker;
+
+typedef struct {
+    VMDK4MetaMarker footer_marker;
+    uint32_t magic;
+    VMDK4Header header;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
+    VMDK4MetaMarker eos_marker;
+} QEMU_PACKED VMDK4Footer;
+
 #define L2_CACHE_SIZE 16
 
 typedef struct VmdkExtent {
@@ -89,12 +104,14 @@ typedef struct VmdkExtent {
     bool compressed;
     bool has_marker;
     bool has_zero_grain;
+    bool has_footer;
     int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
+    uint32_t l1_index;
     uint32_t *l1_table;
     uint32_t *l1_backup_table;
     unsigned int l1_size;
@@ -107,6 +124,8 @@ typedef struct VmdkExtent {
 
     int64_t cluster_sectors;
     char *type;
+
+    VMDK4Footer footer;
 } VmdkExtent;
 
 typedef struct BDRVVmdkState {
@@ -125,7 +144,6 @@ typedef struct BDRVVmdkState {
 
 typedef struct VmdkMetaData {
     uint32_t offset;
-    unsigned int l1_index;
     unsigned int l2_index;
     unsigned int l2_offset;
     int valid;
@@ -555,14 +573,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
     return buf;
 }
 
+static int vmdk_read_footer(BlockDriverState *bs,
+                            VMDK4Footer *footer)
+{
+    int ret;
+
+    /*
+    * footer starts 3 sectors from end
+    * - footer marker
+    * - footer
+    * - end-of-stream marker
+    */
+    ret = bdrv_pread(bs->file,
+        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
+        footer, sizeof(*footer));
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Some sanity checks for the footer */
+    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
+        le32_to_cpu(footer->footer_marker.size) != 0  ||
+        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
+        le64_to_cpu(footer->eos_marker.val) != 0  ||
+        le32_to_cpu(footer->eos_marker.size) != 0  ||
+        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = VMDK_OK;
+ out:
+    return ret;
+}
+
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
                            int flags, Error **errp)
 {
     int ret;
+    bool has_footer = false;
     uint32_t magic;
     uint32_t l1_size, l1_entry_sectors;
     VMDK4Header header;
+    VMDK4Footer footer;
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
     int64_t l1_backup_offset = 0;
@@ -593,48 +647,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
 
     if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
         /*
-         * The footer takes precedence over the header, so read it in. The
-         * footer starts at offset -1024 from the end: One sector for the
-         * footer, and another one for the end-of-stream marker.
+         * The footer takes precedence over the header, so read it in.
          */
-        struct {
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED footer_marker;
-
-            uint32_t magic;
-            VMDK4Header header;
-            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
-
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED eos_marker;
-        } QEMU_PACKED footer;
-
-        ret = bdrv_pread(file,
-            bs->file->total_sectors * 512 - 1536,
-            &footer, sizeof(footer));
+        ret = vmdk_read_footer(bs, &footer);
         if (ret < 0) {
             return ret;
         }
-
-        /* Some sanity checks for the footer */
-        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
-            le32_to_cpu(footer.footer_marker.size) != 0  ||
-            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
-            le64_to_cpu(footer.eos_marker.val) != 0  ||
-            le32_to_cpu(footer.eos_marker.size) != 0  ||
-            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
-        {
-            return -EINVAL;
-        }
-
+        has_footer = true;
         header = footer.header;
     }
 
@@ -645,11 +664,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                   bs->device_name, "vmdk", buf);
         return -ENOTSUP;
-    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
+    } else if (le32_to_cpu(header.version) == 3 &&
+               (flags & BDRV_O_RDWR) &&
+               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
         /* VMware KB 2064959 explains that version 3 added support for
          * persistent changed block tracking (CBT), and backup software can
          * read it as version=1 if it doesn't care about the changed area
-         * information. So we are safe to enable read only. */
+         * information. So we are safe to enable read only.
+         * Note that this does not apply to streamOptimized images which
+         * are written only once and are used as transport format */
         error_setg(errp, "VMDK version 3 must be read only");
         return -EINVAL;
     }
@@ -689,11 +712,21 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (ret < 0) {
         return ret;
     }
+    if (has_footer) {
+        extent->has_footer = has_footer;
+        extent->footer = footer;
+    }
+
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
     if (extent->compressed) {
         g_free(s->create_type);
         s->create_type = g_strdup("streamOptimized");
+
+        if (flags & BDRV_O_RDWR) {
+            bdrv_truncate(file,
+                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
+        }
     }
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
     extent->version = le32_to_cpu(header.version);
@@ -998,6 +1031,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     uint32_t offset;
     QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
     offset = cpu_to_le32(m_data->offset);
+
+    /* do not update on streamOptimized */
+    if (extent->compressed) {
+        return VMDK_OK;
+    }
+
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
@@ -1008,7 +1047,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     }
     /* update backup L2 table */
     if (extent->l1_backup_table_offset != 0) {
-        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
+        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
         if (bdrv_pwrite_sync(
                     extent->file,
                     ((int64_t)m_data->l2_offset * 512)
@@ -1024,6 +1063,113 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     return VMDK_OK;
 }
 
+static int vmdk_write_footer(BlockDriverState *bs,
+                             VMDK4Footer *footer,
+                             VmdkExtent *extent)
+{
+    int i, ret, gd_buf_size;
+    uint32_t *gd_buf = NULL;
+    uint32_t grains, gd_sectors, gt_size, gt_count;
+    uint64_t offset;
+    VMDK4Header header;
+    VMDK4MetaMarker gd_marker;
+
+    header = footer->header;
+    offset = le64_to_cpu(header.gd_offset);
+
+    grains = DIV_ROUND_UP(header.capacity, header.granularity);
+    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
+                           BDRV_SECTOR_SIZE);
+    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
+    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
+
+    /* write grain directory marker */
+    memset(&gd_marker, 0, sizeof(gd_marker));
+    gd_marker.val = cpu_to_le64(gd_sectors);
+    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
+
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE,
+                      &gd_marker, sizeof(gd_marker));
+    if (ret < 0) {
+        return ret;
+    }
+    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
+
+    /* write grain directory */
+    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+    gd_buf = g_malloc0(gd_buf_size);
+    if (extent) {
+        /* copy over L1 table if we have it */
+        for (i = 0; i < gt_count; i++) {
+            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
+        }
+    }
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    /* save real gd_offset */
+    footer->header.gd_offset = cpu_to_le64(offset);
+    offset += gd_sectors;
+
+    /* write footer */
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = 0;
+ exit:
+    g_free(gd_buf);
+    return ret;
+}
+
+static int vmdk_write_grain_table(VmdkExtent *extent)
+{
+    int i, ret;
+    uint64_t offset;
+    uint32_t *l2_table = NULL;
+    VMDK4MetaMarker gtm;
+
+    for (i = 0; i < L2_CACHE_SIZE; i++) {
+        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
+            l2_table = extent->l2_cache + (i * extent->l2_size);
+        }
+    }
+    if (!l2_table) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    offset = le64_to_cpu(extent->footer.header.gd_offset) << 9;
+
+    memset(&gtm, 0, sizeof(gtm));
+    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
+    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
+    if (bdrv_pwrite(extent->file, offset, &gtm, sizeof(gtm)) < 0) {
+        ret = -EIO;
+        goto out;
+    }
+
+    offset += sizeof(gtm);
+    extent->l1_table[extent->l1_index] = offset >> 9;
+    if (bdrv_pwrite(extent->file, offset,
+                    l2_table, extent->l2_size * sizeof(uint32_t)
+                   ) != extent->l2_size * sizeof(uint32_t)) {
+        ret = -EIO;
+        goto out;
+    }
+
+    offset += extent->l2_size * sizeof(uint32_t);
+    extent->l1_table_offset = offset;
+    extent->footer.header.gd_offset = cpu_to_le64(offset >> 9);
+
+    ret = 0;
+ out:
+    return ret;
+}
+
 static int get_cluster_offset(BlockDriverState *bs,
                                     VmdkExtent *extent,
                                     VmdkMetaData *m_data,
@@ -1032,7 +1178,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                                     uint64_t *cluster_offset)
 {
     unsigned int l1_index, l2_offset, l2_index;
-    int min_index, i, j;
+    int min_index, i, j, ret;
     uint32_t min_count, *l2_table;
     bool zeroed = false;
 
@@ -1046,6 +1192,22 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
     l1_index = (offset >> 9) / extent->l1_entry_sectors;
+    if (extent->compressed && !extent->l1_table[l1_index]) {
+        if (l1_index) {
+            /* grain (L2) tables follow compressed data so first L2 table will
+             * be written when we move to next index or when we close image.
+             * that is why we need to save l1_index in extent itself for easy
+             * access from both here and vmdk_close */
+            ret = vmdk_write_grain_table(extent);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        /* allocate new L2; set it to GD offset for now */
+        extent->l1_table[l1_index] = extent->l1_table_offset;
+    }
+    extent->l1_index = l1_index;
+
     if (l1_index >= extent->l1_size) {
         return VMDK_ERROR;
     }
@@ -1092,7 +1254,6 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     if (m_data) {
         m_data->valid = 1;
-        m_data->l1_index = l1_index;
         m_data->l2_index = l2_index;
         m_data->offset = *cluster_offset;
         m_data->l2_offset = l2_offset;
@@ -1208,6 +1369,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
     uLongf buf_len;
     const uint8_t *write_buf = buf;
     int write_len = nb_sectors * 512;
+    uint64_t gd_offset;
 
     if (extent->compressed) {
         if (!extent->has_marker) {
@@ -1234,6 +1396,13 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
         ret = ret < 0 ? ret : -EIO;
         goto out;
     }
+    if (extent->compressed) {
+        /* update GD offset after each write */
+        gd_offset = bdrv_getlength(extent->file);
+        extent->l1_table_offset = gd_offset;
+        gd_offset /= BDRV_SECTOR_SIZE;
+        extent->footer.header.gd_offset = cpu_to_le64(gd_offset);
+    }
     ret = 0;
  out:
     g_free(data);
@@ -1532,10 +1701,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     int ret, i;
     BlockDriverState *bs = NULL;
     VMDK4Header header;
+    VMDK4Footer footer;
     Error *local_err = NULL;
     uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
     uint32_t *gd_buf = NULL;
     int gd_buf_size;
+    uint64_t grain_offset, rgd_offset, gd_offset;
 
     ret = bdrv_create_file(filename, opts, &local_err);
     if (ret < 0) {
@@ -1560,28 +1731,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
-    header.version = zeroed_grain ? 2 : 1;
-    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
-                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+    memset(&footer, 0, sizeof(footer));
+
+    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
+    header.flags = VMDK4_FLAG_NL_DETECT
+                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
+                               : VMDK4_FLAG_RGD)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
     header.capacity = filesize / BDRV_SECTOR_SIZE;
     header.granularity = 128;
     header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
 
-    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
+    grains = DIV_ROUND_UP(header.capacity, header.granularity);
     gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                            BDRV_SECTOR_SIZE);
     gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
     gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
 
     header.desc_offset = 1;
-    header.desc_size = 20;
-    header.rgd_offset = header.desc_offset + header.desc_size;
-    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
-    header.grain_offset =
+    header.desc_size = (compress ? 2 : 20);
+    rgd_offset = header.desc_offset + header.desc_size;
+    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
+    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
+    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
+    grain_offset =
         ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                  header.granularity);
+    /* streamOptimized reserves first 128 sectors */
+    header.grain_offset = (compress ? header.granularity : grain_offset);
+    /* streamOptimized's grain directory is at the end */
+    gd_offset = header.grain_offset + 1;
+
     /* swap endianness for all header fields */
     header.version = cpu_to_le32(header.version);
     header.flags = cpu_to_le32(header.flags);
@@ -1618,30 +1799,55 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         goto exit;
     }
 
-    /* write grain directory */
-    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
-    gd_buf = g_malloc0(gd_buf_size);
-    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
-    }
+    if (compress) {
+        /* footer marker */
+        footer.footer_marker.val = cpu_to_le64(1);
+        footer.footer_marker.type = cpu_to_le32(MARKER_FOOTER);
 
-    /* write backup grain directory */
-    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
+        /* header */
+        footer.magic = cpu_to_be32(VMDK4_MAGIC);
+        footer.header = header;
+
+        /* grain directory offset */
+        footer.header.gd_offset = cpu_to_le64(gd_offset);
+
+        /* EOS marker */
+        footer.eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
+
+        ret = vmdk_write_footer(bs, &footer, NULL);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
+    } else {
+        /* write redundant grain directory (if applicable) */
+        if (le64_to_cpu(header.rgd_offset)) {
+            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+            gd_buf = g_malloc0(gd_buf_size);
+            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
+                 i < gt_count; i++, tmp += gt_size) {
+                gd_buf[i] = cpu_to_le32(tmp);
+            }
+            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
+                                  BDRV_SECTOR_SIZE,
+                              gd_buf, gd_buf_size);
+            if (ret < 0) {
+                error_set(errp, QERR_IO_ERROR);
+                goto exit;
+            }
+        }
+
+        /* write grain directory */
+        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
+             i < gt_count; i++, tmp += gt_size) {
+            gd_buf[i] = cpu_to_le32(tmp);
+        }
+        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                          gd_buf, gd_buf_size);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
     }
 
     ret = 0;
@@ -1912,6 +2118,15 @@ exit:
 static void vmdk_close(BlockDriverState *bs)
 {
     BDRVVmdkState *s = bs->opaque;
+    VmdkExtent *extent = &s->extents[0];
+
+    if (extent->compressed) {
+        while (extent < &s->extents[s->num_extents]) {
+            vmdk_write_grain_table(extent);
+            vmdk_write_footer(extent->file, &extent->footer, extent);
+            extent++;
+        }
+    }
 
     vmdk_free_extents(bs);
     g_free(s->create_type);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
  2014-08-04 17:10 ` [Qemu-devel] [PATCH v2] " Milos Vyletel
@ 2014-08-05  5:27   ` Fam Zheng
  2014-08-05 16:44     ` Milos Vyletel
  2014-08-06 20:57   ` Milos Vyletel
  2014-08-06 21:24   ` [Qemu-devel] [PATCH v3] " Milos Vyletel
  2 siblings, 1 reply; 21+ messages in thread
From: Fam Zheng @ 2014-08-05  5:27 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Mon, 08/04 13:10, Milos Vyletel wrote:
> VMDK's streamOptimized format is different from regular sparse format.
> L1(GD) and L2(GT) tables are not predefined but rather generated and
> written during image creation mainly because there is no way to tell
> how much space data will occupy once they are compressed. Also the
> location of header, L1 and L2 tables differ.
> 
> - L2 tables (grain tables) are written after all grains they point to
> - L1 tables are written after all grains and L2 tables
> - footer at the end is used instead of header in first sector
> 
> Images generated by qemu-img could not be imported (as part of an OVA archive)
> into either VMWare or OVM because of errors.

Does putting a monolithicSparse image into the OVA work in this case?

> 
> - VMWare during OVA import:
> Not a supported disk format (sparse VMDK too old)
> 
> - OVM's vbox-img during conversion:
> vbox-img: error: Error while copying the image: VERR_EOF
> 
> This patch fixes streamOptimized support in qemu which was not fully
> compatible with VMDK specifications as defined in the latest available version
> at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.
> 
> Qemu generated images are identical to the ones generated by VMWare and
> OVM (vbox-img) with the exception of DescriptorFile but that is expected
> (CID and some additional DDB entries differ). They were also successfully
> imported to VMWare vCloud, ESXi and Oracle OVM.
> 
> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
> ---
> v2 changes:
> - updated commit message description with errors received
> - style/grammar fixes (clean checkpatch pass)
> - removed l2_table pointer from VmdkExtent struct
> - fixed memory leak in vmdk_write_footer()
> 
>  block/vmdk.c |  365 ++++++++++++++++++++++++++++++++++++++++++++++------------
>  1 files changed, 290 insertions(+), 75 deletions(-)
> 
> diff --git a/block/vmdk.c b/block/vmdk.c
> index 0517bba..ecaca8f 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -81,6 +81,21 @@ typedef struct {
>      uint16_t compressAlgorithm;
>  } QEMU_PACKED VMDK4Header;
>  
> +typedef struct {
> +    uint64_t val;
> +    uint32_t size;
> +    uint32_t type;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
> +} QEMU_PACKED VMDK4MetaMarker;
> +
> +typedef struct {
> +    VMDK4MetaMarker footer_marker;
> +    uint32_t magic;
> +    VMDK4Header header;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
> +    VMDK4MetaMarker eos_marker;
> +} QEMU_PACKED VMDK4Footer;
> +
>  #define L2_CACHE_SIZE 16
>  
>  typedef struct VmdkExtent {
> @@ -89,12 +104,14 @@ typedef struct VmdkExtent {
>      bool compressed;
>      bool has_marker;
>      bool has_zero_grain;
> +    bool has_footer;
>      int version;
>      int64_t sectors;
>      int64_t end_sector;
>      int64_t flat_start_offset;
>      int64_t l1_table_offset;
>      int64_t l1_backup_table_offset;
> +    uint32_t l1_index;
>      uint32_t *l1_table;
>      uint32_t *l1_backup_table;
>      unsigned int l1_size;
> @@ -107,6 +124,8 @@ typedef struct VmdkExtent {
>  
>      int64_t cluster_sectors;
>      char *type;
> +
> +    VMDK4Footer footer;

This adds hundreds of bytes to the in-memory structure, but most of them are
unnecessary. I'm dubious about that. The fields in the footer all have
counterparts in the header, which are also fields of VmdkExtent. So please
reuse them instead of introducing redundancy.

When we close the extent, we could reuse the same logic as in
vmdk_create_extent to generate the footer dynamically, instead of writing back
on close what was read earlier. That way the footer here is not needed.

>  } VmdkExtent;
>  
>  typedef struct BDRVVmdkState {
> @@ -125,7 +144,6 @@ typedef struct BDRVVmdkState {
>  
>  typedef struct VmdkMetaData {
>      uint32_t offset;
> -    unsigned int l1_index;
>      unsigned int l2_index;
>      unsigned int l2_offset;
>      int valid;
> @@ -555,14 +573,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
>      return buf;
>  }
>  
> +static int vmdk_read_footer(BlockDriverState *bs,
> +                            VMDK4Footer *footer)
> +{
> +    int ret;
> +
> +    /*
> +    * footer starts 3 sectors from end
> +    * - footer marker
> +    * - footer
> +    * - end-of-stream marker
> +    */
> +    ret = bdrv_pread(bs->file,
> +        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
> +        footer, sizeof(*footer));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* Some sanity checks for the footer */
> +    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
> +        le32_to_cpu(footer->footer_marker.size) != 0  ||
> +        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
> +        le64_to_cpu(footer->eos_marker.val) != 0  ||
> +        le32_to_cpu(footer->eos_marker.size) != 0  ||
> +        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    ret = VMDK_OK;
> + out:
> +    return ret;
> +}
> +
>  static int vmdk_open_vmdk4(BlockDriverState *bs,
>                             BlockDriverState *file,
>                             int flags, Error **errp)
>  {
>      int ret;
> +    bool has_footer = false;
>      uint32_t magic;
>      uint32_t l1_size, l1_entry_sectors;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      VmdkExtent *extent;
>      BDRVVmdkState *s = bs->opaque;
>      int64_t l1_backup_offset = 0;
> @@ -593,48 +647,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>  
>      if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
>          /*
> -         * The footer takes precedence over the header, so read it in. The
> -         * footer starts at offset -1024 from the end: One sector for the
> -         * footer, and another one for the end-of-stream marker.
> +         * The footer takes precedence over the header, so read it in.
>           */
> -        struct {
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED footer_marker;
> -
> -            uint32_t magic;
> -            VMDK4Header header;
> -            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
> -
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED eos_marker;
> -        } QEMU_PACKED footer;
> -
> -        ret = bdrv_pread(file,
> -            bs->file->total_sectors * 512 - 1536,
> -            &footer, sizeof(footer));
> +        ret = vmdk_read_footer(bs, &footer);
>          if (ret < 0) {
>              return ret;
>          }
> -
> -        /* Some sanity checks for the footer */
> -        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
> -            le32_to_cpu(footer.footer_marker.size) != 0  ||
> -            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
> -            le64_to_cpu(footer.eos_marker.val) != 0  ||
> -            le32_to_cpu(footer.eos_marker.size) != 0  ||
> -            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
> -        {
> -            return -EINVAL;
> -        }
> -
> +        has_footer = true;
>          header = footer.header;
>      }
>  
> @@ -645,11 +664,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>          error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>                    bs->device_name, "vmdk", buf);
>          return -ENOTSUP;
> -    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
> +    } else if (le32_to_cpu(header.version) == 3 &&
> +               (flags & BDRV_O_RDWR) &&
> +               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
>          /* VMware KB 2064959 explains that version 3 added support for
>           * persistent changed block tracking (CBT), and backup software can
>           * read it as version=1 if it doesn't care about the changed area
> -         * information. So we are safe to enable read only. */
> +         * information. So we are safe to enable read only.
> +         * Note that this does not apply to streamOptimized images which
> +         * are written only once and are used as transport format */
>          error_setg(errp, "VMDK version 3 must be read only");
>          return -EINVAL;
>      }
> @@ -689,11 +712,21 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>      if (ret < 0) {
>          return ret;
>      }
> +    if (has_footer) {
> +        extent->has_footer = has_footer;
> +        extent->footer = footer;
> +    }
> +
>      extent->compressed =
>          le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
>      if (extent->compressed) {
>          g_free(s->create_type);
>          s->create_type = g_strdup("streamOptimized");
> +
> +        if (flags & BDRV_O_RDWR) {
> +            bdrv_truncate(file,
> +                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);

Looks dangerous. What if has_footer == false, as is the case for images created
by the older implementation?

> +        }
>      }
>      extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
>      extent->version = le32_to_cpu(header.version);
> @@ -998,6 +1031,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      uint32_t offset;
>      QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
>      offset = cpu_to_le32(m_data->offset);
> +
> +    /* do not update on streamOptimized */
> +    if (extent->compressed) {
> +        return VMDK_OK;
> +    }
> +
>      /* update L2 table */
>      if (bdrv_pwrite_sync(
>                  extent->file,
> @@ -1008,7 +1047,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      }
>      /* update backup L2 table */
>      if (extent->l1_backup_table_offset != 0) {
> -        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
> +        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
>          if (bdrv_pwrite_sync(
>                      extent->file,
>                      ((int64_t)m_data->l2_offset * 512)
> @@ -1024,6 +1063,113 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      return VMDK_OK;
>  }
>  
> +static int vmdk_write_footer(BlockDriverState *bs,
> +                             VMDK4Footer *footer,
> +                             VmdkExtent *extent)
> +{
> +    int i, ret, gd_buf_size;
> +    uint32_t *gd_buf = NULL;
> +    uint32_t grains, gd_sectors, gt_size, gt_count;
> +    uint64_t offset;
> +    VMDK4Header header;
> +    VMDK4MetaMarker gd_marker;
> +
> +    header = footer->header;
> +    offset = le64_to_cpu(header.gd_offset);
> +
> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
> +    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
> +                           BDRV_SECTOR_SIZE);
> +    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
> +
> +    /* write grain directory marker */
> +    memset(&gd_marker, 0, sizeof(gd_marker));
> +    gd_marker.val = cpu_to_le64(gd_sectors);
> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
> +
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE,
> +                      &gd_marker, sizeof(gd_marker));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
> +
> +    /* write grain directory */
> +    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> +    gd_buf = g_malloc0(gd_buf_size);
> +    if (extent) {
> +        /* copy over L1 table if we have it */
> +        for (i = 0; i < gt_count; i++) {
> +            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
> +        }
> +    }
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    /* save real gd_offset */
> +    footer->header.gd_offset = cpu_to_le64(offset);
> +    offset += gd_sectors;
> +
> +    /* write footer */
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    ret = 0;
> + exit:
> +    g_free(gd_buf);
> +    return ret;
> +}
> +
> +static int vmdk_write_grain_table(VmdkExtent *extent)
> +{
> +    int i, ret;
> +    uint64_t offset;
> +    uint32_t *l2_table = NULL;
> +    VMDK4MetaMarker gtm;
> +
> +    for (i = 0; i < L2_CACHE_SIZE; i++) {
> +        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
> +            l2_table = extent->l2_cache + (i * extent->l2_size);
> +        }
> +    }
> +    if (!l2_table) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    offset = le64_to_cpu(extent->footer.header.gd_offset) << 9;
> +
> +    memset(&gtm, 0, sizeof(gtm));
> +    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
> +    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
> +    if (bdrv_pwrite(extent->file, offset, &gtm, sizeof(gtm)) < 0) {
> +        ret = -EIO;
> +        goto out;
> +    }
> +
> +    offset += sizeof(gtm);
> +    extent->l1_table[extent->l1_index] = offset >> 9;
> +    if (bdrv_pwrite(extent->file, offset,
> +                    l2_table, extent->l2_size * sizeof(uint32_t)
> +                   ) != extent->l2_size * sizeof(uint32_t)) {
> +        ret = -EIO;
> +        goto out;
> +    }
> +
> +    offset += extent->l2_size * sizeof(uint32_t);
> +    extent->l1_table_offset = offset;
> +    extent->footer.header.gd_offset = cpu_to_le64(offset >> 9);
> +
> +    ret = 0;
> + out:
> +    return ret;
> +}
> +
>  static int get_cluster_offset(BlockDriverState *bs,
>                                      VmdkExtent *extent,
>                                      VmdkMetaData *m_data,
> @@ -1032,7 +1178,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>                                      uint64_t *cluster_offset)
>  {
>      unsigned int l1_index, l2_offset, l2_index;
> -    int min_index, i, j;
> +    int min_index, i, j, ret;
>      uint32_t min_count, *l2_table;
>      bool zeroed = false;
>  
> @@ -1046,6 +1192,22 @@ static int get_cluster_offset(BlockDriverState *bs,
>  
>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
> +    if (extent->compressed && !extent->l1_table[l1_index]) {
> +        if (l1_index) {
> +            /* grain (L2) tables follow compressed data so first L2 table will
> +             * be written when we move to next index or when we close image.
> +             * that is why we need to save l1_index in extent itself for easy
> +             * access from both here and vmdk_close */
> +            ret = vmdk_write_grain_table(extent);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +        /* allocate new L2; set it to GD offset for now */
> +        extent->l1_table[l1_index] = extent->l1_table_offset;
> +    }
> +    extent->l1_index = l1_index;
> +
>      if (l1_index >= extent->l1_size) {
>          return VMDK_ERROR;
>      }
> @@ -1092,7 +1254,6 @@ static int get_cluster_offset(BlockDriverState *bs,
>  
>      if (m_data) {
>          m_data->valid = 1;
> -        m_data->l1_index = l1_index;
>          m_data->l2_index = l2_index;
>          m_data->offset = *cluster_offset;
>          m_data->l2_offset = l2_offset;
> @@ -1208,6 +1369,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>      uLongf buf_len;
>      const uint8_t *write_buf = buf;
>      int write_len = nb_sectors * 512;
> +    uint64_t gd_offset;
>  
>      if (extent->compressed) {
>          if (!extent->has_marker) {
> @@ -1234,6 +1396,13 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>          ret = ret < 0 ? ret : -EIO;
>          goto out;
>      }
> +    if (extent->compressed) {
> +        /* update GD offset after each write */
> +        gd_offset = bdrv_getlength(extent->file);
> +        extent->l1_table_offset = gd_offset;
> +        gd_offset /= BDRV_SECTOR_SIZE;
> +        extent->footer.header.gd_offset = cpu_to_le64(gd_offset);
> +    }
>      ret = 0;
>   out:
>      g_free(data);
> @@ -1532,10 +1701,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      int ret, i;
>      BlockDriverState *bs = NULL;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      Error *local_err = NULL;
>      uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
>      uint32_t *gd_buf = NULL;
>      int gd_buf_size;
> +    uint64_t grain_offset, rgd_offset, gd_offset;
>  
>      ret = bdrv_create_file(filename, opts, &local_err);
>      if (ret < 0) {
> @@ -1560,28 +1731,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      }
>      magic = cpu_to_be32(VMDK4_MAGIC);
>      memset(&header, 0, sizeof(header));
> -    header.version = zeroed_grain ? 2 : 1;
> -    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
> -                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
> +    memset(&footer, 0, sizeof(footer));
> +
> +    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
> +    header.flags = VMDK4_FLAG_NL_DETECT
> +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
> +                               : VMDK4_FLAG_RGD)
>                     | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
>      header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
>      header.capacity = filesize / BDRV_SECTOR_SIZE;
>      header.granularity = 128;
>      header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
>  
> -    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>      gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>                             BDRV_SECTOR_SIZE);
>      gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>      gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>  
>      header.desc_offset = 1;
> -    header.desc_size = 20;
> -    header.rgd_offset = header.desc_offset + header.desc_size;
> -    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
> -    header.grain_offset =
> +    header.desc_size = (compress ? 2 : 20);
> +    rgd_offset = header.desc_offset + header.desc_size;
> +    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
> +    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
> +    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
> +    grain_offset =
>          ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
>                   header.granularity);
> +    /* streamOptimized reserves first 128 sectors */
> +    header.grain_offset = (compress ? header.granularity : grain_offset);
> +    /* streamOptimzed's grain directory is at the end */
> +    gd_offset = header.grain_offset + 1;
> +
>      /* swap endianness for all header fields */
>      header.version = cpu_to_le32(header.version);
>      header.flags = cpu_to_le32(header.flags);
> @@ -1618,30 +1799,55 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>          goto exit;
>      }
>  
> -    /* write grain directory */
> -    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> -    gd_buf = g_malloc0(gd_buf_size);
> -    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> -    }
> +    if (compress) {
> +        /* footer marker */
> +        footer.footer_marker.val = cpu_to_le64(1);
> +        footer.footer_marker.type = cpu_to_le32(MARKER_FOOTER);
>  
> -    /* write backup grain directory */
> -    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> +        /* header */
> +        footer.magic = cpu_to_be32(VMDK4_MAGIC);
> +        footer.header = header;
> +
> +        /* grain directory offset */
> +        footer.header.gd_offset = cpu_to_le64(gd_offset);
> +
> +        /* EOS marker */
> +        footer.eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
> +
> +        ret = vmdk_write_footer(bs, &footer, NULL);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
> +    } else {
> +        /* write redundant grain directory (if applicable) */
> +        if (le64_to_cpu(header.rgd_offset)) {
> +            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> +            gd_buf = g_malloc0(gd_buf_size);
> +            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> +                 i < gt_count; i++, tmp += gt_size) {
> +                gd_buf[i] = cpu_to_le32(tmp);
> +            }
> +            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
> +                                  BDRV_SECTOR_SIZE,
> +                              gd_buf, gd_buf_size);
> +            if (ret < 0) {
> +                error_set(errp, QERR_IO_ERROR);
> +                goto exit;
> +            }
> +        }
> +
> +        /* write grain directory */
> +        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> +             i < gt_count; i++, tmp += gt_size) {
> +            gd_buf[i] = cpu_to_le32(tmp);
> +        }
> +        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> +                          gd_buf, gd_buf_size);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
>      }
>  
>      ret = 0;
> @@ -1912,6 +2118,15 @@ exit:
>  static void vmdk_close(BlockDriverState *bs)
>  {
>      BDRVVmdkState *s = bs->opaque;
> +    VmdkExtent *extent = &s->extents[0];
> +
> +    if (extent->compressed) {
> +        while (extent < &s->extents[s->num_extents]) {
> +            vmdk_write_grain_table(extent);
> +            vmdk_write_footer(extent->file, &extent->footer, extent);
> +            extent++;
> +        }
> +    }
>  
>      vmdk_free_extents(bs);
>      g_free(s->create_type);
> -- 
> 1.7.1
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
  2014-08-05  5:27   ` Fam Zheng
@ 2014-08-05 16:44     ` Milos Vyletel
  2014-08-06  1:44       ` Fam Zheng
  0 siblings, 1 reply; 21+ messages in thread
From: Milos Vyletel @ 2014-08-05 16:44 UTC (permalink / raw)
  To: Fam Zheng; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Tue, Aug 5, 2014 at 1:27 AM, Fam Zheng <famz@redhat.com> wrote:
> On Mon, 08/04 13:10, Milos Vyletel wrote:
>> VMDK's streamOptimized format is different from regular sparse format.
>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>> written during image creation mainly because there is no way to tell
>> how much space data will occupy once they are compressed. Also the
>> location of header, L1 and L2 tables differ.
>>
>> - L2 tables (grain tables) are written after all grains they point to
>> - L1 tables are written after all grains and L2 tables
>> - footer at the end is used instead of header in first sector
>>
>> Images generated by qemu-img could not be imported (as part of an OVA archive)
>> into either VMWare or OVM because of errors.
>
> Does putting a monolithicSparse image into the OVA work in this case?

It does not. I did not try to import it into OVM, but the ESXi server
complained about the format of the vmdk in the OVA archive (I don't have
the exact error message anymore). I did look in the OVF specification and
it does not state that the vmdk must be streamOptimized, but it looks like
VMWare is enforcing it.

I can try to create an OVA with a monolithicSparse image and retry if you
want to know the exact error I got.

>
>>
>> - VMWare during OVA import:
>> Not a supported disk format (sparse VMDK too old)
>>
>> - OVM's vbox-img during conversion:
>> vbox-img: error: Error while copying the image: VERR_EOF
>>
>> This patch fixes streamOptimized support in qemu which was not fully
>> compatible with VMDK specifications as defined in the latest available version
>> at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.
>>
>> Qemu generated images are identical to the ones generated by VMWare and
>> OVM (vbox-img) with the exception of DescriptorFile but that is expected
>> (CID and some additional DDB entries differ). They were also successfully
>> imported to VMWare vCloud, ESXi and Oracle OVM.
>>
>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>> ---
>> v2 changes:
>> - updated commit message description with errors received
>> - style/grammar fixes (clean checkpatch pass)
>> - removed l2_table pointer from VmdkExtent struct
>> - fixed memory leak in vmdk_write_footer()
>>
>>  block/vmdk.c |  365 ++++++++++++++++++++++++++++++++++++++++++++++------------
>>  1 files changed, 290 insertions(+), 75 deletions(-)
>>
>> diff --git a/block/vmdk.c b/block/vmdk.c
>> index 0517bba..ecaca8f 100644
>> --- a/block/vmdk.c
>> +++ b/block/vmdk.c
>> @@ -81,6 +81,21 @@ typedef struct {
>>      uint16_t compressAlgorithm;
>>  } QEMU_PACKED VMDK4Header;
>>
>> +typedef struct {
>> +    uint64_t val;
>> +    uint32_t size;
>> +    uint32_t type;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
>> +} QEMU_PACKED VMDK4MetaMarker;
>> +
>> +typedef struct {
>> +    VMDK4MetaMarker footer_marker;
>> +    uint32_t magic;
>> +    VMDK4Header header;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
>> +    VMDK4MetaMarker eos_marker;
>> +} QEMU_PACKED VMDK4Footer;
>> +
>>  #define L2_CACHE_SIZE 16
>>
>>  typedef struct VmdkExtent {
>> @@ -89,12 +104,14 @@ typedef struct VmdkExtent {
>>      bool compressed;
>>      bool has_marker;
>>      bool has_zero_grain;
>> +    bool has_footer;
>>      int version;
>>      int64_t sectors;
>>      int64_t end_sector;
>>      int64_t flat_start_offset;
>>      int64_t l1_table_offset;
>>      int64_t l1_backup_table_offset;
>> +    uint32_t l1_index;
>>      uint32_t *l1_table;
>>      uint32_t *l1_backup_table;
>>      unsigned int l1_size;
>> @@ -107,6 +124,8 @@ typedef struct VmdkExtent {
>>
>>      int64_t cluster_sectors;
>>      char *type;
>> +
>> +    VMDK4Footer footer;
>
> This adds hundreds of bytes to the in-memory structure, but most of them are
> unnecessary. I'm dubious about that. The fields in the footer all have
> counterparts in the header, which are also fields of VmdkExtent. So please
> reuse them instead of introducing redundancy.
>
> When we close the extent, we could reuse the same logic as in
> vmdk_create_extent to generate the footer dynamically, instead of writing back
> on close what was read earlier. That way the footer here is not needed.
>

Good point. I'll remove footer from VmdkExtent structure.

>>  } VmdkExtent;
>>
>>  typedef struct BDRVVmdkState {
>> @@ -125,7 +144,6 @@ typedef struct BDRVVmdkState {
>>
>>  typedef struct VmdkMetaData {
>>      uint32_t offset;
>> -    unsigned int l1_index;
>>      unsigned int l2_index;
>>      unsigned int l2_offset;
>>      int valid;
>> @@ -555,14 +573,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
>>      return buf;
>>  }
>>
>> +static int vmdk_read_footer(BlockDriverState *bs,
>> +                            VMDK4Footer *footer)
>> +{
>> +    int ret;
>> +
>> +    /*
>> +    * footer starts 3 sectors from end
>> +    * - footer marker
>> +    * - footer
>> +    * - end-of-stream marker
>> +    */
>> +    ret = bdrv_pread(bs->file,
>> +        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
>> +        footer, sizeof(*footer));
>> +    if (ret < 0) {
>> +        goto out;
>> +    }
>> +
>> +    /* Some sanity checks for the footer */
>> +    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
>> +        le32_to_cpu(footer->footer_marker.size) != 0  ||
>> +        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
>> +        le64_to_cpu(footer->eos_marker.val) != 0  ||
>> +        le32_to_cpu(footer->eos_marker.size) != 0  ||
>> +        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    ret = VMDK_OK;
>> + out:
>> +    return ret;
>> +}
>> +
>>  static int vmdk_open_vmdk4(BlockDriverState *bs,
>>                             BlockDriverState *file,
>>                             int flags, Error **errp)
>>  {
>>      int ret;
>> +    bool has_footer = false;
>>      uint32_t magic;
>>      uint32_t l1_size, l1_entry_sectors;
>>      VMDK4Header header;
>> +    VMDK4Footer footer;
>>      VmdkExtent *extent;
>>      BDRVVmdkState *s = bs->opaque;
>>      int64_t l1_backup_offset = 0;
>> @@ -593,48 +647,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>
>>      if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
>>          /*
>> -         * The footer takes precedence over the header, so read it in. The
>> -         * footer starts at offset -1024 from the end: One sector for the
>> -         * footer, and another one for the end-of-stream marker.
>> +         * The footer takes precedence over the header, so read it in.
>>           */
>> -        struct {
>> -            struct {
>> -                uint64_t val;
>> -                uint32_t size;
>> -                uint32_t type;
>> -                uint8_t pad[512 - 16];
>> -            } QEMU_PACKED footer_marker;
>> -
>> -            uint32_t magic;
>> -            VMDK4Header header;
>> -            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
>> -
>> -            struct {
>> -                uint64_t val;
>> -                uint32_t size;
>> -                uint32_t type;
>> -                uint8_t pad[512 - 16];
>> -            } QEMU_PACKED eos_marker;
>> -        } QEMU_PACKED footer;
>> -
>> -        ret = bdrv_pread(file,
>> -            bs->file->total_sectors * 512 - 1536,
>> -            &footer, sizeof(footer));
>> +        ret = vmdk_read_footer(bs, &footer);
>>          if (ret < 0) {
>>              return ret;
>>          }
>> -
>> -        /* Some sanity checks for the footer */
>> -        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
>> -            le32_to_cpu(footer.footer_marker.size) != 0  ||
>> -            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
>> -            le64_to_cpu(footer.eos_marker.val) != 0  ||
>> -            le32_to_cpu(footer.eos_marker.size) != 0  ||
>> -            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
>> -        {
>> -            return -EINVAL;
>> -        }
>> -
>> +        has_footer = true;
>>          header = footer.header;
>>      }
>>
>> @@ -645,11 +664,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>          error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>>                    bs->device_name, "vmdk", buf);
>>          return -ENOTSUP;
>> -    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
>> +    } else if (le32_to_cpu(header.version) == 3 &&
>> +               (flags & BDRV_O_RDWR) &&
>> +               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
>>          /* VMware KB 2064959 explains that version 3 added support for
>>           * persistent changed block tracking (CBT), and backup software can
>>           * read it as version=1 if it doesn't care about the changed area
>> -         * information. So we are safe to enable read only. */
>> +         * information. So we are safe to enable read only.
>> +         * Note that this does not apply to streamOptimized images which
>> +         * are written only once and are used as transport format */
>>          error_setg(errp, "VMDK version 3 must be read only");
>>          return -EINVAL;
>>      }
>> @@ -689,11 +712,21 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>      if (ret < 0) {
>>          return ret;
>>      }
>> +    if (has_footer) {
>> +        extent->has_footer = has_footer;
>> +        extent->footer = footer;
>> +    }
>> +
>>      extent->compressed =
>>          le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
>>      if (extent->compressed) {
>>          g_free(s->create_type);
>>          s->create_type = g_strdup("streamOptimized");
>> +
>> +        if (flags & BDRV_O_RDWR) {
>> +            bdrv_truncate(file,
>> +                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
>
> Looks dangerous. What if has_footer == false, as for images created by an
> older implementation?
>
Will check for has_footer.

>> +        }
>>      }
>>      extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
>>      extent->version = le32_to_cpu(header.version);
>> @@ -998,6 +1031,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      uint32_t offset;
>>      QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
>>      offset = cpu_to_le32(m_data->offset);
>> +
>> +    /* do not update on streamOptimized */
>> +    if (extent->compressed) {
>> +        return VMDK_OK;
>> +    }
>> +
>>      /* update L2 table */
>>      if (bdrv_pwrite_sync(
>>                  extent->file,
>> @@ -1008,7 +1047,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      }
>>      /* update backup L2 table */
>>      if (extent->l1_backup_table_offset != 0) {
>> -        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
>> +        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
>>          if (bdrv_pwrite_sync(
>>                      extent->file,
>>                      ((int64_t)m_data->l2_offset * 512)
>> @@ -1024,6 +1063,113 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      return VMDK_OK;
>>  }
>>
>> +static int vmdk_write_footer(BlockDriverState *bs,
>> +                             VMDK4Footer *footer,
>> +                             VmdkExtent *extent)
>> +{
>> +    int i, ret, gd_buf_size;
>> +    uint32_t *gd_buf = NULL;
>> +    uint32_t grains, gd_sectors, gt_size, gt_count;
>> +    uint64_t offset;
>> +    VMDK4Header header;
>> +    VMDK4MetaMarker gd_marker;
>> +
>> +    header = footer->header;
>> +    offset = le64_to_cpu(header.gd_offset);
>> +
>> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>> +    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>> +                           BDRV_SECTOR_SIZE);
>> +    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>> +
>> +    /* write grain directory marker */
>> +    memset(&gd_marker, 0, sizeof(gd_marker));
>> +    gd_marker.val = cpu_to_le64(gd_sectors);
>> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
>> +
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE,
>> +                      &gd_marker, sizeof(gd_marker));
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +    offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE;
>> +
>> +    /* write grain directory */
>> +    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> +    gd_buf = g_malloc0(gd_buf_size);
>> +    if (extent) {
>> +        /* copy over L1 table if we have it */
>> +        for (i = 0; i < gt_count; i++) {
>> +            gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
>> +        }
>> +    }
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size);
>> +    if (ret < 0) {
>> +        goto exit;
>> +    }
>> +
>> +    /* save real gd_offset */
>> +    footer->header.gd_offset = cpu_to_le64(offset);
>> +    offset += gd_sectors;
>> +
>> +    /* write footer */
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
>> +    if (ret < 0) {
>> +        goto exit;
>> +    }
>> +
>> +    ret = 0;
>> + exit:
>> +    g_free(gd_buf);
>> +    return ret;
>> +}
>> +
>> +static int vmdk_write_grain_table(VmdkExtent *extent)
>> +{
>> +    int i, ret;
>> +    uint64_t offset;
>> +    uint32_t *l2_table = NULL;
>> +    VMDK4MetaMarker gtm;
>> +
>> +    for (i = 0; i < L2_CACHE_SIZE; i++) {
>> +        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
>> +            l2_table = extent->l2_cache + (i * extent->l2_size);
>> +        }
>> +    }
>> +    if (!l2_table) {
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    offset = le64_to_cpu(extent->footer.header.gd_offset) << 9;
>> +
>> +    memset(&gtm, 0, sizeof(gtm));
>> +    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
>> +    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
>> +    if (bdrv_pwrite(extent->file, offset, &gtm, sizeof(gtm)) < 0) {
>> +        ret = -EIO;
>> +        goto out;
>> +    }
>> +
>> +    offset += sizeof(gtm);
>> +    extent->l1_table[extent->l1_index] = offset >> 9;
>> +    if (bdrv_pwrite(extent->file, offset,
>> +                    l2_table, extent->l2_size * sizeof(uint32_t)
>> +                   ) != extent->l2_size * sizeof(uint32_t)) {
>> +        ret = -EIO;
>> +        goto out;
>> +    }
>> +
>> +    offset += extent->l2_size * sizeof(uint32_t);
>> +    extent->l1_table_offset = offset;
>> +    extent->footer.header.gd_offset = cpu_to_le64(offset >> 9);
>> +
>> +    ret = 0;
>> + out:
>> +    return ret;
>> +}
>> +
>>  static int get_cluster_offset(BlockDriverState *bs,
>>                                      VmdkExtent *extent,
>>                                      VmdkMetaData *m_data,
>> @@ -1032,7 +1178,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>>                                      uint64_t *cluster_offset)
>>  {
>>      unsigned int l1_index, l2_offset, l2_index;
>> -    int min_index, i, j;
>> +    int min_index, i, j, ret;
>>      uint32_t min_count, *l2_table;
>>      bool zeroed = false;
>>
>> @@ -1046,6 +1192,22 @@ static int get_cluster_offset(BlockDriverState *bs,
>>
>>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
>> +    if (extent->compressed && !extent->l1_table[l1_index]) {
>> +        if (l1_index) {
>> +            /* grain (L2) tables follow compressed data so first L2 table will
>> +             * be written when we move to next index or when we close image.
>> +             * that is why we need to save l1_index in extent itself for easy
>> +             * access from both here and vmdk_close */
>> +            ret = vmdk_write_grain_table(extent);
>> +            if (ret < 0) {
>> +                return ret;
>> +            }
>> +        }
>> +        /* allocate new L2; set it to GD offset for now */
>> +        extent->l1_table[l1_index] = extent->l1_table_offset;
>> +    }
>> +    extent->l1_index = l1_index;
>> +
>>      if (l1_index >= extent->l1_size) {
>>          return VMDK_ERROR;
>>      }
>> @@ -1092,7 +1254,6 @@ static int get_cluster_offset(BlockDriverState *bs,
>>
>>      if (m_data) {
>>          m_data->valid = 1;
>> -        m_data->l1_index = l1_index;
>>          m_data->l2_index = l2_index;
>>          m_data->offset = *cluster_offset;
>>          m_data->l2_offset = l2_offset;
>> @@ -1208,6 +1369,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>>      uLongf buf_len;
>>      const uint8_t *write_buf = buf;
>>      int write_len = nb_sectors * 512;
>> +    uint64_t gd_offset;
>>
>>      if (extent->compressed) {
>>          if (!extent->has_marker) {
>> @@ -1234,6 +1396,13 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>>          ret = ret < 0 ? ret : -EIO;
>>          goto out;
>>      }
>> +    if (extent->compressed) {
>> +        /* update GD offset after each write */
>> +        gd_offset = bdrv_getlength(extent->file);
>> +        extent->l1_table_offset = gd_offset;
>> +        gd_offset /= BDRV_SECTOR_SIZE;
>> +        extent->footer.header.gd_offset = cpu_to_le64(gd_offset);
>> +    }
>>      ret = 0;
>>   out:
>>      g_free(data);
>> @@ -1532,10 +1701,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>      int ret, i;
>>      BlockDriverState *bs = NULL;
>>      VMDK4Header header;
>> +    VMDK4Footer footer;
>>      Error *local_err = NULL;
>>      uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
>>      uint32_t *gd_buf = NULL;
>>      int gd_buf_size;
>> +    uint64_t grain_offset, rgd_offset, gd_offset;
>>
>>      ret = bdrv_create_file(filename, opts, &local_err);
>>      if (ret < 0) {
>> @@ -1560,28 +1731,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>      }
>>      magic = cpu_to_be32(VMDK4_MAGIC);
>>      memset(&header, 0, sizeof(header));
>> -    header.version = zeroed_grain ? 2 : 1;
>> -    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
>> -                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
>> +    memset(&footer, 0, sizeof(footer));
>> +
>> +    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
>> +    header.flags = VMDK4_FLAG_NL_DETECT
>> +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
>> +                               : VMDK4_FLAG_RGD)
>>                     | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
>>      header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
>>      header.capacity = filesize / BDRV_SECTOR_SIZE;
>>      header.granularity = 128;
>>      header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
>>
>> -    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
>> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>>      gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>>                             BDRV_SECTOR_SIZE);
>>      gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>>      gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>>
>>      header.desc_offset = 1;
>> -    header.desc_size = 20;
>> -    header.rgd_offset = header.desc_offset + header.desc_size;
>> -    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
>> -    header.grain_offset =
>> +    header.desc_size = (compress ? 2 : 20);
>> +    rgd_offset = header.desc_offset + header.desc_size;
>> +    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
>> +    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
>> +    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
>> +    grain_offset =
>>          ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
>>                   header.granularity);
>> +    /* streamOptimized reserves first 128 sectors */
>> +    header.grain_offset = (compress ? header.granularity : grain_offset);
>> +    /* streamOptimzed's grain directory is at the end */
>> +    gd_offset = header.grain_offset + 1;
>> +
>>      /* swap endianness for all header fields */
>>      header.version = cpu_to_le32(header.version);
>>      header.flags = cpu_to_le32(header.flags);
>> @@ -1618,30 +1799,55 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>          goto exit;
>>      }
>>
>> -    /* write grain directory */
>> -    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> -    gd_buf = g_malloc0(gd_buf_size);
>> -    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
>> -         i < gt_count; i++, tmp += gt_size) {
>> -        gd_buf[i] = cpu_to_le32(tmp);
>> -    }
>> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
>> -                      gd_buf, gd_buf_size);
>> -    if (ret < 0) {
>> -        error_set(errp, QERR_IO_ERROR);
>> -        goto exit;
>> -    }
>> +    if (compress) {
>> +        /* footer marker */
>> +        footer.footer_marker.val = cpu_to_le64(1);
>> +        footer.footer_marker.type = cpu_to_le32(MARKER_FOOTER);
>>
>> -    /* write backup grain directory */
>> -    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
>> -         i < gt_count; i++, tmp += gt_size) {
>> -        gd_buf[i] = cpu_to_le32(tmp);
>> -    }
>> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
>> -                      gd_buf, gd_buf_size);
>> -    if (ret < 0) {
>> -        error_set(errp, QERR_IO_ERROR);
>> -        goto exit;
>> +        /* header */
>> +        footer.magic = cpu_to_be32(VMDK4_MAGIC);
>> +        footer.header = header;
>> +
>> +        /* grain directory offset */
>> +        footer.header.gd_offset = cpu_to_le64(gd_offset);
>> +
>> +        /* EOS marker */
>> +        footer.eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
>> +
>> +        ret = vmdk_write_footer(bs, &footer, NULL);
>> +        if (ret < 0) {
>> +            error_set(errp, QERR_IO_ERROR);
>> +            goto exit;
>> +        }
>> +    } else {
>> +        /* write redundant grain directory (if applicable) */
>> +        if (le64_to_cpu(header.rgd_offset)) {
>> +            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> +            gd_buf = g_malloc0(gd_buf_size);
>> +            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
>> +                 i < gt_count; i++, tmp += gt_size) {
>> +                gd_buf[i] = cpu_to_le32(tmp);
>> +            }
>> +            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
>> +                                  BDRV_SECTOR_SIZE,
>> +                              gd_buf, gd_buf_size);
>> +            if (ret < 0) {
>> +                error_set(errp, QERR_IO_ERROR);
>> +                goto exit;
>> +            }
>> +        }
>> +
>> +        /* write grain directory */
>> +        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
>> +             i < gt_count; i++, tmp += gt_size) {
>> +            gd_buf[i] = cpu_to_le32(tmp);
>> +        }
>> +        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
>> +                          gd_buf, gd_buf_size);
>> +        if (ret < 0) {
>> +            error_set(errp, QERR_IO_ERROR);
>> +            goto exit;
>> +        }
>>      }
>>
>>      ret = 0;
>> @@ -1912,6 +2118,15 @@ exit:
>>  static void vmdk_close(BlockDriverState *bs)
>>  {
>>      BDRVVmdkState *s = bs->opaque;
>> +    VmdkExtent *extent = &s->extents[0];
>> +
>> +    if (extent->compressed) {
>> +        while (extent < &s->extents[s->num_extents]) {
>> +            vmdk_write_grain_table(extent);
>> +            vmdk_write_footer(extent->file, &extent->footer, extent);
>> +            extent++;
>> +        }
>> +    }
>>
>>      vmdk_free_extents(bs);
>>      g_free(s->create_type);
>> --
>> 1.7.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
  2014-08-05 16:44     ` Milos Vyletel
@ 2014-08-06  1:44       ` Fam Zheng
  2014-08-06 16:36         ` Milos Vyletel
  0 siblings, 1 reply; 21+ messages in thread
From: Fam Zheng @ 2014-08-06  1:44 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Tue, 08/05 12:44, Milos Vyletel wrote:
> On Tue, Aug 5, 2014 at 1:27 AM, Fam Zheng <famz@redhat.com> wrote:
> > Does putting a monolithicSparse into the OVA work in this case?
> 
> It does not. I did not try to import it to OVM but ESXi server
> complained about the format of vmdk in OVA archive (don't have exact
> error message anymore). I did look in OVF specifications and it does
> not state that the vmdk must be streamOptimized but it looks like
> VMWare is enforcing it.
> 
> I can try to create OVA with monolithicSparse image and retry if you
> want to know exact error I got.

I'm curious about the tool you're using to create OVA, because there are xml
files in OVA/OVF that qemu has no knowledge about.

Is it some tool from VMware?

Fam

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
  2014-08-06  1:44       ` Fam Zheng
@ 2014-08-06 16:36         ` Milos Vyletel
  0 siblings, 0 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-08-06 16:36 UTC (permalink / raw)
  To: Fam Zheng; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Tue, Aug 5, 2014 at 9:44 PM, Fam Zheng <famz@redhat.com> wrote:
> On Tue, 08/05 12:44, Milos Vyletel wrote:
>> On Tue, Aug 5, 2014 at 1:27 AM, Fam Zheng <famz@redhat.com> wrote:
>> > Does putting a monolithicSparse into the OVA work in this case?
>>
>> It does not. I did not try to import it to OVM but ESXi server
>> complained about the format of vmdk in OVA archive (don't have exact
>> error message anymore). I did look in OVF specifications and it does
>> not state that the vmdk must be streamOptimized but it looks like
>> VMWare is enforcing it.
>>
>> I can try to create OVA with monolithicSparse image and retry if you
>> want to know exact error I got.
>
> I'm curious about the tool you're using to create OVA, because there are xml
> files in OVA/OVF that qemu has no knowledge about.
>
> Is it some tool from VMware?
>
> Fam

Nope. It's just a Perl script that creates the OVF and manifest and packs
them together with the vmdk. Qemu just does the conversion to vmdk in this
case.

Milos

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
  2014-08-04 17:10 ` [Qemu-devel] [PATCH v2] " Milos Vyletel
  2014-08-05  5:27   ` Fam Zheng
@ 2014-08-06 20:57   ` Milos Vyletel
  2014-08-06 21:20     ` Milos Vyletel
  2014-08-06 21:24   ` [Qemu-devel] [PATCH v3] " Milos Vyletel
  2 siblings, 1 reply; 21+ messages in thread
From: Milos Vyletel @ 2014-08-06 20:57 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Fam Zheng, Stefan Hajnoczi, Milos Vyletel

VMDK's streamOptimized format is different from regular sparse format.
L1(GD) and L2(GT) tables are not predefined but rather generated and
written during image creation mainly because there is no way to tell
how much space data will occupy once they are compressed. Also the
location of header, L1 and L2 tables differ.

- L2 tables (grain tables) are written after all grains they point to
- L1 tables are written after all grains and L2 tables
- footer at the end is used instead of header in first sector

Images generated by qemu-img could not be imported (as part of an OVA archive)
into either VMWare or OVM because of errors.

- VMWare during OVA import:
Not a supported disk format (sparse VMDK too old)

- OVM's vbox-img during conversion:
vbox-img: error: Error while copying the image: VERR_EOF

This patch fixes streamOptimized support in qemu, which was not fully
compatible with the VMDK specification as defined in the latest available
version at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.

Qemu generated images are identical to the ones generated by VMWare and
OVM (vbox-img) with the exception of DescriptorFile but that is expected
(CID and some additional DDB entries differ). They were also successfully
imported into VMWare vCloud, ESXi and Oracle OVM.

Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
---
v2 changes:
- updated commit message description with errors received
- style/grammar fixes (clean checkpatch pass)
- removed l2_table pointer from VmdkExtent struct
- fixed memory leak in vmdk_write_footer()

v3 changes:
- removed footer from VmdkExtent structure
- split added vmdk_write_grain_directory function to separate GD and footer
  writes
- fix possible problems with opening of images created by older implementation
 block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 280 insertions(+), 75 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 0517bba..3ea1c31 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -81,6 +81,21 @@ typedef struct {
     uint16_t compressAlgorithm;
 } QEMU_PACKED VMDK4Header;
 
+typedef struct {
+    uint64_t val;
+    uint32_t size;
+    uint32_t type;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
+} QEMU_PACKED VMDK4MetaMarker;
+
+typedef struct {
+    VMDK4MetaMarker footer_marker;
+    uint32_t magic;
+    VMDK4Header header;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
+    VMDK4MetaMarker eos_marker;
+} QEMU_PACKED VMDK4Footer;
+
 #define L2_CACHE_SIZE 16
 
 typedef struct VmdkExtent {
@@ -89,12 +104,14 @@ typedef struct VmdkExtent {
     bool compressed;
     bool has_marker;
     bool has_zero_grain;
+    bool has_footer;
     int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
+    uint32_t l1_index;
     uint32_t *l1_table;
     uint32_t *l1_backup_table;
     unsigned int l1_size;
@@ -125,7 +142,6 @@ typedef struct BDRVVmdkState {
 
 typedef struct VmdkMetaData {
     uint32_t offset;
-    unsigned int l1_index;
     unsigned int l2_index;
     unsigned int l2_offset;
     int valid;
@@ -555,14 +571,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
     return buf;
 }
 
+static int vmdk_read_footer(BlockDriverState *bs,
+                            VMDK4Footer *footer)
+{
+    int ret;
+
+    /*
+    * footer starts 3 sectors from end
+    * - footer marker
+    * - footer
+    * - end-of-stream marker
+    */
+    ret = bdrv_pread(bs->file,
+        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
+        footer, sizeof(*footer));
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Some sanity checks for the footer */
+    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
+        le32_to_cpu(footer->footer_marker.size) != 0  ||
+        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
+        le64_to_cpu(footer->eos_marker.val) != 0  ||
+        le32_to_cpu(footer->eos_marker.size) != 0  ||
+        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = VMDK_OK;
+ out:
+    return ret;
+}
+
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
                            int flags, Error **errp)
 {
     int ret;
+    bool has_footer = false;
     uint32_t magic;
     uint32_t l1_size, l1_entry_sectors;
     VMDK4Header header;
+    VMDK4Footer footer;
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
     int64_t l1_backup_offset = 0;
@@ -593,48 +645,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
 
     if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
         /*
-         * The footer takes precedence over the header, so read it in. The
-         * footer starts at offset -1024 from the end: One sector for the
-         * footer, and another one for the end-of-stream marker.
+         * The footer takes precedence over the header, so read it in.
          */
-        struct {
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED footer_marker;
-
-            uint32_t magic;
-            VMDK4Header header;
-            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
-
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED eos_marker;
-        } QEMU_PACKED footer;
-
-        ret = bdrv_pread(file,
-            bs->file->total_sectors * 512 - 1536,
-            &footer, sizeof(footer));
+        ret = vmdk_read_footer(bs, &footer);
         if (ret < 0) {
             return ret;
         }
-
-        /* Some sanity checks for the footer */
-        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
-            le32_to_cpu(footer.footer_marker.size) != 0  ||
-            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
-            le64_to_cpu(footer.eos_marker.val) != 0  ||
-            le32_to_cpu(footer.eos_marker.size) != 0  ||
-            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
-        {
-            return -EINVAL;
-        }
-
+        has_footer = true;
         header = footer.header;
     }
 
@@ -645,11 +662,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                   bs->device_name, "vmdk", buf);
         return -ENOTSUP;
-    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
+    } else if (le32_to_cpu(header.version) == 3 &&
+               (flags & BDRV_O_RDWR) &&
+               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
         /* VMware KB 2064959 explains that version 3 added support for
          * persistent changed block tracking (CBT), and backup software can
          * read it as version=1 if it doesn't care about the changed area
-         * information. So we are safe to enable read only. */
+         * information. So we are safe to enable read only.
+         * Note that this does not apply to streamOptimized images which
+         * are written only once and are used as transport format */
         error_setg(errp, "VMDK version 3 must be read only");
         return -EINVAL;
     }
@@ -689,11 +710,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (ret < 0) {
         return ret;
     }
+    if (has_footer) {
+        extent->has_footer = has_footer;
+    }
+
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
     if (extent->compressed) {
         g_free(s->create_type);
         s->create_type = g_strdup("streamOptimized");
+
+        if (extent->has_footer && (flags & BDRV_O_RDWR)) {
+            bdrv_truncate(file,
+                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
+        }
     }
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
     extent->version = le32_to_cpu(header.version);
@@ -998,6 +1028,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     uint32_t offset;
     QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
     offset = cpu_to_le32(m_data->offset);
+
+    /* nothing to update on streamOptimized */
+    if (extent->compressed) {
+        return VMDK_OK;
+    }
+
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
@@ -1008,7 +1044,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     }
     /* update backup L2 table */
     if (extent->l1_backup_table_offset != 0) {
-        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
+        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
         if (bdrv_pwrite_sync(
                     extent->file,
                     ((int64_t)m_data->l2_offset * 512)
@@ -1024,6 +1060,108 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     return VMDK_OK;
 }
 
+static int vmdk_write_footer(BlockDriverState *bs, VMDK4Footer *footer)
+{
+    int ret;
+    uint64_t offset;
+    uint32_t grains, gt_count, gd_sectors;
+
+    if (!footer) {
+        return -EINVAL;
+    }
+
+    grains = DIV_ROUND_UP(le64_to_cpu(footer->header.capacity),
+                          le64_to_cpu(footer->header.granularity));
+    gt_count = DIV_ROUND_UP(grains,
+                            le32_to_cpu(footer->header.num_gtes_per_gt));
+    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
+
+    offset = le64_to_cpu(footer->header.gd_offset) + gd_sectors;
+    footer->footer_marker.val = cpu_to_le64(1);
+    footer->footer_marker.type = cpu_to_le32(MARKER_FOOTER);
+    footer->magic = cpu_to_be32(VMDK4_MAGIC);
+    footer->eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
+
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
+    if (ret < 0) {
+        return ret;
+    }
+
+    return VMDK_OK;
+}
+
+static int vmdk_write_grain_directory(VmdkExtent *extent)
+{
+    int i, ret, gd_buf_size;
+    uint32_t *gd_buf = NULL, gd_sectors;
+    VMDK4MetaMarker gd_marker;
+
+    /* write grain directory marker */
+    memset(&gd_marker, 0, sizeof(gd_marker));
+    gd_sectors = DIV_ROUND_UP(extent->l1_size * sizeof(uint32_t),
+                              BDRV_SECTOR_SIZE);
+    gd_marker.val = cpu_to_le64(gd_sectors);
+    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
+    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
+                      &gd_marker, sizeof(gd_marker));
+    if (ret < 0) {
+        return ret;
+    }
+    extent->l1_table_offset += sizeof(gd_marker);
+
+    /* write grain directory */
+    gd_buf_size = extent->l1_size * sizeof(uint32_t);
+    gd_buf = g_malloc0(gd_buf_size);
+    for (i = 0; i < extent->l1_size; i++) {
+        gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
+    }
+    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
+                      gd_buf, gd_buf_size);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = VMDK_OK;
+ exit:
+    g_free(gd_buf);
+    return ret;
+}
+
+static int vmdk_write_grain_table(VmdkExtent *extent)
+{
+    int i;
+    uint32_t *l2_table = NULL;
+    VMDK4MetaMarker gtm;
+
+    for (i = 0; i < L2_CACHE_SIZE; i++) {
+        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
+            l2_table = extent->l2_cache + (i * extent->l2_size);
+        }
+    }
+    if (!l2_table) {
+        return -EINVAL;
+    }
+
+    memset(&gtm, 0, sizeof(gtm));
+    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
+    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
+    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
+                    &gtm, sizeof(gtm)) != sizeof(gtm)) {
+        return -EIO;
+    }
+    extent->l1_table_offset += sizeof(gtm);
+
+    extent->l1_table[extent->l1_index] = extent->l1_table_offset >> 9;
+    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
+                    l2_table, extent->l2_size * sizeof(uint32_t)
+                   ) != extent->l2_size * sizeof(uint32_t)) {
+        return -EIO;
+    }
+    extent->l1_table_offset += extent->l2_size * sizeof(uint32_t);
+
+    return VMDK_OK;
+}
+
 static int get_cluster_offset(BlockDriverState *bs,
                                     VmdkExtent *extent,
                                     VmdkMetaData *m_data,
@@ -1032,7 +1170,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                                     uint64_t *cluster_offset)
 {
     unsigned int l1_index, l2_offset, l2_index;
-    int min_index, i, j;
+    int min_index, i, j, ret;
     uint32_t min_count, *l2_table;
     bool zeroed = false;
 
@@ -1046,6 +1184,22 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
     l1_index = (offset >> 9) / extent->l1_entry_sectors;
+    if (extent->compressed && !extent->l1_table[l1_index]) {
+        if (l1_index) {
+            /* grain (L2) tables follow compressed data so first L2 table will
+             * be written when we move to next index or when we close image.
+             * that is why we need to save l1_index in extent itself for easy
+             * access from both here and vmdk_close */
+            ret = vmdk_write_grain_table(extent);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        /* allocate new L2; set it to GD offset for now */
+        extent->l1_table[l1_index] = extent->l1_table_offset;
+    }
+    extent->l1_index = l1_index;
+
     if (l1_index >= extent->l1_size) {
         return VMDK_ERROR;
     }
@@ -1092,7 +1246,6 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     if (m_data) {
         m_data->valid = 1;
-        m_data->l1_index = l1_index;
         m_data->l2_index = l2_index;
         m_data->offset = *cluster_offset;
         m_data->l2_offset = l2_offset;
@@ -1234,6 +1387,10 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
         ret = ret < 0 ? ret : -EIO;
         goto out;
     }
+    if (extent->compressed) {
+        /* update GD offset after each write */
+        extent->l1_table_offset = bdrv_getlength(extent->file);
+    }
     ret = 0;
  out:
     g_free(data);
@@ -1532,10 +1689,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     int ret, i;
     BlockDriverState *bs = NULL;
     VMDK4Header header;
+    VMDK4Footer footer;
     Error *local_err = NULL;
     uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
     uint32_t *gd_buf = NULL;
     int gd_buf_size;
+    uint64_t grain_offset, rgd_offset, gd_offset;
 
     ret = bdrv_create_file(filename, opts, &local_err);
     if (ret < 0) {
@@ -1560,28 +1719,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
-    header.version = zeroed_grain ? 2 : 1;
-    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
-                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+    memset(&footer, 0, sizeof(footer));
+
+    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
+    header.flags = VMDK4_FLAG_NL_DETECT
+                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
+                               : VMDK4_FLAG_RGD)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
     header.capacity = filesize / BDRV_SECTOR_SIZE;
     header.granularity = 128;
     header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
 
-    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
+    grains = DIV_ROUND_UP(header.capacity, header.granularity);
     gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                            BDRV_SECTOR_SIZE);
     gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
     gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
 
     header.desc_offset = 1;
-    header.desc_size = 20;
-    header.rgd_offset = header.desc_offset + header.desc_size;
-    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
-    header.grain_offset =
+    header.desc_size = (compress ? 2 : 20);
+    rgd_offset = header.desc_offset + header.desc_size;
+    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
+    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
+    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
+    grain_offset =
         ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                  header.granularity);
+    /* streamOptimized reserves first 128 sectors */
+    header.grain_offset = (compress ? header.granularity : grain_offset);
+    /* streamOptimized's grain directory is at the end */
+    gd_offset = header.grain_offset + 1;
+
     /* swap endianness for all header fields */
     header.version = cpu_to_le32(header.version);
     header.flags = cpu_to_le32(header.flags);
@@ -1618,30 +1787,44 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         goto exit;
     }
 
-    /* write grain directory */
-    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
-    gd_buf = g_malloc0(gd_buf_size);
-    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
-    }
+    if (compress) {
+        footer.header = header;
+        footer.header.gd_offset = cpu_to_le64(gd_offset);
 
-    /* write backup grain directory */
-    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
+        ret = vmdk_write_footer(bs, &footer);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
+    } else {
+        /* write redundant grain directory (if applicable) */
+        if (le64_to_cpu(header.rgd_offset)) {
+            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+            gd_buf = g_malloc0(gd_buf_size);
+            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
+                 i < gt_count; i++, tmp += gt_size) {
+                gd_buf[i] = cpu_to_le32(tmp);
+            }
+            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
+                                  BDRV_SECTOR_SIZE,
+                              gd_buf, gd_buf_size);
+            if (ret < 0) {
+                error_set(errp, QERR_IO_ERROR);
+                goto exit;
+            }
+        }
+
+        /* write grain directory */
+        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
+             i < gt_count; i++, tmp += gt_size) {
+            gd_buf[i] = cpu_to_le32(tmp);
+        }
+        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                          gd_buf, gd_buf_size);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
     }
 
     ret = 0;
@@ -1911,7 +2094,29 @@ exit:
 
 static void vmdk_close(BlockDriverState *bs)
 {
+    int ret;
     BDRVVmdkState *s = bs->opaque;
+    VmdkExtent *extent = &s->extents[0];
+    VMDK4Footer footer;
+
+    if (extent->compressed) {
+        while (extent < &s->extents[s->num_extents]) {
+            vmdk_write_grain_table(extent);
+            vmdk_write_grain_directory(extent);
+            if (extent->has_footer) {
+                memset(&footer, 0, sizeof(footer));
+                ret = bdrv_pread(extent->file, sizeof(uint32_t),
+                                 &footer.header, sizeof(footer.header));
+                if (ret < 0) {
+                    continue;
+                }
+                footer.header.gd_offset =
+                    cpu_to_le64(extent->l1_table_offset >> 9);
+                vmdk_write_footer(extent->file, &footer);
+            }
+            extent++;
+        }
+    }
 
     vmdk_free_extents(bs);
     g_free(s->create_type);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2] vmdk: improve streamOptimized vmdk support
  2014-08-06 20:57   ` Milos Vyletel
@ 2014-08-06 21:20     ` Milos Vyletel
  0 siblings, 0 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-08-06 21:20 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Fam Zheng, Stefan Hajnoczi, Milos Vyletel

On Wed, Aug 6, 2014 at 4:57 PM, Milos Vyletel <milos.vyletel@gmail.com> wrote:
> VMDK's streamOptimized format is different from regular sparse format.
> L1(GD) and L2(GT) tables are not predefined but rather generated and
> written during image creation mainly because there is no way to tell
> how much space data will occupy once they are compressed. Also the
> location of header, L1 and L2 tables differ.
>
> - L2 tables (grain tables) are written after all grains they point to
> - L1 tables are written after all grains and L2 tables
> - footer at the end is used instead of header in first sector
>
> Images generated by qemu-img could not be imported (as part of OVA archive)
> to either VMWare or OVM because of errors.
>
> - VMWare during OVA import:
> Not a supported disk format (sparse VMDK too old)
>
> - OVM's vbox-img during conversion:
> vbox-img: error: Error while copying the image: VERR_EOF
>
> This patch fixes streamOptimized support in qemu which was not fully
> compatible with VMDK specifications as defined in the latest available version
> at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.
>
> Qemu generated images are identical to the ones generated by VMWare and
> OVM (vbox-img) with the exception of DescriptorFile but that is expected
> (CID and some additional DDB entries differ). They were also successfully
> imported to VMWare vCloud, ESXi and Oracle OVM.
>
> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
> ---
> v2 changes:
> - updated commit message description with errors received
> - style/grammar fixes (clean checkpatch pass)
> - removed l2_table pointer from VmdkExtent struct
> - fixed memory leak in vmdk_write_footer()
>
> v3 changes:
> - removed footer from VmdkExtent structure
> - added vmdk_write_grain_directory function to separate GD and footer
>   writes
> - fix possible problems with opening of images created by older implementation
>  block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
>  1 files changed, 280 insertions(+), 75 deletions(-)
>
> diff --git a/block/vmdk.c b/block/vmdk.c
> index 0517bba..3ea1c31 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -81,6 +81,21 @@ typedef struct {
>      uint16_t compressAlgorithm;
>  } QEMU_PACKED VMDK4Header;
>
> +typedef struct {
> +    uint64_t val;
> +    uint32_t size;
> +    uint32_t type;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
> +} QEMU_PACKED VMDK4MetaMarker;
> +
> +typedef struct {
> +    VMDK4MetaMarker footer_marker;
> +    uint32_t magic;
> +    VMDK4Header header;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
> +    VMDK4MetaMarker eos_marker;
> +} QEMU_PACKED VMDK4Footer;
> +
>  #define L2_CACHE_SIZE 16
>
>  typedef struct VmdkExtent {
> @@ -89,12 +104,14 @@ typedef struct VmdkExtent {
>      bool compressed;
>      bool has_marker;
>      bool has_zero_grain;
> +    bool has_footer;
>      int version;
>      int64_t sectors;
>      int64_t end_sector;
>      int64_t flat_start_offset;
>      int64_t l1_table_offset;
>      int64_t l1_backup_table_offset;
> +    uint32_t l1_index;
>      uint32_t *l1_table;
>      uint32_t *l1_backup_table;
>      unsigned int l1_size;
> @@ -125,7 +142,6 @@ typedef struct BDRVVmdkState {
>
>  typedef struct VmdkMetaData {
>      uint32_t offset;
> -    unsigned int l1_index;
>      unsigned int l2_index;
>      unsigned int l2_offset;
>      int valid;
> @@ -555,14 +571,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
>      return buf;
>  }
>
> +static int vmdk_read_footer(BlockDriverState *bs,
> +                            VMDK4Footer *footer)
> +{
> +    int ret;
> +
> +    /*
> +    * footer starts 3 sectors from end
> +    * - footer marker
> +    * - footer
> +    * - end-of-stream marker
> +    */
> +    ret = bdrv_pread(bs->file,
> +        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
> +        footer, sizeof(*footer));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* Some sanity checks for the footer */
> +    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
> +        le32_to_cpu(footer->footer_marker.size) != 0  ||
> +        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
> +        le64_to_cpu(footer->eos_marker.val) != 0  ||
> +        le32_to_cpu(footer->eos_marker.size) != 0  ||
> +        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    ret = VMDK_OK;
> + out:
> +    return ret;
> +}
> +
>  static int vmdk_open_vmdk4(BlockDriverState *bs,
>                             BlockDriverState *file,
>                             int flags, Error **errp)
>  {
>      int ret;
> +    bool has_footer = false;
>      uint32_t magic;
>      uint32_t l1_size, l1_entry_sectors;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      VmdkExtent *extent;
>      BDRVVmdkState *s = bs->opaque;
>      int64_t l1_backup_offset = 0;
> @@ -593,48 +645,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>
>      if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
>          /*
> -         * The footer takes precedence over the header, so read it in. The
> -         * footer starts at offset -1024 from the end: One sector for the
> -         * footer, and another one for the end-of-stream marker.
> +         * The footer takes precedence over the header, so read it in.
>           */
> -        struct {
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED footer_marker;
> -
> -            uint32_t magic;
> -            VMDK4Header header;
> -            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
> -
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED eos_marker;
> -        } QEMU_PACKED footer;
> -
> -        ret = bdrv_pread(file,
> -            bs->file->total_sectors * 512 - 1536,
> -            &footer, sizeof(footer));
> +        ret = vmdk_read_footer(bs, &footer);
>          if (ret < 0) {
>              return ret;
>          }
> -
> -        /* Some sanity checks for the footer */
> -        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
> -            le32_to_cpu(footer.footer_marker.size) != 0  ||
> -            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
> -            le64_to_cpu(footer.eos_marker.val) != 0  ||
> -            le32_to_cpu(footer.eos_marker.size) != 0  ||
> -            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
> -        {
> -            return -EINVAL;
> -        }
> -
> +        has_footer = true;
>          header = footer.header;
>      }
>
> @@ -645,11 +662,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>          error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>                    bs->device_name, "vmdk", buf);
>          return -ENOTSUP;
> -    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
> +    } else if (le32_to_cpu(header.version) == 3 &&
> +               (flags & BDRV_O_RDWR) &&
> +               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
>          /* VMware KB 2064959 explains that version 3 added support for
>           * persistent changed block tracking (CBT), and backup software can
>           * read it as version=1 if it doesn't care about the changed area
> -         * information. So we are safe to enable read only. */
> +         * information. So we are safe to enable read only.
> +         * Note that this does not apply to streamOptimized images which
> +         * are written only once and are used as transport format */
>          error_setg(errp, "VMDK version 3 must be read only");
>          return -EINVAL;
>      }
> @@ -689,11 +710,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>      if (ret < 0) {
>          return ret;
>      }
> +    if (has_footer) {
> +        extent->has_footer = has_footer;
> +    }
> +
>      extent->compressed =
>          le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
>      if (extent->compressed) {
>          g_free(s->create_type);
>          s->create_type = g_strdup("streamOptimized");
> +
> +        if (extent->has_footer && (flags & BDRV_O_RDWR)) {
> +            bdrv_truncate(file,
> +                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
> +        }
>      }
>      extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
>      extent->version = le32_to_cpu(header.version);
> @@ -998,6 +1028,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      uint32_t offset;
>      QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
>      offset = cpu_to_le32(m_data->offset);
> +
> +    /* nothing to update on streamOptimized */
> +    if (extent->compressed) {
> +        return VMDK_OK;
> +    }
> +
>      /* update L2 table */
>      if (bdrv_pwrite_sync(
>                  extent->file,
> @@ -1008,7 +1044,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      }
>      /* update backup L2 table */
>      if (extent->l1_backup_table_offset != 0) {
> -        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
> +        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
>          if (bdrv_pwrite_sync(
>                      extent->file,
>                      ((int64_t)m_data->l2_offset * 512)
> @@ -1024,6 +1060,108 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      return VMDK_OK;
>  }
>
> +static int vmdk_write_footer(BlockDriverState *bs, VMDK4Footer *footer)
> +{
> +    int ret;
> +    uint64_t offset;
> +    uint32_t grains, gt_count, gd_sectors;
> +
> +    if (!footer) {
> +        return -EINVAL;
> +    }
> +
> +    grains = DIV_ROUND_UP(le64_to_cpu(footer->header.capacity),
> +                          le64_to_cpu(footer->header.granularity));
> +    gt_count = DIV_ROUND_UP(grains,
> +                            le32_to_cpu(footer->header.num_gtes_per_gt));
> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
> +
> +    offset = le64_to_cpu(footer->header.gd_offset) + gd_sectors;
> +    footer->footer_marker.val = cpu_to_le64(1);
> +    footer->footer_marker.type = cpu_to_le32(MARKER_FOOTER);
> +    footer->magic = cpu_to_be32(VMDK4_MAGIC);
> +    footer->eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
> +
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return VMDK_OK;
> +}
> +
> +static int vmdk_write_grain_directory(VmdkExtent *extent)
> +{
> +    int i, ret, gd_buf_size;
> +    uint32_t *gd_buf = NULL, gd_sectors;
> +    VMDK4MetaMarker gd_marker;
> +
> +    /* write grain directory marker */
> +    memset(&gd_marker, 0, sizeof(gd_marker));
> +    gd_sectors = DIV_ROUND_UP(extent->l1_size * sizeof(uint32_t),
> +                              BDRV_SECTOR_SIZE);
> +    gd_marker.val = cpu_to_le64(gd_sectors);
> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
> +    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                      &gd_marker, sizeof(gd_marker));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    extent->l1_table_offset += sizeof(gd_marker);
> +
> +    /* write grain directory */
> +    gd_buf_size = extent->l1_size * sizeof(uint32_t);
> +    gd_buf = g_malloc0(gd_buf_size);
> +    for (i = 0; i < extent->l1_size; i++) {
> +        gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
> +    }
> +    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                      gd_buf, gd_buf_size);
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    ret = VMDK_OK;
> + exit:
> +    g_free(gd_buf);
> +    return ret;
> +}
> +
> +static int vmdk_write_grain_table(VmdkExtent *extent)
> +{
> +    int i;
> +    uint32_t *l2_table = NULL;
> +    VMDK4MetaMarker gtm;
> +
> +    for (i = 0; i < L2_CACHE_SIZE; i++) {
> +        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
> +            l2_table = extent->l2_cache + (i * extent->l2_size);
> +        }
> +    }
> +    if (!l2_table) {
> +        return -EINVAL;
> +    }
> +
> +    memset(&gtm, 0, sizeof(gtm));
> +    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
> +    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
> +    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                    &gtm, sizeof(gtm)) != sizeof(gtm)) {
> +        return -EIO;
> +    }
> +    extent->l1_table_offset += sizeof(gtm);
> +
> +    extent->l1_table[extent->l1_index] = extent->l1_table_offset >> 9;
> +    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                    l2_table, extent->l2_size * sizeof(uint32_t)
> +                   ) != extent->l2_size * sizeof(uint32_t)) {
> +        return -EIO;
> +    }
> +    extent->l1_table_offset += extent->l2_size * sizeof(uint32_t);
> +
> +    return VMDK_OK;
> +}
> +
>  static int get_cluster_offset(BlockDriverState *bs,
>                                      VmdkExtent *extent,
>                                      VmdkMetaData *m_data,
> @@ -1032,7 +1170,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>                                      uint64_t *cluster_offset)
>  {
>      unsigned int l1_index, l2_offset, l2_index;
> -    int min_index, i, j;
> +    int min_index, i, j, ret;
>      uint32_t min_count, *l2_table;
>      bool zeroed = false;
>
> @@ -1046,6 +1184,22 @@ static int get_cluster_offset(BlockDriverState *bs,
>
>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
> +    if (extent->compressed && !extent->l1_table[l1_index]) {
> +        if (l1_index) {
> +            /* grain (L2) tables follow compressed data so first L2 table will
> +             * be written when we move to next index or when we close image.
> +             * that is why we need to save l1_index in extent itself for easy
> +             * access from both here and vmdk_close */
> +            ret = vmdk_write_grain_table(extent);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +        /* allocate new L2; set it to GD offset for now */
> +        extent->l1_table[l1_index] = extent->l1_table_offset;
> +    }
> +    extent->l1_index = l1_index;
> +
>      if (l1_index >= extent->l1_size) {
>          return VMDK_ERROR;
>      }
> @@ -1092,7 +1246,6 @@ static int get_cluster_offset(BlockDriverState *bs,
>
>      if (m_data) {
>          m_data->valid = 1;
> -        m_data->l1_index = l1_index;
>          m_data->l2_index = l2_index;
>          m_data->offset = *cluster_offset;
>          m_data->l2_offset = l2_offset;
> @@ -1234,6 +1387,10 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>          ret = ret < 0 ? ret : -EIO;
>          goto out;
>      }
> +    if (extent->compressed) {
> +        /* update GD offset after each write */
> +        extent->l1_table_offset = bdrv_getlength(extent->file);
> +    }
>      ret = 0;
>   out:
>      g_free(data);
> @@ -1532,10 +1689,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      int ret, i;
>      BlockDriverState *bs = NULL;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      Error *local_err = NULL;
>      uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
>      uint32_t *gd_buf = NULL;
>      int gd_buf_size;
> +    uint64_t grain_offset, rgd_offset, gd_offset;
>
>      ret = bdrv_create_file(filename, opts, &local_err);
>      if (ret < 0) {
> @@ -1560,28 +1719,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      }
>      magic = cpu_to_be32(VMDK4_MAGIC);
>      memset(&header, 0, sizeof(header));
> -    header.version = zeroed_grain ? 2 : 1;
> -    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
> -                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
> +    memset(&footer, 0, sizeof(footer));
> +
> +    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
> +    header.flags = VMDK4_FLAG_NL_DETECT
> +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
> +                               : VMDK4_FLAG_RGD)
>                     | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
>      header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
>      header.capacity = filesize / BDRV_SECTOR_SIZE;
>      header.granularity = 128;
>      header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
>
> -    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>      gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>                             BDRV_SECTOR_SIZE);
>      gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>      gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>
>      header.desc_offset = 1;
> -    header.desc_size = 20;
> -    header.rgd_offset = header.desc_offset + header.desc_size;
> -    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
> -    header.grain_offset =
> +    header.desc_size = (compress ? 2 : 20);
> +    rgd_offset = header.desc_offset + header.desc_size;
> +    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
> +    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
> +    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
> +    grain_offset =
>          ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
>                   header.granularity);
> +    /* streamOptimized reserves first 128 sectors */
> +    header.grain_offset = (compress ? header.granularity : grain_offset);
> +    /* streamOptimized's grain directory is at the end */
> +    gd_offset = header.grain_offset + 1;
> +
>      /* swap endianness for all header fields */
>      header.version = cpu_to_le32(header.version);
>      header.flags = cpu_to_le32(header.flags);
> @@ -1618,30 +1787,44 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>          goto exit;
>      }
>
> -    /* write grain directory */
> -    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> -    gd_buf = g_malloc0(gd_buf_size);
> -    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> -    }
> +    if (compress) {
> +        footer.header = header;
> +        footer.header.gd_offset = cpu_to_le64(gd_offset);
>
> -    /* write backup grain directory */
> -    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> +        ret = vmdk_write_footer(bs, &footer);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
> +    } else {
> +        /* write redundant grain directory (if applicable) */
> +        if (le64_to_cpu(header.rgd_offset)) {
> +            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> +            gd_buf = g_malloc0(gd_buf_size);
> +            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> +                 i < gt_count; i++, tmp += gt_size) {
> +                gd_buf[i] = cpu_to_le32(tmp);
> +            }
> +            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
> +                                  BDRV_SECTOR_SIZE,
> +                              gd_buf, gd_buf_size);
> +            if (ret < 0) {
> +                error_set(errp, QERR_IO_ERROR);
> +                goto exit;
> +            }
> +        }
> +
> +        /* write grain directory */
> +        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> +             i < gt_count; i++, tmp += gt_size) {
> +            gd_buf[i] = cpu_to_le32(tmp);
> +        }
> +        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> +                          gd_buf, gd_buf_size);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
>      }
>
>      ret = 0;
> @@ -1911,7 +2094,29 @@ exit:
>
>  static void vmdk_close(BlockDriverState *bs)
>  {
> +    int ret;
>      BDRVVmdkState *s = bs->opaque;
> +    VmdkExtent *extent = &s->extents[0];
> +    VMDK4Footer footer;
> +
> +    if (extent->compressed) {
> +        while (extent < &s->extents[s->num_extents]) {
> +            vmdk_write_grain_table(extent);
> +            vmdk_write_grain_directory(extent);
> +            if (extent->has_footer) {
> +                memset(&footer, 0, sizeof(footer));
> +                ret = bdrv_pread(extent->file, sizeof(uint32_t),
> +                                 &footer.header, sizeof(footer.header));
> +                if (ret < 0) {
> +                    continue;
> +                }
> +                footer.header.gd_offset =
> +                    cpu_to_le64(extent->l1_table_offset >> 9);
> +                vmdk_write_footer(extent->file, &footer);
> +            }
> +            extent++;
> +        }
> +    }
>
>      vmdk_free_extents(bs);
>      g_free(s->create_type);
> --
> 1.7.1
>

Please ignore this patch. Found small bug and will resend with fix and
correct subject this time.

Milos

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH v3] vmdk: improve streamOptimized vmdk support
  2014-08-04 17:10 ` [Qemu-devel] [PATCH v2] " Milos Vyletel
  2014-08-05  5:27   ` Fam Zheng
  2014-08-06 20:57   ` Milos Vyletel
@ 2014-08-06 21:24   ` Milos Vyletel
  2014-08-12 10:45     ` Stefan Hajnoczi
  2014-08-18  6:52     ` Fam Zheng
  2 siblings, 2 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-08-06 21:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Fam Zheng, Stefan Hajnoczi, Milos Vyletel

VMDK's streamOptimized format is different from regular sparse format.
L1(GD) and L2(GT) tables are not predefined but rather generated and
written during image creation mainly because there is no way to tell
how much space data will occupy once they are compressed. Also the
location of header, L1 and L2 tables differ.

- L2 tables (grain tables) are written after all grains they point to
- L1 tables are written after all grains and L2 tables
- footer at the end is used instead of header in first sector

Images generated by qemu-img could not be imported (as part of OVA archive)
to either VMWare or OVM because of errors.

- VMWare during OVA import:
Not a supported disk format (sparse VMDK too old)

- OVM's vbox-img during conversion:
vbox-img: error: Error while copying the image: VERR_EOF

This patch fixes streamOptimized support in qemu which was not fully
compatible with VMDK specifications as defined in the latest available version
at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.

Qemu generated images are identical to the ones generated by VMWare and
OVM (vbox-img) with the exception of DescriptorFile but that is expected
(CID and some additional DDB entries differ). They were also successfully
imported to VMWare vCloud, ESXi and Oracle OVM.

Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
---
v2 changes:
- updated commit message description with errors received
- style/grammar fixes (clean checkpatch pass)
- removed l2_table pointer from VmdkExtent struct
- fixed memory leak in vmdk_write_footer()

v3 changes:
- removed footer from VmdkExtent structure
- added vmdk_write_grain_directory function to separate GD and footer writes
- fix possible problems with opening of images created by older implementation
- fix reverse conversion from VMDK to other formats

 block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 280 insertions(+), 75 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 0517bba..0a1a08c 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -81,6 +81,21 @@ typedef struct {
     uint16_t compressAlgorithm;
 } QEMU_PACKED VMDK4Header;
 
+typedef struct {
+    uint64_t val;
+    uint32_t size;
+    uint32_t type;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
+} QEMU_PACKED VMDK4MetaMarker;
+
+typedef struct {
+    VMDK4MetaMarker footer_marker;
+    uint32_t magic;
+    VMDK4Header header;
+    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
+    VMDK4MetaMarker eos_marker;
+} QEMU_PACKED VMDK4Footer;
+
 #define L2_CACHE_SIZE 16
 
 typedef struct VmdkExtent {
@@ -89,12 +104,14 @@ typedef struct VmdkExtent {
     bool compressed;
     bool has_marker;
     bool has_zero_grain;
+    bool has_footer;
     int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
+    uint32_t l1_index;
     uint32_t *l1_table;
     uint32_t *l1_backup_table;
     unsigned int l1_size;
@@ -125,7 +142,6 @@ typedef struct BDRVVmdkState {
 
 typedef struct VmdkMetaData {
     uint32_t offset;
-    unsigned int l1_index;
     unsigned int l2_index;
     unsigned int l2_offset;
     int valid;
@@ -555,14 +571,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
     return buf;
 }
 
+static int vmdk_read_footer(BlockDriverState *bs,
+                            VMDK4Footer *footer)
+{
+    int ret;
+
+    /*
+    * footer starts 3 sectors from end
+    * - footer marker
+    * - footer
+    * - end-of-stream marker
+    */
+    ret = bdrv_pread(bs->file,
+        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
+        footer, sizeof(*footer));
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Some sanity checks for the footer */
+    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
+        le32_to_cpu(footer->footer_marker.size) != 0  ||
+        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
+        le64_to_cpu(footer->eos_marker.val) != 0  ||
+        le32_to_cpu(footer->eos_marker.size) != 0  ||
+        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = VMDK_OK;
+ out:
+    return ret;
+}
+
 static int vmdk_open_vmdk4(BlockDriverState *bs,
                            BlockDriverState *file,
                            int flags, Error **errp)
 {
     int ret;
+    bool has_footer = false;
     uint32_t magic;
     uint32_t l1_size, l1_entry_sectors;
     VMDK4Header header;
+    VMDK4Footer footer;
     VmdkExtent *extent;
     BDRVVmdkState *s = bs->opaque;
     int64_t l1_backup_offset = 0;
@@ -593,48 +645,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
 
     if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
         /*
-         * The footer takes precedence over the header, so read it in. The
-         * footer starts at offset -1024 from the end: One sector for the
-         * footer, and another one for the end-of-stream marker.
+         * The footer takes precedence over the header, so read it in.
          */
-        struct {
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED footer_marker;
-
-            uint32_t magic;
-            VMDK4Header header;
-            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
-
-            struct {
-                uint64_t val;
-                uint32_t size;
-                uint32_t type;
-                uint8_t pad[512 - 16];
-            } QEMU_PACKED eos_marker;
-        } QEMU_PACKED footer;
-
-        ret = bdrv_pread(file,
-            bs->file->total_sectors * 512 - 1536,
-            &footer, sizeof(footer));
+        ret = vmdk_read_footer(bs, &footer);
         if (ret < 0) {
             return ret;
         }
-
-        /* Some sanity checks for the footer */
-        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
-            le32_to_cpu(footer.footer_marker.size) != 0  ||
-            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
-            le64_to_cpu(footer.eos_marker.val) != 0  ||
-            le32_to_cpu(footer.eos_marker.size) != 0  ||
-            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
-        {
-            return -EINVAL;
-        }
-
+        has_footer = true;
         header = footer.header;
     }
 
@@ -645,11 +662,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
         error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
                   bs->device_name, "vmdk", buf);
         return -ENOTSUP;
-    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
+    } else if (le32_to_cpu(header.version) == 3 &&
+               (flags & BDRV_O_RDWR) &&
+               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
         /* VMware KB 2064959 explains that version 3 added support for
          * persistent changed block tracking (CBT), and backup software can
          * read it as version=1 if it doesn't care about the changed area
-         * information. So we are safe to enable read only. */
+         * information. So we are safe to enable read only.
+         * Note that this does not apply to streamOptimized images which
+         * are written only once and are used as transport format */
         error_setg(errp, "VMDK version 3 must be read only");
         return -EINVAL;
     }
@@ -689,11 +710,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (ret < 0) {
         return ret;
     }
+    if (has_footer) {
+        extent->has_footer = has_footer;
+    }
+
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
     if (extent->compressed) {
         g_free(s->create_type);
         s->create_type = g_strdup("streamOptimized");
+
+        if (extent->has_footer && (flags & BDRV_O_RDWR)) {
+            bdrv_truncate(file,
+                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
+        }
     }
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
     extent->version = le32_to_cpu(header.version);
@@ -998,6 +1028,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     uint32_t offset;
     QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
     offset = cpu_to_le32(m_data->offset);
+
+    /* nothing to update on streamOptimized */
+    if (extent->compressed) {
+        return VMDK_OK;
+    }
+
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
@@ -1008,7 +1044,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     }
     /* update backup L2 table */
     if (extent->l1_backup_table_offset != 0) {
-        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
+        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
         if (bdrv_pwrite_sync(
                     extent->file,
                     ((int64_t)m_data->l2_offset * 512)
@@ -1024,6 +1060,108 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
     return VMDK_OK;
 }
 
+static int vmdk_write_footer(BlockDriverState *bs, VMDK4Footer *footer)
+{
+    int ret;
+    uint64_t offset;
+    uint32_t grains, gt_count, gd_sectors;
+
+    if (!footer) {
+        return -EINVAL;
+    }
+
+    grains = DIV_ROUND_UP(le64_to_cpu(footer->header.capacity),
+                          le64_to_cpu(footer->header.granularity));
+    gt_count = DIV_ROUND_UP(grains,
+                            le32_to_cpu(footer->header.num_gtes_per_gt));
+    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
+
+    offset = le64_to_cpu(footer->header.gd_offset) + gd_sectors;
+    footer->footer_marker.val = cpu_to_le64(1);
+    footer->footer_marker.type = cpu_to_le32(MARKER_FOOTER);
+    footer->magic = cpu_to_be32(VMDK4_MAGIC);
+    footer->eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
+
+    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
+    if (ret < 0) {
+        return ret;
+    }
+
+    return VMDK_OK;
+}
+
+static int vmdk_write_grain_directory(VmdkExtent *extent)
+{
+    int i, ret, gd_buf_size;
+    uint32_t *gd_buf = NULL, gd_sectors;
+    VMDK4MetaMarker gd_marker;
+
+    /* write grain directory marker */
+    memset(&gd_marker, 0, sizeof(gd_marker));
+    gd_sectors = DIV_ROUND_UP(extent->l1_size * sizeof(uint32_t),
+                              BDRV_SECTOR_SIZE);
+    gd_marker.val = cpu_to_le64(gd_sectors);
+    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
+    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
+                      &gd_marker, sizeof(gd_marker));
+    if (ret < 0) {
+        return ret;
+    }
+    extent->l1_table_offset += sizeof(gd_marker);
+
+    /* write grain directory */
+    gd_buf_size = extent->l1_size * sizeof(uint32_t);
+    gd_buf = g_malloc0(gd_buf_size);
+    for (i = 0; i < extent->l1_size; i++) {
+        gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
+    }
+    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
+                      gd_buf, gd_buf_size);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = VMDK_OK;
+ exit:
+    g_free(gd_buf);
+    return ret;
+}
+
+static int vmdk_write_grain_table(VmdkExtent *extent)
+{
+    int i;
+    uint32_t *l2_table = NULL;
+    VMDK4MetaMarker gtm;
+
+    for (i = 0; i < L2_CACHE_SIZE; i++) {
+        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
+            l2_table = extent->l2_cache + (i * extent->l2_size);
+        }
+    }
+    if (!l2_table) {
+        return -EINVAL;
+    }
+
+    memset(&gtm, 0, sizeof(gtm));
+    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
+    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
+    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
+                    &gtm, sizeof(gtm)) != sizeof(gtm)) {
+        return -EIO;
+    }
+    extent->l1_table_offset += sizeof(gtm);
+
+    extent->l1_table[extent->l1_index] = extent->l1_table_offset >> 9;
+    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
+                    l2_table, extent->l2_size * sizeof(uint32_t)
+                   ) != extent->l2_size * sizeof(uint32_t)) {
+        return -EIO;
+    }
+    extent->l1_table_offset += extent->l2_size * sizeof(uint32_t);
+
+    return VMDK_OK;
+}
+
 static int get_cluster_offset(BlockDriverState *bs,
                                     VmdkExtent *extent,
                                     VmdkMetaData *m_data,
@@ -1032,7 +1170,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                                     uint64_t *cluster_offset)
 {
     unsigned int l1_index, l2_offset, l2_index;
-    int min_index, i, j;
+    int min_index, i, j, ret;
     uint32_t min_count, *l2_table;
     bool zeroed = false;
 
@@ -1046,6 +1184,22 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
     l1_index = (offset >> 9) / extent->l1_entry_sectors;
+    if (extent->compressed && !extent->l1_table[l1_index] && allocate) {
+        if (l1_index) {
+            /* grain (L2) tables follow compressed data so first L2 table will
+             * be written when we move to next index or when we close image.
+             * that is why we need to save l1_index in extent itself for easy
+             * access from both here and vmdk_close */
+            ret = vmdk_write_grain_table(extent);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        /* allocate new L2; set it to GD offset for now */
+        extent->l1_table[l1_index] = extent->l1_table_offset;
+    }
+    extent->l1_index = l1_index;
+
     if (l1_index >= extent->l1_size) {
         return VMDK_ERROR;
     }
@@ -1092,7 +1246,6 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     if (m_data) {
         m_data->valid = 1;
-        m_data->l1_index = l1_index;
         m_data->l2_index = l2_index;
         m_data->offset = *cluster_offset;
         m_data->l2_offset = l2_offset;
@@ -1234,6 +1387,10 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
         ret = ret < 0 ? ret : -EIO;
         goto out;
     }
+    if (extent->compressed) {
+        /* update GD offset after each write */
+        extent->l1_table_offset = bdrv_getlength(extent->file);
+    }
     ret = 0;
  out:
     g_free(data);
@@ -1532,10 +1689,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     int ret, i;
     BlockDriverState *bs = NULL;
     VMDK4Header header;
+    VMDK4Footer footer;
     Error *local_err = NULL;
     uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
     uint32_t *gd_buf = NULL;
     int gd_buf_size;
+    uint64_t grain_offset, rgd_offset, gd_offset;
 
     ret = bdrv_create_file(filename, opts, &local_err);
     if (ret < 0) {
@@ -1560,28 +1719,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
-    header.version = zeroed_grain ? 2 : 1;
-    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
-                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+    memset(&footer, 0, sizeof(footer));
+
+    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
+    header.flags = VMDK4_FLAG_NL_DETECT
+                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
+                               : VMDK4_FLAG_RGD)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
     header.capacity = filesize / BDRV_SECTOR_SIZE;
     header.granularity = 128;
     header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
 
-    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
+    grains = DIV_ROUND_UP(header.capacity, header.granularity);
     gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
                            BDRV_SECTOR_SIZE);
     gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
     gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
 
     header.desc_offset = 1;
-    header.desc_size = 20;
-    header.rgd_offset = header.desc_offset + header.desc_size;
-    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
-    header.grain_offset =
+    header.desc_size = (compress ? 2 : 20);
+    rgd_offset = header.desc_offset + header.desc_size;
+    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
+    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
+    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
+    grain_offset =
         ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
                  header.granularity);
+    /* streamOptimized reserves first 128 sectors */
+    header.grain_offset = (compress ? header.granularity : grain_offset);
+    /* streamOptimzed's grain directory is at the end */
+    gd_offset = header.grain_offset + 1;
+
     /* swap endianness for all header fields */
     header.version = cpu_to_le32(header.version);
     header.flags = cpu_to_le32(header.flags);
@@ -1618,30 +1787,44 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         goto exit;
     }
 
-    /* write grain directory */
-    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
-    gd_buf = g_malloc0(gd_buf_size);
-    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
-    }
+    if (compress) {
+        footer.header = header;
+        footer.header.gd_offset = cpu_to_le64(gd_offset);
 
-    /* write backup grain directory */
-    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
-         i < gt_count; i++, tmp += gt_size) {
-        gd_buf[i] = cpu_to_le32(tmp);
-    }
-    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
-                      gd_buf, gd_buf_size);
-    if (ret < 0) {
-        error_set(errp, QERR_IO_ERROR);
-        goto exit;
+        ret = vmdk_write_footer(bs, &footer);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
+    } else {
+        /* write redundant grain directory (if applicable) */
+        if (le64_to_cpu(header.rgd_offset)) {
+            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+            gd_buf = g_malloc0(gd_buf_size);
+            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
+                 i < gt_count; i++, tmp += gt_size) {
+                gd_buf[i] = cpu_to_le32(tmp);
+            }
+            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
+                                  BDRV_SECTOR_SIZE,
+                              gd_buf, gd_buf_size);
+            if (ret < 0) {
+                error_set(errp, QERR_IO_ERROR);
+                goto exit;
+            }
+        }
+
+        /* write grain directory */
+        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
+             i < gt_count; i++, tmp += gt_size) {
+            gd_buf[i] = cpu_to_le32(tmp);
+        }
+        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                          gd_buf, gd_buf_size);
+        if (ret < 0) {
+            error_set(errp, QERR_IO_ERROR);
+            goto exit;
+        }
     }
 
     ret = 0;
@@ -1911,7 +2094,29 @@ exit:
 
 static void vmdk_close(BlockDriverState *bs)
 {
+    int ret;
     BDRVVmdkState *s = bs->opaque;
+    VmdkExtent *extent = &s->extents[0];
+    VMDK4Footer footer;
+
+    if (extent->compressed) {
+        while (extent < &s->extents[s->num_extents]) {
+            vmdk_write_grain_table(extent);
+            vmdk_write_grain_directory(extent);
+            if (extent->has_footer) {
+                memset(&footer, 0, sizeof(footer));
+                ret = bdrv_pread(extent->file, sizeof(uint32_t),
+                                 &footer.header, sizeof(footer.header));
+                if (ret < 0) {
+                    continue;
+                }
+                footer.header.gd_offset =
+                    cpu_to_le64(extent->l1_table_offset >> 9);
+                vmdk_write_footer(extent->file, &footer);
+            }
+            extent++;
+        }
+    }
 
     vmdk_free_extents(bs);
     g_free(s->create_type);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v3] vmdk: improve streamOptimized vmdk support
  2014-08-06 21:24   ` [Qemu-devel] [PATCH v3] " Milos Vyletel
@ 2014-08-12 10:45     ` Stefan Hajnoczi
  2014-08-12 12:44       ` Milos Vyletel
  2014-08-18  6:52     ` Fam Zheng
  1 sibling, 1 reply; 21+ messages in thread
From: Stefan Hajnoczi @ 2014-08-12 10:45 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, Fam Zheng, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2441 bytes --]

On Wed, Aug 06, 2014 at 05:24:42PM -0400, Milos Vyletel wrote:
> VMDK's streamOptimized format is different from regular sparse format.
> L1(GD) and L2(GT) tables are not predefined but rather generated and
> written during image creation mainly because there is no way to tell
> how much space data will occupy once they are compressed. Also the
> location of header, L1 and L2 tables differ.
> 
> - L2 tables (grain tables) are written after all grains they point to
> - L1 tables are written after all grains and L2 tables
> - footer at the end is used instead of header in first sector
> 
> Images generated by qemu-img could not be imported (as part of OVA archive)
> to neither VMWare nor OVM because of errors.
> 
> - VMWare during OVA import:
> Not a supported disk format (sparse VMDK too old)
> 
> - OVM's vbox-img during conversion:
> vbox-img: error: Error while copying the image: VERR_EOF
> 
> This patch fixes streamOptimized support in qemu which was not fully
> compatible with VMDK specifications as defined in latest available version
> at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.
> 
> Qemu generated images are identical to the ones generated by VMWare and
> OVM (vbox-img) with the exception of DescriptorFile but that is expected
> (CID and some additional DDB entries differ). They were also successfully
> imported to VMWare vCloud, ESXi and Oracle OVM.
> 
> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
> ---
> v2 changes:
> - updated commit message description with errors received
> - style/grammar fixes (clean checkpatch pass)
> - removed l2_table pointer from VmdkExtent struct
> - fixed memory leak in vmdk_write_footer()
> 
> v3 changes:
> - removed footer from VmdkExtent structure
> - added vmdk_write_grain_directory function to separate GD and footer writes
> - fix possible problems with opening of images created by older implementation
> - fix reverse conversion from VMDK to other formats
> 
>  block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
>  1 files changed, 280 insertions(+), 75 deletions(-)

In the future, please send revisions as top-level email threads.  When
people use threaded email clients your patch can be overlooked since
it's a subthread of an old discussion, rather than a new thread in the
inbox.

I skimmed the patch and am relying on Fam for a detailed review.

[-- Attachment #2: Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v3] vmdk: improve streamOptimized vmdk support
  2014-08-12 10:45     ` Stefan Hajnoczi
@ 2014-08-12 12:44       ` Milos Vyletel
  0 siblings, 0 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-08-12 12:44 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, Fam Zheng, qemu-devel

On Tue, Aug 12, 2014 at 6:45 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> On Wed, Aug 06, 2014 at 05:24:42PM -0400, Milos Vyletel wrote:
>> VMDK's streamOptimized format is different from regular sparse format.
>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>> written during image creation mainly because there is no way to tell
>> how much space data will occupy once they are compressed. Also the
>> location of header, L1 and L2 tables differ.
>>
>> - L2 tables (grain tables) are written after all grains they point to
>> - L1 tables are written after all grains and L2 tables
>> - footer at the end is used instead of header in first sector
>>
>> Images generated by qemu-img could not be imported (as part of OVA archive)
>> to neither VMWare nor OVM because of errors.
>>
>> - VMWare during OVA import:
>> Not a supported disk format (sparse VMDK too old)
>>
>> - OVM's vbox-img during conversion:
>> vbox-img: error: Error while copying the image: VERR_EOF
>>
>> This patch fixes streamOptimized support in qemu which was not fully
>> compatible with VMDK specifications as defined in latest available version
>> at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.
>>
>> Qemu generated images are identical to the ones generated by VMWare and
>> OVM (vbox-img) with the exception of DescriptorFile but that is expected
>> (CID and some additional DDB entries differ). They were also successfully
>> imported to VMWare vCloud, ESXi and Oracle OVM.
>>
>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>> ---
>> v2 changes:
>> - updated commit message description with errors received
>> - style/grammar fixes (clean checkpatch pass)
>> - removed l2_table pointer from VmdkExtent struct
>> - fixed memory leak in vmdk_write_footer()
>>
>> v3 changes:
>> - removed footer from VmdkExtent structure
>> - added vmdk_write_grain_directory function to separate GD and footer writes
>> - fix possible problems with opening of images created by older implementation
>> - fix reverse conversion from VMDK to other formats
>>
>>  block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
>>  1 files changed, 280 insertions(+), 75 deletions(-)
>
> In the future, please send revisions as top-level email threads.  When
> people use threaded email clients your patch can be overlooked since
> it's a subthread of an old discussion, rather than a new thread in the
> inbox.
>
> I skimmed the patch and am relying on Fam for a detailed review.

Will do. Sorry about that. I just thought it would be easier to have it
all in one thread, so that if you find this thread in the archive in the
future, you'll have all patch versions in one place and won't have to
look for other threads.

Milos

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v3] vmdk: improve streamOptimized vmdk support
  2014-08-06 21:24   ` [Qemu-devel] [PATCH v3] " Milos Vyletel
  2014-08-12 10:45     ` Stefan Hajnoczi
@ 2014-08-18  6:52     ` Fam Zheng
  2014-08-29 19:49       ` Milos Vyletel
  1 sibling, 1 reply; 21+ messages in thread
From: Fam Zheng @ 2014-08-18  6:52 UTC (permalink / raw)
  To: Milos Vyletel; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Wed, 08/06 17:24, Milos Vyletel wrote:
> VMDK's streamOptimized format is different from regular sparse format.
> L1(GD) and L2(GT) tables are not predefined but rather generated and
> written during image creation mainly because there is no way to tell
> how much space data will occupy once they are compressed. Also the
> location of header, L1 and L2 tables differ.
> 
> - L2 tables (grain tables) are written after all grains they point to
> - L1 tables are written after all grains and L2 tables
> - footer at the end is used instead of header in first sector
> 
> Images generated by qemu-img could not be imported (as part of OVA archive)
> to neither VMWare nor OVM because of errors.
> 
> - VMWare during OVA import:
> Not a supported disk format (sparse VMDK too old)
> 
> - OVM's vbox-img during conversion:
> vbox-img: error: Error while copying the image: VERR_EOF
> 
> This patch fixes streamOptimized support in qemu which was not fully
> compatible with VMDK specifications as defined in latest available version
> at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.
> 
> Qemu generated images are identical to the ones generated by VMWare and
> OVM (vbox-img) with the exception of DescriptorFile but that is expected
> (CID and some additional DDB entries differ). They were also successfully
> imported to VMWare vCloud, ESXi and Oracle OVM.
> 
> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
> ---
> v2 changes:
> - updated commit message description with errors received
> - style/grammar fixes (clean checkpatch pass)
> - removed l2_table pointer from VmdkExtent struct
> - fixed memory leak in vmdk_write_footer()
> 
> v3 changes:
> - removed footer from VmdkExtent structure
> - added vmdk_write_grain_directory function to separate GD and footer writes
> - fix possible problems with opening of images created by older implementation
> - fix reverse conversion from VMDK to other formats
> 
>  block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
>  1 files changed, 280 insertions(+), 75 deletions(-)
> 
> diff --git a/block/vmdk.c b/block/vmdk.c
> index 0517bba..0a1a08c 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -81,6 +81,21 @@ typedef struct {
>      uint16_t compressAlgorithm;
>  } QEMU_PACKED VMDK4Header;
>  
> +typedef struct {
> +    uint64_t val;
> +    uint32_t size;
> +    uint32_t type;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
> +} QEMU_PACKED VMDK4MetaMarker;
> +
> +typedef struct {
> +    VMDK4MetaMarker footer_marker;
> +    uint32_t magic;
> +    VMDK4Header header;
> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
> +    VMDK4MetaMarker eos_marker;
> +} QEMU_PACKED VMDK4Footer;
> +
>  #define L2_CACHE_SIZE 16
>  
>  typedef struct VmdkExtent {
> @@ -89,12 +104,14 @@ typedef struct VmdkExtent {
>      bool compressed;
>      bool has_marker;
>      bool has_zero_grain;
> +    bool has_footer;
>      int version;
>      int64_t sectors;
>      int64_t end_sector;
>      int64_t flat_start_offset;
>      int64_t l1_table_offset;
>      int64_t l1_backup_table_offset;
> +    uint32_t l1_index;
>      uint32_t *l1_table;
>      uint32_t *l1_backup_table;
>      unsigned int l1_size;
> @@ -125,7 +142,6 @@ typedef struct BDRVVmdkState {
>  
>  typedef struct VmdkMetaData {
>      uint32_t offset;
> -    unsigned int l1_index;
>      unsigned int l2_index;
>      unsigned int l2_offset;
>      int valid;
> @@ -555,14 +571,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
>      return buf;
>  }
>  
> +static int vmdk_read_footer(BlockDriverState *bs,
> +                            VMDK4Footer *footer)
> +{
> +    int ret;
> +
> +    /*
> +    * footer starts 3 sectors from end
> +    * - footer marker
> +    * - footer
> +    * - end-of-stream marker
> +    */
> +    ret = bdrv_pread(bs->file,
> +        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
> +        footer, sizeof(*footer));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* Some sanity checks for the footer */
> +    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
> +        le32_to_cpu(footer->footer_marker.size) != 0  ||
> +        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
> +        le64_to_cpu(footer->eos_marker.val) != 0  ||
> +        le32_to_cpu(footer->eos_marker.size) != 0  ||
> +        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
> +    ret = VMDK_OK;
> + out:
> +    return ret;
> +}
> +
>  static int vmdk_open_vmdk4(BlockDriverState *bs,
>                             BlockDriverState *file,
>                             int flags, Error **errp)
>  {
>      int ret;
> +    bool has_footer = false;
>      uint32_t magic;
>      uint32_t l1_size, l1_entry_sectors;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      VmdkExtent *extent;
>      BDRVVmdkState *s = bs->opaque;
>      int64_t l1_backup_offset = 0;
> @@ -593,48 +645,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>  
>      if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
>          /*
> -         * The footer takes precedence over the header, so read it in. The
> -         * footer starts at offset -1024 from the end: One sector for the
> -         * footer, and another one for the end-of-stream marker.
> +         * The footer takes precedence over the header, so read it in.
>           */
> -        struct {
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED footer_marker;
> -
> -            uint32_t magic;
> -            VMDK4Header header;
> -            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
> -
> -            struct {
> -                uint64_t val;
> -                uint32_t size;
> -                uint32_t type;
> -                uint8_t pad[512 - 16];
> -            } QEMU_PACKED eos_marker;
> -        } QEMU_PACKED footer;
> -
> -        ret = bdrv_pread(file,
> -            bs->file->total_sectors * 512 - 1536,
> -            &footer, sizeof(footer));
> +        ret = vmdk_read_footer(bs, &footer);
>          if (ret < 0) {
>              return ret;
>          }
> -
> -        /* Some sanity checks for the footer */
> -        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
> -            le32_to_cpu(footer.footer_marker.size) != 0  ||
> -            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
> -            le64_to_cpu(footer.eos_marker.val) != 0  ||
> -            le32_to_cpu(footer.eos_marker.size) != 0  ||
> -            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
> -        {
> -            return -EINVAL;
> -        }
> -
> +        has_footer = true;
>          header = footer.header;
>      }
>  
> @@ -645,11 +662,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>          error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>                    bs->device_name, "vmdk", buf);
>          return -ENOTSUP;
> -    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
> +    } else if (le32_to_cpu(header.version) == 3 &&
> +               (flags & BDRV_O_RDWR) &&
> +               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
>          /* VMware KB 2064959 explains that version 3 added support for
>           * persistent changed block tracking (CBT), and backup software can
>           * read it as version=1 if it doesn't care about the changed area
> -         * information. So we are safe to enable read only. */
> +         * information. So we are safe to enable read only.
> +         * Note that this does not apply to streamOptimized images which
> +         * are written only once and are used as transport format */
>          error_setg(errp, "VMDK version 3 must be read only");
>          return -EINVAL;
>      }
> @@ -689,11 +710,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>      if (ret < 0) {
>          return ret;
>      }
> +    if (has_footer) {
> +        extent->has_footer = has_footer;
> +    }
> +
>      extent->compressed =
>          le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
>      if (extent->compressed) {
>          g_free(s->create_type);
>          s->create_type = g_strdup("streamOptimized");
> +
> +        if (extent->has_footer && (flags & BDRV_O_RDWR)) {
> +            bdrv_truncate(file,
> +                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
> +        }
>      }
>      extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
>      extent->version = le32_to_cpu(header.version);
> @@ -998,6 +1028,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      uint32_t offset;
>      QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
>      offset = cpu_to_le32(m_data->offset);
> +
> +    /* nothing to update on streamOptimized */
> +    if (extent->compressed) {
> +        return VMDK_OK;
> +    }
> +
>      /* update L2 table */
>      if (bdrv_pwrite_sync(
>                  extent->file,
> @@ -1008,7 +1044,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      }
>      /* update backup L2 table */
>      if (extent->l1_backup_table_offset != 0) {
> -        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
> +        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
>          if (bdrv_pwrite_sync(
>                      extent->file,
>                      ((int64_t)m_data->l2_offset * 512)
> @@ -1024,6 +1060,108 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>      return VMDK_OK;
>  }
>  
> +static int vmdk_write_footer(BlockDriverState *bs, VMDK4Footer *footer)
> +{
> +    int ret;
> +    uint64_t offset;
> +    uint32_t grains, gt_count, gd_sectors;
> +
> +    if (!footer) {
> +        return -EINVAL;
> +    }
> +
> +    grains = DIV_ROUND_UP(le64_to_cpu(footer->header.capacity),
> +                          le64_to_cpu(footer->header.granularity));
> +    gt_count = DIV_ROUND_UP(grains,
> +                            le32_to_cpu(footer->header.num_gtes_per_gt));
> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
> +
> +    offset = le64_to_cpu(footer->header.gd_offset) + gd_sectors;
> +    footer->footer_marker.val = cpu_to_le64(1);
> +    footer->footer_marker.type = cpu_to_le32(MARKER_FOOTER);
> +    footer->magic = cpu_to_be32(VMDK4_MAGIC);
> +    footer->eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
> +
> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return VMDK_OK;
> +}
> +
> +static int vmdk_write_grain_directory(VmdkExtent *extent)
> +{
> +    int i, ret, gd_buf_size;
> +    uint32_t *gd_buf = NULL, gd_sectors;
> +    VMDK4MetaMarker gd_marker;
> +
> +    /* write grain directory marker */
> +    memset(&gd_marker, 0, sizeof(gd_marker));
> +    gd_sectors = DIV_ROUND_UP(extent->l1_size * sizeof(uint32_t),
> +                              BDRV_SECTOR_SIZE);
> +    gd_marker.val = cpu_to_le64(gd_sectors);
> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
> +    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                      &gd_marker, sizeof(gd_marker));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    extent->l1_table_offset += sizeof(gd_marker);
> +
> +    /* write grain directory */
> +    gd_buf_size = extent->l1_size * sizeof(uint32_t);
> +    gd_buf = g_malloc0(gd_buf_size);
> +    for (i = 0; i < extent->l1_size; i++) {
> +        gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
> +    }
> +    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                      gd_buf, gd_buf_size);
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +
> +    ret = VMDK_OK;
> + exit:
> +    g_free(gd_buf);
> +    return ret;
> +}
> +
> +static int vmdk_write_grain_table(VmdkExtent *extent)
> +{
> +    int i;
> +    uint32_t *l2_table = NULL;
> +    VMDK4MetaMarker gtm;
> +
> +    for (i = 0; i < L2_CACHE_SIZE; i++) {
> +        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
> +            l2_table = extent->l2_cache + (i * extent->l2_size);

break?

> +        }
> +    }
> +    if (!l2_table) {
> +        return -EINVAL;
> +    }
> +
> +    memset(&gtm, 0, sizeof(gtm));
> +    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
> +    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
> +    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                    &gtm, sizeof(gtm)) != sizeof(gtm)) {

I feel that the variable extent->l1_table_offset is abused. Could you rebase to
current master and use extent->next_cluster_sector to track the position for
the next metadata write?

> +        return -EIO;

Why not return bdrv_pwrite's return value instead?

> +    }
> +    extent->l1_table_offset += sizeof(gtm);
> +
> +    extent->l1_table[extent->l1_index] = extent->l1_table_offset >> 9;
> +    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
> +                    l2_table, extent->l2_size * sizeof(uint32_t)
> +                   ) != extent->l2_size * sizeof(uint32_t)) {
> +        return -EIO;

Ditto.

> +    }
> +    extent->l1_table_offset += extent->l2_size * sizeof(uint32_t);
> +
> +    return VMDK_OK;
> +}
> +
>  static int get_cluster_offset(BlockDriverState *bs,
>                                      VmdkExtent *extent,
>                                      VmdkMetaData *m_data,
> @@ -1032,7 +1170,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>                                      uint64_t *cluster_offset)
>  {
>      unsigned int l1_index, l2_offset, l2_index;
> -    int min_index, i, j;
> +    int min_index, i, j, ret;
>      uint32_t min_count, *l2_table;
>      bool zeroed = false;
>  
> @@ -1046,6 +1184,22 @@ static int get_cluster_offset(BlockDriverState *bs,
>  
>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
> +    if (extent->compressed && !extent->l1_table[l1_index] && allocate) {
> +        if (l1_index) {
> +            /* grain (L2) tables follow compressed data so first L2 table will
> +             * be written when we move to next index or when we close image.
> +             * that is why we need to save l1_index in extent itself for easy
> +             * access from both here and vmdk_close */
> +            ret = vmdk_write_grain_table(extent);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +        /* allocate new L2; set it to GD offset for now */
> +        extent->l1_table[l1_index] = extent->l1_table_offset;
> +    }
> +    extent->l1_index = l1_index;
> +
>      if (l1_index >= extent->l1_size) {
>          return VMDK_ERROR;
>      }
> @@ -1092,7 +1246,6 @@ static int get_cluster_offset(BlockDriverState *bs,
>  
>      if (m_data) {
>          m_data->valid = 1;
> -        m_data->l1_index = l1_index;
>          m_data->l2_index = l2_index;
>          m_data->offset = *cluster_offset;
>          m_data->l2_offset = l2_offset;
> @@ -1234,6 +1387,10 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>          ret = ret < 0 ? ret : -EIO;
>          goto out;
>      }
> +    if (extent->compressed) {
> +        /* update GD offset after each write */
> +        extent->l1_table_offset = bdrv_getlength(extent->file);
> +    }
>      ret = 0;
>   out:
>      g_free(data);
> @@ -1532,10 +1689,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      int ret, i;
>      BlockDriverState *bs = NULL;
>      VMDK4Header header;
> +    VMDK4Footer footer;
>      Error *local_err = NULL;
>      uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
>      uint32_t *gd_buf = NULL;
>      int gd_buf_size;
> +    uint64_t grain_offset, rgd_offset, gd_offset;
>  
>      ret = bdrv_create_file(filename, opts, &local_err);
>      if (ret < 0) {
> @@ -1560,28 +1719,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>      }
>      magic = cpu_to_be32(VMDK4_MAGIC);
>      memset(&header, 0, sizeof(header));
> -    header.version = zeroed_grain ? 2 : 1;
> -    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
> -                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
> +    memset(&footer, 0, sizeof(footer));
> +
> +    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
> +    header.flags = VMDK4_FLAG_NL_DETECT
> +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
> +                               : VMDK4_FLAG_RGD)
>                     | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
>      header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
>      header.capacity = filesize / BDRV_SECTOR_SIZE;
>      header.granularity = 128;
>      header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
>  
> -    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>      gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>                             BDRV_SECTOR_SIZE);
>      gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>      gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>  
>      header.desc_offset = 1;
> -    header.desc_size = 20;
> -    header.rgd_offset = header.desc_offset + header.desc_size;
> -    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
> -    header.grain_offset =
> +    header.desc_size = (compress ? 2 : 20);
> +    rgd_offset = header.desc_offset + header.desc_size;
> +    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
> +    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
> +    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
> +    grain_offset =
>          ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
>                   header.granularity);
> +    /* streamOptimized reserves first 128 sectors */
> +    header.grain_offset = (compress ? header.granularity : grain_offset);
> +    /* streamOptimzed's grain directory is at the end */
> +    gd_offset = header.grain_offset + 1;
> +
>      /* swap endianness for all header fields */
>      header.version = cpu_to_le32(header.version);
>      header.flags = cpu_to_le32(header.flags);
> @@ -1618,30 +1787,44 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>          goto exit;
>      }
>  
> -    /* write grain directory */
> -    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> -    gd_buf = g_malloc0(gd_buf_size);
> -    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> -    }
> +    if (compress) {
> +        footer.header = header;
> +        footer.header.gd_offset = cpu_to_le64(gd_offset);
>  
> -    /* write backup grain directory */
> -    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> -         i < gt_count; i++, tmp += gt_size) {
> -        gd_buf[i] = cpu_to_le32(tmp);
> -    }
> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> -                      gd_buf, gd_buf_size);
> -    if (ret < 0) {
> -        error_set(errp, QERR_IO_ERROR);
> -        goto exit;
> +        ret = vmdk_write_footer(bs, &footer);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
> +    } else {
> +        /* write redundant grain directory (if applicable) */
> +        if (le64_to_cpu(header.rgd_offset)) {
> +            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
> +            gd_buf = g_malloc0(gd_buf_size);
> +            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
> +                 i < gt_count; i++, tmp += gt_size) {
> +                gd_buf[i] = cpu_to_le32(tmp);
> +            }
> +            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
> +                                  BDRV_SECTOR_SIZE,
> +                              gd_buf, gd_buf_size);
> +            if (ret < 0) {
> +                error_set(errp, QERR_IO_ERROR);
> +                goto exit;
> +            }
> +        }
> +
> +        /* write grain directory */
> +        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
> +             i < gt_count; i++, tmp += gt_size) {
> +            gd_buf[i] = cpu_to_le32(tmp);
> +        }
> +        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
> +                          gd_buf, gd_buf_size);
> +        if (ret < 0) {
> +            error_set(errp, QERR_IO_ERROR);
> +            goto exit;
> +        }
>      }
>  
>      ret = 0;
> @@ -1911,7 +2094,29 @@ exit:
>  
>  static void vmdk_close(BlockDriverState *bs)
>  {
> +    int ret;
>      BDRVVmdkState *s = bs->opaque;
> +    VmdkExtent *extent = &s->extents[0];
> +    VMDK4Footer footer;
> +
> +    if (extent->compressed) {
> +        while (extent < &s->extents[s->num_extents]) {
> +            vmdk_write_grain_table(extent);
> +            vmdk_write_grain_directory(extent);
> +            if (extent->has_footer) {
> +                memset(&footer, 0, sizeof(footer));
> +                ret = bdrv_pread(extent->file, sizeof(uint32_t),
> +                                 &footer.header, sizeof(footer.header));

You read the header and write it to the end of file. Shouldn't this be reading
from the footer at the end of the extent?

Fam

> +                if (ret < 0) {
> +                    continue;
> +                }
> +                footer.header.gd_offset =
> +                    cpu_to_le64(extent->l1_table_offset >> 9);
> +                vmdk_write_footer(extent->file, &footer);
> +            }
> +            extent++;
> +        }
> +    }
>  
>      vmdk_free_extents(bs);
>      g_free(s->create_type);
> -- 
> 1.7.1
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v3] vmdk: improve streamOptimized vmdk support
  2014-08-18  6:52     ` Fam Zheng
@ 2014-08-29 19:49       ` Milos Vyletel
  0 siblings, 0 replies; 21+ messages in thread
From: Milos Vyletel @ 2014-08-29 19:49 UTC (permalink / raw)
  To: Fam Zheng; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi

On Mon, Aug 18, 2014 at 2:52 AM, Fam Zheng <famz@redhat.com> wrote:
> On Wed, 08/06 17:24, Milos Vyletel wrote:
>> VMDK's streamOptimized format is different from regular sparse format.
>> L1(GD) and L2(GT) tables are not predefined but rather generated and
>> written during image creation mainly because there is no way to tell
>> how much space data will occupy once they are compressed. Also the
>> location of header, L1 and L2 tables differ.
>>
>> - L2 tables (grain tables) are written after all grains they point to
>> - L1 tables are written after all grains and L2 tables
>> - footer at the end is used instead of header in first sector
>>
>> Images generated by qemu-img could not be imported (as part of an OVA archive)
>> into either VMWare or OVM because of errors.
>>
>> - VMWare during OVA import:
>> Not a supported disk format (sparse VMDK too old)
>>
>> - OVM's vbox-img during conversion:
>> vbox-img: error: Error while copying the image: VERR_EOF
>>
>> This patch fixes streamOptimized support in qemu which was not fully
>> compatible with VMDK specifications as defined in latest available version
>> at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf.
>>
>> Qemu generated images are identical to the ones generated by VMWare and
>> OVM (vbox-img) with the exception of DescriptorFile but that is expected
>> (CID and some additional DDB entries differ). They were also successfully
>> imported to VMWare vCloud, ESXi and Oracle OVM.
>>
>> Signed-off-by: Milos Vyletel <milos.vyletel@gmail.com>
>> ---
>> v2 changes:
>> - updated commit message description with errors received
>> - style/grammar fixes (clean checkpatch pass)
>> - removed l2_table pointer from VmdkExtent struct
>> - fixed memory leak in vmdk_write_footer()
>>
>> v3 changes:
>> - removed footer from VmdkExtent structure
>> - added vmdk_write_grain_directory function to separate GD and footer writes
>> - fix possible problems with opening of images created by older implementation
>> - fix reverse conversion from VMDK to other formats
>>
>>  block/vmdk.c |  355 +++++++++++++++++++++++++++++++++++++++++++++------------
>>  1 files changed, 280 insertions(+), 75 deletions(-)
>>
>> diff --git a/block/vmdk.c b/block/vmdk.c
>> index 0517bba..0a1a08c 100644
>> --- a/block/vmdk.c
>> +++ b/block/vmdk.c
>> @@ -81,6 +81,21 @@ typedef struct {
>>      uint16_t compressAlgorithm;
>>  } QEMU_PACKED VMDK4Header;
>>
>> +typedef struct {
>> +    uint64_t val;
>> +    uint32_t size;
>> +    uint32_t type;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)];
>> +} QEMU_PACKED VMDK4MetaMarker;
>> +
>> +typedef struct {
>> +    VMDK4MetaMarker footer_marker;
>> +    uint32_t magic;
>> +    VMDK4Header header;
>> +    uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)];
>> +    VMDK4MetaMarker eos_marker;
>> +} QEMU_PACKED VMDK4Footer;
>> +
>>  #define L2_CACHE_SIZE 16
>>
>>  typedef struct VmdkExtent {
>> @@ -89,12 +104,14 @@ typedef struct VmdkExtent {
>>      bool compressed;
>>      bool has_marker;
>>      bool has_zero_grain;
>> +    bool has_footer;
>>      int version;
>>      int64_t sectors;
>>      int64_t end_sector;
>>      int64_t flat_start_offset;
>>      int64_t l1_table_offset;
>>      int64_t l1_backup_table_offset;
>> +    uint32_t l1_index;
>>      uint32_t *l1_table;
>>      uint32_t *l1_backup_table;
>>      unsigned int l1_size;
>> @@ -125,7 +142,6 @@ typedef struct BDRVVmdkState {
>>
>>  typedef struct VmdkMetaData {
>>      uint32_t offset;
>> -    unsigned int l1_index;
>>      unsigned int l2_index;
>>      unsigned int l2_offset;
>>      int valid;
>> @@ -555,14 +571,50 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
>>      return buf;
>>  }
>>
>> +static int vmdk_read_footer(BlockDriverState *bs,
>> +                            VMDK4Footer *footer)
>> +{
>> +    int ret;
>> +
>> +    /*
>> +    * footer starts 3 sectors from end
>> +    * - footer marker
>> +    * - footer
>> +    * - end-of-stream marker
>> +    */
>> +    ret = bdrv_pread(bs->file,
>> +        (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE,
>> +        footer, sizeof(*footer));
>> +    if (ret < 0) {
>> +        goto out;
>> +    }
>> +
>> +    /* Some sanity checks for the footer */
>> +    if (be32_to_cpu(footer->magic) != VMDK4_MAGIC ||
>> +        le32_to_cpu(footer->footer_marker.size) != 0  ||
>> +        le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER ||
>> +        le64_to_cpu(footer->eos_marker.val) != 0  ||
>> +        le32_to_cpu(footer->eos_marker.size) != 0  ||
>> +        le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) {
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    ret = VMDK_OK;
>> + out:
>> +    return ret;
>> +}
>> +
>>  static int vmdk_open_vmdk4(BlockDriverState *bs,
>>                             BlockDriverState *file,
>>                             int flags, Error **errp)
>>  {
>>      int ret;
>> +    bool has_footer = false;
>>      uint32_t magic;
>>      uint32_t l1_size, l1_entry_sectors;
>>      VMDK4Header header;
>> +    VMDK4Footer footer;
>>      VmdkExtent *extent;
>>      BDRVVmdkState *s = bs->opaque;
>>      int64_t l1_backup_offset = 0;
>> @@ -593,48 +645,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>
>>      if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
>>          /*
>> -         * The footer takes precedence over the header, so read it in. The
>> -         * footer starts at offset -1024 from the end: One sector for the
>> -         * footer, and another one for the end-of-stream marker.
>> +         * The footer takes precedence over the header, so read it in.
>>           */
>> -        struct {
>> -            struct {
>> -                uint64_t val;
>> -                uint32_t size;
>> -                uint32_t type;
>> -                uint8_t pad[512 - 16];
>> -            } QEMU_PACKED footer_marker;
>> -
>> -            uint32_t magic;
>> -            VMDK4Header header;
>> -            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
>> -
>> -            struct {
>> -                uint64_t val;
>> -                uint32_t size;
>> -                uint32_t type;
>> -                uint8_t pad[512 - 16];
>> -            } QEMU_PACKED eos_marker;
>> -        } QEMU_PACKED footer;
>> -
>> -        ret = bdrv_pread(file,
>> -            bs->file->total_sectors * 512 - 1536,
>> -            &footer, sizeof(footer));
>> +        ret = vmdk_read_footer(bs, &footer);
>>          if (ret < 0) {
>>              return ret;
>>          }
>> -
>> -        /* Some sanity checks for the footer */
>> -        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
>> -            le32_to_cpu(footer.footer_marker.size) != 0  ||
>> -            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
>> -            le64_to_cpu(footer.eos_marker.val) != 0  ||
>> -            le32_to_cpu(footer.eos_marker.size) != 0  ||
>> -            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
>> -        {
>> -            return -EINVAL;
>> -        }
>> -
>> +        has_footer = true;
>>          header = footer.header;
>>      }
>>
>> @@ -645,11 +662,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>          error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>>                    bs->device_name, "vmdk", buf);
>>          return -ENOTSUP;
>> -    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
>> +    } else if (le32_to_cpu(header.version) == 3 &&
>> +               (flags & BDRV_O_RDWR) &&
>> +               !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) {
>>          /* VMware KB 2064959 explains that version 3 added support for
>>           * persistent changed block tracking (CBT), and backup software can
>>           * read it as version=1 if it doesn't care about the changed area
>> -         * information. So we are safe to enable read only. */
>> +         * information. So we are safe to enable read only.
>> +         * Note that this does not apply to streamOptimized images which
>> +         * are written only once and are used as transport format */
>>          error_setg(errp, "VMDK version 3 must be read only");
>>          return -EINVAL;
>>      }
>> @@ -689,11 +710,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
>>      if (ret < 0) {
>>          return ret;
>>      }
>> +    if (has_footer) {
>> +        extent->has_footer = has_footer;
>> +    }
>> +
>>      extent->compressed =
>>          le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
>>      if (extent->compressed) {
>>          g_free(s->create_type);
>>          s->create_type = g_strdup("streamOptimized");
>> +
>> +        if (extent->has_footer && (flags & BDRV_O_RDWR)) {
>> +            bdrv_truncate(file,
>> +                    le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
>> +        }
>>      }
>>      extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
>>      extent->version = le32_to_cpu(header.version);
>> @@ -998,6 +1028,12 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      uint32_t offset;
>>      QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
>>      offset = cpu_to_le32(m_data->offset);
>> +
>> +    /* nothing to update on streamOptimized */
>> +    if (extent->compressed) {
>> +        return VMDK_OK;
>> +    }
>> +
>>      /* update L2 table */
>>      if (bdrv_pwrite_sync(
>>                  extent->file,
>> @@ -1008,7 +1044,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      }
>>      /* update backup L2 table */
>>      if (extent->l1_backup_table_offset != 0) {
>> -        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
>> +        m_data->l2_offset = extent->l1_backup_table[extent->l1_index];
>>          if (bdrv_pwrite_sync(
>>                      extent->file,
>>                      ((int64_t)m_data->l2_offset * 512)
>> @@ -1024,6 +1060,108 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
>>      return VMDK_OK;
>>  }
>>
>> +static int vmdk_write_footer(BlockDriverState *bs, VMDK4Footer *footer)
>> +{
>> +    int ret;
>> +    uint64_t offset;
>> +    uint32_t grains, gt_count, gd_sectors;
>> +
>> +    if (!footer) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    grains = DIV_ROUND_UP(le64_to_cpu(footer->header.capacity),
>> +                          le64_to_cpu(footer->header.granularity));
>> +    gt_count = DIV_ROUND_UP(grains,
>> +                            le32_to_cpu(footer->header.num_gtes_per_gt));
>> +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>> +
>> +    offset = le64_to_cpu(footer->header.gd_offset) + gd_sectors;
>> +    footer->footer_marker.val = cpu_to_le64(1);
>> +    footer->footer_marker.type = cpu_to_le32(MARKER_FOOTER);
>> +    footer->magic = cpu_to_be32(VMDK4_MAGIC);
>> +    footer->eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM);
>> +
>> +    ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, sizeof(*footer));
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    return VMDK_OK;
>> +}
>> +
>> +static int vmdk_write_grain_directory(VmdkExtent *extent)
>> +{
>> +    int i, ret, gd_buf_size;
>> +    uint32_t *gd_buf = NULL, gd_sectors;
>> +    VMDK4MetaMarker gd_marker;
>> +
>> +    /* write grain directory marker */
>> +    memset(&gd_marker, 0, sizeof(gd_marker));
>> +    gd_sectors = DIV_ROUND_UP(extent->l1_size * sizeof(uint32_t),
>> +                              BDRV_SECTOR_SIZE);
>> +    gd_marker.val = cpu_to_le64(gd_sectors);
>> +    gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY);
>> +    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
>> +                      &gd_marker, sizeof(gd_marker));
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +    extent->l1_table_offset += sizeof(gd_marker);
>> +
>> +    /* write grain directory */
>> +    gd_buf_size = extent->l1_size * sizeof(uint32_t);
>> +    gd_buf = g_malloc0(gd_buf_size);
>> +    for (i = 0; i < extent->l1_size; i++) {
>> +        gd_buf[i] = cpu_to_le32(extent->l1_table[i]);
>> +    }
>> +    ret = bdrv_pwrite(extent->file, extent->l1_table_offset,
>> +                      gd_buf, gd_buf_size);
>> +    if (ret < 0) {
>> +        goto exit;
>> +    }
>> +
>> +    ret = VMDK_OK;
>> + exit:
>> +    g_free(gd_buf);
>> +    return ret;
>> +}
>> +
>> +static int vmdk_write_grain_table(VmdkExtent *extent)
>> +{
>> +    int i;
>> +    uint32_t *l2_table = NULL;
>> +    VMDK4MetaMarker gtm;
>> +
>> +    for (i = 0; i < L2_CACHE_SIZE; i++) {
>> +        if (extent->l1_table[extent->l1_index] == extent->l2_cache_offsets[i]) {
>> +            l2_table = extent->l2_cache + (i * extent->l2_size);
>
> break?
>
>> +        }
>> +    }
>> +    if (!l2_table) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    memset(&gtm, 0, sizeof(gtm));
>> +    gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9);
>> +    gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE);
>> +    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
>> +                    &gtm, sizeof(gtm)) != sizeof(gtm)) {
>
> I feel that the variable extent->l1_table_offset is abused. Could you rebase to
> current master and use extent->next_cluster_sector to track the position for
> the next metadata write?
>
>> +        return -EIO;
>
> Why not return bdrv_pwrite's return value instead?
>
>> +    }
>> +    extent->l1_table_offset += sizeof(gtm);
>> +
>> +    extent->l1_table[extent->l1_index] = extent->l1_table_offset >> 9;
>> +    if (bdrv_pwrite(extent->file, extent->l1_table_offset,
>> +                    l2_table, extent->l2_size * sizeof(uint32_t)
>> +                   ) != extent->l2_size * sizeof(uint32_t)) {
>> +        return -EIO;
>
> Ditto.
>
>> +    }
>> +    extent->l1_table_offset += extent->l2_size * sizeof(uint32_t);
>> +
>> +    return VMDK_OK;
>> +}
>> +
>>  static int get_cluster_offset(BlockDriverState *bs,
>>                                      VmdkExtent *extent,
>>                                      VmdkMetaData *m_data,
>> @@ -1032,7 +1170,7 @@ static int get_cluster_offset(BlockDriverState *bs,
>>                                      uint64_t *cluster_offset)
>>  {
>>      unsigned int l1_index, l2_offset, l2_index;
>> -    int min_index, i, j;
>> +    int min_index, i, j, ret;
>>      uint32_t min_count, *l2_table;
>>      bool zeroed = false;
>>
>> @@ -1046,6 +1184,22 @@ static int get_cluster_offset(BlockDriverState *bs,
>>
>>      offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
>>      l1_index = (offset >> 9) / extent->l1_entry_sectors;
>> +    if (extent->compressed && !extent->l1_table[l1_index] && allocate) {
>> +        if (l1_index) {
>> +            /* grain (L2) tables follow compressed data so first L2 table will
>> +             * be written when we move to next index or when we close image.
>> +             * that is why we need to save l1_index in extent itself for easy
>> +             * access from both here and vmdk_close */
>> +            ret = vmdk_write_grain_table(extent);
>> +            if (ret < 0) {
>> +                return ret;
>> +            }
>> +        }
>> +        /* allocate new L2; set it to GD offset for now */
>> +        extent->l1_table[l1_index] = extent->l1_table_offset;
>> +    }
>> +    extent->l1_index = l1_index;
>> +
>>      if (l1_index >= extent->l1_size) {
>>          return VMDK_ERROR;
>>      }
>> @@ -1092,7 +1246,6 @@ static int get_cluster_offset(BlockDriverState *bs,
>>
>>      if (m_data) {
>>          m_data->valid = 1;
>> -        m_data->l1_index = l1_index;
>>          m_data->l2_index = l2_index;
>>          m_data->offset = *cluster_offset;
>>          m_data->l2_offset = l2_offset;
>> @@ -1234,6 +1387,10 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
>>          ret = ret < 0 ? ret : -EIO;
>>          goto out;
>>      }
>> +    if (extent->compressed) {
>> +        /* update GD offset after each write */
>> +        extent->l1_table_offset = bdrv_getlength(extent->file);
>> +    }
>>      ret = 0;
>>   out:
>>      g_free(data);
>> @@ -1532,10 +1689,12 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>      int ret, i;
>>      BlockDriverState *bs = NULL;
>>      VMDK4Header header;
>> +    VMDK4Footer footer;
>>      Error *local_err = NULL;
>>      uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
>>      uint32_t *gd_buf = NULL;
>>      int gd_buf_size;
>> +    uint64_t grain_offset, rgd_offset, gd_offset;
>>
>>      ret = bdrv_create_file(filename, opts, &local_err);
>>      if (ret < 0) {
>> @@ -1560,28 +1719,38 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>      }
>>      magic = cpu_to_be32(VMDK4_MAGIC);
>>      memset(&header, 0, sizeof(header));
>> -    header.version = zeroed_grain ? 2 : 1;
>> -    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
>> -                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
>> +    memset(&footer, 0, sizeof(footer));
>> +
>> +    header.version = (compress ? 3 : zeroed_grain ? 2 : 1);
>> +    header.flags = VMDK4_FLAG_NL_DETECT
>> +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER
>> +                               : VMDK4_FLAG_RGD)
>>                     | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
>>      header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
>>      header.capacity = filesize / BDRV_SECTOR_SIZE;
>>      header.granularity = 128;
>>      header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
>>
>> -    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
>> +    grains = DIV_ROUND_UP(header.capacity, header.granularity);
>>      gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
>>                             BDRV_SECTOR_SIZE);
>>      gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
>>      gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
>>
>>      header.desc_offset = 1;
>> -    header.desc_size = 20;
>> -    header.rgd_offset = header.desc_offset + header.desc_size;
>> -    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
>> -    header.grain_offset =
>> +    header.desc_size = (compress ? 2 : 20);
>> +    rgd_offset = header.desc_offset + header.desc_size;
>> +    header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0);
>> +    gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count);
>> +    header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset);
>> +    grain_offset =
>>          ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
>>                   header.granularity);
>> +    /* streamOptimized reserves first 128 sectors */
>> +    header.grain_offset = (compress ? header.granularity : grain_offset);
>> +    /* streamOptimized's grain directory is at the end */
>> +    gd_offset = header.grain_offset + 1;
>> +
>>      /* swap endianness for all header fields */
>>      header.version = cpu_to_le32(header.version);
>>      header.flags = cpu_to_le32(header.flags);
>> @@ -1618,30 +1787,44 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
>>          goto exit;
>>      }
>>
>> -    /* write grain directory */
>> -    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> -    gd_buf = g_malloc0(gd_buf_size);
>> -    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
>> -         i < gt_count; i++, tmp += gt_size) {
>> -        gd_buf[i] = cpu_to_le32(tmp);
>> -    }
>> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
>> -                      gd_buf, gd_buf_size);
>> -    if (ret < 0) {
>> -        error_set(errp, QERR_IO_ERROR);
>> -        goto exit;
>> -    }
>> +    if (compress) {
>> +        footer.header = header;
>> +        footer.header.gd_offset = cpu_to_le64(gd_offset);
>>
>> -    /* write backup grain directory */
>> -    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
>> -         i < gt_count; i++, tmp += gt_size) {
>> -        gd_buf[i] = cpu_to_le32(tmp);
>> -    }
>> -    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
>> -                      gd_buf, gd_buf_size);
>> -    if (ret < 0) {
>> -        error_set(errp, QERR_IO_ERROR);
>> -        goto exit;
>> +        ret = vmdk_write_footer(bs, &footer);
>> +        if (ret < 0) {
>> +            error_set(errp, QERR_IO_ERROR);
>> +            goto exit;
>> +        }
>> +    } else {
>> +        /* write redundant grain directory (if applicable) */
>> +        if (le64_to_cpu(header.rgd_offset)) {
>> +            gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
>> +            gd_buf = g_malloc0(gd_buf_size);
>> +            for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
>> +                 i < gt_count; i++, tmp += gt_size) {
>> +                gd_buf[i] = cpu_to_le32(tmp);
>> +            }
>> +            ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) *
>> +                                  BDRV_SECTOR_SIZE,
>> +                              gd_buf, gd_buf_size);
>> +            if (ret < 0) {
>> +                error_set(errp, QERR_IO_ERROR);
>> +                goto exit;
>> +            }
>> +        }
>> +
>> +        /* write grain directory */
>> +        for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
>> +             i < gt_count; i++, tmp += gt_size) {
>> +            gd_buf[i] = cpu_to_le32(tmp);
>> +        }
>> +        ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
>> +                          gd_buf, gd_buf_size);
>> +        if (ret < 0) {
>> +            error_set(errp, QERR_IO_ERROR);
>> +            goto exit;
>> +        }
>>      }
>>
>>      ret = 0;
>> @@ -1911,7 +2094,29 @@ exit:
>>
>>  static void vmdk_close(BlockDriverState *bs)
>>  {
>> +    int ret;
>>      BDRVVmdkState *s = bs->opaque;
>> +    VmdkExtent *extent = &s->extents[0];
>> +    VMDK4Footer footer;
>> +
>> +    if (extent->compressed) {
>> +        while (extent < &s->extents[s->num_extents]) {
>> +            vmdk_write_grain_table(extent);
>> +            vmdk_write_grain_directory(extent);
>> +            if (extent->has_footer) {
>> +                memset(&footer, 0, sizeof(footer));
>> +                ret = bdrv_pread(extent->file, sizeof(uint32_t),
>> +                                 &footer.header, sizeof(footer.header));
>
> You read the header and write it to the end of the file. Shouldn't this be
> reading from the footer at the end of the extent?
>
> Fam
>

Sorry for taking so long to reply, but I'm kind of busy at the moment
(moving between continents), so I'll get back to you as soon as I can.

Milos

>> +                if (ret < 0) {
>> +                    continue;
>> +                }
>> +                footer.header.gd_offset =
>> +                    cpu_to_le64(extent->l1_table_offset >> 9);
>> +                vmdk_write_footer(extent->file, &footer);
>> +            }
>> +            extent++;
>> +        }
>> +    }
>>
>>      vmdk_free_extents(bs);
>>      g_free(s->create_type);
>> --
>> 1.7.1
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2014-08-29 19:49 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-07 14:54 [Qemu-devel] [PATCH] vmdk: improve streamOptimized vmdk support Milos Vyletel
2014-07-29 13:37 ` Stefan Hajnoczi
2014-07-29 13:46   ` Milos Vyletel
2014-07-29 14:37     ` Stefan Hajnoczi
2014-07-29 14:49       ` Milos Vyletel
2014-07-30  8:09         ` Stefan Hajnoczi
2014-07-31 18:22           ` Milos Vyletel
2014-07-30  7:51 ` Fam Zheng
2014-07-31 18:24   ` Milos Vyletel
2014-08-04 17:10 ` [Qemu-devel] [PATCH v2] " Milos Vyletel
2014-08-05  5:27   ` Fam Zheng
2014-08-05 16:44     ` Milos Vyletel
2014-08-06  1:44       ` Fam Zheng
2014-08-06 16:36         ` Milos Vyletel
2014-08-06 20:57   ` Milos Vyletel
2014-08-06 21:20     ` Milos Vyletel
2014-08-06 21:24   ` [Qemu-devel] [PATCH v3] " Milos Vyletel
2014-08-12 10:45     ` Stefan Hajnoczi
2014-08-12 12:44       ` Milos Vyletel
2014-08-18  6:52     ` Fam Zheng
2014-08-29 19:49       ` Milos Vyletel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.