From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:50105) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1XFo9h-00041Z-UT for qemu-devel@nongnu.org; Fri, 08 Aug 2014 13:40:50 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1XFo9b-0006et-ER for qemu-devel@nongnu.org; Fri, 08 Aug 2014 13:40:41 -0400 Received: from mx1.redhat.com ([209.132.183.28]:11798) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1XFo9a-0006em-Px for qemu-devel@nongnu.org; Fri, 08 Aug 2014 13:40:35 -0400 Received: from int-mx10.intmail.prod.int.phx2.redhat.com (int-mx10.intmail.prod.int.phx2.redhat.com [10.5.11.23]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id s78HeYtD001522 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK) for ; Fri, 8 Aug 2014 13:40:34 -0400 From: Kevin Wolf Date: Fri, 8 Aug 2014 19:39:21 +0200 Message-Id: <1407519603-6635-21-git-send-email-kwolf@redhat.com> In-Reply-To: <1407519603-6635-1-git-send-email-kwolf@redhat.com> References: <1407519603-6635-1-git-send-email-kwolf@redhat.com> Subject: [Qemu-devel] [PULL 20/62] vmdk: Optimize cluster allocation List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: kwolf@redhat.com From: Fam Zheng This drops the unnecessary bdrv_truncate() from, and also improves, cluster allocation code path. Before, when we need a new cluster, get_cluster_offset truncates the image to bdrv_getlength() + cluster_size, and returns the offset of added area, i.e. the image length before truncating. This is not efficient, so it's now rewritten as: - Save the extent file length when opening. - When allocating cluster, use the saved length as cluster offset. - Don't truncate image, because we'll anyway write data there: just write any data at the EOF position, in descending priority: * New user data (cluster allocation happens in a write request). * Filling data in the beginning and/or ending of the new cluster, if not covered by user data: either backing file content (COW), or zero for standalone images. One major benifit of this change is, on host mounted NFS images, even over a fast network, ftruncate is slow (see the example below). This change significantly speeds up cluster allocation. Comparing by converting a cirros image (296M) to VMDK on an NFS mount point, over 1Gbe LAN: $ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk Before: real 0m21.796s user 0m0.130s sys 0m0.483s After: real 0m2.017s user 0m0.047s sys 0m0.190s We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and get a little more documentation in function comments. Tested that this passes qemu-iotests for all VMDK subformats. Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi --- block/vmdk.c | 222 +++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 140 insertions(+), 82 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 33d9bec..3700a11 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -106,6 +106,7 @@ typedef struct VmdkExtent { uint32_t l2_cache_counts[L2_CACHE_SIZE]; int64_t cluster_sectors; + int64_t next_cluster_sector; char *type; } VmdkExtent; @@ -124,7 +125,6 @@ typedef struct BDRVVmdkState { } BDRVVmdkState; typedef struct VmdkMetaData { - uint32_t offset; unsigned int l1_index; unsigned int l2_index; unsigned int l2_offset; @@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs, { VmdkExtent *extent; BDRVVmdkState *s = bs->opaque; + int64_t length; if (cluster_sectors > 0x200000) { /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ @@ -412,6 +413,11 @@ static int vmdk_add_extent(BlockDriverState *bs, return -EFBIG; } + length = bdrv_getlength(file); + if (length < 0) { + return length; + } + s->extents = g_realloc(s->extents, (s->num_extents + 1) * sizeof(VmdkExtent)); extent = &s->extents[s->num_extents]; @@ -427,6 +433,8 @@ static int vmdk_add_extent(BlockDriverState *bs, extent->l1_entry_sectors = l2_size * cluster_sectors; extent->l2_size = l2_size; extent->cluster_sectors = flat ? sectors : cluster_sectors; + extent->next_cluster_sector = + ROUND_UP(DIV_ROUND_UP(length, BDRV_SECTOR_SIZE), cluster_sectors); if (s->num_extents > 1) { extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; @@ -951,57 +959,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } +/** + * get_whole_cluster + * + * Copy backing file's cluster that covers @sector_num, otherwise write zero, + * to the cluster at @cluster_sector_num. + * + * If @skip_start_sector < @skip_end_sector, the relative range + * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave + * it for call to write user data in the request. + */ static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - bool allocate) + VmdkExtent *extent, + uint64_t cluster_sector_num, + uint64_t sector_num, + uint64_t skip_start_sector, + uint64_t skip_end_sector) { int ret = VMDK_OK; - uint8_t *whole_grain = NULL; + int64_t cluster_bytes; + uint8_t *whole_grain; + + /* For COW, align request sector_num to cluster start */ + sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); + cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; + whole_grain = qemu_blockalign(bs, cluster_bytes); + if (!bs->backing_hd) { + memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); + memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, + cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); + } + + assert(skip_end_sector <= extent->cluster_sectors); /* we will be here if it's first write on non-exist grain(cluster). * try to read from parent image, if exist */ - if (bs->backing_hd) { - whole_grain = - qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS); - if (!vmdk_is_cid_valid(bs)) { - ret = VMDK_ERROR; - goto exit; - } + if (bs->backing_hd && !vmdk_is_cid_valid(bs)) { + ret = VMDK_ERROR; + goto exit; + } - /* floor offset to cluster */ - offset -= offset % (extent->cluster_sectors * 512); - ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, - extent->cluster_sectors); + /* Read backing data before skip range */ + if (skip_start_sector > 0) { + if (bs->backing_hd) { + ret = bdrv_read(bs->backing_hd, sector_num, + whole_grain, skip_start_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + ret = bdrv_write(extent->file, cluster_sector_num, whole_grain, + skip_start_sector); if (ret < 0) { ret = VMDK_ERROR; goto exit; } - - /* Write grain only into the active image */ - ret = bdrv_write(extent->file, cluster_offset, whole_grain, - extent->cluster_sectors); + } + /* Read backing data after skip range */ + if (skip_end_sector < extent->cluster_sectors) { + if (bs->backing_hd) { + ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } + exit: qemu_vfree(whole_grain); return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + uint32_t offset) { - uint32_t offset; - QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset)); - offset = cpu_to_le32(m_data->offset); + offset = cpu_to_le32(offset); /* update L2 table */ if (bdrv_pwrite_sync( extent->file, ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(m_data->offset)), + + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { return VMDK_ERROR; } @@ -1011,7 +1059,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) if (bdrv_pwrite_sync( extent->file, ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(m_data->offset)), + + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { return VMDK_ERROR; } @@ -1023,17 +1071,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) return VMDK_OK; } +/** + * get_cluster_offset + * + * Look up cluster offset in extent file by sector number, and store in + * @cluster_offset. + * + * For flat extents, the start offset as parsed from the description file is + * returned. + * + * For sparse extents, look up in L1, L2 table. If allocate is true, return an + * offset for a new cluster and update L2 cache. If there is a backing file, + * COW is done before returning; otherwise, zeroes are written to the allocated + * cluster. Both COW and zero writing skips the sector range + * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller + * has new data to write there. + * + * Returns: VMDK_OK if cluster exists and mapped in the image. + * VMDK_UNALLOC if cluster is not mapped and @allocate is false. + * VMDK_ERROR if failed. + */ static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - int allocate, - uint64_t *cluster_offset) + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_sector, + uint64_t skip_end_sector) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; uint32_t min_count, *l2_table; bool zeroed = false; + int64_t ret; + int32_t cluster_sector; if (m_data) { m_data->valid = 0; @@ -1087,52 +1159,41 @@ static int get_cluster_offset(BlockDriverState *bs, extent->l2_cache_counts[min_index] = 1; found: l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; - *cluster_offset = le32_to_cpu(l2_table[l2_index]); + cluster_sector = le32_to_cpu(l2_table[l2_index]); if (m_data) { m_data->valid = 1; m_data->l1_index = l1_index; m_data->l2_index = l2_index; - m_data->offset = *cluster_offset; m_data->l2_offset = l2_offset; m_data->l2_cache_entry = &l2_table[l2_index]; } - if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) { + if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { zeroed = true; } - if (!*cluster_offset || zeroed) { + if (!cluster_sector || zeroed) { if (!allocate) { return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } - /* Avoid the L2 tables update for the images that have snapshots. */ - *cluster_offset = bdrv_getlength(extent->file); - if (!extent->compressed) { - bdrv_truncate( - extent->file, - *cluster_offset + (extent->cluster_sectors << 9) - ); - } - - *cluster_offset >>= 9; - l2_table[l2_index] = cpu_to_le32(*cluster_offset); + cluster_sector = extent->next_cluster_sector; + extent->next_cluster_sector += extent->cluster_sectors; /* First of all we write grain itself, to avoid race condition * that may to corrupt the image. * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ - if (get_whole_cluster( - bs, extent, *cluster_offset, offset, allocate) == -1) { - return VMDK_ERROR; - } - - if (m_data) { - m_data->offset = *cluster_offset; + ret = get_whole_cluster(bs, extent, + cluster_sector, + offset >> BDRV_SECTOR_BITS, + skip_start_sector, skip_end_sector); + if (ret) { + return ret; } } - *cluster_offset <<= 9; + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1167,7 +1228,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, } qemu_co_mutex_lock(&s->lock); ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, 0, &offset); + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); switch (ret) { @@ -1320,9 +1382,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num, if (!extent) { return -EIO; } - ret = get_cluster_offset( - bs, extent, NULL, - sector_num << 9, 0, &cluster_offset); + ret = get_cluster_offset(bs, extent, NULL, + sector_num << 9, false, &cluster_offset, + 0, 0); extent_begin_sector = extent->end_sector - extent->sectors; extent_relative_sector_num = sector_num - extent_begin_sector; index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; @@ -1403,12 +1465,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (!extent) { return -EIO; } - ret = get_cluster_offset( - bs, - extent, - &m_data, - sector_num << 9, !extent->compressed, - &cluster_offset); + extent_begin_sector = extent->end_sector - extent->sectors; + extent_relative_sector_num = sector_num - extent_begin_sector; + index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + !(extent->compressed || zeroed), + &cluster_offset, + index_in_cluster, index_in_cluster + n); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1417,24 +1484,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -EIO; } else { /* allocate */ - ret = get_cluster_offset( - bs, - extent, - &m_data, - sector_num << 9, 1, - &cluster_offset); + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { return -EINVAL; } - extent_begin_sector = extent->end_sector - extent->sectors; - extent_relative_sector_num = sector_num - extent_begin_sector; - index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } if (zeroed) { /* Do zeroed write, buf is ignored */ if (extent->has_zero_grain && @@ -1442,9 +1498,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, n >= extent->cluster_sectors) { n = extent->cluster_sectors; if (!zero_dry_run) { - m_data.offset = VMDK_GTE_ZEROED; /* update L2 tables */ - if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) + != VMDK_OK) { return -EIO; } } @@ -1460,7 +1516,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, } if (m_data.valid) { /* update L2 tables */ - if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + if (vmdk_L2update(extent, &m_data, + cluster_offset >> BDRV_SECTOR_BITS) + != VMDK_OK) { return -EIO; } } @@ -2019,7 +2077,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, } ret = get_cluster_offset(bs, extent, NULL, sector_num << BDRV_SECTOR_BITS, - 0, &cluster_offset); + false, &cluster_offset, 0, 0); if (ret == VMDK_ERROR) { fprintf(stderr, "ERROR: could not get cluster_offset for sector %" -- 1.8.3.1