[Qemu-devel] Re: [PATCH v4 2/5] qed: Add QEMU Enhanced Disk image format

From: Kevin Wolf <kwolf@redhat.com>
To: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>,
	Christoph Hellwig <hch@lst.de>,
	qemu-devel@nongnu.org, Avi Kivity <avi@redhat.com>
Subject: [Qemu-devel] Re: [PATCH v4 2/5] qed: Add QEMU Enhanced Disk image format
Date: Fri, 12 Nov 2010 16:43:01 +0100	[thread overview]
Message-ID: <4CDD6085.8020007@redhat.com> (raw)
In-Reply-To: <1288263684-18892-3-git-send-email-stefanha@linux.vnet.ibm.com>

Am 28.10.2010 13:01, schrieb Stefan Hajnoczi:
> This patch introduces the qed on-disk layout and implements image
> creation.  Later patches add read/write and other functionality.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  Makefile.objs |    1 +
>  block/qed.c   |  548 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qed.h   |  145 +++++++++++++++
>  block_int.h   |    1 +
>  4 files changed, 695 insertions(+), 0 deletions(-)
>  create mode 100644 block/qed.c
>  create mode 100644 block/qed.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index f07fb01..7bae72a 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -14,6 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>  
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
> +block-nested-y += qed.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
> diff --git a/block/qed.c b/block/qed.c
> new file mode 100644
> index 0000000..8469cf0
> --- /dev/null
> +++ b/block/qed.c
> @@ -0,0 +1,548 @@
> +/*
> + * QEMU Enhanced Disk Format
> + *
> + * Copyright IBM, Corp. 2010
> + *
> + * Authors:
> + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> + *  Anthony Liguori   <aliguori@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "qed.h"
> +
> +static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
> +                          const char *filename)
> +{
> +    const QEDHeader *header = (const QEDHeader *)buf;
> +
> +    if (buf_size < sizeof(*header)) {
> +        return 0;
> +    }
> +    if (le32_to_cpu(header->magic) != QED_MAGIC) {
> +        return 0;
> +    }
> +    return 100;
> +}
> +
> +/**
> + * Check whether an image format is raw
> + *
> + * @fmt:    Backing file format, may be NULL
> + */
> +static bool qed_fmt_is_raw(const char *fmt)
> +{
> +    return fmt && strcmp(fmt, "raw") == 0;
> +}

Should we also consider file, host_device, etc. as raw here? People
shouldn't use those directly, but they can still be specified.

> +
> +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
> +{
> +    cpu->magic = le32_to_cpu(le->magic);
> +    cpu->cluster_size = le32_to_cpu(le->cluster_size);
> +    cpu->table_size = le32_to_cpu(le->table_size);
> +    cpu->header_size = le32_to_cpu(le->header_size);
> +    cpu->features = le64_to_cpu(le->features);
> +    cpu->compat_features = le64_to_cpu(le->compat_features);
> +    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
> +    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
> +    cpu->image_size = le64_to_cpu(le->image_size);
> +    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
> +    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
> +}
> +
> +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
> +{
> +    le->magic = cpu_to_le32(cpu->magic);
> +    le->cluster_size = cpu_to_le32(cpu->cluster_size);
> +    le->table_size = cpu_to_le32(cpu->table_size);
> +    le->header_size = cpu_to_le32(cpu->header_size);
> +    le->features = cpu_to_le64(cpu->features);
> +    le->compat_features = cpu_to_le64(cpu->compat_features);
> +    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
> +    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
> +    le->image_size = cpu_to_le64(cpu->image_size);
> +    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
> +    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
> +}
> +
> +static int qed_write_header_sync(BDRVQEDState *s)
> +{
> +    QEDHeader le;
> +    int ret;
> +
> +    qed_header_cpu_to_le(&s->header, &le);
> +    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
> +    if (ret != sizeof(le)) {
> +        return ret;
> +    }
> +    return 0;
> +}
> +
> +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
> +{
> +    uint64_t table_entries;
> +    uint64_t l2_size;
> +
> +    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
> +    l2_size = table_entries * cluster_size;
> +
> +    return l2_size * table_entries;
> +}
> +
> +static bool qed_is_cluster_size_valid(uint32_t cluster_size)
> +{
> +    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
> +        cluster_size > QED_MAX_CLUSTER_SIZE) {
> +        return false;
> +    }
> +    if (cluster_size & (cluster_size - 1)) {
> +        return false; /* not power of 2 */
> +    }
> +    return true;
> +}
> +
> +static bool qed_is_table_size_valid(uint32_t table_size)
> +{
> +    if (table_size < QED_MIN_TABLE_SIZE ||
> +        table_size > QED_MAX_TABLE_SIZE) {
> +        return false;
> +    }
> +    if (table_size & (table_size - 1)) {
> +        return false; /* not power of 2 */
> +    }
> +    return true;
> +}
> +
> +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
> +                                    uint32_t table_size)
> +{
> +    if (image_size % BDRV_SECTOR_SIZE != 0) {
> +        return false; /* not multiple of sector size */
> +    }
> +    if (image_size > qed_max_image_size(cluster_size, table_size)) {
> +        return false; /* image is too large */
> +    }
> +    return true;
> +}
> +
> +/**
> + * Read a string of known length from the image file
> + *
> + * @file:       Image file
> + * @offset:     File offset to start of string, in bytes
> + * @n:          String length in bytes
> + * @buf:        Destination buffer
> + * @buflen:     Destination buffer length in bytes
> + * @ret:        0 on success, -errno on failure
> + *
> + * The string is NUL-terminated.
> + */
> +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
> +                           char *buf, size_t buflen)
> +{
> +    int ret;
> +    if (n >= buflen) {
> +        return -EINVAL;
> +    }
> +    ret = bdrv_pread(file, offset, buf, n);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    buf[n] = '\0';
> +    return 0;
> +}
> +
> +static int bdrv_qed_open(BlockDriverState *bs, int flags)
> +{
> +    BDRVQEDState *s = bs->opaque;
> +    QEDHeader le_header;
> +    int64_t file_size;
> +    int ret;
> +
> +    s->bs = bs;
> +
> +    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    ret = 0; /* ret should always be 0 or -errno */
> +    qed_header_le_to_cpu(&le_header, &s->header);
> +
> +    if (s->header.magic != QED_MAGIC) {
> +        return -ENOENT;
> +    }

ENOENT seems a bit odd for a wrong magic number, especially if it's used
for the error message. Wouldn't EINVAL or ENOTSUP be a closer match?

> +    if (s->header.features & ~QED_FEATURE_MASK) {
> +        return -ENOTSUP; /* image uses unsupported feature bits */
> +    }
> +    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
> +        return -EINVAL;
> +    }
> +
> +    /* Round down file size to the last cluster */
> +    file_size = bdrv_getlength(bs->file);
> +    if (file_size < 0) {
> +        return file_size;
> +    }
> +    s->file_size = qed_start_of_cluster(s, file_size);
> +
> +    if (!qed_is_table_size_valid(s->header.table_size)) {
> +        return -EINVAL;
> +    }
> +    if (!qed_is_image_size_valid(s->header.image_size,
> +                                 s->header.cluster_size,
> +                                 s->header.table_size)) {
> +        return -EINVAL;
> +    }
> +    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
> +        return -EINVAL;
> +    }
> +
> +    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
> +                      sizeof(uint64_t);
> +    s->l2_shift = ffs(s->header.cluster_size) - 1;
> +    s->l2_mask = s->table_nelems - 1;
> +    s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
> +
> +    if ((s->header.features & QED_F_BACKING_FILE)) {
> +        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
> +                              s->header.backing_filename_size, bs->backing_file,
> +                              sizeof(bs->backing_file));
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
> +            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
> +        }
> +    }
> +
> +    /* Reset unknown autoclear feature bits.  This is a backwards
> +     * compatibility mechanism that allows images to be opened by older
> +     * programs, which "knock out" unknown feature bits.  When an image is
> +     * opened by a newer program again it can detect that the autoclear
> +     * feature is no longer valid.
> +     */
> +    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
> +        !bdrv_is_read_only(bs->file)) {
> +        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
> +
> +        ret = qed_write_header_sync(s);
> +        if (ret) {
> +            return ret;
> +        }
> +
> +        /* From here on only known autoclear feature bits are valid */
> +        bdrv_flush(bs->file);
> +    }
> +
> +    return ret;
> +}
> +
> +static void bdrv_qed_close(BlockDriverState *bs)
> +{
> +}
> +
> +static void bdrv_qed_flush(BlockDriverState *bs)
> +{
> +    bdrv_flush(bs->file);
> +}

This conflicts with one of my recent changes: bdrv_flush returns an int
now, so this function should return an int as well.

> +
> +static int qed_create(const char *filename, uint32_t cluster_size,
> +                      uint64_t image_size, uint32_t table_size,
> +                      const char *backing_file, const char *backing_fmt)
> +{
> +    QEDHeader header = {
> +        .magic = QED_MAGIC,
> +        .cluster_size = cluster_size,
> +        .table_size = table_size,
> +        .header_size = 1,
> +        .features = 0,
> +        .compat_features = 0,
> +        .l1_table_offset = cluster_size,
> +        .image_size = image_size,
> +    };
> +    QEDHeader le_header;
> +    uint8_t *l1_table = NULL;
> +    size_t l1_size = header.cluster_size * header.table_size;
> +    int ret = 0;
> +    BlockDriverState *bs = NULL;
> +
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    if (backing_file) {
> +        header.features |= QED_F_BACKING_FILE;
> +        header.backing_filename_offset = sizeof(le_header);
> +        header.backing_filename_size = strlen(backing_file);
> +
> +        if (qed_fmt_is_raw(backing_fmt)) {
> +            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
> +        }
> +    }
> +
> +    qed_header_cpu_to_le(&header, &le_header);
> +    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
> +                      header.backing_filename_size);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    l1_table = qemu_mallocz(l1_size);
> +    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = 0; /* success */
> +out:
> +    qemu_free(l1_table);
> +    bdrv_delete(bs);
> +    return ret;
> +}
> +
> +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    uint64_t image_size = 0;
> +    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
> +    uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
> +    const char *backing_file = NULL;
> +    const char *backing_fmt = NULL;
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_size = options->value.n;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_file = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
> +            backing_fmt = options->value.s;

I'm not sure here. It doesn't really matter for QED, but I find it
strange that -o backing_fmt=foobar works and gives you a non-raw backing
file. Should we at least check that the format exists?

Kevin