All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
@ 2022-08-26 16:17 Sam Li
  2022-08-29 19:29 ` Stefan Hajnoczi
  2022-08-30 11:57 ` Markus Armbruster
  0 siblings, 2 replies; 14+ messages in thread
From: Sam Li @ 2022-08-26 16:17 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, damien.lemoal, Dmitry.Fomichev, hare, qemu-block,
	hreitz, eblake, armbru, fam, kwolf, Sam Li

By adding zone management operations in BlockDriver, storage controller
emulation can use the new block layer APIs including Report Zone and
four zone management operations (open, close, finish, reset).

Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
zone_close(zc), zone_reset(zrs), zone_finish(zf).

For example, to test zone_report, use following command:
$ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
-c "zrp offset nr_zones"

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 block/block-backend.c             |  51 +++++
 block/file-posix.c                | 326 +++++++++++++++++++++++++++++-
 block/io.c                        |  41 ++++
 include/block/block-io.h          |   7 +
 include/block/block_int-common.h  |  21 ++
 include/block/raw-aio.h           |   6 +-
 include/sysemu/block-backend-io.h |  17 ++
 meson.build                       |   1 +
 qapi/block-core.json              |   8 +-
 qemu-io-cmds.c                    | 143 +++++++++++++
 10 files changed, 617 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index d4a5df2ac2..c5798651df 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1775,6 +1775,57 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
     return ret;
 }
 
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * op is the zone operation.
+ * offset is the starting zone specified as a sector offset.
+ * len is the maximum number of sectors the command should operate on. It
+ * should be aligned with the zone sector size.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    IO_CODE();
+
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 0a8b4b426e..e3efba6db7 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,9 @@
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
             PreallocMode prealloc;
             Error **errp;
         } truncate;
+        struct {
+            unsigned int *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        struct {
+            unsigned long zone_op;
+        } zone_mgmt;
     };
 } RawPosixAIOData;
 
@@ -1339,7 +1349,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 #endif
 
     if (bs->sg || S_ISBLK(st.st_mode)) {
-        int ret = hdev_get_max_hw_transfer(s->fd, &st);
+        ret = hdev_get_max_hw_transfer(s->fd, &st);
 
         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
             bs->bl.max_hw_transfer = ret;
@@ -1356,6 +1366,27 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
         zoned = BLK_Z_NONE;
     }
     bs->bl.zoned = zoned;
+    if (zoned != BLK_Z_NONE) {
+        ret = get_sysfs_long_val(&st, "chunk_sectors");
+        if (ret > 0) {
+            bs->bl.zone_sectors = ret;
+        }
+
+        ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
+        if (ret > 0) {
+            bs->bl.zone_append_max_bytes = ret;
+        }
+
+        ret = get_sysfs_long_val(&st, "max_open_zones");
+        if (ret >= 0) {
+            bs->bl.max_open_zones = ret;
+        }
+
+        ret = get_sysfs_long_val(&st, "max_active_zones");
+        if (ret >= 0) {
+            bs->bl.max_active_zones = ret;
+        }
+    }
 }
 
 static int check_for_dasd(int fd)
@@ -1850,6 +1881,136 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+#if defined(CONFIG_BLKZONED)
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+                              const struct blk_zone *blkz) {
+    zone->start = blkz->start;
+    zone->length = blkz->len;
+    zone->cap = blkz->capacity;
+    zone->wp = blkz->wp;
+
+    switch (blkz->type) {
+    case BLK_ZONE_TYPE_SEQWRITE_REQ:
+        zone->type = BLK_ZT_SWR;
+        break;
+    case BLK_ZONE_TYPE_SEQWRITE_PREF:
+        zone->type = BLK_ZT_SWP;
+        break;
+    case BLK_ZONE_TYPE_CONVENTIONAL:
+        zone->type = BLK_ZT_CONV;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    switch (blkz->cond) {
+    case BLK_ZONE_COND_NOT_WP:
+        zone->cond = BLK_ZS_NOT_WP;
+        break;
+    case BLK_ZONE_COND_EMPTY:
+        zone->cond = BLK_ZS_EMPTY;
+        break;
+    case BLK_ZONE_COND_IMP_OPEN:
+        zone->cond =BLK_ZS_IOPEN;
+        break;
+    case BLK_ZONE_COND_EXP_OPEN:
+        zone->cond = BLK_ZS_EOPEN;
+        break;
+    case BLK_ZONE_COND_CLOSED:
+        zone->cond = BLK_ZS_CLOSED;
+        break;
+    case BLK_ZONE_COND_READONLY:
+        zone->cond = BLK_ZS_RDONLY;
+        break;
+    case BLK_ZONE_COND_FULL:
+        zone->cond = BLK_ZS_FULL;
+        break;
+    case BLK_ZONE_COND_OFFLINE:
+        zone->cond = BLK_ZS_OFFLINE;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+#endif
+
+static int handle_aiocb_zone_report(void *opaque) {
+#if defined(CONFIG_BLKZONED)
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    /* zoned block devices use 512-byte sectors */
+    int64_t sector = aiocb->aio_offset / 512;
+
+    struct blk_zone *blkz;
+    int64_t rep_size;
+    unsigned int nrz;
+    int ret, n = 0, i = 0;
+
+    nrz = *nr_zones;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+    rep = g_malloc(rep_size);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                         fd, sector, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            parse_zone(&zones[n], &blkz[i]);
+            /* The next report should start after the last zone reported */
+            sector = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+#else
+    return -ENOTSUP;
+#endif
+}
+
+static int handle_aiocb_zone_mgmt(void *opaque) {
+#if defined(CONFIG_BLKZONED)
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t sector = aiocb->aio_offset;
+    int64_t nr_sectors = aiocb->aio_nbytes;
+    struct blk_zone_range range;
+    int ret;
+
+    /* Execute the operation */
+    range.sector = sector;
+    range.nr_sectors = nr_sectors;
+    do {
+        ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
+    } while (ret != 0 && errno == EINTR);
+
+    return ret;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static int handle_aiocb_copy_range(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
@@ -3022,6 +3183,118 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
     }
 }
 
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ *
+ * @param bs: passing zone block device file descriptor
+ * @param zones: an array of zone descriptors to hold zone
+ * information on reply
+ * @param offset: offset can be any byte within the zone size.
+ * @param len: (not sure yet.
+ * @return 0 on success, -1 on failure
+ */
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                           unsigned int *nr_zones,
+                                           BlockZoneDescriptor *zones) {
+#if defined(CONFIG_BLKZONED)
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs         = bs,
+        .aio_fildes = s->fd,
+        .aio_type   = QEMU_AIO_ZONE_REPORT,
+        .aio_offset = offset,
+        .zone_report    = {
+                .nr_zones       = nr_zones,
+                .zones          = zones,
+        },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
+#else
+    return -ENOTSUP;
+#endif
+}
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len) {
+#if defined(CONFIG_BLKZONED)
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+    int64_t zone_sector, zone_sector_mask;
+    const char *ioctl_name;
+    unsigned long zone_op;
+    int ret;
+
+    struct stat st;
+    if (fstat(s->fd, &st) < 0) {
+        ret = -errno;
+        return ret;
+    }
+    zone_sector = bs->bl.zone_sectors;
+    zone_sector_mask = zone_sector - 1;
+    if (offset & zone_sector_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId64 "", offset, zone_sector);
+        return -EINVAL;
+    }
+
+    if (len & zone_sector_mask) {
+        error_report("number of sectors %" PRId64 " is not aligned to zone size"
+                      " %" PRId64 "", len, zone_sector);
+        return -EINVAL;
+    }
+
+    switch (op) {
+    case BLK_ZO_OPEN:
+        ioctl_name = "BLKOPENZONE";
+        zone_op = BLKOPENZONE;
+        break;
+    case BLK_ZO_CLOSE:
+        ioctl_name = "BLKCLOSEZONE";
+        zone_op = BLKCLOSEZONE;
+        break;
+    case BLK_ZO_FINISH:
+        ioctl_name = "BLKFINISHZONE";
+        zone_op = BLKFINISHZONE;
+        break;
+    case BLK_ZO_RESET:
+        ioctl_name = "BLKRESETZONE";
+        zone_op = BLKRESETZONE;
+        break;
+    default:
+        error_report("Invalid zone operation 0x%x", op);
+        return -EINVAL;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs             = bs,
+        .aio_fildes     = s->fd,
+        .aio_type       = QEMU_AIO_ZONE_MGMT,
+        .aio_offset     = offset,
+        .aio_nbytes     = len,
+        .zone_mgmt  = {
+                .zone_op = zone_op,
+        },
+    };
+
+    ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
+    if (ret != 0) {
+        error_report("ioctl %s failed %d", ioctl_name, errno);
+        return -errno;
+    }
+
+    return ret;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
 #endif
 };
 
+#if defined(CONFIG_BLKZONED)
+static BlockDriver bdrv_zoned_host_device = {
+        .format_name = "zoned_host_device",
+        .protocol_name = "zoned_host_device",
+        .instance_size = sizeof(BDRVRawState),
+        .bdrv_needs_filename = true,
+        .bdrv_probe_device  = hdev_probe_device,
+        .bdrv_file_open     = hdev_open,
+        .bdrv_close         = raw_close,
+        .bdrv_reopen_prepare = raw_reopen_prepare,
+        .bdrv_reopen_commit  = raw_reopen_commit,
+        .bdrv_reopen_abort   = raw_reopen_abort,
+        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+        .create_opts         = &bdrv_create_opts_simple,
+        .mutable_opts        = mutable_opts,
+        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
+        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+
+        .bdrv_co_preadv         = raw_co_preadv,
+        .bdrv_co_pwritev        = raw_co_pwritev,
+        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
+        .bdrv_co_pdiscard       = hdev_co_pdiscard,
+        .bdrv_co_copy_range_from = raw_co_copy_range_from,
+        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
+        .bdrv_refresh_limits = raw_refresh_limits,
+        .bdrv_io_plug = raw_aio_plug,
+        .bdrv_io_unplug = raw_aio_unplug,
+        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+
+        .bdrv_co_truncate       = raw_co_truncate,
+        .bdrv_getlength = raw_getlength,
+        .bdrv_get_info = raw_get_info,
+        .bdrv_get_allocated_file_size
+                            = raw_get_allocated_file_size,
+        .bdrv_get_specific_stats = hdev_get_specific_stats,
+        .bdrv_check_perm = raw_check_perm,
+        .bdrv_set_perm   = raw_set_perm,
+        .bdrv_abort_perm_update = raw_abort_perm_update,
+        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+        .bdrv_probe_geometry = hdev_probe_geometry,
+        .bdrv_co_ioctl = hdev_co_ioctl,
+
+        /* zone management operations */
+        .bdrv_co_zone_report = raw_co_zone_report,
+        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+};
+#endif
+
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                  Error **errp)
@@ -4012,6 +4333,9 @@ static void bdrv_file_init(void)
     bdrv_register(&bdrv_file);
 #if defined(HAVE_HOST_BLOCK_DEVICE)
     bdrv_register(&bdrv_host_device);
+#if defined(CONFIG_BLKZONED)
+    bdrv_register(&bdrv_zoned_host_device);
+#endif
 #ifdef __linux__
     bdrv_register(&bdrv_host_cdrom);
 #endif
diff --git a/block/io.c b/block/io.c
index 0a8cbefe86..de9ec1d740 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3198,6 +3198,47 @@ out:
     return co.ret;
 }
 
+int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                        unsigned int *nr_zones,
+                        BlockZoneDescriptor *zones)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_report) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_mgmt) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
     IO_CODE();
diff --git a/include/block/block-io.h b/include/block/block-io.h
index fd25ffa9be..65463b88d9 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
 /* Ensure contents are flushed to disk.  */
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
 
+/* Report zone information of zone block device. */
+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                     unsigned int *nr_zones,
+                                     BlockZoneDescriptor *zones);
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+                                   int64_t offset, int64_t len);
+
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
 int bdrv_block_status(BlockDriverState *bs, int64_t offset,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 7f7863cc9e..8541f36123 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -691,6 +691,12 @@ struct BlockDriver {
                                           QEMUIOVector *qiov,
                                           int64_t pos);
 
+    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+            int64_t offset, unsigned int *nr_zones,
+            BlockZoneDescriptor *zones);
+    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
+            int64_t offset, int64_t len);
+
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
     void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
@@ -828,6 +834,21 @@ typedef struct BlockLimits {
 
     /* device zone model */
     BlockZoneModel zoned;
+
+    /* zone size expressed in 512-byte sectors */
+    uint32_t zone_sectors;
+
+    /* total number of zones */
+    unsigned int nr_zones;
+
+    /* maximum size in bytes of a zone append write operation */
+    int64_t zone_append_max_bytes;
+
+    /* maximum number of open zones */
+    int64_t max_open_zones;
+
+    /* maximum number of active zones */
+    int64_t max_active_zones;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 21fc10c4c9..3d26929cdd 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -29,6 +29,8 @@
 #define QEMU_AIO_WRITE_ZEROES 0x0020
 #define QEMU_AIO_COPY_RANGE   0x0040
 #define QEMU_AIO_TRUNCATE     0x0080
+#define QEMU_AIO_ZONE_REPORT  0x0100
+#define QEMU_AIO_ZONE_MGMT    0x0200
 #define QEMU_AIO_TYPE_MASK \
         (QEMU_AIO_READ | \
          QEMU_AIO_WRITE | \
@@ -37,7 +39,9 @@
          QEMU_AIO_DISCARD | \
          QEMU_AIO_WRITE_ZEROES | \
          QEMU_AIO_COPY_RANGE | \
-         QEMU_AIO_TRUNCATE)
+         QEMU_AIO_TRUNCATE  | \
+         QEMU_AIO_ZONE_REPORT | \
+         QEMU_AIO_ZONE_MGMT)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index 50f5aa2e07..6835525582 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -45,6 +45,12 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                             BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                           BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+                                unsigned int *nr_zones, BlockZoneDescriptor *zones,
+                                BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                              int64_t offset, int64_t len,
+                              BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
                              BlockCompletionFunc *cb, void *opaque);
 void blk_aio_cancel_async(BlockAIOCB *acb);
@@ -156,6 +162,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                       int64_t bytes, BdrvRequestFlags flags);
 
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones);
+int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
+                                         unsigned int *nr_zones,
+                                         BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                                  int64_t offset, int64_t len);
+int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                                       int64_t offset, int64_t len);
+
 int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
                                       int64_t bytes);
 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
diff --git a/meson.build b/meson.build
index 20fddbd707..2f436bb355 100644
--- a/meson.build
+++ b/meson.build
@@ -1883,6 +1883,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('live_block_migration').al
 # has_header
 config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
 config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
+config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
 config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h'))
 config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
 config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 2173e7734a..c6bbb7a037 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2942,6 +2942,7 @@
 # @compress: Since 5.0
 # @copy-before-write: Since 6.2
 # @snapshot-access: Since 7.0
+# @zoned_host_device: Since 7.2
 #
 # Since: 2.9
 ##
@@ -2955,7 +2956,8 @@
             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
             'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
             { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
-            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
+            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
 
 ##
 # @BlockdevOptionsFile:
@@ -4329,7 +4331,9 @@
       'vhdx':       'BlockdevOptionsGenericFormat',
       'vmdk':       'BlockdevOptionsGenericCOWFormat',
       'vpc':        'BlockdevOptionsGenericFormat',
-      'vvfat':      'BlockdevOptionsVVFAT'
+      'vvfat':      'BlockdevOptionsVVFAT',
+      'zoned_host_device': { 'type': 'BlockdevOptionsFile',
+                             'if': 'CONFIG_BLKZONED' }
   } }
 
 ##
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 952dc940f1..446a059603 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1712,6 +1712,144 @@ static const cmdinfo_t flush_cmd = {
     .oneline    = "flush all in-core file state to disk",
 };
 
+static int zone_report_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset;
+    unsigned int nr_zones;
+
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    nr_zones = cvtnum(argv[optind]);
+
+    g_autofree BlockZoneDescriptor *zones = NULL;
+    zones = g_new(BlockZoneDescriptor, nr_zones);
+    ret = blk_zone_report(blk, offset, &nr_zones, zones);
+    if (ret < 0) {
+        printf("zone report failed: %s\n", strerror(-ret));
+    } else {
+        for (int i = 0; i < nr_zones; ++i) {
+            printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
+                   "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
+                   "zcond:%u, [type: %u]\n",
+                   zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
+                   zones[i].cond, zones[i].type);
+        }
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_report_cmd = {
+        .name = "zone_report",
+        .altname = "zrp",
+        .cfunc = zone_report_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset number",
+        .oneline = "report zone information",
+};
+
+static int zone_open_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
+    if (ret < 0) {
+        printf("zone open failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_open_cmd = {
+        .name = "zone_open",
+        .altname = "zo",
+        .cfunc = zone_open_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "explicit open a range of zones in zone block device",
+};
+
+static int zone_close_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
+    if (ret < 0) {
+        printf("zone close failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_close_cmd = {
+        .name = "zone_close",
+        .altname = "zc",
+        .cfunc = zone_close_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "close a range of zones in zone block device",
+};
+
+static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
+    if (ret < 0) {
+        printf("zone finish failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_finish_cmd = {
+        .name = "zone_finish",
+        .altname = "zf",
+        .cfunc = zone_finish_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "finish a range of zones in zone block device",
+};
+
+static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
+    if (ret < 0) {
+        printf("zone reset failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_reset_cmd = {
+        .name = "zone_reset",
+        .altname = "zrs",
+        .cfunc = zone_reset_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "reset a zone write pointer in zone block device",
+};
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
     .name       = "truncate",
@@ -2504,6 +2642,11 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&aio_write_cmd);
     qemuio_add_command(&aio_flush_cmd);
     qemuio_add_command(&flush_cmd);
+    qemuio_add_command(&zone_report_cmd);
+    qemuio_add_command(&zone_open_cmd);
+    qemuio_add_command(&zone_close_cmd);
+    qemuio_add_command(&zone_finish_cmd);
+    qemuio_add_command(&zone_reset_cmd);
     qemuio_add_command(&truncate_cmd);
     qemuio_add_command(&length_cmd);
     qemuio_add_command(&info_cmd);
-- 
2.37.2



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-26 16:17 [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
@ 2022-08-29 19:29 ` Stefan Hajnoczi
  2022-08-30 11:57 ` Markus Armbruster
  1 sibling, 0 replies; 14+ messages in thread
From: Stefan Hajnoczi @ 2022-08-29 19:29 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, damien.lemoal, Dmitry.Fomichev, hare, qemu-block,
	hreitz, eblake, armbru, fam, kwolf

[-- Attachment #1: Type: text/plain, Size: 4655 bytes --]

On Sat, Aug 27, 2022 at 12:17:04AM +0800, Sam Li wrote:
> +/*
> + * Send a zone_management command.
> + * op is the zone operation.
> + * offset is the starting zone specified as a sector offset.

Does "sector offset" mean "byte offset from the start of the device" or
does it mean in 512B sector units? For consistency this should be in
bytes.

> + * len is the maximum number of sectors the command should operate on. It
> + * should be aligned with the zone sector size.

Please use bytes for consistency with QEMU's block layer APIs.

> @@ -3022,6 +3183,118 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>      }
>  }
>  
> +/*
> + * zone report - Get a zone block device's information in the form
> + * of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: an array of zone descriptors to hold zone
> + * information on reply
> + * @param offset: offset can be any byte within the zone size.

This isn't an offset within a zone, it's an offset within the entire
device, so I think "zone size" is confusing here.

> + * @param len: (not sure yet.

Please remove this and document nr_zones instead.

> + * @return 0 on success, -1 on failure
> + */
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                                           unsigned int *nr_zones,
> +                                           BlockZoneDescriptor *zones) {
> +#if defined(CONFIG_BLKZONED)
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_ZONE_REPORT,
> +        .aio_offset = offset,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
> +/*
> + * zone management operations - Execute an operation on a zone
> + */
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> +        int64_t offset, int64_t len) {
> +#if defined(CONFIG_BLKZONED)
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +    int64_t zone_sector, zone_sector_mask;
> +    const char *ioctl_name;
> +    unsigned long zone_op;
> +    int ret;
> +
> +    struct stat st;
> +    if (fstat(s->fd, &st) < 0) {
> +        ret = -errno;
> +        return ret;
> +    }

st is not used and can be removed.

> +    zone_sector = bs->bl.zone_sectors;
> +    zone_sector_mask = zone_sector - 1;
> +    if (offset & zone_sector_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId64 "", offset, zone_sector);
> +        return -EINVAL;
> +    }
> +
> +    if (len & zone_sector_mask) {
> +        error_report("number of sectors %" PRId64 " is not aligned to zone size"
> +                      " %" PRId64 "", len, zone_sector);
> +        return -EINVAL;
> +    }
> +
> +    switch (op) {
> +    case BLK_ZO_OPEN:
> +        ioctl_name = "BLKOPENZONE";
> +        zone_op = BLKOPENZONE;
> +        break;
> +    case BLK_ZO_CLOSE:
> +        ioctl_name = "BLKCLOSEZONE";
> +        zone_op = BLKCLOSEZONE;
> +        break;
> +    case BLK_ZO_FINISH:
> +        ioctl_name = "BLKFINISHZONE";
> +        zone_op = BLKFINISHZONE;
> +        break;
> +    case BLK_ZO_RESET:
> +        ioctl_name = "BLKRESETZONE";
> +        zone_op = BLKRESETZONE;
> +        break;
> +    default:
> +        error_report("Invalid zone operation 0x%x", op);
> +        return -EINVAL;
> +    }
> +
> +    acb = (RawPosixAIOData) {
> +        .bs             = bs,
> +        .aio_fildes     = s->fd,
> +        .aio_type       = QEMU_AIO_ZONE_MGMT,
> +        .aio_offset     = offset,
> +        .aio_nbytes     = len,
> +        .zone_mgmt  = {
> +                .zone_op = zone_op,
> +        },
> +    };
> +
> +    ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> +    if (ret != 0) {
> +        error_report("ioctl %s failed %d", ioctl_name, errno);
> +        return -errno;

ret contains a negative errno value. The errno variable is not used by
raw_thread_pool_submit().

I suggest simplifying it to:

  return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);

That's what most of the other raw_thread_pool_submit() callers.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-26 16:17 [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
  2022-08-29 19:29 ` Stefan Hajnoczi
@ 2022-08-30 11:57 ` Markus Armbruster
  2022-08-30 15:05   ` Sam Li
  1 sibling, 1 reply; 14+ messages in thread
From: Markus Armbruster @ 2022-08-30 11:57 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, stefanha, damien.lemoal, Dmitry.Fomichev, hare,
	qemu-block, hreitz, eblake, fam, kwolf

Sam Li <faithilikerun@gmail.com> writes:

> By adding zone management operations in BlockDriver, storage controller
> emulation can use the new block layer APIs including Report Zone and
> four zone management operations (open, close, finish, reset).
>
> Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
> zone_close(zc), zone_reset(zrs), zone_finish(zf).
>
> For example, to test zone_report, use following command:
> $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
> -c "zrp offset nr_zones"
>
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>

[...]

> diff --git a/block/file-posix.c b/block/file-posix.c
> index 0a8b4b426e..e3efba6db7 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c

[...]

> @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
>  #endif
>  };
>  
> +#if defined(CONFIG_BLKZONED)
> +static BlockDriver bdrv_zoned_host_device = {
> +        .format_name = "zoned_host_device",

Indentation should be 4, not 8.

> +        .protocol_name = "zoned_host_device",
> +        .instance_size = sizeof(BDRVRawState),
> +        .bdrv_needs_filename = true,
> +        .bdrv_probe_device  = hdev_probe_device,
> +        .bdrv_file_open     = hdev_open,
> +        .bdrv_close         = raw_close,
> +        .bdrv_reopen_prepare = raw_reopen_prepare,
> +        .bdrv_reopen_commit  = raw_reopen_commit,
> +        .bdrv_reopen_abort   = raw_reopen_abort,
> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> +        .create_opts         = &bdrv_create_opts_simple,
> +        .mutable_opts        = mutable_opts,
> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> +
> +        .bdrv_co_preadv         = raw_co_preadv,
> +        .bdrv_co_pwritev        = raw_co_pwritev,
> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> +        .bdrv_refresh_limits = raw_refresh_limits,
> +        .bdrv_io_plug = raw_aio_plug,
> +        .bdrv_io_unplug = raw_aio_unplug,
> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> +
> +        .bdrv_co_truncate       = raw_co_truncate,
> +        .bdrv_getlength = raw_getlength,
> +        .bdrv_get_info = raw_get_info,
> +        .bdrv_get_allocated_file_size
> +                            = raw_get_allocated_file_size,
> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> +        .bdrv_check_perm = raw_check_perm,
> +        .bdrv_set_perm   = raw_set_perm,
> +        .bdrv_abort_perm_update = raw_abort_perm_update,
> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> +        .bdrv_probe_geometry = hdev_probe_geometry,
> +        .bdrv_co_ioctl = hdev_co_ioctl,
> +
> +        /* zone management operations */
> +        .bdrv_co_zone_report = raw_co_zone_report,
> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +};

Differences to bdrv_host_device:

* .bdrv_parse_filename is not set

* .bdrv_co_ioctl is not set

* .bdrv_co_zone_report and .bdrv_co_zone_mgmt are set

Notably common is .bdrv_file_open = hdev_open.  What happens when you
try to create a zoned_host_device where the @filename argument is not in
fact a zoned device?

Do we really need a separate, but almost identical BlockDriver?  Could
the existing one provide zoned functionality exactly when the underlying
host device does?

Forgive me if these are ignorant questions, or have been discussed
before.

> +#endif
> +
>  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>  static void cdrom_parse_filename(const char *filename, QDict *options,
>                                   Error **errp)
> @@ -4012,6 +4333,9 @@ static void bdrv_file_init(void)
>      bdrv_register(&bdrv_file);
>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>      bdrv_register(&bdrv_host_device);
> +#if defined(CONFIG_BLKZONED)
> +    bdrv_register(&bdrv_zoned_host_device);
> +#endif
>  #ifdef __linux__
>      bdrv_register(&bdrv_host_cdrom);
>  #endif

[...]

> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 2173e7734a..c6bbb7a037 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2942,6 +2942,7 @@
>  # @compress: Since 5.0
>  # @copy-before-write: Since 6.2
>  # @snapshot-access: Since 7.0
> +# @zoned_host_device: Since 7.2
>  #
>  # Since: 2.9
>  ##
> @@ -2955,7 +2956,8 @@
>              'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
>              'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>              { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
> -            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
> +            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
> +            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }

QAPI naming conventions ask for 'zoned-host-device'.  We may choose to
ignore them to stay closer to existing 'host_device'.

>  
>  ##
>  # @BlockdevOptionsFile:
> @@ -4329,7 +4331,9 @@
>        'vhdx':       'BlockdevOptionsGenericFormat',
>        'vmdk':       'BlockdevOptionsGenericCOWFormat',
>        'vpc':        'BlockdevOptionsGenericFormat',
> -      'vvfat':      'BlockdevOptionsVVFAT'
> +      'vvfat':      'BlockdevOptionsVVFAT',
> +      'zoned_host_device': { 'type': 'BlockdevOptionsFile',
> +                             'if': 'CONFIG_BLKZONED' }
>    } }
>  
>  ##

[...]



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-30 11:57 ` Markus Armbruster
@ 2022-08-30 15:05   ` Sam Li
  2022-08-30 15:09     ` Markus Armbruster
  2022-08-31  8:35     ` Markus Armbruster
  0 siblings, 2 replies; 14+ messages in thread
From: Sam Li @ 2022-08-30 15:05 UTC (permalink / raw)
  To: Markus Armbruster
  Cc: qemu-devel, Stefan Hajnoczi, Damien Le Moal, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake, Fam Zheng,
	Kevin Wolf

Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
>
> Sam Li <faithilikerun@gmail.com> writes:
>
> > By adding zone management operations in BlockDriver, storage controller
> > emulation can use the new block layer APIs including Report Zone and
> > four zone management operations (open, close, finish, reset).
> >
> > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
> > zone_close(zc), zone_reset(zrs), zone_finish(zf).
> >
> > For example, to test zone_report, use following command:
> > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
> > -c "zrp offset nr_zones"
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > Reviewed-by: Hannes Reinecke <hare@suse.de>
>
> [...]
>
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 0a8b4b426e..e3efba6db7 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
>
> [...]
>
> > @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
> >  #endif
> >  };
> >
> > +#if defined(CONFIG_BLKZONED)
> > +static BlockDriver bdrv_zoned_host_device = {
> > +        .format_name = "zoned_host_device",
>
> Indentation should be 4, not 8.
>
> > +        .protocol_name = "zoned_host_device",
> > +        .instance_size = sizeof(BDRVRawState),
> > +        .bdrv_needs_filename = true,
> > +        .bdrv_probe_device  = hdev_probe_device,
> > +        .bdrv_file_open     = hdev_open,
> > +        .bdrv_close         = raw_close,
> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > +        .create_opts         = &bdrv_create_opts_simple,
> > +        .mutable_opts        = mutable_opts,
> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > +        .bdrv_co_preadv         = raw_co_preadv,
> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> > +        .bdrv_refresh_limits = raw_refresh_limits,
> > +        .bdrv_io_plug = raw_aio_plug,
> > +        .bdrv_io_unplug = raw_aio_unplug,
> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > +        .bdrv_co_truncate       = raw_co_truncate,
> > +        .bdrv_getlength = raw_getlength,
> > +        .bdrv_get_info = raw_get_info,
> > +        .bdrv_get_allocated_file_size
> > +                            = raw_get_allocated_file_size,
> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> > +        .bdrv_check_perm = raw_check_perm,
> > +        .bdrv_set_perm   = raw_set_perm,
> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > +        /* zone management operations */
> > +        .bdrv_co_zone_report = raw_co_zone_report,
> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
>
> Differences to bdrv_host_device:
>
> * .bdrv_parse_filename is not set
>
> * .bdrv_co_ioctl is not set
>
> * .bdrv_co_zone_report and .bdrv_co_zone_mgmt are set

As Stefan mentioned, zoned_host_device is a new driver that doesn't
work with string filenames. .bdrv_parse_filename() helps legacy
drivers strip the optional protocol prefix off the filename and no use
here. Therefore it can be dropped.

.bdrv_co_ioctl is set actually.

Zoned_host_device is basically host_device + zone operations. It
serves for a simple purpose: if the host device is zoned, register
zoned_host_device driver; else, register host_device.

>
> Notably common is .bdrv_file_open = hdev_open.  What happens when you
> try to create a zoned_host_device where the @filename argument is not in
> fact a zoned device?

If the device is a regular block device, QEMU will still open the
device. For instance, I use a loopback device to test zone_report in
qemu-io. It returns ENOTTY which indicates Inappropriate ioctl for the
device. Meanwhile, if using a regular block device when emulation a
zoned device on a guest os, the best case is that the guest can boot
but has no emulated block device. In some cases, QEMU just terminates
because the block device has not met the alignment requirements.

>
> Do we really need a separate, but almost identical BlockDriver?  Could
> the existing one provide zoned functionality exactly when the underlying
> host device does?

I did use the existing one host device to add zoned commands at first.
But then, we decided to change that and use a separate BlockDriver.
Though the existing one can provide zoned functionality, a new
BlockDriver makes it clear when mixing block drivers, adding more
configurations/constraints, etc. For example, zoned devices must
enforce direct I/O instead of using page cache to ensure the order of
writes. It would be good to print a message for users when using
zoned_host_device without setting direct I/O.

However, it's still a simple version I was thinking about and can be
improved/changed afterward. Using host_device only is possible I think
but needs more carefully thinking.

Maybe Damien and Stefan can talk more about this?

>
> Forgive me if these are ignorant questions, or have been discussed
> before.

Always a pleasure.

>
> > +#endif
> > +
> >  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> >  static void cdrom_parse_filename(const char *filename, QDict *options,
> >                                   Error **errp)
> > @@ -4012,6 +4333,9 @@ static void bdrv_file_init(void)
> >      bdrv_register(&bdrv_file);
> >  #if defined(HAVE_HOST_BLOCK_DEVICE)
> >      bdrv_register(&bdrv_host_device);
> > +#if defined(CONFIG_BLKZONED)
> > +    bdrv_register(&bdrv_zoned_host_device);
> > +#endif
> >  #ifdef __linux__
> >      bdrv_register(&bdrv_host_cdrom);
> >  #endif
>
> [...]
>
> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> > index 2173e7734a..c6bbb7a037 100644
> > --- a/qapi/block-core.json
> > +++ b/qapi/block-core.json
> > @@ -2942,6 +2942,7 @@
> >  # @compress: Since 5.0
> >  # @copy-before-write: Since 6.2
> >  # @snapshot-access: Since 7.0
> > +# @zoned_host_device: Since 7.2
> >  #
> >  # Since: 2.9
> >  ##
> > @@ -2955,7 +2956,8 @@
> >              'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
> >              'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
> >              { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
> > -            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
> > +            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
> > +            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
>
> QAPI naming conventions ask for 'zoned-host-device'.  We may choose to
> ignore them to stay closer to existing 'host_device'.

I am not sure why should ignore zoned_host_device. Can you be more specific?

>
> >
> >  ##
> >  # @BlockdevOptionsFile:
> > @@ -4329,7 +4331,9 @@
> >        'vhdx':       'BlockdevOptionsGenericFormat',
> >        'vmdk':       'BlockdevOptionsGenericCOWFormat',
> >        'vpc':        'BlockdevOptionsGenericFormat',
> > -      'vvfat':      'BlockdevOptionsVVFAT'
> > +      'vvfat':      'BlockdevOptionsVVFAT',
> > +      'zoned_host_device': { 'type': 'BlockdevOptionsFile',
> > +                             'if': 'CONFIG_BLKZONED' }
> >    } }
> >
> >  ##
>
> [...]
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-30 15:05   ` Sam Li
@ 2022-08-30 15:09     ` Markus Armbruster
  2022-08-30 15:19       ` Sam Li
  2022-08-31  8:35     ` Markus Armbruster
  1 sibling, 1 reply; 14+ messages in thread
From: Markus Armbruster @ 2022-08-30 15:09 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, Stefan Hajnoczi, Damien Le Moal, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake, Fam Zheng,
	Kevin Wolf

Sam Li <faithilikerun@gmail.com> writes:

> Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
>>
>> Sam Li <faithilikerun@gmail.com> writes:
>>
>> > By adding zone management operations in BlockDriver, storage controller
>> > emulation can use the new block layer APIs including Report Zone and
>> > four zone management operations (open, close, finish, reset).
>> >
>> > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
>> > zone_close(zc), zone_reset(zrs), zone_finish(zf).
>> >
>> > For example, to test zone_report, use following command:
>> > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
>> > -c "zrp offset nr_zones"
>> >
>> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> > Reviewed-by: Hannes Reinecke <hare@suse.de>

[...]

>> > diff --git a/qapi/block-core.json b/qapi/block-core.json
>> > index 2173e7734a..c6bbb7a037 100644
>> > --- a/qapi/block-core.json
>> > +++ b/qapi/block-core.json
>> > @@ -2942,6 +2942,7 @@
>> >  # @compress: Since 5.0
>> >  # @copy-before-write: Since 6.2
>> >  # @snapshot-access: Since 7.0
>> > +# @zoned_host_device: Since 7.2
>> >  #
>> >  # Since: 2.9
>> >  ##
>> > @@ -2955,7 +2956,8 @@
>> >              'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
>> >              'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>> >              { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
>> > -            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
>> > +            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
>> > +            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
>>
>> QAPI naming conventions ask for 'zoned-host-device'.  We may choose to
>> ignore them to stay closer to existing 'host_device'.
>
> I am not sure why should ignore zoned_host_device. Can you be more specific?

"them" = QAPI naming conventions.  Clear now?

[...]



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-30 15:09     ` Markus Armbruster
@ 2022-08-30 15:19       ` Sam Li
  0 siblings, 0 replies; 14+ messages in thread
From: Sam Li @ 2022-08-30 15:19 UTC (permalink / raw)
  To: Markus Armbruster
  Cc: qemu-devel, Stefan Hajnoczi, Damien Le Moal, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake, Fam Zheng,
	Kevin Wolf

Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 23:09写道:
>
> Sam Li <faithilikerun@gmail.com> writes:
>
> > Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
> >>
> >> Sam Li <faithilikerun@gmail.com> writes:
> >>
> >> > By adding zone management operations in BlockDriver, storage controller
> >> > emulation can use the new block layer APIs including Report Zone and
> >> > four zone management operations (open, close, finish, reset).
> >> >
> >> > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
> >> > zone_close(zc), zone_reset(zrs), zone_finish(zf).
> >> >
> >> > For example, to test zone_report, use following command:
> >> > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
> >> > -c "zrp offset nr_zones"
> >> >
> >> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> >> > Reviewed-by: Hannes Reinecke <hare@suse.de>
>
> [...]
>
> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> >> > index 2173e7734a..c6bbb7a037 100644
> >> > --- a/qapi/block-core.json
> >> > +++ b/qapi/block-core.json
> >> > @@ -2942,6 +2942,7 @@
> >> >  # @compress: Since 5.0
> >> >  # @copy-before-write: Since 6.2
> >> >  # @snapshot-access: Since 7.0
> >> > +# @zoned_host_device: Since 7.2
> >> >  #
> >> >  # Since: 2.9
> >> >  ##
> >> > @@ -2955,7 +2956,8 @@
> >> >              'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
> >> >              'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
> >> >              { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
> >> > -            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
> >> > +            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
> >> > +            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
> >>
> >> QAPI naming conventions ask for 'zoned-host-device'.  We may choose to
> >> ignore them to stay closer to existing 'host_device'.
> >
> > I am not sure why should ignore zoned_host_device. Can you be more specific?
>
> "them" = QAPI naming conventions.  Clear now?

Ok, I thought "them" means 'zoned_host_device'.

>
> [...]
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-30 15:05   ` Sam Li
  2022-08-30 15:09     ` Markus Armbruster
@ 2022-08-31  8:35     ` Markus Armbruster
  2022-08-31  8:48       ` Sam Li
  1 sibling, 1 reply; 14+ messages in thread
From: Markus Armbruster @ 2022-08-31  8:35 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, Stefan Hajnoczi, Damien Le Moal, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake, Fam Zheng,
	Kevin Wolf

Sam Li <faithilikerun@gmail.com> writes:

> Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
>>
>> Sam Li <faithilikerun@gmail.com> writes:
>>
>> > By adding zone management operations in BlockDriver, storage controller
>> > emulation can use the new block layer APIs including Report Zone and
>> > four zone management operations (open, close, finish, reset).
>> >
>> > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
>> > zone_close(zc), zone_reset(zrs), zone_finish(zf).
>> >
>> > For example, to test zone_report, use following command:
>> > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
>> > -c "zrp offset nr_zones"
>> >
>> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> > Reviewed-by: Hannes Reinecke <hare@suse.de>
>>
>> [...]
>>
>> > diff --git a/block/file-posix.c b/block/file-posix.c
>> > index 0a8b4b426e..e3efba6db7 100644
>> > --- a/block/file-posix.c
>> > +++ b/block/file-posix.c
>>
>> [...]
>>
>> > @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
>> >  #endif
>> >  };
>> >
>> > +#if defined(CONFIG_BLKZONED)
>> > +static BlockDriver bdrv_zoned_host_device = {
>> > +        .format_name = "zoned_host_device",
>>
>> Indentation should be 4, not 8.
>>
>> > +        .protocol_name = "zoned_host_device",
>> > +        .instance_size = sizeof(BDRVRawState),
>> > +        .bdrv_needs_filename = true,
>> > +        .bdrv_probe_device  = hdev_probe_device,
>> > +        .bdrv_file_open     = hdev_open,
>> > +        .bdrv_close         = raw_close,
>> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
>> > +        .bdrv_reopen_commit  = raw_reopen_commit,
>> > +        .bdrv_reopen_abort   = raw_reopen_abort,
>> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>> > +        .create_opts         = &bdrv_create_opts_simple,
>> > +        .mutable_opts        = mutable_opts,
>> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>> > +
>> > +        .bdrv_co_preadv         = raw_co_preadv,
>> > +        .bdrv_co_pwritev        = raw_co_pwritev,
>> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
>> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
>> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>> > +        .bdrv_refresh_limits = raw_refresh_limits,
>> > +        .bdrv_io_plug = raw_aio_plug,
>> > +        .bdrv_io_unplug = raw_aio_unplug,
>> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>> > +
>> > +        .bdrv_co_truncate       = raw_co_truncate,
>> > +        .bdrv_getlength = raw_getlength,
>> > +        .bdrv_get_info = raw_get_info,
>> > +        .bdrv_get_allocated_file_size
>> > +                            = raw_get_allocated_file_size,
>> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
>> > +        .bdrv_check_perm = raw_check_perm,
>> > +        .bdrv_set_perm   = raw_set_perm,
>> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
>> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>> > +        .bdrv_probe_geometry = hdev_probe_geometry,
>> > +        .bdrv_co_ioctl = hdev_co_ioctl,
>> > +
>> > +        /* zone management operations */
>> > +        .bdrv_co_zone_report = raw_co_zone_report,
>> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>> > +};
>>
>> Differences to bdrv_host_device:
>>
>> * .bdrv_parse_filename is not set
>>
>> * .bdrv_co_ioctl is not set
>>
>> * .bdrv_co_zone_report and .bdrv_co_zone_mgmt are set
>
> As Stefan mentioned, zoned_host_device is a new driver that doesn't
> work with string filenames. .bdrv_parse_filename() helps legacy
> drivers strip the optional protocol prefix off the filename and no use
> here. Therefore it can be dropped.

Makes sense.

> .bdrv_co_ioctl is set actually.

You're right; I diffed the two and misread the result.

> Zoned_host_device is basically host_device + zone operations. It
> serves for a simple purpose: if the host device is zoned, register
> zoned_host_device driver; else, register host_device.

Why would I ever want to use host_device instead of zoned_host_device?

To answer this question, we need to understand how their behavior
differs.

We can ignore the legacy protocol prefix / string filename part.

All that's left seems to be "if the host device is zoned, then using the
zoned_host_device driver gets you the zoned features, whereas using the
host_device driver doesn't".  What am I missing?

>> Notably common is .bdrv_file_open = hdev_open.  What happens when you
>> try to create a zoned_host_device where the @filename argument is not in
>> fact a zoned device?
>
> If the device is a regular block device, QEMU will still open the
> device. For instance, I use a loopback device to test zone_report in
> qemu-io. It returns ENOTTY which indicates Inappropriate ioctl for the
> device. Meanwhile, if using a regular block device when emulation a
> zoned device on a guest os, the best case is that the guest can boot
> but has no emulated block device. In some cases, QEMU just terminates
> because the block device has not met the alignment requirements.

I'm not sure I understand all of this.  I'm also not sure I have to :)

>> Do we really need a separate, but almost identical BlockDriver?  Could
>> the existing one provide zoned functionality exactly when the underlying
>> host device does?
>
> I did use the existing one host device to add zoned commands at first.
> But then, we decided to change that and use a separate BlockDriver.
> Though the existing one can provide zoned functionality, a new
> BlockDriver makes it clear when mixing block drivers, adding more
> configurations/constraints, etc. For example, zoned devices must
> enforce direct I/O instead of using page cache to ensure the order of
> writes. It would be good to print a message for users when using
> zoned_host_device without setting direct I/O.
>
> However, it's still a simple version I was thinking about and can be
> improved/changed afterward. Using host_device only is possible I think
> but needs more carefully thinking.

I'm not opposed to making this a separate driver.  But the case for it
should be made in the commit message.  Discussing it in review is a fine
way to get to a better commit message, of course.

> Maybe Damien and Stefan can talk more about this?
>
>>
>> Forgive me if these are ignorant questions, or have been discussed
>> before.
>
> Always a pleasure.

Thanks!

[...]



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-31  8:35     ` Markus Armbruster
@ 2022-08-31  8:48       ` Sam Li
  2022-09-01 14:57         ` Markus Armbruster
  0 siblings, 1 reply; 14+ messages in thread
From: Sam Li @ 2022-08-31  8:48 UTC (permalink / raw)
  To: Markus Armbruster
  Cc: qemu-devel, Stefan Hajnoczi, Damien Le Moal, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake, Fam Zheng,
	Kevin Wolf

Markus Armbruster <armbru@redhat.com> 于2022年8月31日周三 16:35写道:
>
> Sam Li <faithilikerun@gmail.com> writes:
>
> > Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
> >>
> >> Sam Li <faithilikerun@gmail.com> writes:
> >>
> >> > By adding zone management operations in BlockDriver, storage controller
> >> > emulation can use the new block layer APIs including Report Zone and
> >> > four zone management operations (open, close, finish, reset).
> >> >
> >> > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
> >> > zone_close(zc), zone_reset(zrs), zone_finish(zf).
> >> >
> >> > For example, to test zone_report, use following command:
> >> > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
> >> > -c "zrp offset nr_zones"
> >> >
> >> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> >> > Reviewed-by: Hannes Reinecke <hare@suse.de>
> >>
> >> [...]
> >>
> >> > diff --git a/block/file-posix.c b/block/file-posix.c
> >> > index 0a8b4b426e..e3efba6db7 100644
> >> > --- a/block/file-posix.c
> >> > +++ b/block/file-posix.c
> >>
> >> [...]
> >>
> >> > @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
> >> >  #endif
> >> >  };
> >> >
> >> > +#if defined(CONFIG_BLKZONED)
> >> > +static BlockDriver bdrv_zoned_host_device = {
> >> > +        .format_name = "zoned_host_device",
> >>
> >> Indentation should be 4, not 8.
> >>
> >> > +        .protocol_name = "zoned_host_device",
> >> > +        .instance_size = sizeof(BDRVRawState),
> >> > +        .bdrv_needs_filename = true,
> >> > +        .bdrv_probe_device  = hdev_probe_device,
> >> > +        .bdrv_file_open     = hdev_open,
> >> > +        .bdrv_close         = raw_close,
> >> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> >> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> >> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> >> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> >> > +        .create_opts         = &bdrv_create_opts_simple,
> >> > +        .mutable_opts        = mutable_opts,
> >> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> >> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> >> > +
> >> > +        .bdrv_co_preadv         = raw_co_preadv,
> >> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> >> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> >> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> >> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> >> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> >> > +        .bdrv_refresh_limits = raw_refresh_limits,
> >> > +        .bdrv_io_plug = raw_aio_plug,
> >> > +        .bdrv_io_unplug = raw_aio_unplug,
> >> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> >> > +
> >> > +        .bdrv_co_truncate       = raw_co_truncate,
> >> > +        .bdrv_getlength = raw_getlength,
> >> > +        .bdrv_get_info = raw_get_info,
> >> > +        .bdrv_get_allocated_file_size
> >> > +                            = raw_get_allocated_file_size,
> >> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> >> > +        .bdrv_check_perm = raw_check_perm,
> >> > +        .bdrv_set_perm   = raw_set_perm,
> >> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> >> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> >> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> >> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> >> > +
> >> > +        /* zone management operations */
> >> > +        .bdrv_co_zone_report = raw_co_zone_report,
> >> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> >> > +};
> >>
> >> Differences to bdrv_host_device:
> >>
> >> * .bdrv_parse_filename is not set
> >>
> >> * .bdrv_co_ioctl is not set
> >>
> >> * .bdrv_co_zone_report and .bdrv_co_zone_mgmt are set
> >
> > As Stefan mentioned, zoned_host_device is a new driver that doesn't
> > work with string filenames. .bdrv_parse_filename() helps legacy
> > drivers strip the optional protocol prefix off the filename and no use
> > here. Therefore it can be dropped.
>
> Makes sense.
>
> > .bdrv_co_ioctl is set actually.
>
> You're right; I diffed the two and misread the result.
>
> > Zoned_host_device is basically host_device + zone operations. It
> > serves for a simple purpose: if the host device is zoned, register
> > zoned_host_device driver; else, register host_device.
>
> Why would I ever want to use host_device instead of zoned_host_device?
>
> To answer this question, we need to understand how their behavior
> differs.
>
> We can ignore the legacy protocol prefix / string filename part.
>
> All that's left seems to be "if the host device is zoned, then using the
> zoned_host_device driver gets you the zoned features, whereas using the
> host_device driver doesn't".  What am I missing?

I think that's basically what users need to know about.

>
> >> Notably common is .bdrv_file_open = hdev_open.  What happens when you
> >> try to create a zoned_host_device where the @filename argument is not in
> >> fact a zoned device?
> >
> > If the device is a regular block device, QEMU will still open the
> > device. For instance, I use a loopback device to test zone_report in
> > qemu-io. It returns ENOTTY which indicates Inappropriate ioctl for the
> > device. Meanwhile, if using a regular block device when emulation a
> > zoned device on a guest os, the best case is that the guest can boot
> > but has no emulated block device. In some cases, QEMU just terminates
> > because the block device has not met the alignment requirements.
>
> I'm not sure I understand all of this.  I'm also not sure I have to :)

Maybe I didn't explain it very well. Which part would you like to know
more about?

>
> >> Do we really need a separate, but almost identical BlockDriver?  Could
> >> the existing one provide zoned functionality exactly when the underlying
> >> host device does?
> >
> > I did use the existing one host device to add zoned commands at first.
> > But then, we decided to change that and use a separate BlockDriver.
> > Though the existing one can provide zoned functionality, a new
> > BlockDriver makes it clear when mixing block drivers, adding more
> > configurations/constraints, etc. For example, zoned devices must
> > enforce direct I/O instead of using page cache to ensure the order of
> > writes. It would be good to print a message for users when using
> > zoned_host_device without setting direct I/O.
> >
> > However, it's still a simple version I was thinking about and can be
> > improved/changed afterward. Using host_device only is possible I think
> > but needs more carefully thinking.
>
> I'm not opposed to making this a separate driver.  But the case for it
> should be made in the commit message.  Discussing it in review is a fine
> way to get to a better commit message, of course.

That's great! I'll mention the zoned_host device BlockDriver in the
commit message of the next revision.


Thanks for reviewing. If I missed anything, please tell me.

>
> > Maybe Damien and Stefan can talk more about this?
> >
> >>
> >> Forgive me if these are ignorant questions, or have been discussed
> >> before.
> >
> > Always a pleasure.
>
> Thanks!
>
> [...]
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-31  8:48       ` Sam Li
@ 2022-09-01 14:57         ` Markus Armbruster
  2022-09-01 16:18           ` Markus Armbruster
  2022-09-02  2:13           ` Damien Le Moal
  0 siblings, 2 replies; 14+ messages in thread
From: Markus Armbruster @ 2022-09-01 14:57 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, Stefan Hajnoczi, Damien Le Moal, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake, Fam Zheng,
	Kevin Wolf

Sam Li <faithilikerun@gmail.com> writes:

> Markus Armbruster <armbru@redhat.com> 于2022年8月31日周三 16:35写道:
>>
>> Sam Li <faithilikerun@gmail.com> writes:
>>
>> > Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
>> >>
>> >> Sam Li <faithilikerun@gmail.com> writes:
>> >>
>> >> > By adding zone management operations in BlockDriver, storage controller
>> >> > emulation can use the new block layer APIs including Report Zone and
>> >> > four zone management operations (open, close, finish, reset).
>> >> >
>> >> > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
>> >> > zone_close(zc), zone_reset(zrs), zone_finish(zf).
>> >> >
>> >> > For example, to test zone_report, use following command:
>> >> > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
>> >> > -c "zrp offset nr_zones"
>> >> >
>> >> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> >> > Reviewed-by: Hannes Reinecke <hare@suse.de>
>> >>
>> >> [...]
>> >>
>> >> > diff --git a/block/file-posix.c b/block/file-posix.c
>> >> > index 0a8b4b426e..e3efba6db7 100644
>> >> > --- a/block/file-posix.c
>> >> > +++ b/block/file-posix.c
>> >>
>> >> [...]
>> >>
>> >> > @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
>> >> >  #endif
>> >> >  };
>> >> >
>> >> > +#if defined(CONFIG_BLKZONED)
>> >> > +static BlockDriver bdrv_zoned_host_device = {
>> >> > +        .format_name = "zoned_host_device",
>> >>
>> >> Indentation should be 4, not 8.
>> >>
>> >> > +        .protocol_name = "zoned_host_device",
>> >> > +        .instance_size = sizeof(BDRVRawState),
>> >> > +        .bdrv_needs_filename = true,
>> >> > +        .bdrv_probe_device  = hdev_probe_device,
>> >> > +        .bdrv_file_open     = hdev_open,
>> >> > +        .bdrv_close         = raw_close,
>> >> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
>> >> > +        .bdrv_reopen_commit  = raw_reopen_commit,
>> >> > +        .bdrv_reopen_abort   = raw_reopen_abort,
>> >> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>> >> > +        .create_opts         = &bdrv_create_opts_simple,
>> >> > +        .mutable_opts        = mutable_opts,
>> >> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>> >> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>> >> > +
>> >> > +        .bdrv_co_preadv         = raw_co_preadv,
>> >> > +        .bdrv_co_pwritev        = raw_co_pwritev,
>> >> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>> >> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
>> >> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
>> >> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>> >> > +        .bdrv_refresh_limits = raw_refresh_limits,
>> >> > +        .bdrv_io_plug = raw_aio_plug,
>> >> > +        .bdrv_io_unplug = raw_aio_unplug,
>> >> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>> >> > +
>> >> > +        .bdrv_co_truncate       = raw_co_truncate,
>> >> > +        .bdrv_getlength = raw_getlength,
>> >> > +        .bdrv_get_info = raw_get_info,
>> >> > +        .bdrv_get_allocated_file_size
>> >> > +                            = raw_get_allocated_file_size,
>> >> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
>> >> > +        .bdrv_check_perm = raw_check_perm,
>> >> > +        .bdrv_set_perm   = raw_set_perm,
>> >> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
>> >> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>> >> > +        .bdrv_probe_geometry = hdev_probe_geometry,
>> >> > +        .bdrv_co_ioctl = hdev_co_ioctl,
>> >> > +
>> >> > +        /* zone management operations */
>> >> > +        .bdrv_co_zone_report = raw_co_zone_report,
>> >> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>> >> > +};
>> >>
>> >> Differences to bdrv_host_device:
>> >>
>> >> * .bdrv_parse_filename is not set
>> >>
>> >> * .bdrv_co_ioctl is not set
>> >>
>> >> * .bdrv_co_zone_report and .bdrv_co_zone_mgmt are set
>> >
>> > As Stefan mentioned, zoned_host_device is a new driver that doesn't
>> > work with string filenames. .bdrv_parse_filename() helps legacy
>> > drivers strip the optional protocol prefix off the filename and no use
>> > here. Therefore it can be dropped.
>>
>> Makes sense.
>>
>> > .bdrv_co_ioctl is set actually.
>>
>> You're right; I diffed the two and misread the result.
>>
>> > Zoned_host_device is basically host_device + zone operations. It
>> > serves for a simple purpose: if the host device is zoned, register
>> > zoned_host_device driver; else, register host_device.
>>
>> Why would I ever want to use host_device instead of zoned_host_device?
>>
>> To answer this question, we need to understand how their behavior
>> differs.
>>
>> We can ignore the legacy protocol prefix / string filename part.
>>
>> All that's left seems to be "if the host device is zoned, then using the
>> zoned_host_device driver gets you the zoned features, whereas using the
>> host_device driver doesn't".  What am I missing?
>
> I think that's basically what users need to know about.

Now answer my previous question, please: why would I ever want to use
host_device instead of zoned_host_device?

Or in other words, why would I ever want to present a zoned host device
to a guest as non-zoned device?

>> >> Notably common is .bdrv_file_open = hdev_open.  What happens when you
>> >> try to create a zoned_host_device where the @filename argument is not in
>> >> fact a zoned device?
>> >
>> > If the device is a regular block device, QEMU will still open the
>> > device. For instance, I use a loopback device to test zone_report in
>> > qemu-io. It returns ENOTTY which indicates Inappropriate ioctl for the
>> > device. Meanwhile, if using a regular block device when emulation a
>> > zoned device on a guest os, the best case is that the guest can boot
>> > but has no emulated block device. In some cases, QEMU just terminates
>> > because the block device has not met the alignment requirements.
>>
>> I'm not sure I understand all of this.  I'm also not sure I have to :)
>
> Maybe I didn't explain it very well. Which part would you like to know
> more about?

Let's try more specific questions.  Say I configure a zoned_host_device
backed by a host device that isn't zoned.

1. Is this configuration accepted?

2. Would a guest work as long as it doesn't touch this device?

3. Would a guest using this device work as long as it uses no zoned
   features?

4. What happens when a guest tries to use zoned features?

[...]



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-09-01 14:57         ` Markus Armbruster
@ 2022-09-01 16:18           ` Markus Armbruster
  2022-09-02  2:13           ` Damien Le Moal
  1 sibling, 0 replies; 14+ messages in thread
From: Markus Armbruster @ 2022-09-01 16:18 UTC (permalink / raw)
  To: Markus Armbruster
  Cc: Sam Li, qemu-devel, Stefan Hajnoczi, Damien Le Moal,
	Dmitry Fomichev, Hannes Reinecke, qemu block, Hanna Reitz,
	Eric Blake, Fam Zheng, Kevin Wolf

Markus Armbruster <armbru@redhat.com> writes:

> Sam Li <faithilikerun@gmail.com> writes:
>
>> Markus Armbruster <armbru@redhat.com> 于2022年8月31日周三 16:35写道:
>>>
>>> Sam Li <faithilikerun@gmail.com> writes:
>>>
>>> > Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
>>> >>
>>> >> Sam Li <faithilikerun@gmail.com> writes:
>>> >>
>>> >> > By adding zone management operations in BlockDriver, storage controller
>>> >> > emulation can use the new block layer APIs including Report Zone and
>>> >> > four zone management operations (open, close, finish, reset).
>>> >> >
>>> >> > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
>>> >> > zone_close(zc), zone_reset(zrs), zone_finish(zf).
>>> >> >
>>> >> > For example, to test zone_report, use following command:
>>> >> > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
>>> >> > -c "zrp offset nr_zones"
>>> >> >
>>> >> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>> >> > Reviewed-by: Hannes Reinecke <hare@suse.de>
>>> >>
>>> >> [...]
>>> >>
>>> >> > diff --git a/block/file-posix.c b/block/file-posix.c
>>> >> > index 0a8b4b426e..e3efba6db7 100644
>>> >> > --- a/block/file-posix.c
>>> >> > +++ b/block/file-posix.c
>>> >>
>>> >> [...]
>>> >>
>>> >> > @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
>>> >> >  #endif
>>> >> >  };
>>> >> >
>>> >> > +#if defined(CONFIG_BLKZONED)
>>> >> > +static BlockDriver bdrv_zoned_host_device = {
>>> >> > +        .format_name = "zoned_host_device",
>>> >>
>>> >> Indentation should be 4, not 8.
>>> >>
>>> >> > +        .protocol_name = "zoned_host_device",
>>> >> > +        .instance_size = sizeof(BDRVRawState),
>>> >> > +        .bdrv_needs_filename = true,
>>> >> > +        .bdrv_probe_device  = hdev_probe_device,
>>> >> > +        .bdrv_file_open     = hdev_open,
>>> >> > +        .bdrv_close         = raw_close,
>>> >> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
>>> >> > +        .bdrv_reopen_commit  = raw_reopen_commit,
>>> >> > +        .bdrv_reopen_abort   = raw_reopen_abort,
>>> >> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>>> >> > +        .create_opts         = &bdrv_create_opts_simple,
>>> >> > +        .mutable_opts        = mutable_opts,
>>> >> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>>> >> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>>> >> > +
>>> >> > +        .bdrv_co_preadv         = raw_co_preadv,
>>> >> > +        .bdrv_co_pwritev        = raw_co_pwritev,
>>> >> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>>> >> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
>>> >> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
>>> >> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>>> >> > +        .bdrv_refresh_limits = raw_refresh_limits,
>>> >> > +        .bdrv_io_plug = raw_aio_plug,
>>> >> > +        .bdrv_io_unplug = raw_aio_unplug,
>>> >> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>>> >> > +
>>> >> > +        .bdrv_co_truncate       = raw_co_truncate,
>>> >> > +        .bdrv_getlength = raw_getlength,
>>> >> > +        .bdrv_get_info = raw_get_info,
>>> >> > +        .bdrv_get_allocated_file_size
>>> >> > +                            = raw_get_allocated_file_size,
>>> >> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
>>> >> > +        .bdrv_check_perm = raw_check_perm,
>>> >> > +        .bdrv_set_perm   = raw_set_perm,
>>> >> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
>>> >> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>>> >> > +        .bdrv_probe_geometry = hdev_probe_geometry,
>>> >> > +        .bdrv_co_ioctl = hdev_co_ioctl,
>>> >> > +
>>> >> > +        /* zone management operations */
>>> >> > +        .bdrv_co_zone_report = raw_co_zone_report,
>>> >> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>>> >> > +};
>>> >>
>>> >> Differences to bdrv_host_device:
>>> >>
>>> >> * .bdrv_parse_filename is not set
>>> >>
>>> >> * .bdrv_co_ioctl is not set
>>> >>
>>> >> * .bdrv_co_zone_report and .bdrv_co_zone_mgmt are set
>>> >
>>> > As Stefan mentioned, zoned_host_device is a new driver that doesn't
>>> > work with string filenames. .bdrv_parse_filename() helps legacy
>>> > drivers strip the optional protocol prefix off the filename and no use
>>> > here. Therefore it can be dropped.
>>>
>>> Makes sense.
>>>
>>> > .bdrv_co_ioctl is set actually.
>>>
>>> You're right; I diffed the two and misread the result.
>>>
>>> > Zoned_host_device is basically host_device + zone operations. It
>>> > serves for a simple purpose: if the host device is zoned, register
>>> > zoned_host_device driver; else, register host_device.
>>>
>>> Why would I ever want to use host_device instead of zoned_host_device?
>>>
>>> To answer this question, we need to understand how their behavior
>>> differs.
>>>
>>> We can ignore the legacy protocol prefix / string filename part.
>>>
>>> All that's left seems to be "if the host device is zoned, then using the
>>> zoned_host_device driver gets you the zoned features, whereas using the
>>> host_device driver doesn't".  What am I missing?
>>
>> I think that's basically what users need to know about.
>
> Now answer my previous question, please: why would I ever want to use
> host_device instead of zoned_host_device?
>
> Or in other words, why would I ever want to present a zoned host device
> to a guest as non-zoned device?
>
>>> >> Notably common is .bdrv_file_open = hdev_open.  What happens when you
>>> >> try to create a zoned_host_device where the @filename argument is not in
>>> >> fact a zoned device?
>>> >
>>> > If the device is a regular block device, QEMU will still open the
>>> > device. For instance, I use a loopback device to test zone_report in
>>> > qemu-io. It returns ENOTTY which indicates Inappropriate ioctl for the
>>> > device. Meanwhile, if using a regular block device when emulation a
>>> > zoned device on a guest os, the best case is that the guest can boot
>>> > but has no emulated block device. In some cases, QEMU just terminates
>>> > because the block device has not met the alignment requirements.
>>>
>>> I'm not sure I understand all of this.  I'm also not sure I have to :)
>>
>> Maybe I didn't explain it very well. Which part would you like to know
>> more about?
>
> Let's try more specific questions.  Say I configure a zoned_host_device
> backed by a host device that isn't zoned.
>
> 1. Is this configuration accepted?
>
> 2. Would a guest work as long as it doesn't touch this device?

2.5. Does the device look like a zoned device to the guest?

> 3. Would a guest using this device work as long as it uses no zoned
>    features?
>
> 4. What happens when a guest tries to use zoned features?
>
> [...]



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-09-01 14:57         ` Markus Armbruster
  2022-09-01 16:18           ` Markus Armbruster
@ 2022-09-02  2:13           ` Damien Le Moal
  2022-09-29  6:22             ` Markus Armbruster
  1 sibling, 1 reply; 14+ messages in thread
From: Damien Le Moal @ 2022-09-02  2:13 UTC (permalink / raw)
  To: Markus Armbruster, Sam Li
  Cc: qemu-devel, Stefan Hajnoczi, Dmitry Fomichev, Hannes Reinecke,
	qemu block, Hanna Reitz, Eric Blake, Fam Zheng, Kevin Wolf

On 9/1/22 23:57, Markus Armbruster wrote:
> Sam Li <faithilikerun@gmail.com> writes:
> 
>> Markus Armbruster <armbru@redhat.com> 于2022年8月31日周三 16:35写道:
>>>
>>> Sam Li <faithilikerun@gmail.com> writes:
>>>
>>>> Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:
>>>>>
>>>>> Sam Li <faithilikerun@gmail.com> writes:
>>>>>
>>>>>> By adding zone management operations in BlockDriver, storage controller
>>>>>> emulation can use the new block layer APIs including Report Zone and
>>>>>> four zone management operations (open, close, finish, reset).
>>>>>>
>>>>>> Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
>>>>>> zone_close(zc), zone_reset(zrs), zone_finish(zf).
>>>>>>
>>>>>> For example, to test zone_report, use following command:
>>>>>> $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
>>>>>> -c "zrp offset nr_zones"
>>>>>>
>>>>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>>>>> Reviewed-by: Hannes Reinecke <hare@suse.de>
>>>>>
>>>>> [...]
>>>>>
>>>>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>>>>> index 0a8b4b426e..e3efba6db7 100644
>>>>>> --- a/block/file-posix.c
>>>>>> +++ b/block/file-posix.c
>>>>>
>>>>> [...]
>>>>>
>>>>>> @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
>>>>>>  #endif
>>>>>>  };
>>>>>>
>>>>>> +#if defined(CONFIG_BLKZONED)
>>>>>> +static BlockDriver bdrv_zoned_host_device = {
>>>>>> +        .format_name = "zoned_host_device",
>>>>>
>>>>> Indentation should be 4, not 8.
>>>>>
>>>>>> +        .protocol_name = "zoned_host_device",
>>>>>> +        .instance_size = sizeof(BDRVRawState),
>>>>>> +        .bdrv_needs_filename = true,
>>>>>> +        .bdrv_probe_device  = hdev_probe_device,
>>>>>> +        .bdrv_file_open     = hdev_open,
>>>>>> +        .bdrv_close         = raw_close,
>>>>>> +        .bdrv_reopen_prepare = raw_reopen_prepare,
>>>>>> +        .bdrv_reopen_commit  = raw_reopen_commit,
>>>>>> +        .bdrv_reopen_abort   = raw_reopen_abort,
>>>>>> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>>>>>> +        .create_opts         = &bdrv_create_opts_simple,
>>>>>> +        .mutable_opts        = mutable_opts,
>>>>>> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>>>>>> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>>>>>> +
>>>>>> +        .bdrv_co_preadv         = raw_co_preadv,
>>>>>> +        .bdrv_co_pwritev        = raw_co_pwritev,
>>>>>> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>>>>>> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
>>>>>> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
>>>>>> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>>>>>> +        .bdrv_refresh_limits = raw_refresh_limits,
>>>>>> +        .bdrv_io_plug = raw_aio_plug,
>>>>>> +        .bdrv_io_unplug = raw_aio_unplug,
>>>>>> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>>>>>> +
>>>>>> +        .bdrv_co_truncate       = raw_co_truncate,
>>>>>> +        .bdrv_getlength = raw_getlength,
>>>>>> +        .bdrv_get_info = raw_get_info,
>>>>>> +        .bdrv_get_allocated_file_size
>>>>>> +                            = raw_get_allocated_file_size,
>>>>>> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
>>>>>> +        .bdrv_check_perm = raw_check_perm,
>>>>>> +        .bdrv_set_perm   = raw_set_perm,
>>>>>> +        .bdrv_abort_perm_update = raw_abort_perm_update,
>>>>>> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>>>>>> +        .bdrv_probe_geometry = hdev_probe_geometry,
>>>>>> +        .bdrv_co_ioctl = hdev_co_ioctl,
>>>>>> +
>>>>>> +        /* zone management operations */
>>>>>> +        .bdrv_co_zone_report = raw_co_zone_report,
>>>>>> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>>>>>> +};
>>>>>
>>>>> Differences to bdrv_host_device:
>>>>>
>>>>> * .bdrv_parse_filename is not set
>>>>>
>>>>> * .bdrv_co_ioctl is not set
>>>>>
>>>>> * .bdrv_co_zone_report and .bdrv_co_zone_mgmt are set
>>>>
>>>> As Stefan mentioned, zoned_host_device is a new driver that doesn't
>>>> work with string filenames. .bdrv_parse_filename() helps legacy
>>>> drivers strip the optional protocol prefix off the filename and no use
>>>> here. Therefore it can be dropped.
>>>
>>> Makes sense.
>>>
>>>> .bdrv_co_ioctl is set actually.
>>>
>>> You're right; I diffed the two and misread the result.
>>>
>>>> Zoned_host_device is basically host_device + zone operations. It
>>>> serves for a simple purpose: if the host device is zoned, register
>>>> zoned_host_device driver; else, register host_device.
>>>
>>> Why would I ever want to use host_device instead of zoned_host_device?
>>>
>>> To answer this question, we need to understand how their behavior
>>> differs.
>>>
>>> We can ignore the legacy protocol prefix / string filename part.
>>>
>>> All that's left seems to be "if the host device is zoned, then using the
>>> zoned_host_device driver gets you the zoned features, whereas using the
>>> host_device driver doesn't".  What am I missing?
>>
>> I think that's basically what users need to know about.
> 
> Now answer my previous question, please: why would I ever want to use
> host_device instead of zoned_host_device?
> 
> Or in other words, why would I ever want to present a zoned host device
> to a guest as non-zoned device?
> 
>>>>> Notably common is .bdrv_file_open = hdev_open.  What happens when you
>>>>> try to create a zoned_host_device where the @filename argument is not in
>>>>> fact a zoned device?
>>>>
>>>> If the device is a regular block device, QEMU will still open the
>>>> device. For instance, I use a loopback device to test zone_report in
>>>> qemu-io. It returns ENOTTY which indicates Inappropriate ioctl for the
>>>> device. Meanwhile, if using a regular block device when emulation a
>>>> zoned device on a guest os, the best case is that the guest can boot
>>>> but has no emulated block device. In some cases, QEMU just terminates
>>>> because the block device has not met the alignment requirements.
>>>
>>> I'm not sure I understand all of this.  I'm also not sure I have to :)
>>
>> Maybe I didn't explain it very well. Which part would you like to know
>> more about?
> 
> Let's try more specific questions.  Say I configure a zoned_host_device
> backed by a host device that isn't zoned.
> 
> 1. Is this configuration accepted?

If we assume we have the special zoned_host_device driver, with the
associated command line zoned_host_device option explicitly calling for
it, then no, I do not think this should be allowed at all and an error
should be returned on startup. That would be consistent with the fact that
the options zoned_host_device and host_device are different to make sure
we can check that the user knows what he/she/them is doing.

If we have only host_device as a setup option and driver, the driver
methods can be trivially adjusted to do the right thing based on the
device type (i.e. zoned vs regular/not zoned). However, that would prevent
an interesting future extension of this work to implement a full zone
emulation on top of a regular (not zoned) host block device.

With this in mind, we currently have the following:

1) host_device option -> accept only regular non-zoned host block devices
2) zoned_host_device option -> accept only zoned host block devices

And in the future, we can have:

1) host_device option -> accept only regular non-zoned host block devices
2) zoned_host_device option -> accept any host block device type
	a) Use native zone kernel API for zoned host block devices
	b) Use full zone emulation for regular host block devices

But sure, internally, we could have a single driver structure with methods
adjusted to do the correct thing based on the device type and option
specified. Having a 1:1 mapping between the driver name and driver
structure does clarify things I think (even though there are indeed a lot
of methods that are identical).

> 
> 2. Would a guest work as long as it doesn't touch this device?
> 
> 3. Would a guest using this device work as long as it uses no zoned
>    features?
> 
> 4. What happens when a guest tries to use zoned features?
> 
> [...]
> 

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-09-02  2:13           ` Damien Le Moal
@ 2022-09-29  6:22             ` Markus Armbruster
  0 siblings, 0 replies; 14+ messages in thread
From: Markus Armbruster @ 2022-09-29  6:22 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Sam Li, qemu-devel, Stefan Hajnoczi, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake, Fam Zheng,
	Kevin Wolf

Damien Le Moal <damien.lemoal@opensource.wdc.com> writes:

> On 9/1/22 23:57, Markus Armbruster wrote:
>> Sam Li <faithilikerun@gmail.com> writes:
>> 
>>> Markus Armbruster <armbru@redhat.com> 于2022年8月31日周三 16:35写道:
>>>>
>>>> Sam Li <faithilikerun@gmail.com> writes:
>>>>
>>>>> Markus Armbruster <armbru@redhat.com> 于2022年8月30日周二 19:57写道:

[...]

>>>>> Zoned_host_device is basically host_device + zone operations. It
>>>>> serves for a simple purpose: if the host device is zoned, register
>>>>> zoned_host_device driver; else, register host_device.
>>>>
>>>> Why would I ever want to use host_device instead of zoned_host_device?
>>>>
>>>> To answer this question, we need to understand how their behavior
>>>> differs.
>>>>
>>>> We can ignore the legacy protocol prefix / string filename part.
>>>>
>>>> All that's left seems to be "if the host device is zoned, then using the
>>>> zoned_host_device driver gets you the zoned features, whereas using the
>>>> host_device driver doesn't".  What am I missing?
>>>
>>> I think that's basically what users need to know about.
>> 
>> Now answer my previous question, please: why would I ever want to use
>> host_device instead of zoned_host_device?
>> 
>> Or in other words, why would I ever want to present a zoned host device
>> to a guest as non-zoned device?
>> 
>>>>>> Notably common is .bdrv_file_open = hdev_open.  What happens when you
>>>>>> try to create a zoned_host_device where the @filename argument is not in
>>>>>> fact a zoned device?
>>>>>
>>>>> If the device is a regular block device, QEMU will still open the
>>>>> device. For instance, I use a loopback device to test zone_report in
>>>>> qemu-io. It returns ENOTTY which indicates Inappropriate ioctl for the
>>>>> device. Meanwhile, if using a regular block device when emulation a
>>>>> zoned device on a guest os, the best case is that the guest can boot
>>>>> but has no emulated block device. In some cases, QEMU just terminates
>>>>> because the block device has not met the alignment requirements.
>>>>
>>>> I'm not sure I understand all of this.  I'm also not sure I have to :)
>>>
>>> Maybe I didn't explain it very well. Which part would you like to know
>>> more about?
>> 
>> Let's try more specific questions.  Say I configure a zoned_host_device
>> backed by a host device that isn't zoned.
>> 
>> 1. Is this configuration accepted?
>
> If we assume we have the special zoned_host_device driver, with the
> associated command line zoned_host_device option explicitly calling for
> it, then no, I do not think this should be allowed at all and an error
> should be returned on startup. That would be consistent with the fact that
> the options zoned_host_device and host_device are different to make sure
> we can check that the user knows what he/she/them is doing.
>
> If we have only host_device as a setup option and driver, the driver
> methods can be trivially adjusted to do the right thing based on the
> device type (i.e. zoned vs regular/not zoned). However, that would prevent
> an interesting future extension of this work to implement a full zone
> emulation on top of a regular (not zoned) host block device.
>
> With this in mind, we currently have the following:
>
> 1) host_device option -> accept only regular non-zoned host block devices
> 2) zoned_host_device option -> accept only zoned host block devices

2) matches my intuitive expectations for this driver name.

However, if host_device works even with a zoned host device before the
patch presenting it as non-zoned to the guest, then it needs to continue
to do so.

> And in the future, we can have:
>
> 1) host_device option -> accept only regular non-zoned host block devices
> 2) zoned_host_device option -> accept any host block device type
> 	a) Use native zone kernel API for zoned host block devices
> 	b) Use full zone emulation for regular host block devices

Understood.

> But sure, internally, we could have a single driver structure with methods
> adjusted to do the correct thing based on the device type and option
> specified. Having a 1:1 mapping between the driver name and driver
> structure does clarify things I think (even though there are indeed a lot
> of methods that are identical).

I think this is basically a matter of user interface design.  Let's
review what we have: host_device and host_cdrom.  I'm only passingly
familiar with them, so please correct my misunderstandings, if any.

host_device and host_cdrom let you "pass through" a host device to a
guest.

host_cdrom presents a removable device to the guest.  I appears to
accept any host block device, even a non-removable one.  What happens
when you try to use a non-removable host device as removable guest
device I don't know.

host_device presents a non-removable device to the guest.  It accepts
any host block device, even a removable one (as long as it has a
medium).

host_device detects whether the host device is a SCSI generic device.
Guest devices scsi-hd and scsi-cd reject a SCSI generic host device.
Guest device scsi-block requires one (I think).

On the one hand, there is precedence for using different driver types
for different kinds of host devices: host_cdrom for removable ones,
host_device for non-removable ones.

On the other hand, there is precedence for using a single driver type
for different kinds of host devices, with dynamic detection: host_device
both for SCSI generic and for others.

On the third hand, the "different driver type" story is complicated by
us accepting the "wrong" kind of host device at least in some scenarios.

Kevin, do you have an opinion on how the user interface should be?

Next, do you have one on how it can be, given what we have?

[...]



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  2022-08-29 12:52 Sam Li
@ 2022-08-29 13:00 ` Sam Li
  0 siblings, 0 replies; 14+ messages in thread
From: Sam Li @ 2022-08-29 13:00 UTC (permalink / raw)
  To: qemu-devel
  Cc: Stefan Hajnoczi, Damien Le Moal, Dmitry Fomichev,
	Hannes Reinecke, qemu block, Hanna Reitz, Eric Blake,
	Markus Armbruster, Fam Zheng, Kevin Wolf

Sam Li <faithilikerun@gmail.com> 于2022年8月29日周一 20:53写道:
>
> By adding zone management operations in BlockDriver, storage controller
> emulation can use the new block layer APIs including Report Zone and
> four zone management operations (open, close, finish, reset).
>
> Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
> zone_close(zc), zone_reset(zrs), zone_finish(zf).
>
> For example, to test zone_report, use following command:
> $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
> -c "zrp offset nr_zones"
>
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> ---
>  block/block-backend.c             |  51 +++++
>  block/file-posix.c                | 326 +++++++++++++++++++++++++++++-
>  block/io.c                        |  41 ++++
>  include/block/block-io.h          |   7 +
>  include/block/block_int-common.h  |  21 ++
>  include/block/raw-aio.h           |   6 +-
>  include/sysemu/block-backend-io.h |  17 ++
>  meson.build                       |   1 +
>  qapi/block-core.json              |   8 +-
>  qemu-io-cmds.c                    | 143 +++++++++++++
>  10 files changed, 617 insertions(+), 4 deletions(-)
>
> diff --git a/block/block-backend.c b/block/block-backend.c
> index d4a5df2ac2..c5798651df 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1775,6 +1775,57 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>      return ret;
>  }
>
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    unsigned int *nr_zones,
> +                                    BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +        blk_dec_in_flight(blk);
> +        return -ENOMEDIUM;
> +    }
> +    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Send a zone_management command.
> + * op is the zone operation.
> + * offset is the starting zone specified as a sector offset.
> + * len is the maximum number of sectors the command should operate on. It
> + * should be aligned with the zone sector size.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +        int64_t offset, int64_t len)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 0a8b4b426e..e3efba6db7 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,9 @@
>  #include <sys/param.h>
>  #include <sys/syscall.h>
>  #include <sys/vfs.h>
> +#if defined(CONFIG_BLKZONED)
> +#include <linux/blkzoned.h>
> +#endif
>  #include <linux/cdrom.h>
>  #include <linux/fd.h>
>  #include <linux/fs.h>
> @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
>              PreallocMode prealloc;
>              Error **errp;
>          } truncate;
> +        struct {
> +            unsigned int *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        struct {
> +            unsigned long zone_op;
> +        } zone_mgmt;
>      };
>  } RawPosixAIOData;
>
> @@ -1339,7 +1349,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  #endif
>
>      if (bs->sg || S_ISBLK(st.st_mode)) {
> -        int ret = hdev_get_max_hw_transfer(s->fd, &st);
> +        ret = hdev_get_max_hw_transfer(s->fd, &st);
>
>          if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
>              bs->bl.max_hw_transfer = ret;
> @@ -1356,6 +1366,27 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>          zoned = BLK_Z_NONE;
>      }
>      bs->bl.zoned = zoned;
> +    if (zoned != BLK_Z_NONE) {
> +        ret = get_sysfs_long_val(&st, "chunk_sectors");
> +        if (ret > 0) {
> +            bs->bl.zone_sectors = ret;
> +        }
> +
> +        ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
> +        if (ret > 0) {
> +            bs->bl.zone_append_max_bytes = ret;
> +        }
> +
> +        ret = get_sysfs_long_val(&st, "max_open_zones");
> +        if (ret >= 0) {
> +            bs->bl.max_open_zones = ret;
> +        }
> +
> +        ret = get_sysfs_long_val(&st, "max_active_zones");
> +        if (ret >= 0) {
> +            bs->bl.max_active_zones = ret;
> +        }
> +    }
>  }
>
>  static int check_for_dasd(int fd)
> @@ -1850,6 +1881,136 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
>  }
>  #endif
>
> +/*
> + * parse_zone - Fill a zone descriptor
> + */
> +#if defined(CONFIG_BLKZONED)
> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> +                              const struct blk_zone *blkz) {
> +    zone->start = blkz->start;
> +    zone->length = blkz->len;
> +    zone->cap = blkz->capacity;
> +    zone->wp = blkz->wp;
> +
> +    switch (blkz->type) {
> +    case BLK_ZONE_TYPE_SEQWRITE_REQ:
> +        zone->type = BLK_ZT_SWR;
> +        break;
> +    case BLK_ZONE_TYPE_SEQWRITE_PREF:
> +        zone->type = BLK_ZT_SWP;
> +        break;
> +    case BLK_ZONE_TYPE_CONVENTIONAL:
> +        zone->type = BLK_ZT_CONV;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    switch (blkz->cond) {
> +    case BLK_ZONE_COND_NOT_WP:
> +        zone->cond = BLK_ZS_NOT_WP;
> +        break;
> +    case BLK_ZONE_COND_EMPTY:
> +        zone->cond = BLK_ZS_EMPTY;
> +        break;
> +    case BLK_ZONE_COND_IMP_OPEN:
> +        zone->cond =BLK_ZS_IOPEN;
> +        break;
> +    case BLK_ZONE_COND_EXP_OPEN:
> +        zone->cond = BLK_ZS_EOPEN;
> +        break;
> +    case BLK_ZONE_COND_CLOSED:
> +        zone->cond = BLK_ZS_CLOSED;
> +        break;
> +    case BLK_ZONE_COND_READONLY:
> +        zone->cond = BLK_ZS_RDONLY;
> +        break;
> +    case BLK_ZONE_COND_FULL:
> +        zone->cond = BLK_ZS_FULL;
> +        break;
> +    case BLK_ZONE_COND_OFFLINE:
> +        zone->cond = BLK_ZS_OFFLINE;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +}
> +#endif
> +
> +static int handle_aiocb_zone_report(void *opaque) {
> +#if defined(CONFIG_BLKZONED)
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> +    /* zoned block devices use 512-byte sectors */
> +    int64_t sector = aiocb->aio_offset / 512;
> +
> +    struct blk_zone *blkz;
> +    int64_t rep_size;
> +    unsigned int nrz;
> +    int ret, n = 0, i = 0;
> +
> +    nrz = *nr_zones;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +    rep = g_malloc(rep_size);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = sector;
> +        rep->nr_zones = nrz - n;
> +
> +        do {
> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> +        } while (ret != 0 && errno == EINTR);
> +        if (ret != 0) {
> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +                         fd, sector, errno);
> +            return -errno;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            parse_zone(&zones[n], &blkz[i]);
> +            /* The next report should start after the last zone reported */
> +            sector = blkz[i].start + blkz[i].len;
> +        }
> +    }
> +
> +    *nr_zones = n;
> +    return 0;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
> +static int handle_aiocb_zone_mgmt(void *opaque) {
> +#if defined(CONFIG_BLKZONED)
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t sector = aiocb->aio_offset;
> +    int64_t nr_sectors = aiocb->aio_nbytes;

Should be:
+    int64_t sector = aiocb->aio_offset / 512;
+    int64_t nr_sectors = aiocb->aio_nbytes / 512;

To be clear with the unit of zone commands, specify here as we discussed before:
Block layer APIs use byte unit for offset, len while virtio-blk device
tends to use sector unit. So blkzone in the guest passes sector unit
of offset, len, which should be multiply by 512.
Besides, pwrite()/pwritev() that zone append command takes advantage
of uses byte offset.


> +    struct blk_zone_range range;
> +    int ret;
> +
> +    /* Execute the operation */
> +    range.sector = sector;
> +    range.nr_sectors = nr_sectors;
> +    do {
> +        ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
> +    } while (ret != 0 && errno == EINTR);
> +
> +    return ret;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>  static int handle_aiocb_copy_range(void *opaque)
>  {
>      RawPosixAIOData *aiocb = opaque;
> @@ -3022,6 +3183,118 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>      }
>  }
>
> +/*
> + * zone report - Get a zone block device's information in the form
> + * of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: an array of zone descriptors to hold zone
> + * information on reply
> + * @param offset: offset can be any byte within the zone size.
> + * @param len: (not sure yet.
> + * @return 0 on success, -1 on failure
> + */
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                                           unsigned int *nr_zones,
> +                                           BlockZoneDescriptor *zones) {
> +#if defined(CONFIG_BLKZONED)
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_ZONE_REPORT,
> +        .aio_offset = offset,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
> +/*
> + * zone management operations - Execute an operation on a zone
> + */
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> +        int64_t offset, int64_t len) {
> +#if defined(CONFIG_BLKZONED)
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +    int64_t zone_sector, zone_sector_mask;
> +    const char *ioctl_name;
> +    unsigned long zone_op;
> +    int ret;
> +
> +    struct stat st;
> +    if (fstat(s->fd, &st) < 0) {
> +        ret = -errno;
> +        return ret;
> +    }
> +    zone_sector = bs->bl.zone_sectors;
> +    zone_sector_mask = zone_sector - 1;
> +    if (offset & zone_sector_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId64 "", offset, zone_sector);
> +        return -EINVAL;
> +    }
> +
> +    if (len & zone_sector_mask) {
> +        error_report("number of sectors %" PRId64 " is not aligned to zone size"
> +                      " %" PRId64 "", len, zone_sector);
> +        return -EINVAL;
> +    }
> +
> +    switch (op) {
> +    case BLK_ZO_OPEN:
> +        ioctl_name = "BLKOPENZONE";
> +        zone_op = BLKOPENZONE;
> +        break;
> +    case BLK_ZO_CLOSE:
> +        ioctl_name = "BLKCLOSEZONE";
> +        zone_op = BLKCLOSEZONE;
> +        break;
> +    case BLK_ZO_FINISH:
> +        ioctl_name = "BLKFINISHZONE";
> +        zone_op = BLKFINISHZONE;
> +        break;
> +    case BLK_ZO_RESET:
> +        ioctl_name = "BLKRESETZONE";
> +        zone_op = BLKRESETZONE;
> +        break;
> +    default:
> +        error_report("Invalid zone operation 0x%x", op);
> +        return -EINVAL;
> +    }
> +
> +    acb = (RawPosixAIOData) {
> +        .bs             = bs,
> +        .aio_fildes     = s->fd,
> +        .aio_type       = QEMU_AIO_ZONE_MGMT,
> +        .aio_offset     = offset,
> +        .aio_nbytes     = len,
> +        .zone_mgmt  = {
> +                .zone_op = zone_op,
> +        },
> +    };
> +
> +    ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> +    if (ret != 0) {
> +        error_report("ioctl %s failed %d", ioctl_name, errno);
> +        return -errno;
> +    }
> +
> +    return ret;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
>  #endif
>  };
>
> +#if defined(CONFIG_BLKZONED)
> +static BlockDriver bdrv_zoned_host_device = {
> +        .format_name = "zoned_host_device",
> +        .protocol_name = "zoned_host_device",
> +        .instance_size = sizeof(BDRVRawState),
> +        .bdrv_needs_filename = true,
> +        .bdrv_probe_device  = hdev_probe_device,
> +        .bdrv_file_open     = hdev_open,
> +        .bdrv_close         = raw_close,
> +        .bdrv_reopen_prepare = raw_reopen_prepare,
> +        .bdrv_reopen_commit  = raw_reopen_commit,
> +        .bdrv_reopen_abort   = raw_reopen_abort,
> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> +        .create_opts         = &bdrv_create_opts_simple,
> +        .mutable_opts        = mutable_opts,
> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> +
> +        .bdrv_co_preadv         = raw_co_preadv,
> +        .bdrv_co_pwritev        = raw_co_pwritev,
> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> +        .bdrv_refresh_limits = raw_refresh_limits,
> +        .bdrv_io_plug = raw_aio_plug,
> +        .bdrv_io_unplug = raw_aio_unplug,
> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> +
> +        .bdrv_co_truncate       = raw_co_truncate,
> +        .bdrv_getlength = raw_getlength,
> +        .bdrv_get_info = raw_get_info,
> +        .bdrv_get_allocated_file_size
> +                            = raw_get_allocated_file_size,
> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> +        .bdrv_check_perm = raw_check_perm,
> +        .bdrv_set_perm   = raw_set_perm,
> +        .bdrv_abort_perm_update = raw_abort_perm_update,
> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> +        .bdrv_probe_geometry = hdev_probe_geometry,
> +        .bdrv_co_ioctl = hdev_co_ioctl,
> +
> +        /* zone management operations */
> +        .bdrv_co_zone_report = raw_co_zone_report,
> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +};
> +#endif
> +
>  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>  static void cdrom_parse_filename(const char *filename, QDict *options,
>                                   Error **errp)
> @@ -4012,6 +4333,9 @@ static void bdrv_file_init(void)
>      bdrv_register(&bdrv_file);
>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>      bdrv_register(&bdrv_host_device);
> +#if defined(CONFIG_BLKZONED)
> +    bdrv_register(&bdrv_zoned_host_device);
> +#endif
>  #ifdef __linux__
>      bdrv_register(&bdrv_host_cdrom);
>  #endif
> diff --git a/block/io.c b/block/io.c
> index 0a8cbefe86..de9ec1d740 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3198,6 +3198,47 @@ out:
>      return co.ret;
>  }
>
> +int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                        unsigned int *nr_zones,
> +                        BlockZoneDescriptor *zones)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_report) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> +        int64_t offset, int64_t len)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_mgmt) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index fd25ffa9be..65463b88d9 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
>  /* Ensure contents are flushed to disk.  */
>  int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
>
> +/* Report zone information of zone block device. */
> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                                     unsigned int *nr_zones,
> +                                     BlockZoneDescriptor *zones);
> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> +                                   int64_t offset, int64_t len);
> +
>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>  int bdrv_block_status(BlockDriverState *bs, int64_t offset,
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 7f7863cc9e..8541f36123 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -691,6 +691,12 @@ struct BlockDriver {
>                                            QEMUIOVector *qiov,
>                                            int64_t pos);
>
> +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> +            int64_t offset, unsigned int *nr_zones,
> +            BlockZoneDescriptor *zones);
> +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
> +            int64_t offset, int64_t len);
> +
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
>      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
> @@ -828,6 +834,21 @@ typedef struct BlockLimits {
>
>      /* device zone model */
>      BlockZoneModel zoned;
> +
> +    /* zone size expressed in 512-byte sectors */
> +    uint32_t zone_sectors;
> +
> +    /* total number of zones */
> +    unsigned int nr_zones;
> +
> +    /* maximum size in bytes of a zone append write operation */
> +    int64_t zone_append_max_bytes;
> +
> +    /* maximum number of open zones */
> +    int64_t max_open_zones;
> +
> +    /* maximum number of active zones */
> +    int64_t max_active_zones;
>  } BlockLimits;
>
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index 21fc10c4c9..3d26929cdd 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -29,6 +29,8 @@
>  #define QEMU_AIO_WRITE_ZEROES 0x0020
>  #define QEMU_AIO_COPY_RANGE   0x0040
>  #define QEMU_AIO_TRUNCATE     0x0080
> +#define QEMU_AIO_ZONE_REPORT  0x0100
> +#define QEMU_AIO_ZONE_MGMT    0x0200
>  #define QEMU_AIO_TYPE_MASK \
>          (QEMU_AIO_READ | \
>           QEMU_AIO_WRITE | \
> @@ -37,7 +39,9 @@
>           QEMU_AIO_DISCARD | \
>           QEMU_AIO_WRITE_ZEROES | \
>           QEMU_AIO_COPY_RANGE | \
> -         QEMU_AIO_TRUNCATE)
> +         QEMU_AIO_TRUNCATE  | \
> +         QEMU_AIO_ZONE_REPORT | \
> +         QEMU_AIO_ZONE_MGMT)
>
>  /* AIO flags */
>  #define QEMU_AIO_MISALIGNED   0x1000
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index 50f5aa2e07..6835525582 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -45,6 +45,12 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
>                              BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_flush(BlockBackend *blk,
>                            BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +                                unsigned int *nr_zones, BlockZoneDescriptor *zones,
> +                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +                              int64_t offset, int64_t len,
> +                              BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> @@ -156,6 +162,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
>  int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
>                                        int64_t bytes, BdrvRequestFlags flags);
>
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    unsigned int *nr_zones,
> +                                    BlockZoneDescriptor *zones);
> +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
> +                                         unsigned int *nr_zones,
> +                                         BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +                                  int64_t offset, int64_t len);
> +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +                                       int64_t offset, int64_t len);
> +
>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>                                        int64_t bytes);
>  int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
> diff --git a/meson.build b/meson.build
> index 20fddbd707..2f436bb355 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -1883,6 +1883,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('live_block_migration').al
>  # has_header
>  config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
>  config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
> +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
>  config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h'))
>  config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
>  config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 2173e7734a..c6bbb7a037 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2942,6 +2942,7 @@
>  # @compress: Since 5.0
>  # @copy-before-write: Since 6.2
>  # @snapshot-access: Since 7.0
> +# @zoned_host_device: Since 7.2
>  #
>  # Since: 2.9
>  ##
> @@ -2955,7 +2956,8 @@
>              'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
>              'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>              { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
> -            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
> +            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
> +            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
>
>  ##
>  # @BlockdevOptionsFile:
> @@ -4329,7 +4331,9 @@
>        'vhdx':       'BlockdevOptionsGenericFormat',
>        'vmdk':       'BlockdevOptionsGenericCOWFormat',
>        'vpc':        'BlockdevOptionsGenericFormat',
> -      'vvfat':      'BlockdevOptionsVVFAT'
> +      'vvfat':      'BlockdevOptionsVVFAT',
> +      'zoned_host_device': { 'type': 'BlockdevOptionsFile',
> +                             'if': 'CONFIG_BLKZONED' }
>    } }
>
>  ##
> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> index 952dc940f1..446a059603 100644
> --- a/qemu-io-cmds.c
> +++ b/qemu-io-cmds.c
> @@ -1712,6 +1712,144 @@ static const cmdinfo_t flush_cmd = {
>      .oneline    = "flush all in-core file state to disk",
>  };
>
> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int ret;
> +    int64_t offset;
> +    unsigned int nr_zones;
> +
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    nr_zones = cvtnum(argv[optind]);
> +
> +    g_autofree BlockZoneDescriptor *zones = NULL;
> +    zones = g_new(BlockZoneDescriptor, nr_zones);
> +    ret = blk_zone_report(blk, offset, &nr_zones, zones);
> +    if (ret < 0) {
> +        printf("zone report failed: %s\n", strerror(-ret));
> +    } else {
> +        for (int i = 0; i < nr_zones; ++i) {
> +            printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
> +                   "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
> +                   "zcond:%u, [type: %u]\n",
> +                   zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
> +                   zones[i].cond, zones[i].type);
> +        }
> +    }
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_report_cmd = {
> +        .name = "zone_report",
> +        .altname = "zrp",
> +        .cfunc = zone_report_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset number",
> +        .oneline = "report zone information",
> +};
> +
> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int ret;
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);
> +    ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
> +    if (ret < 0) {
> +        printf("zone open failed: %s\n", strerror(-ret));
> +    }
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_open_cmd = {
> +        .name = "zone_open",
> +        .altname = "zo",
> +        .cfunc = zone_open_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset len",
> +        .oneline = "explicit open a range of zones in zone block device",
> +};
> +
> +static int zone_close_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int ret;
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);
> +    ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
> +    if (ret < 0) {
> +        printf("zone close failed: %s\n", strerror(-ret));
> +    }
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_close_cmd = {
> +        .name = "zone_close",
> +        .altname = "zc",
> +        .cfunc = zone_close_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset len",
> +        .oneline = "close a range of zones in zone block device",
> +};
> +
> +static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int ret;
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);
> +    ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
> +    if (ret < 0) {
> +        printf("zone finish failed: %s\n", strerror(-ret));
> +    }
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_finish_cmd = {
> +        .name = "zone_finish",
> +        .altname = "zf",
> +        .cfunc = zone_finish_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset len",
> +        .oneline = "finish a range of zones in zone block device",
> +};
> +
> +static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int ret;
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);
> +    ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
> +    if (ret < 0) {
> +        printf("zone reset failed: %s\n", strerror(-ret));
> +    }
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_reset_cmd = {
> +        .name = "zone_reset",
> +        .altname = "zrs",
> +        .cfunc = zone_reset_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset len",
> +        .oneline = "reset a zone write pointer in zone block device",
> +};
> +
>  static int truncate_f(BlockBackend *blk, int argc, char **argv);
>  static const cmdinfo_t truncate_cmd = {
>      .name       = "truncate",
> @@ -2504,6 +2642,11 @@ static void __attribute((constructor)) init_qemuio_commands(void)
>      qemuio_add_command(&aio_write_cmd);
>      qemuio_add_command(&aio_flush_cmd);
>      qemuio_add_command(&flush_cmd);
> +    qemuio_add_command(&zone_report_cmd);
> +    qemuio_add_command(&zone_open_cmd);
> +    qemuio_add_command(&zone_close_cmd);
> +    qemuio_add_command(&zone_finish_cmd);
> +    qemuio_add_command(&zone_reset_cmd);
>      qemuio_add_command(&truncate_cmd);
>      qemuio_add_command(&length_cmd);
>      qemuio_add_command(&info_cmd);
> --
> 2.37.2
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
@ 2022-08-29 12:52 Sam Li
  2022-08-29 13:00 ` Sam Li
  0 siblings, 1 reply; 14+ messages in thread
From: Sam Li @ 2022-08-29 12:52 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, damien.lemoal, Dmitry.Fomichev, hare, qemu-block,
	hreitz, eblake, armbru, fam, kwolf, Sam Li

By adding zone management operations in BlockDriver, storage controller
emulation can use the new block layer APIs including Report Zone and
four zone management operations (open, close, finish, reset).

Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
zone_close(zc), zone_reset(zrs), zone_finish(zf).

For example, to test zone_report, use following command:
$ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
-c "zrp offset nr_zones"

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 block/block-backend.c             |  51 +++++
 block/file-posix.c                | 326 +++++++++++++++++++++++++++++-
 block/io.c                        |  41 ++++
 include/block/block-io.h          |   7 +
 include/block/block_int-common.h  |  21 ++
 include/block/raw-aio.h           |   6 +-
 include/sysemu/block-backend-io.h |  17 ++
 meson.build                       |   1 +
 qapi/block-core.json              |   8 +-
 qemu-io-cmds.c                    | 143 +++++++++++++
 10 files changed, 617 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index d4a5df2ac2..c5798651df 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1775,6 +1775,57 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
     return ret;
 }
 
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * op is the zone operation.
+ * offset is the starting zone specified as a sector offset.
+ * len is the maximum number of sectors the command should operate on. It
+ * should be aligned with the zone sector size.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    IO_CODE();
+
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 0a8b4b426e..e3efba6db7 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,9 @@
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
             PreallocMode prealloc;
             Error **errp;
         } truncate;
+        struct {
+            unsigned int *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        struct {
+            unsigned long zone_op;
+        } zone_mgmt;
     };
 } RawPosixAIOData;
 
@@ -1339,7 +1349,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 #endif
 
     if (bs->sg || S_ISBLK(st.st_mode)) {
-        int ret = hdev_get_max_hw_transfer(s->fd, &st);
+        ret = hdev_get_max_hw_transfer(s->fd, &st);
 
         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
             bs->bl.max_hw_transfer = ret;
@@ -1356,6 +1366,27 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
         zoned = BLK_Z_NONE;
     }
     bs->bl.zoned = zoned;
+    if (zoned != BLK_Z_NONE) {
+        ret = get_sysfs_long_val(&st, "chunk_sectors");
+        if (ret > 0) {
+            bs->bl.zone_sectors = ret;
+        }
+
+        ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
+        if (ret > 0) {
+            bs->bl.zone_append_max_bytes = ret;
+        }
+
+        ret = get_sysfs_long_val(&st, "max_open_zones");
+        if (ret >= 0) {
+            bs->bl.max_open_zones = ret;
+        }
+
+        ret = get_sysfs_long_val(&st, "max_active_zones");
+        if (ret >= 0) {
+            bs->bl.max_active_zones = ret;
+        }
+    }
 }
 
 static int check_for_dasd(int fd)
@@ -1850,6 +1881,136 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+#if defined(CONFIG_BLKZONED)
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+                              const struct blk_zone *blkz) {
+    zone->start = blkz->start;
+    zone->length = blkz->len;
+    zone->cap = blkz->capacity;
+    zone->wp = blkz->wp;
+
+    switch (blkz->type) {
+    case BLK_ZONE_TYPE_SEQWRITE_REQ:
+        zone->type = BLK_ZT_SWR;
+        break;
+    case BLK_ZONE_TYPE_SEQWRITE_PREF:
+        zone->type = BLK_ZT_SWP;
+        break;
+    case BLK_ZONE_TYPE_CONVENTIONAL:
+        zone->type = BLK_ZT_CONV;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    switch (blkz->cond) {
+    case BLK_ZONE_COND_NOT_WP:
+        zone->cond = BLK_ZS_NOT_WP;
+        break;
+    case BLK_ZONE_COND_EMPTY:
+        zone->cond = BLK_ZS_EMPTY;
+        break;
+    case BLK_ZONE_COND_IMP_OPEN:
+        zone->cond =BLK_ZS_IOPEN;
+        break;
+    case BLK_ZONE_COND_EXP_OPEN:
+        zone->cond = BLK_ZS_EOPEN;
+        break;
+    case BLK_ZONE_COND_CLOSED:
+        zone->cond = BLK_ZS_CLOSED;
+        break;
+    case BLK_ZONE_COND_READONLY:
+        zone->cond = BLK_ZS_RDONLY;
+        break;
+    case BLK_ZONE_COND_FULL:
+        zone->cond = BLK_ZS_FULL;
+        break;
+    case BLK_ZONE_COND_OFFLINE:
+        zone->cond = BLK_ZS_OFFLINE;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+#endif
+
+static int handle_aiocb_zone_report(void *opaque) {
+#if defined(CONFIG_BLKZONED)
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    /* zoned block devices use 512-byte sectors */
+    int64_t sector = aiocb->aio_offset / 512;
+
+    struct blk_zone *blkz;
+    int64_t rep_size;
+    unsigned int nrz;
+    int ret, n = 0, i = 0;
+
+    nrz = *nr_zones;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+    rep = g_malloc(rep_size);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                         fd, sector, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            parse_zone(&zones[n], &blkz[i]);
+            /* The next report should start after the last zone reported */
+            sector = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+#else
+    return -ENOTSUP;
+#endif
+}
+
+static int handle_aiocb_zone_mgmt(void *opaque) {
+#if defined(CONFIG_BLKZONED)
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t sector = aiocb->aio_offset;
+    int64_t nr_sectors = aiocb->aio_nbytes;
+    struct blk_zone_range range;
+    int ret;
+
+    /* Execute the operation */
+    range.sector = sector;
+    range.nr_sectors = nr_sectors;
+    do {
+        ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
+    } while (ret != 0 && errno == EINTR);
+
+    return ret;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static int handle_aiocb_copy_range(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
@@ -3022,6 +3183,118 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
     }
 }
 
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ *
+ * @param bs: passing zone block device file descriptor
+ * @param zones: an array of zone descriptors to hold zone
+ * information on reply
+ * @param offset: offset can be any byte within the zone size.
+ * @param len: (not sure yet.
+ * @return 0 on success, -1 on failure
+ */
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                           unsigned int *nr_zones,
+                                           BlockZoneDescriptor *zones) {
+#if defined(CONFIG_BLKZONED)
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs         = bs,
+        .aio_fildes = s->fd,
+        .aio_type   = QEMU_AIO_ZONE_REPORT,
+        .aio_offset = offset,
+        .zone_report    = {
+                .nr_zones       = nr_zones,
+                .zones          = zones,
+        },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
+#else
+    return -ENOTSUP;
+#endif
+}
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len) {
+#if defined(CONFIG_BLKZONED)
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+    int64_t zone_sector, zone_sector_mask;
+    const char *ioctl_name;
+    unsigned long zone_op;
+    int ret;
+
+    struct stat st;
+    if (fstat(s->fd, &st) < 0) {
+        ret = -errno;
+        return ret;
+    }
+    zone_sector = bs->bl.zone_sectors;
+    zone_sector_mask = zone_sector - 1;
+    if (offset & zone_sector_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId64 "", offset, zone_sector);
+        return -EINVAL;
+    }
+
+    if (len & zone_sector_mask) {
+        error_report("number of sectors %" PRId64 " is not aligned to zone size"
+                      " %" PRId64 "", len, zone_sector);
+        return -EINVAL;
+    }
+
+    switch (op) {
+    case BLK_ZO_OPEN:
+        ioctl_name = "BLKOPENZONE";
+        zone_op = BLKOPENZONE;
+        break;
+    case BLK_ZO_CLOSE:
+        ioctl_name = "BLKCLOSEZONE";
+        zone_op = BLKCLOSEZONE;
+        break;
+    case BLK_ZO_FINISH:
+        ioctl_name = "BLKFINISHZONE";
+        zone_op = BLKFINISHZONE;
+        break;
+    case BLK_ZO_RESET:
+        ioctl_name = "BLKRESETZONE";
+        zone_op = BLKRESETZONE;
+        break;
+    default:
+        error_report("Invalid zone operation 0x%x", op);
+        return -EINVAL;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs             = bs,
+        .aio_fildes     = s->fd,
+        .aio_type       = QEMU_AIO_ZONE_MGMT,
+        .aio_offset     = offset,
+        .aio_nbytes     = len,
+        .zone_mgmt  = {
+                .zone_op = zone_op,
+        },
+    };
+
+    ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
+    if (ret != 0) {
+        error_report("ioctl %s failed %d", ioctl_name, errno);
+        return -errno;
+    }
+
+    return ret;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -3752,6 +4025,54 @@ static BlockDriver bdrv_host_device = {
 #endif
 };
 
+#if defined(CONFIG_BLKZONED)
+static BlockDriver bdrv_zoned_host_device = {
+        .format_name = "zoned_host_device",
+        .protocol_name = "zoned_host_device",
+        .instance_size = sizeof(BDRVRawState),
+        .bdrv_needs_filename = true,
+        .bdrv_probe_device  = hdev_probe_device,
+        .bdrv_file_open     = hdev_open,
+        .bdrv_close         = raw_close,
+        .bdrv_reopen_prepare = raw_reopen_prepare,
+        .bdrv_reopen_commit  = raw_reopen_commit,
+        .bdrv_reopen_abort   = raw_reopen_abort,
+        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+        .create_opts         = &bdrv_create_opts_simple,
+        .mutable_opts        = mutable_opts,
+        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
+        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+
+        .bdrv_co_preadv         = raw_co_preadv,
+        .bdrv_co_pwritev        = raw_co_pwritev,
+        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
+        .bdrv_co_pdiscard       = hdev_co_pdiscard,
+        .bdrv_co_copy_range_from = raw_co_copy_range_from,
+        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
+        .bdrv_refresh_limits = raw_refresh_limits,
+        .bdrv_io_plug = raw_aio_plug,
+        .bdrv_io_unplug = raw_aio_unplug,
+        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+
+        .bdrv_co_truncate       = raw_co_truncate,
+        .bdrv_getlength = raw_getlength,
+        .bdrv_get_info = raw_get_info,
+        .bdrv_get_allocated_file_size
+                            = raw_get_allocated_file_size,
+        .bdrv_get_specific_stats = hdev_get_specific_stats,
+        .bdrv_check_perm = raw_check_perm,
+        .bdrv_set_perm   = raw_set_perm,
+        .bdrv_abort_perm_update = raw_abort_perm_update,
+        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+        .bdrv_probe_geometry = hdev_probe_geometry,
+        .bdrv_co_ioctl = hdev_co_ioctl,
+
+        /* zone management operations */
+        .bdrv_co_zone_report = raw_co_zone_report,
+        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+};
+#endif
+
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                  Error **errp)
@@ -4012,6 +4333,9 @@ static void bdrv_file_init(void)
     bdrv_register(&bdrv_file);
 #if defined(HAVE_HOST_BLOCK_DEVICE)
     bdrv_register(&bdrv_host_device);
+#if defined(CONFIG_BLKZONED)
+    bdrv_register(&bdrv_zoned_host_device);
+#endif
 #ifdef __linux__
     bdrv_register(&bdrv_host_cdrom);
 #endif
diff --git a/block/io.c b/block/io.c
index 0a8cbefe86..de9ec1d740 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3198,6 +3198,47 @@ out:
     return co.ret;
 }
 
+int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                        unsigned int *nr_zones,
+                        BlockZoneDescriptor *zones)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_report) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+        int64_t offset, int64_t len)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_mgmt) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
     IO_CODE();
diff --git a/include/block/block-io.h b/include/block/block-io.h
index fd25ffa9be..65463b88d9 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
 /* Ensure contents are flushed to disk.  */
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
 
+/* Report zone information of zone block device. */
+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                     unsigned int *nr_zones,
+                                     BlockZoneDescriptor *zones);
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+                                   int64_t offset, int64_t len);
+
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
 int bdrv_block_status(BlockDriverState *bs, int64_t offset,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 7f7863cc9e..8541f36123 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -691,6 +691,12 @@ struct BlockDriver {
                                           QEMUIOVector *qiov,
                                           int64_t pos);
 
+    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+            int64_t offset, unsigned int *nr_zones,
+            BlockZoneDescriptor *zones);
+    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
+            int64_t offset, int64_t len);
+
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
     void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
@@ -828,6 +834,21 @@ typedef struct BlockLimits {
 
     /* device zone model */
     BlockZoneModel zoned;
+
+    /* zone size expressed in 512-byte sectors */
+    uint32_t zone_sectors;
+
+    /* total number of zones */
+    unsigned int nr_zones;
+
+    /* maximum size in bytes of a zone append write operation */
+    int64_t zone_append_max_bytes;
+
+    /* maximum number of open zones */
+    int64_t max_open_zones;
+
+    /* maximum number of active zones */
+    int64_t max_active_zones;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 21fc10c4c9..3d26929cdd 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -29,6 +29,8 @@
 #define QEMU_AIO_WRITE_ZEROES 0x0020
 #define QEMU_AIO_COPY_RANGE   0x0040
 #define QEMU_AIO_TRUNCATE     0x0080
+#define QEMU_AIO_ZONE_REPORT  0x0100
+#define QEMU_AIO_ZONE_MGMT    0x0200
 #define QEMU_AIO_TYPE_MASK \
         (QEMU_AIO_READ | \
          QEMU_AIO_WRITE | \
@@ -37,7 +39,9 @@
          QEMU_AIO_DISCARD | \
          QEMU_AIO_WRITE_ZEROES | \
          QEMU_AIO_COPY_RANGE | \
-         QEMU_AIO_TRUNCATE)
+         QEMU_AIO_TRUNCATE  | \
+         QEMU_AIO_ZONE_REPORT | \
+         QEMU_AIO_ZONE_MGMT)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index 50f5aa2e07..6835525582 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -45,6 +45,12 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                             BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                           BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+                                unsigned int *nr_zones, BlockZoneDescriptor *zones,
+                                BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                              int64_t offset, int64_t len,
+                              BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
                              BlockCompletionFunc *cb, void *opaque);
 void blk_aio_cancel_async(BlockAIOCB *acb);
@@ -156,6 +162,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                       int64_t bytes, BdrvRequestFlags flags);
 
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones);
+int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
+                                         unsigned int *nr_zones,
+                                         BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                                  int64_t offset, int64_t len);
+int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                                       int64_t offset, int64_t len);
+
 int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
                                       int64_t bytes);
 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
diff --git a/meson.build b/meson.build
index 20fddbd707..2f436bb355 100644
--- a/meson.build
+++ b/meson.build
@@ -1883,6 +1883,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('live_block_migration').al
 # has_header
 config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
 config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
+config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
 config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h'))
 config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
 config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 2173e7734a..c6bbb7a037 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2942,6 +2942,7 @@
 # @compress: Since 5.0
 # @copy-before-write: Since 6.2
 # @snapshot-access: Since 7.0
+# @zoned_host_device: Since 7.2
 #
 # Since: 2.9
 ##
@@ -2955,7 +2956,8 @@
             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
             'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
             { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
-            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
+            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
 
 ##
 # @BlockdevOptionsFile:
@@ -4329,7 +4331,9 @@
       'vhdx':       'BlockdevOptionsGenericFormat',
       'vmdk':       'BlockdevOptionsGenericCOWFormat',
       'vpc':        'BlockdevOptionsGenericFormat',
-      'vvfat':      'BlockdevOptionsVVFAT'
+      'vvfat':      'BlockdevOptionsVVFAT',
+      'zoned_host_device': { 'type': 'BlockdevOptionsFile',
+                             'if': 'CONFIG_BLKZONED' }
   } }
 
 ##
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 952dc940f1..446a059603 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1712,6 +1712,144 @@ static const cmdinfo_t flush_cmd = {
     .oneline    = "flush all in-core file state to disk",
 };
 
+static int zone_report_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset;
+    unsigned int nr_zones;
+
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    nr_zones = cvtnum(argv[optind]);
+
+    g_autofree BlockZoneDescriptor *zones = NULL;
+    zones = g_new(BlockZoneDescriptor, nr_zones);
+    ret = blk_zone_report(blk, offset, &nr_zones, zones);
+    if (ret < 0) {
+        printf("zone report failed: %s\n", strerror(-ret));
+    } else {
+        for (int i = 0; i < nr_zones; ++i) {
+            printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
+                   "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
+                   "zcond:%u, [type: %u]\n",
+                   zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
+                   zones[i].cond, zones[i].type);
+        }
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_report_cmd = {
+        .name = "zone_report",
+        .altname = "zrp",
+        .cfunc = zone_report_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset number",
+        .oneline = "report zone information",
+};
+
+static int zone_open_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
+    if (ret < 0) {
+        printf("zone open failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_open_cmd = {
+        .name = "zone_open",
+        .altname = "zo",
+        .cfunc = zone_open_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "explicit open a range of zones in zone block device",
+};
+
+static int zone_close_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
+    if (ret < 0) {
+        printf("zone close failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_close_cmd = {
+        .name = "zone_close",
+        .altname = "zc",
+        .cfunc = zone_close_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "close a range of zones in zone block device",
+};
+
+static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
+    if (ret < 0) {
+        printf("zone finish failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_finish_cmd = {
+        .name = "zone_finish",
+        .altname = "zf",
+        .cfunc = zone_finish_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "finish a range of zones in zone block device",
+};
+
+static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
+    if (ret < 0) {
+        printf("zone reset failed: %s\n", strerror(-ret));
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_reset_cmd = {
+        .name = "zone_reset",
+        .altname = "zrs",
+        .cfunc = zone_reset_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset len",
+        .oneline = "reset a zone write pointer in zone block device",
+};
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
     .name       = "truncate",
@@ -2504,6 +2642,11 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&aio_write_cmd);
     qemuio_add_command(&aio_flush_cmd);
     qemuio_add_command(&flush_cmd);
+    qemuio_add_command(&zone_report_cmd);
+    qemuio_add_command(&zone_open_cmd);
+    qemuio_add_command(&zone_close_cmd);
+    qemuio_add_command(&zone_finish_cmd);
+    qemuio_add_command(&zone_reset_cmd);
     qemuio_add_command(&truncate_cmd);
     qemuio_add_command(&length_cmd);
     qemuio_add_command(&info_cmd);
-- 
2.37.2



^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-09-29  6:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-26 16:17 [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
2022-08-29 19:29 ` Stefan Hajnoczi
2022-08-30 11:57 ` Markus Armbruster
2022-08-30 15:05   ` Sam Li
2022-08-30 15:09     ` Markus Armbruster
2022-08-30 15:19       ` Sam Li
2022-08-31  8:35     ` Markus Armbruster
2022-08-31  8:48       ` Sam Li
2022-09-01 14:57         ` Markus Armbruster
2022-09-01 16:18           ` Markus Armbruster
2022-09-02  2:13           ` Damien Le Moal
2022-09-29  6:22             ` Markus Armbruster
2022-08-29 12:52 Sam Li
2022-08-29 13:00 ` Sam Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.