All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v3 0/5] *** Add support for zoned device ***
@ 2022-06-27  0:19 Sam Li
  2022-06-27  0:19 ` [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
                   ` (4 more replies)
  0 siblings, 5 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  0:19 UTC (permalink / raw)
  To: qemu-devel
  Cc: hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block, Sam Li

*** This patch series adds support for zoned device to virtio-blk emulation. Zoned
Storage can support sequential writes, which reduces write amplification in SSD,
leading to higher write throughput and increased capacity.

v3:
- add block layer APIs resembling Linux ZoneBlockDevice ioctls (developing) ***

Sam Li (5):
  block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  qemu-io: add zoned block device operations.
  file-posix: introduce get_sysfs_long_val for zoned device information.
  file-posix: introduce get_sysfs_str_val for device zoned model.
  qemu-iotests: add zone operation tests.

 block/block-backend.c             |  56 +++++
 block/coroutines.h                |   5 +
 block/file-posix.c                | 325 +++++++++++++++++++++++++++++-
 block/io.c                        |  21 ++
 include/block/block-common.h      |  43 +++-
 include/block/block-io.h          |  13 ++
 include/block/block_int-common.h  |  20 ++
 qemu-io-cmds.c                    | 121 +++++++++++
 tests/qemu-iotests/tests/zoned.sh |  49 +++++
 9 files changed, 645 insertions(+), 8 deletions(-)
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

-- 
2.35.3



^ permalink raw reply	[flat|nested] 32+ messages in thread

* [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-27  0:19 [RFC v3 0/5] *** Add support for zoned device *** Sam Li
@ 2022-06-27  0:19 ` Sam Li
  2022-06-27  7:21   ` Hannes Reinecke
  2022-06-27 14:15   ` Stefan Hajnoczi
  2022-06-27  0:19 ` [RFC v3 2/5] qemu-io: add zoned block device operations Sam Li
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  0:19 UTC (permalink / raw)
  To: qemu-devel
  Cc: hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block, Sam Li

By adding zone management operations in BlockDriver, storage
controller emulation can use the new block layer APIs including
zone_report and zone_mgmt(open, close, finish, reset).
---
 block/block-backend.c            |  56 ++++++++
 block/coroutines.h               |   5 +
 block/file-posix.c               | 238 +++++++++++++++++++++++++++++++
 include/block/block-common.h     |  43 +++++-
 include/block/block_int-common.h |  20 +++
 5 files changed, 361 insertions(+), 1 deletion(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index e0e1aff4b1..786f964d02 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
     return ret;
 }
 
+/*
+ * Return zone_report from BlockDriver. Offset can be any number within
+ * the zone size. No alignment for offset and len.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                       int64_t len, int64_t *nr_zones,
+                       BlockZoneDescriptor *zones)
+{
+    int ret;
+    BlockDriverState *bs;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    bs = blk_bs(blk);
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    bdrv_inc_in_flight(bs);
+    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
+                              nr_zones, zones);
+    bdrv_dec_in_flight(bs);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Return zone_mgmt from BlockDriver.
+ * Offset is the start of a zone and len is aligned to zones.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    BlockDriverState *bs;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    bs = blk_bs(blk);
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    bdrv_inc_in_flight(bs);
+    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
+    bdrv_dec_in_flight(bs);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/coroutines.h b/block/coroutines.h
index 830ecaa733..a114d7bc30 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -80,6 +80,11 @@ int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
 
 int coroutine_fn blk_co_do_flush(BlockBackend *blk);
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    int64_t len, int64_t *nr_zones,
+                                    BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len);
 
 
 /*
diff --git a/block/file-posix.c b/block/file-posix.c
index 48cd096624..1b8b0d351f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,7 @@
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#include <linux/blkzoned.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
             PreallocMode prealloc;
             Error **errp;
         } truncate;
+        struct {
+            int64_t *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        zone_op op;
     };
 } RawPosixAIOData;
 
@@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+                              struct blk_zone *blkz) {
+    zone->start = blkz->start;
+    zone->length = blkz->len;
+    zone->cap = blkz->capacity;
+    zone->wp = blkz->wp - blkz->start;
+    zone->type = blkz->type;
+    zone->cond = blkz->cond;
+}
+
+static int handle_aiocb_zone_report(void *opaque) {
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    int64_t offset = aiocb->aio_offset;
+    int64_t len = aiocb->aio_nbytes;
+
+    struct blk_zone *blkz;
+    int64_t rep_size, nrz;
+    int ret, n = 0, i = 0;
+
+    nrz = *nr_zones;
+    if (len == -1) {
+        return -errno;
+    }
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
+    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
+    printf("start to report zone with offset: 0x%lx\n", offset);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = offset;
+        rep->nr_zones = nrz;
+
+        ret = ioctl(fd, BLKREPORTZONE, rep);
+        if (ret != 0) {
+            ret = -errno;
+            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
+                         fd, offset, errno);
+            return ret;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            parse_zone(&zones[n], &blkz[i]);
+            /* The next report should start after the last zone reported */
+            offset = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+}
+
+static int handle_aiocb_zone_mgmt(void *opaque) {
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t offset = aiocb->aio_offset;
+    int64_t len = aiocb->aio_nbytes;
+    zone_op op = aiocb->op;
+
+    struct blk_zone_range range;
+    const char *ioctl_name;
+    unsigned long ioctl_op;
+    int64_t zone_size;
+    int64_t zone_size_mask;
+    int ret;
+
+    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
+    if (ret) {
+        return -1;
+    }
+
+    zone_size_mask = zone_size - 1;
+    if (offset & zone_size_mask) {
+        error_report("offset is not the start of a zone");
+        return -1;
+    }
+
+    if (len & zone_size_mask) {
+        error_report("len is not aligned to zones");
+        return -1;
+    }
+
+    switch (op) {
+    case zone_open:
+        ioctl_name = "BLKOPENZONE";
+        ioctl_op = BLKOPENZONE;
+        break;
+    case zone_close:
+        ioctl_name = "BLKCLOSEZONE";
+        ioctl_op = BLKCLOSEZONE;
+        break;
+    case zone_finish:
+        ioctl_name = "BLKFINISHZONE";
+        ioctl_op = BLKFINISHZONE;
+        break;
+    case zone_reset:
+        ioctl_name = "BLKRESETZONE";
+        ioctl_op = BLKRESETZONE;
+        break;
+    default:
+        error_report("Invalid zone operation 0x%x", op);
+        errno = -EINVAL;
+        return -1;
+    }
+
+    /* Execute the operation */
+    range.sector = offset;
+    range.nr_sectors = len;
+    ret = ioctl(fd, ioctl_op, &range);
+    if (ret != 0) {
+        error_report("ioctl %s failed %d",
+                     ioctl_name, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
 static int handle_aiocb_copy_range(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
@@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
     }
 }
 
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ *
+ * @param bs: passing zone block device file descriptor
+ * @param zones: an array of zone descriptors to hold zone
+ * information on reply
+ * @param offset: offset can be any byte within the zone size.
+ * @param len: (not sure yet.
+ * @return 0 on success, -1 on failure
+ */
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+        int64_t len, int64_t *nr_zones,
+        BlockZoneDescriptor *zones) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs         = bs,
+        .aio_fildes = s->fd,
+        .aio_type   = QEMU_AIO_IOCTL,
+        .aio_offset = offset,
+        .aio_nbytes = len,
+        .zone_report    = {
+                .nr_zones       = nr_zones,
+                .zones          = zones,
+                },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
+}
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
+        int64_t offset, int64_t len) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs             = bs,
+        .aio_fildes     = s->fd,
+        .aio_type       = QEMU_AIO_IOCTL,
+        .aio_offset     = offset,
+        .aio_nbytes     = len,
+        .op             = op,
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
+}
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = {
     .bdrv_abort_perm_update = raw_abort_perm_update,
     .create_opts = &raw_create_opts,
     .mutable_opts = mutable_opts,
+
+    .bdrv_co_zone_report = raw_co_zone_report,
+    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
 };
 
 /***********************************************/
@@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = {
 #endif
 };
 
+static BlockDriver bdrv_zoned_host_device = {
+        .format_name = "zoned_host_device",
+        .protocol_name = "zoned_host_device",
+        .instance_size = sizeof(BDRVRawState),
+        .bdrv_needs_filename = true,
+        .bdrv_probe_device  = hdev_probe_device,
+        .bdrv_parse_filename = hdev_parse_filename,
+        .bdrv_file_open     = hdev_open,
+        .bdrv_close         = raw_close,
+        .bdrv_reopen_prepare = raw_reopen_prepare,
+        .bdrv_reopen_commit  = raw_reopen_commit,
+        .bdrv_reopen_abort   = raw_reopen_abort,
+        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+        .create_opts         = &bdrv_create_opts_simple,
+        .mutable_opts        = mutable_opts,
+        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
+        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+
+        .bdrv_co_preadv         = raw_co_preadv,
+        .bdrv_co_pwritev        = raw_co_pwritev,
+        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
+        .bdrv_co_pdiscard       = hdev_co_pdiscard,
+        .bdrv_co_copy_range_from = raw_co_copy_range_from,
+        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
+        .bdrv_refresh_limits = raw_refresh_limits,
+        .bdrv_io_plug = raw_aio_plug,
+        .bdrv_io_unplug = raw_aio_unplug,
+        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+
+        .bdrv_co_truncate       = raw_co_truncate,
+        .bdrv_getlength = raw_getlength,
+        .bdrv_get_info = raw_get_info,
+        .bdrv_get_allocated_file_size
+                            = raw_get_allocated_file_size,
+        .bdrv_get_specific_stats = hdev_get_specific_stats,
+        .bdrv_check_perm = raw_check_perm,
+        .bdrv_set_perm   = raw_set_perm,
+        .bdrv_abort_perm_update = raw_abort_perm_update,
+        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+        .bdrv_probe_geometry = hdev_probe_geometry,
+        .bdrv_co_ioctl = hdev_co_ioctl,
+
+        /* zone management operations */
+        .bdrv_co_zone_report = raw_co_zone_report,
+        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+};
+
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                  Error **errp)
@@ -3964,6 +4201,7 @@ static void bdrv_file_init(void)
 #if defined(HAVE_HOST_BLOCK_DEVICE)
     bdrv_register(&bdrv_host_device);
 #ifdef __linux__
+    bdrv_register(&bdrv_zoned_host_device);
     bdrv_register(&bdrv_host_cdrom);
 #endif
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..78cddeeda5 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -23,7 +23,6 @@
  */
 #ifndef BLOCK_COMMON_H
 #define BLOCK_COMMON_H
-
 #include "block/aio.h"
 #include "block/aio-wait.h"
 #include "qemu/iov.h"
@@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
 
+typedef enum zone_op {
+    zone_open,
+    zone_close,
+    zone_finish,
+    zone_reset,
+} zone_op;
+
+typedef enum zone_model {
+    BLK_Z_HM,
+    BLK_Z_HA,
+} zone_model;
+
+typedef enum BlkZoneCondition {
+    BLK_ZS_NOT_WP = 0x0,
+    BLK_ZS_EMPTY = 0x1,
+    BLK_ZS_IOPEN = 0x2,
+    BLK_ZS_EOPEN = 0x3,
+    BLK_ZS_CLOSED = 0x4,
+    BLK_ZS_RDONLY = 0xD,
+    BLK_ZS_FULL = 0xE,
+    BLK_ZS_OFFLINE = 0xF,
+} BlkZoneCondition;
+
+typedef enum BlkZoneType {
+    BLK_ZT_CONV = 0x1,
+    BLK_ZT_SWR = 0x2,
+    BLK_ZT_SWP = 0x3,
+} BlkZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provide information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+    uint64_t start;
+    uint64_t length;
+    uint64_t cap;
+    uint64_t wp;
+    BlkZoneType type;
+    BlkZoneCondition cond;
+} BlockZoneDescriptor;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..b9ea9db6dc 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
     struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;
 
+/**
+ * Zone device information data structure.
+ * Provide information on a device.
+ */
+typedef struct zbd_dev {
+    uint32_t zone_size;
+    zone_model model;
+    uint32_t block_size;
+    uint32_t write_granularity;
+    uint32_t nr_zones;
+    struct BlockZoneDescriptor *zones; /* array of zones */
+    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
+    uint32_t max_nr_active_zones;
+} zbd_dev;
 
 struct BlockDriver {
     /*
@@ -691,6 +705,12 @@ struct BlockDriver {
                                           QEMUIOVector *qiov,
                                           int64_t pos);
 
+    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+            int64_t offset, int64_t len, int64_t *nr_zones,
+            BlockZoneDescriptor *zones);
+    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
+            int64_t offset, int64_t len);
+
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
     void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC v3 2/5] qemu-io: add zoned block device operations.
  2022-06-27  0:19 [RFC v3 0/5] *** Add support for zoned device *** Sam Li
  2022-06-27  0:19 ` [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
@ 2022-06-27  0:19 ` Sam Li
  2022-06-27  7:30   ` Hannes Reinecke
  2022-06-28  7:56   ` Stefan Hajnoczi
  2022-06-27  0:19 ` [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information Sam Li
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  0:19 UTC (permalink / raw)
  To: qemu-devel
  Cc: hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block, Sam Li

---
 block/io.c               |  21 +++++++
 include/block/block-io.h |  13 +++++
 qemu-io-cmds.c           | 121 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+)

diff --git a/block/io.c b/block/io.c
index 789e6373d5..656a1b7271 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3258,6 +3258,27 @@ out:
     return co.ret;
 }
 
+int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                        int64_t len, int64_t *nr_zones,
+                        BlockZoneDescriptor *zones)
+{
+    if (!bs->drv->bdrv_co_zone_report) {
+        return -ENOTSUP;
+    }
+
+    return bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones);
+}
+
+int bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
+        int64_t offset, int64_t len)
+{
+    if (!bs->drv->bdrv_co_zone_mgmt) {
+        return -ENOTSUP;
+    }
+
+    return bs->drv->bdrv_co_zone_mgmt(bs, op, offset, len);
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
     IO_CODE();
diff --git a/include/block/block-io.h b/include/block/block-io.h
index 62c84f0519..c85c174579 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -80,6 +80,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
 /* Ensure contents are flushed to disk.  */
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
 
+/* Report zone information of zone block device. */
+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                     int64_t len, int64_t *nr_zones,
+                                     BlockZoneDescriptor *zones);
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, zone_op op,
+        int64_t offset, int64_t len);
+
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
 int bdrv_block_status(BlockDriverState *bs, int64_t offset,
@@ -290,6 +297,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 int generated_co_wrapper
 bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
+int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
+                                         int64_t len, int64_t *nr_zones,
+                                         BlockZoneDescriptor *zones);
+int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len);
+
 /**
  * bdrv_parent_drained_begin_single:
  *
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 2f0d8ac25a..3f2592b9f5 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1706,6 +1706,122 @@ static const cmdinfo_t flush_cmd = {
     .oneline    = "flush all in-core file state to disk",
 };
 
+static int zone_report_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int64_t offset, len, nr_zones;
+    int i = 0;
+
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    ++optind;
+    nr_zones = cvtnum(argv[optind]);
+
+    g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
+    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
+    while (i < nr_zones) {
+        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "
+                        "zcond:%u, [type: %u]\n",
+                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
+                zones[i].cond, zones[i].type);
+        ++i;
+    }
+    return ret;
+}
+
+static const cmdinfo_t zone_report_cmd = {
+        .name = "zone_report",
+        .altname = "f",
+        .cfunc = zone_report_f,
+        .argmin = 3,
+        .argmax = 3,
+        .args = "offset [offset..] len [len..] number [num..]",
+        .oneline = "report a number of zones",
+};
+
+static int zone_open_f(BlockBackend *blk, int argc, char **argv)
+{
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    return blk_zone_mgmt(blk, zone_open, offset, len);
+}
+
+static const cmdinfo_t zone_open_cmd = {
+        .name = "zone_open",
+        .altname = "f",
+        .cfunc = zone_open_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "explicit open a range of zones in zone block device",
+};
+
+static int zone_close_f(BlockBackend *blk, int argc, char **argv)
+{
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    return blk_zone_mgmt(blk, zone_close, offset, len);
+}
+
+static const cmdinfo_t zone_close_cmd = {
+        .name = "zone_close",
+        .altname = "f",
+        .cfunc = zone_close_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "close a range of zones in zone block device",
+};
+
+static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
+{
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    return blk_zone_mgmt(blk, zone_finish, offset, len);
+}
+
+static const cmdinfo_t zone_finish_cmd = {
+        .name = "zone_finish",
+        .altname = "f",
+        .cfunc = zone_finish_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "finish a range of zones in zone block device",
+};
+
+static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
+{
+    int64_t offset, len;
+    ++optind;
+    offset = cvtnum(argv[optind]);
+    ++optind;
+    len = cvtnum(argv[optind]);
+    return blk_zone_mgmt(blk, zone_reset, offset, len);
+}
+
+static const cmdinfo_t zone_reset_cmd = {
+        .name = "zone_reset",
+        .altname = "f",
+        .cfunc = zone_reset_f,
+        .argmin = 2,
+        .argmax = 2,
+        .args = "offset [offset..] len [len..]",
+        .oneline = "reset a zone write pointer in zone block device",
+};
+
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
     .name       = "truncate",
@@ -2498,6 +2614,11 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&aio_write_cmd);
     qemuio_add_command(&aio_flush_cmd);
     qemuio_add_command(&flush_cmd);
+    qemuio_add_command(&zone_report_cmd);
+    qemuio_add_command(&zone_open_cmd);
+    qemuio_add_command(&zone_close_cmd);
+    qemuio_add_command(&zone_finish_cmd);
+    qemuio_add_command(&zone_reset_cmd);
     qemuio_add_command(&truncate_cmd);
     qemuio_add_command(&length_cmd);
     qemuio_add_command(&info_cmd);
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information.
  2022-06-27  0:19 [RFC v3 0/5] *** Add support for zoned device *** Sam Li
  2022-06-27  0:19 ` [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
  2022-06-27  0:19 ` [RFC v3 2/5] qemu-io: add zoned block device operations Sam Li
@ 2022-06-27  0:19 ` Sam Li
  2022-06-27  7:31   ` Hannes Reinecke
  2022-06-28  8:09   ` Stefan Hajnoczi
  2022-06-27  0:19 ` [RFC v3 4/5] file-posix: introduce get_sysfs_str_val for device zoned model Sam Li
  2022-06-27  0:19 ` [RFC v3 5/5] qemu-iotests: add zone operation tests Sam Li
  4 siblings, 2 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  0:19 UTC (permalink / raw)
  To: qemu-devel
  Cc: hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block, Sam Li

Use sysfs attribute files to get the zoned device information in case
that ioctl() commands of zone management interface won't work. It can
return long type of value like chunk_sectors, zoned_append_max_bytes,
max_open_zones, max_active_zones.
---
 block/file-posix.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 1b8b0d351f..73c2cdfbca 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1216,15 +1216,19 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
 #endif
 }
 
-static int hdev_get_max_segments(int fd, struct stat *st)
-{
+/*
+ * Get zoned device information (chunk_sectors, zoned_append_max_bytes,
+ * max_open_zones, max_active_zones) through sysfs attribute files.
+ */
+static long get_sysfs_long_val(int fd, struct stat *st,
+                               const char *attribute) {
 #ifdef CONFIG_LINUX
     char buf[32];
     const char *end;
     char *sysfspath = NULL;
     int ret;
     int sysfd = -1;
-    long max_segments;
+    long val;
 
     if (S_ISCHR(st->st_mode)) {
         if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
@@ -1237,8 +1241,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
         return -ENOTSUP;
     }
 
-    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st->st_rdev), minor(st->st_rdev));
+    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+                                major(st->st_rdev), minor(st->st_rdev),
+                                attribute);
     sysfd = open(sysfspath, O_RDONLY);
     if (sysfd == -1) {
         ret = -errno;
@@ -1256,9 +1261,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
     }
     buf[ret] = 0;
     /* The file is ended with '\n', pass 'end' to accept that. */
-    ret = qemu_strtol(buf, &end, 10, &max_segments);
+    ret = qemu_strtol(buf, &end, 10, &val);
     if (ret == 0 && end && *end == '\n') {
-        ret = max_segments;
+        ret = val;
     }
 
 out:
@@ -1272,6 +1277,15 @@ out:
 #endif
 }
 
+static int hdev_get_max_segments(int fd, struct stat *st) {
+    int ret;
+    ret = get_sysfs_long_val(fd, st, "max_segments");
+    if (ret < 0) {
+        return -1;
+    }
+    return ret;
+}
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -1872,6 +1886,7 @@ static int handle_aiocb_zone_report(void *opaque) {
 
 static int handle_aiocb_zone_mgmt(void *opaque) {
     RawPosixAIOData *aiocb = opaque;
+    BlockDriverState *s = aiocb->bs;
     int fd = aiocb->aio_fildes;
     int64_t offset = aiocb->aio_offset;
     int64_t len = aiocb->aio_nbytes;
@@ -1884,11 +1899,9 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
     int64_t zone_size_mask;
     int ret;
 
-    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
-    if (ret) {
-        return -1;
-    }
-
+    g_autofree struct stat *file = g_new(struct stat, 1);
+    stat(s->filename, file);
+    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
     zone_size_mask = zone_size - 1;
     if (offset & zone_size_mask) {
         error_report("offset is not the start of a zone");
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC v3 4/5] file-posix: introduce get_sysfs_str_val for device zoned model.
  2022-06-27  0:19 [RFC v3 0/5] *** Add support for zoned device *** Sam Li
                   ` (2 preceding siblings ...)
  2022-06-27  0:19 ` [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information Sam Li
@ 2022-06-27  0:19 ` Sam Li
  2022-06-27  7:41   ` Hannes Reinecke
  2022-06-27  0:19 ` [RFC v3 5/5] qemu-iotests: add zone operation tests Sam Li
  4 siblings, 1 reply; 32+ messages in thread
From: Sam Li @ 2022-06-27  0:19 UTC (permalink / raw)
  To: qemu-devel
  Cc: hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block, Sam Li

---
 block/file-posix.c           | 60 ++++++++++++++++++++++++++++++++++++
 include/block/block-common.h |  4 +--
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 73c2cdfbca..74c0245e0f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1277,6 +1277,66 @@ out:
 #endif
 }
 
+/*
+ * Convert the zoned attribute file in sysfs to internal value.
+ */
+static zone_model get_sysfs_str_val(int fd, struct stat *st) {
+#ifdef CONFIG_LINUX
+    char buf[32];
+    char *sysfspath = NULL;
+    int ret;
+    int sysfd = -1;
+
+    if (S_ISCHR(st->st_mode)) {
+        if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
+            return ret;
+        }
+        return -ENOTSUP;
+    }
+
+    if (!S_ISBLK(st->st_mode)) {
+        return -ENOTSUP;
+    }
+
+    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/zoned",
+                                major(st->st_rdev), minor(st->st_rdev));
+    sysfd = open(sysfspath, O_RDONLY);
+    if (sysfd == -1) {
+        ret = -errno;
+        goto out;
+    }
+    do {
+        ret = read(sysfd, buf, sizeof(buf) - 1);
+    } while (ret == -1 && errno == EINTR);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    } else if (ret == 0) {
+        ret = -EIO;
+        goto out;
+    }
+    buf[ret] = 0;
+
+    /* The file is ended with '\n' */
+    if (strcmp(buf, "host-managed\n") == 0) {
+        return BLK_Z_HM;
+    } else if (strcmp(buf, "host-aware\n") == 0) {
+        return BLK_Z_HA;
+    } else {
+        return -ENOTSUP;
+    }
+
+out:
+    if (sysfd != -1) {
+        close(sysfd);
+    }
+    g_free(sysfspath);
+    return ret;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static int hdev_get_max_segments(int fd, struct stat *st) {
     int ret;
     ret = get_sysfs_long_val(fd, st, "max_segments");
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 78cddeeda5..35e00afe8e 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -56,8 +56,8 @@ typedef enum zone_op {
 } zone_op;
 
 typedef enum zone_model {
-    BLK_Z_HM,
-    BLK_Z_HA,
+    BLK_Z_HM = 0x1,
+    BLK_Z_HA = 0x2,
 } zone_model;
 
 typedef enum BlkZoneCondition {
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [RFC v3 5/5] qemu-iotests: add zone operation tests.
  2022-06-27  0:19 [RFC v3 0/5] *** Add support for zoned device *** Sam Li
                   ` (3 preceding siblings ...)
  2022-06-27  0:19 ` [RFC v3 4/5] file-posix: introduce get_sysfs_str_val for device zoned model Sam Li
@ 2022-06-27  0:19 ` Sam Li
  2022-06-27  7:42   ` Hannes Reinecke
  2022-06-28  8:19   ` Stefan Hajnoczi
  4 siblings, 2 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  0:19 UTC (permalink / raw)
  To: qemu-devel
  Cc: hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block, Sam Li

---
 tests/qemu-iotests/tests/zoned.sh | 49 +++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
new file mode 100755
index 0000000000..262c0b5427
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+#
+# Test zone management operations.
+#
+
+QEMU_IO="build/qemu-io"
+IMG="--image-opts driver=zoned_host_device,filename=/dev/nullb0"
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+echo "Testing a null_blk device"
+echo "Simple cases: if the operations work"
+sudo modprobe null_blk nr_devices=1 zoned=1
+# hidden issues:
+# 1. memory allocation error of "unaligned tcache chunk detected" when the nr_zone=1 in zone report
+# 2. qemu-io: after report 10 zones, the program failed at double free error and exited.
+echo "report the first zone"
+sudo $QEMU_IO $IMG -c "zone_report 0 0 1"
+echo "report: the first 10 zones"
+sudo $QEMU_IO $IMG -c "zone_report 0 0 10"
+
+echo "open the first zone"
+sudo $QEMU_IO $IMG -c "zone_open 0 0x80000"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zone_report 0 0 1"
+echo "open the last zone"
+sudo $QEMU_IO $IMG -c "zone_open 0x3e70000000 0x80000"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zone_report 0x3e70000000 0 2"
+
+echo "close the first zone"
+sudo $QEMU_IO $IMG -c "zone_close 0 0x80000"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zone_report 0 0 1"
+echo "close the last zone"
+sudo $QEMU_IO $IMG -c "zone_close 0x3e70000000 0x80000"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zone_report 0x3e70000000 0 2"
+
+
+echo "reset the second zone"
+sudo $QEMU_IO $IMG -c "zone_reset 0x80000 0x80000"
+echo "After resetting a zone:"
+sudo $QEMU_IO $IMG -c "zone_report 0x80000 0 5"
+
+# success, all done
+sudo rmmod null_blk
+echo "*** done"
+#rm -f $seq.full
+status=0
-- 
2.35.3



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-27  0:19 ` [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
@ 2022-06-27  7:21   ` Hannes Reinecke
  2022-06-27  7:45     ` Sam Li
  2022-06-27 14:15   ` Stefan Hajnoczi
  1 sibling, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-06-27  7:21 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block

On 6/27/22 02:19, Sam Li wrote:
> By adding zone management operations in BlockDriver, storage
> controller emulation can use the new block layer APIs including
> zone_report and zone_mgmt(open, close, finish, reset).
> ---
>   block/block-backend.c            |  56 ++++++++
>   block/coroutines.h               |   5 +
>   block/file-posix.c               | 238 +++++++++++++++++++++++++++++++
>   include/block/block-common.h     |  43 +++++-
>   include/block/block_int-common.h |  20 +++
>   5 files changed, 361 insertions(+), 1 deletion(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index e0e1aff4b1..786f964d02 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>       return ret;
>   }
>   
> +/*
> + * Return zone_report from BlockDriver. Offset can be any number within
> + * the zone size. No alignment for offset and len.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                       int64_t len, int64_t *nr_zones,
> +                       BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    BlockDriverState *bs;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    bs = blk_bs(blk);
> +
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    bdrv_inc_in_flight(bs);
> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> +                              nr_zones, zones);
> +    bdrv_dec_in_flight(bs);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Return zone_mgmt from BlockDriver.
> + * Offset is the start of a zone and len is aligned to zones.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len)
> +{
> +    int ret;
> +    BlockDriverState *bs;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    bs = blk_bs(blk);
> +
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    bdrv_inc_in_flight(bs);
> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> +    bdrv_dec_in_flight(bs);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>   void blk_drain(BlockBackend *blk)
>   {
>       BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/coroutines.h b/block/coroutines.h
> index 830ecaa733..a114d7bc30 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -80,6 +80,11 @@ int coroutine_fn
>   blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>   
>   int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t len, int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len);
>   
>   
>   /*
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..1b8b0d351f 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,7 @@
>   #include <sys/param.h>
>   #include <sys/syscall.h>
>   #include <sys/vfs.h>
> +#include <linux/blkzoned.h>
>   #include <linux/cdrom.h>
>   #include <linux/fd.h>
>   #include <linux/fs.h>
> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>               PreallocMode prealloc;
>               Error **errp;
>           } truncate;
> +        struct {
> +            int64_t *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        zone_op op;
>       };
>   } RawPosixAIOData;
>   
> @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
>   }
>   #endif
>   
> +/*
> + * parse_zone - Fill a zone descriptor
> + */
> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> +                              struct blk_zone *blkz) {
> +    zone->start = blkz->start;
> +    zone->length = blkz->len;
> +    zone->cap = blkz->capacity;
> +    zone->wp = blkz->wp - blkz->start;
> +    zone->type = blkz->type;
> +    zone->cond = blkz->cond;
> +}
> +
> +static int handle_aiocb_zone_report(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +
> +    struct blk_zone *blkz;
> +    int64_t rep_size, nrz;
> +    int ret, n = 0, i = 0;
> +
> +    nrz = *nr_zones;
> +    if (len == -1) {
> +        return -errno;
> +    }
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> +    printf("start to report zone with offset: 0x%lx\n", offset);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = offset;
> +        rep->nr_zones = nrz;
> +
> +        ret = ioctl(fd, BLKREPORTZONE, rep);
> +        if (ret != 0) {
> +            ret = -errno;
> +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> +                         fd, offset, errno);
> +            return ret;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            parse_zone(&zones[n], &blkz[i]);
> +            /* The next report should start after the last zone reported */
> +            offset = blkz[i].start + blkz[i].len;
> +        }

Where do you increase 'n' such that the loop can make forward progress?
Wouldn't it be better to use a for() loop here?

> +    }
> +
> +    *nr_zones = n;
> +    return 0;
> +}
> +
> +static int handle_aiocb_zone_mgmt(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +    zone_op op = aiocb->op;
> +
> +    struct blk_zone_range range;
> +    const char *ioctl_name;
> +    unsigned long ioctl_op;
> +    int64_t zone_size;
> +    int64_t zone_size_mask;
> +    int ret;
> +

Shouldn't we add a check here if 'fd' points to a zoned device?
ioctl errors are not _that_ helpful here, as you might get a variety
of errors and it's not quite obvious which of those errors indicate
an unsupported feature.

> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> +    if (ret) {
> +        return -1;
> +    }
> +
> +    zone_size_mask = zone_size - 1;
> +    if (offset & zone_size_mask) {
> +        error_report("offset is not the start of a zone");
> +        return -1;
> +    }
> +
> +    if (len & zone_size_mask) {
> +        error_report("len is not aligned to zones");
> +        return -1;
> +    }
> +
> +    switch (op) {
> +    case zone_open:
> +        ioctl_name = "BLKOPENZONE";
> +        ioctl_op = BLKOPENZONE;
> +        break;
> +    case zone_close:
> +        ioctl_name = "BLKCLOSEZONE";
> +        ioctl_op = BLKCLOSEZONE;
> +        break;
> +    case zone_finish:
> +        ioctl_name = "BLKFINISHZONE";
> +        ioctl_op = BLKFINISHZONE;
> +        break;
> +    case zone_reset:
> +        ioctl_name = "BLKRESETZONE";
> +        ioctl_op = BLKRESETZONE;
> +        break;
> +    default:
> +        error_report("Invalid zone operation 0x%x", op);
> +        errno = -EINVAL;
> +        return -1;
> +    }
> +
> +    /* Execute the operation */
> +    range.sector = offset;
> +    range.nr_sectors = len;
> +    ret = ioctl(fd, ioctl_op, &range);
> +    if (ret != 0) {
> +        error_report("ioctl %s failed %d",
> +                     ioctl_name, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>   static int handle_aiocb_copy_range(void *opaque)
>   {
>       RawPosixAIOData *aiocb = opaque;
> @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>       }
>   }
>   
> +/*
> + * zone report - Get a zone block device's information in the form
> + * of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: an array of zone descriptors to hold zone
> + * information on reply
> + * @param offset: offset can be any byte within the zone size.
> + * @param len: (not sure yet.
> + * @return 0 on success, -1 on failure
> + */
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +        int64_t len, int64_t *nr_zones,
> +        BlockZoneDescriptor *zones) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_IOCTL,
> +        .aio_offset = offset,
> +        .aio_nbytes = len,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +                },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> +}
> +
> +/*
> + * zone management operations - Execute an operation on a zone
> + */
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> +        int64_t offset, int64_t len) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs             = bs,
> +        .aio_fildes     = s->fd,
> +        .aio_type       = QEMU_AIO_IOCTL,
> +        .aio_offset     = offset,
> +        .aio_nbytes     = len,
> +        .op             = op,
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> +}
> +
>   static coroutine_fn int
>   raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                   bool blkdev)
> @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = {
>       .bdrv_abort_perm_update = raw_abort_perm_update,
>       .create_opts = &raw_create_opts,
>       .mutable_opts = mutable_opts,
> +
> +    .bdrv_co_zone_report = raw_co_zone_report,
> +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>   };
>   
>   /***********************************************/
> @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = {
>   #endif
>   };
>   
> +static BlockDriver bdrv_zoned_host_device = {
> +        .format_name = "zoned_host_device",
> +        .protocol_name = "zoned_host_device",
> +        .instance_size = sizeof(BDRVRawState),
> +        .bdrv_needs_filename = true,
> +        .bdrv_probe_device  = hdev_probe_device,
> +        .bdrv_parse_filename = hdev_parse_filename,
> +        .bdrv_file_open     = hdev_open,
> +        .bdrv_close         = raw_close,
> +        .bdrv_reopen_prepare = raw_reopen_prepare,
> +        .bdrv_reopen_commit  = raw_reopen_commit,
> +        .bdrv_reopen_abort   = raw_reopen_abort,
> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> +        .create_opts         = &bdrv_create_opts_simple,
> +        .mutable_opts        = mutable_opts,
> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> +
> +        .bdrv_co_preadv         = raw_co_preadv,
> +        .bdrv_co_pwritev        = raw_co_pwritev,
> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> +        .bdrv_refresh_limits = raw_refresh_limits,
> +        .bdrv_io_plug = raw_aio_plug,
> +        .bdrv_io_unplug = raw_aio_unplug,
> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> +
> +        .bdrv_co_truncate       = raw_co_truncate,
> +        .bdrv_getlength = raw_getlength,
> +        .bdrv_get_info = raw_get_info,
> +        .bdrv_get_allocated_file_size
> +                            = raw_get_allocated_file_size,
> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> +        .bdrv_check_perm = raw_check_perm,
> +        .bdrv_set_perm   = raw_set_perm,
> +        .bdrv_abort_perm_update = raw_abort_perm_update,
> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> +        .bdrv_probe_geometry = hdev_probe_geometry,
> +        .bdrv_co_ioctl = hdev_co_ioctl,
> +
> +        /* zone management operations */
> +        .bdrv_co_zone_report = raw_co_zone_report,
> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +};
> +
>   #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>   static void cdrom_parse_filename(const char *filename, QDict *options,
>                                    Error **errp)
> @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void)
>   #if defined(HAVE_HOST_BLOCK_DEVICE)
>       bdrv_register(&bdrv_host_device);
>   #ifdef __linux__
> +    bdrv_register(&bdrv_zoned_host_device);
>       bdrv_register(&bdrv_host_cdrom);
>   #endif
>   #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..78cddeeda5 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -23,7 +23,6 @@
>    */
>   #ifndef BLOCK_COMMON_H
>   #define BLOCK_COMMON_H
> -
>   #include "block/aio.h"
>   #include "block/aio-wait.h"
>   #include "qemu/iov.h"
> @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
>   typedef struct BdrvChild BdrvChild;
>   typedef struct BdrvChildClass BdrvChildClass;
>   
> +typedef enum zone_op {
> +    zone_open,
> +    zone_close,
> +    zone_finish,
> +    zone_reset,
> +} zone_op;
> +
> +typedef enum zone_model {
> +    BLK_Z_HM,
> +    BLK_Z_HA,
> +} zone_model;
> +
> +typedef enum BlkZoneCondition {
> +    BLK_ZS_NOT_WP = 0x0,
> +    BLK_ZS_EMPTY = 0x1,
> +    BLK_ZS_IOPEN = 0x2,
> +    BLK_ZS_EOPEN = 0x3,
> +    BLK_ZS_CLOSED = 0x4,
> +    BLK_ZS_RDONLY = 0xD,
> +    BLK_ZS_FULL = 0xE,
> +    BLK_ZS_OFFLINE = 0xF,
> +} BlkZoneCondition;
> +
> +typedef enum BlkZoneType {
> +    BLK_ZT_CONV = 0x1,
> +    BLK_ZT_SWR = 0x2,
> +    BLK_ZT_SWP = 0x3,
> +} BlkZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provide information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    BlkZoneType type;
> +    BlkZoneCondition cond;
> +} BlockZoneDescriptor;
> +
>   typedef struct BlockDriverInfo {
>       /* in bytes, 0 if irrelevant */
>       int cluster_size;
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 8947abab76..b9ea9db6dc 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
>       struct BdrvTrackedRequest *waiting_for;
>   } BdrvTrackedRequest;
>   
> +/**
> + * Zone device information data structure.
> + * Provide information on a device.
> + */
> +typedef struct zbd_dev {
> +    uint32_t zone_size;
> +    zone_model model;
> +    uint32_t block_size;
> +    uint32_t write_granularity;
> +    uint32_t nr_zones;
> +    struct BlockZoneDescriptor *zones; /* array of zones */
> +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> +    uint32_t max_nr_active_zones;
> +} zbd_dev;
>   
>   struct BlockDriver {
>       /*
> @@ -691,6 +705,12 @@ struct BlockDriver {
>                                             QEMUIOVector *qiov,
>                                             int64_t pos);
>   
> +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> +            int64_t offset, int64_t len, int64_t *nr_zones,
> +            BlockZoneDescriptor *zones);
> +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> +            int64_t offset, int64_t len);
> +
>       /* removable device specific */
>       bool (*bdrv_is_inserted)(BlockDriverState *bs);
>       void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);

Other than that: Well done!

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 2/5] qemu-io: add zoned block device operations.
  2022-06-27  0:19 ` [RFC v3 2/5] qemu-io: add zoned block device operations Sam Li
@ 2022-06-27  7:30   ` Hannes Reinecke
  2022-06-27  8:31     ` Sam Li
  2022-06-28  7:56   ` Stefan Hajnoczi
  1 sibling, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-06-27  7:30 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block

On 6/27/22 02:19, Sam Li wrote:
> ---

Good coding style would advise to add some text here what the patch does.

>   block/io.c               |  21 +++++++
>   include/block/block-io.h |  13 +++++
>   qemu-io-cmds.c           | 121 +++++++++++++++++++++++++++++++++++++++
>   3 files changed, 155 insertions(+)
> 
> diff --git a/block/io.c b/block/io.c
> index 789e6373d5..656a1b7271 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3258,6 +3258,27 @@ out:
>       return co.ret;
>   }
>   
> +int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                        int64_t len, int64_t *nr_zones,
> +                        BlockZoneDescriptor *zones)
> +{
> +    if (!bs->drv->bdrv_co_zone_report) {
> +        return -ENOTSUP;

ENOTSUP or EOPNOTSUP?
Kevin?

> +    }
> +
> +    return bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones);
> +}
> +
> +int bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> +        int64_t offset, int64_t len)
> +{
> +    if (!bs->drv->bdrv_co_zone_mgmt) {
> +        return -ENOTSUP;
> +    }
> +
> +    return bs->drv->bdrv_co_zone_mgmt(bs, op, offset, len);
> +}
> +
>   void *qemu_blockalign(BlockDriverState *bs, size_t size)
>   {
>       IO_CODE();
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index 62c84f0519..c85c174579 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -80,6 +80,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
>   /* Ensure contents are flushed to disk.  */
>   int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
>   
> +/* Report zone information of zone block device. */
> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                                     int64_t len, int64_t *nr_zones,
> +                                     BlockZoneDescriptor *zones);
> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> +        int64_t offset, int64_t len);
> +

There's the thing with the intendation ... please make it consistent, 
and ideally follow with whatever the remaining prototypes do.

>   int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>   bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>   int bdrv_block_status(BlockDriverState *bs, int64_t offset,
> @@ -290,6 +297,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>   int generated_co_wrapper
>   bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>   
> +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
> +                                         int64_t len, int64_t *nr_zones,
> +                                         BlockZoneDescriptor *zones);
> +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len);
> +

Again here.

>   /**
>    * bdrv_parent_drained_begin_single:
>    *
> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> index 2f0d8ac25a..3f2592b9f5 100644
> --- a/qemu-io-cmds.c
> +++ b/qemu-io-cmds.c
> @@ -1706,6 +1706,122 @@ static const cmdinfo_t flush_cmd = {
>       .oneline    = "flush all in-core file state to disk",
>   };
>   
> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int ret;
> +    int64_t offset, len, nr_zones;
> +    int i = 0;
> +
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);
> +    ++optind;
> +    nr_zones = cvtnum(argv[optind]);
> +
And 'optind' is set where?
Plus do check for 'argv' overflow; before increasing 'optind' and using 
'argv[optind]' you have to validate that 'argv[optind]' is a valid pointer.

> +    g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
> +    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
> +    while (i < nr_zones) {
> +        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "
> +                        "zcond:%u, [type: %u]\n",
> +                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
> +                zones[i].cond, zones[i].type);
> +        ++i;
As 'i' is a simple iterator maybe use a 'for' loop here.
But that really is a matter of preference :-)

> +    }
> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_report_cmd = {
> +        .name = "zone_report",
> +        .altname = "f",

altname 'f'?
Is that correct?

> +        .cfunc = zone_report_f,
> +        .argmin = 3,
> +        .argmax = 3,
> +        .args = "offset [offset..] len [len..] number [num..]",
> +        .oneline = "report a number of zones",
> +};
> +
> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);

Same here: please check for 'argv' overflow.

> +    return blk_zone_mgmt(blk, zone_open, offset, len);
> +}
> +
> +static const cmdinfo_t zone_open_cmd = {
> +        .name = "zone_open",
> +        .altname = "f",

Same here; shouldn't 'altname' be different for each function?
'zo', maybe?

> +        .cfunc = zone_open_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset [offset..] len [len..]",
> +        .oneline = "explicit open a range of zones in zone block device",
> +};
> +
> +static int zone_close_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);

argv checking.

> +    return blk_zone_mgmt(blk, zone_close, offset, len);
> +}
> +
> +static const cmdinfo_t zone_close_cmd = {
> +        .name = "zone_close",
> +        .altname = "f",

altname 'zc'

> +        .cfunc = zone_close_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset [offset..] len [len..]",
> +        .oneline = "close a range of zones in zone block device",
> +};
> +
> +static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);

Argv checking.

> +    return blk_zone_mgmt(blk, zone_finish, offset, len);
> +}
> +
> +static const cmdinfo_t zone_finish_cmd = {
> +        .name = "zone_finish",
> +        .altname = "f",

altname 'zf'

> +        .cfunc = zone_finish_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset [offset..] len [len..]",
> +        .oneline = "finish a range of zones in zone block device",
> +};
> +
> +static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);

Argv checking.

> +    return blk_zone_mgmt(blk, zone_reset, offset, len);
> +}
> +
> +static const cmdinfo_t zone_reset_cmd = {
> +        .name = "zone_reset",
> +        .altname = "f",

altname 'zf'

> +        .cfunc = zone_reset_f, > +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset [offset..] len [len..]",
> +        .oneline = "reset a zone write pointer in zone block device",
> +};
> +
> +
>   static int truncate_f(BlockBackend *blk, int argc, char **argv);
>   static const cmdinfo_t truncate_cmd = {
>       .name       = "truncate",
> @@ -2498,6 +2614,11 @@ static void __attribute((constructor)) init_qemuio_commands(void)
>       qemuio_add_command(&aio_write_cmd);
>       qemuio_add_command(&aio_flush_cmd);
>       qemuio_add_command(&flush_cmd);
> +    qemuio_add_command(&zone_report_cmd);
> +    qemuio_add_command(&zone_open_cmd);
> +    qemuio_add_command(&zone_close_cmd);
> +    qemuio_add_command(&zone_finish_cmd);
> +    qemuio_add_command(&zone_reset_cmd);
>       qemuio_add_command(&truncate_cmd);
>       qemuio_add_command(&length_cmd);
>       qemuio_add_command(&info_cmd);

Otherwise looks okay.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information.
  2022-06-27  0:19 ` [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information Sam Li
@ 2022-06-27  7:31   ` Hannes Reinecke
  2022-06-27  8:32     ` Sam Li
  2022-06-28  8:09   ` Stefan Hajnoczi
  1 sibling, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-06-27  7:31 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block

On 6/27/22 02:19, Sam Li wrote:
> Use sysfs attribute files to get the zoned device information in case
> that ioctl() commands of zone management interface won't work. It can
> return long type of value like chunk_sectors, zoned_append_max_bytes,
> max_open_zones, max_active_zones.
> ---
>   block/file-posix.c | 37 +++++++++++++++++++++++++------------
>   1 file changed, 25 insertions(+), 12 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 1b8b0d351f..73c2cdfbca 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1216,15 +1216,19 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
>   #endif
>   }
>   
> -static int hdev_get_max_segments(int fd, struct stat *st)
> -{
> +/*
> + * Get zoned device information (chunk_sectors, zoned_append_max_bytes,
> + * max_open_zones, max_active_zones) through sysfs attribute files.
> + */
> +static long get_sysfs_long_val(int fd, struct stat *st,
> +                               const char *attribute) {
>   #ifdef CONFIG_LINUX
>       char buf[32];
>       const char *end;
>       char *sysfspath = NULL;
>       int ret;
>       int sysfd = -1;
> -    long max_segments;
> +    long val;
>   
>       if (S_ISCHR(st->st_mode)) {
>           if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> @@ -1237,8 +1241,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>           return -ENOTSUP;
>       }
>   
> -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -                                major(st->st_rdev), minor(st->st_rdev));
> +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> +                                major(st->st_rdev), minor(st->st_rdev),
> +                                attribute);
>       sysfd = open(sysfspath, O_RDONLY);
>       if (sysfd == -1) {
>           ret = -errno;
> @@ -1256,9 +1261,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>       }
>       buf[ret] = 0;
>       /* The file is ended with '\n', pass 'end' to accept that. */
> -    ret = qemu_strtol(buf, &end, 10, &max_segments);
> +    ret = qemu_strtol(buf, &end, 10, &val);
>       if (ret == 0 && end && *end == '\n') {
> -        ret = max_segments;
> +        ret = val;
>       }
>   
>   out:
> @@ -1272,6 +1277,15 @@ out:
>   #endif
>   }
>   
> +static int hdev_get_max_segments(int fd, struct stat *st) {
> +    int ret;
> +    ret = get_sysfs_long_val(fd, st, "max_segments");
> +    if (ret < 0) {
> +        return -1;
> +    }
> +    return ret;
> +}
> +
>   static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>   {
>       BDRVRawState *s = bs->opaque;
> @@ -1872,6 +1886,7 @@ static int handle_aiocb_zone_report(void *opaque) {
>   
>   static int handle_aiocb_zone_mgmt(void *opaque) {
>       RawPosixAIOData *aiocb = opaque;
> +    BlockDriverState *s = aiocb->bs;
>       int fd = aiocb->aio_fildes;
>       int64_t offset = aiocb->aio_offset;
>       int64_t len = aiocb->aio_nbytes;
> @@ -1884,11 +1899,9 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>       int64_t zone_size_mask;
>       int ret;
>   
> -    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> -    if (ret) {
> -        return -1;
> -    }
> -
> +    g_autofree struct stat *file = g_new(struct stat, 1);
> +    stat(s->filename, file);
> +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
>       zone_size_mask = zone_size - 1;
>       if (offset & zone_size_mask) {
>           error_report("offset is not the start of a zone");

Round of applause.

Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 4/5] file-posix: introduce get_sysfs_str_val for device zoned model.
  2022-06-27  0:19 ` [RFC v3 4/5] file-posix: introduce get_sysfs_str_val for device zoned model Sam Li
@ 2022-06-27  7:41   ` Hannes Reinecke
  2022-06-27  8:44     ` Sam Li
  0 siblings, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-06-27  7:41 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block

On 6/27/22 02:19, Sam Li wrote:
> ---
>   block/file-posix.c           | 60 ++++++++++++++++++++++++++++++++++++
>   include/block/block-common.h |  4 +--
>   2 files changed, 62 insertions(+), 2 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 73c2cdfbca..74c0245e0f 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1277,6 +1277,66 @@ out:
>   #endif
>   }
>   
> +/*
> + * Convert the zoned attribute file in sysfs to internal value.
> + */
> +static zone_model get_sysfs_str_val(int fd, struct stat *st) {
> +#ifdef CONFIG_LINUX
> +    char buf[32];
> +    char *sysfspath = NULL;
> +    int ret;
> +    int sysfd = -1;
> +
> +    if (S_ISCHR(st->st_mode)) {
> +        if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> +            return ret;
> +        }
> +        return -ENOTSUP;
> +    }
> +
> +    if (!S_ISBLK(st->st_mode)) {
> +        return -ENOTSUP;
> +    }
> +
> +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/zoned",
> +                                major(st->st_rdev), minor(st->st_rdev));
> +    sysfd = open(sysfspath, O_RDONLY);
> +    if (sysfd == -1) {
> +        ret = -errno;
> +        goto out;
> +    }
> +    do {
> +        ret = read(sysfd, buf, sizeof(buf) - 1);
> +    } while (ret == -1 && errno == EINTR);

This is wrong.
read() might return a value smaller than the 'len' argument (sizeof(buf) 
-1 in your case). But in that case it's a short read, and one need to 
call 'read()' again to fetch the remaining bytes.

So the correct code would be something like:

offset = 0;
do {
     ret = read(sysfd, buf + offset, sizeof(buf) - 1 + offset);
     if (ret > 0)
         offset += ret;
} while (ret > 0);

Not that you'd actually need it; reads from sysfs are basically never 
interrupted, so you should be able to read from an attribute in one go.
IE alternatively you can drop the 'while' loop and just call read().

> +    if (ret < 0) {
> +        ret = -errno;
> +        goto out;
> +    } else if (ret == 0) {
> +        ret = -EIO;
> +        goto out;
> +    }
> +    buf[ret] = 0;
> +
> +    /* The file is ended with '\n' */

I'd rather check if the string ends with an '\n', and overwrite
it with a '\0'. That way you'd be insulated against any changes
to sysfs.

> +    if (strcmp(buf, "host-managed\n") == 0) {
> +        return BLK_Z_HM;
> +    } else if (strcmp(buf, "host-aware\n") == 0) {
> +        return BLK_Z_HA;
> +    } else {
> +        return -ENOTSUP;
> +    }
> +
> +out:
> +    if (sysfd != -1) {
> +        close(sysfd);
> +    }
> +    g_free(sysfspath);
> +    return ret;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>   static int hdev_get_max_segments(int fd, struct stat *st) {
>       int ret;
>       ret = get_sysfs_long_val(fd, st, "max_segments");

And as you already set a precedent in your previous patch, I'd recommend 
split this in two patches, one introducing a generic function 
'get_sysfs_str_val()' which returns a string and another function
(eg hdev_get_zone_model()) which calls this function to fetch the device 
zoned model.

> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index 78cddeeda5..35e00afe8e 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -56,8 +56,8 @@ typedef enum zone_op {
>   } zone_op;
>   
>   typedef enum zone_model {
> -    BLK_Z_HM,
> -    BLK_Z_HA,
> +    BLK_Z_HM = 0x1,
> +    BLK_Z_HA = 0x2,
>   } zone_model;
>   
>   typedef enum BlkZoneCondition {

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 5/5] qemu-iotests: add zone operation tests.
  2022-06-27  0:19 ` [RFC v3 5/5] qemu-iotests: add zone operation tests Sam Li
@ 2022-06-27  7:42   ` Hannes Reinecke
  2022-06-28  8:19   ` Stefan Hajnoczi
  1 sibling, 0 replies; 32+ messages in thread
From: Hannes Reinecke @ 2022-06-27  7:42 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: Hanna Reitz, dmitry.fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, damien.lemoal, qemu-block

On 6/27/22 02:19, Sam Li wrote:
> ---
>   tests/qemu-iotests/tests/zoned.sh | 49 +++++++++++++++++++++++++++++++
>   1 file changed, 49 insertions(+)
>   create mode 100755 tests/qemu-iotests/tests/zoned.sh
> 
> diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
> new file mode 100755
> index 0000000000..262c0b5427
> --- /dev/null
> +++ b/tests/qemu-iotests/tests/zoned.sh
> @@ -0,0 +1,49 @@
> +#!/usr/bin/env bash
> +#
> +# Test zone management operations.
> +#
> +
> +QEMU_IO="build/qemu-io"
> +IMG="--image-opts driver=zoned_host_device,filename=/dev/nullb0"
> +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
> +
> +echo "Testing a null_blk device"
> +echo "Simple cases: if the operations work"
> +sudo modprobe null_blk nr_devices=1 zoned=1
> +# hidden issues:
> +# 1. memory allocation error of "unaligned tcache chunk detected" when the nr_zone=1 in zone report
> +# 2. qemu-io: after report 10 zones, the program failed at double free error and exited.
> +echo "report the first zone"
> +sudo $QEMU_IO $IMG -c "zone_report 0 0 1"
> +echo "report: the first 10 zones"
> +sudo $QEMU_IO $IMG -c "zone_report 0 0 10"
> +
> +echo "open the first zone"
> +sudo $QEMU_IO $IMG -c "zone_open 0 0x80000"
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zone_report 0 0 1"
> +echo "open the last zone"
> +sudo $QEMU_IO $IMG -c "zone_open 0x3e70000000 0x80000"
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zone_report 0x3e70000000 0 2"
> +
> +echo "close the first zone"
> +sudo $QEMU_IO $IMG -c "zone_close 0 0x80000"
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zone_report 0 0 1"
> +echo "close the last zone"
> +sudo $QEMU_IO $IMG -c "zone_close 0x3e70000000 0x80000"
> +echo "report after:"
> +sudo $QEMU_IO $IMG -c "zone_report 0x3e70000000 0 2"
> +
> +
> +echo "reset the second zone"
> +sudo $QEMU_IO $IMG -c "zone_reset 0x80000 0x80000"
> +echo "After resetting a zone:"
> +sudo $QEMU_IO $IMG -c "zone_report 0x80000 0 5"
> +
> +# success, all done
> +sudo rmmod null_blk
> +echo "*** done"
> +#rm -f $seq.full
> +status=0

Caveat: I'm not that familiar with qemu-iotests.
FWIW:

Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-27  7:21   ` Hannes Reinecke
@ 2022-06-27  7:45     ` Sam Li
  0 siblings, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  7:45 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: qemu-devel, Hanna Reitz, Dmitry Fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, Damien Le Moal, qemu block

Hi Hannes,

Hannes Reinecke <hare@suse.de> 于2022年6月27日周一 15:21写道:

>
> On 6/27/22 02:19, Sam Li wrote:
> > By adding zone management operations in BlockDriver, storage
> > controller emulation can use the new block layer APIs including
> > zone_report and zone_mgmt(open, close, finish, reset).
> > ---
> >   block/block-backend.c            |  56 ++++++++
> >   block/coroutines.h               |   5 +
> >   block/file-posix.c               | 238 +++++++++++++++++++++++++++++++
> >   include/block/block-common.h     |  43 +++++-
> >   include/block/block_int-common.h |  20 +++
> >   5 files changed, 361 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index e0e1aff4b1..786f964d02 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >       return ret;
> >   }
> >
> > +/*
> > + * Return zone_report from BlockDriver. Offset can be any number within
> > + * the zone size. No alignment for offset and len.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                       int64_t len, int64_t *nr_zones,
> > +                       BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    BlockDriverState *bs;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    bs = blk_bs(blk);
> > +
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    bdrv_inc_in_flight(bs);
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> > +                              nr_zones, zones);
> > +    bdrv_dec_in_flight(bs);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Return zone_mgmt from BlockDriver.
> > + * Offset is the start of a zone and len is aligned to zones.
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret;
> > +    BlockDriverState *bs;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    bs = blk_bs(blk);
> > +
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    bdrv_inc_in_flight(bs);
> > +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > +    bdrv_dec_in_flight(bs);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >   void blk_drain(BlockBackend *blk)
> >   {
> >       BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..a114d7bc30 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> >   blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> >   int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t len, int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len);
> >
> >
> >   /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..1b8b0d351f 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,7 @@
> >   #include <sys/param.h>
> >   #include <sys/syscall.h>
> >   #include <sys/vfs.h>
> > +#include <linux/blkzoned.h>
> >   #include <linux/cdrom.h>
> >   #include <linux/fd.h>
> >   #include <linux/fs.h>
> > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >               PreallocMode prealloc;
> >               Error **errp;
> >           } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report;
> > +        zone_op op;
> >       };
> >   } RawPosixAIOData;
> >
> > @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
> >   }
> >   #endif
> >
> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > +                              struct blk_zone *blkz) {
> > +    zone->start = blkz->start;
> > +    zone->length = blkz->len;
> > +    zone->cap = blkz->capacity;
> > +    zone->wp = blkz->wp - blkz->start;
> > +    zone->type = blkz->type;
> > +    zone->cond = blkz->cond;
> > +}
> > +
> > +static int handle_aiocb_zone_report(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    nrz = *nr_zones;
> > +    if (len == -1) {
> > +        return -errno;
> > +    }
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> > +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> > +    printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
> > +
> > +        ret = ioctl(fd, BLKREPORTZONE, rep);
> > +        if (ret != 0) {
> > +            ret = -errno;
> > +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> > +                         fd, offset, errno);
> > +            return ret;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++, n++) {
> > +            parse_zone(&zones[n], &blkz[i]);
> > +            /* The next report should start after the last zone reported */
> > +            offset = blkz[i].start + blkz[i].len;
> > +        }
>
> Where do you increase 'n' such that the loop can make forward progress?
> Wouldn't it be better to use a for() loop here?
>
'n' increases in this for loop as 'i' increases. I think the for()
loop can serve the same purpose with some modifications.

> > +    }
> > +
> > +    *nr_zones = n;
> > +    return 0;
> > +}
> > +
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +    zone_op op = aiocb->op;
> > +
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    unsigned long ioctl_op;
> > +    int64_t zone_size;
> > +    int64_t zone_size_mask;
> > +    int ret;
> > +
>
> Shouldn't we add a check here if 'fd' points to a zoned device?
> ioctl errors are not _that_ helpful here, as you might get a variety
> of errors and it's not quite obvious which of those errors indicate
> an unsupported feature.
>
Yes, I'll add it in the next patch.

> > +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> > +    if (ret) {
> > +        return -1;
> > +    }
> > +
> > +    zone_size_mask = zone_size - 1;
> > +    if (offset & zone_size_mask) {
> > +        error_report("offset is not the start of a zone");
> > +        return -1;
> > +    }
> > +
> > +    if (len & zone_size_mask) {
> > +        error_report("len is not aligned to zones");
> > +        return -1;
> > +    }
> > +
> > +    switch (op) {
> > +    case zone_open:
> > +        ioctl_name = "BLKOPENZONE";
> > +        ioctl_op = BLKOPENZONE;
> > +        break;
> > +    case zone_close:
> > +        ioctl_name = "BLKCLOSEZONE";
> > +        ioctl_op = BLKCLOSEZONE;
> > +        break;
> > +    case zone_finish:
> > +        ioctl_name = "BLKFINISHZONE";
> > +        ioctl_op = BLKFINISHZONE;
> > +        break;
> > +    case zone_reset:
> > +        ioctl_name = "BLKRESETZONE";
> > +        ioctl_op = BLKRESETZONE;
> > +        break;
> > +    default:
> > +        error_report("Invalid zone operation 0x%x", op);
> > +        errno = -EINVAL;
> > +        return -1;
> > +    }
> > +
> > +    /* Execute the operation */
> > +    range.sector = offset;
> > +    range.nr_sectors = len;
> > +    ret = ioctl(fd, ioctl_op, &range);
> > +    if (ret != 0) {
> > +        error_report("ioctl %s failed %d",
> > +                     ioctl_name, errno);
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >   static int handle_aiocb_copy_range(void *opaque)
> >   {
> >       RawPosixAIOData *aiocb = opaque;
> > @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
> >       }
> >   }
> >
> > +/*
> > + * zone report - Get a zone block device's information in the form
> > + * of an array of zone descriptors.
> > + *
> > + * @param bs: passing zone block device file descriptor
> > + * @param zones: an array of zone descriptors to hold zone
> > + * information on reply
> > + * @param offset: offset can be any byte within the zone size.
> > + * @param len: (not sure yet.
> > + * @return 0 on success, -1 on failure
> > + */
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +        int64_t len, int64_t *nr_zones,
> > +        BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs         = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type   = QEMU_AIO_IOCTL,
> > +        .aio_offset = offset,
> > +        .aio_nbytes = len,
> > +        .zone_report    = {
> > +                .nr_zones       = nr_zones,
> > +                .zones          = zones,
> > +                },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> > +}
> > +
> > +/*
> > + * zone management operations - Execute an operation on a zone
> > + */
> > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> > +        int64_t offset, int64_t len) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs             = bs,
> > +        .aio_fildes     = s->fd,
> > +        .aio_type       = QEMU_AIO_IOCTL,
> > +        .aio_offset     = offset,
> > +        .aio_nbytes     = len,
> > +        .op             = op,
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> > +}
> > +
> >   static coroutine_fn int
> >   raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                   bool blkdev)
> > @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = {
> >       .bdrv_abort_perm_update = raw_abort_perm_update,
> >       .create_opts = &raw_create_opts,
> >       .mutable_opts = mutable_opts,
> > +
> > +    .bdrv_co_zone_report = raw_co_zone_report,
> > +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> >   };
> >
> >   /***********************************************/
> > @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = {
> >   #endif
> >   };
> >
> > +static BlockDriver bdrv_zoned_host_device = {
> > +        .format_name = "zoned_host_device",
> > +        .protocol_name = "zoned_host_device",
> > +        .instance_size = sizeof(BDRVRawState),
> > +        .bdrv_needs_filename = true,
> > +        .bdrv_probe_device  = hdev_probe_device,
> > +        .bdrv_parse_filename = hdev_parse_filename,
> > +        .bdrv_file_open     = hdev_open,
> > +        .bdrv_close         = raw_close,
> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > +        .create_opts         = &bdrv_create_opts_simple,
> > +        .mutable_opts        = mutable_opts,
> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > +        .bdrv_co_preadv         = raw_co_preadv,
> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> > +        .bdrv_refresh_limits = raw_refresh_limits,
> > +        .bdrv_io_plug = raw_aio_plug,
> > +        .bdrv_io_unplug = raw_aio_unplug,
> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > +        .bdrv_co_truncate       = raw_co_truncate,
> > +        .bdrv_getlength = raw_getlength,
> > +        .bdrv_get_info = raw_get_info,
> > +        .bdrv_get_allocated_file_size
> > +                            = raw_get_allocated_file_size,
> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> > +        .bdrv_check_perm = raw_check_perm,
> > +        .bdrv_set_perm   = raw_set_perm,
> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > +        /* zone management operations */
> > +        .bdrv_co_zone_report = raw_co_zone_report,
> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
> > +
> >   #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> >   static void cdrom_parse_filename(const char *filename, QDict *options,
> >                                    Error **errp)
> > @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void)
> >   #if defined(HAVE_HOST_BLOCK_DEVICE)
> >       bdrv_register(&bdrv_host_device);
> >   #ifdef __linux__
> > +    bdrv_register(&bdrv_zoned_host_device);
> >       bdrv_register(&bdrv_host_cdrom);
> >   #endif
> >   #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index fdb7306e78..78cddeeda5 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -23,7 +23,6 @@
> >    */
> >   #ifndef BLOCK_COMMON_H
> >   #define BLOCK_COMMON_H
> > -
> >   #include "block/aio.h"
> >   #include "block/aio-wait.h"
> >   #include "qemu/iov.h"
> > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
> >   typedef struct BdrvChild BdrvChild;
> >   typedef struct BdrvChildClass BdrvChildClass;
> >
> > +typedef enum zone_op {
> > +    zone_open,
> > +    zone_close,
> > +    zone_finish,
> > +    zone_reset,
> > +} zone_op;
> > +
> > +typedef enum zone_model {
> > +    BLK_Z_HM,
> > +    BLK_Z_HA,
> > +} zone_model;
> > +
> > +typedef enum BlkZoneCondition {
> > +    BLK_ZS_NOT_WP = 0x0,
> > +    BLK_ZS_EMPTY = 0x1,
> > +    BLK_ZS_IOPEN = 0x2,
> > +    BLK_ZS_EOPEN = 0x3,
> > +    BLK_ZS_CLOSED = 0x4,
> > +    BLK_ZS_RDONLY = 0xD,
> > +    BLK_ZS_FULL = 0xE,
> > +    BLK_ZS_OFFLINE = 0xF,
> > +} BlkZoneCondition;
> > +
> > +typedef enum BlkZoneType {
> > +    BLK_ZT_CONV = 0x1,
> > +    BLK_ZT_SWR = 0x2,
> > +    BLK_ZT_SWP = 0x3,
> > +} BlkZoneType;
> > +
> > +/*
> > + * Zone descriptor data structure.
> > + * Provide information on a zone with all position and size values in bytes.
> > + */
> > +typedef struct BlockZoneDescriptor {
> > +    uint64_t start;
> > +    uint64_t length;
> > +    uint64_t cap;
> > +    uint64_t wp;
> > +    BlkZoneType type;
> > +    BlkZoneCondition cond;
> > +} BlockZoneDescriptor;
> > +
> >   typedef struct BlockDriverInfo {
> >       /* in bytes, 0 if irrelevant */
> >       int cluster_size;
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 8947abab76..b9ea9db6dc 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
> >       struct BdrvTrackedRequest *waiting_for;
> >   } BdrvTrackedRequest;
> >
> > +/**
> > + * Zone device information data structure.
> > + * Provide information on a device.
> > + */
> > +typedef struct zbd_dev {
> > +    uint32_t zone_size;
> > +    zone_model model;
> > +    uint32_t block_size;
> > +    uint32_t write_granularity;
> > +    uint32_t nr_zones;
> > +    struct BlockZoneDescriptor *zones; /* array of zones */
> > +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> > +    uint32_t max_nr_active_zones;
> > +} zbd_dev;
> >
> >   struct BlockDriver {
> >       /*
> > @@ -691,6 +705,12 @@ struct BlockDriver {
> >                                             QEMUIOVector *qiov,
> >                                             int64_t pos);
> >
> > +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> > +            int64_t offset, int64_t len, int64_t *nr_zones,
> > +            BlockZoneDescriptor *zones);
> > +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> > +            int64_t offset, int64_t len);
> > +
> >       /* removable device specific */
> >       bool (*bdrv_is_inserted)(BlockDriverState *bs);
> >       void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>
> Other than that: Well done!
>
Thanks for reviewing!

Have a good day!
Sam
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke                        Kernel Storage Architect
> hare@suse.de                                      +49 911 74053 688
> SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 2/5] qemu-io: add zoned block device operations.
  2022-06-27  7:30   ` Hannes Reinecke
@ 2022-06-27  8:31     ` Sam Li
  0 siblings, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  8:31 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: qemu-devel, Hanna Reitz, Dmitry Fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, Damien Le Moal, qemu block

Hannes Reinecke <hare@suse.de> 于2022年6月27日周一 15:30写道:
>
> On 6/27/22 02:19, Sam Li wrote:
> > ---
>
> Good coding style would advise to add some text here what the patch does.
>
> >   block/io.c               |  21 +++++++
> >   include/block/block-io.h |  13 +++++
> >   qemu-io-cmds.c           | 121 +++++++++++++++++++++++++++++++++++++++
> >   3 files changed, 155 insertions(+)
> >
> > diff --git a/block/io.c b/block/io.c
> > index 789e6373d5..656a1b7271 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -3258,6 +3258,27 @@ out:
> >       return co.ret;
> >   }
> >
> > +int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +                        int64_t len, int64_t *nr_zones,
> > +                        BlockZoneDescriptor *zones)
> > +{
> > +    if (!bs->drv->bdrv_co_zone_report) {
> > +        return -ENOTSUP;
>
> ENOTSUP or EOPNOTSUP?
> Kevin?
>
> > +    }
> > +
> > +    return bs->drv->bdrv_co_zone_report(bs, offset, len, nr_zones, zones);
> > +}
> > +
> > +int bdrv_co_zone_mgmt(BlockDriverState *bs, enum zone_op op,
> > +        int64_t offset, int64_t len)
> > +{
> > +    if (!bs->drv->bdrv_co_zone_mgmt) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    return bs->drv->bdrv_co_zone_mgmt(bs, op, offset, len);
> > +}
> > +
> >   void *qemu_blockalign(BlockDriverState *bs, size_t size)
> >   {
> >       IO_CODE();
> > diff --git a/include/block/block-io.h b/include/block/block-io.h
> > index 62c84f0519..c85c174579 100644
> > --- a/include/block/block-io.h
> > +++ b/include/block/block-io.h
> > @@ -80,6 +80,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
> >   /* Ensure contents are flushed to disk.  */
> >   int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
> >
> > +/* Report zone information of zone block device. */
> > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +                                     int64_t len, int64_t *nr_zones,
> > +                                     BlockZoneDescriptor *zones);
> > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> > +        int64_t offset, int64_t len);
> > +
>
> There's the thing with the intendation ... please make it consistent,
> and ideally follow with whatever the remaining prototypes do.
>
> >   int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
> >   bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> >   int bdrv_block_status(BlockDriverState *bs, int64_t offset,
> > @@ -290,6 +297,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
> >   int generated_co_wrapper
> >   bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
> >
> > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
> > +                                         int64_t len, int64_t *nr_zones,
> > +                                         BlockZoneDescriptor *zones);
> > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len);
> > +
>
> Again here.
>
> >   /**
> >    * bdrv_parent_drained_begin_single:
> >    *
> > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> > index 2f0d8ac25a..3f2592b9f5 100644
> > --- a/qemu-io-cmds.c
> > +++ b/qemu-io-cmds.c
> > @@ -1706,6 +1706,122 @@ static const cmdinfo_t flush_cmd = {
> >       .oneline    = "flush all in-core file state to disk",
> >   };
> >
> > +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
> > +{
> > +    int ret;
> > +    int64_t offset, len, nr_zones;
> > +    int i = 0;
> > +
> > +    ++optind;
> > +    offset = cvtnum(argv[optind]);
> > +    ++optind;
> > +    len = cvtnum(argv[optind]);
> > +    ++optind;
> > +    nr_zones = cvtnum(argv[optind]);
> > +
> And 'optind' is set where?

optind is declared in getopt_core.h.

> Plus do check for 'argv' overflow; before increasing 'optind' and using
> 'argv[optind]' you have to validate that 'argv[optind]' is a valid pointer.
>

Maybe we can check if argc is bigger than what we want?


> > +    g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
> > +    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
> > +    while (i < nr_zones) {
> > +        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "
> > +                        "zcond:%u, [type: %u]\n",
> > +                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
> > +                zones[i].cond, zones[i].type);
> > +        ++i;
> As 'i' is a simple iterator maybe use a 'for' loop here.
> But that really is a matter of preference :-)
>
Ok!

> > +    }
> > +    return ret;
> > +}
> > +
> > +static const cmdinfo_t zone_report_cmd = {
> > +        .name = "zone_report",
> > +        .altname = "f",
>
> altname 'f'?
> Is that correct?
>
No, it's not. I misunderstood altname's meaning. Changed it now.

> > +        .cfunc = zone_report_f,
> > +        .argmin = 3,
> > +        .argmax = 3,
> > +        .args = "offset [offset..] len [len..] number [num..]",
> > +        .oneline = "report a number of zones",
> > +};
> > +
> > +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
> > +{
> > +    int64_t offset, len;
> > +    ++optind;
> > +    offset = cvtnum(argv[optind]);
> > +    ++optind;
> > +    len = cvtnum(argv[optind]);
>
> Same here: please check for 'argv' overflow.
>
> > +    return blk_zone_mgmt(blk, zone_open, offset, len);
> > +}
> > +
> > +static const cmdinfo_t zone_open_cmd = {
> > +        .name = "zone_open",
> > +        .altname = "f",
>
> Same here; shouldn't 'altname' be different for each function?
> 'zo', maybe?
>
> > +        .cfunc = zone_open_f,
> > +        .argmin = 2,
> > +        .argmax = 2,
> > +        .args = "offset [offset..] len [len..]",
> > +        .oneline = "explicit open a range of zones in zone block device",
> > +};
> > +
> > +static int zone_close_f(BlockBackend *blk, int argc, char **argv)
> > +{
> > +    int64_t offset, len;
> > +    ++optind;
> > +    offset = cvtnum(argv[optind]);
> > +    ++optind;
> > +    len = cvtnum(argv[optind]);
>
> argv checking.
>
> > +    return blk_zone_mgmt(blk, zone_close, offset, len);
> > +}
> > +
> > +static const cmdinfo_t zone_close_cmd = {
> > +        .name = "zone_close",
> > +        .altname = "f",
>
> altname 'zc'
>
> > +        .cfunc = zone_close_f,
> > +        .argmin = 2,
> > +        .argmax = 2,
> > +        .args = "offset [offset..] len [len..]",
> > +        .oneline = "close a range of zones in zone block device",
> > +};
> > +
> > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
> > +{
> > +    int64_t offset, len;
> > +    ++optind;
> > +    offset = cvtnum(argv[optind]);
> > +    ++optind;
> > +    len = cvtnum(argv[optind]);
>
> Argv checking.
>
> > +    return blk_zone_mgmt(blk, zone_finish, offset, len);
> > +}
> > +
> > +static const cmdinfo_t zone_finish_cmd = {
> > +        .name = "zone_finish",
> > +        .altname = "f",
>
> altname 'zf'
>
> > +        .cfunc = zone_finish_f,
> > +        .argmin = 2,
> > +        .argmax = 2,
> > +        .args = "offset [offset..] len [len..]",
> > +        .oneline = "finish a range of zones in zone block device",
> > +};
> > +
> > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
> > +{
> > +    int64_t offset, len;
> > +    ++optind;
> > +    offset = cvtnum(argv[optind]);
> > +    ++optind;
> > +    len = cvtnum(argv[optind]);
>
> Argv checking.
>
> > +    return blk_zone_mgmt(blk, zone_reset, offset, len);
> > +}
> > +
> > +static const cmdinfo_t zone_reset_cmd = {
> > +        .name = "zone_reset",
> > +        .altname = "f",
>
> altname 'zf'
>
> > +        .cfunc = zone_reset_f, > +        .argmin = 2,
> > +        .argmax = 2,
> > +        .args = "offset [offset..] len [len..]",
> > +        .oneline = "reset a zone write pointer in zone block device",
> > +};
> > +
> > +
> >   static int truncate_f(BlockBackend *blk, int argc, char **argv);
> >   static const cmdinfo_t truncate_cmd = {
> >       .name       = "truncate",
> > @@ -2498,6 +2614,11 @@ static void __attribute((constructor)) init_qemuio_commands(void)
> >       qemuio_add_command(&aio_write_cmd);
> >       qemuio_add_command(&aio_flush_cmd);
> >       qemuio_add_command(&flush_cmd);
> > +    qemuio_add_command(&zone_report_cmd);
> > +    qemuio_add_command(&zone_open_cmd);
> > +    qemuio_add_command(&zone_close_cmd);
> > +    qemuio_add_command(&zone_finish_cmd);
> > +    qemuio_add_command(&zone_reset_cmd);
> >       qemuio_add_command(&truncate_cmd);
> >       qemuio_add_command(&length_cmd);
> >       qemuio_add_command(&info_cmd);
>
> Otherwise looks okay.
>
Thanks!

> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke                        Kernel Storage Architect
> hare@suse.de                                      +49 911 74053 688
> SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information.
  2022-06-27  7:31   ` Hannes Reinecke
@ 2022-06-27  8:32     ` Sam Li
  0 siblings, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  8:32 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: qemu-devel, Hanna Reitz, Dmitry Fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, Damien Le Moal, qemu block

Hannes Reinecke <hare@suse.de> 于2022年6月27日周一 15:31写道:
>
> On 6/27/22 02:19, Sam Li wrote:
> > Use sysfs attribute files to get the zoned device information in case
> > that ioctl() commands of zone management interface won't work. It can
> > return long type of value like chunk_sectors, zoned_append_max_bytes,
> > max_open_zones, max_active_zones.
> > ---
> >   block/file-posix.c | 37 +++++++++++++++++++++++++------------
> >   1 file changed, 25 insertions(+), 12 deletions(-)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 1b8b0d351f..73c2cdfbca 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1216,15 +1216,19 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
> >   #endif
> >   }
> >
> > -static int hdev_get_max_segments(int fd, struct stat *st)
> > -{
> > +/*
> > + * Get zoned device information (chunk_sectors, zoned_append_max_bytes,
> > + * max_open_zones, max_active_zones) through sysfs attribute files.
> > + */
> > +static long get_sysfs_long_val(int fd, struct stat *st,
> > +                               const char *attribute) {
> >   #ifdef CONFIG_LINUX
> >       char buf[32];
> >       const char *end;
> >       char *sysfspath = NULL;
> >       int ret;
> >       int sysfd = -1;
> > -    long max_segments;
> > +    long val;
> >
> >       if (S_ISCHR(st->st_mode)) {
> >           if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> > @@ -1237,8 +1241,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
> >           return -ENOTSUP;
> >       }
> >
> > -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > -                                major(st->st_rdev), minor(st->st_rdev));
> > +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> > +                                major(st->st_rdev), minor(st->st_rdev),
> > +                                attribute);
> >       sysfd = open(sysfspath, O_RDONLY);
> >       if (sysfd == -1) {
> >           ret = -errno;
> > @@ -1256,9 +1261,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
> >       }
> >       buf[ret] = 0;
> >       /* The file is ended with '\n', pass 'end' to accept that. */
> > -    ret = qemu_strtol(buf, &end, 10, &max_segments);
> > +    ret = qemu_strtol(buf, &end, 10, &val);
> >       if (ret == 0 && end && *end == '\n') {
> > -        ret = max_segments;
> > +        ret = val;
> >       }
> >
> >   out:
> > @@ -1272,6 +1277,15 @@ out:
> >   #endif
> >   }
> >
> > +static int hdev_get_max_segments(int fd, struct stat *st) {
> > +    int ret;
> > +    ret = get_sysfs_long_val(fd, st, "max_segments");
> > +    if (ret < 0) {
> > +        return -1;
> > +    }
> > +    return ret;
> > +}
> > +
> >   static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >   {
> >       BDRVRawState *s = bs->opaque;
> > @@ -1872,6 +1886,7 @@ static int handle_aiocb_zone_report(void *opaque) {
> >
> >   static int handle_aiocb_zone_mgmt(void *opaque) {
> >       RawPosixAIOData *aiocb = opaque;
> > +    BlockDriverState *s = aiocb->bs;
> >       int fd = aiocb->aio_fildes;
> >       int64_t offset = aiocb->aio_offset;
> >       int64_t len = aiocb->aio_nbytes;
> > @@ -1884,11 +1899,9 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >       int64_t zone_size_mask;
> >       int ret;
> >
> > -    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> > -    if (ret) {
> > -        return -1;
> > -    }
> > -
> > +    g_autofree struct stat *file = g_new(struct stat, 1);
> > +    stat(s->filename, file);
> > +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
> >       zone_size_mask = zone_size - 1;
> >       if (offset & zone_size_mask) {
> >           error_report("offset is not the start of a zone");
>
> Round of applause.
>
Thanks! It was based on Damien's suggestion.

> Reviewed-by: Hannes Reinecke <hare@suse.de>
>
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke                        Kernel Storage Architect
> hare@suse.de                                      +49 911 74053 688
> SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 4/5] file-posix: introduce get_sysfs_str_val for device zoned model.
  2022-06-27  7:41   ` Hannes Reinecke
@ 2022-06-27  8:44     ` Sam Li
  0 siblings, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-27  8:44 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: qemu-devel, Hanna Reitz, Dmitry Fomichev, Kevin Wolf, Fam Zheng,
	Stefan Hajnoczi, Damien Le Moal, qemu block

Hannes Reinecke <hare@suse.de> 于2022年6月27日周一 15:41写道:
>
> On 6/27/22 02:19, Sam Li wrote:
> > ---
> >   block/file-posix.c           | 60 ++++++++++++++++++++++++++++++++++++
> >   include/block/block-common.h |  4 +--
> >   2 files changed, 62 insertions(+), 2 deletions(-)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 73c2cdfbca..74c0245e0f 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1277,6 +1277,66 @@ out:
> >   #endif
> >   }
> >
> > +/*
> > + * Convert the zoned attribute file in sysfs to internal value.
> > + */
> > +static zone_model get_sysfs_str_val(int fd, struct stat *st) {
> > +#ifdef CONFIG_LINUX
> > +    char buf[32];
> > +    char *sysfspath = NULL;
> > +    int ret;
> > +    int sysfd = -1;
> > +
> > +    if (S_ISCHR(st->st_mode)) {
> > +        if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> > +            return ret;
> > +        }
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    if (!S_ISBLK(st->st_mode)) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/zoned",
> > +                                major(st->st_rdev), minor(st->st_rdev));
> > +    sysfd = open(sysfspath, O_RDONLY);
> > +    if (sysfd == -1) {
> > +        ret = -errno;
> > +        goto out;
> > +    }
> > +    do {
> > +        ret = read(sysfd, buf, sizeof(buf) - 1);
> > +    } while (ret == -1 && errno == EINTR);
>
> This is wrong.
> read() might return a value smaller than the 'len' argument (sizeof(buf)
> -1 in your case). But in that case it's a short read, and one need to
> call 'read()' again to fetch the remaining bytes.
>
> So the correct code would be something like:
>
> offset = 0;
> do {
>      ret = read(sysfd, buf + offset, sizeof(buf) - 1 + offset);
>      if (ret > 0)
>          offset += ret;
> } while (ret > 0);
>
> Not that you'd actually need it; reads from sysfs are basically never
> interrupted, so you should be able to read from an attribute in one go.
> IE alternatively you can drop the 'while' loop and just call read().
>
Yes, I didn't think it through.

> > +    if (ret < 0) {
> > +        ret = -errno;
> > +        goto out;
> > +    } else if (ret == 0) {
> > +        ret = -EIO;
> > +        goto out;
> > +    }
> > +    buf[ret] = 0;
> > +
> > +    /* The file is ended with '\n' */
>
> I'd rather check if the string ends with an '\n', and overwrite
> it with a '\0'. That way you'd be insulated against any changes
> to sysfs.
>
Right! I was thinking about it but got hasty to hardcode everything.

> > +    if (strcmp(buf, "host-managed\n") == 0) {
> > +        return BLK_Z_HM;
> > +    } else if (strcmp(buf, "host-aware\n") == 0) {
> > +        return BLK_Z_HA;
> > +    } else {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +out:
> > +    if (sysfd != -1) {
> > +        close(sysfd);
> > +    }
> > +    g_free(sysfspath);
> > +    return ret;
> > +#else
> > +    return -ENOTSUP;
> > +#endif
> > +}
> > +
> >   static int hdev_get_max_segments(int fd, struct stat *st) {
> >       int ret;
> >       ret = get_sysfs_long_val(fd, st, "max_segments");
>
> And as you already set a precedent in your previous patch, I'd recommend
> split this in two patches, one introducing a generic function
> 'get_sysfs_str_val()' which returns a string and another function
> (eg hdev_get_zone_model()) which calls this function to fetch the device
> zoned model.
>
Maybe we can just return a str in get_sysfs_str_val and ignore
returning a zone_model for now? I was using zone_model for testing
purpose. Right now, this function is not used by others in
file-posix.c that causes compilation error in QEMU.

Thanks for reviewing!
Sam
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index 78cddeeda5..35e00afe8e 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -56,8 +56,8 @@ typedef enum zone_op {
> >   } zone_op;
> >
> >   typedef enum zone_model {
> > -    BLK_Z_HM,
> > -    BLK_Z_HA,
> > +    BLK_Z_HM = 0x1,
> > +    BLK_Z_HA = 0x2,
> >   } zone_model;
> >
> >   typedef enum BlkZoneCondition {
>
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke                        Kernel Storage Architect
> hare@suse.de                                      +49 911 74053 688
> SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-27  0:19 ` [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
  2022-06-27  7:21   ` Hannes Reinecke
@ 2022-06-27 14:15   ` Stefan Hajnoczi
  2022-06-28  8:05     ` Sam Li
  1 sibling, 1 reply; 32+ messages in thread
From: Stefan Hajnoczi @ 2022-06-27 14:15 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf,
	Fam Zheng, damien.lemoal, qemu-block

[-- Attachment #1: Type: text/plain, Size: 5299 bytes --]

On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> diff --git a/block/block-backend.c b/block/block-backend.c
> index e0e1aff4b1..786f964d02 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>      return ret;
>  }
>  
> +/*
> + * Return zone_report from BlockDriver. Offset can be any number within
> + * the zone size. No alignment for offset and len.

What is the purpose of len? Is it the maximum number of zones to return
in nr_zones[]?

How does the caller know how many zones were returned?

> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                       int64_t len, int64_t *nr_zones,
> +                       BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    BlockDriverState *bs;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    bs = blk_bs(blk);
> +
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    bdrv_inc_in_flight(bs);

The bdrv_inc/dec_in_flight() call should be inside
bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.

> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> +                              nr_zones, zones);
> +    bdrv_dec_in_flight(bs);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Return zone_mgmt from BlockDriver.

Maybe this should be:

  Send a zone management command.

> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>              PreallocMode prealloc;
>              Error **errp;
>          } truncate;
> +        struct {
> +            int64_t *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        zone_op op;

It's cleaner to put op inside a struct zone_mgmt so its purpose is
self-explanatory:

  struct {
      zone_op op;
  } zone_mgmt;

> +static int handle_aiocb_zone_report(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +
> +    struct blk_zone *blkz;
> +    int64_t rep_size, nrz;
> +    int ret, n = 0, i = 0;
> +
> +    nrz = *nr_zones;
> +    if (len == -1) {
> +        return -errno;

Where is errno set? Should this be an errno constant instead like
-EINVAL?

> +    }
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);

g_new() looks incorrect. There should be 1 struct blk_zone_report
followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
instead.

> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> +    printf("start to report zone with offset: 0x%lx\n", offset);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = offset;
> +        rep->nr_zones = nrz;

What prevents zones[] overflowing? nrz isn't being reduced in the loop,
so maybe the rep->nr_zones return value will eventually exceed the
number of elements still available in zones[n..]?

> +static int handle_aiocb_zone_mgmt(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +    zone_op op = aiocb->op;
> +
> +    struct blk_zone_range range;
> +    const char *ioctl_name;
> +    unsigned long ioctl_op;
> +    int64_t zone_size;
> +    int64_t zone_size_mask;
> +    int ret;
> +
> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);

Can this value be stored in bs (maybe in BlockLimits) to avoid calling
ioctl(BLKGETZONESZ) each time?

> +    if (ret) {
> +        return -1;

The return value should be a negative errno. -1 is -EPERM but it's
probably not that error code you wanted. This should be:

  return -errno;

> +    }
> +
> +    zone_size_mask = zone_size - 1;
> +    if (offset & zone_size_mask) {
> +        error_report("offset is not the start of a zone");
> +        return -1;

return -EINVAL;

> +    }
> +
> +    if (len & zone_size_mask) {
> +        error_report("len is not aligned to zones");
> +        return -1;

return -EINVAL;

> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +        int64_t len, int64_t *nr_zones,
> +        BlockZoneDescriptor *zones) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_IOCTL,
> +        .aio_offset = offset,
> +        .aio_nbytes = len,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +                },

Indentation is off here. Please use 4-space indentation.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 2/5] qemu-io: add zoned block device operations.
  2022-06-27  0:19 ` [RFC v3 2/5] qemu-io: add zoned block device operations Sam Li
  2022-06-27  7:30   ` Hannes Reinecke
@ 2022-06-28  7:56   ` Stefan Hajnoczi
  2022-06-28  9:02     ` Sam Li
  2022-06-28  9:11     ` Damien Le Moal
  1 sibling, 2 replies; 32+ messages in thread
From: Stefan Hajnoczi @ 2022-06-28  7:56 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf,
	Fam Zheng, damien.lemoal, qemu-block

[-- Attachment #1: Type: text/plain, Size: 3049 bytes --]

On Mon, Jun 27, 2022 at 08:19:14AM +0800, Sam Li wrote:
> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> index 2f0d8ac25a..3f2592b9f5 100644
> --- a/qemu-io-cmds.c
> +++ b/qemu-io-cmds.c
> @@ -1706,6 +1706,122 @@ static const cmdinfo_t flush_cmd = {
>      .oneline    = "flush all in-core file state to disk",
>  };
>  
> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int ret;
> +    int64_t offset, len, nr_zones;
> +    int i = 0;
> +
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);
> +    ++optind;
> +    nr_zones = cvtnum(argv[optind]);
> +
> +    g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
> +    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
> +    while (i < nr_zones) {

Does blk_zone_report() set nr_zones to 0 on failure or do we need to
check if (ret < 0) here?

> +        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "

The rest of the source file uses printf() instead of fprintf(stdout,
...). That's usually preferred because it's shorter.

> +                        "zcond:%u, [type: %u]\n",

Please use PRIx64 instead of lx format specifiers for portability. On
32-bit hosts lx is 32-bit, not 64-bit. You can grep QEMU's code for
examples of PRIx64.

> +                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
> +                zones[i].cond, zones[i].type);
> +        ++i;
> +    }

A for loop is more idiomatic:

  for (int i = 0; i < nr_zones; i++) {
      ...
  }

> +    return ret;
> +}
> +
> +static const cmdinfo_t zone_report_cmd = {
> +        .name = "zone_report",
> +        .altname = "f",
> +        .cfunc = zone_report_f,
> +        .argmin = 3,
> +        .argmax = 3,
> +        .args = "offset [offset..] len [len..] number [num..]",

The arguments are "offset len number". This command does not accept
optional offset/len/num arguments.

> +        .oneline = "report a number of zones",

Maybe "report zone information".

> +};
> +
> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
> +{
> +    int64_t offset, len;
> +    ++optind;
> +    offset = cvtnum(argv[optind]);
> +    ++optind;
> +    len = cvtnum(argv[optind]);
> +    return blk_zone_mgmt(blk, zone_open, offset, len);

Where is the error reported? When I look at read_f() I see:

    if (ret < 0) {
        printf("read failed: %s\n", strerror(-ret));

I think something similar is needed because qemu-io.c does not print an
error message for us. The same is true for the other commands defined in
this patch.

> +}
> +
> +static const cmdinfo_t zone_open_cmd = {
> +        .name = "zone_open",
> +        .altname = "f",
> +        .cfunc = zone_open_f,
> +        .argmin = 2,
> +        .argmax = 2,
> +        .args = "offset [offset..] len [len..]",

There are no optional offset/len args. The same is true for the other
commands below.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-27 14:15   ` Stefan Hajnoczi
@ 2022-06-28  8:05     ` Sam Li
  2022-06-28  9:05       ` Damien Le Moal
  0 siblings, 1 reply; 32+ messages in thread
From: Sam Li @ 2022-06-28  8:05 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, Hannes Reinecke, Hanna Reitz, Dmitry Fomichev,
	Kevin Wolf, Fam Zheng, Damien Le Moal, qemu block

Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
>
> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index e0e1aff4b1..786f964d02 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >      return ret;
> >  }
> >
> > +/*
> > + * Return zone_report from BlockDriver. Offset can be any number within
> > + * the zone size. No alignment for offset and len.
>
> What is the purpose of len? Is it the maximum number of zones to return
> in nr_zones[]?

len is actually not used in bdrv_co_zone_report. It is needed by
blk_check_byte_request.

> How does the caller know how many zones were returned?

nr_zones represents IN maximum and OUT actual. The caller will know by
nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
comments.

>
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                       int64_t len, int64_t *nr_zones,
> > +                       BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    BlockDriverState *bs;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    bs = blk_bs(blk);
> > +
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    bdrv_inc_in_flight(bs);
>
> The bdrv_inc/dec_in_flight() call should be inside
> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
>
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> > +                              nr_zones, zones);
> > +    bdrv_dec_in_flight(bs);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Return zone_mgmt from BlockDriver.
>
> Maybe this should be:
>
>   Send a zone management command.

Yes, it's more accurate.

>
> > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >              PreallocMode prealloc;
> >              Error **errp;
> >          } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report;
> > +        zone_op op;
>
> It's cleaner to put op inside a struct zone_mgmt so its purpose is
> self-explanatory:
>
>   struct {
>       zone_op op;
>   } zone_mgmt;
>
> > +static int handle_aiocb_zone_report(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    nrz = *nr_zones;
> > +    if (len == -1) {
> > +        return -errno;
>
> Where is errno set? Should this be an errno constant instead like
> -EINVAL?

That's right! Noted.

>
> > +    }
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>
> g_new() looks incorrect. There should be 1 struct blk_zone_report
> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> instead.

Yes! However, it still has a memory leak error when using g_autofree
&& g_malloc.

>
> > +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> > +    printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
>
> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
> so maybe the rep->nr_zones return value will eventually exceed the
> number of elements still available in zones[n..]?

I suppose the number of zones[] is restricted in the subsequent for
loop where zones[] copy one zone at a time as n increases. Even if
rep->zones exceeds the available room in zones[], the extra zone will
not be copied.

>
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +    zone_op op = aiocb->op;
> > +
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    unsigned long ioctl_op;
> > +    int64_t zone_size;
> > +    int64_t zone_size_mask;
> > +    int ret;
> > +
> > +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
>
> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
> ioctl(BLKGETZONESZ) each time?

Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
I think through the configurations. In addition, it's a temporary
approach. It is substituted by get_sysfs_long_val now.

>
> > +    if (ret) {
> > +        return -1;
>
> The return value should be a negative errno. -1 is -EPERM but it's
> probably not that error code you wanted. This should be:
>
>   return -errno;
>
> > +    }
> > +
> > +    zone_size_mask = zone_size - 1;
> > +    if (offset & zone_size_mask) {
> > +        error_report("offset is not the start of a zone");
> > +        return -1;
>
> return -EINVAL;
>
> > +    }
> > +
> > +    if (len & zone_size_mask) {
> > +        error_report("len is not aligned to zones");
> > +        return -1;
>
> return -EINVAL;
>
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +        int64_t len, int64_t *nr_zones,
> > +        BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs         = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type   = QEMU_AIO_IOCTL,
> > +        .aio_offset = offset,
> > +        .aio_nbytes = len,
> > +        .zone_report    = {
> > +                .nr_zones       = nr_zones,
> > +                .zones          = zones,
> > +                },
>
> Indentation is off here. Please use 4-space indentation.
Noted!

Thanks for reviewing!

Sam


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information.
  2022-06-27  0:19 ` [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information Sam Li
  2022-06-27  7:31   ` Hannes Reinecke
@ 2022-06-28  8:09   ` Stefan Hajnoczi
  2022-06-28  9:18     ` Sam Li
  1 sibling, 1 reply; 32+ messages in thread
From: Stefan Hajnoczi @ 2022-06-28  8:09 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf,
	Fam Zheng, damien.lemoal, qemu-block

[-- Attachment #1: Type: text/plain, Size: 4533 bytes --]

On Mon, Jun 27, 2022 at 08:19:15AM +0800, Sam Li wrote:
> Use sysfs attribute files to get the zoned device information in case
> that ioctl() commands of zone management interface won't work. It can
> return long type of value like chunk_sectors, zoned_append_max_bytes,
> max_open_zones, max_active_zones.
> ---
>  block/file-posix.c | 37 +++++++++++++++++++++++++------------
>  1 file changed, 25 insertions(+), 12 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 1b8b0d351f..73c2cdfbca 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1216,15 +1216,19 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
>  #endif
>  }
>  
> -static int hdev_get_max_segments(int fd, struct stat *st)
> -{
> +/*
> + * Get zoned device information (chunk_sectors, zoned_append_max_bytes,
> + * max_open_zones, max_active_zones) through sysfs attribute files.

This function is also used to get max_segments, which is not related to
zoned devices. How about:

  Get a block queue sysfs attribute value.

> + */
> +static long get_sysfs_long_val(int fd, struct stat *st,
> +                               const char *attribute) {
>  #ifdef CONFIG_LINUX
>      char buf[32];
>      const char *end;
>      char *sysfspath = NULL;
>      int ret;
>      int sysfd = -1;
> -    long max_segments;
> +    long val;
>  
>      if (S_ISCHR(st->st_mode)) {
>          if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> @@ -1237,8 +1241,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>          return -ENOTSUP;
>      }
>  
> -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -                                major(st->st_rdev), minor(st->st_rdev));
> +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> +                                major(st->st_rdev), minor(st->st_rdev),
> +                                attribute);
>      sysfd = open(sysfspath, O_RDONLY);
>      if (sysfd == -1) {
>          ret = -errno;
> @@ -1256,9 +1261,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>      }
>      buf[ret] = 0;
>      /* The file is ended with '\n', pass 'end' to accept that. */
> -    ret = qemu_strtol(buf, &end, 10, &max_segments);
> +    ret = qemu_strtol(buf, &end, 10, &val);
>      if (ret == 0 && end && *end == '\n') {
> -        ret = max_segments;
> +        ret = val;
>      }
>  
>  out:
> @@ -1272,6 +1277,15 @@ out:
>  #endif
>  }
>  
> +static int hdev_get_max_segments(int fd, struct stat *st) {
> +    int ret;
> +    ret = get_sysfs_long_val(fd, st, "max_segments");
> +    if (ret < 0) {
> +        return -1;

This hides the actual error number. The if statement can be dropped and
the function can be simplified to:

  static int hdev_get_max_segments(int fd, struct stat *st) {
      return get_sysfs_long_val(fd, st, "max_segments");
  }

Whether you want to keep the tiny helper function or inline
get_sysfs_long_val(fd, st, "max_segments") into the caller is up to you.

> +    }
> +    return ret;
> +}
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -1872,6 +1886,7 @@ static int handle_aiocb_zone_report(void *opaque) {
>  
>  static int handle_aiocb_zone_mgmt(void *opaque) {
>      RawPosixAIOData *aiocb = opaque;
> +    BlockDriverState *s = aiocb->bs;
>      int fd = aiocb->aio_fildes;
>      int64_t offset = aiocb->aio_offset;
>      int64_t len = aiocb->aio_nbytes;
> @@ -1884,11 +1899,9 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>      int64_t zone_size_mask;
>      int ret;
>  
> -    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> -    if (ret) {
> -        return -1;
> -    }
> -
> +    g_autofree struct stat *file = g_new(struct stat, 1);
> +    stat(s->filename, file);

There is no need to allocate struct stat using g_new(). It's a small
struct that can be on the stack.

Also, fstat(fd, &st) is preferred when the file descriptor is already
open because it avoids race conditions due to file renaming/path
traversal.

Here is how this could be written:

  struct stat st;
  if (fstat(fd, &st) < 0) {
      return -errno;
  }

> +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
>      zone_size_mask = zone_size - 1;
>      if (offset & zone_size_mask) {
>          error_report("offset is not the start of a zone");
> -- 
> 2.35.3
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 5/5] qemu-iotests: add zone operation tests.
  2022-06-27  0:19 ` [RFC v3 5/5] qemu-iotests: add zone operation tests Sam Li
  2022-06-27  7:42   ` Hannes Reinecke
@ 2022-06-28  8:19   ` Stefan Hajnoczi
  2022-06-28  9:34     ` Sam Li
  1 sibling, 1 reply; 32+ messages in thread
From: Stefan Hajnoczi @ 2022-06-28  8:19 UTC (permalink / raw)
  To: Sam Li
  Cc: qemu-devel, hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf,
	Fam Zheng, damien.lemoal, qemu-block

[-- Attachment #1: Type: text/plain, Size: 909 bytes --]

On Mon, Jun 27, 2022 at 08:19:17AM +0800, Sam Li wrote:
> diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
> new file mode 100755
> index 0000000000..262c0b5427
> --- /dev/null
> +++ b/tests/qemu-iotests/tests/zoned.sh
> @@ -0,0 +1,49 @@
> +#!/usr/bin/env bash
> +#
> +# Test zone management operations.
> +#
> +
> +QEMU_IO="build/qemu-io"
> +IMG="--image-opts driver=zoned_host_device,filename=/dev/nullb0"
> +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
> +
> +echo "Testing a null_blk device"
> +echo "Simple cases: if the operations work"
> +sudo modprobe null_blk nr_devices=1 zoned=1

Please use bash's "trap" command to remove null_blk on exit. That way
cleanup happens whether the script exits successfully or not. See
tests/qemu-iotests/108 for an example.

> +# success, all done
> +sudo rmmod null_blk
> +echo "*** done"
> +#rm -f $seq.full

Why is this commented out?

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 2/5] qemu-io: add zoned block device operations.
  2022-06-28  7:56   ` Stefan Hajnoczi
@ 2022-06-28  9:02     ` Sam Li
  2022-06-28  9:11     ` Damien Le Moal
  1 sibling, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-28  9:02 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, Hannes Reinecke, Hanna Reitz, Dmitry Fomichev,
	Kevin Wolf, Fam Zheng, Damien Le Moal, qemu block

Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 16:20写道:
>
> On Mon, Jun 27, 2022 at 08:19:14AM +0800, Sam Li wrote:
> > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> > index 2f0d8ac25a..3f2592b9f5 100644
> > --- a/qemu-io-cmds.c
> > +++ b/qemu-io-cmds.c
> > @@ -1706,6 +1706,122 @@ static const cmdinfo_t flush_cmd = {
> >      .oneline    = "flush all in-core file state to disk",
> >  };
> >
> > +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
> > +{
> > +    int ret;
> > +    int64_t offset, len, nr_zones;
> > +    int i = 0;
> > +
> > +    ++optind;
> > +    offset = cvtnum(argv[optind]);
> > +    ++optind;
> > +    len = cvtnum(argv[optind]);
> > +    ++optind;
> > +    nr_zones = cvtnum(argv[optind]);
> > +
> > +    g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
> > +    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
> > +    while (i < nr_zones) {
>
> Does blk_zone_report() set nr_zones to 0 on failure or do we need to
> check if (ret < 0) here?

I'll check if (ret<0) in zone_report and other commands in this patch as well.

>
> > +        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "
>
> The rest of the source file uses printf() instead of fprintf(stdout,
> ...). That's usually preferred because it's shorter.
>
> > +                        "zcond:%u, [type: %u]\n",
>
> Please use PRIx64 instead of lx format specifiers for portability. On
> 32-bit hosts lx is 32-bit, not 64-bit. You can grep QEMU's code for
> examples of PRIx64.

Noted. It is necessary.

>
> > +                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
> > +                zones[i].cond, zones[i].type);
> > +        ++i;
> > +    }
>
> A for loop is more idiomatic:
>
>   for (int i = 0; i < nr_zones; i++) {
>       ...
>   }
>
> > +    return ret;
> > +}
> > +
> > +static const cmdinfo_t zone_report_cmd = {
> > +        .name = "zone_report",
> > +        .altname = "f",
> > +        .cfunc = zone_report_f,
> > +        .argmin = 3,
> > +        .argmax = 3,
> > +        .args = "offset [offset..] len [len..] number [num..]",
>
> The arguments are "offset len number". This command does not accept
> optional offset/len/num arguments.
>
> > +        .oneline = "report a number of zones",
>
> Maybe "report zone information".
>
> > +};
> > +
> > +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
> > +{
> > +    int64_t offset, len;
> > +    ++optind;
> > +    offset = cvtnum(argv[optind]);
> > +    ++optind;
> > +    len = cvtnum(argv[optind]);
> > +    return blk_zone_mgmt(blk, zone_open, offset, len);
>
> Where is the error reported? When I look at read_f() I see:
>
>     if (ret < 0) {
>         printf("read failed: %s\n", strerror(-ret));
>
> I think something similar is needed because qemu-io.c does not print an
> error message for us. The same is true for the other commands defined in
> this patch.
>
> > +}
> > +
> > +static const cmdinfo_t zone_open_cmd = {
> > +        .name = "zone_open",
> > +        .altname = "f",
> > +        .cfunc = zone_open_f,
> > +        .argmin = 2,
> > +        .argmax = 2,
> > +        .args = "offset [offset..] len [len..]",
>
> There are no optional offset/len args. The same is true for the other
> commands below.

Thanks for reviewing!

Sam


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-28  8:05     ` Sam Li
@ 2022-06-28  9:05       ` Damien Le Moal
  2022-06-28 10:23         ` Sam Li
  0 siblings, 1 reply; 32+ messages in thread
From: Damien Le Moal @ 2022-06-28  9:05 UTC (permalink / raw)
  To: Sam Li, Stefan Hajnoczi
  Cc: qemu-devel, Hannes Reinecke, Hanna Reitz, Dmitry Fomichev,
	Kevin Wolf, Fam Zheng, qemu block

On 6/28/22 17:05, Sam Li wrote:
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
>>
>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index e0e1aff4b1..786f964d02 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>>>      return ret;
>>>  }
>>>
>>> +/*
>>> + * Return zone_report from BlockDriver. Offset can be any number within
>>> + * the zone size. No alignment for offset and len.
>>
>> What is the purpose of len? Is it the maximum number of zones to return
>> in nr_zones[]?
> 
> len is actually not used in bdrv_co_zone_report. It is needed by
> blk_check_byte_request.

There is no IO buffer being passed. Only an array of zone descriptors and
an in-out number of zones. len is definitely not needed. Not sure what
blk_check_byte_request() is intended to check though.

> 
>> How does the caller know how many zones were returned?
> 
> nr_zones represents IN maximum and OUT actual. The caller will know by
> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
> comments.
> 
>>
>>> + */
>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>> +                       int64_t len, int64_t *nr_zones,
>>> +                       BlockZoneDescriptor *zones)
>>> +{
>>> +    int ret;
>>> +    BlockDriverState *bs;
>>> +    IO_CODE();
>>> +
>>> +    blk_inc_in_flight(blk); /* increase before waiting */
>>> +    blk_wait_while_drained(blk);
>>> +    bs = blk_bs(blk);
>>> +
>>> +    ret = blk_check_byte_request(blk, offset, len);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    bdrv_inc_in_flight(bs);
>>
>> The bdrv_inc/dec_in_flight() call should be inside
>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
>>
>>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
>>> +                              nr_zones, zones);
>>> +    bdrv_dec_in_flight(bs);
>>> +    blk_dec_in_flight(blk);
>>> +    return ret;
>>> +}
>>> +
>>> +/*
>>> + * Return zone_mgmt from BlockDriver.
>>
>> Maybe this should be:
>>
>>   Send a zone management command.
> 
> Yes, it's more accurate.
> 
>>
>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>>>              PreallocMode prealloc;
>>>              Error **errp;
>>>          } truncate;
>>> +        struct {
>>> +            int64_t *nr_zones;
>>> +            BlockZoneDescriptor *zones;
>>> +        } zone_report;
>>> +        zone_op op;
>>
>> It's cleaner to put op inside a struct zone_mgmt so its purpose is
>> self-explanatory:
>>
>>   struct {
>>       zone_op op;
>>   } zone_mgmt;
>>
>>> +static int handle_aiocb_zone_report(void *opaque) {
>>> +    RawPosixAIOData *aiocb = opaque;
>>> +    int fd = aiocb->aio_fildes;
>>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
>>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>>> +    int64_t offset = aiocb->aio_offset;
>>> +    int64_t len = aiocb->aio_nbytes;

Since you have the zone array and number of zones (size of that array) I
really do not see why you need len.

>>> +
>>> +    struct blk_zone *blkz;
>>> +    int64_t rep_size, nrz;
>>> +    int ret, n = 0, i = 0;
>>> +
>>> +    nrz = *nr_zones;
>>> +    if (len == -1) {
>>> +        return -errno;
>>
>> Where is errno set? Should this be an errno constant instead like
>> -EINVAL?
> 
> That's right! Noted.
> 
>>
>>> +    }
>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>>
>> g_new() looks incorrect. There should be 1 struct blk_zone_report
>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
>> instead.
> 
> Yes! However, it still has a memory leak error when using g_autofree
> && g_malloc.

That may be because you are changing the value of the rep pointer while
parsing the report ?

> 
>>
>>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
>>> +    printf("start to report zone with offset: 0x%lx\n", offset);
>>> +
>>> +    blkz = (struct blk_zone *)(rep + 1);
>>> +    while (n < nrz) {
>>> +        memset(rep, 0, rep_size);
>>> +        rep->sector = offset;
>>> +        rep->nr_zones = nrz;
>>
>> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
>> so maybe the rep->nr_zones return value will eventually exceed the
>> number of elements still available in zones[n..]?
> 
> I suppose the number of zones[] is restricted in the subsequent for
> loop where zones[] copy one zone at a time as n increases. Even if
> rep->zones exceeds the available room in zones[], the extra zone will
> not be copied.
> 
>>
>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>>> +    RawPosixAIOData *aiocb = opaque;
>>> +    int fd = aiocb->aio_fildes;
>>> +    int64_t offset = aiocb->aio_offset;
>>> +    int64_t len = aiocb->aio_nbytes;
>>> +    zone_op op = aiocb->op;
>>> +
>>> +    struct blk_zone_range range;
>>> +    const char *ioctl_name;
>>> +    unsigned long ioctl_op;
>>> +    int64_t zone_size;
>>> +    int64_t zone_size_mask;
>>> +    int ret;
>>> +
>>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
>>
>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
>> ioctl(BLKGETZONESZ) each time?
> 
> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
> I think through the configurations. In addition, it's a temporary
> approach. It is substituted by get_sysfs_long_val now.
> 
>>
>>> +    if (ret) {
>>> +        return -1;
>>
>> The return value should be a negative errno. -1 is -EPERM but it's
>> probably not that error code you wanted. This should be:
>>
>>   return -errno;
>>
>>> +    }
>>> +
>>> +    zone_size_mask = zone_size - 1;
>>> +    if (offset & zone_size_mask) {
>>> +        error_report("offset is not the start of a zone");
>>> +        return -1;
>>
>> return -EINVAL;
>>
>>> +    }
>>> +
>>> +    if (len & zone_size_mask) {
>>> +        error_report("len is not aligned to zones");
>>> +        return -1;
>>
>> return -EINVAL;
>>
>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
>>> +        int64_t len, int64_t *nr_zones,
>>> +        BlockZoneDescriptor *zones) {
>>> +    BDRVRawState *s = bs->opaque;
>>> +    RawPosixAIOData acb;
>>> +
>>> +    acb = (RawPosixAIOData) {
>>> +        .bs         = bs,
>>> +        .aio_fildes = s->fd,
>>> +        .aio_type   = QEMU_AIO_IOCTL,
>>> +        .aio_offset = offset,
>>> +        .aio_nbytes = len,
>>> +        .zone_report    = {
>>> +                .nr_zones       = nr_zones,
>>> +                .zones          = zones,
>>> +                },
>>
>> Indentation is off here. Please use 4-space indentation.
> Noted!
> 
> Thanks for reviewing!
> 
> Sam


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 2/5] qemu-io: add zoned block device operations.
  2022-06-28  7:56   ` Stefan Hajnoczi
  2022-06-28  9:02     ` Sam Li
@ 2022-06-28  9:11     ` Damien Le Moal
  2022-06-28 10:27       ` Sam Li
  1 sibling, 1 reply; 32+ messages in thread
From: Damien Le Moal @ 2022-06-28  9:11 UTC (permalink / raw)
  To: Stefan Hajnoczi, Sam Li
  Cc: qemu-devel, hare, Hanna Reitz, dmitry.fomichev, Kevin Wolf,
	Fam Zheng, qemu-block

On 6/28/22 16:56, Stefan Hajnoczi wrote:
> On Mon, Jun 27, 2022 at 08:19:14AM +0800, Sam Li wrote:
>> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
>> index 2f0d8ac25a..3f2592b9f5 100644
>> --- a/qemu-io-cmds.c
>> +++ b/qemu-io-cmds.c
>> @@ -1706,6 +1706,122 @@ static const cmdinfo_t flush_cmd = {
>>      .oneline    = "flush all in-core file state to disk",
>>  };
>>  
>> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
>> +{
>> +    int ret;
>> +    int64_t offset, len, nr_zones;
>> +    int i = 0;
>> +
>> +    ++optind;
>> +    offset = cvtnum(argv[optind]);
>> +    ++optind;
>> +    len = cvtnum(argv[optind]);
>> +    ++optind;
>> +    nr_zones = cvtnum(argv[optind]);
>> +
>> +    g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
>> +    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
>> +    while (i < nr_zones) {
> 
> Does blk_zone_report() set nr_zones to 0 on failure or do we need to
> check if (ret < 0) here?

ret = 0 means "no zone reported" which happen only if nr_zones is 0 or the
start offset is past the end of the disk capacity. ret < 0 would mean that
a report zone operation was actually attempted and failed (EIO, ENOMEM etc).

> 
>> +        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "
> 
> The rest of the source file uses printf() instead of fprintf(stdout,
> ...). That's usually preferred because it's shorter.
> 
>> +                        "zcond:%u, [type: %u]\n",
> 
> Please use PRIx64 instead of lx format specifiers for portability. On
> 32-bit hosts lx is 32-bit, not 64-bit. You can grep QEMU's code for
> examples of PRIx64.
> 
>> +                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
>> +                zones[i].cond, zones[i].type);
>> +        ++i;
>> +    }
> 
> A for loop is more idiomatic:
> 
>   for (int i = 0; i < nr_zones; i++) {
>       ...
>   }
> 
>> +    return ret;
>> +}
>> +
>> +static const cmdinfo_t zone_report_cmd = {
>> +        .name = "zone_report",
>> +        .altname = "f",
>> +        .cfunc = zone_report_f,
>> +        .argmin = 3,
>> +        .argmax = 3,
>> +        .args = "offset [offset..] len [len..] number [num..]",
> 
> The arguments are "offset len number". This command does not accept
> optional offset/len/num arguments.

The arguments should be offset + len OR offset + number of zones. Having
the 3 of them does not make sense to me. The interface would then be:

(1) offset + len -> report all zones in the block range [offset .. offset
+ len - 1]

(2) offset + number of zones -> report at most "number of zones" from the
zone containing the block at "offset".

(2) matches the semantic used at the device command level. So I prefer to
approach (1).

> 
>> +        .oneline = "report a number of zones",
> 
> Maybe "report zone information".
> 
>> +};
>> +
>> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
>> +{
>> +    int64_t offset, len;
>> +    ++optind;
>> +    offset = cvtnum(argv[optind]);
>> +    ++optind;
>> +    len = cvtnum(argv[optind]);
>> +    return blk_zone_mgmt(blk, zone_open, offset, len);
> 
> Where is the error reported? When I look at read_f() I see:
> 
>     if (ret < 0) {
>         printf("read failed: %s\n", strerror(-ret));
> 
> I think something similar is needed because qemu-io.c does not print an
> error message for us. The same is true for the other commands defined in
> this patch.
> 
>> +}
>> +
>> +static const cmdinfo_t zone_open_cmd = {
>> +        .name = "zone_open",
>> +        .altname = "f",
>> +        .cfunc = zone_open_f,
>> +        .argmin = 2,
>> +        .argmax = 2,
>> +        .args = "offset [offset..] len [len..]",
> 
> There are no optional offset/len args. The same is true for the other
> commands below.


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information.
  2022-06-28  8:09   ` Stefan Hajnoczi
@ 2022-06-28  9:18     ` Sam Li
  0 siblings, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-28  9:18 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, Hannes Reinecke, Hanna Reitz, Dmitry Fomichev,
	Kevin Wolf, Fam Zheng, Damien Le Moal, qemu block

Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 16:20写道:
>
> On Mon, Jun 27, 2022 at 08:19:15AM +0800, Sam Li wrote:
> > Use sysfs attribute files to get the zoned device information in case
> > that ioctl() commands of zone management interface won't work. It can
> > return long type of value like chunk_sectors, zoned_append_max_bytes,
> > max_open_zones, max_active_zones.
> > ---
> >  block/file-posix.c | 37 +++++++++++++++++++++++++------------
> >  1 file changed, 25 insertions(+), 12 deletions(-)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 1b8b0d351f..73c2cdfbca 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1216,15 +1216,19 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
> >  #endif
> >  }
> >
> > -static int hdev_get_max_segments(int fd, struct stat *st)
> > -{
> > +/*
> > + * Get zoned device information (chunk_sectors, zoned_append_max_bytes,
> > + * max_open_zones, max_active_zones) through sysfs attribute files.
>
> This function is also used to get max_segments, which is not related to
> zoned devices. How about:
>
>   Get a block queue sysfs attribute value.
>
> > + */
> > +static long get_sysfs_long_val(int fd, struct stat *st,
> > +                               const char *attribute) {
> >  #ifdef CONFIG_LINUX
> >      char buf[32];
> >      const char *end;
> >      char *sysfspath = NULL;
> >      int ret;
> >      int sysfd = -1;
> > -    long max_segments;
> > +    long val;
> >
> >      if (S_ISCHR(st->st_mode)) {
> >          if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> > @@ -1237,8 +1241,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
> >          return -ENOTSUP;
> >      }
> >
> > -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > -                                major(st->st_rdev), minor(st->st_rdev));
> > +    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> > +                                major(st->st_rdev), minor(st->st_rdev),
> > +                                attribute);
> >      sysfd = open(sysfspath, O_RDONLY);
> >      if (sysfd == -1) {
> >          ret = -errno;
> > @@ -1256,9 +1261,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
> >      }
> >      buf[ret] = 0;
> >      /* The file is ended with '\n', pass 'end' to accept that. */
> > -    ret = qemu_strtol(buf, &end, 10, &max_segments);
> > +    ret = qemu_strtol(buf, &end, 10, &val);
> >      if (ret == 0 && end && *end == '\n') {
> > -        ret = max_segments;
> > +        ret = val;
> >      }
> >
> >  out:
> > @@ -1272,6 +1277,15 @@ out:
> >  #endif
> >  }
> >
> > +static int hdev_get_max_segments(int fd, struct stat *st) {
> > +    int ret;
> > +    ret = get_sysfs_long_val(fd, st, "max_segments");
> > +    if (ret < 0) {
> > +        return -1;
>
> This hides the actual error number. The if statement can be dropped and
> the function can be simplified to:
>
>   static int hdev_get_max_segments(int fd, struct stat *st) {
>       return get_sysfs_long_val(fd, st, "max_segments");
>   }
>
> Whether you want to keep the tiny helper function or inline
> get_sysfs_long_val(fd, st, "max_segments") into the caller is up to you.
>
> > +    }
> > +    return ret;
> > +}
> > +
> >  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >  {
> >      BDRVRawState *s = bs->opaque;
> > @@ -1872,6 +1886,7 @@ static int handle_aiocb_zone_report(void *opaque) {
> >
> >  static int handle_aiocb_zone_mgmt(void *opaque) {
> >      RawPosixAIOData *aiocb = opaque;
> > +    BlockDriverState *s = aiocb->bs;
> >      int fd = aiocb->aio_fildes;
> >      int64_t offset = aiocb->aio_offset;
> >      int64_t len = aiocb->aio_nbytes;
> > @@ -1884,11 +1899,9 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >      int64_t zone_size_mask;
> >      int ret;
> >
> > -    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> > -    if (ret) {
> > -        return -1;
> > -    }
> > -
> > +    g_autofree struct stat *file = g_new(struct stat, 1);
> > +    stat(s->filename, file);
>
> There is no need to allocate struct stat using g_new(). It's a small
> struct that can be on the stack.
>
> Also, fstat(fd, &st) is preferred when the file descriptor is already
> open because it avoids race conditions due to file renaming/path
> traversal.
>
> Here is how this could be written:
>
>   struct stat st;
>   if (fstat(fd, &st) < 0) {
>       return -errno;
>   }

Thanks for the suggestions!

>
> > +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
> >      zone_size_mask = zone_size - 1;
> >      if (offset & zone_size_mask) {
> >          error_report("offset is not the start of a zone");
> > --
> > 2.35.3
> >


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 5/5] qemu-iotests: add zone operation tests.
  2022-06-28  8:19   ` Stefan Hajnoczi
@ 2022-06-28  9:34     ` Sam Li
  2022-06-30 10:11       ` Stefan Hajnoczi
  0 siblings, 1 reply; 32+ messages in thread
From: Sam Li @ 2022-06-28  9:34 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, Hannes Reinecke, Hanna Reitz, Dmitry Fomichev,
	Kevin Wolf, Fam Zheng, Damien Le Moal, qemu block

Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 16:20写道:
>
> On Mon, Jun 27, 2022 at 08:19:17AM +0800, Sam Li wrote:
> > diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
> > new file mode 100755
> > index 0000000000..262c0b5427
> > --- /dev/null
> > +++ b/tests/qemu-iotests/tests/zoned.sh
> > @@ -0,0 +1,49 @@
> > +#!/usr/bin/env bash
> > +#
> > +# Test zone management operations.
> > +#
> > +
> > +QEMU_IO="build/qemu-io"
> > +IMG="--image-opts driver=zoned_host_device,filename=/dev/nullb0"
> > +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
> > +
> > +echo "Testing a null_blk device"
> > +echo "Simple cases: if the operations work"
> > +sudo modprobe null_blk nr_devices=1 zoned=1
>
> Please use bash's "trap" command to remove null_blk on exit. That way
> cleanup happens whether the script exits successfully or not. See
> tests/qemu-iotests/108 for an example.

Noted. Should I just include "rmmod null_blk" in _cleanup()? I'm a
little confused about the normal way to write qemu-iotests.

>
> > +# success, all done
> > +sudo rmmod null_blk
> > +echo "*** done"
> > +#rm -f $seq.full
>
> Why is this commented out?
I should just remove it. seq is not used.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-28  9:05       ` Damien Le Moal
@ 2022-06-28 10:23         ` Sam Li
  2022-06-29  1:43           ` Damien Le Moal
  0 siblings, 1 reply; 32+ messages in thread
From: Sam Li @ 2022-06-28 10:23 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Stefan Hajnoczi, qemu-devel, Hannes Reinecke, Hanna Reitz,
	Dmitry Fomichev, Kevin Wolf, Fam Zheng, qemu block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道:
>
> On 6/28/22 17:05, Sam Li wrote:
> > Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
> >>
> >> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> >>> diff --git a/block/block-backend.c b/block/block-backend.c
> >>> index e0e1aff4b1..786f964d02 100644
> >>> --- a/block/block-backend.c
> >>> +++ b/block/block-backend.c
> >>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >>>      return ret;
> >>>  }
> >>>
> >>> +/*
> >>> + * Return zone_report from BlockDriver. Offset can be any number within
> >>> + * the zone size. No alignment for offset and len.
> >>
> >> What is the purpose of len? Is it the maximum number of zones to return
> >> in nr_zones[]?
> >
> > len is actually not used in bdrv_co_zone_report. It is needed by
> > blk_check_byte_request.
>
> There is no IO buffer being passed. Only an array of zone descriptors and
> an in-out number of zones. len is definitely not needed. Not sure what
> blk_check_byte_request() is intended to check though.

Can I just remove len argument and blk_check_byte_request() if it's not used?

>
> >
> >> How does the caller know how many zones were returned?
> >
> > nr_zones represents IN maximum and OUT actual. The caller will know by
> > nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
> > comments.
> >
> >>
> >>> + */
> >>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> >>> +                       int64_t len, int64_t *nr_zones,
> >>> +                       BlockZoneDescriptor *zones)
> >>> +{
> >>> +    int ret;
> >>> +    BlockDriverState *bs;
> >>> +    IO_CODE();
> >>> +
> >>> +    blk_inc_in_flight(blk); /* increase before waiting */
> >>> +    blk_wait_while_drained(blk);
> >>> +    bs = blk_bs(blk);
> >>> +
> >>> +    ret = blk_check_byte_request(blk, offset, len);
> >>> +    if (ret < 0) {
> >>> +        return ret;
> >>> +    }
> >>> +
> >>> +    bdrv_inc_in_flight(bs);
> >>
> >> The bdrv_inc/dec_in_flight() call should be inside
> >> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
> >>
> >>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> >>> +                              nr_zones, zones);
> >>> +    bdrv_dec_in_flight(bs);
> >>> +    blk_dec_in_flight(blk);
> >>> +    return ret;
> >>> +}
> >>> +
> >>> +/*
> >>> + * Return zone_mgmt from BlockDriver.
> >>
> >> Maybe this should be:
> >>
> >>   Send a zone management command.
> >
> > Yes, it's more accurate.
> >
> >>
> >>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >>>              PreallocMode prealloc;
> >>>              Error **errp;
> >>>          } truncate;
> >>> +        struct {
> >>> +            int64_t *nr_zones;
> >>> +            BlockZoneDescriptor *zones;
> >>> +        } zone_report;
> >>> +        zone_op op;
> >>
> >> It's cleaner to put op inside a struct zone_mgmt so its purpose is
> >> self-explanatory:
> >>
> >>   struct {
> >>       zone_op op;
> >>   } zone_mgmt;
> >>
> >>> +static int handle_aiocb_zone_report(void *opaque) {
> >>> +    RawPosixAIOData *aiocb = opaque;
> >>> +    int fd = aiocb->aio_fildes;
> >>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> >>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> >>> +    int64_t offset = aiocb->aio_offset;
> >>> +    int64_t len = aiocb->aio_nbytes;
>
> Since you have the zone array and number of zones (size of that array) I
> really do not see why you need len.
>
> >>> +
> >>> +    struct blk_zone *blkz;
> >>> +    int64_t rep_size, nrz;
> >>> +    int ret, n = 0, i = 0;
> >>> +
> >>> +    nrz = *nr_zones;
> >>> +    if (len == -1) {
> >>> +        return -errno;
> >>
> >> Where is errno set? Should this be an errno constant instead like
> >> -EINVAL?
> >
> > That's right! Noted.
> >
> >>
> >>> +    }
> >>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> >>
> >> g_new() looks incorrect. There should be 1 struct blk_zone_report
> >> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> >> instead.
> >
> > Yes! However, it still has a memory leak error when using g_autofree
> > && g_malloc.
>
> That may be because you are changing the value of the rep pointer while
> parsing the report ?

I am not sure it is the case. Can you show me some way to find the problem?

Thanks for reviewing!


>
> >
> >>
> >>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> >>> +    printf("start to report zone with offset: 0x%lx\n", offset);
> >>> +
> >>> +    blkz = (struct blk_zone *)(rep + 1);
> >>> +    while (n < nrz) {
> >>> +        memset(rep, 0, rep_size);
> >>> +        rep->sector = offset;
> >>> +        rep->nr_zones = nrz;
> >>
> >> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
> >> so maybe the rep->nr_zones return value will eventually exceed the
> >> number of elements still available in zones[n..]?
> >
> > I suppose the number of zones[] is restricted in the subsequent for
> > loop where zones[] copy one zone at a time as n increases. Even if
> > rep->zones exceeds the available room in zones[], the extra zone will
> > not be copied.
> >
> >>
> >>> +static int handle_aiocb_zone_mgmt(void *opaque) {
> >>> +    RawPosixAIOData *aiocb = opaque;
> >>> +    int fd = aiocb->aio_fildes;
> >>> +    int64_t offset = aiocb->aio_offset;
> >>> +    int64_t len = aiocb->aio_nbytes;
> >>> +    zone_op op = aiocb->op;
> >>> +
> >>> +    struct blk_zone_range range;
> >>> +    const char *ioctl_name;
> >>> +    unsigned long ioctl_op;
> >>> +    int64_t zone_size;
> >>> +    int64_t zone_size_mask;
> >>> +    int ret;
> >>> +
> >>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> >>
> >> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
> >> ioctl(BLKGETZONESZ) each time?
> >
> > Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
> > I think through the configurations. In addition, it's a temporary
> > approach. It is substituted by get_sysfs_long_val now.
> >
> >>
> >>> +    if (ret) {
> >>> +        return -1;
> >>
> >> The return value should be a negative errno. -1 is -EPERM but it's
> >> probably not that error code you wanted. This should be:
> >>
> >>   return -errno;
> >>
> >>> +    }
> >>> +
> >>> +    zone_size_mask = zone_size - 1;
> >>> +    if (offset & zone_size_mask) {
> >>> +        error_report("offset is not the start of a zone");
> >>> +        return -1;
> >>
> >> return -EINVAL;
> >>
> >>> +    }
> >>> +
> >>> +    if (len & zone_size_mask) {
> >>> +        error_report("len is not aligned to zones");
> >>> +        return -1;
> >>
> >> return -EINVAL;
> >>
> >>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> >>> +        int64_t len, int64_t *nr_zones,
> >>> +        BlockZoneDescriptor *zones) {
> >>> +    BDRVRawState *s = bs->opaque;
> >>> +    RawPosixAIOData acb;
> >>> +
> >>> +    acb = (RawPosixAIOData) {
> >>> +        .bs         = bs,
> >>> +        .aio_fildes = s->fd,
> >>> +        .aio_type   = QEMU_AIO_IOCTL,
> >>> +        .aio_offset = offset,
> >>> +        .aio_nbytes = len,
> >>> +        .zone_report    = {
> >>> +                .nr_zones       = nr_zones,
> >>> +                .zones          = zones,
> >>> +                },
> >>
> >> Indentation is off here. Please use 4-space indentation.
> > Noted!
> >
> > Thanks for reviewing!
> >
> > Sam
>
>
> --
> Damien Le Moal
> Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 2/5] qemu-io: add zoned block device operations.
  2022-06-28  9:11     ` Damien Le Moal
@ 2022-06-28 10:27       ` Sam Li
  0 siblings, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-28 10:27 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Stefan Hajnoczi, qemu-devel, Hannes Reinecke, Hanna Reitz,
	Dmitry Fomichev, Kevin Wolf, Fam Zheng, qemu block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:11写道:
>
> On 6/28/22 16:56, Stefan Hajnoczi wrote:
> > On Mon, Jun 27, 2022 at 08:19:14AM +0800, Sam Li wrote:
> >> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
> >> index 2f0d8ac25a..3f2592b9f5 100644
> >> --- a/qemu-io-cmds.c
> >> +++ b/qemu-io-cmds.c
> >> @@ -1706,6 +1706,122 @@ static const cmdinfo_t flush_cmd = {
> >>      .oneline    = "flush all in-core file state to disk",
> >>  };
> >>
> >> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
> >> +{
> >> +    int ret;
> >> +    int64_t offset, len, nr_zones;
> >> +    int i = 0;
> >> +
> >> +    ++optind;
> >> +    offset = cvtnum(argv[optind]);
> >> +    ++optind;
> >> +    len = cvtnum(argv[optind]);
> >> +    ++optind;
> >> +    nr_zones = cvtnum(argv[optind]);
> >> +
> >> +    g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
> >> +    ret = blk_zone_report(blk, offset, len, &nr_zones, zones);
> >> +    while (i < nr_zones) {
> >
> > Does blk_zone_report() set nr_zones to 0 on failure or do we need to
> > check if (ret < 0) here?
>
> ret = 0 means "no zone reported" which happen only if nr_zones is 0 or the
> start offset is past the end of the disk capacity. ret < 0 would mean that
> a report zone operation was actually attempted and failed (EIO, ENOMEM etc).
>
> >
> >> +        fprintf(stdout, "start: 0x%lx, len 0x%lx, cap 0x%lx, wptr 0x%lx, "
> >
> > The rest of the source file uses printf() instead of fprintf(stdout,
> > ...). That's usually preferred because it's shorter.
> >
> >> +                        "zcond:%u, [type: %u]\n",
> >
> > Please use PRIx64 instead of lx format specifiers for portability. On
> > 32-bit hosts lx is 32-bit, not 64-bit. You can grep QEMU's code for
> > examples of PRIx64.
> >
> >> +                zones[i].start, zones[i].length, zones[i].cap, zones[i].wp,
> >> +                zones[i].cond, zones[i].type);
> >> +        ++i;
> >> +    }
> >
> > A for loop is more idiomatic:
> >
> >   for (int i = 0; i < nr_zones; i++) {
> >       ...
> >   }
> >
> >> +    return ret;
> >> +}
> >> +
> >> +static const cmdinfo_t zone_report_cmd = {
> >> +        .name = "zone_report",
> >> +        .altname = "f",
> >> +        .cfunc = zone_report_f,
> >> +        .argmin = 3,
> >> +        .argmax = 3,
> >> +        .args = "offset [offset..] len [len..] number [num..]",
> >
> > The arguments are "offset len number". This command does not accept
> > optional offset/len/num arguments.
>
> The arguments should be offset + len OR offset + number of zones. Having
> the 3 of them does not make sense to me. The interface would then be:
>
> (1) offset + len -> report all zones in the block range [offset .. offset
> + len - 1]
>
> (2) offset + number of zones -> report at most "number of zones" from the
> zone containing the block at "offset".
>
> (2) matches the semantic used at the device command level. So I prefer to
> approach (1).
Yes, I'll remove the len argument then.

>
> >
> >> +        .oneline = "report a number of zones",
> >
> > Maybe "report zone information".
> >
> >> +};
> >> +
> >> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
> >> +{
> >> +    int64_t offset, len;
> >> +    ++optind;
> >> +    offset = cvtnum(argv[optind]);
> >> +    ++optind;
> >> +    len = cvtnum(argv[optind]);
> >> +    return blk_zone_mgmt(blk, zone_open, offset, len);
> >
> > Where is the error reported? When I look at read_f() I see:
> >
> >     if (ret < 0) {
> >         printf("read failed: %s\n", strerror(-ret));
> >
> > I think something similar is needed because qemu-io.c does not print an
> > error message for us. The same is true for the other commands defined in
> > this patch.
> >
> >> +}
> >> +
> >> +static const cmdinfo_t zone_open_cmd = {
> >> +        .name = "zone_open",
> >> +        .altname = "f",
> >> +        .cfunc = zone_open_f,
> >> +        .argmin = 2,
> >> +        .argmax = 2,
> >> +        .args = "offset [offset..] len [len..]",
> >
> > There are no optional offset/len args. The same is true for the other
> > commands below.
>
>
> --
> Damien Le Moal
> Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-28 10:23         ` Sam Li
@ 2022-06-29  1:43           ` Damien Le Moal
  2022-06-29  1:50             ` Sam Li
  0 siblings, 1 reply; 32+ messages in thread
From: Damien Le Moal @ 2022-06-29  1:43 UTC (permalink / raw)
  To: Sam Li
  Cc: Stefan Hajnoczi, qemu-devel, Hannes Reinecke, Hanna Reitz,
	Dmitry Fomichev, Kevin Wolf, Fam Zheng, qemu block

On 6/28/22 19:23, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道:
>>
>> On 6/28/22 17:05, Sam Li wrote:
>>> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
>>>>
>>>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
>>>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>>>> index e0e1aff4b1..786f964d02 100644
>>>>> --- a/block/block-backend.c
>>>>> +++ b/block/block-backend.c
>>>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>>>>>      return ret;
>>>>>  }
>>>>>
>>>>> +/*
>>>>> + * Return zone_report from BlockDriver. Offset can be any number within
>>>>> + * the zone size. No alignment for offset and len.
>>>>
>>>> What is the purpose of len? Is it the maximum number of zones to return
>>>> in nr_zones[]?
>>>
>>> len is actually not used in bdrv_co_zone_report. It is needed by
>>> blk_check_byte_request.
>>
>> There is no IO buffer being passed. Only an array of zone descriptors and
>> an in-out number of zones. len is definitely not needed. Not sure what
>> blk_check_byte_request() is intended to check though.
> 
> Can I just remove len argument and blk_check_byte_request() if it's not used?

If it is unused, yes, you must remove it.

> 
>>
>>>
>>>> How does the caller know how many zones were returned?
>>>
>>> nr_zones represents IN maximum and OUT actual. The caller will know by
>>> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
>>> comments.
>>>
>>>>
>>>>> + */
>>>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>>>> +                       int64_t len, int64_t *nr_zones,
>>>>> +                       BlockZoneDescriptor *zones)
>>>>> +{
>>>>> +    int ret;
>>>>> +    BlockDriverState *bs;
>>>>> +    IO_CODE();
>>>>> +
>>>>> +    blk_inc_in_flight(blk); /* increase before waiting */
>>>>> +    blk_wait_while_drained(blk);
>>>>> +    bs = blk_bs(blk);
>>>>> +
>>>>> +    ret = blk_check_byte_request(blk, offset, len);
>>>>> +    if (ret < 0) {
>>>>> +        return ret;
>>>>> +    }
>>>>> +
>>>>> +    bdrv_inc_in_flight(bs);
>>>>
>>>> The bdrv_inc/dec_in_flight() call should be inside
>>>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
>>>>
>>>>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
>>>>> +                              nr_zones, zones);
>>>>> +    bdrv_dec_in_flight(bs);
>>>>> +    blk_dec_in_flight(blk);
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Return zone_mgmt from BlockDriver.
>>>>
>>>> Maybe this should be:
>>>>
>>>>   Send a zone management command.
>>>
>>> Yes, it's more accurate.
>>>
>>>>
>>>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>>>>>              PreallocMode prealloc;
>>>>>              Error **errp;
>>>>>          } truncate;
>>>>> +        struct {
>>>>> +            int64_t *nr_zones;
>>>>> +            BlockZoneDescriptor *zones;
>>>>> +        } zone_report;
>>>>> +        zone_op op;
>>>>
>>>> It's cleaner to put op inside a struct zone_mgmt so its purpose is
>>>> self-explanatory:
>>>>
>>>>   struct {
>>>>       zone_op op;
>>>>   } zone_mgmt;
>>>>
>>>>> +static int handle_aiocb_zone_report(void *opaque) {
>>>>> +    RawPosixAIOData *aiocb = opaque;
>>>>> +    int fd = aiocb->aio_fildes;
>>>>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
>>>>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>>>>> +    int64_t offset = aiocb->aio_offset;
>>>>> +    int64_t len = aiocb->aio_nbytes;
>>
>> Since you have the zone array and number of zones (size of that array) I
>> really do not see why you need len.
>>
>>>>> +
>>>>> +    struct blk_zone *blkz;
>>>>> +    int64_t rep_size, nrz;
>>>>> +    int ret, n = 0, i = 0;
>>>>> +
>>>>> +    nrz = *nr_zones;
>>>>> +    if (len == -1) {
>>>>> +        return -errno;
>>>>
>>>> Where is errno set? Should this be an errno constant instead like
>>>> -EINVAL?
>>>
>>> That's right! Noted.
>>>
>>>>
>>>>> +    }
>>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>>>>
>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
>>>> instead.
>>>
>>> Yes! However, it still has a memory leak error when using g_autofree
>>> && g_malloc.
>>
>> That may be because you are changing the value of the rep pointer while
>> parsing the report ?
> 
> I am not sure it is the case. Can you show me some way to find the problem?

Not sure. I never used this g_malloc()/g_autofree() before so not sure how
it works. It may be that g_autofree() work only with g_new() ?
Could you try separating the declaration and allocation ? e.g.

Declare at the beginning of the function:
g_autofree struct blk_zone_report *rep = NULL;

And then when needed do:

rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
rep = g_malloc(rep_size);

> 
> Thanks for reviewing!
> 
> 
>>
>>>
>>>>
>>>>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
>>>>> +    printf("start to report zone with offset: 0x%lx\n", offset);
>>>>> +
>>>>> +    blkz = (struct blk_zone *)(rep + 1);
>>>>> +    while (n < nrz) {
>>>>> +        memset(rep, 0, rep_size);
>>>>> +        rep->sector = offset;
>>>>> +        rep->nr_zones = nrz;
>>>>
>>>> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
>>>> so maybe the rep->nr_zones return value will eventually exceed the
>>>> number of elements still available in zones[n..]?
>>>
>>> I suppose the number of zones[] is restricted in the subsequent for
>>> loop where zones[] copy one zone at a time as n increases. Even if
>>> rep->zones exceeds the available room in zones[], the extra zone will
>>> not be copied.
>>>
>>>>
>>>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>>>>> +    RawPosixAIOData *aiocb = opaque;
>>>>> +    int fd = aiocb->aio_fildes;
>>>>> +    int64_t offset = aiocb->aio_offset;
>>>>> +    int64_t len = aiocb->aio_nbytes;
>>>>> +    zone_op op = aiocb->op;
>>>>> +
>>>>> +    struct blk_zone_range range;
>>>>> +    const char *ioctl_name;
>>>>> +    unsigned long ioctl_op;
>>>>> +    int64_t zone_size;
>>>>> +    int64_t zone_size_mask;
>>>>> +    int ret;
>>>>> +
>>>>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
>>>>
>>>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
>>>> ioctl(BLKGETZONESZ) each time?
>>>
>>> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
>>> I think through the configurations. In addition, it's a temporary
>>> approach. It is substituted by get_sysfs_long_val now.
>>>
>>>>
>>>>> +    if (ret) {
>>>>> +        return -1;
>>>>
>>>> The return value should be a negative errno. -1 is -EPERM but it's
>>>> probably not that error code you wanted. This should be:
>>>>
>>>>   return -errno;
>>>>
>>>>> +    }
>>>>> +
>>>>> +    zone_size_mask = zone_size - 1;
>>>>> +    if (offset & zone_size_mask) {
>>>>> +        error_report("offset is not the start of a zone");
>>>>> +        return -1;
>>>>
>>>> return -EINVAL;
>>>>
>>>>> +    }
>>>>> +
>>>>> +    if (len & zone_size_mask) {
>>>>> +        error_report("len is not aligned to zones");
>>>>> +        return -1;
>>>>
>>>> return -EINVAL;
>>>>
>>>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
>>>>> +        int64_t len, int64_t *nr_zones,
>>>>> +        BlockZoneDescriptor *zones) {
>>>>> +    BDRVRawState *s = bs->opaque;
>>>>> +    RawPosixAIOData acb;
>>>>> +
>>>>> +    acb = (RawPosixAIOData) {
>>>>> +        .bs         = bs,
>>>>> +        .aio_fildes = s->fd,
>>>>> +        .aio_type   = QEMU_AIO_IOCTL,
>>>>> +        .aio_offset = offset,
>>>>> +        .aio_nbytes = len,
>>>>> +        .zone_report    = {
>>>>> +                .nr_zones       = nr_zones,
>>>>> +                .zones          = zones,
>>>>> +                },
>>>>
>>>> Indentation is off here. Please use 4-space indentation.
>>> Noted!
>>>
>>> Thanks for reviewing!
>>>
>>> Sam
>>
>>
>> --
>> Damien Le Moal
>> Western Digital Research


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-29  1:43           ` Damien Le Moal
@ 2022-06-29  1:50             ` Sam Li
  2022-06-29  2:32               ` Damien Le Moal
  0 siblings, 1 reply; 32+ messages in thread
From: Sam Li @ 2022-06-29  1:50 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Stefan Hajnoczi, qemu-devel, Hannes Reinecke, Hanna Reitz,
	Dmitry Fomichev, Kevin Wolf, Fam Zheng, qemu block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月29日周三 09:43写道:
>
> On 6/28/22 19:23, Sam Li wrote:
> > Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道:
> >>
> >> On 6/28/22 17:05, Sam Li wrote:
> >>> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
> >>>>
> >>>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> >>>>> diff --git a/block/block-backend.c b/block/block-backend.c
> >>>>> index e0e1aff4b1..786f964d02 100644
> >>>>> --- a/block/block-backend.c
> >>>>> +++ b/block/block-backend.c
> >>>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >>>>>      return ret;
> >>>>>  }
> >>>>>
> >>>>> +/*
> >>>>> + * Return zone_report from BlockDriver. Offset can be any number within
> >>>>> + * the zone size. No alignment for offset and len.
> >>>>
> >>>> What is the purpose of len? Is it the maximum number of zones to return
> >>>> in nr_zones[]?
> >>>
> >>> len is actually not used in bdrv_co_zone_report. It is needed by
> >>> blk_check_byte_request.
> >>
> >> There is no IO buffer being passed. Only an array of zone descriptors and
> >> an in-out number of zones. len is definitely not needed. Not sure what
> >> blk_check_byte_request() is intended to check though.
> >
> > Can I just remove len argument and blk_check_byte_request() if it's not used?
>
> If it is unused, yes, you must remove it.
>
> >
> >>
> >>>
> >>>> How does the caller know how many zones were returned?
> >>>
> >>> nr_zones represents IN maximum and OUT actual. The caller will know by
> >>> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
> >>> comments.
> >>>
> >>>>
> >>>>> + */
> >>>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> >>>>> +                       int64_t len, int64_t *nr_zones,
> >>>>> +                       BlockZoneDescriptor *zones)
> >>>>> +{
> >>>>> +    int ret;
> >>>>> +    BlockDriverState *bs;
> >>>>> +    IO_CODE();
> >>>>> +
> >>>>> +    blk_inc_in_flight(blk); /* increase before waiting */
> >>>>> +    blk_wait_while_drained(blk);
> >>>>> +    bs = blk_bs(blk);
> >>>>> +
> >>>>> +    ret = blk_check_byte_request(blk, offset, len);
> >>>>> +    if (ret < 0) {
> >>>>> +        return ret;
> >>>>> +    }
> >>>>> +
> >>>>> +    bdrv_inc_in_flight(bs);
> >>>>
> >>>> The bdrv_inc/dec_in_flight() call should be inside
> >>>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
> >>>>
> >>>>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> >>>>> +                              nr_zones, zones);
> >>>>> +    bdrv_dec_in_flight(bs);
> >>>>> +    blk_dec_in_flight(blk);
> >>>>> +    return ret;
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * Return zone_mgmt from BlockDriver.
> >>>>
> >>>> Maybe this should be:
> >>>>
> >>>>   Send a zone management command.
> >>>
> >>> Yes, it's more accurate.
> >>>
> >>>>
> >>>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >>>>>              PreallocMode prealloc;
> >>>>>              Error **errp;
> >>>>>          } truncate;
> >>>>> +        struct {
> >>>>> +            int64_t *nr_zones;
> >>>>> +            BlockZoneDescriptor *zones;
> >>>>> +        } zone_report;
> >>>>> +        zone_op op;
> >>>>
> >>>> It's cleaner to put op inside a struct zone_mgmt so its purpose is
> >>>> self-explanatory:
> >>>>
> >>>>   struct {
> >>>>       zone_op op;
> >>>>   } zone_mgmt;
> >>>>
> >>>>> +static int handle_aiocb_zone_report(void *opaque) {
> >>>>> +    RawPosixAIOData *aiocb = opaque;
> >>>>> +    int fd = aiocb->aio_fildes;
> >>>>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> >>>>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> >>>>> +    int64_t offset = aiocb->aio_offset;
> >>>>> +    int64_t len = aiocb->aio_nbytes;
> >>
> >> Since you have the zone array and number of zones (size of that array) I
> >> really do not see why you need len.
> >>
> >>>>> +
> >>>>> +    struct blk_zone *blkz;
> >>>>> +    int64_t rep_size, nrz;
> >>>>> +    int ret, n = 0, i = 0;
> >>>>> +
> >>>>> +    nrz = *nr_zones;
> >>>>> +    if (len == -1) {
> >>>>> +        return -errno;
> >>>>
> >>>> Where is errno set? Should this be an errno constant instead like
> >>>> -EINVAL?
> >>>
> >>> That's right! Noted.
> >>>
> >>>>
> >>>>> +    }
> >>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> >>>>
> >>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
> >>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> >>>> instead.
> >>>
> >>> Yes! However, it still has a memory leak error when using g_autofree
> >>> && g_malloc.
> >>
> >> That may be because you are changing the value of the rep pointer while
> >> parsing the report ?
> >
> > I am not sure it is the case. Can you show me some way to find the problem?
>
> Not sure. I never used this g_malloc()/g_autofree() before so not sure how
> it works. It may be that g_autofree() work only with g_new() ?
> Could you try separating the declaration and allocation ? e.g.
>
> Declare at the beginning of the function:
> g_autofree struct blk_zone_report *rep = NULL;
>
> And then when needed do:
>
> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> rep = g_malloc(rep_size);

Actually, the memory leak occurs in that way. When using zone_mgmt,
memory leak still occurs. Asan gives the error information not much so
I haven't tracked down the problem yet.

>
> >
> > Thanks for reviewing!
> >
> >
> >>
> >>>
> >>>>
> >>>>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> >>>>> +    printf("start to report zone with offset: 0x%lx\n", offset);
> >>>>> +
> >>>>> +    blkz = (struct blk_zone *)(rep + 1);
> >>>>> +    while (n < nrz) {
> >>>>> +        memset(rep, 0, rep_size);
> >>>>> +        rep->sector = offset;
> >>>>> +        rep->nr_zones = nrz;
> >>>>
> >>>> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
> >>>> so maybe the rep->nr_zones return value will eventually exceed the
> >>>> number of elements still available in zones[n..]?
> >>>
> >>> I suppose the number of zones[] is restricted in the subsequent for
> >>> loop where zones[] copy one zone at a time as n increases. Even if
> >>> rep->zones exceeds the available room in zones[], the extra zone will
> >>> not be copied.
> >>>
> >>>>
> >>>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>>> +    RawPosixAIOData *aiocb = opaque;
> >>>>> +    int fd = aiocb->aio_fildes;
> >>>>> +    int64_t offset = aiocb->aio_offset;
> >>>>> +    int64_t len = aiocb->aio_nbytes;
> >>>>> +    zone_op op = aiocb->op;
> >>>>> +
> >>>>> +    struct blk_zone_range range;
> >>>>> +    const char *ioctl_name;
> >>>>> +    unsigned long ioctl_op;
> >>>>> +    int64_t zone_size;
> >>>>> +    int64_t zone_size_mask;
> >>>>> +    int ret;
> >>>>> +
> >>>>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> >>>>
> >>>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
> >>>> ioctl(BLKGETZONESZ) each time?
> >>>
> >>> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
> >>> I think through the configurations. In addition, it's a temporary
> >>> approach. It is substituted by get_sysfs_long_val now.
> >>>
> >>>>
> >>>>> +    if (ret) {
> >>>>> +        return -1;
> >>>>
> >>>> The return value should be a negative errno. -1 is -EPERM but it's
> >>>> probably not that error code you wanted. This should be:
> >>>>
> >>>>   return -errno;
> >>>>
> >>>>> +    }
> >>>>> +
> >>>>> +    zone_size_mask = zone_size - 1;
> >>>>> +    if (offset & zone_size_mask) {
> >>>>> +        error_report("offset is not the start of a zone");
> >>>>> +        return -1;
> >>>>
> >>>> return -EINVAL;
> >>>>
> >>>>> +    }
> >>>>> +
> >>>>> +    if (len & zone_size_mask) {
> >>>>> +        error_report("len is not aligned to zones");
> >>>>> +        return -1;
> >>>>
> >>>> return -EINVAL;
> >>>>
> >>>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> >>>>> +        int64_t len, int64_t *nr_zones,
> >>>>> +        BlockZoneDescriptor *zones) {
> >>>>> +    BDRVRawState *s = bs->opaque;
> >>>>> +    RawPosixAIOData acb;
> >>>>> +
> >>>>> +    acb = (RawPosixAIOData) {
> >>>>> +        .bs         = bs,
> >>>>> +        .aio_fildes = s->fd,
> >>>>> +        .aio_type   = QEMU_AIO_IOCTL,
> >>>>> +        .aio_offset = offset,
> >>>>> +        .aio_nbytes = len,
> >>>>> +        .zone_report    = {
> >>>>> +                .nr_zones       = nr_zones,
> >>>>> +                .zones          = zones,
> >>>>> +                },
> >>>>
> >>>> Indentation is off here. Please use 4-space indentation.
> >>> Noted!
> >>>
> >>> Thanks for reviewing!
> >>>
> >>> Sam
> >>
> >>
> >> --
> >> Damien Le Moal
> >> Western Digital Research
>
>
> --
> Damien Le Moal
> Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-29  1:50             ` Sam Li
@ 2022-06-29  2:32               ` Damien Le Moal
  2022-06-29  2:35                 ` Sam Li
  0 siblings, 1 reply; 32+ messages in thread
From: Damien Le Moal @ 2022-06-29  2:32 UTC (permalink / raw)
  To: Sam Li
  Cc: Stefan Hajnoczi, qemu-devel, Hannes Reinecke, Hanna Reitz,
	Dmitry Fomichev, Kevin Wolf, Fam Zheng, qemu block

On 6/29/22 10:50, Sam Li wrote:
>>>>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>>>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>>>>>>
>>>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
>>>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
>>>>>> instead.
>>>>>
>>>>> Yes! However, it still has a memory leak error when using g_autofree
>>>>> && g_malloc.
>>>>
>>>> That may be because you are changing the value of the rep pointer while
>>>> parsing the report ?
>>>
>>> I am not sure it is the case. Can you show me some way to find the problem?
>>
>> Not sure. I never used this g_malloc()/g_autofree() before so not sure how
>> it works. It may be that g_autofree() work only with g_new() ?
>> Could you try separating the declaration and allocation ? e.g.
>>
>> Declare at the beginning of the function:
>> g_autofree struct blk_zone_report *rep = NULL;
>>
>> And then when needed do:
>>
>> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>> rep = g_malloc(rep_size);
> 
> Actually, the memory leak occurs in that way. When using zone_mgmt,
> memory leak still occurs. Asan gives the error information not much so
> I haven't tracked down the problem yet.

See this:

https://blog.fishsoup.net/2015/11/05/attributecleanup-mixed-declarations-and-code-and-goto/

Maybe you can find some hints.

-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.
  2022-06-29  2:32               ` Damien Le Moal
@ 2022-06-29  2:35                 ` Sam Li
  0 siblings, 0 replies; 32+ messages in thread
From: Sam Li @ 2022-06-29  2:35 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Stefan Hajnoczi, qemu-devel, Hannes Reinecke, Hanna Reitz,
	Dmitry Fomichev, Kevin Wolf, Fam Zheng, qemu block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月29日周三 10:32写道:
>
> On 6/29/22 10:50, Sam Li wrote:
> >>>>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>>>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> >>>>>>
> >>>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
> >>>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> >>>>>> instead.
> >>>>>
> >>>>> Yes! However, it still has a memory leak error when using g_autofree
> >>>>> && g_malloc.
> >>>>
> >>>> That may be because you are changing the value of the rep pointer while
> >>>> parsing the report ?
> >>>
> >>> I am not sure it is the case. Can you show me some way to find the problem?
> >>
> >> Not sure. I never used this g_malloc()/g_autofree() before so not sure how
> >> it works. It may be that g_autofree() work only with g_new() ?
> >> Could you try separating the declaration and allocation ? e.g.
> >>
> >> Declare at the beginning of the function:
> >> g_autofree struct blk_zone_report *rep = NULL;
> >>
> >> And then when needed do:
> >>
> >> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >> rep = g_malloc(rep_size);
> >
> > Actually, the memory leak occurs in that way. When using zone_mgmt,
> > memory leak still occurs. Asan gives the error information not much so
> > I haven't tracked down the problem yet.
>
> See this:
>
> https://blog.fishsoup.net/2015/11/05/attributecleanup-mixed-declarations-and-code-and-goto/
>
> Maybe you can find some hints.

Thanks!

>
> --
> Damien Le Moal
> Western Digital Research


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC v3 5/5] qemu-iotests: add zone operation tests.
  2022-06-28  9:34     ` Sam Li
@ 2022-06-30 10:11       ` Stefan Hajnoczi
  0 siblings, 0 replies; 32+ messages in thread
From: Stefan Hajnoczi @ 2022-06-30 10:11 UTC (permalink / raw)
  To: Sam Li, Kevin Wolf
  Cc: Stefan Hajnoczi, qemu-devel, Hannes Reinecke, Hanna Reitz,
	Dmitry Fomichev, Fam Zheng, Damien Le Moal, qemu block

On Tue, 28 Jun 2022 at 10:35, Sam Li <faithilikerun@gmail.com> wrote:
>
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 16:20写道:
> >
> > On Mon, Jun 27, 2022 at 08:19:17AM +0800, Sam Li wrote:
> > > diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
> > > new file mode 100755
> > > index 0000000000..262c0b5427
> > > --- /dev/null
> > > +++ b/tests/qemu-iotests/tests/zoned.sh
> > > @@ -0,0 +1,49 @@
> > > +#!/usr/bin/env bash
> > > +#
> > > +# Test zone management operations.
> > > +#
> > > +
> > > +QEMU_IO="build/qemu-io"
> > > +IMG="--image-opts driver=zoned_host_device,filename=/dev/nullb0"
> > > +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
> > > +
> > > +echo "Testing a null_blk device"
> > > +echo "Simple cases: if the operations work"
> > > +sudo modprobe null_blk nr_devices=1 zoned=1
> >
> > Please use bash's "trap" command to remove null_blk on exit. That way
> > cleanup happens whether the script exits successfully or not. See
> > tests/qemu-iotests/108 for an example.
>
> Noted. Should I just include "rmmod null_blk" in _cleanup()? I'm a
> little confused about the normal way to write qemu-iotests.

Yes, please. There are a few related issues here:

1. Cleaning up
qemu-iotests must not assume the environment is set up in a specific
way before ./check is launched and they must clean up after themselves
(whether the test succeeds or fails).

This is important so that tests are idempotent and do not leak
resources or change the state of the system, which could affect future
test runs.

2. Parallelism
Multiple instances of tests should be able to run at the same time.
This is a problem with null_blk because two tests that change the
state of /dev/nullb0 at the same time will interfere with each other
and fail.

I don't know if there is a good solution already implemented in
qemu-iotests. Maybe Kevin can answer this question?

3. Skipping tests on unsupported systems
Tests with specific requirements can use the following functions to
run only when the configuration is supported:

  _supported_fmt raw
  _supported_proto file
  _supported_os Linux

This means the test only runs on Linux hosts with raw image files. On
BSD, macOS, etc the test will be skipped. It will also be skipped if
./check was invoked with -qcow2 or another disk image format that is
not raw.

The test script can also check whether the system supports necessary
features and then skip tests when they are not available:

  if ! ...check for foo ...; then
    _notrun "cannot find dependency foo"
  fi

> >
> > > +# success, all done
> > > +sudo rmmod null_blk
> > > +echo "*** done"
> > > +#rm -f $seq.full
> >
> > Why is this commented out?
> I should just remove it. seq is not used.

Okay. I only found small remnants of $seq.full in common.rc and the
tests themselves don't seem to use it. It may be something that early
qemu-iotests or event xfstests had that's no longer needed.

Stefan


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2022-06-30 10:12 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-27  0:19 [RFC v3 0/5] *** Add support for zoned device *** Sam Li
2022-06-27  0:19 ` [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls Sam Li
2022-06-27  7:21   ` Hannes Reinecke
2022-06-27  7:45     ` Sam Li
2022-06-27 14:15   ` Stefan Hajnoczi
2022-06-28  8:05     ` Sam Li
2022-06-28  9:05       ` Damien Le Moal
2022-06-28 10:23         ` Sam Li
2022-06-29  1:43           ` Damien Le Moal
2022-06-29  1:50             ` Sam Li
2022-06-29  2:32               ` Damien Le Moal
2022-06-29  2:35                 ` Sam Li
2022-06-27  0:19 ` [RFC v3 2/5] qemu-io: add zoned block device operations Sam Li
2022-06-27  7:30   ` Hannes Reinecke
2022-06-27  8:31     ` Sam Li
2022-06-28  7:56   ` Stefan Hajnoczi
2022-06-28  9:02     ` Sam Li
2022-06-28  9:11     ` Damien Le Moal
2022-06-28 10:27       ` Sam Li
2022-06-27  0:19 ` [RFC v3 3/5] file-posix: introduce get_sysfs_long_val for zoned device information Sam Li
2022-06-27  7:31   ` Hannes Reinecke
2022-06-27  8:32     ` Sam Li
2022-06-28  8:09   ` Stefan Hajnoczi
2022-06-28  9:18     ` Sam Li
2022-06-27  0:19 ` [RFC v3 4/5] file-posix: introduce get_sysfs_str_val for device zoned model Sam Li
2022-06-27  7:41   ` Hannes Reinecke
2022-06-27  8:44     ` Sam Li
2022-06-27  0:19 ` [RFC v3 5/5] qemu-iotests: add zone operation tests Sam Li
2022-06-27  7:42   ` Hannes Reinecke
2022-06-28  8:19   ` Stefan Hajnoczi
2022-06-28  9:34     ` Sam Li
2022-06-30 10:11       ` Stefan Hajnoczi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.